Notebook that processes the raw GAP data and creates the interim and clean datasets.

# Setup

In [170]:
import os
import sys
import time

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer

In [37]:
# Project root directory. The original absolute path remains the default, but it
# can be overridden with the GENDERED_PRONOUN_PROJ_PATH environment variable so the
# notebook can run on another machine without editing this cell.
proj_path = os.environ.get(
    "GENDERED_PRONOUN_PROJ_PATH",
    "/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/",
)
# Later cells build paths by plain string concatenation (proj_path + "data/raw/"),
# so the root must end with a path separator.
if not proj_path.endswith("/"):
    proj_path += "/"

# Put the project root first on the import path so the project-local packages
# imported in the next cell resolve from it.
sys.path.insert(0, proj_path)

In [38]:
import bert.modeling
import bert.extract_features
import bert.tokenization

import src.utils as utils
import src.data.data_utils as data_utils
import src.models.bert_model_utils as bert_model_utils

In [107]:
%reload_ext autoreload
%autoreload 2

pd.options.display.max_columns = 20

In [40]:
# Sub-directories of the project root; each keeps a trailing slash because paths are
# assembled by string concatenation elsewhere in the notebook.
bert_dir = f"{proj_path}bert/"
data_raw_dir = f"{proj_path}data/raw/"
data_interim_dir = f"{proj_path}data/interim/"
data_clean_dir = f"{proj_path}data/clean/"

# Read Raw Data

In [41]:
# Load the three tab-separated GAP files from the raw-data directory, using the
# data_raw_dir constant defined in the setup section rather than rebuilding the path.
# NOTE(review): gap-test.tsv is loaded as the *training* frame and gap-development.tsv
# as the *test* frame -- presumably a deliberate re-split of the GAP files; confirm.
df_train = pd.read_csv(data_raw_dir + "gap-test.tsv", sep='\t')
df_valid = pd.read_csv(data_raw_dir + "gap-validation.tsv", sep='\t')
df_test = pd.read_csv(data_raw_dir + "gap-development.tsv", sep='\t')

# Create Interim Data

In [150]:
# Each split name paired with its raw dataframe, in processing order.
split_frames = (("train", df_train), ("valid", df_valid), ("test", df_test))

# Create BERT input files: the Text column of each split, without index or header.
# NOTE(review): to_csv applies standard CSV quoting, so texts containing commas or
# double quotes are written wrapped in quotes -- presumably the downstream embedding
# code accounts for this; confirm.
for split_name, split_df in split_frames:
    split_df.Text.to_csv(
        data_interim_dir + "bert_input_" + split_name + ".txt",
        index=False,
        header=False,
    )

# Create BERT feature extraction commands (one per split).
(
    train_bert_extract_features_cmd,
    valid_bert_extract_features_cmd,
    test_bert_extract_features_cmd,
) = (
    bert_model_utils.create_bert_extract_features_cmd(frame, name)
    for name, frame in split_frames
)

# Execute commands in terminal to create BERT output files
for extract_cmd in (
    train_bert_extract_features_cmd,
    valid_bert_extract_features_cmd,
    test_bert_extract_features_cmd,
):
    print(extract_cmd)

python /Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/bert/extract_features.py       --input_file=/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/data/interim/bert_input_train.txt       --output_file=/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/data/interim/bert_output_train.json       --vocab_file=/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/models/uncased_L-12_H-768_A-12/vocab.txt       --bert_config_file=/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/models/uncased_L-12_H-768_A-12/bert_config.json       --init_checkpoint=/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/models/uncased_L-12_H-768_A-12/bert_model.ckpt       --layers=-1       --max_seq_length=256       --batch_size=8
python /Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/bert/extract_features.

In [153]:
# Load the BERT output for every split; the start/finish timestamps show how long
# the read takes.
print("Started at ", time.ctime())
train_bert_features, valid_bert_features, test_bert_features = (
    bert_model_utils.read_in_bert_features(dataset_name=split)
    for split in ("train", "valid", "test")
)
print("Finished at ", time.ctime())

Started at  Tue Apr  2 09:58:59 2019
Finished at  Tue Apr  2 10:04:39 2019


In [154]:
# Build one word-embedding dataframe per split from its raw dataframe and BERT output.
# NOTE(review): the recorded output of this cell shows warnings on `emb_A /= cnt_A`
# and `emb_B /= cnt_B` -- presumably zero counts yielding NaN values that the later
# imputation step fills; confirm.
embedding_inputs = (
    ("train", df_train, train_bert_features),
    ("valid", df_valid, valid_bert_features),
    ("test", df_test, test_bert_features),
)

print("Started at ", time.ctime())
df_train_emb, df_valid_emb, df_test_emb = (
    bert_model_utils.create_bert_word_embedding_df(
        df=frame, bert_output=features, dataset_name=split)
    for split, frame, features in embedding_inputs
)
print("Finished at ", time.ctime())

Started at  Tue Apr  2 10:10:16 2019


  emb_B /= cnt_B
  emb_A /= cnt_A


Finished at  Tue Apr  2 10:10:42 2019


In [156]:
# Persist each split's embedding dataframe as a column-oriented JSON file in the
# interim directory.
for split_name, emb_frame in (
    ("train", df_train_emb),
    ("valid", df_valid_emb),
    ("test", df_test_emb),
):
    emb_frame.to_json(
        data_interim_dir + "bert_contextual_embeddings_gap_" + split_name + ".json",
        orient='columns',
    )

# Create Clean Data

In [159]:
def _load_embeddings(split_name):
    """Read one split's interim embedding JSON file and parse it.

    Returns whatever bert_model_utils.parse_json returns for that file; the calls
    below unpack it as a (features, labels) pair.
    """
    json_path = data_interim_dir + "bert_contextual_embeddings_gap_" + split_name + ".json"
    return bert_model_utils.parse_json(pd.read_json(json_path))


# Read the interim files back and split each into features (X) and labels (Y).
X_train, Y_train = _load_embeddings("train")
X_valid, Y_valid = _load_embeddings("valid")
X_test, Y_test = _load_embeddings("test")

In [173]:
# Replace NaN entries with per-column means. The imputer is fitted on the training
# features only, and the same fitted means are then applied to every split.
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train)

X_train_fill, X_valid_fill, X_test_fill = (
    imp.transform(features) for features in (X_train, X_valid, X_test)
)

In [None]:
# Write the imputed features and the labels to comma-separated files in the clean
# directory; keys are the file stems and insertion order fixes the write order.
clean_outputs = {
    # Clean features
    "X_train": X_train_fill,
    "X_valid": X_valid_fill,
    "X_test": X_test_fill,
    # Clean labels
    "Y_train": Y_train,
    "Y_valid": Y_valid,
    "Y_test": Y_test,
}
for file_stem, values in clean_outputs.items():
    np.savetxt(data_clean_dir + file_stem + ".csv", values, delimiter=",")