# Setup

In [1]:
import os
import zipfile
import sys
import time

import numpy as np
import pandas as pd
import tensorflow as tf

  return f(*args, **kwds)


In [18]:
# Project root. Defaults to the original author's checkout; set the GAP_PROJ_PATH
# environment variable to run the notebook from a different location.
proj_path = os.environ.get(
    "GAP_PROJ_PATH",
    "/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/",
)
# Later cells build paths by plain string concatenation, so guarantee the trailing separator.
proj_path = os.path.join(proj_path, "")
# Put the project root on the import path; skip the insert on re-runs so that
# sys.path does not accumulate duplicate entries.
if proj_path not in sys.path:
    sys.path.insert(0, proj_path)

In [68]:
import bert.modeling
import bert.extract_features
import bert.tokenization

import src.utils as utils
import src.data.data_utils as data_utils

In [19]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before running
# a cell), so edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Let wide frames render (up to 999 columns) instead of being elided in the output.
pd.set_option("display.max_columns", 999)

In [23]:
# Sub-directories of the project root. Each keeps its trailing slash because
# later cells append file names to them by string concatenation.
models_dir, bert_dir, data_interim_dir = (
    proj_path + subdir for subdir in ("models/", "bert/", "data/interim/")
)

# Read Data

In [8]:
# Load the three tab-separated GAP splits. Note that the "development" file
# is the one used as the training frame.
raw_data_dir = proj_path + "data/raw/"
train_df, valid_df, test_df = (
    pd.read_csv(raw_data_dir + file_name, sep='\t')
    for file_name in ("gap-development.tsv", "gap-validation.tsv", "gap-test.tsv")
)

In [9]:
# Preview every split with the project's display helper (train, test, then validation).
for split_df in (train_df, test_df, valid_df):
    utils.display_df(split_df)

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,development-1,Zoe Telford -- played the police officer girlf...,her,274,Cheryl Cassidy,191,True,Pauline,207,False,http://en.wikipedia.org/wiki/List_of_Teachers_...


(2000, 11)

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner


(2000, 11)

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,validation-1,He admitted making four trips to China and pla...,him,256,Jose de Venecia Jr,208,False,Abalos,241,False,http://en.wikipedia.org/wiki/Commission_on_Ele...


(454, 11)

In [10]:
# Show the full text of the first training passage (row label 0).
train_df.loc[0, "Text"]

"Zoe Telford -- played the police officer girlfriend of Simon, Maggie. Dumped by Simon in the final episode of series 1, after he slept with Jenny, and is not seen again. Phoebe Thomas played Cheryl Cassidy, Pauline's friend and also a year 11 pupil in Simon's class. Dumped her boyfriend following Simon's advice after he wouldn't have sex with her but later realised this was due to him catching crabs off her friend Pauline."

# Modeling

In [36]:
# Specify which split to embed; the cells below read their input from `data`.
data = valid_df

In [37]:
# Files exchanged with BERT's feature extractor, and the pretrained model directory.
bert_input_path = data_interim_dir + "input.txt"
bert_output_path = data_interim_dir + "output.json"
bert_model_dir = models_dir + "uncased_L-12_H-768_A-12/"

# Write the passages to disk, one per line, with no index and no header.
# NOTE(review): to_csv applies its default CSV quoting, so a passage containing a
# comma or a double quote is written wrapped in double quotes -- confirm that the
# downstream token/offset alignment expects this.
text = data["Text"]
text.to_csv(bert_input_path, index=False, header=False)

# Assemble the shell command for extract_features.py: last layer only, sequences
# capped at 256 tokens, batches of 8.
extract_features_args = [
    "python " + bert_dir + "extract_features.py",
    "--input_file=" + bert_input_path,
    "--output_file=" + bert_output_path,
    "--vocab_file=" + bert_model_dir + "vocab.txt",
    "--bert_config_file=" + bert_model_dir + "bert_config.json",
    "--init_checkpoint=" + bert_model_dir + "bert_model.ckpt",
    "--layers=-1",
    "--max_seq_length=256",
    "--batch_size=8",
]
# Arguments are separated by three spaces, which is the exact spacing the command string has always had.
extract_features_cmd = "   ".join(extract_features_args)

In [38]:
# Display the assembled command. Execute it in a terminal before continuing:
# the next cell reads the output.json file that the command writes.
extract_features_cmd

'python /Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/bert/extract_features.py   --input_file=/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/data/interim/input.txt   --output_file=/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/data/interim/output.json   --vocab_file=/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/models/uncased_L-12_H-768_A-12/vocab.txt   --bert_config_file=/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/models/uncased_L-12_H-768_A-12/bert_config.json   --init_checkpoint=/Users/aarontrefler_temp2/Documents/My_Documents/Kaggle/kaggle-gendered-pronoun/models/uncased_L-12_H-768_A-12/bert_model.ckpt   --layers=-1   --max_seq_length=256   --batch_size=8'

In [69]:
# Load the features written by extract_features.py: one JSON record per line of input.txt.
bert_output = pd.read_json(data_interim_dir + "output.json", lines = True)

# The loop below pairs row i of `data` with record i of `bert_output`. Without this
# check, a stale or partial output.json would silently attach embeddings to the wrong rows.
if len(bert_output) != len(data):
    raise ValueError(
        "output.json has {} records but data has {} rows; re-run extract_features.py "
        "on the current input.txt".format(len(bert_output), len(data))
    )

index = data.index
columns = ["emb_A", "emb_B", "emb_P", "label"]

rows = []
for i in range(len(data)):  # One iteration per passage in the data file
    # Positional access on both frames, so the pairing does not depend on the index labels of `data`.
    example = data.iloc[i]
    # BERT features for the current passage: a list with one {"token": ..., "layers": [...]} dict per token.
    features = bert_output["features"].iloc[i]
    passage = example["Text"]

    # For each mention, find the offset not counting spaces. This is necessary for comparison with the output of BERT.
    P_offset = data_utils.compute_offset_no_spaces(passage, example["Pronoun-offset"])
    A_offset = data_utils.compute_offset_no_spaces(passage, example["A-offset"])
    B_offset = data_utils.compute_offset_no_spaces(passage, example["B-offset"])

    # Length of A and B, not counting spaces or special characters. The names are
    # lower-cased first, since we're using the uncased version of BERT.
    # NOTE(review): count_length_no_special is assumed to live in data_utils next to
    # compute_offset_no_spaces (the notebook never defines or imports it) -- confirm.
    A_length = data_utils.count_length_no_special(example["A"].lower())
    B_length = data_utils.count_length_no_special(example["B"].lower())

    # Embedding width is read from the features themselves (768 for the BERT-base model used here).
    hidden_size = len(features[0]["layers"][0]["values"])
    emb_A = np.zeros(hidden_size)
    emb_B = np.zeros(hidden_size)
    emb_P = np.zeros(hidden_size)
    cnt_A, cnt_B = 0, 0
    count_chars = 0  # Counted characters consumed by the tokens seen so far

    # Iterate over the BERT tokens of the passage; we skip over the first 2 tokens, which don't correspond to words.
    # NOTE(review): presumably [CLS] plus the opening quote that to_csv adds around quoted
    # lines -- verify; a line written without quoting would lose its first real token here.
    for token_info in features[2:]:
        # See if the character count until the current token matches any of the 3 target mentions.
        at_P = count_chars == P_offset
        in_A = A_offset <= count_chars < A_offset + A_length
        in_B = B_offset <= count_chars < B_offset + B_length
        if at_P or in_A or in_B:
            # Only materialise the vector for tokens that are actually used.
            vector = np.array(token_info["layers"][0]["values"])
            if at_P:
                emb_P += vector
            if in_A:
                emb_A += vector
                cnt_A += 1
            if in_B:
                emb_B += vector
                cnt_B += 1
        # Update the character count
        count_chars += data_utils.count_length_no_special(token_info["token"])

    # Average over the tokens in the span of A and of B (the pronoun vector is kept as accumulated).
    # A mention that matched no token is recorded as NaN, which is what the former 0/0 division
    # produced, but now without the RuntimeWarning.
    emb_A = emb_A / cnt_A if cnt_A else np.full(hidden_size, np.nan)
    emb_B = emb_B / cnt_B if cnt_B else np.full(hidden_size, np.nan)

    # Work out the label of the current piece of text; B wins if both flags are set.
    label = "Neither"
    if example["A-coref"] == True:
        label = "A"
    if example["B-coref"] == True:
        label = "B"

    rows.append([emb_A, emb_B, emb_P, label])

# Build the result frame once from the collected rows. Index.rename returns a new
# Index, so the index of `data` itself is not renamed as a side effect.
emb = pd.DataFrame(rows, index = index.rename("ID"), columns = columns)


In [70]:
# Preview the first rows of the embedding table built in the previous cell.
emb.head()

Unnamed: 0_level_0,emb_A,emb_B,emb_P,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[0.022817833333333315, -0.4209143333333332, 0....","[-0.3670055, -0.4148985, 0.5928515, 0.31750449...","[-0.123571, -0.16237000000000001, 0.040803, -0...",Neither
1,"[-0.49702399999999997, -0.457437, 0.176571, -0...","[-0.862622, 0.055244999999999995, 0.7887909999...","[-0.035507, -0.293537, -0.376274, -0.156367, 1...",B
2,"[0.067909, -0.3523786666666666, 0.296469, -0.4...","[0.975849, -1.198573, 0.427452, 0.289501, 0.73...","[-0.084656, -0.338914, 0.096026, -0.269828, 0....",B
3,"[0.06080499999999999, -0.21616800000000003, 0....","[-0.49502033333333334, -0.056551999999999984, ...","[-0.044270000000000004, 0.393868, 0.5250009999...",A
4,"[-0.009341999999999998, -0.13718666666666665, ...","[0.16866633333333333, -0.7929363333333334, 0.1...","[-0.393549, -0.395325, 0.112192, -0.1078519999...",B
