In [46]:
import pandas as pd
import spacy
import numpy as np
spacy_model = "en_core_web_sm"
nlp = spacy.load(spacy_model)

In [2]:
gap_train= pd.read_pickle('./temp_result/train_kaggle_cropped')
gap_test= pd.read_pickle('./temp_result/test_kaggle_cropped')
gap_valid= pd.read_pickle('./temp_result/valid_kaggle_cropped')

In [3]:
def get_token_map(sentence,token_list):
    token_map = {}
    i = 0
    #print (token_list)
    token_list = token_list[1:-1]
    #print (token_list)
    for t in token_list:
        #print (i)
        if t!= "#":
            t = t.strip("#")
        while sentence[i:i+len(t)].lower()!=t:
            #print (sentence[i:i+len(t)].lower())
            i = i + 1
        token_map[i] = t
        #print (token_map)
        i = i + len(t)
    return token_map

In [19]:
def get_sentence_map(sentence):
    doc = nlp(sentence)
    sentence_map = {}
    i = 0

    for s in doc.sents:
        s = str(s)
        while sentence[i:i+len(s)]!=s:
            i = i + 1
        sentence_map[i] = s
        i = i + len(s)
    return sentence_map

In [4]:
gap_train['token_map'] = gap_train.apply(lambda x: get_token_map(x.Text, x.tokens), axis=1)
gap_test['token_map'] = gap_test.apply(lambda x: get_token_map(x.Text, x.tokens), axis=1)
gap_valid['token_map'] = gap_valid.apply(lambda x: get_token_map(x.Text, x.tokens), axis=1)

In [20]:
gap_train['sentence_map'] = gap_train.Text.map(get_sentence_map)
gap_test['sentence_map'] = gap_test.Text.map(get_sentence_map)
gap_valid['sentence_map'] = gap_valid.Text.map(get_sentence_map)

In [30]:
def get_distance(sentence,A,B):
    start = min(A,B)
    end = max(A,B)
    dist = nlp(sentence[start:end])
    return (B-A)/abs(B-A)*len(dist)/500

In [32]:
gap_train['A_dist'] = gap_train.apply(lambda x: get_distance(x.Text, x["A-offset"],x['Pronoun-offset']), axis=1)
gap_test['A_dist'] = gap_test.apply(lambda x: get_distance(x.Text, x["A-offset"],x['Pronoun-offset']), axis=1)
gap_valid['A_dist'] = gap_valid.apply(lambda x: get_distance(x.Text, x["A-offset"],x['Pronoun-offset']), axis=1)
gap_train['B_dist'] = gap_train.apply(lambda x: get_distance(x.Text, x["B-offset"],x['Pronoun-offset']), axis=1)
gap_test['B_dist'] = gap_test.apply(lambda x: get_distance(x.Text, x["B-offset"],x['Pronoun-offset']), axis=1)
gap_valid['B_dist'] = gap_valid.apply(lambda x: get_distance(x.Text, x["B-offset"],x['Pronoun-offset']), axis=1)

In [34]:
def get_relative_pos(sentence,offset,sentence_map):
    for i in sorted(sentence_map.keys()):
        if offset >= i:
            break
    return len(nlp(sentence[i:offset]))/len(sentence_map[i])

In [35]:
gap_train['A_pos'] = gap_train.apply(lambda x: get_relative_pos(x.Text, x["A-offset"],x['sentence_map']), axis=1)
gap_test['A_pos'] = gap_test.apply(lambda x: get_relative_pos(x.Text, x["A-offset"],x['sentence_map']), axis=1)
gap_valid['A_pos'] = gap_valid.apply(lambda x: get_relative_pos(x.Text, x["A-offset"],x['sentence_map']), axis=1)
gap_train['B_pos'] = gap_train.apply(lambda x: get_relative_pos(x.Text, x["B-offset"],x['sentence_map']), axis=1)
gap_test['B_pos'] = gap_test.apply(lambda x: get_relative_pos(x.Text, x["B-offset"],x['sentence_map']), axis=1)
gap_valid['B_pos'] = gap_valid.apply(lambda x: get_relative_pos(x.Text, x["B-offset"],x['sentence_map']), axis=1)
gap_train['pron_pos'] = gap_train.apply(lambda x: get_relative_pos(x.Text, x["Pronoun-offset"],x['sentence_map']), axis=1)
gap_test['pron_pos'] = gap_test.apply(lambda x: get_relative_pos(x.Text, x["Pronoun-offset"],x['sentence_map']), axis=1)
gap_valid['pron_pos'] = gap_valid.apply(lambda x: get_relative_pos(x.Text, x["Pronoun-offset"],x['sentence_map']), axis=1)

In [47]:
def get_vector_index(name,offset,token_map):
    name = "".join(name.lower().split(" "))
    idx = 0
    s = ""
    res = []
    for i in sorted(token_map.keys()):
        idx = idx + 1
        if i < offset:
            continue
        else:
            s = s+token_map[i]
            res.append(idx)
            if s == name:
                break
    return np.array(res)        

In [52]:
gap_train['A_idx'] = gap_train.apply(lambda x: get_vector_index(x.A, x["A-offset"],x['token_map']), axis=1)
gap_test['A_idx'] = gap_test.apply(lambda x: get_vector_index(x.A, x["A-offset"],x['token_map']), axis=1)
gap_valid['A_idx'] = gap_valid.apply(lambda x: get_vector_index(x.A, x["A-offset"],x['token_map']), axis=1)
gap_train['B_idx'] = gap_train.apply(lambda x: get_vector_index(x.B, x["B-offset"],x['token_map']), axis=1)
gap_test['B_idx'] = gap_test.apply(lambda x: get_vector_index(x.B, x["B-offset"],x['token_map']), axis=1)
gap_valid['B_idx'] = gap_valid.apply(lambda x: get_vector_index(x.B, x["B-offset"],x['token_map']), axis=1)
gap_train['pron_idx'] = gap_train.apply(lambda x: get_vector_index(x.Pronoun, x["Pronoun-offset"],x['token_map']), axis=1)
gap_test['pron_idx'] = gap_test.apply(lambda x: get_vector_index(x.Pronoun, x["Pronoun-offset"],x['token_map']), axis=1)
gap_valid['pron_idx'] = gap_valid.apply(lambda x: get_vector_index(x.Pronoun, x["Pronoun-offset"],x['token_map']), axis=1)

In [54]:
gap_train['A_vector'] = gap_train.apply(lambda x: x["vector"][x['A_idx'],:], axis=1)
gap_test['A_vector'] = gap_test.apply(lambda x: x["vector"][x['A_idx'],:], axis=1)
gap_valid['A_vector'] = gap_valid.apply(lambda x: x["vector"][x['A_idx'],:], axis=1)
gap_train['B_vector'] = gap_train.apply(lambda x: x["vector"][x['B_idx'],:], axis=1)
gap_test['B_vector'] = gap_test.apply(lambda x: x["vector"][x['B_idx'],:], axis=1)
gap_valid['B_vector'] = gap_valid.apply(lambda x: x["vector"][x['B_idx'],:], axis=1)
gap_train['pron_vector'] = gap_train.apply(lambda x: x["vector"][x['pron_idx'],:], axis=1)
gap_test['pron_vector'] = gap_test.apply(lambda x: x["vector"][x['pron_idx'],:], axis=1)
gap_valid['pron_vector'] = gap_valid.apply(lambda x: x["vector"][x['pron_idx'],:], axis=1)

In [56]:
gap_train.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,...,B_dist,A_pos,B_pos,pron_pos,A_idx,B_idx,pron_idx,A_vector,B_vector,pron_vector
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,...,0.008,0.444444,0.464052,0.490196,"[96, 97, 98, 99]","[101, 102, 103]",[109],"[[-0.23477754, -0.6348008, -0.22518, -0.033234...","[[-0.4742566, 0.17008828, -0.39835194, -0.6357...","[[-0.58112746, -0.31262234, 0.6013951, -0.5986..."
1,test-2,"Between the years 1979-1981, River won four lo...",him,430,Alonso,353,True,Alfredo Di St*fano,390,False,...,0.014,0.253623,0.275362,0.300725,"[92, 93, 94]","[100, 101, 102, 103, 104, 105, 106, 107, 108]",[113],"[[-0.37184137, -1.0916334, -0.3613571, -0.4031...","[[0.02601213, -0.046142176, 0.22424094, -0.601...","[[-0.311719, 0.7609918, 0.34107757, 0.17696574..."
2,test-3,Though his emigration from the country has aff...,He,312,Ali Aladhadh,256,True,Saddam,295,False,...,0.008,0.408333,0.475,0.508333,"[63, 64, 65, 66, 67, 68]","[75, 76]",[81],"[[0.27122185, -0.44208205, -0.7428727, 0.18834...","[[-0.637481, 0.540418, 0.4081558, -1.191327, 0...","[[-0.08557334, 0.08147873, 0.049174372, 0.4888..."
3,test-4,"At the trial, Pisciotta said: ``Those who have...",his,526,Alliata,377,False,Pisciotta,536,True,...,-0.004,0.222951,0.321311,0.314754,"[108, 109, 110]","[143, 144, 145, 146]",[141],"[[-0.2239024, 0.60971785, -0.035572127, -1.431...","[[0.970224, -0.96855867, -0.72124153, -0.06507...","[[-0.40841633, 0.23993859, 0.16797814, -0.2843..."
4,test-5,It is about a pair of United States Navy shore...,his,406,Eddie,421,True,Rock Reilly,559,False,...,-0.056,0.473118,0.607527,0.456989,"[113, 114]","[142, 143, 144, 145]",[110],"[[0.16017668, 0.596208, 0.15257946, -0.0950187...","[[0.31907678, 0.81342745, -1.005262, 0.2917817...","[[-0.6242132, 0.7426739, -0.61988443, -0.65061..."


In [57]:
gap_train.to_pickle('./temp_result/train_kaggle_processed')
gap_test.to_pickle('./temp_result/test_kaggle_processed')
gap_valid.to_pickle('./temp_result/valid_kaggle_processed')

In [58]:
gap_train.columns
# vector all bert vectors
# tokens the list of tokens
# token_map  a dict from the position of the tokens to tokens
# sentence_map a dict from the position of the entence to entence
# A_dist A_dist = (the number of words between the A and the pron)/500. If the pron. appears after the A, then it is positive and vice versa.
# A_pos (the number of words between the A and the head of the sentence contains A)/the number of the words of the sentence contains A
# A_idx the index of the vector of A
# A_vector the bert vector which corresponds to A

Index(['ID', 'Text', 'Pronoun', 'Pronoun-offset', 'A', 'A-offset', 'A-coref',
       'B', 'B-offset', 'B-coref', 'URL', 'vector', 'tokens', 'token_map',
       'sentence_map', 'A_dist', 'B_dist', 'A_pos', 'B_pos', 'pron_pos',
       'A_idx', 'B_idx', 'pron_idx', 'A_vector', 'B_vector', 'pron_vector'],
      dtype='object')