In [1]:
import numpy as np, pandas as pd
import json
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')

In [2]:
# !conda update pandas --y

In [3]:
# Load the training table; later cells read its `context`, `question` and `text` columns.
train = pd.read_csv("data/train.csv")

In [4]:
train.shape

(87599, 4)

In [7]:
# paras = list(train["context"].drop_duplicates().reset_index(drop= True))

In [8]:
# len(paras)

In [9]:
# blob = TextBlob(" ".join(paras))
# sentences = [item.raw for item in blob.sentences]

In [10]:
# len(sentences)

In [11]:
# questions = list(train["question"])
# blob = TextBlob(" ".join(questions))
# questions = [item.raw for item in blob.sentences]

In [12]:
# for i in questions: sentences.append(i)

In [13]:
# len(questions)

### Building Vocabulary

In [14]:
# infersent = torch.load('InferSent/encoder/infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# infersent.set_glove_path("InferSent/dataset/GloVe/glove.840B.300d.txt")
# infersent.build_vocab(sentences, tokenize=True)

### Loading Embedding dictionary

In [15]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling,
# so only load this file if it comes from a trusted source.
with open("data/dict_embeddings1.pickle", "rb") as f:
    d1 = pickle.load(f)  # first partial lookup; merged into dict_emb by a later cell

In [16]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling,
# so only load this file if it comes from a trusted source.
with open("data/dict_embeddings2.pickle", "rb") as f:
    d2 = pickle.load(f)  # second partial lookup; merged into dict_emb by a later cell

In [17]:
# Combine both partial lookups into one dictionary. On duplicate keys the
# value from d2 replaces the one copied from d1.
dict_emb = dict(d1)
dict_emb.update(d2)

In [18]:
len(dict_emb)

179862

In [19]:
# Drop the two partial dictionaries; dict_emb holds its own references to the entries.
del d1, d2

## Data Processing

In [20]:
def get_target(x):
    """Locate the sentence that contains the answer text.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``x["sentences"]`` (a sequence of sentence strings) and
        ``x["text"]`` (the answer string).

    Returns
    -------
    int
        Index of the last sentence in which the answer text occurs as a
        substring, or -1 when no sentence contains it.
    """
    hits = [pos for pos, sentence in enumerate(x["sentences"]) if x["text"] in sentence]
    # The last hit wins when the answer text appears in several sentences.
    return hits[-1] if hits else -1

In [21]:
train.head(3)

Unnamed: 0,answer_start,context,question,text
0,515,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous
1,188,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ
2,279,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building


In [22]:
train.shape

(87599, 4)

In [23]:
# Discard rows that have a missing value in any column.
train = train.dropna()

In [24]:
train.shape

(87598, 4)

In [31]:
def process_data(train, emb_dim=4096):
    """Add sentence splits, answer targets and embeddings to ``train``.

    The frame is modified in place and also returned. Four columns are added:

    * ``sentences`` - the ``context`` text split into sentences with TextBlob.
    * ``target``    - result of ``get_target`` for the row.
    * ``sent_emb``  - one vector per sentence, ``dict_emb[sentence][0]``.
    * ``quest_emb`` - ``dict_emb[question]`` exactly as stored.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs ``context``, ``question`` and ``text`` columns.
    emb_dim : int, default 4096
        Width of the zero vector used when a sentence or question has no
        entry in ``dict_emb``.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object with the new columns.

    Notes
    -----
    Sentence embeddings are stored as ``dict_emb[...][0]`` while question
    embeddings keep the leading axis, and downstream code reads
    ``quest_emb[0]``. The question fallback is therefore shaped
    ``(1, emb_dim)`` so that ``quest_emb[0]`` is always a vector.
    Assumes ``dict_emb`` values are ``(1, emb_dim)`` arrays - TODO confirm.
    """
    print("step 1")
    train['sentences'] = train['context'].apply(
        lambda context: [sentence.raw for sentence in TextBlob(context).sentences]
    )

    print("step 2")
    train["target"] = train.apply(get_target, axis=1)

    print("step 3")
    train['sent_emb'] = train['sentences'].apply(
        lambda sentences: [
            dict_emb[sentence][0] if sentence in dict_emb else np.zeros(emb_dim)
            for sentence in sentences
        ]
    )

    print("step 4")
    # Keep the leading axis on the fallback so it matches stored entries.
    train['quest_emb'] = train['question'].apply(
        lambda question: dict_emb[question] if question in dict_emb else np.zeros((1, emb_dim))
    )

    return train

In [32]:
# process_data adds the sentences/target/sent_emb/quest_emb columns to the frame
# it receives and returns that same frame.
train = process_data(train)

step 1
step 2
step 3
step 4


In [33]:
train.head(3)

Unnamed: 0,answer_start,context,question,text,sentences,quest_emb,target,sent_emb
0,515,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...","[[0.11010079, 0.11422941, 0.11560896, 0.054894...",5,"[[0.055199966, 0.05013141, 0.047870383, 0.0162..."
1,188,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...","[[0.10951651, 0.110306226, 0.052100066, 0.0305...",2,"[[0.055199966, 0.05013141, 0.047870383, 0.0162..."
2,279,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,"[Architecturally, the school has a Catholic ch...","[[0.011956469, 0.14930707, 0.026600493, 0.0527...",3,"[[0.055199966, 0.05013141, 0.047870383, 0.0162..."


## Predicted Cosine & Euclidean Index

In [34]:
def cosine_sim(x):
    """Cosine distance from the question embedding to every sentence embedding.

    Despite the name, the values come from ``scipy.spatial.distance.cosine``,
    which is a distance (1 - cosine similarity): smaller means more similar.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        ``x["sent_emb"]`` is an iterable of sentence vectors and
        ``x["quest_emb"][0]`` is the question vector.

    Returns
    -------
    list of float
        One distance per sentence, in sentence order.
    """
    return [
        spatial.distance.cosine(sentence_vec, x["quest_emb"][0])
        for sentence_vec in x["sent_emb"]
    ]

In [35]:
def pred_idx(distances):
    """Return the position of the smallest value in ``distances``.

    Thin wrapper around ``np.argmin``; with ties the first occurrence is returned.
    """
    closest = np.argmin(distances)
    return closest

In [36]:
def _squared_euclidean(row):
    """Squared Euclidean distance from a row's question embedding to each sentence embedding."""
    # The question embedding broadcasts against the stacked sentence
    # embeddings, giving one difference vector per sentence.
    delta = np.asarray(row["quest_emb"]) - np.asarray(row["sent_emb"])
    return list(np.sum(delta ** 2, axis=1))


def predictions(train):
    """Score every sentence against the question and pick the closest one.

    The frame is modified in place and also returned. Columns added:

    * ``cosine_sim``    - list of cosine distances, one per sentence
      (from ``cosine_sim``; smaller is closer).
    * ``euclidean_dis`` - list of squared Euclidean distances, one per
      sentence (no square root is taken, which does not change the argmin).
    * ``pred_idx_cos``  - index of the smallest cosine distance.
    * ``pred_idx_euc``  - index of the smallest squared Euclidean distance.

    Distances are computed row by row rather than through a temporary
    ``diff`` column. That avoids holding every per-sentence difference array
    for the whole frame in memory at once, and it no longer overwrites and
    deletes a ``diff`` column the caller may already have.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs ``quest_emb`` and ``sent_emb`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object with the new columns.
    """
    train["cosine_sim"] = train.apply(cosine_sim, axis=1)
    train["euclidean_dis"] = train.apply(_squared_euclidean, axis=1)

    train["pred_idx_cos"] = train["cosine_sim"].apply(pred_idx)
    train["pred_idx_euc"] = train["euclidean_dis"].apply(pred_idx)

    return train
    

In [37]:
# predictions() adds its columns to the frame it is given and returns that frame,
# so `predicted` and `train` refer to the same object.
predicted = predictions(train)

In [38]:
predicted.head(3)

Unnamed: 0,answer_start,context,question,text,sentences,quest_emb,target,sent_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc
0,515,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,"[Architecturally, the school has a Catholic ch...","[[0.11010079, 0.11422941, 0.11560896, 0.054894...",5,"[[0.055199966, 0.05013141, 0.047870383, 0.0162...","[0.424736299052452, 0.36405004106069117, 0.347...","[14.563858, 15.262212, 17.398178, 14.272491, 1...",5,5
1,188,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,"[Architecturally, the school has a Catholic ch...","[[0.10951651, 0.110306226, 0.052100066, 0.0305...",2,"[[0.055199966, 0.05013141, 0.047870383, 0.0162...","[0.45407456884452513, 0.32262004808444933, 0.3...","[12.889506, 12.285219, 16.843704, 8.361172, 11...",3,3
2,279,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,"[Architecturally, the school has a Catholic ch...","[[0.011956469, 0.14930707, 0.026600493, 0.0527...",3,"[[0.055199966, 0.05013141, 0.047870383, 0.0162...","[0.39585783692319865, 0.29170832145169434, 0.3...","[11.857297, 11.392319, 15.061656, 7.1847134, 8...",3,3


In [39]:
predicted["cosine_sim"][0]

[0.424736299052452,
 0.36405004106069117,
 0.3477550016687636,
 0.3942415731988862,
 0.37102476524939887,
 0.1856902254140269,
 0.35192069116776403]

In [40]:
predicted["euclidean_dis"][0]

[14.563858, 15.262212, 17.398178, 14.272491, 13.339654, 9.336262, 15.720997]

## Accuracy

In [41]:
def accuracy(target, predicted):
    """Fraction of positions at which ``target`` equals ``predicted``.

    Both arguments must support element-wise ``==`` whose result has a
    ``.sum()`` method (e.g. pandas Series or NumPy arrays).
    """
    n_correct = (target == predicted).sum()
    n_total = len(target)
    return n_correct / n_total

### Accuracy for Euclidean Distance

In [42]:
print(accuracy(predicted["target"], predicted["pred_idx_euc"]))

0.44856046941711


### Accuracy for Cosine Similarity

In [43]:
print(accuracy(predicted["target"], predicted["pred_idx_cos"]))

0.6338843352587958


In [44]:
# Write the scored frame to disk without the row index.
# NOTE(review): the embedding columns hold NumPy arrays, which CSV stores as
# their string form; long arrays are presumably abbreviated with "..." and so
# cannot be read back - confirm whether a binary format is needed here.
predicted.to_csv("train_detect_sent.csv", index=False)

In [None]:
# Row-wise recomputation of the target and prediction columns, using the
# helpers defined earlier in this notebook (get_target, cosine_sim, pred_idx).
# Row-wise functions need axis=1; the default axis=0 would pass whole columns.
train["target"] = train.apply(get_target, axis=1)

train["cosine_sim"] = train.apply(cosine_sim, axis=1)
# Squared Euclidean distance between the question and each sentence embedding.
train["euclidean_dis"] = train.apply(
    lambda x: list(np.sum((np.asarray(x["quest_emb"]) - np.asarray(x["sent_emb"])) ** 2, axis=1)),
    axis=1,
)
train["pred_idx_cos"] = train["cosine_sim"].apply(pred_idx)
train["pred_idx_euc"] = train["euclidean_dis"].apply(pred_idx)