In [133]:
import numpy as np, pandas as pd
import json
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')

In [134]:
# !conda update pandas --y

In [135]:
# Load the training table; later cells read its "context", "question" and "text" columns.
train = pd.read_csv("data/train.csv")

In [136]:
# Unique context paragraphs, kept in order of first appearance.
paras = list(train["context"].drop_duplicates())

In [137]:
# Number of unique context paragraphs.
len(paras)

5068

In [138]:
# Join every paragraph into one text and let TextBlob split it into sentences;
# only the raw sentence strings are kept.
blob = TextBlob(" ".join(paras))
sentences = list(map(lambda sent: sent.raw, blob.sentences))

In [139]:
# Number of sentences found in the paragraphs.
len(sentences)

20075

In [140]:
# Apply the same sentence splitting to the questions. After this cell `questions`
# holds TextBlob sentence strings, not one entry per row of `train`.
questions = list(train["question"])
blob = TextBlob(" ".join(questions))
questions = list(map(lambda sent: sent.raw, blob.sentences))

In [141]:
# Add the question sentences to the paragraph sentences in one call.
# NOTE: this mutates `sentences` in place, so re-running the cell adds them again.
sentences.extend(questions)

In [142]:
# Sentence count after the question sentences were added.
len(sentences)

42692

### Building Vocabulary

In [143]:
# NOTE(review): torch.load unpickles the stored object, so only load a model file from a trusted source.
# The map_location callable returns each storage unchanged - presumably to keep the model on CPU; confirm.
infersent = torch.load('InferSent/encoder/infersent.allnli.pickle', map_location=lambda storage, loc: storage)
infersent.set_glove_path("InferSent/dataset/GloVe/glove.840B.300d.txt")
# The vocabulary is built from `questions` only, not from the combined `sentences` list.
infersent.build_vocab(questions, tokenize=True)

Found 17779(/18913) words with glove vectors
Vocab size : 17779


### Loading Embedding dictionary

In [144]:
# Load the precomputed embedding dictionary; process_data looks sentences up in it by their text.
# NOTE(review): pickle.load can execute arbitrary code - only open a file you trust.
with open("data/dict_embeddings.pickle", "rb") as f:
    dict_emb = pickle.load(f)

## Data Processing

In [145]:
def get_target(x):
    """Return the index of the sentence in ``x["sentences"]`` that contains ``x["text"]``.

    ``x`` is any mapping-like row with a ``"text"`` string and a ``"sentences"``
    sequence of strings. If several sentences contain the text, the highest
    matching index is returned; if none does, the result is -1.
    """
    matches = [
        position
        for position, sentence in enumerate(x["sentences"])
        if x["text"] in sentence
    ]
    # The last hit wins, matching a full forward scan that keeps overwriting.
    return matches[-1] if matches else -1

In [146]:
# Preview the first three rows of the raw table.
train.head(3)

Unnamed: 0,answer_start,context,question,text
0,515,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous
1,188,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ
2,279,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building


In [147]:
def process_data(train):
    """Add sentence, target and embedding columns to ``train`` and return it.

    The frame is modified in place. Columns written:

    * ``sentences`` - ``context`` split into sentence strings with TextBlob.
    * ``target``    - per-row result of ``get_target``.
    * ``sent_emb``  - one vector per sentence, looked up in ``dict_emb``;
      a sentence that is not a key of ``dict_emb`` gets ``np.zeros(4096)``.
    * ``quest_emb`` - ``infersent.encode`` applied to the single question.
    """
    def split_sentences(context):
        return [sent.raw for sent in TextBlob(context).sentences]

    def embed_sentences(sentence_list):
        vectors = []
        for sentence in sentence_list:
            if sentence in dict_emb:
                vectors.append(dict_emb[sentence][0])
            else:
                # No stored embedding for this sentence: fall back to zeros.
                vectors.append(np.zeros(4096))
        return vectors

    def embed_question(question):
        return infersent.encode([question], tokenize=True)

    train['sentences'] = train['context'].apply(split_sentences)
    train["target"] = train.apply(get_target, axis=1)
    train['sent_emb'] = train['sentences'].apply(embed_sentences)
    train['quest_emb'] = train['question'].apply(embed_question)

    return train

In [None]:
# Add the sentence, target and embedding columns (process_data modifies and returns the same frame).
train = process_data(train)

In [None]:
# Preview the first three rows with the new columns.
train.head(3)

## Predicted Cosine & Euclidean Index

In [None]:
def cosine_sim(x):
    """Cosine distance from the question vector to each sentence vector of a row.

    Despite the name, the values come from ``scipy.spatial.distance.cosine``,
    i.e. they are distances (smaller means closer). ``x["quest_emb"][0]`` is
    used as the question vector and the result is a list in the same order
    as ``x["sent_emb"]``.
    """
    return [
        spatial.distance.cosine(sentence_vec, x["quest_emb"][0])
        for sentence_vec in x["sent_emb"]
    ]

In [None]:
def pred_idx(distances):
    """Return the index of the smallest distance, never preferring a NaN entry.

    ``np.argmin`` alone returns the position of the first NaN it meets. A
    sentence with no stored embedding is represented by an all-zero vector in
    ``process_data``, and its cosine distance is NaN, so such a sentence
    would otherwise be picked as the prediction. NaN entries are treated as
    ``+inf`` here.

    Parameters
    ----------
    distances : sequence or array of float
        One distance per candidate sentence.

    Returns
    -------
    numpy integer
        Index of the smallest non-NaN distance. If every entry is NaN the
        result is 0, as it was before.

    Raises
    ------
    ValueError
        If ``distances`` is empty (raised by ``np.argmin``).
    """
    values = np.asarray(distances, dtype=float)
    # Replace NaN by +inf so it can only win when nothing else is available.
    comparable = np.where(np.isnan(values), np.inf, values)
    return np.argmin(comparable)

In [None]:
def predictions(train):
    """Add distance and predicted-index columns to ``train`` and return it.

    The frame is modified in place. Columns written:

    * ``cosine_sim``    - per-row list returned by ``cosine_sim``.
    * ``euclidean_dis`` - per-row sums over axis 1 of the squared difference
      between ``quest_emb`` and ``sent_emb`` (no square root is taken).
    * ``pred_idx_cos`` / ``pred_idx_euc`` - ``pred_idx`` of the respective
      distance column.

    A scratch column named ``diff`` is created and removed again.
    """
    train["cosine_sim"] = train.apply(cosine_sim, axis=1)

    # assumes each quest_emb entry broadcasts against the row's sentence vectors - TODO confirm
    squared_gap = (train["quest_emb"] - train["sent_emb"]) ** 2
    train["diff"] = squared_gap
    train["euclidean_dis"] = train["diff"].apply(lambda gap: np.sum(gap, axis=1))
    del train["diff"]

    # Predicted sentence = position of the smallest distance in each row.
    column_pairs = (("cosine_sim", "pred_idx_cos"), ("euclidean_dis", "pred_idx_euc"))
    for distance_col, index_col in column_pairs:
        train[index_col] = train[distance_col].apply(lambda dists: pred_idx(dists))

    return train
    

In [None]:
# predictions() modifies and returns its argument, so `predicted` and `train` are the same frame.
predicted = predictions(train)

In [None]:
# Preview the frame with the distance and predicted-index columns.
predicted.head()

## Accuracy

In [None]:
def accuracy(target, predicted):
    """Return the fraction of positions where ``predicted`` equals ``target``.

    Both arguments must support element-wise ``==`` (for example pandas
    Series or NumPy arrays of equal length).
    """
    matches = target == predicted
    n_correct = matches.sum()
    n_total = len(target)
    return n_correct / n_total

### Accuracy for Euclidean Distance

In [None]:
# Share of rows whose Euclidean-based prediction equals the target sentence index.
print(accuracy(predicted["target"], predicted["pred_idx_euc"]))

### Accuracy for Cosine Similarity

In [None]:
# Share of rows whose cosine-based prediction equals the target sentence index.
print(accuracy(predicted["target"], predicted["pred_idx_cos"]))

In [None]:
# Save the frame with all derived columns.
# NOTE(review): the list/array-valued columns are presumably written as text and will not
# come back as arrays from read_csv - confirm before relying on reloaded embeddings.
# NOTE(review): index=None here, index=False in the later save of the same file - confirm both omit the index.
predicted.to_csv("train_detect_sent.csv", index=None)

### Finding lengths of paras

In [None]:
# Reload the saved frame; this replaces the in-memory `train`.
train = pd.read_csv("train_detect_sent.csv")

In [None]:
# (rows, columns) of the reloaded frame.
train.shape

In [46]:
# Split each context into its TextBlob sentences and store the raw strings in "sentence".
train['sentence'] = train['context'].apply(
    lambda context: [sent.raw for sent in TextBlob(context).sentences]
)

In [47]:
# Sentences extracted from the first row's context.
train['sentence'][0]

['Architecturally, the school has a Catholic character.',
 "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".',
 'Next to the Main Building is the Basilica of the Sacred Heart.',
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.',
 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.',
 'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.']

In [None]:
# Overwrite the saved file, now including the "sentence" column, without the index.
# NOTE(review): "sentence" holds Python lists, which CSV presumably stores as text - confirm on reload.
train.to_csv("train_detect_sent.csv", index= False)

In [None]:
# Rebuild every derived column on the reloaded frame with the helpers defined
# earlier in this notebook, instead of re-deriving each column by hand.
# process_data recreates "sentences", "target", "sent_emb" and "quest_emb" from
# the text columns; predictions then adds "cosine_sim", "euclidean_dis",
# "pred_idx_cos" and "pred_idx_euc".
# NOTE(review): the embedding columns read back from the CSV are presumably
# text rather than arrays, which is why they are recomputed here - confirm.
train = predictions(process_data(train))