In [42]:
import numpy as np, pandas as pd
import json
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training table; later cells read its "context", "question" and "text" columns.
train = pd.read_csv("data/train.csv")

In [3]:
# Distinct context paragraphs as a plain list (duplicates dropped, first occurrence kept).
paras = list(train["context"].drop_duplicates().reset_index(drop= True))

In [4]:
# Number of distinct context paragraphs.
len(paras)

5068

In [5]:
# Join all paragraphs into one space-separated string and let TextBlob split it;
# `sentences` keeps the raw text of each sentence it finds.
blob = TextBlob(" ".join(paras))
sentences = [item.raw for item in blob.sentences]

In [6]:
# Number of sentences TextBlob found across all paragraphs.
len(sentences)

20075

In [None]:
# Take every question, join them into one space-separated string and re-split it with
# TextBlob, so `questions` ends up as a list of raw sentence strings.
# Note: this rebinds the notebook-level `blob`.
questions = list(train["question"])
blob = TextBlob(" ".join(questions))
questions = [item.raw for item in blob.sentences]

In [None]:
# Add each question string to the corpus individually. `list.append` would insert the
# whole `questions` list as one nested element instead of as sentence strings.
sentences.extend(questions)

### Building Vocabulary

In [7]:
# Load the pickled InferSent encoder. The map_location callable returns each storage
# unchanged, which keeps the tensors on CPU.
# NOTE(review): torch.load unpickles the file and can execute arbitrary code; only load a trusted checkpoint.
infersent = torch.load('InferSent/encoder/infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# Point the encoder at the GloVe vectors, then build its vocabulary from the corpus sentences.
infersent.set_glove_path("InferSent/dataset/GloVe/glove.840B.300d.txt")
infersent.build_vocab(sentences, tokenize=True)

Found 37124(/42258) words with glove vectors
Vocab size : 37124


### Loading Embedding dictionary

In [48]:
# Load the precomputed embedding dictionary; later cells look entries up by sentence string.
# NOTE(review): pickle.load can execute arbitrary code; only open a trusted file.
with open("data/dict_embeddings.pickle", "rb") as f:
    dict_emb = pickle.load(f)

## Data Processing

In [None]:
def get_target(questions,sentences):
    """Return the index of the sentence that contains the given text.

    Parameters
    ----------
    questions : str
        Text to search for. Despite the name, the callers in this notebook
        pass the row's "text" column.
    sentences : list of str
        Candidate sentences.

    Returns
    -------
    int
        Index of the last sentence containing the text, or -1 when no
        sentence contains it (including when ``sentences`` is empty).
    """
    idx = -1
    for i, sentence in enumerate(sentences):
        # No early exit: a later match overwrites an earlier one.
        if questions in sentence:
            idx = i
    return idx

In [None]:
def process_data(train):
    """Add sentence, embedding and target columns to ``train`` and return it.

    The frame is modified in place. Columns written:

    * ``sentence``  - raw sentence strings TextBlob finds in ``context``.
    * ``sent_emb``  - one vector per sentence: ``dict_emb[sentence][0]`` when the
      sentence is a key of ``dict_emb``, otherwise a 4096-long zero vector.
    * ``quest_emb`` - output of ``infersent.encode`` for the single question.
    * ``target``    - ``get_target(text, sentence)`` for the row.

    Relies on the notebook-level ``dict_emb``, ``infersent`` and ``get_target``.
    """
    train['sentence'] = train['context'].apply(lambda x: [item.raw for item in TextBlob(x).sentences])

    train['sent_emb'] = train['sentence'].apply(
        lambda x: [dict_emb[item][0] if item in dict_emb else np.zeros(4096) for item in x]
    )
    train['quest_emb'] = train['question'].apply(lambda x: infersent.encode([x], tokenize=True))

    # axis=1 hands each row to the lambda. The default (axis=0) passes whole
    # columns, on which the "text" / "sentence" labels cannot be looked up.
    train["target"] = train.apply(lambda x: get_target(x["text"], x["sentence"]), axis=1)

    return train

## Predicted Cosine & Euclidean Index

In [None]:
def cosine_sim(quest_emb,sent_emb):
    """Cosine distance between the question vector and each sentence vector.

    ``quest_emb[0]`` is used as the question vector. The result is a list
    holding one ``scipy.spatial.distance.cosine`` value per entry of
    ``sent_emb``, in the same order. Despite the function name these are
    distances, so smaller means more similar.
    """
    return [
        spatial.distance.cosine(sentence_vec, quest_emb[0])
        for sentence_vec in sent_emb
    ]

In [None]:
def euclidean_dis(quest_emb,sent_emb):
    """Squared Euclidean distance from the question vector to each sentence vector.

    Parameters
    ----------
    quest_emb : array-like
        Question embedding; it is broadcast against ``sent_emb``.
    sent_emb : sequence of array-like
        Sentence embeddings, all of the same shape.

    Returns
    -------
    numpy.ndarray
        Sum of squared differences over the last axis (no square root is
        taken), one entry per sentence.
    """
    # Use the argument that was passed in rather than a notebook-level variable.
    diff = np.asarray(quest_emb) - np.asarray(sent_emb)
    # Reducing over the last axis works both for 1-D sentence vectors and for
    # sentence vectors that carry a leading singleton dimension.
    distance = np.sum(diff**2, axis=-1)

    return distance

In [None]:
def pred_idx(distances):
    """Return the position of the smallest distance, ignoring NaN entries.

    ``np.argmin`` treats NaN as the minimum, so a single undefined distance
    (for example a cosine distance computed against an all-zero fallback
    vector) would otherwise be chosen as the prediction. NaN entries are
    therefore replaced by ``+inf`` before the arg-min is taken. If every entry
    is NaN the result is 0. The index refers to the flattened input.
    """
    values = np.asarray(distances, dtype=float)
    values = np.where(np.isnan(values), np.inf, values)
    return np.argmin(values)

In [None]:
def predictions(train):
    """Add distance and predicted-index columns to ``train`` and return it.

    The frame is modified in place. It must already hold ``quest_emb`` and
    ``sent_emb`` columns. Columns written:

    * ``cosine_sim``    - ``cosine_sim(quest_emb, sent_emb)`` per row.
    * ``euclidean_dis`` - ``euclidean_dis(quest_emb, sent_emb)`` per row.
    * ``pred_idx_cos``  - ``pred_idx`` of the row's ``cosine_sim`` value.
    * ``pred_idx_euc``  - ``pred_idx`` of the row's ``euclidean_dis`` value.
    """
    # axis=1 passes rows to the lambda; the default passes columns, on which
    # the "quest_emb" / "sent_emb" labels cannot be looked up.
    train["cosine_sim"] = train.apply(lambda x: cosine_sim(x["quest_emb"], x["sent_emb"]), axis=1)
    train["euclidean_dis"] = train.apply(lambda x: euclidean_dis(x["quest_emb"], x["sent_emb"]), axis=1)

    # Each of these needs only one column, so map the helper over that column.
    # The Euclidean column is named "euclidean_dis".
    train["pred_idx_cos"] = train["cosine_sim"].apply(pred_idx)
    train["pred_idx_euc"] = train["euclidean_dis"].apply(pred_idx)

    return train
    

## Accuracy

In [None]:
def accuracy(target, predicted):
    """Fraction of positions at which ``target`` equals ``predicted``.

    Both arguments must support element-wise ``==`` producing an object with
    ``.sum()`` (for example pandas Series or NumPy arrays). The count of
    matches is divided by ``len(target)``.
    """
    matches = target == predicted
    n_correct = matches.sum()
    return n_correct / len(target)

### Accuracy for Euclidean Distance

In [None]:
# Share of rows whose Euclidean-based predicted index equals the target index.
print(accuracy(train["target"], train["pred_idx_euc"]))

### Accuracy for Cosine Similarity

In [None]:
# Share of rows whose cosine-based predicted index equals the target index.
print(accuracy(train["target"], train["pred_idx_cos"]))

## Sentence Prediction Based on Minimum Euclidean Distance

In [49]:
# For every row, pick the context sentence whose embedding has the smallest squared
# Euclidean distance to the question embedding.
# Predictions are gathered in a list and written as one column instead of one `.loc`
# write per row. Loop-local names are used so the corpus-level `sentences` list and
# `blob` built earlier in the notebook are not overwritten.
predicted_sent = []
for context, question in zip(train['context'], train['question']):

    context_sentences = [item.raw for item in TextBlob(context).sentences]
    # Sentences without a stored embedding fall back to a (1, 4096) zero array.
    embeddings_sentences = [dict_emb[item] if item in dict_emb else np.zeros((1,4096)) for item in context_sentences]

    embeddings_question = infersent.encode([question], tokenize=True)

    # The question embedding is broadcast against the stacked sentence embeddings;
    # summing the squares over axis 2 leaves one value per sentence.
    diff = embeddings_question - embeddings_sentences
    distance = np.sum(diff**2,axis = 2)

    predicted_sent.append(context_sentences[np.argmin(distance)])

train["predicted_sent"] = predicted_sent

### Accuracy

In [50]:
# 1 when the answer text occurs inside the predicted sentence, else 0. Computed for all
# rows in one pass (integer column) instead of one `.loc` write per row.
train["flag"] = [
    int(answer in sentence)
    for answer, sentence in zip(train["text"], train["predicted_sent"])
]

In [51]:
# Fraction of rows where the Euclidean-distance prediction contains the answer text.
train['flag'].sum()/train.shape[0]

0.5355026969670175

## Sentence Prediction Based on Cosine Similarity

In [52]:
# For every row, pick the context sentence whose embedding has the smallest cosine
# distance to the question embedding.
# Predictions are gathered in a list and written as one column. Loop-local names are used
# so the corpus-level `sentences` list and `blob` are not overwritten.
predicted_sent_cosine = []
for context, question in zip(train['context'], train['question']):

    context_sentences = [item.raw for item in TextBlob(context).sentences]
    # Take the 1-D vector at index 0 of each stored embedding (as the `sent_emb` cells do)
    # so every candidate, including the zero fallback, is a 1-D vector as
    # scipy's `cosine` expects.
    # NOTE(review): assumes dict_emb values have shape (1, 4096); confirm.
    embeddings_sentences = [dict_emb[item][0] if item in dict_emb else np.zeros(4096) for item in context_sentences]

    embeddings_question = infersent.encode([question], tokenize=True)

    cosine_distance = np.array(
        [spatial.distance.cosine(u,embeddings_question[0]) for u in embeddings_sentences],
        dtype=float,
    )
    # The distance to an all-zero fallback vector is undefined and can come back as NaN.
    # np.argmin would select a NaN entry, so push those to +inf instead.
    cosine_distance[np.isnan(cosine_distance)] = np.inf

    predicted_sent_cosine.append(context_sentences[np.argmin(cosine_distance)])

train["predicted_sent_cosine"] = predicted_sent_cosine

### Accuracy

In [53]:
# 1 when the answer text occurs inside the cosine-based predicted sentence, else 0.
# Computed for all rows in one pass (integer column) instead of pre-filling the column
# with None and writing one row at a time.
train['flag_cos'] = [
    int(answer in sentence)
    for answer, sentence in zip(train["text"], train["predicted_sent_cosine"])
]

In [54]:
# Fraction of rows where the cosine-based prediction contains the answer text.
train['flag_cos'].sum()/train.shape[0]

0.7093465381554515

In [55]:
# Save the frame, now including the predicted-sentence and flag columns, without the index.
train.to_csv("train_detect_sent.csv", index= False)

### Finding lengths of paras

In [44]:
# Reload the frame written by the earlier `to_csv` cell; this rebinds `train`.
train = pd.read_csv("train_detect_sent.csv")

In [45]:
# (rows, columns) of the reloaded frame.
train.shape

(22618, 8)

In [46]:
# Per row, the list of raw sentence strings TextBlob finds in that row's context.
train['sentence'] = train['context'].apply(lambda x: [item.raw for item in TextBlob(x).sentences])

In [47]:
# Inspect the sentence list of the first row.
train['sentence'][0]

['Architecturally, the school has a Catholic character.',
 "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".',
 'Next to the Main Building is the Basilica of the Sacred Heart.',
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.',
 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.',
 'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.']

In [52]:
# Per row, one vector per sentence: dict_emb[sentence][0] when the sentence is a key of
# dict_emb, otherwise a 4096-long zero vector.
train['sent_emb'] = train['sentence'].apply(lambda x: [dict_emb[item][0] if item in dict_emb else np.zeros(4096) for item in x])


In [55]:
# Per row, the InferSent encoding of that row's question (encoded as a one-item batch).
train['quest_emb'] = train['question'].apply(lambda x: infersent.encode([x], tokenize=True))


In [None]:
# `get_target` is the helper defined in this notebook (no `get_index` exists), and
# axis=1 makes DataFrame.apply pass rows so the column labels can be looked up.
train["target"] = train.apply(lambda x: get_target(x["text"], x["sentence"]), axis=1)

train["cosine_sim"] = train.apply(lambda x: cosine_sim(x["quest_emb"],x["sent_emb"]), axis=1)
train["euclidean_dis"] = train.apply(lambda x: euclidean_dis(x["quest_emb"],x["sent_emb"]), axis=1)
# Each of these needs only one column, so map the helper over that column directly.
# The Euclidean column is named "euclidean_dis".
train["pred_idx_cos"] = train["cosine_sim"].apply(pred_idx)
train["pred_idx_euc"] = train["euclidean_dis"].apply(pred_idx)

In [None]:
# Overwrite the CSV with the frame as it stands now, without the index.
# NOTE(review): by this point `train` carries list/array-valued columns (sentence, sent_emb,
# quest_emb, ...). CSV stores them in string form, so they will not round-trip as arrays.
# Confirm whether they should be dropped or saved in a binary format instead.
train.to_csv("train_detect_sent.csv", index= False)