In [1]:
import numpy as np, pandas as pd
import json
from textblob import TextBlob
import nltk
import torch
import pickle
from scipy import spatial
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training table; later cells read its "context", "question" and "text" columns.
train = pd.read_csv("data/train.csv")

In [3]:
# Unique context paragraphs, in order of first appearance, as a plain Python list.
paras = train["context"].drop_duplicates().tolist()

In [4]:
# Number of distinct context paragraphs.
len(paras)

5068

In [5]:
# Join all unique paragraphs into one text and let TextBlob split it into sentences.
# The split is performed on the joined text, not paragraph by paragraph.
corpus_text = " ".join(paras)
blob = TextBlob(corpus_text)
sentences = [sent.raw for sent in blob.sentences]

In [6]:
# Number of sentences TextBlob found in the joined paragraphs.
len(sentences)

20075

### Building Vocabulary

In [7]:
# Locations of the pre-trained InferSent encoder and the GloVe vectors it is pointed at.
ENCODER_PATH = 'InferSent/encoder/infersent.allnli.pickle'
GLOVE_PATH = "InferSent/dataset/GloVe/glove.840B.300d.txt"


def _keep_storage(storage, loc):
    """map_location hook for torch.load: hand every storage back unchanged."""
    return storage


# NOTE(review): torch.load unpickles the file, so only load an encoder from a trusted source.
infersent = torch.load(ENCODER_PATH, map_location=_keep_storage)
infersent.set_glove_path(GLOVE_PATH)
# Build the encoder vocabulary from the corpus sentences.
infersent.build_vocab(sentences, tokenize=True)

Found 37124(/42258) words with glove vectors
Vocab size : 37124


### Loading Embedding dictionary

In [8]:
# Load the pre-computed embedding dictionary; later cells look sentences up in it by raw text.
# NOTE(review): pickle.load can execute arbitrary code - only use it on a trusted file.
with open("data/dict_embeddings.pickle", "rb") as f:
    dict_emb = pickle.load(f)

## Sentence Prediction Based on Minimum Euclidean Distance

In [48]:
# Initialise the column that will hold the sentence predicted via Euclidean distance.
train["predicted_sent"] = None

In [49]:
# For every row, pick the context sentence whose embedding has the smallest squared
# Euclidean distance to the question embedding.
#
# Sentences without an entry in dict_emb are given an infinite distance so that they can
# never beat a sentence that does have an embedding (a zero-vector placeholder would still
# take part in the argmin). If no sentence has an embedding, the first sentence is chosen;
# if the context yields no sentences at all, an empty string is stored.
#
# NOTE(review): every vector is flattened to 1-D before use; this assumes each stored
# embedding has the same number of elements as the encoded question - confirm against
# how dict_emb was built.
predicted_sentences = []

for i in range(train.shape[0]):

    blob = TextBlob(train['context'][i])
    sentences = [item.raw for item in blob.sentences]

    embeddings_question = infersent.encode([(train['question'][i])], tokenize=True)
    question_vector = np.asarray(embeddings_question).reshape(-1)

    distance = np.full(len(sentences), np.inf)
    for j, sentence in enumerate(sentences):
        if sentence in dict_emb:
            diff = question_vector - np.asarray(dict_emb[sentence]).reshape(-1)
            distance[j] = np.sum(diff**2)

    # np.argmin raises on an empty array, so guard the no-sentence case explicitly.
    predicted_sentences.append(sentences[int(np.argmin(distance))] if sentences else "")

# Single column assignment instead of one .loc write per row.
train["predicted_sent"] = predicted_sentences

### Accuracy

In [50]:
# 1 when the ground-truth answer text is a substring of the predicted sentence, else 0.
# Built in one pass and assigned once: this avoids a .loc write per row and gives the
# column an integer dtype instead of the float dtype that results from growing a new
# column cell by cell.
train["flag"] = [
    int(answer in predicted)
    for answer, predicted in zip(train["text"], train["predicted_sent"])
]

In [51]:
# Share of rows whose predicted sentence (Euclidean distance) contains the answer text.
train['flag'].sum() / len(train)

0.5355026969670175

## Sentence Prediction Based on Cosine Similarity

In [52]:
# For every row, pick the context sentence whose embedding has the smallest cosine
# distance to the question embedding.
#
# Sentences without an entry in dict_emb keep an infinite distance. Substituting a zero
# vector instead makes the cosine distance NaN (0/0), and np.argmin returns the position
# of the first NaN, so such a sentence would always be selected. NaN distances from any
# other zero-norm vector are treated the same way. If no sentence has a usable distance,
# the first sentence is chosen; if the context yields no sentences, an empty string is
# stored.
#
# NOTE(review): vectors are flattened to 1-D because scipy's cosine expects 1-D input;
# this assumes each stored embedding has the same number of elements as the encoded
# question - confirm against how dict_emb was built.
predicted_sentences_cosine = []

for i in range(train.shape[0]):

    blob = TextBlob(train['context'][i])
    sentences = [item.raw for item in blob.sentences]

    embeddings_question = infersent.encode([(train['question'][i])], tokenize=True)
    question_vector = np.asarray(embeddings_question[0]).reshape(-1)

    cosine_distance = np.full(len(sentences), np.inf)
    for j, sentence in enumerate(sentences):
        if sentence in dict_emb:
            sentence_vector = np.asarray(dict_emb[sentence]).reshape(-1)
            value = spatial.distance.cosine(sentence_vector, question_vector)
            if not np.isnan(value):
                cosine_distance[j] = value

    # np.argmin raises on an empty array, so guard the no-sentence case explicitly.
    predicted_sentences_cosine.append(
        sentences[int(np.argmin(cosine_distance))] if sentences else ""
    )

# Single column assignment instead of one .loc write per row.
train["predicted_sent_cosine"] = predicted_sentences_cosine

### Accuracy

In [53]:
# 1 when the ground-truth answer text is a substring of the sentence predicted via cosine
# distance, else 0. Assigned in one step, which yields an integer column rather than an
# object column filled through per-row .loc writes.
train['flag_cos'] = [
    int(answer in predicted)
    for answer, predicted in zip(train["text"], train["predicted_sent_cosine"])
]

In [54]:
# Share of rows whose predicted sentence (cosine distance) contains the answer text.
train['flag_cos'].sum() / len(train)

0.7093465381554515

In [55]:
# Write the frame, including the prediction and flag columns, to CSV without the index.
train.to_csv("train_detect_sent.csv", index= False)