In [39]:
import warnings
warnings.filterwarnings('ignore')
import pickle
import numpy as np
import pandas as pd
import json
from textblob import TextBlob
import nltk
from scipy import spatial
import torch
import spacy
en_nlp = spacy.load('en')

## Convert Json to Pandas Dataframe

In [2]:
train = pd.read_json("data/train-v1.1.json")

In [3]:
valid = pd.read_json("data/dev-v1.1.json")

In [6]:
train.shape, valid.shape

((442, 2), (48, 2))

In [3]:
train.head(3)

Unnamed: 0,data,version
0,"{'title': 'University_of_Notre_Dame', 'paragra...",1.1
1,"{'title': 'Beyoncé', 'paragraphs': [{'context'...",1.1
2,"{'title': 'Montana', 'paragraphs': [{'context'...",1.1


In [4]:
train.iloc[1,0]['paragraphs'][0]

{'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'qas': [{'answers': [{'answer_start': 269, 'text': 'in the late 1990s'}],
   'id': '56be85543aeaaa14008c9063',
   'question': 'When did Beyonce start becoming popular?'},
  {'answers': [{'answer_start': 207, 'text': 'singing and dancing'}],
   'id': '56be85543aeaaa14008c9065',
   'ques

In [6]:
# valid.iloc[1,0]['paragraphs'][0]

In [7]:
contexts = []
questions = []
answers_text = []
answers_start = []
for i in range(train.shape[0]):
    topic = train.iloc[i,0]['paragraphs']
    for sub_para in topic:
        for q_a in sub_para['qas']:
            questions.append(q_a['question'])
            answers_start.append(q_a['answers'][0]['answer_start'])
            answers_text.append(q_a['answers'][0]['text'])
            contexts.append(sub_para['context'])   
df = pd.DataFrame({"context":contexts, "question": questions, "answer_start": answers_start, "text": answers_text})

In [9]:
df.shape

(87599, 4)

In [10]:
df.to_csv("data/train.csv", index = None)

## Create dictionary of sentence embeddings for faster computation

In [8]:
paras = list(df["context"].drop_duplicates().reset_index(drop= True))

In [9]:
len(paras)

18891

In [10]:
blob = TextBlob(" ".join(paras))
sentences = [item.raw for item in blob.sentences]

In [14]:
len(sentences)

92659

In [32]:
infersent = torch.load('InferSent/encoder/infersent.allnli.pickle', map_location=lambda storage, loc: storage)
infersent.set_glove_path("InferSent/dataset/GloVe/glove.840B.300d.txt")

In [33]:
infersent.build_vocab(sentences, tokenize=True)

Found 88993(/109718) words with glove vectors
Vocab size : 88993


In [None]:
dict_embeddings = {}
for i in range(len(sentences)):
    print(i)
    dict_embeddings[sentences[i]] = infersent.encode([sentences[i]], tokenize=True)

In [22]:
questions = list(df["question"])

In [23]:
len(questions)

87599

In [None]:
for i in range(len(questions)):
    print(i)
    dict_embeddings[questions[i]] = infersent.encode([questions[i]], tokenize=True)

In [20]:
dict_embeddings['Architecturally, the school has a Catholic character.'][0]

array([ 0.05519997,  0.05013141,  0.04787038, ...,  0.00821208,
       -0.03642813,  0.04468501], dtype=float32)

In [31]:
d1 = {key:dict_embeddings[key] for i, key in enumerate(dict_embeddings) if i % 2 == 0}
d2 = {key:dict_embeddings[key] for i, key in enumerate(dict_embeddings) if i % 2 == 1}

In [32]:
d1

{'Architecturally, the school has a Catholic character.': array([[ 0.05519997,  0.05013141,  0.04787038, ...,  0.00821208,
         -0.03642813,  0.04468501]], dtype=float32),
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".': array([[0.11262652, 0.1114684 , 0.14750297, ..., 0.00293285, 0.03322018,
         0.06657629]], dtype=float32),
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.': array([[ 0.04149356,  0.07033059,  0.03724371, ...,  0.01096805,
         -0.02892282,  0.0428066 ]], dtype=float32),
 'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.': array([[ 0.04795522,  0.16508996,  0.0938353 , ...,  0.05321151,
         -0.01826633,  0.10806957]], dtype=float32),
 'The nine student-run outlets include three newspapers, both a radio and television sta

In [33]:
d2

{"Atop the Main Building's gold dome is a golden statue of the Virgin Mary.": array([[ 0.07475325,  0.11794458,  0.06240867, ...,  0.01915886,
         -0.02436746,  0.10806957]], dtype=float32),
 'Next to the Main Building is the Basilica of the Sacred Heart.': array([[ 0.08010551,  0.11775322,  0.02186232, ...,  0.01656765,
         -0.01024127,  0.04706628]], dtype=float32),
 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.': array([[ 0.10776819,  0.0805801 ,  0.10461736, ...,  0.01522135,
         -0.03814263,  0.14945611]], dtype=float32),
 "As at most other universities, Notre Dame's students run a number of news media outlets.": array([[0.09720325, 0.09345725, 0.05466025, ..., 0.08443642, 0.00817084,
         0.02197512]], dtype=float32),
 'Begun as a one-page journal in September 1876, the Scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in t

In [34]:
with open('data/dict_embeddings1.pickle', 'wb') as handle:
    pickle.dump(d1, handle)

In [35]:
with open('data/dict_embeddings2.pickle', 'wb') as handle:
    pickle.dump(d2, handle)

In [None]:
del dict_embeddings