In [1]:
import warnings
warnings.filterwarnings('ignore')
import pickle
import numpy as np
import pandas as pd
import json
from textblob import TextBlob
import nltk
from scipy import spatial
import torch
import spacy
en_nlp = spacy.load('en')

# Convert Json to Pandas Dataframe

In [2]:
train = pd.read_json("data/train-v2.0.json")

In [3]:
valid = pd.read_json("data/dev-v2.0.json")

In [4]:
train.shape, valid.shape

((442, 2), (35, 2))

In [5]:
train.head(3)

Unnamed: 0,data,version
0,"{'title': 'Beyoncé', 'paragraphs': [{'qas': [{...",v2.0
1,"{'title': 'Frédéric_Chopin', 'paragraphs': [{'...",v2.0
2,{'title': 'Sino-Tibetan_relations_during_the_M...,v2.0


In [6]:
#train.iloc[1,0]['paragraphs'][0]

In [9]:
# add data to a dataframe
contexts = []
questions = []
answers = []
answers_text = []
answers_start = []
for i in range(train.shape[0]):
    topic = train.iloc[i,0]['paragraphs']
    for sub_para in topic:
        for q_a in sub_para['qas']:
            questions.append(q_a['question'])
            if q_a['answers']:
                answers.append("yes")
                answers_start.append(q_a['answers'][0]['answer_start'])
                answers_text.append(q_a['answers'][0]['text'])
            elif q_a['plausible_answers']:
                answers_start.append(q_a['plausible_answers'][0]['answer_start'])
                answers_text.append(q_a['plausible_answers'][0]['text'])
                answers.append("no")
            contexts.append(sub_para['context'])
df = pd.DataFrame({"context":contexts, "question": questions, "answers":answers, "answer_start": answers_start, "text": answers_text})

In [10]:
# df = df.iloc[:1000]
# load data in csv file
df.to_csv("data/train_with_answers.csv", index = None)

# Create dictionary of sentence embeddings for faster computation


In [60]:
paras = list(df["context"].drop_duplicates().reset_index(drop= True))

In [61]:
len(paras)

19029

In [62]:
blob = TextBlob(" ".join(paras))
sentences = [item.raw for item in blob.sentences]

In [63]:
from InferSent.models import InferSent 
MODEL_PATH =  'InferSent/encoder/infersent1.pkl'
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
model = InferSent(params_model)

W2V_PATH = 'InferSent/dataset/GloVe/glove.840B.300d.txt'
model.set_w2v_path(W2V_PATH)
#infersent.set_glove_path("InferSent/dataset/GloVe/glove.840B.300d.txt")

In [64]:
model.build_vocab(sentences, tokenize=True)

Found 89447(/110298) words with w2v vectors
Vocab size : 89447


In [65]:
len(sentences)

93280

In [None]:
import time
t0 = time.time()
dict_embeddings = {}
for i in range(len(sentences)):
    if i ==0:
        print(sentences[i])
    if i%1000 == 0:
        t1 = time.time()
        total = t1-t0
        print("encoding question ",i," time since beginning:", total)
    dict_embeddings[sentences[i]] = model.encode([sentences[i]], tokenize=True)

Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress.
encoding question  0  time since beginning: 0.0004284381866455078
encoding question  1000  time since beginning: 525.504912853241
encoding question  2000  time since beginning: 1070.9987246990204


In [43]:
# encode questions
questions = list(df["question"])
len(questions)

130319

In [44]:
dict_embeddings['Architecturally, the school has a Catholic character.'][0]

array([0.01324056, 0.02752344, 0.03426428, ..., 0.04948811, 0.02405655,
       0.02240689], dtype=float32)

In [45]:
t0 = time.time()
for i in range(len(questions)):
    if i ==0:
        print(questions[i])
    if i%1000 == 0:
        t1 = time.time()
        total = t1-t0
        print("encoding number ",i," time since beginning:", total)
    dict_embeddings[questions[i]] = model.encode([questions[i]], tokenize=True)

When did Beyonce start becoming popular?
encoding number  0  time since beginning: 0.0007388591766357422
encoding number  1000  time since beginning: 189.55568599700928
encoding number  2000  time since beginning: 379.7590928077698
encoding number  3000  time since beginning: 561.4019029140472
encoding number  4000  time since beginning: 746.5095179080963
encoding number  5000  time since beginning: 947.409910440445
encoding number  6000  time since beginning: 1133.6747341156006
encoding number  7000  time since beginning: 1330.9012627601624
encoding number  8000  time since beginning: 1518.382853269577
encoding number  9000  time since beginning: 1712.0913274288177
encoding number  10000  time since beginning: 1896.5042510032654
encoding number  11000  time since beginning: 2084.3207371234894
encoding number  12000  time since beginning: 2274.1478412151337
encoding number  13000  time since beginning: 2450.114104986191
encoding number  14000  time since beginning: 2630.552731513977
en

encoding number  127000  time since beginning: 24582.220937252045
encoding number  128000  time since beginning: 24784.46029639244
encoding number  129000  time since beginning: 24961.029406547546
encoding number  130000  time since beginning: 25146.891056537628


In [46]:
#dict_embeddings['Architecturally, the school has a Catholic character.'][0]

In [47]:
d1 = {key:dict_embeddings[key] for i, key in enumerate(dict_embeddings) if i % 2 == 0}
d2 = {key:dict_embeddings[key] for i, key in enumerate(dict_embeddings) if i % 2 == 1}

In [48]:
d2

{'The Wii version makes use of what kind of sensors?': array([[ 0.02203822, -0.00702705,  0.0667559 , ...,  0.02912565,
          0.02496875,  0.02337021]], dtype=float32),
 'They may feel certain pain treatment is against their religion.': array([[0.02041637, 0.01720555, 0.03322755, ..., 0.02729459, 0.0281477 ,
         0.06075768]], dtype=float32),
 'Why was that? ': array([[ 0.03173946, -0.00754874,  0.03960577, ...,  0.01644378,
          0.00585065,  0.02240689]], dtype=float32),
 'When did his sister come to stay with Chopin?': array([[0.02208563, 0.05010815, 0.03100789, ..., 0.02270504, 0.02317609,
         0.02240689]], dtype=float32),
 'Gaddafi\'s government was frequently criticized for not being democratic, with Freedom House consistently giving Libya under Gaddafi the "Not Free" ranking for civil liberties and political rights.': array([[0.04876262, 0.02497183, 0.04418193, ..., 0.0215456 , 0.02838127,
         0.04938416]], dtype=float32),
 "How many square miles is is Detr

In [49]:
with open('data/dict_embeddings1.pickle', 'wb') as handle:
    pickle.dump(d1, handle)

In [50]:
with open('data/dict_embeddings2.pickle', 'wb') as handle:
    pickle.dump(d2, handle)

In [51]:
del dict_embeddings