In [1]:
import json
import pandas as pd 
import tensorflow as tf
from pandas.io.json import json_normalize
import numpy as np
from scripts import MagnitudeVectors
from tqdm import tqdm_notebook
import operator

In [2]:
# https://www.kaggle.com/christofhenkel/how-to-preprocessing-for-glove-part1-eda/data
def check_coverage(vocab,embeddings_index):
    """
    Report how much of a vocabulary is covered by an embedding index.

    Prints two percentages: the share of distinct words, and the share of all
    word occurrences, for which ``word in embeddings_index`` is true.

    :param vocab: dictionary mapping each word to its occurrence count
    :param embeddings_index: any object supporting ``word in embeddings_index``
    :return: list of ``(word, count)`` pairs for the words missing from
        ``embeddings_index``, ordered by descending count
    """
    oov = {}
    covered_words = 0        # distinct words that have an embedding
    covered_occurrences = 0  # summed counts of the covered words
    missing_occurrences = 0  # summed counts of the out-of-vocabulary words
    for word in tqdm_notebook(vocab):
        count = vocab[word]
        if word in embeddings_index:
            # Only the number of hits is reported, so count them instead of
            # fetching and holding every embedding vector in memory.
            covered_words += 1
            covered_occurrences += count
        else:
            oov[word] = count
            missing_occurrences += count

    total_occurrences = covered_occurrences + missing_occurrences
    # Guard both ratios so an empty vocabulary (or all-zero counts) reports 0%
    # instead of raising ZeroDivisionError.
    vocab_ratio = covered_words / len(vocab) if len(vocab) else 0.0
    text_ratio = covered_occurrences / total_occurrences if total_occurrences else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for {:.2%} of all text'.format(text_ratio))
    # Stable ascending sort followed by a reversal: words with equal counts
    # come out in reverse insertion order.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_x

def build_vocab(sentences, verbose = True):
    """
    Count how often each word occurs across tokenised sentences.

    :param sentences: list of list of words
    :param verbose: show the progress bar when truthy, hide it otherwise
    :return: dictionary of words and their count
    """
    counts = {}
    hide_progress = not verbose
    for tokens in tqdm_notebook(sentences, disable = hide_progress):
        for token in tokens:
            # .get supplies 0 the first time a token is seen.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [3]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-style json file into a dataframe with one row per answer.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1

    Returns a dataframe with the question id, the question, its context, the
    answer fields, and ``c_id`` (an integer code for each distinct context).
    """
    if verbose:
        print("Reading the json file")
    # The context manager closes the handle. JSON is UTF-8 by specification,
    # so decode explicitly rather than relying on the platform default.
    with open(input_file_path, encoding='utf-8') as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")
    # Newer pandas exposes json_normalize at top level and no longer offers it
    # under pandas.io.json; fall back to the old location on older releases.
    normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
    # parsing different level's in the json file
    js = normalize(file, record_path)
    m = normalize(file, record_path[:-1])
    r = normalize(file, record_path[:-2])

    #combining it into single dataframe
    # Repeat each context once per question, and each question id once per answer,
    # so they line up row-for-row with the deeper tables.
    idx = np.repeat(r['context'].values, r.qas.str.len())
    ndx = np.repeat(m['id'].values, m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    # axis is passed by keyword: recent pandas rejects it as a positional argument.
    main = pd.concat(
        [m[['id','question','context']].set_index('id'), js.set_index('q_idx')],
        axis=1,
        sort=False,
    ).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [4]:
# Flatten the SQuAD v1.1 training file into a dataframe.
df = squad_json_to_dataframe_train(input_file_path='./data/train-v1.1.json')

Reading the json file
processing...
shape of the dataframe is (87599, 6)
Done


In [5]:
# Preview the first five rows of the flattened dataframe.
df.head(n=5)

Unnamed: 0,index,question,context,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,0


In [6]:
# Load the word vectors through the project's MagnitudeVectors helper.
# NOTE(review): 50 is presumably the embedding dimensionality — confirm against scripts.MagnitudeVectors.
# The recorded output indicates this may download the magnitude files when they are not available locally.
vectors = MagnitudeVectors(50).load_vectors()

Will download magnitude files from the server if they aren't avaialble locally.. So, grab a cup of coffee while the downloading is under progress..


In [7]:
from nltk import word_tokenize
# Tokenise every question and count how often each token occurs.
vocab = build_vocab(df['question'].apply(word_tokenize).tolist())
# List the tokens that have no embedding, most frequent first, and show the top ten.
oov = check_coverage(vocab, vectors)
oov[:10]

HBox(children=(IntProgress(value=0, max=87599), HTML(value='')))




HBox(children=(IntProgress(value=0, max=44718), HTML(value='')))


Found embeddings for 90.73% of vocab
Found embeddings for 99.44% of all text


[("'The", 38),
 ('BeiDou', 29),
 ('BeiDou-1', 22),
 ('BeiDou-2', 17),
 ('KInsey', 14),
 ('IHDI', 14),
 ('assention', 13),
 ("'war", 13),
 ("'War", 11),
 ('ring-porous', 9)]

In [None]:
# Scratch check: is the token "what" present in the loaded vectors?
"what" in vectors