In [1]:
import numpy as np
import pandas as pd
import json
import torch
import pickle, time
import re, os, string, typing, gc, json
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from collections import Counter

In [9]:
# Load spaCy's 'en_core_web_sm' pipeline once; the functions below call `nlp`
# with disable=['parser','tagger','ner'] and only read each token's text/offset.
nlp = spacy.load('en_core_web_sm')

In [10]:
def load_json(path):
    """Read a UTF-8 JSON file, print a short summary of it, and return it.

    The summary assumes the parsed object has a non-empty list under the
    'data' key whose first element is a dict with a 'title' entry.

    :param path: location of the JSON file to read
    :returns: the parsed JSON object, unchanged
    """
    with open(path, 'r', encoding='utf-8') as handle:
        data = json.load(handle)

    articles = data['data']
    first_article = articles[0]
    print("Length of data: ", len(articles))
    print("Data Keys: ", first_article.keys())
    print("Title: ", first_article['title'])

    return data

In [11]:
def parse_data(data:dict)->list:
    """Flatten a nested question-answering dict into one record per answer.

    Walks data['data'] -> 'paragraphs' -> 'qas' -> 'answers'. Every answer of
    every kept question yields a dict with the keys 'id', 'context',
    'question', 'label' and 'answer', where 'label' is the character span
    [answer_start, answer_start + len(answer text)].

    Questions containing the substring 'why' or 'how' are skipped. The check
    is case-sensitive and runs on the raw question text, so e.g. "How ..."
    is kept while "show ..." is dropped. Questions with an empty 'answers'
    list contribute no records.

    :param dict data: parsed JSON with a top-level 'data' list
    :returns: list of per-answer record dicts
    """
    records = []

    for article in data['data']:
        for paragraph in article['paragraphs']:
            passage = paragraph['context']

            for qa in paragraph['qas']:
                qa_id = qa['id']
                question = qa['question']

                # Skip why/how questions (case-sensitive substring match).
                if 'why' in question or 'how' in question:
                    continue

                for ans in qa['answers']:
                    answer = ans['text']
                    ans_start = ans['answer_start']
                    records.append({
                        'id': qa_id,
                        'context': passage,
                        'question': question,
                        'label': [ans_start, ans_start + len(answer)],
                        'answer': answer,
                    })

    return records

In [12]:
# Load train-v2.0.json from the working directory, flatten it into one record
# per (question, answer) pair, and wrap those records in a DataFrame.
train_file = os.path.join( './','train-v2.0.json')
train_data = load_json(train_file)
train_list = parse_data(train_data)
train_df = pd.DataFrame(train_list)

Length of data:  442
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  Beyoncé


In [13]:
# Same pipeline as the training cell, applied to dev-v2.0.json.
dev_file = os.path.join( './','dev-v2.0.json')
dev_data = load_json(dev_file)
dev_list = parse_data(dev_data)
dev_df = pd.DataFrame(dev_list)

Length of data:  35
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  Normans


In [69]:
# Show row 0, column 1 of dev_df (the 'context' column, given the key order
# of the records built in parse_data).
dev_df.iloc[0,1]

'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'

In [14]:
def preprocess_df(df):
    """Lowercase the 'context', 'question' and 'answer' columns of df in place.

    The frame is modified directly and nothing is returned.

    NOTE(review): str.lower() can change the length of a few Unicode
    characters, so character offsets computed on the original-case text may
    no longer line up for such rows -- confirm against the 'label' column.

    :param dataframe df: frame holding the three text columns
    """
    for column in ('context', 'question', 'answer'):
        df[column] = df[column].apply(lambda text: text.lower())

In [15]:
# Lowercase the text columns of both frames in place (preprocess_df returns None).
preprocess_df(train_df)
preprocess_df(dev_df)

In [18]:
# Row counts of the flattened train and dev frames (one row per answer).
print(len(train_df))
print(len(dev_df))

84548
19880


In [19]:
# Stack train and dev rows so the vocabulary covers both splits.
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the
# replacement and likewise keeps each frame's original index labels.
total_df = pd.concat([train_df, dev_df])

  total_df = train_df.append(dev_df)


In [188]:
# Combined row count of the train and dev frames.
len(total_df)

104428

In [189]:
def text_for_vocab(df):
    """Collect the distinct texts from which the vocabulary is built.

    :param dataframe df: frame with 'context' and 'question' columns
    :returns: list with every unique context (in order of first appearance)
              followed by every unique question
    """
    # The original also summed nunique() into a local that was never read;
    # that dead computation is dropped.
    unique_contexts = list(df['context'].unique())
    unique_questions = list(df['question'].unique())
    return unique_contexts + unique_questions

In [190]:
# Unique contexts and questions from train + dev, used to build the vocabulary.
vocab_text =text_for_vocab(total_df)

In [28]:
def vocabbuilder(vocab_text):
    """Build a frequency-ranked word vocabulary and its lookup tables.

    Each text is tokenized with the module-level `nlp` pipeline. Tokens are
    ranked by descending count (ties keep first-seen order), and '<unk>' and
    '<pad>' are placed at indices 0 and 1. Three size lines are printed.

    :param vocab_text: iterable of strings to tokenize
    :returns: (word2idx, idx2word, word_vocab)
    """
    token_counts = Counter(
        token.text
        for sentence in vocab_text
        for token in nlp(sentence, disable=['parser','tagger','ner'])
    )

    # most_common() orders by count, highest first, ties in first-seen order.
    ranked_tokens = [token for token, _ in token_counts.most_common()]
    print(f"raw-vocab: {len(ranked_tokens)}")

    word_vocab = ['<unk>', '<pad>'] + ranked_tokens
    print(f"vocab-length: {len(word_vocab)}")

    word2idx = dict(zip(word_vocab, range(len(word_vocab))))
    print(f"word2idx-length: {len(word2idx)}")

    idx2word = dict((position, token) for token, position in word2idx.items())
    return word2idx, idx2word, word_vocab

In [29]:
# Build the vocabulary and both lookup tables from the combined train/dev text.
word2idx, idx2word, word_vocab = vocabbuilder(vocab_text)



raw-vocab: 94198
vocab-length: 94200
word2idx-length: 94200


In [32]:
def context_to_ids(text, word2idx):
    """Tokenize a context string and map each token to its vocabulary index.

    A token that is not a key of `word2idx` propagates the lookup error
    (KeyError for a plain dict); there is no '<unk>' fallback here.

    :param str text: context to encode
    :param word2idx: mapping from token text to integer index
    :returns: list of indices, one per token
    """
    doc = nlp(text, disable=['parser','tagger','ner'])

    context_ids = []
    for token in doc:
        context_ids.append(word2idx[token.text])

    # One id per token.
    assert len(context_ids) == len(doc)
    return context_ids

In [33]:
# Encode every training context as vocabulary ids. Contexts repeat across
# rows (one row per answer), so each distinct context is tokenized many times.
train_df['context_ids'] = train_df.context.apply(context_to_ids, word2idx=word2idx)

In [34]:
def question_to_ids(text, word2idx):
    """Tokenize a question string and map each token to its vocabulary index.

    A token that is not a key of `word2idx` propagates the lookup error
    (KeyError for a plain dict).

    :param str text: question to encode
    :param word2idx: mapping from token text to integer index
    :returns: list of indices, one per token
    """
    tokens = [tok.text for tok in nlp(text, disable=['parser','tagger','ner'])]
    question_ids = list(map(lambda tok: word2idx[tok], tokens))

    # One id per token.
    assert len(question_ids) == len(tokens)
    return question_ids

In [35]:
# Encode every training question as vocabulary ids.
train_df['question_ids'] = train_df.question.apply(question_to_ids, word2idx=word2idx)



In [39]:
def test_indices(df, idx2word):
    '''
    Checks, row by row, that the character span stored in 'label' lines up
    with token boundaries of the tokenized context, and that the tokens found
    at those boundaries are the first and last tokens of the answer.

    :param dataframe df: frame with 'answer', 'context', 'label' and
                         'context_ids' columns
    :param idx2word: mapping from vocabulary index to token text
    :returns
        list start_value_error: row labels whose answer start offset is not the
                                start offset of any context token
        list end_value_error: row labels whose answer end offset is not the
                              end offset of any context token
        list assert_error: row labels that fail the token comparison. This
                           includes rows with an empty answer and every row
                           already reported in one of the two lists above.

    '''

    start_value_error = []
    end_value_error = []
    assert_error = []

    for index, row in df.iterrows():

        answer_tokens = [w.text for w in nlp(row['answer'], disable=['parser','tagger','ner'])]
        if not answer_tokens:
            # An empty answer has no first/last token to compare.
            assert_error.append(index)
            continue

        context_doc = nlp(row['context'], disable=['parser','tagger','ner'])
        # Character offsets at which each context token starts and ends. Two
        # plain lists (instead of zip-unpacking) also handle an empty context.
        starts = [word.idx for word in context_doc]
        ends = [word.idx + len(word.text) for word in context_doc]

        answer_start, answer_end = row['label']

        try:
            start_idx = starts.index(answer_start)
        except ValueError:
            start_idx = None
            start_value_error.append(index)

        try:
            end_idx = ends.index(answer_end)
        except ValueError:
            end_idx = None
            end_value_error.append(index)

        if start_idx is None or end_idx is None:
            # Previously the comparison below ran anyway with an unbound or
            # stale index left over from an earlier row; such rows are now
            # reported deterministically.
            assert_error.append(index)
            continue

        try:
            context_ids = row['context_ids']
            tokens_match = (idx2word[context_ids[start_idx]] == answer_tokens[0]
                            and idx2word[context_ids[end_idx]] == answer_tokens[-1])
        except (IndexError, KeyError, TypeError):
            # Missing/short context_ids or an id absent from idx2word.
            tokens_match = False

        if not tokens_match:
            assert_error.append(index)

    return start_value_error, end_value_error, assert_error

In [40]:
def get_error_indices(df, idx2word):
    '''
    Runs test_indices on the frame and merges its three result lists into
    one set of row labels. The size of that set is printed.

    :param dataframe df: frame to validate
    :param idx2word: mapping from vocabulary index to token text
    :returns: set of row labels reported by any of the checks
    '''
    bad_starts, bad_ends, bad_asserts = test_indices(df, idx2word)

    err_idx = set(bad_starts)
    err_idx.update(bad_ends)
    err_idx.update(bad_asserts)
    print(f"Error indices: {len(err_idx)}")

    return err_idx

In [130]:
# Peek at the first training row.
train_df[:1]

Unnamed: 0,id,context,question,label,answer,context_ids,question_ids
0,56be85543aeaaa14008c9063,beyoncé giselle knowlescarter biːˈjɒnseɪ beeyo...,when did beyonce start becoming popular?,"[269, 286]",in the late 1990s,"[1117, 40900, 56297, 56298, 56299, 838, 429, 8...","[30, 20, 1741, 448, 1109, 269, 6]"


In [41]:
# Row labels of training examples whose answer span does not align with tokens.
train_err = get_error_indices(train_df, idx2word)



Error indices: 879


In [42]:
# Remove the misaligned rows. Rebinding instead of inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: labels that were already
# dropped no longer raise KeyError.
train_df = train_df.drop(index=list(train_err), errors='ignore')

In [45]:
def index_answer(row, idx2word):
    '''
    Converts the character span in row.label into token positions within the
    tokenized context.

    Raises ValueError when the span does not fall on token boundaries,
    IndexError when the answer has no tokens, and AssertionError when the
    tokens found at the two positions differ from the first/last answer
    token.

    :param row: one example with context, label, answer and context_ids
    :param idx2word: mapping from vocabulary index to token text
    :returns: [start token position, end token position], end inclusive
    '''
    context_doc = nlp(row.context, disable=['parser','tagger','ner'])
    starts, ends = zip(*((tok.idx, tok.idx + len(tok.text)) for tok in context_doc))

    char_start, char_end = row.label
    start_idx = starts.index(char_start)
    end_idx = ends.index(char_end)

    answer_words = [tok.text for tok in nlp(row.answer,disable=['parser','tagger','ner'])]
    first_word, last_word = answer_words[0], answer_words[-1]

    # Sanity check: the located context tokens must match the answer's ends.
    assert idx2word[row.context_ids[start_idx]] == first_word
    assert idx2word[row.context_ids[end_idx]] == last_word

    # Original author's note: this needs to be end_idx + 1.
    return [start_idx, end_idx]

In [46]:
# Token-level [start, end] answer positions for every remaining training row.
train_label_idx = train_df.apply(index_answer, axis=1, idx2word=idx2word)



In [47]:
# Attach the token-level answer positions as a new column.
train_df['label_idx'] = train_label_idx

In [48]:
# Preview the first five rows, now including label_idx.
train_df[:5]

Unnamed: 0,id,context,question,label,answer,context_ids,question_ids,label_idx
0,56be85543aeaaa14008c9063,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,when did beyonce start becoming popular?,"[269, 286]",in the late 1990s,"[929, 39213, 17687, 15, 10123, 22, 52767, 1217...","[36, 26, 1447, 471, 1175, 289, 7]","[56, 59]"
1,56be85543aeaaa14008c9065,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,what areas did beyonce compete in when she was...,"[207, 226]",singing and dancing,"[929, 39213, 17687, 15, 10123, 22, 52767, 1217...","[11, 221, 26, 1447, 3152, 6, 36, 326, 13, 1178...","[44, 46]"
2,56be85543aeaaa14008c9066,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,when did beyonce leave destiny's child and bec...,"[526, 530]",2003,"[929, 39213, 17687, 15, 10123, 22, 52767, 1217...","[36, 26, 1447, 1701, 4665, 18, 663, 8, 184, 10...","[112, 112]"
3,56bf6b0f3aeaaa14008c9601,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,in what city and state did beyonce grow up?,"[166, 180]","houston, texas","[929, 39213, 17687, 15, 10123, 22, 52767, 1217...","[6, 11, 53, 8, 76, 26, 1447, 296, 1947, 106, 7]","[36, 38]"
4,56bf6b0f3aeaaa14008c9602,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,in which decade did beyonce become famous?,"[276, 286]",late 1990s,"[929, 39213, 17687, 15, 10123, 22, 52767, 1217...","[6, 28, 1115, 26, 1447, 184, 601, 7]","[58, 59]"


In [221]:
def make_glove_dict(glove_path=None, dim=100):
    '''
    Parses a GloVe word-vector text file and returns a dictionary with the
    words as keys and their pretrained vectors (float32 arrays) as values.
    Zero vectors of length `dim` are added under "<NULL>" and "<unk>".

    :param glove_path: path of the vectors file; defaults to
                       './glove.6B.100d.txt' as before
    :param int dim: length of the zero vectors for the two special tokens;
                    should match the dimensionality of the file
    :returns: dict mapping word -> numpy vector
    '''
    if glove_path is None:
        glove_path = os.path.join( './','glove.6B.100d.txt')

    glove_dict = {}
    # The with-block closes the file; no explicit close() is needed.
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            glove_dict[values[0]] = np.asarray(values[1:], "float32")

    # Same dtype as the parsed vectors (previously these two were float64).
    glove_dict["<NULL>"] = np.zeros(dim, dtype="float32")
    glove_dict["<unk>"] = np.zeros(dim, dtype="float32")

    return glove_dict

In [222]:
# Load the pretrained GloVe vectors into a word -> vector dictionary.
glove_dict = make_glove_dict()

In [223]:
def create_weights_matrix(glove_dict, vocab=None, word_to_idx=None, dim=100):
    '''
    Creates a weight matrix of the words that are common in the GloVe vocab and
    the dataset's vocab. Initializes OOV words with a zero vector.

    :param glove_dict: mapping word -> pretrained vector
    :param vocab: words to place in the matrix; defaults to the notebook-level
                  `word_vocab`
    :param word_to_idx: mapping word -> row index; defaults to the
                        notebook-level `word2idx`
    :param int dim: vector length (number of matrix columns)
    :returns: (weights_matrix of shape (len(vocab), dim),
               number of vocab words that had an entry in glove_dict)
    '''
    if vocab is None:
        vocab = word_vocab
    if word_to_idx is None:
        word_to_idx = word2idx

    weights_matrix = np.zeros((len(vocab), dim))
    words_found = 0
    for word in vocab:
        vector = glove_dict.get(word)
        if vector is None:
            # Word has no pretrained vector: its row stays all zeros.
            continue
        # Only the "word missing from GloVe" case is tolerated; any other
        # problem (e.g. a vector of the wrong length) now surfaces instead
        # of being swallowed by a bare except.
        weights_matrix[word_to_idx[word]] = vector
        words_found += 1

    return weights_matrix, words_found

In [224]:
# Build the embedding matrix and report how many vocabulary words had a
# pretrained vector.
weights_matrix, words_found = create_weights_matrix(glove_dict)
print("Words found in the GloVe vocab: " ,words_found)

Words found in the GloVe vocab:  71069


In [225]:
# Total vocabulary size, for comparison with the GloVe hit count.
len(word_vocab)

94200

In [None]:
def clean_text(text):
    """Normalize brackets, newlines and LaTeX-style quotes in a string.

    Applied in order: pad ']' and '[' with spaces, turn newlines into
    spaces, then replace both '' and `` with a double quote followed by a
    space.

    :param str text: text to clean
    :returns: the cleaned string
    """
    substitutions = (
        ("]", " ] "),
        ("[", " [ "),
        ("\n", " "),
        ("''", '" '),
        ("``", '" '),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    return text

In [52]:
# Persist the vocabulary list and both lookup tables under ./data.
# The directory is created first so the cell also works on a fresh checkout.
os.makedirs('./data', exist_ok=True)
with open(os.path.join('./data','word_vocab.pkl'), "wb") as wv, \
        open(os.path.join('./data','word2index.pkl'), "wb") as wi, \
        open(os.path.join('./data','index2word.pkl'), "wb") as iw:
    pickle.dump(word_vocab, wv)
    pickle.dump(word2idx, wi)
    pickle.dump(idx2word, iw)

In [53]:
# Save the processed training frame to the working directory (note: the
# vocabulary pickles are written under ./data instead).
train_df.to_pickle('traindata.pkl')

In [236]:
# Save the embedding matrix to the working directory in NumPy .npy format.
np.save('weights_matrix_100D.npy', weights_matrix)

In [27]:
# Persist the raw vocabulary text under ./data, creating the directory first
# so the write does not fail when it is missing.
os.makedirs('./data', exist_ok=True)
with open(os.path.join('./data','vocab_text.pkl'), "wb") as vt:
    pickle.dump(vocab_text, vt)