In [None]:
import ipynb.fs
import pandas as pd
import spacy
from collections import Counter
from tqdm.auto import tqdm
from .defs.utils import *
# Register `progress_apply` on pandas objects so DataFrame/Series applies show a progress bar.
tqdm.pandas()
# Shared spaCy pipeline; it is passed to the tokenising helpers used further down.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Read the raw training and dev JSON files (file names suggest SQuAD v2.0 -- confirm).
train_data, valid_data = (
    load_json(json_path)
    for json_path in ('../data/train-v2.0.json', '../data/dev-v2.0.json')
)

In [None]:
# Turn the records produced by parse_data into one DataFrame per split.
# Tip: chain `.head(500)` onto a frame for a quick smoke run on a small subset.
train_df, valid_df = (
    pd.DataFrame(parse_data(raw_split))
    for raw_split in (train_data, valid_data)
)

In [None]:
# Run normalize_spaces over every context string, overwriting the column in both splits.
train_df['context'] = train_df['context'].apply(normalize_spaces)
valid_df['context'] = valid_df['context'].apply(normalize_spaces)

In [None]:
# Gather every distinct context and question string (train first, then valid);
# this is the corpus the vocabulary is built from.
vocab_text = []
for frame in (train_df, valid_df):
    vocab_text.extend(frame.context.unique())
    vocab_text.extend(frame.question.unique())

In [None]:
# Tokenise the whole corpus and build the word <-> index mappings.
# nlp.pipe batches the texts instead of invoking the pipeline once per string;
# the same components are disabled as before, so the tokens are unchanged.
docs = nlp.pipe(vocab_text, disable=['parser', 'tagger', 'ner'])
words = [token.text for doc in tqdm(docs, total=len(vocab_text)) for token in doc]
word_counter = Counter(words)
# Most frequent first; sorted() is stable, so ties keep first-seen order.
word_vocab = sorted(word_counter, key=word_counter.get, reverse=True)
print(f"raw-vocab: {len(word_vocab)}")
# Reserve index 0 for unknown words and index 1 for padding.
word_vocab.insert(0, '<unk>')
word_vocab.insert(1, '<pad>')
print(f"vocab-length: {len(word_vocab)}")
word2idx = {word: idx for idx, word in enumerate(word_vocab)}
# If this differs from vocab-length, a special token also occurred in the text.
print(f"word2idx-length: {len(word2idx)}")
idx2word = {idx: word for word, idx in word2idx.items()}

In [None]:
import pickle

# Checkpoint the word -> index mapping so later cells can resume without re-tokenising.
stoi_path = '/scratch/arjunth2001/drqa/drqastoi.pickle'
with open(stoi_path, 'wb') as stoi_file:
    pickle.dump(word2idx, stoi_file)

In [None]:
import pickle

# Resume point: restore the word -> index mapping saved by this notebook.
# NOTE: pickle.load can execute arbitrary code; only load files you wrote yourself.
with open('/scratch/arjunth2001/drqa/drqastoi.pickle','rb') as handle:
    word2idx = pickle.load(handle)

# Rebuild the inverse mapping too, so the cells below that need idx2word also
# work when the kernel was restarted and the vocabulary cell was skipped.
idx2word = {idx: word for word, idx in word2idx.items()}

In [None]:
# Apply text_to_ids to every context, passing the vocabulary mapping and the spaCy
# pipeline, and store the result in a new `context_ids` column.
# `%time` reports how long each statement takes; `progress_apply` shows a tqdm bar.
# (presumably text_to_ids returns the token ids of the text -- confirm in defs/utils)
%time train_df['context_ids'] = train_df.context.progress_apply(text_to_ids, word2idx=word2idx, nlp=nlp)
%time valid_df['context_ids'] = valid_df.context.progress_apply(text_to_ids, word2idx=word2idx, nlp=nlp)

# Same conversion for the questions, stored in `question_ids`.
%time train_df['question_ids'] = train_df.question.progress_apply(text_to_ids,  word2idx=word2idx, nlp=nlp)
%time valid_df['question_ids'] = valid_df.question.progress_apply(text_to_ids,  word2idx=word2idx, nlp=nlp)

In [None]:
# Ask get_error_indices which rows are problematic in each split, then remove
# those rows from the frames in place.
# (presumably rows whose answer cannot be located in the tokenised context -- confirm in defs/utils)
train_err, valid_err = (
    get_error_indices(frame, idx2word, nlp)
    for frame in (train_df, valid_df)
)
train_df.drop(index=train_err, inplace=True)
valid_df.drop(index=valid_err, inplace=True)

In [None]:
# Run index_answer row by row (axis=1) on both splits and keep the result
# in a `label_idx` column of the corresponding frame.
answer_kwargs = {'axis': 1, 'idx2word': idx2word, 'nlp': nlp}
train_label_idx = train_df.progress_apply(index_answer, **answer_kwargs)
valid_label_idx = valid_df.progress_apply(index_answer, **answer_kwargs)
train_df['label_idx'], valid_df['label_idx'] = train_label_idx, valid_label_idx

In [None]:
import pickle

# Write the final artefacts: the vocabulary mapping and both processed frames.
output_dir = '/scratch/arjunth2001/drqa/'
with open(output_dir + 'drqastoi.pickle', 'wb') as stoi_file:
    pickle.dump(word2idx, stoi_file)

train_df.to_pickle(output_dir + 'drqatrain.pkl')
valid_df.to_pickle(output_dir + 'drqavalid.pkl')

In [None]:
import numpy as np

# Load the pretrained GloVe vectors into a {token: float32 vector} dict.
embedding_dim = 300
glove_dict = {}
with open("/scratch/arjunth2001/glove.840B.300d.txt", "r", encoding="utf-8") as f:
    for line in tqdm(f):
        # Split from the right: a few tokens in the 840B file contain spaces
        # (e.g. ". . ."), so a plain split(' ') would feed token fragments to
        # the float conversion and raise ValueError.
        values = line.rstrip('\n').rsplit(' ', embedding_dim)
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        glove_dict[word] = vector

In [None]:
# Embedding matrix aligned with word_vocab: row i holds the GloVe vector of
# word_vocab[i]. Words without a pretrained vector (including the special
# tokens) keep an all-zero row.
weights_matrix = np.zeros((len(word_vocab), 300))
for i, word in enumerate(word_vocab):
    # Only a missing word is skipped; any other problem (e.g. a vector of the
    # wrong length) now surfaces instead of being silently swallowed.
    vector = glove_dict.get(word)
    if vector is not None:
        weights_matrix[i] = vector

In [None]:
# Store the embedding matrix on disk in NumPy's .npy format.
np.save(file='/scratch/arjunth2001/drqa/drqaglove_vt.npy', arr=weights_matrix)