In [155]:
import json
import nltk
import collections
import fastText
import re
import numpy as np
import gzip
from tqdm import tqdm_notebook

In [56]:
# Load the SQuAD v1.1 train and dev files into plain Python objects.
# JSON text is UTF-8, so the encoding is passed explicitly instead of relying
# on the platform's default locale encoding (which is not UTF-8 everywhere).
with open('../data/SQuAD/train-v1.1.json', 'rt', encoding='utf-8') as f:
    train_json = json.load(f)
with open('../data/SQuAD/dev-v1.1.json', 'rt', encoding='utf-8') as f:
    dev_json = json.load(f)

In [107]:
def word_tokenize(text):
    """Lazily yield the lower-cased sub-tokens of ``text``.

    ``text`` is first tokenized with ``nltk.word_tokenize``. Every resulting
    token is then split again on each non-word character and on each character
    in the range U+4E00-U+9FA5. Because the split pattern is a capturing group,
    the separator characters themselves are emitted as tokens too. Empty
    pieces produced by the split are dropped.
    """
    for token in nltk.word_tokenize(text):
        pieces = re.split(r'([^\w0-9]|[\u4E00-\u9FA5])', token)
        # re.split leaves empty strings around adjacent/leading/trailing
        # separators; skip those and normalise case on the rest.
        yield from (piece.lower() for piece in pieces if piece)

In [108]:
# Count token frequencies over every paragraph context and every question in
# both the train and dev data. Within a paragraph the context is counted
# before its questions; this order is kept because Counter.most_common()
# orders equal counts by first insertion.
counter = collections.Counter()
articles = train_json['data'] + dev_json['data']
for article in tqdm_notebook(articles):
    for paragraph in article['paragraphs']:
        texts = [paragraph['context']]
        texts.extend(qa['question'] for qa in paragraph['qas'])
        for text in texts:
            counter.update(word_tokenize(text))




In [109]:
in_vocab = [word for word in counter.keys() if fasttext_model.get_word_id(word) >= 0]
out_of_vocab = [word for word in counter.keys() if fasttext_model.get_word_id(word) < 0]
len(in_vocab), len(out_of_vocab)

(79677, 8115)

In [15]:
fasttext_model = fastText.FastText.load_model('/home/achang/Downloads/wiki.en.bin')

In [147]:
# Word ids follow the order returned by counter.most_common(): id 0 is the
# first (most frequent) word. word_to_id is the inverse lookup of id_to_word.
id_to_word = [entry[0] for entry in counter.most_common()]
word_to_id = dict(zip(id_to_word, range(len(id_to_word))))

In [151]:
# Build the embedding matrix: row i holds the fastText vector for id_to_word[i].
# The width is read from the loaded model instead of being hard-coded to 300,
# so a model of a different dimensionality no longer fails on assignment.
embedding_dim = fasttext_model.get_dimension()
embeddings = np.zeros([len(id_to_word), embedding_dim])
for i, word in enumerate(id_to_word):
    embeddings[i, :] = fasttext_model.get_word_vector(word)

In [156]:
# Persist the embedding matrix in NumPy's .npy format, written through a gzip
# stream so the file on disk is compressed.
with gzip.open(filename='../data/SQuAD/vocab_1.embeddings.npy.gz', mode='wb') as gz_out:
    np.save(file=gz_out, arr=embeddings)

In [157]:
# Write the vocabulary as one word per line, in id order (same order as
# id_to_word). The encoding is explicit because the tokenizer can emit
# non-ASCII tokens, which a non-UTF-8 default locale may fail to encode.
with open('../data/SQuAD/vocab_1.txt', 'wt', encoding='utf-8') as f:
    for word in id_to_word:
        print(word, file=f)