In [1]:
import json
import nltk
import collections
import fastText
import fastText_tokenize
import re
import gzip
import random
import os
import shutil
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook
from intervaltree import Interval, IntervalTree

  from ._conv import register_converters as _register_converters


In [3]:
# Load the parsed SQuAD v1.1 train and dev JSON files.
# The encoding is passed explicitly because the default encoding of a text-mode
# open() depends on the platform/locale, while the JSON text is UTF-8.
with open('../../data/SQuAD/train-v1.1.json', 'rt', encoding = 'utf-8') as f:
    train_json = json.load(f)
with open('../../data/SQuAD/dev-v1.1.json', 'rt', encoding = 'utf-8') as f:
    dev_json = json.load(f)

# Vocab / Embeddings

In [4]:
# Count token occurrences over both splits. For every paragraph the context is
# counted first and then each of its questions, in file order.
counter = collections.Counter()
all_pages = train_json['data'] + dev_json['data']
for page in tqdm_notebook(all_pages):
    for para in page['paragraphs']:
        texts = [para['context']] + [qa['question'] for qa in para['qas']]
        for text in texts:
            counter.update(fastText_tokenize.word_tokenize(text))




In [5]:
# Id 0 is the '<NULL>' entry; the counted words follow in most_common() order.
id_to_word = ['<NULL>']
id_to_word.extend(word for word, _ in counter.most_common())

# Inverse mapping from word to its position in id_to_word.
word_to_id = dict(zip(id_to_word, range(len(id_to_word))))

In [11]:
# Load the fastText binary model used to look up word vectors below.
fasttext_model_path = '../../data/SQuAD/wiki.en.bin'
fasttext_model = fastText.FastText.load_model(fasttext_model_path)

In [14]:
# Split the counted words by whether the fastText model reports a non-negative
# word id for them (negative ids are treated as out-of-vocabulary here).
in_vocab, out_of_vocab = [], []
for word in counter.keys():
    known = fasttext_model.get_word_id(word) >= 0
    (in_vocab if known else out_of_vocab).append(word)
len(in_vocab), len(out_of_vocab)

(80727, 9724)

In [15]:
# Row i holds the fastText vector for id_to_word[i]; row 0 (the '<NULL>' entry)
# is never assigned and therefore stays all zeros.
embeddings = np.zeros([len(id_to_word), 300])
for word_id, word in enumerate(id_to_word[1:], start = 1):
    embeddings[word_id, :] = fasttext_model.get_word_vector(word)

In [17]:
# Persist the embedding matrix in .npy format through a gzip stream.
embeddings_path = '../../data/SQuAD/data_1.vocab.embeddings.npy.gz'
with gzip.open(embeddings_path, 'wb') as embeddings_file:
    np.save(embeddings_file, embeddings)

In [18]:
# Write the vocabulary one word per line, so that line number == word id.
# The encoding is explicit: the default for text-mode open() is locale-dependent
# and may fail on (or mis-encode) non-ASCII tokens.
with open('../../data/SQuAD/data_1.vocab.txt', 'wt', encoding = 'utf-8') as f:
    for word in id_to_word:
        print(word, file = f)

# Examples

In [6]:
def generate_examples(json):
    """Lazily yield every example of a parsed dataset, page by page.

    Iterates over json['data'] (wrapped in a notebook progress bar) and
    delegates each page to generate_examples_page.

    Note: the parameter name shadows the imported `json` module inside this
    function; it is kept as-is for interface compatibility.
    """
    pages = tqdm_notebook(json['data'])
    for page in pages:
        yield from generate_examples_page(page)

def generate_examples_page(page):
    """Yield the examples of every paragraph in `page`.

    Each paragraph in page['paragraphs'] is handed to generate_examples_para
    together with the page's title.
    """
    for paragraph in page['paragraphs']:
        yield from generate_examples_para(paragraph, page['title'])
        
def generate_examples_para(para, title):
    """Yield one example dict per question of a paragraph.

    Args:
        para: paragraph dict with a 'context' string and a 'qas' list; each qa
            has 'question', 'id' and 'answers' (each answer has 'answer_start'
            and 'text').
        title: title of the page the paragraph belongs to.

    Yields:
        dict with keys:
            'id'       - the question id,
            'title'    - the given title,
            'context'  - word ids of the context tokens,
            'question' - word ids of the question tokens,
            'answers'  - list of (first_token_index, last_token_index) pairs,
                         both inclusive, one per answer.
        The same 'context' list object is shared by all examples of a paragraph.

    Raises:
        KeyError: if a token is missing from word_to_id.
        ValueError: if an answer span does not overlap any context token.
    """
    # tokenize context; each item is (token, (start, end))
    # (the spans are assumed to be character offsets into the context text,
    #  since they are matched against 'answer_start' below -- TODO confirm)
    context_text = para['context']
    context_tokens = list(fastText_tokenize.full_tokenize(context_text))

    # index context spans so answer spans can be mapped to token indices
    context_spans = IntervalTree()
    for token_index, (_, (start, end)) in enumerate(context_tokens):
        context_spans[start:end] = token_index

    # convert context
    context = [word_to_id[token] for token, _ in context_tokens]

    # questions
    for qa in para['qas']:
        # grab question text/id
        question_text = qa['question']
        question_id = qa['id']

        # convert question
        question = [
            word_to_id[token]
            for token, _ in fastText_tokenize.full_tokenize(question_text)]

        # convert answers
        answers = []
        for answer in qa['answers']:
            # grab answer
            answer_start = answer['answer_start']
            answer_end = answer_start + len(answer['text'])

            # convert to token indices; query the tree once and reuse the result
            token_indices = [
                interval.data
                for interval in context_spans[answer_start:answer_end]]
            if not token_indices:
                raise ValueError(
                    'answer span [%d, %d) of question %r overlaps no context token'
                    % (answer_start, answer_end, question_id))

            answers.append((min(token_indices), max(token_indices)))

        yield {
            'id': question_id,
            'title': title,
            'context': context,
            'question': question,
            'answers': answers }

In [None]:
# Materialize the examples of both splits as lists (train first, then dev).
train_examples, dev_examples = [
    list(generate_examples(split_json))
    for split_json in (train_json, dev_json)
]

In [None]:
# Display the number of generated training examples.
len(train_examples)

In [21]:
# Shuffle both splits in place with a fixed seed so that the order of the
# examples written to disk is reproducible from a fresh kernel. A private
# Random instance is used so the global random state is left untouched.
SHUFFLE_SEED = 0
shuffle_rng = random.Random(SHUFFLE_SEED)
shuffle_rng.shuffle(train_examples)
shuffle_rng.shuffle(dev_examples)

In [22]:
def convert_to_tfrecord(example):
    """Build a tf.train.Example from an example dict.

    'id' and 'title' are stored as UTF-8 byte strings; 'context' and
    'question' as int64 lists. The (start, end) pairs in 'answers' are split
    into two parallel int64 lists, 'answer_starts' and 'answer_ends'.
    """
    def bytes_feature(text):
        # single UTF-8 encoded string
        return tf.train.Feature(bytes_list = tf.train.BytesList(value = [bytes(text, 'utf-8')]))

    def int64_feature(values):
        # list of integers
        return tf.train.Feature(int64_list = tf.train.Int64List(value = values))

    feature = {
        'id': bytes_feature(example['id']),
        'title': bytes_feature(example['title']),
        'context': int64_feature(example['context']),
        'question': int64_feature(example['question']),
        'answer_starts': int64_feature([span[0] for span in example['answers']]),
        'answer_ends': int64_feature([span[1] for span in example['answers']]),
    }
    return tf.train.Example(features = tf.train.Features(feature = feature))

def write_examples(examples, path, batch_size = 1000):
    """Write `examples` into `path` as gzip-compressed TFRecord files.

    Any existing directory at `path` is removed first and then recreated.
    Examples are written in consecutive slices of `batch_size`, one file per
    slice, named 'examples.<offset>.tfrecords.gz' where <offset> is the
    zero-padded index of the slice's first example.
    """
    # start from a clean, existing directory
    shutil.rmtree(path, ignore_errors = True)
    os.makedirs(path, exist_ok = True)

    # the same compression options apply to every file
    gzip_options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.GZIP)

    # one file per slice of examples
    for start in tqdm_notebook(range(0, len(examples), batch_size)):
        shard_name = 'examples.%010d.tfrecords.gz' % start
        shard_path = os.path.join(path, shard_name)
        with tf.python_io.TFRecordWriter(shard_path, options = gzip_options) as writer:
            for example in examples[start:start + batch_size]:
                writer.write(convert_to_tfrecord(example).SerializeToString())

In [23]:
# Write each split to its own output directory (train first, then dev).
for split_examples, split_dir in (
        (train_examples, '../../data/SQuAD/data_1.train'),
        (dev_examples, '../../data/SQuAD/data_1.dev')):
    write_examples(split_examples, split_dir)







In [24]:
# Longest context (in tokens) across both splits.
max(
    len(example['context'])
    for split in (train_examples, dev_examples)
    for example in split)

846

In [25]:
# Longest question (in tokens) across both splits.
max(
    len(example['question'])
    for split in (train_examples, dev_examples)
    for example in split)

60

In [26]:
# Largest number of answers attached to a single example across both splits.
max(
    len(example['answers'])
    for split in (train_examples, dev_examples)
    for example in split)

6