In [96]:
import nltk
import re
import gzip
import json
import collections
import tensorflow as tf
import random
from tqdm import tqdm_notebook
from intervaltree import Interval, IntervalTree

In [2]:
# Load the parsed, normalised Simple Wikipedia dump into memory.
# The archive is gzip-compressed JSON, opened in text mode as UTF-8.
with gzip.open('../data/simplewiki/simplewiki-20171103.parsed.norm.json.gz', mode='rt', encoding='utf-8') as wiki_file:
    wiki = json.loads(wiki_file.read())

In [38]:
def align_tokens(tokens, text):
    """Locate each token in ``text`` and return its character span.

    Tokens are searched for left to right, each search starting where the
    previous match ended, so repeated tokens map to successive occurrences.

    Treebank-style quote tokens (two backticks or two apostrophes) usually
    stand for a plain double quote in the raw text, but the raw text may also
    contain them literally. For such tokens all three surface forms are tried
    and the earliest occurrence at or after the cursor is used.

    Args:
        tokens: iterable of token strings, in the order they occur in ``text``.
        text: the string the tokens were produced from.

    Returns:
        A list of ``(start, end)`` offsets into ``text``, one per token, with
        ``end`` exclusive.

    Raises:
        ValueError: if a token cannot be found at or after the cursor.
    """
    quote_tokens = ('``', "''")
    quote_forms = ('"', '``', "''")

    point = 0
    offsets = []
    for token in tokens:
        # A quote token may appear in the text under any of its surface forms;
        # every other token must match verbatim.
        candidates = quote_forms if token in quote_tokens else (token,)

        matches = []
        for candidate in candidates:
            start = text.find(candidate, point)
            if start != -1:
                matches.append((start, len(candidate)))
        if not matches:
            raise ValueError('substring "{}" not found in "{}"'.format(token, text))

        # The surface forms start with different characters, so two of them
        # can never match at the same offset; min() picks the earliest match.
        start, length = min(matches)
        point = start + length
        offsets.append((start, point))
    return offsets

def span_tokenize(text):
    """Tokenize ``text`` with NLTK and return the ``(start, end)`` span of each token."""
    tokens = nltk.word_tokenize(text)
    return align_tokens(tokens, text)

def word_tokenize(text):
    """Tokenize ``text`` with NLTK, mapping Treebank quote tokens back to ``"``.

    Returns a generator over the token strings. Tokenization itself happens
    when this function is called; only the quote normalization is lazy.
    """
    quote_tokens = ('``', "''")
    tokens = nltk.word_tokenize(text)
    return ('"' if token in quote_tokens else token for token in tokens)

In [32]:
# Count how often every (quote-normalised) token occurs across all pages.
word_freqs = collections.Counter()
for page in tqdm_notebook(wiki.values()):
    page_tokens = word_tokenize(page['text'])
    word_freqs.update(page_tokens)




Exception in thread Thread-9:
Traceback (most recent call last):
  File "/home/achang/anaconda3/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/achang/anaconda3/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [81]:
# Vocabulary: the 29998 most frequent tokens followed by two special entries,
# '<UNK>' and '<OOB>' (the latter is the padding token emitted for context
# positions that fall outside the page).
id_to_word_30k = [entry[0] for entry in word_freqs.most_common(29998)]
id_to_word_30k.extend(['<UNK>', '<OOB>'])

# Inverse mapping: token -> position in id_to_word_30k.
word_to_id_30k = {word: word_id for word_id, word in enumerate(id_to_word_30k)}

In [115]:
def generate_examples_with_words(page, context_width):
    """Yield one training example per (token, overlapping link) pair on a page.

    Args:
        page: dict with keys ``'id'``, ``'links'`` and ``'text'``; every link
            is a dict with ``'start'``, ``'end'`` and ``'target'``.
        context_width: number of tokens taken on each side of the centre token.

    Yields:
        ``(page_id, target, context)`` tuples where ``context`` is a list of
        ``2 * context_width + 1`` strings centred on the token. Positions that
        fall before the first or after the last token are filled with
        ``'<OOB>'``.
    """
    source_id = page['id']
    links = page['links']
    text = page['text']

    # Index the link spans so each token can be matched against them with an
    # interval query.
    link_index = IntervalTree()
    for link in links:
        link_index[link['start']:link['end']] = link['target']

    spans = span_tokenize(text)
    token_count = len(spans)
    for centre, (begin, end) in enumerate(spans):
        for hit in link_index[begin:end]:
            # A fresh window is built for every hit so that yielded lists are
            # never shared between examples.
            window = [
                text[spans[pos][0]:spans[pos][1]] if 0 <= pos < token_count else '<OOB>'
                for pos in range(centre - context_width, centre + context_width + 1)
            ]
            yield (source_id, hit.data, window)

def generate_examples_with_ids(page, context_width):
    """Yield the examples of ``generate_examples_with_words`` with ids in place of strings.

    The link target is replaced by ``wiki[target]['id']`` and every context
    word by its index in ``word_to_id_30k``; words missing from the vocabulary
    are encoded as ``'<UNK>'``. Examples whose centre word (the token at
    position ``context_width``) is not in the vocabulary are skipped.

    Args:
        page: page dict as accepted by ``generate_examples_with_words``.
        context_width: number of context tokens on each side of the centre.

    Yields:
        ``(page_id, target_id, word_ids)`` tuples.

    Raises:
        KeyError: if a link target is not a key of ``wiki``.
    """
    unk_id = word_to_id_30k['<UNK>']
    for page_id, target, words in generate_examples_with_words(page, context_width):
        target_id = wiki[target]['id']
        target_word = words[context_width]

        if target_word not in word_to_id_30k:
            continue

        # Use the default argument of dict.get rather than `get(...) or unk`:
        # word id 0 is falsy, so the `or` form silently turned the most
        # frequent word into <UNK>.
        word_ids = [word_to_id_30k.get(word, unk_id) for word in words]
        yield (page_id, target_id, word_ids)

In [117]:
# Collect the id-encoded examples of every page, using 40 context tokens on
# each side of the linked token.
examples = []
for page in tqdm_notebook(wiki.values()):
    examples += generate_examples_with_ids(page, 40)




Exception in thread Thread-11:
Traceback (most recent call last):
  File "/home/achang/anaconda3/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/home/achang/anaconda3/lib/python3.5/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/achang/anaconda3/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [118]:
# Shuffle with a dedicated fixed-seed generator so that the dev/test/train
# split taken from this list does not change on every run.
# NOTE(review): this is only fully reproducible if `examples` is built in the
# same order each time, which depends on the iteration order of `wiki`.
random.Random(0).shuffle(examples)

In [119]:
# The first 30000 shuffled examples form the dev set, the next 30000 the test
# set, and everything after that the training set.
dev_set_size = test_set_size = 30000

dev_set, test_set, train_set = (
    examples[:dev_set_size],
    examples[dev_set_size:dev_set_size + test_set_size],
    examples[dev_set_size + test_set_size:],
)

# Display the three split sizes as the cell output.
tuple(map(len, (dev_set, test_set, train_set)))

(30000, 30000, 1323897)

In [132]:
def write_tfrecords(examples, filename):
    """Serialise examples to a GZIP-compressed TFRecord file.

    Args:
        examples: iterable of ``(page_id, target_id, word_ids)`` tuples, where
            ``word_ids`` is a sequence of ints.
        filename: path of the TFRecord file to write.

    Each example becomes one ``tf.train.Example`` with three int64-list
    features: ``'page_id'``, ``'target_id'`` and ``'word_ids'``.
    """
    def int64_feature(values):
        # Wrap a sequence of ints in an int64-list Feature.
        return tf.train.Feature(int64_list = tf.train.Int64List(value = values))

    gzip_options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.GZIP)
    with tf.python_io.TFRecordWriter(filename, options = gzip_options) as writer:
        for page_id, target_id, word_ids in tqdm_notebook(examples):
            features = tf.train.Features(feature = {
                'page_id': int64_feature([page_id]),
                'target_id': int64_feature([target_id]),
                'word_ids': int64_feature(word_ids),
            })
            record = tf.train.Example(features = features)
            writer.write(record.SerializeToString())

In [133]:
# Write each split to its own TFRecord file next to the source dump.
for split_examples, split_path in (
    (dev_set, '../data/simplewiki/simplewiki-20171103.entity_linking.dev.tfrecords'),
    (test_set, '../data/simplewiki/simplewiki-20171103.entity_linking.test.tfrecords'),
    (train_set, '../data/simplewiki/simplewiki-20171103.entity_linking.train.tfrecords'),
):
    write_tfrecords(split_examples, split_path)










In [131]:
# Exploratory cell: builds a TFRecordOptions object with GZIP compression and
# displays it; nothing else in the notebook uses the result.
# NOTE(review): the output captured below looks like a dir() listing of the
# compression-type class rather than the repr of this expression, so it is
# presumably stale from an earlier edit of this cell; confirm or clear it.
tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.GZIP)

['GZIP',
 'NONE',
 'ZLIB',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__']