In [21]:
import json
import gzip
import nltk.data
import nltk.corpus
import nltk.stem
import re
import collections
import random
import hashlib
import os
import shutil
import tensorflow as tf
import numpy as np
from tqdm import tqdm_notebook

In [2]:
# Read the whole gzip-compressed JSON dump into memory as `wiki`.
with gzip.open(
    '../data/simplewiki/simplewiki-20171103.parsed.norm.json.gz',
    mode='rt',
    encoding='utf-8',
) as f:
    wiki = json.loads(f.read())

In [3]:
def compute_degrees(wiki):
    """Count incoming and outgoing links for every page.

    Args:
        wiki: mapping whose values are page dicts with an 'id' and a 'links'
            list; every link dict has a 'target' that is itself a key of
            `wiki`.

    Returns:
        A pair `(in_degrees, out_degrees)` of lists.
        `in_degrees[i]` is the number of links pointing at the page with ID `i`,
        for `i` in `range(number of pages)`.
        `out_degrees` holds the number of links of each page, with pages taken
        in ascending ID order.
        NOTE: the two lists only line up if page IDs are exactly 0..N-1.
    """
    # pages in ascending ID order give the out-degree list directly
    ordered_pages = sorted(wiki.values(), key=lambda entry: entry['id'])
    out_degrees = [len(entry['links']) for entry in ordered_pages]

    # tally how often each page ID appears as a link target
    incoming = collections.Counter()
    for source in tqdm_notebook(wiki.values(), leave=False):
        incoming.update(wiki[link['target']]['id'] for link in source['links'])

    # a Counter reports 0 for IDs that were never targeted
    in_degrees = [incoming[page_id] for page_id in range(len(ordered_pages))]

    return in_degrees, out_degrees

In [4]:
# Keep the 10,000 non-empty pages with the most incoming links (highest in-degree first).
in_degrees, out_degrees = compute_degrees(wiki)
non_empty_pages = filter(lambda candidate: len(candidate['text']) > 0, wiki.values())
top_10k_pages = sorted(
    non_empty_pages,
    key=lambda candidate: in_degrees[candidate['id']],
    reverse=True,
)[:10000]



In [5]:
# Shared NLTK resources used by word_tokenize(); built once because they are reused for every page.
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.SnowballStemmer(language='english')
stopwords = set(nltk.corpus.stopwords.words('english'))

def word_tokenize(text):
    """Yield the normalized content words of `text`, in order.

    Each token produced by `nltk.word_tokenize` is dropped if it is a
    stopword, then lemmatized and stemmed, and finally dropped unless the
    result starts with a lowercase ASCII letter.

    Args:
        text: the raw text to tokenize.

    Yields:
        The lemmatized and stemmed form of every kept token.
    """
    for token in nltk.word_tokenize(text):
        # skip stopwords; the tokenizer keeps the original case while the
        # stopword list is lowercase, so compare case-insensitively --
        # otherwise "The" slips through and is emitted as "the" after stemming
        if token.lower() in stopwords:
            continue

        # apply lemmatization, then stemming
        word = stemmer.stem(lemmatizer.lemmatize(token))

        # skip non-words (anything not starting with a lowercase letter,
        # e.g. numbers and punctuation)
        if not re.match(r'[a-z]', word):
            continue

        yield word

In [12]:
# Per-page term frequencies: one (page ID, Counter of normalized words) pair per selected page.
page_tfs = [
    (page['id'], collections.Counter(word_tokenize(page['text'])))
    for page in tqdm_notebook(top_10k_pages, leave=False)
]



In [13]:
# Corpus-wide term frequencies: the sum of all per-page counters.
wiki_tfs = collections.Counter()
for page_entry in tqdm_notebook(page_tfs, leave=False):
    # page_entry is (page ID, Counter); only the counts are needed here
    wiki_tfs.update(page_entry[1])



In [15]:
# Vocabulary of the 10,000 most frequent words; a word's ID is its frequency rank (0 = most frequent).
id_to_word_10k = [term for term, _ in wiki_tfs.most_common(10000)]
word_to_id_10k = {term: term_id for term_id, term in enumerate(id_to_word_10k)}

In [36]:
def compute_word_idfs(page_tfs, words):
    """Compute the inverse document frequency of every word in `words`.

    The IDF of a word is `-log(df / N)`, where `df` is the number of distinct
    page IDs whose counter contains the word and `N` is `len(page_tfs)`.

    Args:
        page_tfs: sequence of `(page_id, counter)` pairs, `counter` mapping
            words to their frequency on that page.
        words: the words to score.

    Returns:
        A list of IDF values, one per entry of `words`, in the same order.
        A word that occurs on no page yields `inf` (numpy warns about log(0)).
    """
    vocabulary = frozenset(words)

    # word -> IDs of the pages that contain it (restricted to the vocabulary)
    containing_pages = {}
    for page_id, term_counts in page_tfs:
        for term in vocabulary.intersection(term_counts.keys()):
            containing_pages.setdefault(term, set()).add(page_id)

    # one IDF per requested word, aligned with `words`
    return [
        -np.log(len(containing_pages.get(term, ())) / len(page_tfs))
        for term in words
    ]

In [37]:
# IDF of every vocabulary word, in the same order as id_to_word_10k (so indexable by word ID).
word_idfs = compute_word_idfs(page_tfs=page_tfs, words=id_to_word_10k)

In [41]:
def compute_tf_idf_vector(counter):
    """Turn one page's word counts into a sparse TF-IDF vector.

    Term frequency is the word's count divided by the total count of all
    words in `counter` (including words outside the vocabulary). Words
    missing from `word_to_id_10k` are left out of the result.

    Args:
        counter: mapping from word to its frequency on the page.

    Returns:
        A pair of parallel lists `(word_ids, word_tf_idfs)`, sorted by
        ascending word ID.
    """
    total = sum(counter.values())

    scored = []
    for term, count in counter.items():
        term_id = word_to_id_10k.get(term)
        if term_id is not None:
            # TF (relative frequency on this page) times the word's IDF
            scored.append((term_id, (count / total) * word_idfs[term_id]))

    # order by word ID, then split into the two parallel lists
    scored = sorted(scored)
    return [term_id for term_id, _ in scored], [weight for _, weight in scored]

In [49]:
# Sparse TF-IDF vector of every page as a (page ID, word IDs, TF-IDF weights) triple.
page_tf_idfs = []
for page_id, page_counter in page_tfs:
    page_tf_idfs.append((page_id,) + compute_tf_idf_vector(page_counter))

In [51]:
# Shuffle with a fixed seed so the train/dev/test membership is the same on
# every run; an unseeded shuffle silently reassigns pages between the splits
# each time the notebook is re-executed. A private Random instance is used so
# the global `random` state is left untouched.
split_seed = 0

shuffled = list(page_tf_idfs)
random.Random(split_seed).shuffle(shuffled)

test_set_size = 300
dev_set_size = 300

# first the test pages, then the dev pages, everything else is training data
test_set  = shuffled[:test_set_size]
dev_set   = shuffled[test_set_size:dev_set_size+test_set_size]
train_set = shuffled[dev_set_size+test_set_size:]

len(test_set), len(dev_set), len(train_set)

(300, 300, 9400)

In [59]:
def write_tfrecords(page_tf_idfs, directory, batch_size = 10000):
    """Write page TF-IDF vectors to gzip-compressed TFRecord files.

    `directory` is deleted (if present) and recreated, then the examples are
    written in consecutive batches, one file per batch, named
    `examples.<index of first example, 10 digits>.tfrecords.gz`.

    Args:
        page_tf_idfs: sequence of `(page_id, word_ids, word_tf_idfs)` triples.
        directory: output directory; any existing content is removed.
        batch_size: maximum number of examples per file.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    # validate before touching the file system so a bad call cannot wipe
    # an existing data set and then write nothing
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got %r' % (batch_size,))

    # delete directory
    shutil.rmtree(directory, ignore_errors = True)

    # make directory
    os.makedirs(directory, exist_ok = True)

    # the compression options are the same for every file
    options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.GZIP)

    # write all batches; the range is derived from the argument itself, not
    # from the notebook-level `page_tfs`, so small splits do not produce
    # trailing empty files and large ones are not truncated
    for i in tqdm_notebook(range(0, len(page_tf_idfs), batch_size), leave = False):
        batch = page_tf_idfs[i:i + batch_size]

        # write a single batch
        with tf.python_io.TFRecordWriter('%s/examples.%010d.tfrecords.gz' % (directory, i), options = options) as writer:
            for page_id, word_ids, word_tf_idfs in batch:
                # create protobuf
                example = tf.train.Example(features = tf.train.Features(feature = {
                    'page_id': tf.train.Feature(int64_list = tf.train.Int64List(value = [page_id])),
                    'word_ids': tf.train.Feature(int64_list = tf.train.Int64List(value = word_ids)),
                    'word_tf_idfs': tf.train.Feature(float_list = tf.train.FloatList(value = word_tf_idfs)),
                }))

                # write protobuf
                writer.write(example.SerializeToString())

In [60]:
# Write every split to its own TFRecord directory (each directory is recreated from scratch).
for split_examples, split_directory in (
    (test_set, '../data/simplewiki/simplewiki-20171103.autoencoder_1.test'),
    (dev_set, '../data/simplewiki/simplewiki-20171103.autoencoder_1.dev'),
    (train_set, '../data/simplewiki/simplewiki-20171103.autoencoder_1.train'),
):
    write_tfrecords(split_examples, split_directory)





