In [1]:
import tensorflow as tf
import json
import gzip
import numpy as np
from tqdm import tqdm_notebook

In [2]:
# Handle to the current TensorFlow session. It starts out empty; reset_tf(sess) only closes
# the previous session when this is truthy.
sess = None

In [3]:
def reset_tf(sess = None, log_device_placement = False):
    """Throw away the current TensorFlow state and open a fresh interactive session.

    Args:
        sess: a previously opened session, or None. It is closed first when truthy.
        log_device_placement: forwarded to tf.ConfigProto.

    Returns:
        A new tf.InteractiveSession created after the default graph was reset.
    """
    if sess:
        sess.close()

    # start from an empty default graph with a fixed graph-level random seed
    tf.reset_default_graph()
    tf.set_random_seed(0)

    session_config = tf.ConfigProto(log_device_placement = log_device_placement)
    return tf.InteractiveSession(config = session_config)

In [4]:
class HyperParameters:
    """Tunable settings for the model and its input pipeline, kept as class attributes."""

    # --- optimisation -------------------------------------------------------
    # learning rate for Adam
    learning_rate = 1e-3

    # --- model dimensions ---------------------------------------------------
    # number of distinct terms; term indices are expected to lie in [0, vocab_size)
    vocab_size = 30000

    # width of the hidden layers
    hidden_size = 512

    # width of the document embedding
    embedding_size = 128

    # dropout rate
    # NOTE(review): confirm this is still consumed somewhere; it may be a leftover setting.
    dropout_rate = 0.1

    # --- data pipeline ------------------------------------------------------
    # number of sequences per batch
    pipeline_batch_size = 32

    # number of parsing threads
    pipeline_num_parallel_calls = 4

    # prefetch size, expressed as 16 batches' worth
    pipeline_prefetch_size = 16 * pipeline_batch_size

    # shuffle buffer size
    pipeline_shuffle_size = 256

hp = HyperParameters()

In [5]:
# Start from a clean slate: reset_tf closes the previous session (if any), resets the default
# graph and returns a new interactive session, so this cell can be re-run to rebuild the model.
sess = reset_tf(sess)

# Pipeline
# --------

# TODO: use SparseTensor / don't use dataset API for speed?

def parse_example(example_proto):
    """Decode one serialized example into (page_id, para_id, term-frequency vector).

    The record carries two fixed-length int64 ids plus two variable-length int64
    lists, 'indices' and 'freqs'. The frequencies are scattered into a vector of
    length hp.vocab_size at the given indices and cast to float32.

    Args:
        example_proto: a serialized example, as handed over by Dataset.map.

    Returns:
        Tuple (page_id, para_id, tf_vector); both ids have shape [1].
    """
    feature_spec = {
        'page_id': tf.FixedLenFeature([1], dtype=tf.int64),
        'para_id': tf.FixedLenFeature([1], dtype=tf.int64),
        'indices': tf.VarLenFeature(tf.int64),
        'freqs': tf.VarLenFeature(tf.int64)
    }
    record = tf.parse_single_example(example_proto, feature_spec)

    # variable-length features come back sparse; flatten them to plain 1-D tensors
    term_indices = tf.sparse_tensor_to_dense(record['indices'])
    term_freqs = tf.sparse_tensor_to_dense(record['freqs'])

    # place each frequency at its term index inside a vocabulary-sized vector
    dense_counts = tf.sparse_to_dense(term_indices, [hp.vocab_size], term_freqs)
    tf_vector = tf.cast(dense_counts, tf.float32)

    return record['page_id'], record['para_id'], tf_vector

# The list of TFRecord files is a placeholder: it is fed when the iterator is initialized,
# so the same graph can be pointed at different files.
dataset_filenames = tf.placeholder(tf.string, shape = [None], name = 'dataset_filenames')

dataset = tf.data.TFRecordDataset(dataset_filenames)
dataset = dataset.map(parse_example,
                      num_parallel_calls = hp.pipeline_num_parallel_calls)
# NOTE(review): the shuffle is part of the one shared pipeline, so it applies to every file
# that is fed in, not only to training data.
dataset = dataset.shuffle(hp.pipeline_shuffle_size)
# NOTE(review): prefetch sits before batch, so the prefetch size presumably counts single
# examples rather than batches — confirm this ordering is intended.
dataset = dataset.prefetch(hp.pipeline_prefetch_size)
dataset = dataset.batch(hp.pipeline_batch_size)

# An initializable iterator is used because dataset_filenames has to be fed at init time.
dataset_iterator = dataset.make_initializable_iterator()

input_page_id_iter, input_para_id_iter, input_tf_vector_iter = dataset_iterator.get_next()

# Model inputs take the iterator output by default; wrapping them in placeholder_with_default
# gives them stable names and leaves room to feed values directly instead.
input_page_id = tf.placeholder_with_default(input_page_id_iter, [None, 1], name = 'input_page_id')
input_para_id = tf.placeholder_with_default(input_para_id_iter, [None, 1], name = 'input_para_id')
input_tf_vector = tf.placeholder_with_default(input_tf_vector_iter, 
                                              [None, hp.vocab_size],
                                              name = 'input_tf_vector')
# number of examples in the current batch (size of the leading dimension)
input_tf_vector_count = tf.shape(input_tf_vector)[0]

# Divide each row by the sum of its entries; the 1e-8 keeps an all-zero row from dividing by zero.
input_tf_vector_norm = tf.reduce_sum(input_tf_vector, axis = -1,  keep_dims = True)
input_tf_vector_normalized = input_tf_vector / (input_tf_vector_norm + 1e-8)

# Model
# -----

def layer_dense_with_norm(x, num_units, scope, reuse=None, epsilon=1e-6):
    """Apply a fully connected ReLU layer whose variables are named after `scope`.

    Despite the name, no normalization is performed here; the function is a thin
    wrapper around tf.layers.dense.

    Args:
        x: input tensor.
        num_units: output width of the layer.
        scope: name given to the dense layer (prefix of its kernel/bias variables).
        reuse: forwarded to tf.layers.dense so an existing layer of the same name can
            be shared. The default of None keeps the previous behaviour.
        epsilon: currently unused; kept so existing call sites stay valid.

    Returns:
        The activated output tensor of the dense layer.
    """
    # `reuse` used to be accepted but silently dropped, which made sharing weights impossible.
    return tf.layers.dense(x, num_units, activation = tf.nn.relu, name = scope, reuse = reuse)

# Autoencoder stack: normalized term vector -> hidden -> embedding -> hidden -> vocabulary-sized output.
layer = input_tf_vector_normalized

layer = layer_dense_with_norm(layer, hp.hidden_size, 'input_hidden_layer')
# Bottleneck layer. Its ReLU output is later looked up by name ('input_embedding_layer/Relu')
# when embeddings are extracted, so the layer name must not change.
layer = layer_dense_with_norm(layer, hp.embedding_size, 'input_embedding_layer')
layer = layer_dense_with_norm(layer, hp.hidden_size, 'output_hidden_layer')

# The final projection back to vocab_size is a plain dense layer with no activation.
with tf.variable_scope('output_layer'):
    output_tf_vector_normalized = tf.layers.dense(layer,hp.vocab_size)

# Loss
# ----

# Reduction.NONE leaves the squared errors unreduced; the reductions are done explicitly below.
indiv_loss = tf.losses.mean_squared_error(input_tf_vector_normalized,
                                          output_tf_vector_normalized,
                                          reduction = tf.losses.Reduction.NONE)
# sum of the unreduced loss over the whole batch; evaluate_dataset accumulates this value
total_loss = tf.reduce_sum(indiv_loss, name = 'total_loss')
# per-element average: total divided by (examples in batch * vocab_size); this is what gets minimized
mean_loss = tf.div(total_loss, 
                   (tf.cast(input_tf_vector_count, tf.float32) * hp.vocab_size),
                   name = 'mean_loss')

# Optimization
# ------------

# Adam update on the per-element mean loss.
optimizer = tf.train.AdamOptimizer(learning_rate = hp.learning_rate)
train_op = optimizer.minimize(mean_loss)
# Disabled alternative: the same update, but with gradients clipped to a global norm of 5.0.
# gradients, variables = zip(*optimizer.compute_gradients(mean_loss))
# gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
# train_op = optimizer.apply_gradients(zip(gradients, variables))

# Stats
# -----

# Print the number of scalars held by each trainable variable, followed by the grand total.
parameter_counts = []
for variable in tf.trainable_variables():
    # multiply the sizes of all dimensions together (each `dim` is a tf.Dimension)
    count = 1
    for dim in variable.get_shape():
        count = count * dim.value
    print('parameters for "%s": %d' % (variable.name, count))
    parameter_counts.append(count)

total_parameters = sum(parameter_counts)
print('total parameters: %d' % total_parameters)

parameters for "input_hidden_layer/kernel:0": 15360000
parameters for "input_hidden_layer/bias:0": 512
parameters for "input_embedding_layer/kernel:0": 65536
parameters for "input_embedding_layer/bias:0": 128
parameters for "output_hidden_layer/kernel:0": 65536
parameters for "output_hidden_layer/bias:0": 512
parameters for "output_layer/dense/kernel:0": 15360000
parameters for "output_layer/dense/bias:0": 30000
total parameters: 30882224


In [6]:
# Give every global variable in the graph its initial value before the first training step.
sess.run(tf.global_variables_initializer())

In [7]:
def evaluate_dataset(dataset_filename,
                     header = 'results',
                     train = False,
                     show_progress = True):
    """Run one full pass over a TFRecord file and print the accumulated loss.

    The dataset iterator is re-initialized on `dataset_filename`, then batches are
    pulled until the iterator is exhausted. For every batch the fetched `total_loss`
    and `input_tf_vector_count` are accumulated; the printed figure is their ratio,
    followed by the two raw sums.

    Args:
        dataset_filename: path of the TFRecord file to read.
        header: label printed in front of the result line.
        train: when True, `train_op` is run on every batch as well; otherwise an
            empty fetch is used so no update is performed.
        show_progress: when True, display a tqdm notebook progress bar.

    Returns:
        None; the result is printed.
    """
    cum_loss = 0
    cum_count = 0

    sess.run(dataset_iterator.initializer, feed_dict={
        dataset_filenames: [dataset_filename]
    })

    # fetching an empty list is a no-op, which keeps the unpacking below uniform
    fetches = (train_op if train else [], total_loss, input_tf_vector_count)

    progress = tqdm_notebook() if show_progress else None
    try:
        while True:
            try:
                _, curr_loss, curr_count = sess.run(fetches)
            except tf.errors.OutOfRangeError:
                # iterator exhausted: the pass is complete
                break

            if progress is not None:
                progress.update(curr_count)

            cum_loss += curr_loss
            cum_count += curr_count
    finally:
        # close the bar even when a step fails, so no stale widget is left behind
        if progress is not None:
            progress.close()

    # an empty file yields no batches; report nan instead of dividing by zero
    mean = cum_loss / cum_count if cum_count else float('nan')
    print('%s: loss=%g (%g/%d)' % (header, mean, cum_loss, cum_count))

In [8]:
for epoch in range(50):
    # each epoch: one training pass with a progress bar, then a silent pass over the dev split
    for split_path, header_template, is_training, with_progress in (
            ('../data/simplewiki/simplewiki-20171103.topic_model.30k.train.tfrecords', 'train %d', True, True),
            ('../data/simplewiki/simplewiki-20171103.topic_model.30k.dev.tfrecords', 'dev   %d', False, False)):
        evaluate_dataset(split_path,
                         header = header_template % epoch,
                         train = is_training,
                         show_progress = with_progress)


train 0: loss=0.0250402 (2152.13/85947)
dev   0: loss=0.0196415 (392.831/20000)



train 1: loss=0.0181709 (1561.74/85947)
dev   1: loss=0.0165593 (331.185/20000)



train 2: loss=0.01587 (1363.98/85947)
dev   2: loss=0.0152249 (304.499/20000)



train 3: loss=0.0149247 (1282.73/85947)
dev   3: loss=0.0145011 (290.023/20000)



train 4: loss=0.0142829 (1227.58/85947)
dev   4: loss=0.0139927 (279.855/20000)



train 5: loss=0.0138241 (1188.14/85947)
dev   5: loss=0.0136265 (272.529/20000)



train 6: loss=0.0134551 (1156.42/85947)
dev   6: loss=0.0132969 (265.937/20000)



train 7: loss=0.0131189 (1127.53/85947)
dev   7: loss=0.0130015 (260.03/20000)



train 8: loss=0.0128325 (1102.91/85947)
dev   8: loss=0.0127507 (255.014/20000)



train 9: loss=0.0125981 (1082.77/85947)
dev   9: loss=0.0125443 (250.887/20000)



train 10: loss=0.0124071 (1066.35/85947)
dev   10: loss=0.0123749 (247.499/20000)



train 11: loss=0.0122469 (1052.59/85947)
dev   11: loss=0.0122336 (244.673/20000)



train 12: loss=0.0121064 (1040.51/85947)
dev   12: loss=0.0121087 (242.174/20000)



train 13: loss=0.0119828 (1029.89/85947)
dev   13: loss=0.011996 (239.921/20000)



train 14: loss=0.0118759 (1020.7/85947)
dev   14: loss=0.0119014 (238.028/20000)



train 15: loss=0.0117847 (1012.86/85947)
dev   15: loss=0.0118212 (236.424/20000)



train 16: loss=0.0117068 (1006.16/85947)
dev   16: loss=0.0117492 (234.983/20000)



train 17: loss=0.0116385 (1000.3/85947)
dev   17: loss=0.0116842 (233.684/20000)



train 18: loss=0.0115766 (994.974/85947)
dev   18: loss=0.0116256 (232.512/20000)



train 19: loss=0.0115192 (990.038/85947)
dev   19: loss=0.011574 (231.48/20000)



train 20: loss=0.0114661 (985.474/85947)
dev   20: loss=0.0115282 (230.564/20000)



train 21: loss=0.011417 (981.258/85947)
dev   21: loss=0.0114877 (229.753/20000)



train 22: loss=0.0113713 (977.327/85947)
dev   22: loss=0.0114505 (229.01/20000)



train 23: loss=0.0113281 (973.617/85947)
dev   23: loss=0.0114155 (228.311/20000)



train 24: loss=0.0112862 (970.011/85947)
dev   24: loss=0.0113801 (227.602/20000)



train 25: loss=0.011244 (966.39/85947)
dev   25: loss=0.0113464 (226.929/20000)



train 26: loss=0.011202 (962.778/85947)
dev   26: loss=0.0113145 (226.29/20000)



train 27: loss=0.0111605 (959.211/85947)
dev   27: loss=0.0112828 (225.656/20000)



train 28: loss=0.01112 (955.734/85947)
dev   28: loss=0.0112489 (224.978/20000)



train 29: loss=0.0110799 (952.286/85947)
dev   29: loss=0.0112151 (224.302/20000)



train 30: loss=0.0110402 (948.875/85947)
dev   30: loss=0.0111798 (223.597/20000)



train 31: loss=0.0110012 (945.52/85947)
dev   31: loss=0.0111428 (222.857/20000)



train 32: loss=0.0109638 (942.309/85947)
dev   32: loss=0.0111106 (222.212/20000)



train 33: loss=0.0109269 (939.135/85947)
dev   33: loss=0.0110752 (221.504/20000)



train 34: loss=0.0108904 (935.999/85947)
dev   34: loss=0.0110418 (220.837/20000)



train 35: loss=0.0108552 (932.968/85947)
dev   35: loss=0.0110095 (220.19/20000)



train 36: loss=0.010821 (930.035/85947)
dev   36: loss=0.0109789 (219.579/20000)



train 37: loss=0.0107884 (927.234/85947)
dev   37: loss=0.0109488 (218.976/20000)



train 38: loss=0.0107583 (924.641/85947)
dev   38: loss=0.0109227 (218.454/20000)



train 39: loss=0.0107304 (922.245/85947)
dev   39: loss=0.0108971 (217.942/20000)



train 40: loss=0.0107044 (920.014/85947)
dev   40: loss=0.010873 (217.46/20000)



train 41: loss=0.0106802 (917.927/85947)
dev   41: loss=0.0108509 (217.018/20000)



train 42: loss=0.0106581 (916.031/85947)
dev   42: loss=0.010828 (216.56/20000)



train 43: loss=0.0106371 (914.229/85947)
dev   43: loss=0.0108059 (216.117/20000)



train 44: loss=0.0106162 (912.432/85947)
dev   44: loss=0.0107862 (215.723/20000)



train 45: loss=0.0105957 (910.671/85947)
dev   45: loss=0.0107671 (215.341/20000)



train 46: loss=0.010576 (908.979/85947)
dev   46: loss=0.0107489 (214.978/20000)



train 47: loss=0.0105567 (907.317/85947)
dev   47: loss=0.0107332 (214.665/20000)



train 48: loss=0.0105383 (905.735/85947)
dev   48: loss=0.0107166 (214.333/20000)



train 49: loss=0.0105216 (904.304/85947)
dev   49: loss=0.0107015 (214.029/20000)


In [9]:
# Export the current graph and variable values as a SavedModel tagged 'training'.
export_dir = '../models/simplewiki/topic_model_1_128'
builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
builder.add_meta_graph_and_variables(sess, tags = ['training'])
builder.save()

INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b'../models/simplewiki/topic_model_1_128/saved_model.pb'


b'../models/simplewiki/topic_model_1_128/saved_model.pb'

In [6]:
# sess = reset_tf()
# tf.saved_model.loader.load(sess, ['training'], '../models/simplewiki/topic_model_1_256')
# print('loaded')

INFO:tensorflow:Restoring parameters from b'../models/simplewiki/topic_model_1_256/variables/variables'
loaded


In [7]:
# dataset_filenames = tf.get_default_graph().get_operation_by_name('dataset_filenames').outputs[0]
# embedding_layer = tf.get_default_graph().get_operation_by_name('input_embedding_layer/Relu').outputs[0]
# input_page_id = tf.get_default_graph().get_operation_by_name('input_page_id').outputs[0]
# input_para_id = tf.get_default_graph().get_operation_by_name('input_para_id').outputs[0]
# make_iterator = tf.get_default_graph().get_operation_by_name('MakeIterator')

In [10]:
# Handles needed for extraction: the op that (re)initializes the dataset iterator, and the
# activation tensor of the embedding layer, located by the name of the op that produces it.
make_iterator = dataset_iterator.initializer
embedding_relu_op = tf.get_default_graph().get_operation_by_name('input_embedding_layer/Relu')
embedding_layer = embedding_relu_op.outputs[0]

In [14]:
def extract_embeddings(dataset_filename):
    """Compute the embedding-layer activations for every example in a TFRecord file.

    The dataset iterator is re-initialized on `dataset_filename` and drained batch by
    batch; a tqdm notebook progress bar tracks the number of examples processed.

    Args:
        dataset_filename: path of the TFRecord file to read.

    Returns:
        A list with one (page_id, embedding) tuple per example, in whatever order the
        pipeline yields them. page_id is a Python int and embedding a list of floats.
    """
    sess.run(make_iterator, feed_dict={
        dataset_filenames: [dataset_filename]
    })

    result = []
    progress = tqdm_notebook()
    try:
        while True:
            try:
                # only the page id and the embedding are needed; the paragraph id was
                # previously fetched on every step and then thrown away
                page_ids, embeddings = sess.run((input_page_id, embedding_layer))
            except tf.errors.OutOfRangeError:
                # iterator exhausted: every example has been seen
                break

            # each id row holds a single value, hence the [0]
            for id_row, embedding_row in zip(page_ids, embeddings):
                result.append((int(id_row[0]), embedding_row.tolist()))

            progress.update(len(page_ids))
    finally:
        # close the bar even when a step fails, so no stale widget is left behind
        progress.close()

    return result

In [15]:
# Gather the embeddings of all three splits, then order the (page_id, embedding) pairs by page id.
all_embeddings = []

for split_path in ('../data/simplewiki/simplewiki-20171103.topic_model.30k.dev.tfrecords',
                   '../data/simplewiki/simplewiki-20171103.topic_model.30k.test.tfrecords',
                   '../data/simplewiki/simplewiki-20171103.topic_model.30k.train.tfrecords'):
    all_embeddings.extend(extract_embeddings(split_path))

all_embeddings.sort()










In [21]:
# Sanity check: the page ids must be exactly 0, 1, 2, ... in list order, so that a position
# in the list can stand in for the page id.
page_ids = [page_id for page_id, _ in all_embeddings]
assert page_ids == list(range(len(page_ids)))

In [27]:
# Drop the page ids and stack the embedding lists into a single array.
# NOTE(review): this rebinds all_embeddings from a list of pairs to an array, so the cell is
# not safe to re-run on its own.
all_embeddings = np.array([embedding for _page_id, embedding in all_embeddings])

In [30]:
# Write the array in .npy format straight into a gzip-compressed file.
output_path = '../data/simplewiki/simplewiki-20171103.topic_model_1_128.embedding.npy.gz'
with gzip.open(output_path, 'wb') as gz_file:
    np.save(gz_file, all_embeddings)