In [1]:
import tensorflow as tf
import numpy as np
import math
import gzip
import json
import datetime
import shutil
from tqdm import tqdm

In [2]:
def reset_tf():
    """Throw away the current graph and session and start over.

    Closes the module-level ``sess`` when one exists, resets the default
    graph, sets the graph-level random seed to 0 and rebinds the global
    ``sess`` to a new ``tf.InteractiveSession``.
    """
    global sess
    # On a fresh kernel `sess` may not exist yet; look it up defensively so
    # this function can be the very first TensorFlow call in a notebook.
    previous_session = globals().get('sess')
    if previous_session is not None:
        previous_session.close()
    tf.reset_default_graph()
    # The seed is set after the reset so that it applies to the new default graph.
    tf.set_random_seed(0)
    sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))

In [3]:
# Initial interactive session, bound to the notebook-global `sess`.
# reset_tf() closes this session and rebinds `sess` to a new one.
# log_device_placement=True asks TensorFlow to log which device each op runs on.
sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))

In [4]:
class HyperParameters:
    """Model and input-pipeline settings, kept as plain class attributes."""

    # --- model ---------------------------------------------------------

    # longest input sequence, counted in symbols
    max_sequence_length = 40

    # vocabulary size; symbol ids are expected to be in range(vocab_size)
    vocab_size = 10000

    # dimensionality of the input embeddings
    embedding_size = 128

    # how many target classes are predicted
    num_target_classes = 2

    # how many combined (attention + feed forward) layers are stacked
    num_layers = 1

    # dropout rate
    dropout_rate = 0.1

    # --- batching and data pipeline -------------------------------------

    # sequences per batch
    batch_size = 256

    # parsing threads used by the data pipeline
    dataset_pipeline_parallel_calls = 4

    # prefetch size used by the data pipeline
    dataset_pipeline_prefetch = 16 * batch_size

    # size of the shuffle buffer
    dataset_pipeline_shuffle_buffer_size = 10000


hp = HyperParameters()

In [5]:
def parse_example(example_proto, max_sequence_length=hp.max_sequence_length):
    """Parse one serialized example into fixed-length tensors.

    Args:
        example_proto: serialized example carrying the variable-length int64
            features 'inputs', 'word_endings' and 'targets'.
        max_sequence_length: length every returned sequence is truncated or
            zero-padded to.

    Returns:
        Tuple ``(inputs, input_length, word_endings, targets)``. The three
        sequences have exactly ``max_sequence_length`` elements;
        ``input_length`` is the number of input symbols that survived
        truncation, so it never exceeds ``max_sequence_length``.
    """
    features = {
        'inputs': tf.VarLenFeature(tf.int64),
        'word_endings': tf.VarLenFeature(tf.int64),
        'targets': tf.VarLenFeature(tf.int64)
    }

    parsed = tf.parse_single_example(example_proto, features)

    def convert_and_pad(sparse_tensor):
        result = tf.sparse_tensor_to_dense(sparse_tensor)
        # TODO: properly ignore elements which are too large (right now we just clip)
        result = result[:max_sequence_length]
        # right-pad with zeros up to the fixed length
        result = tf.pad(result, [[0, max_sequence_length - tf.shape(result)[0]]])
        return result

    # The sequences above are truncated, so the reported length has to be
    # truncated as well. Otherwise over-long examples would report more
    # symbols than are actually present and skew any per-symbol average
    # computed from the lengths.
    input_length = tf.minimum(tf.shape(parsed['inputs'])[0], max_sequence_length)

    return (convert_and_pad(parsed['inputs']),
            input_length,
            convert_and_pad(parsed['word_endings']),
            convert_and_pad(parsed['targets']))

In [6]:
# Start from an empty default graph and a new session so this cell can be re-run.
reset_tf()

# Data pipeline
# -------------

# TFRecord file names are fed when the iterator is initialized, so the same
# pipeline can be pointed at different files.
dataset_filenames = tf.placeholder(tf.string, shape=[None])

dataset = tf.data.TFRecordDataset(dataset_filenames)
dataset = dataset.map(parse_example, 
                      num_parallel_calls = hp.dataset_pipeline_parallel_calls)
dataset = dataset.shuffle(hp.dataset_pipeline_shuffle_buffer_size)
# prefetch comes before batch, so its buffer size is counted in single examples
dataset = dataset.prefetch(hp.dataset_pipeline_prefetch)
dataset = dataset.batch(hp.batch_size)

# Initializable iterator: running `dataset_iterator.initializer` (with
# `dataset_filenames` fed) starts a new pass over the data.
dataset_iterator = dataset.make_initializable_iterator()
# The four tensors come in the order returned by parse_example.
(input_sequences_it,
 input_lengths_it,
 input_word_endings_it,
 target_sequences_it) = dataset_iterator.get_next()

# Placeholders
# ------------

# Each model input defaults to the matching iterator output but can be
# overridden through feed_dict.
input_sequences = tf.placeholder_with_default(input_sequences_it,
                                              shape = [None, hp.max_sequence_length],
                                              name = 'input_sequences')
input_lengths = tf.placeholder_with_default(input_lengths_it,
                                            shape = [None],
                                            name = 'input_lengths')
input_word_endings = tf.placeholder_with_default(input_word_endings_it,
                                                 shape = [None, hp.max_sequence_length],
                                                 name = 'input_word_endings')
target_sequences = tf.placeholder_with_default(target_sequences_it,
                                               shape = [None, hp.max_sequence_length],
                                               name = 'target_sequences')

# sequences of input positions (not a placeholder):
# the range 0 .. max_sequence_length-1 is repeated once per batch row and
# reshaped to (batch, hp.max_sequence_length)
input_positions = tf.range(hp.max_sequence_length, dtype=tf.int32)
input_positions = tf.tile(input_positions, [tf.shape(input_sequences)[0]])
input_positions = tf.reshape(input_positions, 
                             (tf.shape(input_sequences)[0], hp.max_sequence_length), 
                             name = 'input_positions')

# boolean switch passed as `training` to every dropout layer; has no default,
# so it must be fed whenever a dropout-dependent tensor is evaluated
is_training = tf.placeholder(tf.bool)

# Embeddings
# ----------
# Three lookup tables (symbol, position, word ending), all hp.embedding_size wide.

# sequences of input embeddings w/ shape:
#   (batch, hp.max_sequence_length, hp.embedding_size)
input_sequence_embeddings = tf.get_variable('input_sequence_embeddings', 
                                            (hp.vocab_size, hp.embedding_size))
input_sequences_embedded = tf.nn.embedding_lookup(input_sequence_embeddings, 
                                                  input_sequences,
                                                  name = 'input_sequences_embedded')

# sequences of input position embeddings w/ shape:
#   (batch, hp.max_sequence_length, hp.embedding_size)
input_position_embeddings = tf.get_variable('input_position_embeddings', 
                                            (hp.max_sequence_length, hp.embedding_size))
input_positions_embedded = tf.nn.embedding_lookup(input_position_embeddings, input_positions)

# sequences of word ending embeddings w/ shape:
#   (batch, hp.max_sequence_length, hp.embedding_size)
# The table has only 2 rows, so `input_word_endings` is expected to hold 0/1 flags.
input_word_ending_embeddings = tf.get_variable('input_word_ending_embeddings',
                                               (2, hp.embedding_size))
input_word_endings_embedded = tf.nn.embedding_lookup(input_word_ending_embeddings, 
                                                     input_word_endings,
                                                     name = 'input_word_endings_embedded')

# Sequence mask
# -------------

# (batch, hp.max_sequence_length) float mask: 1.0 for the first
# `input_lengths[i]` positions of row i, 0.0 for the rest
sequence_mask = tf.sequence_mask(input_lengths,
                                 hp.max_sequence_length,
                                 dtype = tf.float32)
# expand dimensions to support broadcasting
# (adds a trailing axis of size 1 so the mask broadcasts over the embedding axis)
# NOTE(review): the op is named 'sequence', not 'expanded_sequence_mask' -- confirm
# whether anything looks this tensor up by name before renaming it.
expanded_sequence_mask = tf.expand_dims(sequence_mask, 
                                        2, 
                                        name = 'sequence')

# The model input is the element-wise sum of the three embeddings.
input_combined_embedded = tf.add_n([input_sequences_embedded, 
                                    input_positions_embedded, 
                                    input_word_endings_embedded])
# TODO: is this necessary?
# (zeroes the embeddings at padded positions)
input_combined_embedded = tf.multiply(input_combined_embedded,
                                      expanded_sequence_mask)
input_combined_embedded = tf.layers.dropout(input_combined_embedded,
                                            rate = hp.dropout_rate,
                                            training = is_training)

# Layer normalization
# -------------------

def layer_norm(x, num_units, scope, reuse=None, epsilon=1e-6):
    """Layer-normalize `x` with a learned per-unit scale and bias.

    The two variables ("layer_norm_scale", initialized to ones, and
    "layer_norm_bias", initialized to zeros) are created under `scope`.

    Args:
        x: tensor whose last axis has `num_units` entries.
        num_units: size of the scale and bias vectors.
        scope: variable scope that owns the two variables.
        reuse: forwarded to `tf.variable_scope`.
        epsilon: added to the variance before the reciprocal square root.

    Returns:
        The normalized tensor produced by `layer_norm_compute`.
    """
    with tf.variable_scope(scope, reuse=reuse):
        gain = tf.get_variable("layer_norm_scale",
                               [num_units],
                               initializer=tf.ones_initializer())
        offset = tf.get_variable("layer_norm_bias",
                                 [num_units],
                                 initializer=tf.zeros_initializer())
        return layer_norm_compute(x, epsilon, gain, offset)

def layer_norm_compute(x, epsilon, scale, bias):
    """Normalize `x` over its last axis, then apply `scale` and `bias`.

    Mean and variance are taken over the last axis only (kept as a size-1
    axis so they broadcast back against `x`).
    """
    # TODO: incorporate length into layer normalization?
    last_axis = [-1]
    mean = tf.reduce_mean(x, axis=last_axis, keep_dims=True)
    squared_deviation = tf.square(x - mean)
    variance = tf.reduce_mean(squared_deviation, axis=last_axis, keep_dims=True)
    # The difference is deliberately rebuilt here instead of being shared with
    # the variance computation, so the graph contains the same ops as before.
    centered = x - mean
    inverse_stddev = tf.rsqrt(variance + epsilon)
    normalized = centered * inverse_stddev
    return normalized * scale + bias
    
# Attention
# ---------

def attention_layer(A, dropout_rate, is_training, mask=None):
    """Unparameterized scaled dot-product self-attention.

    `A` acts as queries, keys and values at once; no projections are learned.

    Args:
        A: float tensor of rank 3, (batch, positions, units).
        dropout_rate: dropout rate applied to the attention output.
        is_training: boolean tensor enabling dropout.
        mask: optional float tensor of shape (batch, positions, 1) holding
            1.0 at real positions and 0.0 at padding. When given, padded
            positions are excluded as attention keys. When omitted, every
            position is attended to.

    Returns:
        Tensor with the same shape as `A`.
    """
    A_T = tf.transpose(A, perm=[0, 2, 1])
    scaled_logits = tf.matmul(A, A_T) / tf.sqrt(tf.cast(tf.shape(A)[-1], tf.float32))
    if mask is not None:
        # (batch, positions, 1) -> (batch, 1, positions): broadcast over queries,
        # select along the key axis.
        key_mask = tf.transpose(mask, perm=[0, 2, 1])
        # Padded keys are zero vectors, so their raw logit is 0 rather than
        # "very negative" and they would otherwise receive a share of the
        # softmax weight. Push their logits far down so the weight is ~0.
        scaled_logits += (1.0 - key_mask) * -1e9
    result = tf.matmul(tf.nn.softmax(scaled_logits), A)
    result = tf.layers.dropout(result,
                               rate = dropout_rate,
                               training = is_training)
    return result

# Feed-forward
# ------------

def feed_forward_layer(A, num_units, dropout_rate, is_training, scope, reuse=None):
    """Position-wise feed-forward block: dense + ReLU, dense, dropout.

    Both dense layers ('fc1', 'fc2') have `num_units` outputs and live under `scope`.
    """
    with tf.variable_scope(scope, reuse=reuse):
        A = tf.layers.dense(A, num_units, activation=tf.nn.relu, name='fc1')
        A = tf.layers.dense(A, num_units, name='fc2')
        A = tf.layers.dropout(A, rate = dropout_rate, training = is_training)
        return A

# Layers
# ------

def combined_layer(A, num_units, mask, dropout_rate, is_training, scope, reuse=None):
    """One attention + feed-forward layer with residual connections.

    Each sub-layer output is added to its input and layer-normalized; the
    final result is multiplied by `mask` so padded positions are zero again.

    Args:
        A: float tensor (batch, positions, num_units).
        num_units: size of the last axis of `A`.
        mask: float tensor (batch, positions, 1), 1.0 at real positions and
            0.0 at padding; also used to hide padded keys from attention.
        dropout_rate: dropout rate for both sub-layers.
        is_training: boolean tensor enabling dropout.
        scope: variable scope for this layer's variables.
        reuse: forwarded to `tf.variable_scope`.
    """
    with tf.variable_scope(scope, reuse=reuse):
        attended = attention_layer(A, dropout_rate, is_training, mask=mask)
        A = layer_norm(A + attended, num_units, scope='attention_norm')
        A = layer_norm(A + feed_forward_layer(A, num_units, dropout_rate, is_training, 'ff'), num_units, scope='ff_norm')
        A *= mask
        return A
    
# Stack hp.num_layers combined layers on top of the embedded input; each layer
# gets its own variable scope 'layer_<i>'. `layer` ends up holding the output
# of the last layer.
layer = input_combined_embedded
for i in range(hp.num_layers):
    layer = combined_layer(layer, 
                           hp.embedding_size, 
                           expanded_sequence_mask, 
                           hp.dropout_rate,
                           is_training,
                           'layer_%d' % i)

# Softmax
# -------

# Per-position class logits, shape (batch, hp.max_sequence_length, hp.num_target_classes).
output_logits = tf.layers.dense(layer, hp.num_target_classes, name='softmax')

# Predicted class per position. The argmax is taken on the logits directly:
# softmax is monotonic, so applying it first cannot change which class wins.
output_sequences = tf.argmax(output_logits, axis=-1)
# Force the prediction at padded positions to class 0.
output_sequences *= tf.cast(sequence_mask, tf.int64)

# Loss
# ----

# Per-position cross entropy, zeroed at padded positions.
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_sequences,
                                                        logits=output_logits)
losses *= sequence_mask

total_loss = tf.reduce_sum(losses)
# Count only positions that can contribute to the loss: the mask above covers
# at most hp.max_sequence_length positions per row, so a longer reported length
# would inflate the count and deflate the mean.
total_input_length = tf.reduce_sum(tf.minimum(input_lengths, hp.max_sequence_length))
# Guard the denominator so a batch without any symbols gives 0 instead of NaN.
mean_loss  = total_loss / tf.cast(tf.maximum(total_input_length, 1), tf.float32)

# Confusion counts for class 1. These formulas treat predictions and targets as
# 0/1 values, which matches hp.num_target_classes == 2.
true_positives = tf.reduce_sum(output_sequences * target_sequences)
false_positives = tf.reduce_sum(tf.maximum(output_sequences - target_sequences, 0))
false_negatives = tf.reduce_sum(tf.maximum(target_sequences - output_sequences, 0))

# Training
# --------

# Non-trainable step counter; minimize() increments it once per training step.
global_step = tf.Variable(0, name='global_step', trainable=False)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
# The per-symbol mean loss is what gets optimized.
train_op = optimizer.minimize(mean_loss, global_step=global_step)

# Summaries
# ---------

tf.summary.scalar('mean_loss', mean_loss)

# Single op that evaluates every summary registered in the graph.
merged_summaries = tf.summary.merge_all()

In [7]:
# Print the element count of every trainable variable, then the grand total.
total_parameters = 0
for variable in tf.trainable_variables():
    # num_elements() is the product of all dimensions of the static shape
    variable_parameters = variable.get_shape().num_elements()
    total_parameters += variable_parameters
    print('parameters for "%s": %d' % (variable.name, variable_parameters))
print('total parameters: %d' % total_parameters)

parameters for "input_sequence_embeddings:0": 1280000
parameters for "input_position_embeddings:0": 5120
parameters for "input_word_ending_embeddings:0": 256
parameters for "layer_0/attention_norm/layer_norm_scale:0": 128
parameters for "layer_0/attention_norm/layer_norm_bias:0": 128
parameters for "layer_0/ff/fc1/kernel:0": 16384
parameters for "layer_0/ff/fc1/bias:0": 128
parameters for "layer_0/ff/fc2/kernel:0": 16384
parameters for "layer_0/ff/fc2/bias:0": 128
parameters for "layer_0/ff_norm/layer_norm_scale:0": 128
parameters for "layer_0/ff_norm/layer_norm_bias:0": 128
parameters for "softmax/kernel:0": 256
parameters for "softmax/bias:0": 2
total parameters: 1319170


In [8]:
# Initialize every global variable in the graph. Re-running this cell
# re-initializes the variables and so discards any training progress.
sess.run(tf.global_variables_initializer())

In [9]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    if not denominator:
        return 0.0
    return float(numerator) / float(denominator)


def evaluate_dataset(filename, header='results', train=False, show_progress=True):
    """Run one full pass over a TFRecord file and report loss and precision/recall/F1.

    Args:
        filename: path of the TFRecord file to read.
        header: label printed in front of the summary line.
        train: when True, `train_op` is run on every batch and `is_training`
            is fed True; when False the model is only evaluated.
        show_progress: when True, show a tqdm counter that advances by the
            number of input symbols in each batch.

    Returns:
        Dict with the keys 'loss' (mean loss per input symbol), 'elapsed'
        (seconds), 'precision', 'recall' and 'F1'. Ratios whose denominator is
        zero (empty dataset, no positives) are reported as 0.0.
    """
    cum_loss = 0.0
    cum_input_length = 0

    cum_true_positives = 0
    cum_false_positives = 0
    cum_false_negatives = 0

    # (Re)start the iterator on the requested file.
    sess.run(dataset_iterator.initializer, feed_dict={
        dataset_filenames: [filename]
    })

    # An empty fetch in the first slot keeps the result tuple the same shape
    # whether or not a training step is run.
    fetches = (train_op if train else [],
               total_loss,
               total_input_length,
               true_positives,
               false_positives,
               false_negatives)

    start = datetime.datetime.now()

    progress = tqdm() if show_progress else None

    try:
        while True:
            try:
                (_,
                 curr_loss,
                 curr_input_length,
                 curr_true_positives,
                 curr_false_positives,
                 curr_false_negatives) = sess.run(fetches,
                                                  feed_dict = { is_training: train })
            except tf.errors.OutOfRangeError:
                # the iterator is exhausted: the pass is complete
                break

            if progress is not None:
                progress.update(curr_input_length)

            cum_loss += curr_loss
            cum_input_length += curr_input_length
            cum_true_positives += curr_true_positives
            cum_false_positives += curr_false_positives
            cum_false_negatives += curr_false_negatives
    finally:
        # Close the bar even when the pass is interrupted (e.g. KeyboardInterrupt),
        # so a stale progress bar is not left behind in the notebook.
        if progress is not None:
            progress.close()

    finish = datetime.datetime.now()
    elapsed = (finish - start).total_seconds()

    mean_loss_per_symbol = _safe_ratio(cum_loss, cum_input_length)
    precision = _safe_ratio(cum_true_positives, cum_true_positives + cum_false_positives)
    recall = _safe_ratio(cum_true_positives, cum_true_positives + cum_false_negatives)
    F1 = _safe_ratio(2 * (precision * recall), precision + recall)

    print('%s: loss=%g, elapsed=%gs, precision=%g, recall=%g, F1=%g' % (header,
                                                                        mean_loss_per_symbol,
                                                                        elapsed,
                                                                        precision,
                                                                        recall,
                                                                        F1))

    return {
        'loss': mean_loss_per_symbol,
        'elapsed': elapsed,
        'precision': precision,
        'recall': recall,
        'F1': F1,
    }

In [10]:
# Model export is currently disabled; the builder lines are kept for reference.
# builder = tf.saved_model.builder.SavedModelBuilder('../models/simplewiki/attention_1_layer')
# builder.add_meta_graph_and_variables(sess, ['training'])

num_epochs = 100

# Each epoch: one training pass over the train split (with progress bar),
# followed by an evaluation-only pass over the dev split.
for epoch in range(num_epochs):
    result = evaluate_dataset('../data/simplewiki/simplewiki-20171103.entity_recognition.train.tfrecords',
                              header='train %d' % epoch,
                              train=True,
                              show_progress=True)
    result = evaluate_dataset('../data/simplewiki/simplewiki-20171103.entity_recognition.dev.tfrecords',
                              header='dev %d' % epoch,
                              train=False,
                              show_progress=False)
    #builder.save()

1856792it [00:05, 395129.18it/s]

KeyboardInterrupt: 