In [1]:
import tensorflow as tf
import numpy as np
import datetime
import json
import gzip
import matplotlib.pyplot as plt
import re
import os
import shutil
from tqdm import tqdm_notebook

In [2]:
# No TensorFlow session exists yet. A later cell passes this name to reset_tf(),
# which closes the old session when one is set and returns its replacement.
sess = None

In [3]:
def reset_tf(sess = None, log_device_placement = False):
    """Discard the current TensorFlow state and hand back a fresh session.

    Args:
        sess: previously created session; it is closed first when truthy.
        log_device_placement: forwarded unchanged to ``tf.ConfigProto``.

    Returns:
        A new ``tf.InteractiveSession``, created after the default graph has
        been reset and the graph-level random seed has been set to 0.
    """
    if sess:
        sess.close()

    # Order matters: replace the graph, seed it, then create a session on it.
    tf.reset_default_graph()
    tf.set_random_seed(0)

    session_config = tf.ConfigProto(log_device_placement = log_device_placement)
    return tf.InteractiveSession(config = session_config)

def dump_statistics():
    """Print the parameter count of each trainable variable, then the grand total.

    One line is printed per entry of ``tf.trainable_variables()`` (its name and
    the product of its shape dimensions), followed by a final line with the
    sum over all variables.
    """
    def _num_elements(shape):
        # `shape` iterates over tf.Dimension objects; multiply their sizes.
        count = 1
        for axis in shape:
            count *= axis.value
        return count

    grand_total = 0
    for var in tf.trainable_variables():
        size = _num_elements(var.get_shape())
        print('parameters for "%s": %d' % (var.name, size))
        grand_total += size
    print('total parameters: %d' % grand_total)

In [4]:
class HyperParameters:
    """Settings read by EntityRecognitionModel as plain class attributes."""

    # --- optimisation ---
    learning_rate = 1e-3            # learning rate given to the Adam optimizer
    gradient_clip_norm = 5.0        # threshold for global-norm gradient clipping
    dropout_rate = 0.1              # rate passed to every dropout layer

    # --- vocabulary ---
    vocab_size = 30000              # number of rows in the word-embedding table
    num_targets = 2000              # NOTE(review): not read by the model code in this notebook

    # --- model dimensions ---
    context_size = 81               # positions per example after cropping
    d_embedding_position = 32       # width of the position embedding
    d_embedding_word = 128          # width of the word embedding
    d_attention = 128               # width of the attention layers
    d_attention_ff = 512            # hidden width of the attention feed-forward block
    attention_num_layers = 4        # number of stacked self-attention layers

    # --- input pipeline ---
    dataset_context_size = 121      # positions per example as stored in the records
    dataset_batch_size = 256        # default minibatch size
    dataset_num_parallel_calls = 4  # parallelism of the record-parsing map
    dataset_prefetch_size = 4096    # default prefetch buffer size
    dataset_shuffle_size = 4096     # shuffle buffer size; NOTE(review): confirm the pipeline reads it

In [5]:
class EntityRecognitionModel:
    """Per-position binary classifier built from stacked self-attention layers (TF1 graph mode).

    The graph reads GZIP-compressed TFRecord files, crops every example to
    ``hp.context_size`` positions, embeds words and positions, applies
    ``hp.attention_num_layers`` self-attention layers and emits two logits per
    position.  A position is labelled 1 when its ``targets`` value is >= 0 and
    0 otherwise.

    Usage: construct, call ``build_model()`` once, initialise the graph's
    variables in the session, then call ``process()`` once per pass over a
    set of files.
    """

    def __init__(self, session, hp):
        """Remember the session and the hyper-parameters; no graph is built here.

        Args:
            session: session through which every ``run`` call of this class goes.
            hp: object exposing the attributes read as ``self._hp.<name>``
                (see ``HyperParameters``).
        """
        self._session = session
        self._hp = hp

    def _parse_example(self, example_proto):
        """Decode one serialized example and crop it to ``hp.context_size`` positions.

        Args:
            example_proto: scalar string tensor holding a serialized example.

        Returns:
            Tuple ``(context, targets, targets_left, targets_right)`` of int64
            tensors, all cropped with the same window.
        """
        stored_size = self._hp.dataset_context_size
        parsed = tf.parse_single_example(example_proto, features = {
            'page': tf.FixedLenFeature([1], tf.int64),
            'context': tf.FixedLenFeature([stored_size], tf.int64),
            'targets': tf.FixedLenFeature([stored_size], tf.int64),
            'targets_left': tf.FixedLenFeature([stored_size], tf.int64),
            'targets_right': tf.FixedLenFeature([stored_size], tf.int64) })

        # Deterministic crop: the offset depends only on the example's 'page'
        # value and the seed placeholder, so feeding the same seed reproduces
        # the same crops.
        # NOTE(review): the modulus is *half* of the available slack, so only
        # the first half of the possible offsets is ever used - confirm that
        # this is intended.
        offset = (self._dataset_random_seed + parsed['page'][0]) % (
            (stored_size - self._hp.context_size) // 2)
        window_end = offset + self._hp.context_size

        # apply the same window to every sequence feature
        context = parsed['context'][offset:window_end]
        targets = parsed['targets'][offset:window_end]
        targets_left = parsed['targets_left'][offset:window_end]
        targets_right = parsed['targets_right'][offset:window_end]

        return (context, targets, targets_left, targets_right)

    def _build_data_pipeline(self):
        """Create the ``tf.data`` input pipeline, its placeholders and the batch tensors.

        Sets ``_dataset_*`` placeholders, ``_dataset_iterator``, the batch
        tensors ``_context`` / ``_targets`` / ``_targets_left`` /
        ``_targets_right``, ``_minibatch_size`` and ``_context_positions``.
        """
        with tf.variable_scope('dataset'):
            # placeholders
            self._dataset_filenames = tf.placeholder(
                tf.string,
                shape = [None],
                name = 'dataset_filenames')
            # -1 makes Dataset.take() keep every record
            self._dataset_limit = tf.placeholder_with_default(
                tf.constant(-1, tf.int64),
                shape = [],
                name = 'dataset_limit')
            # Default comes from hp.dataset_shuffle_size; it previously
            # defaulted to the batch size, which left that hyper-parameter unused.
            self._dataset_shuffle_size = tf.placeholder_with_default(
                tf.constant(self._hp.dataset_shuffle_size, tf.int64),
                shape = [],
                name = 'dataset_shuffle_size')
            self._dataset_batch_size = tf.placeholder_with_default(
                tf.constant(self._hp.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_batch_size')
            self._dataset_prefetch_size = tf.placeholder_with_default(
                tf.constant(self._hp.dataset_prefetch_size, tf.int64),
                shape = [],
                name = 'dataset_prefetch_size')
            self._dataset_random_seed = tf.placeholder_with_default(
                tf.constant(0, tf.int64),
                shape = [],
                name = 'dataset_random_seed')

            # build dataset: files are read in shuffled order, then records
            # are limited, parsed/cropped, shuffled, prefetched and batched
            dataset = tf.data.TFRecordDataset(
                tf.random_shuffle(self._dataset_filenames),
                compression_type='GZIP')
            dataset = dataset.take(self._dataset_limit)
            dataset = dataset.map(
                self._parse_example,
                num_parallel_calls = self._hp.dataset_num_parallel_calls)
            dataset = dataset.shuffle(self._dataset_shuffle_size)
            dataset = dataset.prefetch(self._dataset_prefetch_size)
            dataset = dataset.batch(self._dataset_batch_size)

            # build iterator (must be initialised with the filenames before use)
            self._dataset_iterator = dataset.make_initializable_iterator()
            (context, targets, targets_left, targets_right) = self._dataset_iterator.get_next()

            # give key tensors names
            self._context = tf.identity(context, 'context')
            self._targets = tf.identity(targets, 'targets')
            self._targets_left = tf.identity(targets_left, 'targets_left')
            self._targets_right = tf.identity(targets_right, 'targets_right')

            # minibatch size (dynamic: the last batch of a pass may be smaller)
            self._minibatch_size = tf.shape(self._context)[0]
            self._minibatch_size = tf.identity(self._minibatch_size, 'minibatch_size')

            # positions: 0..context_size-1 repeated for every row of the batch
            p = tf.range(self._hp.context_size, dtype = tf.int64)
            p = tf.tile(p, [self._minibatch_size])
            p = tf.reshape(
                p,
                [self._minibatch_size, self._hp.context_size],
                name = 'context_positions')
            self._context_positions = p

    def _attention_self(self, layer):
        """Apply one self-attention layer with residual add, batch norm and dropout.

        Args:
            layer: float tensor ``[batch_size, context_size, d_attention]``.

        Returns:
            Tensor of the same shape as ``layer``.
        """
        with tf.variable_scope('self'):
            # variables: two projection matrices used to score position pairs
            kernels = tf.get_variable(
                'kernels',
                [2, self._hp.d_attention, self._hp.d_attention])

            # compute weights
            k0 = tf.tensordot(layer, kernels[0], axes = 1) # [batch_size, context_size, d_attention]
            k0.set_shape([None, self._hp.context_size, self._hp.d_attention])
            k1 = tf.tensordot(layer, kernels[1], axes = 1) # [batch_size, context_size, d_attention]
            k1.set_shape([None, self._hp.context_size, self._hp.d_attention])
            k1 = tf.transpose(k1, perm = [0, 2, 1])        # [batch_size, d_attention, context_size]
            w = tf.matmul(k0, k1)                          # [batch_size, context_size, context_size]
            # a huge negative value on the diagonal drives each position's
            # weight on itself to zero after the softmax
            mask = tf.diag([-1e20] * self._hp.context_size)
            mask = tf.expand_dims(mask, axis = 0)          # [1, context_size, context_size]
            w += mask
            w /= np.sqrt(self._hp.d_attention)
            w = tf.nn.softmax(w)

            # apply weights (residual connection; values are the unprojected inputs)
            layer += tf.matmul(w, layer)

            # batch norm
            layer = tf.layers.batch_normalization(
                layer,
                training = self._training)

            # dropout
            layer = tf.layers.dropout(
                layer,
                rate = self._hp.dropout_rate,
                training = self._training)

            return layer

    def _attention_feed_forward(self, layer):
        """Apply a two-layer position-wise feed-forward block with batch norm and dropout.

        Currently not wired into the graph: its call in ``_build_model`` is
        commented out.

        Args:
            layer: float tensor whose last dimension is the feature axis.

        Returns:
            Tensor whose last dimension is ``hp.d_attention``.
        """
        with tf.variable_scope('ff'):
            # hidden layer
            layer = tf.layers.dense(
                layer,
                self._hp.d_attention_ff,
                activation = tf.nn.relu,
                name = 'fc1')

            # output
            layer = tf.layers.dense(
                layer,
                self._hp.d_attention,
                name = 'fc2')

            # batch norm
            layer = tf.layers.batch_normalization(
                layer,
                training = self._training)

            # dropout
            layer = tf.layers.dropout(
                layer,
                rate = self._hp.dropout_rate,
                training = self._training)

            return layer

    def _build_model(self):
        """Build the forward graph from the batch tensors to ``_output_logits``.

        Requires ``_build_data_pipeline()`` to have run.  Sets the ``_training``
        placeholder and ``_output_logits`` (two logits per position).
        """
        with tf.variable_scope('model'):
            # placeholder: training flag (switches batch norm and dropout behaviour)
            self._training = tf.placeholder(tf.bool, name = 'training')

            # embed context words
            word_embeddings = tf.get_variable(
                'word_embeddings',
                [self._hp.vocab_size, self._hp.d_embedding_word])
            context_embedded = tf.nn.embedding_lookup(
                word_embeddings,
                self._context)

            # embed context positions
            position_embeddings = tf.get_variable(
                'position_embeddings',
                [self._hp.context_size, self._hp.d_embedding_position],
                dtype=tf.float32)
            context_positions_embedded = tf.nn.embedding_lookup(
                position_embeddings,
                self._context_positions)

            # build full context vector (concat embeddings)
            context_full = tf.concat(
                [context_embedded, context_positions_embedded],
                axis = -1)

            # build attention layers
            with tf.variable_scope('attention'):
                # build input vector: project the concatenated embeddings to d_attention
                context_attention = tf.layers.dense(
                    context_full,
                    self._hp.d_attention,
                    activation = tf.nn.relu,
                    name = 'input')
                context_attention = tf.layers.batch_normalization(
                    context_attention,
                    training = self._training,
                    name = 'input')
                context_attention = tf.layers.dropout(
                    context_attention,
                    rate = self._hp.dropout_rate,
                    training = self._training)

                layer = context_attention
                for i in range(self._hp.attention_num_layers):
                    with tf.variable_scope('layer_%d' % i):
                        layer = self._attention_self(layer)
                        # feed-forward sub-layer is currently disabled:
#                         layer = self._attention_feed_forward(layer)

            # build final softmax layer
            self._output_logits = tf.layers.dense(
                layer,
                2,
                name = 'softmax')

    def _build_training_model(self):
        """Add the loss, the precision/recall counters and the training op.

        Requires ``_build_model()`` to have run.  Sets ``_total_loss``,
        ``_mean_loss``, ``_true_positives``, ``_false_positives``,
        ``_false_negatives``, ``_global_step``, ``_optimizer`` and ``_train_op``.
        """
        with tf.variable_scope('train'):
            # label is 1 wherever targets >= 0, else 0
            labels = tf.cast(tf.logical_not(tf.less(self._targets, 0)), tf.int64)

            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = labels,
                logits = self._output_logits)

            # total is summed over every position of every example in the batch
            self._total_loss = tf.reduce_sum(losses, name = 'total_loss')
            self._mean_loss = tf.reduce_mean(losses, name = 'mean_loss')

            # precision/recall
            # softmax is monotonic, so the argmax can be taken on the logits directly
            output_labels = tf.argmax(self._output_logits, axis=-1)
            self._true_positives = tf.reduce_sum(output_labels * labels)
            self._false_positives = tf.reduce_sum(output_labels * (1 - labels))
            self._false_negatives = tf.reduce_sum((1 - output_labels) * labels)

            # batch-norm moving averages are updated through UPDATE_OPS; tie
            # them to the training step so they run whenever it does
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self._global_step = tf.Variable(0, name='global_step', trainable=False)
                self._optimizer = tf.train.AdamOptimizer(learning_rate=self._hp.learning_rate)

                # gradient clipping
                gradients, variables = zip(*self._optimizer.compute_gradients(self._mean_loss))
                gradients, _ = tf.clip_by_global_norm(
                    gradients,
                    self._hp.gradient_clip_norm)

                self._train_op = self._optimizer.apply_gradients(
                    zip(gradients, variables),
                    global_step = self._global_step)

    def build_model(self):
        """Build the complete graph: input pipeline, forward model, then training ops."""
        self._build_data_pipeline()
        self._build_model()
        self._build_training_model()

    def process(self,
                dataset_filenames,
                dataset_limit = -1,
                header = 'results',
                train = False,
                log_file = None):
        """Run one full pass over the given files and report aggregate metrics.

        Args:
            dataset_filenames: value fed to the filenames placeholder.
            dataset_limit: maximum number of records to read; -1 reads all.
            header: label used for the progress bar and the summary line.
            train: when true the training op runs on every minibatch and the
                graph is put in training mode; otherwise the pass is
                evaluation only.
            log_file: optional writable text file; the summary line is also
                written there and flushed.

        The summary line (time, global step, loss per example, precision,
        recall, F1) is printed.  The loss is reported as ``nan`` when the pass
        yielded no examples.
        """
        # initialize dataset to files
        self._session.run(self._dataset_iterator.initializer, feed_dict={
            self._dataset_filenames: dataset_filenames,
            self._dataset_limit: dataset_limit })

        cum_loss = 0
        cum_num_examples = 0
        cum_true_positives = 0
        cum_false_positives = 0
        cum_false_negatives = 0

        start = datetime.datetime.now()
        progress = tqdm_notebook(leave = False, desc = header)

        try:
            while True:
                # process a minibatch; the iterator signals the end of the
                # pass by raising OutOfRangeError
                try:
                    (_,
                     curr_total_loss,
                     curr_minibatch_size,
                     curr_true_positives,
                     curr_false_positives,
                     curr_false_negatives) = self._session.run(
                        (self._train_op if train else (),
                         self._total_loss,
                         self._minibatch_size,
                         self._true_positives,
                         self._false_positives,
                         self._false_negatives),
                        feed_dict = { self._training: train })
                except tf.errors.OutOfRangeError:
                    break

                # update stats/progress
                cum_loss += curr_total_loss
                cum_num_examples += curr_minibatch_size
                cum_true_positives += curr_true_positives
                cum_false_positives += curr_false_positives
                cum_false_negatives += curr_false_negatives
                progress.update(curr_minibatch_size)
        finally:
            # close the bar even when a minibatch fails with another error
            progress.close()
        finish = datetime.datetime.now()

        # precision
        precision = 0
        if cum_true_positives + cum_false_positives > 0:
            precision = cum_true_positives / (cum_true_positives + cum_false_positives)

        # recall
        recall = 0
        if cum_true_positives + cum_false_negatives > 0:
            recall = cum_true_positives / (cum_true_positives + cum_false_negatives)

        # F1
        F1 = 0
        if precision + recall > 0:
            F1 = 2 * precision * recall / (precision + recall)

        # loss per example; undefined (nan) for an empty pass instead of a
        # division by zero
        mean_loss = float('nan')
        if cum_num_examples > 0:
            mean_loss = cum_loss / cum_num_examples

        # print/log output
        message = '%s: time=%s, step=%d, loss=%g, precision=%g, recall=%g, F=%g' % (
            header,
            finish - start,
            # use the model's own session, not a notebook-level global
            tf.train.global_step(self._session, self._global_step),
            mean_loss,
            precision,
            recall,
            F1)
        print(message)
        if log_file:
            print(message, file=log_file)
            log_file.flush()

In [6]:
# Start from a clean graph: reset_tf() closes the previous session (if any) and
# returns a new one, so re-running this cell rebuilds the model from scratch.
sess = reset_tf(sess)

# Build the full graph, then print the size of every trainable variable.
model = EntityRecognitionModel(sess, HyperParameters())
model.build_model()
dump_statistics()

parameters for "model/word_embeddings:0": 3840000
parameters for "model/position_embeddings:0": 2592
parameters for "model/attention/input/kernel:0": 20480
parameters for "model/attention/input/bias:0": 128
parameters for "model/attention/input/gamma:0": 128
parameters for "model/attention/input/beta:0": 128
parameters for "model/attention/layer_0/self/kernels:0": 32768
parameters for "model/attention/layer_0/self/batch_normalization/gamma:0": 128
parameters for "model/attention/layer_0/self/batch_normalization/beta:0": 128
parameters for "model/attention/layer_1/self/kernels:0": 32768
parameters for "model/attention/layer_1/self/batch_normalization/gamma:0": 128
parameters for "model/attention/layer_1/self/batch_normalization/beta:0": 128
parameters for "model/attention/layer_2/self/kernels:0": 32768
parameters for "model/attention/layer_2/self/batch_normalization/gamma:0": 128
parameters for "model/attention/layer_2/self/batch_normalization/beta:0": 128
parameters for "model/attentio

In [7]:
# Initialise all global variables of the graph built above; run this before
# the training cell.
sess.run(tf.global_variables_initializer())

In [None]:
def list_files(path):
    """Return every entry of directory ``path`` joined onto ``path``, sorted ascending."""
    paths = []
    for entry in os.listdir(path):
        paths.append(os.path.join(path, entry))
    paths.sort()
    return paths

# Each split is a directory; the sorted list of its entries is what gets passed
# to model.process() as the dataset filenames. Paths are relative to the
# notebook's working directory.
train_set = list_files('../data/simplewiki/simplewiki-20171103.er_softmax_1.train')
dev_set = list_files('../data/simplewiki/simplewiki-20171103.er_softmax_1.dev')
test_set = list_files('../data/simplewiki/simplewiki-20171103.er_softmax_1.test')

In [None]:
# Run 10 passes: each pass trains on the training files and then evaluates on
# the dev files. Every summary line is printed and also written to the log
# file, which is opened in 'wt' mode and therefore overwritten on each run.
with open('../logs/simplewiki/mediawiki_er_softmax_1.log', 'wt') as f:
    for i in range(10):
        # training pass: runs the training op on every minibatch
        model.process(
            train_set,
            header = 'train %d' % i,
            train = True,
            log_file = f)
        # evaluation pass: no training op is run
        model.process(
            dev_set,
            header = 'dev %d' % i,
            train = False,
            log_file = f)

train 0: time=0:01:35.782300, step=1929, loss=9.80121, precision=0.595679, recall=0.523342, F=0.557172


dev 0: time=0:00:01.488481, step=1929, loss=7.29219, precision=0.823297, recall=0.463839, F=0.593375
