In [1]:
import tensorflow as tf
import numpy as np
import datetime
import json
import gzip
import matplotlib.pyplot as plt
import re
import os
import shutil
from tqdm import tqdm_notebook

  from ._conv import register_converters as _register_converters


In [2]:
# Open an interactive TensorFlow session.
# NOTE: the following cell rebinds `sess` to None without closing this session.
sess = tf.InteractiveSession()

In [3]:
# Forget any previous session handle so that reset_tf(sess) skips its close() step.
# NOTE: this only drops the reference; it does not close a session that is still open.
sess = None

In [4]:
def reset_tf(sess = None, log_device_placement = False):
    """Tear down the current TensorFlow state and return a fresh session.

    Args:
        sess: an existing session to close first; skipped when falsy.
        log_device_placement: forwarded to tf.ConfigProto.

    Returns:
        A new tf.InteractiveSession bound to the freshly reset default graph.
    """
    if sess:
        sess.close()

    # Start over with an empty default graph and a fixed graph-level seed.
    tf.reset_default_graph()
    tf.set_random_seed(0)

    session_config = tf.ConfigProto(log_device_placement = log_device_placement)
    return tf.InteractiveSession(config = session_config)

def dump_statistics():
    """Print the element count of every trainable variable, then the grand total."""
    def _num_elements(shape):
        # Multiply the static size of each dimension (a scalar shape yields 1).
        count = 1
        for axis in shape:
            count *= axis.value
        return count

    grand_total = 0
    for var in tf.trainable_variables():
        var_count = _num_elements(var.get_shape())
        print('parameters for "%s": %d' % (var.name, var_count))
        grand_total += var_count
    print('total parameters: %d' % grand_total)

In [5]:
class HyperParameters:
    """Static configuration values read by EntityRecognitionModel."""

    # --- optimisation ---
    learning_rate = 1e-3          # Adam learning rate
    gradient_clip_norm = 5.0      # global-norm threshold for gradient clipping
    dropout_rate = 0.1            # dropout rate used while training

    # --- vocabulary / label space ---
    vocab_size = 30000            # rows in the word-embedding table
    num_target_classes = 2000     # width of the class-logit output

    # --- sequence geometry ---
    context_size = 81             # tokens per example after cropping
    dataset_context_size = 121    # tokens per stored record before cropping

    # --- embedding widths ---
    d_embedding_position = 32
    d_embedding_word = 128

    # --- attention block widths ---
    d_attention = 128             # projection / block output width
    d_attention_ff = 512          # hidden width of the feed-forward sub-layer

    # --- depth ---
    num_encoder_layers = 3        # attention layers per direction in the encoder
    num_decoder_layers = 1        # only referenced by commented-out decoder self-attention

    # --- input pipeline ---
    dataset_batch_size = 512
    dataset_num_parallel_calls = 4
    dataset_prefetch_size = 4096
    dataset_shuffle_size = 4096   # intended shuffle-buffer size

    # --- loss shaping ---
    link_loss_pos_weight = 3.0    # pos_weight of the weighted link cross-entropy
    link_loss_scale = 0.5         # multiplier applied to the link losses

    # --- attention distance bias ---
    max_distance_bias = 10.0      # threshold used when building the distance mask

In [6]:
class EntityRecognitionModel:
    """Attention-based entity-link tagger built as a TensorFlow 1.x graph.

    For every token position the model emits a link logit (is this token part
    of a link?) and class logits over `hp.num_target_classes` targets. Input
    comes from GZIP-compressed TFRecord files through an initializable
    tf.data iterator.

    Typical use: construct with a session and a hyperparameter object, call
    build_model() once, initialise variables, then call process() per epoch.
    """

    def __init__(self, session, hp):
        """Store the session and hyperparameters; no graph ops are created here."""
        self._session = session
        self._hp = hp

    def _parse_example(self, example_proto):
        """Parse one serialized example and crop it to `hp.context_size` tokens.

        Returns:
            (context, targets): two int64 vectors cropped with the same offset.
        """
        parsed = tf.parse_single_example(example_proto, features = {
            'page': tf.FixedLenFeature([1], tf.int64),
            'context': tf.FixedLenFeature([self._hp.dataset_context_size], tf.int64),
            'targets': tf.FixedLenFeature([self._hp.dataset_context_size], tf.int64),
            'targets_left': tf.FixedLenFeature([self._hp.dataset_context_size], tf.int64),
            'targets_right': tf.FixedLenFeature([self._hp.dataset_context_size], tf.int64) })

        # Pseudo-random crop: the offset is a deterministic function of the
        # seed placeholder and the page id, in [0, (dataset_context_size - context_size) // 2).
        # NOTE(review): the modulus is zero when the two sizes differ by less than 2.
        offset = (self._dataset_random_seed + parsed['page'][0]) % (
            (self._hp.dataset_context_size - self._hp.context_size) // 2)

        # apply the same crop to tokens and labels
        context = parsed['context']
        context = context[offset:offset + self._hp.context_size]
        targets = parsed['targets']
        targets = targets[offset:offset + self._hp.context_size]

        return (context, targets)

    def _build_data_pipeline(self):
        """Create the dataset placeholders, the input pipeline and its iterator.

        The placeholders parameterise the dataset; process() feeds them when it
        runs the iterator initializer.
        """
        with tf.variable_scope('dataset'):
            # placeholders
            self._dataset_filenames = tf.placeholder(
                tf.string,
                shape = [None],
                name = 'dataset_filenames')
            self._dataset_limit = tf.placeholder_with_default(
                tf.constant(-1, tf.int64),
                shape = [],
                name = 'dataset_limit')
            # Default to the configured shuffle buffer size (previously this
            # mistakenly reused dataset_batch_size).
            self._dataset_shuffle_size = tf.placeholder_with_default(
                tf.constant(self._hp.dataset_shuffle_size, tf.int64),
                shape = [],
                name = 'dataset_shuffle_size')
            self._dataset_batch_size = tf.placeholder_with_default(
                tf.constant(self._hp.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_batch_size')
            self._dataset_prefetch_size = tf.placeholder_with_default(
                tf.constant(self._hp.dataset_prefetch_size, tf.int64),
                shape = [],
                name = 'dataset_prefetch_size')
            self._dataset_random_seed = tf.placeholder_with_default(
                tf.constant(0, tf.int64),
                shape = [],
                name = 'dataset_random_seed')

            # build dataset: files are read in a random order; `take` is applied
            # to raw records, i.e. before parsing, shuffling and batching
            dataset = tf.data.TFRecordDataset(
                tf.random_shuffle(self._dataset_filenames),
                compression_type='GZIP')
            dataset = dataset.take(self._dataset_limit)
            dataset = dataset.map(
                self._parse_example,
                num_parallel_calls = self._hp.dataset_num_parallel_calls)
            dataset = dataset.shuffle(self._dataset_shuffle_size)
            dataset = dataset.prefetch(self._dataset_prefetch_size)
            dataset = dataset.batch(self._dataset_batch_size)

            # build iterator
            self._dataset_iterator = dataset.make_initializable_iterator()
            context, targets = self._dataset_iterator.get_next()

            # give key tensors names
            self._context = tf.identity(context, 'context')
            self._targets = tf.identity(targets, 'targets')

            # minibatch size (dynamic: the last batch may be smaller)
            self._minibatch_size = tf.shape(self._context)[0]
            self._minibatch_size = tf.identity(self._minibatch_size, 'minibatch_size')

            # positions: [0 .. context_size) repeated for every row of the batch
            p = tf.range(self._hp.context_size, dtype = tf.int64)
            p = tf.tile(p, [self._minibatch_size])
            p = tf.reshape(
                p,
                [self._minibatch_size, self._hp.context_size],
                name = 'context_positions')
            self._context_positions = p

    def _attention_layer(self, keys, queries, values, direction):
        """Build one attention block followed by a feed-forward stack.

        The weight matrix is (projected `keys`) x (projected `queries`)^T, so
        its rows follow the positions of `keys` and the softmax normalises over
        the positions of `queries`; the weights are then applied to `values`.

        Args:
            keys, queries, values: rank-3 tensors; the set_shape calls below
                declare them as [batch, context_size, depth].
            direction: 'f' lets position i attend only to positions j < i,
                any other non-None value only to j > i, None applies no
                directional mask.

        Returns:
            Tensor of shape [batch, context_size, d_attention].
        """
        with tf.variable_scope('attention'):
            # variables
            key_projection = tf.get_variable(
                'key_projection',
                [keys.shape[-1].value, self._hp.d_attention])
            query_projection = tf.get_variable(
                'query_projection',
                [queries.shape[-1].value, self._hp.d_attention])

            # compute weights
            q = tf.tensordot(keys, key_projection, axes = 1)      # [batch_size, context_size, d_attention]
            q.set_shape([None, self._hp.context_size, self._hp.d_attention])
            k = tf.tensordot(queries, query_projection, axes = 1) # [batch_size, context_size, d_attention]
            k.set_shape([None, self._hp.context_size, self._hp.d_attention])
            k = tf.transpose(k, perm = [0, 2, 1])                 # [batch_size, d_attention, context_size]
            w = tf.matmul(q, k)                                   # [batch_size, context_size, context_size]
            w /= np.sqrt(self._hp.d_attention)

            # apply distance mask, scaled by a learned factor
            # NOTE(review): `max` gives every pair closer than max_distance_bias
            # the same bias and penalises farther pairs linearly. If the intent
            # was to cap the penalty at max_distance_bias this should be `min`
            # -- confirm before changing, since it alters trained behaviour.
            mask = tf.constant(
                [[-max(float(np.abs(i - j)), self._hp.max_distance_bias)
                    for j in range(self._hp.context_size)]
                    for i in range(self._hp.context_size)])
            mask = tf.expand_dims(mask, axis = 0)                 # [1, context_size, context_size]
            mask *= self._distance_scaling_factor
            w += mask

            # apply directional mask: tf.minimum with -infinity removes an
            # entry, +infinity leaves the score untouched
            if direction is not None:
                infinity= 1e25
                if direction == 'f':
                    mask = [[-infinity if i <= j else infinity
                        for j in range(self._hp.context_size)]
                        for i in range(self._hp.context_size)]
                    # keep one open entry so the first row is not fully masked
                    mask[0][0] = infinity
                    mask = tf.constant(mask)
                else:
                    mask = [[-infinity if i >= j else infinity
                        for j in range(self._hp.context_size)]
                        for i in range(self._hp.context_size)]
                    # keep one open entry so the last row is not fully masked
                    mask[-1][-1] = infinity
                    mask = tf.constant(mask)
                mask = tf.expand_dims(mask, axis = 0)             # [1, context_size, context_size]
                w = tf.minimum(w, mask)

            # softmax
            w = tf.nn.softmax(w, name = 'weights')

#             # apply weights
#             layer = values + tf.matmul(w, values)

            # apply weights: concatenate the raw values with the attended values
            layer = tf.concat([values, tf.matmul(w, values)], axis = -1)

            # feed-forward hidden layer
            layer = tf.layers.dense(
                layer,
                self._hp.d_attention_ff,
                activation = tf.nn.relu,
                name = 'ff_hidden')

            # feed-forward output layer
            layer = tf.layers.dense(
                layer,
                self._hp.d_attention,
                name = 'ff_output')

            # batch norm
            layer = tf.layers.batch_normalization(
                layer,
                training = self._training)

            # dropout
            layer = tf.layers.dropout(
                layer,
                rate = self._hp.dropout_rate,
                training = self._training)

            return layer

    def _attention_layer_self(self, layer, direction):
        """Self-attention: use `layer` as keys, queries and values."""
        return self._attention_layer(layer, layer, layer, direction)

    def _build_model(self):
        """Build embeddings, the two-direction encoder, the decoder and the output heads."""
        with tf.variable_scope('model'):
            # placeholder: training flag
            self._training = tf.placeholder(tf.bool, name = 'training')

            # embed context words
            word_embeddings = tf.get_variable(
                'word_embeddings',
                [self._hp.vocab_size, self._hp.d_embedding_word])
            context_embedded = tf.nn.embedding_lookup(
                word_embeddings,
                self._context)

            # embed context positions
            position_embeddings = tf.get_variable(
                'position_embeddings',
                [self._hp.context_size, self._hp.d_embedding_position],
                dtype=tf.float32)
            context_positions_embedded = tf.nn.embedding_lookup(
                position_embeddings,
                self._context_positions)

            # build full context vector (concat embeddings)
            context_full = tf.concat(
                [context_embedded, context_positions_embedded],
                axis = -1)

            # encoder
            with tf.variable_scope('encoder'):
                # learned scalar multiplying the distance mask in every attention layer
                self._distance_scaling_factor = tf.get_variable(
                    'distance_scaling_factor',
                    shape = [],
                    initializer = tf.constant_initializer([0.3]))

                # build input vector
                context_attention = tf.layers.dense(
                    context_full,
                    self._hp.d_attention,
                    activation = tf.nn.relu,
                    name = 'input')
                context_attention = tf.layers.batch_normalization(
                    context_attention,
                    training = self._training,
                    name = 'input')
                context_attention = tf.layers.dropout(
                    context_attention,
                    rate = self._hp.dropout_rate,
                    training = self._training)

                # attention layers: two independent stacks, one per direction
                layer_f = context_attention
                layer_b = context_attention
                for i in range(self._hp.num_encoder_layers):
                    with tf.variable_scope('layer_f_%d' % i):
                        layer_f = self._attention_layer_self(layer_f, direction = 'f')
                    with tf.variable_scope('layer_b_%d' % i):
                        layer_b = self._attention_layer_self(layer_b, direction = 'b')

                self._encoder_output = tf.concat(
                    [layer_f, layer_b],
                    axis = -1,
                    name = 'encoder_output')

            # decoder
            with tf.variable_scope('decoder'):
                # build history from targets: shift right by one position and
                # put a -1 sentinel in front
                sentinels = -tf.ones([self._minibatch_size, 1], dtype = tf.int64)
                layer = tf.concat([sentinels, self._targets[:, :-1]], axis = -1)
                # N.B., convert -1 -> 0
                layer = layer + 1

                # DEBUG: remove me
                self._decoder_input = layer

                # embedding (one extra row for the shifted "no link" id 0)
                decoder_embeddings = tf.get_variable(
                    'decoder_embeddings',
                    [self._hp.num_target_classes + 1, self._hp.d_attention])
                layer = tf.nn.embedding_lookup(
                    decoder_embeddings,
                    layer)

#                 # self attention
#                 for i in range(self._hp.num_decoder_layers):
#                     with tf.variable_scope('layer_b_%d' % i):
#                         layer = self._attention_layer_self(layer, direction = 'b')

                # encoder-decoder attention
                layer = self._attention_layer(
                    keys = layer,
                    queries = self._encoder_output,
                    values = self._encoder_output,
                    direction = None)

            # link detection
            self._output_link_logits = tf.layers.dense(layer, 1)
            self._output_link_logits = tf.squeeze(
                self._output_link_logits,
                axis = -1,
                name = 'output_link_logits')

            # class identification
            self._output_class_logits = tf.layers.dense(
                layer,
                self._hp.num_target_classes,
                name = 'output_class_logits')

    def _build_training_model(self):
        """Build the losses, the link precision/recall counters and the train op."""
        with tf.variable_scope('train'):
            # link detection losses (a target >= 0 marks a link)
            target_links = tf.cast(tf.logical_not(tf.less(self._targets, 0)), tf.int64)
            link_losses = tf.nn.weighted_cross_entropy_with_logits(
                targets = tf.cast(target_links, tf.float32),
                logits = self._output_link_logits,
                pos_weight = self._hp.link_loss_pos_weight)
            link_losses *= self._hp.link_loss_scale

            # class identification losses
            targets = tf.maximum(self._targets, 0) # mask out -1s
            class_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = targets,
                logits = self._output_class_logits)
            # (mask off losses for non-links)
            class_losses *= tf.cast(target_links, tf.float32)

            # total loss (summed over every position of every example)
            self._total_link_loss = tf.reduce_sum(link_losses, name = 'total_link_loss')
            self._total_class_loss = tf.reduce_sum(class_losses, name = 'total_class_loss')
            self._total_loss = self._total_link_loss + self._total_class_loss
            self._total_loss = tf.identity(self._total_loss, name = 'total_loss')

            # mean loss (per example, not per token)
            self._mean_loss = self._total_loss / tf.cast(self._minibatch_size, tf.float32)
            self._mean_loss = tf.identity(self._mean_loss, name = 'mean_loss')

            # precision/recall
            # N.B., sigmoid(x) > 0.5 is equivalent to x > 0, so the sigmoid is
            # not strictly needed for the threshold.
            output_link_probs = tf.sigmoid(self._output_link_logits)
            self._output_links = output_link_probs > 0.5
            self._output_links = tf.cast(
                self._output_links,
                tf.int64,
                name = 'output_links')
            self._true_positives = tf.reduce_sum(
                self._output_links * target_links,
                name = 'true_positives')
            self._false_positives = tf.reduce_sum(
                self._output_links * (1 - target_links),
                name = 'false_positives')
            self._false_negatives = tf.reduce_sum(
                (1 - self._output_links) * target_links,
                name = 'false_negatives')

            # run the batch-norm moving-average updates together with each train step
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self._global_step = tf.Variable(0, name='global_step', trainable=False)
                self._optimizer = tf.train.AdamOptimizer(learning_rate=self._hp.learning_rate)

                # gradient clipping
                gradients, variables = zip(*self._optimizer.compute_gradients(self._mean_loss))
                gradients, _ = tf.clip_by_global_norm(
                    gradients,
                    self._hp.gradient_clip_norm)

                self._train_op = self._optimizer.apply_gradients(
                    zip(gradients, variables),
                    global_step = self._global_step)

    def build_model(self):
        """Build the whole graph: data pipeline, model, then training ops."""
        self._build_data_pipeline()
        self._build_model()
        self._build_training_model()

    def process(self,
                dataset_filenames,
                dataset_limit = -1,
                header = 'results',
                train = False,
                log_file = None):
        """Run one full pass over the given files and report aggregate metrics.

        Args:
            dataset_filenames: list of record files to read.
            dataset_limit: maximum number of records to read (-1 for all).
            header: label used for the progress bar and the summary line.
            train: when True, also run the training op on each minibatch.
            log_file: optional writable text file that receives the summary line.

        Prints (and optionally logs) elapsed time, global step, mean
        per-example losses, and link precision/recall/F1. Returns None.
        """
        # initialize dataset to files
        self._session.run(self._dataset_iterator.initializer, feed_dict={
            self._dataset_filenames: dataset_filenames,
            self._dataset_limit: dataset_limit })

        cum_loss = 0
        cum_link_loss = 0
        cum_class_loss = 0
        cum_num_examples = 0
        cum_true_positives = 0
        cum_false_positives = 0
        cum_false_negatives = 0

        start = datetime.datetime.now()
        progress = tqdm_notebook(leave = False, desc = header)

        while True:
            # process a minibatch
            try:
                (_,
                 curr_total_loss,
                 curr_total_link_loss,
                 curr_total_class_loss,
                 curr_minibatch_size,
                 curr_true_positives,
                 curr_false_positives,
                 curr_false_negatives) = self._session.run(
                    (self._train_op if train else (),
                     self._total_loss,
                     self._total_link_loss,
                     self._total_class_loss,
                     self._minibatch_size,
                     self._true_positives,
                     self._false_positives,
                     self._false_negatives),
                    feed_dict = { self._training: train })
            except tf.errors.OutOfRangeError:
                # the iterator is exhausted: end of this pass
                break

            # update stats/progress
            cum_loss += curr_total_loss
            cum_link_loss += curr_total_link_loss
            cum_class_loss += curr_total_class_loss
            cum_num_examples += curr_minibatch_size
            cum_true_positives += curr_true_positives
            cum_false_positives += curr_false_positives
            cum_false_negatives += curr_false_negatives
            progress.update(curr_minibatch_size)

        progress.close()
        finish = datetime.datetime.now()

        # precision
        precision = 0
        if cum_true_positives + cum_false_positives > 0:
            precision = cum_true_positives / (cum_true_positives + cum_false_positives)

        # recall
        recall = 0
        if cum_true_positives + cum_false_negatives > 0:
            recall = cum_true_positives / (cum_true_positives + cum_false_negatives)

        # F1
        F1 = 0
        if precision + recall > 0:
            F1 = 2 * precision * recall / (precision + recall)

        # An empty dataset yields no batches; report NaN losses instead of
        # raising ZeroDivisionError.
        num_examples = cum_num_examples if cum_num_examples > 0 else float('nan')

        # print/log output (the step is read through this model's own session,
        # not a notebook-level global)
        message = '%s: time=%s, step=%d, loss=%g (%g + %g), precision=%g, recall=%g, F=%g' % (
            header,
            finish - start,
            tf.train.global_step(self._session, self._global_step),
            cum_loss / num_examples,
            cum_link_loss / num_examples,
            cum_class_loss / num_examples,
            precision,
            recall,
            F1)
        print(message)
        if log_file:
            print(message, file=log_file)
            log_file.flush()

In [7]:
# Start from a clean graph and session, build the complete model graph,
# then print the parameter count of every trainable variable.
sess = reset_tf(sess)

model = EntityRecognitionModel(sess, HyperParameters())
model.build_model()
dump_statistics()

parameters for "model/word_embeddings:0": 3840000
parameters for "model/position_embeddings:0": 2592
parameters for "model/encoder/distance_scaling_factor:0": 1
parameters for "model/encoder/input/kernel:0": 20480
parameters for "model/encoder/input/bias:0": 128
parameters for "model/encoder/input/gamma:0": 128
parameters for "model/encoder/input/beta:0": 128
parameters for "model/encoder/layer_f_0/attention/key_projection:0": 16384
parameters for "model/encoder/layer_f_0/attention/query_projection:0": 16384
parameters for "model/encoder/layer_f_0/attention/ff_hidden/kernel:0": 131072
parameters for "model/encoder/layer_f_0/attention/ff_hidden/bias:0": 512
parameters for "model/encoder/layer_f_0/attention/ff_output/kernel:0": 65536
parameters for "model/encoder/layer_f_0/attention/ff_output/bias:0": 128
parameters for "model/encoder/layer_f_0/attention/batch_normalization/gamma:0": 128
parameters for "model/encoder/layer_f_0/attention/batch_normalization/beta:0": 128
parameters for "mo

In [8]:
# Initialise every variable created by build_model() before training or evaluation.
sess.run(tf.global_variables_initializer())

In [9]:
def list_files(path):
    """Return the entries of directory `path`, joined with `path`, in sorted order."""
    full_paths = [os.path.join(path, entry) for entry in os.listdir(path)]
    full_paths.sort()
    return full_paths

# Record files of each split, listed in sorted order; these lists are passed
# to the model as `dataset_filenames`.
train_set = list_files('../data/simplewiki/simplewiki-20171103.er_softmax_1.train')
dev_set = list_files('../data/simplewiki/simplewiki-20171103.er_softmax_1.dev')
test_set = list_files('../data/simplewiki/simplewiki-20171103.er_softmax_1.test')

In [10]:
# sess.run(model._dataset_iterator.initializer, feed_dict = {
#     model._dataset_filenames: train_set[:1],
#     model._training: False})
# x = tf.get_default_graph().get_tensor_by_name('model/attention/layer_back_0/self/weights:0')
# y = sess.run(x, feed_dict = { model._training: False })
# for i in range(20):
#     for j in range(20):
#         print('%0.02f ' % y[6, i+30, j+30], end = '')
#     print()  

In [11]:
# Train for up to 5 passes over the training files, evaluating on the dev files
# after each pass; every summary line is printed and appended to the log file.
# (The recorded run below was interrupted manually during the third pass.)
with open('../logs/simplewiki/mediawiki_er_softmax_2.log', 'wt') as f:
    for i in range(5):
        # training pass: runs the train op on every minibatch
        model.process(
            train_set,
            header = 'train %d' % i,
            train = True,
            log_file = f)
        # evaluation pass: metrics only, no parameter updates
        model.process(
            dev_set, 
            header = 'dev %d' % i,
            train = False,
            log_file = f)

train 0: time=0:04:47.841262, step=965, loss=18.2051 (7.37094 + 10.8341), precision=0.564352, recall=0.811492, F=0.665725


dev 0: time=0:00:05.071193, step=965, loss=7.4759 (5.00079 + 2.47511), precision=0.749689, recall=0.810566, F=0.77894


train 1: time=0:04:48.273748, step=1930, loss=5.91535 (4.32222 + 1.59313), precision=0.708634, recall=0.873518, F=0.782485


dev 1: time=0:00:04.677835, step=1930, loss=6.1505 (4.46686 + 1.68364), precision=0.77002, recall=0.833448, F=0.80048


KeyboardInterrupt: 

In [12]:
# Export the current session as a SavedModel. Any previous export is deleted
# first (errors such as a missing directory are ignored).
shutil.rmtree('../models/simplewiki/er_softmax_2', ignore_errors = True)
builder = tf.saved_model.builder.SavedModelBuilder('../models/simplewiki/er_softmax_2')
# Variables are written once, with the TRAINING-tagged meta graph ...
builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.TRAINING])
# ... and a second meta graph is added under the SERVING tag.
builder.add_meta_graph([tf.saved_model.tag_constants.SERVING])
builder.save()

INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b'../models/simplewiki/er_softmax_2/saved_model.pb'


b'../models/simplewiki/er_softmax_2/saved_model.pb'

# Error Analysis

In [12]:
# Extra inference outputs used by the error-analysis cells: link probabilities,
# class probabilities, and the most likely class per position.
# Each op is named after what it actually produces (the argmax was previously
# mislabelled 'output_class_probs').
model._output_link_probs = tf.sigmoid(model._output_link_logits, name = 'output_link_probs')
model._output_class_probs = tf.nn.softmax(model._output_class_logits, name = 'output_class_probs')
model._output_classes = tf.argmax(model._output_class_probs, axis = -1, name = 'output_classes')

In [13]:
# Vocabulary lookup table: the entry at index N is the word with id N
# (one word per line, surrounding whitespace removed).
with open('../data/simplewiki/simplewiki-20171103.er_softmax_1.vocab.txt', 'rt') as f:
    id_to_word = list(map(str.strip, f))

In [14]:
# Target lookup table: the entry at index N is the link target with class id N
# (one target per line, surrounding whitespace removed).
with open('../data/simplewiki/simplewiki-20171103.er_softmax_1.targets.txt', 'rt') as f:
    id_to_target = list(map(str.strip, f))

In [15]:
def compute_examples(filenames, limit = None):
    """Greedily decode examples from `filenames` and return them in readable form.

    Uses the notebook-level `sess`, `model`, `id_to_word` and `id_to_target`.

    Args:
        filenames: list of record files to read.
        limit: maximum number of examples to return; a falsy value (None or 0)
            means no limit.

    Returns:
        A list with one entry per example:
        [context words, gold targets (None where there is no link),
         predicted targets (None where no link is predicted),
         link probabilities, probability of the predicted class (0.0 where no
         link is predicted)].
    """
    # The dataset limit only takes effect when the iterator is initialised, so
    # it is fed here; -1 keeps every record. (Feeding None to a placeholder
    # raises an error, which previously made the default limit unusable.)
    dataset_limit = limit if limit else -1
    sess.run(model._dataset_iterator.initializer, feed_dict = {
        model._dataset_filenames: filenames,
        model._dataset_limit: dataset_limit })

    examples = []

    while True:
        # pull the next minibatch of inputs
        try:
            context, targets = sess.run([model._context, model._targets])
        except tf.errors.OutOfRangeError:
            break

        # decode left to right: the prediction made at position i is written
        # into decode_targets so that later positions can condition on it
        output_links = np.zeros(targets.shape, dtype = np.int64)
        output_link_probs = np.zeros(targets.shape)
        output_classes = np.zeros(targets.shape, dtype = np.int64)
        output_class_probs = np.zeros(targets.shape + (model._hp.num_target_classes,))
        decode_targets = -np.ones(targets.shape, dtype = np.int64)
        for i in range(model._hp.context_size):
            # feeding context/targets overrides the dataset tensors, so the
            # iterator is not advanced by these runs
            ols, olps, ocs, ocps = sess.run(
                [model._output_links,
                 model._output_link_probs,
                 model._output_classes,
                 model._output_class_probs],
                feed_dict = {
                    model._context: context,
                    model._targets: decode_targets,
                    model._training: False })
            output_links[:, i] = ols[:, i]
            output_link_probs[:, i] = olps[:, i]
            output_classes[:, i] = ocs[:, i]
            output_class_probs[:, i, :] = ocps[:, i, :]

            # feed predicted classes back in wherever a link was predicted
            linked = ols[:, i].astype(bool)
            decode_targets[linked, i] = ocs[linked, i]

        # loop through examples
        for cs, ts, ols, olps, ocs, ocps in zip(context, targets, output_links, output_link_probs, output_classes, output_class_probs):
            # stop if limit reached
            if limit and len(examples) >= limit:
                return examples

            # map ids back to strings
            decoded_context = [id_to_word[wid] for wid in cs]
            decoded_targets = [id_to_target[tid] if tid >= 0 else None for tid in ts]
            decoded_output_classes = [id_to_target[ocs[i]] if ols[i] else None for i in range(len(ols))]
            decoded_class_probs = [ocps[i, ocs[i]] if ols[i] else 0.0 for i in range(len(ols))]

            examples.append([
                decoded_context,
                decoded_targets,
                decoded_output_classes,
                olps,
                decoded_class_probs])

    return examples

In [16]:
# Decode at most 10 examples from the first dev file for manual inspection.
examples = compute_examples(dev_set[:1], limit = 10)

In [17]:
def print_example(e):
    """Print one decoded example as a table, one row per token.

    `e` is a 5-item sequence of parallel per-token sequences: words, gold
    targets, predicted targets, link probabilities and class probabilities.
    The class probability column is left blank when the value is falsy.
    """
    words, gold_targets, predicted, link_ps, class_ps = e
    for row in zip(words, gold_targets, predicted, link_ps, class_ps):
        word, target, output, link_prob, class_prob = row
        if class_prob:
            class_text = ' %0.3f' % class_prob
        else:
            class_text = ''
        print('%20.20s %20.20s %20.20s  %0.3f %s' % (word, target, output, link_prob, class_text))

In [21]:
# Show the sixth decoded example (index 5) as a per-token table.
print_example(examples[5])

               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
      