In [1]:
import tensorflow as tf
import numpy as np
import datetime
import json
import gzip
import matplotlib.pyplot as plt
import re
import os
import shutil
from tqdm import tqdm_notebook

In [2]:
sess = None

In [3]:
def reset_tf(sess = None, log_device_placement = False):
    if sess:
        sess.close()
    tf.reset_default_graph()
    tf.set_random_seed(0)
    return tf.InteractiveSession(config = tf.ConfigProto(log_device_placement = log_device_placement))

def dump_statistics():
    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        print('parameters for "%s": %d' % (variable.name, variable_parameters))
        total_parameters += variable_parameters
    print('total parameters: %d' % total_parameters)

In [4]:
class HyperParameters:
    learning_rate = 1e-3
    
    vocab_size = 30000
    num_targets = 2000
    
    dropout_rate = 0.1
    
    context_size = 81
    
    d_embedding_position = 32
    d_embedding_word = 128
    
    d_attention = 128
    d_attention_ff = 512
    
    attention_num_layers = 4

    dataset_context_size = 121
    dataset_batch_size = 512
    dataset_num_parallel_calls = 4
    dataset_prefetch_size = 4096
    dataset_shuffle_size = 4096
    
    gradient_clip_norm = 5.0
    
    link_loss_pos_weight = 3.0
    link_loss_scale = 2.0

In [5]:
class EntityRecognitionModel:
    def __init__(self, session, hp):
        self._session = session
        self._hp = hp
        
    def _parse_example(self, example_proto):
        parsed = tf.parse_single_example(example_proto, features = {
            'page': tf.FixedLenFeature([1], tf.int64),
            'context': tf.FixedLenFeature([self._hp.dataset_context_size], tf.int64),
            'targets': tf.FixedLenFeature([self._hp.dataset_context_size], tf.int64),
            'targets_left': tf.FixedLenFeature([self._hp.dataset_context_size], tf.int64),
            'targets_right': tf.FixedLenFeature([self._hp.dataset_context_size], tf.int64) })
        
        # apply random crop
        offset = (self._dataset_random_seed + parsed['page'][0]) % (
            (self._hp.dataset_context_size - self._hp.context_size) // 2)
#         offset = tf.random_uniform(
#             [],
#             maxval = self._hp.dataset_context_size - self._hp.context_size,
#             seed = seed,
#             dtype = tf.int32)

        # apply croppings
        context = parsed['context']
        context = context[offset:offset + self._hp.context_size]
        targets = parsed['targets']
        targets = targets[offset:offset + self._hp.context_size]
        targets_left = parsed['targets_left']
        targets_left = targets_left[offset:offset + self._hp.context_size]
        targets_right = parsed['targets_right']
        targets_right = targets_right[offset:offset + self._hp.context_size]
        
        return (context, targets, targets_left, targets_right)

    def _build_data_pipeline(self):
        with tf.variable_scope('dataset'):
            # placeholders
            self._dataset_filenames = tf.placeholder(
                tf.string,
                shape = [None],
                name = 'dataset_filenames')
            self._dataset_limit = tf.placeholder_with_default(
                tf.constant(-1, tf.int64),
                shape = [],
                name = 'dataset_limit')
            self._dataset_shuffle_size = tf.placeholder_with_default(
                tf.constant(self._hp.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_shuffle_size')
            self._dataset_batch_size = tf.placeholder_with_default(
                tf.constant(self._hp.dataset_batch_size, tf.int64),
                shape = [],
                name = 'dataset_batch_size')
            self._dataset_prefetch_size = tf.placeholder_with_default(
                tf.constant(self._hp.dataset_prefetch_size, tf.int64),
                shape = [],
                name = 'dataset_prefetch_size')
            self._dataset_random_seed = tf.placeholder_with_default(
                tf.constant(0, tf.int64),
                shape = [],
                name = 'dataset_random_seed')

            # build dataset
            dataset = tf.data.TFRecordDataset(
                tf.random_shuffle(self._dataset_filenames),
                compression_type='GZIP')
            dataset = dataset.take(self._dataset_limit)
            dataset = dataset.map(
                self._parse_example,
                num_parallel_calls = self._hp.dataset_num_parallel_calls)
            dataset = dataset.shuffle(self._dataset_shuffle_size)
            dataset = dataset.prefetch(self._dataset_prefetch_size)
            dataset = dataset.batch(self._dataset_batch_size)
            
            # build iterator
            self._dataset_iterator = dataset.make_initializable_iterator()
            (context, targets, targets_left, targets_right) = self._dataset_iterator.get_next()

            # give key tensors names
            self._context = tf.identity(context, 'context')
            self._targets = tf.identity(targets, 'targets')
            self._targets_left = tf.identity(targets_left, 'targets_left')
            self._targets_right = tf.identity(targets_right, 'targets_right')

            # minibatch size
            self._minibatch_size = tf.shape(self._context)[0]
            self._minibatch_size = tf.identity(self._minibatch_size, 'minibatch_size')
            
            # positions
            p = tf.range(self._hp.context_size, dtype = tf.int64)
            p = tf.tile(p, [self._minibatch_size])
            p = tf.reshape(
                p,
                [self._minibatch_size, self._hp.context_size],
                name = 'context_positions')
            self._context_positions = p
            
    def _attention_self(self, layer):
        with tf.variable_scope('self'):
            # variables
            kernels = tf.get_variable(
                'kernels',
                [2, self._hp.d_attention, self._hp.d_attention])
            
            # compute weights
            k0 = tf.tensordot(layer, kernels[0], axes = 1) # [batch_size, context_size, d_attention]
            k0.set_shape([None, self._hp.context_size, self._hp.d_attention])
            k1 = tf.tensordot(layer, kernels[1], axes = 1) # [batch_size, context_size, d_attention]
            k1.set_shape([None, self._hp.context_size, self._hp.d_attention])
            k1 = tf.transpose(k1, perm = [0, 2, 1])        # [batch_size, d_attention, context_size]
            w = tf.matmul(k0, k1)                          # [batch_size, context_size, context_size]
            mask = tf.diag([-1e20] * self._hp.context_size)
            mask = tf.expand_dims(mask, axis = 0)          # [1, context_size, context_size]
            w += mask
            w /= np.sqrt(self._hp.d_attention)
            w = tf.nn.softmax(w)
            
            # apply weights
            layer += tf.matmul(w, layer)
            
            # batch norm
            layer = tf.layers.batch_normalization(
                layer,
                training = self._training)

            # dropout
            layer = tf.layers.dropout(
                layer,
                rate = self._hp.dropout_rate,
                training = self._training)
            
            return layer

    def _attention_feed_forward(self, layer):
        with tf.variable_scope('ff'):
            # hidden layer
            layer = tf.layers.dense(
                layer,
                self._hp.d_attention_ff,
                activation = tf.nn.relu,
                name = 'fc1')
            
            # output
            layer = tf.layers.dense(
                layer,
                self._hp.d_attention,
                name = 'fc2')
            
            # batch norm
            layer = tf.layers.batch_normalization(
                layer,
                training = self._training)
            
            # dropout
            layer = tf.layers.dropout(
                layer, 
                rate = self._hp.dropout_rate, 
                training = self._training)
        
            return layer
            
    def _build_model(self):
        with tf.variable_scope('model'):
            # placeholder: training flag
            self._training = tf.placeholder(tf.bool, name = 'training')
            
            # embed context words
            word_embeddings = tf.get_variable(
                'word_embeddings', 
                [self._hp.vocab_size, self._hp.d_embedding_word])
            context_embedded = tf.nn.embedding_lookup(
                word_embeddings,
                self._context)

            # embed context positions
            position_embeddings = tf.get_variable(
                'position_embeddings',
                [self._hp.context_size, self._hp.d_embedding_position],
                dtype=tf.float32)
            context_positions_embedded = tf.nn.embedding_lookup(
                position_embeddings,
                self._context_positions)

            # build full context vector (concat embeddings)
            context_full = tf.concat(
                [context_embedded, context_positions_embedded], 
                axis = -1)
            
            # build attention layers
            with tf.variable_scope('attention'):
                # build input vector
                context_attention = tf.layers.dense(
                    context_full,
                    self._hp.d_attention,
                    activation = tf.nn.relu,
                    name = 'input')
                context_attention = tf.layers.batch_normalization(
                    context_attention,
                    training = self._training,
                    name = 'input')
                context_attention = tf.layers.dropout(
                    context_attention,
                    rate = self._hp.dropout_rate,
                    training = self._training)
                
                layer = context_attention
                for i in range(self._hp.attention_num_layers):
                    with tf.variable_scope('layer_%d' % i):
                        layer = self._attention_self(layer)
                        layer = self._attention_feed_forward(layer)
            
            # link detection
            self._output_link_logits = tf.layers.dense(layer, 1)
            self._output_link_logits = tf.squeeze(
                self._output_link_logits,
                axis = -1,
                name = 'output_link_logits')

            # class identification
            self._output_class_logits = tf.layers.dense(
                layer,
                self._hp.num_targets,
                name = 'output_class_logits')

    def _build_training_model(self):
        with tf.variable_scope('train'):
            # link detection losses
            target_links = tf.cast(tf.logical_not(tf.less(self._targets, 0)), tf.int64)
            link_losses = tf.nn.weighted_cross_entropy_with_logits(
                targets = tf.cast(target_links, tf.float32),
                logits = self._output_link_logits,
                pos_weight = self._hp.link_loss_pos_weight)
            link_losses *= self._hp.link_loss_scale
            
            # DEBUG: remove me
            self._link_losses = link_losses

            # class identification losses
            targets = tf.maximum(self._targets, 0) # prevent NaNs
            class_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = targets,
                logits = self._output_class_logits)
            # (mask off losses for non-links)
            class_losses *= tf.cast(target_links, tf.float32)

            # DEBUG: remove me
            self._class_losses = class_losses
            
            # total loss
            self._total_link_loss = tf.reduce_sum(link_losses, name = 'total_link_loss')
            self._total_class_loss = tf.reduce_sum(class_losses, name = 'total_class_loss')
            self._total_loss = self._total_link_loss + self._total_class_loss
            self._total_loss = tf.identity(self._total_loss, name = 'total_loss')

            # mean loss
            self._mean_loss = self._total_loss / tf.cast(self._minibatch_size, tf.float32)
            self._mean_loss = tf.identity(self._mean_loss, name = 'mean_loss')
            
            # precision/recall
            # N.B., tf.nn.softmax here is unnecessary?
            output_link_probs = tf.sigmoid(self._output_link_logits)
            self._output_links = output_link_probs > 0.5
            self._output_links = tf.cast(
                self._output_links,
                tf.int64,
                name = 'output_links')
            self._true_positives = tf.reduce_sum(
                self._output_links * target_links,
                name = 'true_positives')
            self._false_positives = tf.reduce_sum(
                self._output_links * (1 - target_links),
                name = 'false_positives')
            self._false_negatives = tf.reduce_sum(
                (1 - self._output_links) * target_links,
                name = 'false_negatives')
            
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self._global_step = tf.Variable(0, name='global_step', trainable=False)
                self._optimizer = tf.train.AdamOptimizer(learning_rate=self._hp.learning_rate)
                
                # gradient clipping
                gradients, variables = zip(*self._optimizer.compute_gradients(self._mean_loss))
                gradients, _ = tf.clip_by_global_norm(
                    gradients, 
                    self._hp.gradient_clip_norm)
                
                self._train_op = self._optimizer.apply_gradients(
                    zip(gradients, variables),
                    global_step = self._global_step)
    
    def build_model(self):
        self._build_data_pipeline()
        self._build_model()
        self._build_training_model()

    def process(self,
                dataset_filenames,
                dataset_limit = -1,
                header = 'results',
                train = False,
                log_file = None):
        # initialize dataset to files
        self._session.run(self._dataset_iterator.initializer, feed_dict={
            self._dataset_filenames: dataset_filenames,
            self._dataset_limit: dataset_limit })

        cum_loss = 0
        cum_link_loss = 0
        cum_class_loss = 0
        cum_num_examples = 0
        cum_true_positives = 0
        cum_false_positives = 0
        cum_false_negatives = 0
        
        start = datetime.datetime.now()
        progress = tqdm_notebook(leave = False, desc = header)

        while True:
            # process a minibatch
            try:
                (_,
                 curr_total_loss, 
                 curr_total_link_loss,
                 curr_total_class_loss,
                 curr_minibatch_size,
                 curr_true_positives,
                 curr_false_positives,
                 curr_false_negatives) = self._session.run(
                    (self._train_op if train else (),
                     self._total_loss,
                     self._total_link_loss,
                     self._total_class_loss,
                     self._minibatch_size,
                     self._true_positives,
                     self._false_positives,
                     self._false_negatives),
                    feed_dict = { self._training: train })
            except tf.errors.OutOfRangeError:
                break

            # update stats/progress
            cum_loss += curr_total_loss
            cum_link_loss += curr_total_link_loss
            cum_class_loss += curr_total_class_loss
            cum_num_examples += curr_minibatch_size
            cum_true_positives += curr_true_positives
            cum_false_positives += curr_false_positives
            cum_false_negatives += curr_false_negatives
            progress.update(curr_minibatch_size)

        progress.close()
        finish = datetime.datetime.now()
        
        # precision
        precision = 0
        if cum_true_positives + cum_false_positives > 0:
            precision = cum_true_positives / (cum_true_positives + cum_false_positives)
            
        # recall
        recall = 0
        if cum_true_positives + cum_false_negatives > 0:
            recall = cum_true_positives / (cum_true_positives + cum_false_negatives)
            
        # F1
        F1 = 0
        if precision + recall > 0:
            F1 = 2 * precision * recall / (precision + recall)
        
        # print/log output
        message = '%s: time=%s, step=%d, loss=%g (%g + %g), precision=%g, recall=%g, F=%g' % (
            header,
            finish - start,
            tf.train.global_step(sess, self._global_step),
            cum_loss / cum_num_examples,
            cum_link_loss / cum_num_examples,
            cum_class_loss / cum_num_examples,
            precision,
            recall,
            F1)
        print(message)
        if log_file:
            print(message, file=log_file)
            log_file.flush()

In [6]:
sess = reset_tf(sess)

model = EntityRecognitionModel(sess, HyperParameters())
model.build_model()
dump_statistics()

parameters for "model/word_embeddings:0": 3840000
parameters for "model/position_embeddings:0": 2592
parameters for "model/attention/input/kernel:0": 20480
parameters for "model/attention/input/bias:0": 128
parameters for "model/attention/input/gamma:0": 128
parameters for "model/attention/input/beta:0": 128
parameters for "model/attention/layer_0/self/kernels:0": 32768
parameters for "model/attention/layer_0/self/batch_normalization/gamma:0": 128
parameters for "model/attention/layer_0/self/batch_normalization/beta:0": 128
parameters for "model/attention/layer_0/ff/fc1/kernel:0": 65536
parameters for "model/attention/layer_0/ff/fc1/bias:0": 512
parameters for "model/attention/layer_0/ff/fc2/kernel:0": 65536
parameters for "model/attention/layer_0/ff/fc2/bias:0": 128
parameters for "model/attention/layer_0/ff/batch_normalization/gamma:0": 128
parameters for "model/attention/layer_0/ff/batch_normalization/beta:0": 128
parameters for "model/attention/layer_1/self/kernels:0": 32768
parame

In [7]:
sess.run(tf.global_variables_initializer())

In [8]:
def list_files(path):
    return sorted([os.path.join(path, file) for file in os.listdir(path)])

train_set = list_files('../data/simplewiki/simplewiki-20171103.er_softmax_1.train')
dev_set = list_files('../data/simplewiki/simplewiki-20171103.er_softmax_1.dev')
test_set = list_files('../data/simplewiki/simplewiki-20171103.er_softmax_1.test')

In [9]:
with open('../logs/simplewiki/mediawiki_er_softmax_1.log', 'wt') as f:
    for i in range(10):
        model.process(
            train_set,
            header = 'train %d' % i,
            train = True,
            log_file = f)
        model.process(
            dev_set,
            header = 'dev %d' % i,
            train = False,
            log_file = f)

train 0: time=0:02:37.470138, step=965, loss=65.4464 (37.968 + 27.4784), precision=0.47381, recall=0.838531, F=0.60549


dev 0: time=0:00:02.724285, step=965, loss=48.4185 (28.1573 + 20.2612), precision=0.628549, recall=0.807609, F=0.706916


train 1: time=0:02:31.684070, step=1930, loss=39.186 (25.4332 + 13.7529), precision=0.582439, recall=0.885379, F=0.702647


dev 1: time=0:00:02.583615, step=1930, loss=35.4048 (27.0248 + 8.37996), precision=0.697273, recall=0.794058, F=0.742525


train 2: time=0:02:31.740054, step=2895, loss=30.2211 (22.8888 + 7.33223), precision=0.609793, recall=0.899694, F=0.726905


dev 2: time=0:00:02.863238, step=2895, loss=30.1805 (25.3604 + 4.82009), precision=0.675182, recall=0.834407, F=0.746397


train 3: time=0:02:31.846296, step=3860, loss=26.204 (21.2687 + 4.93531), precision=0.626585, recall=0.910028, F=0.742165


dev 3: time=0:00:02.764185, step=3860, loss=29.5703 (25.9616 + 3.60876), precision=0.687163, recall=0.825932, F=0.750184


train 4: time=0:02:31.361258, step=4825, loss=23.8213 (20.0425 + 3.77876), precision=0.639974, recall=0.917596, F=0.754043


dev 4: time=0:00:02.580893, step=4825, loss=29.5177 (26.4454 + 3.07229), precision=0.701552, recall=0.815494, F=0.754244


train 5: time=0:02:30.281095, step=5790, loss=22.0925 (18.9738 + 3.11869), precision=0.651895, recall=0.924087, F=0.764486


dev 5: time=0:00:02.501831, step=5790, loss=30.3996 (27.4935 + 2.90616), precision=0.683314, recall=0.826705, F=0.748201


train 6: time=0:02:30.066458, step=6755, loss=20.8424 (18.0979 + 2.74453), precision=0.663152, recall=0.929201, F=0.773951


dev 6: time=0:00:02.533136, step=6755, loss=32.2327 (29.4718 + 2.76092), precision=0.693908, recall=0.820275, F=0.751818


train 7: time=0:02:30.128509, step=7720, loss=19.7351 (17.228 + 2.50703), precision=0.674574, recall=0.934345, F=0.783489


dev 7: time=0:00:02.539677, step=7720, loss=32.778 (30.1147 + 2.66338), precision=0.695872, recall=0.816867, F=0.751531


train 8: time=0:02:30.189422, step=8685, loss=18.8475 (16.5267 + 2.3208), precision=0.684392, recall=0.938229, F=0.791456


dev 8: time=0:00:02.491868, step=8685, loss=37.5906 (34.9572 + 2.63339), precision=0.722391, recall=0.785104, F=0.752443


train 9: time=0:02:30.140851, step=9650, loss=18.0086 (15.8238 + 2.18481), precision=0.694021, recall=0.9422, F=0.799289


dev 9: time=0:00:02.545961, step=9650, loss=33.1947 (30.8207 + 2.37397), precision=0.680458, recall=0.824642, F=0.745644


# Error Analysis

In [91]:
model._output_link_probs = tf.sigmoid(model._output_link_logits)
model._output_class_probs = tf.nn.softmax(model._output_class_logits)
model._output_classes = tf.argmax(model._output_class_probs, axis = -1, name = 'output_class_probs')

In [11]:
with open('../data/simplewiki/simplewiki-20171103.er_softmax_1.vocab.txt', 'rt') as f:
    id_to_word = [w.strip() for w in f]

In [12]:
with open('../data/simplewiki/simplewiki-20171103.er_softmax_1.targets.txt', 'rt') as f:
    id_to_target = [t.strip() for t in f]

In [113]:
def compute_negative_examples(filenames, limit = None):
    # initialize dataset iterator
    sess.run(model._dataset_iterator.initializer, feed_dict = {
        model._dataset_filenames: filenames,
        model._training: False })
    
    examples = []
    
    while True:
        # compute minibatch
        try:
            (context,
             targets,
             output_links,
             output_link_probs,
             output_classes,
             output_class_probs) = sess.run(
                (model._context, 
                 model._targets, 
                 model._output_links, 
                 model._output_link_probs,
                 model._output_classes,
                 model._output_class_probs),
                feed_dict = { model._training: False })
        except tf.errors.OutOfRangeError:
                break

        # loop through examples
        for cs, ts, ols, olps, ocs, ocps in zip(context, targets, output_links, output_link_probs, output_classes, output_class_probs):
            # stop if limit reached
            if limit and len(examples) >= limit:
                break
            
            # decode context
            decoded_context = [id_to_word[wid] for wid in cs]
            decoded_targets = [id_to_target[tid] if tid >= 0 else None for tid in ts]
            decoded_output_classes = [id_to_target[ocs[i]] if ols[i] else None for i in range(len(ols))]
            decoded_class_probs = [ocps[i, ocs[i]] if ols[i] else 0.0 for i in range(len(ols))]

            # decode example
            examples.append([
                decoded_context,
                decoded_targets,
                decoded_output_classes,
                olps,
                decoded_class_probs])
    
    return examples

In [114]:
examples = compute_negative_examples(dev_set[:1], limit = 500)

In [129]:
def print_example(e):
    context, targets, outputs, link_probs, class_probs = e
    for word, target, output, link_prob, class_prob in zip(context, targets, outputs, link_probs, class_probs):
        class_prob = ' %0.3f' % class_prob if class_prob else ''
        print('%20.20s %20.20s %20.20s  %0.3f %s' % (word, target, output, link_prob, class_prob))

In [181]:
print_example(examples[53])

               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
               <OOB>                 None                 None  0.000 
      