In [1]:
import tensorflow as tf
import modules.mediawiki_er as mw_er
import numpy as np
import gzip
from tensorflow.python.client import timeline
from tqdm import tqdm

In [2]:
sess = None

In [3]:
class HyperParameters():
    # adam learning rate
    learning_rate = 1e-3
    
    # maximum number of symbols in an input sequence
    max_sequence_length = 40

    # number of symbols in vocabulary
    # (symbols are expected to be in range(vocab_size))
    vocab_size = 10000

    # number of dimensions in input embeddings
    embedding_size = 128
    
    # number of hidden units in ff layer 
    ff_size = 256
    
    # number of combined (attention + feed forward) layers
    num_layers = 4
    
    # dropout rate
    dropout_rate = 0.1
    
    # number of sequences per batch
    pipeline_batch_size = 1024
    
    # number of parsing threads in data pipeline
    pipeline_num_parallel_calls = 4
    
    # size of prefetch in data pipeline
    pipeline_prefetch_size = pipeline_batch_size * 16
    
    # shuffle buffer size
    pipeline_shuffle_size = 10000

hp = HyperParameters()

In [4]:
class AttentionModel(mw_er.BaseModel):
    def __init__(self, hp):
        with gzip.open('../data/simplewiki/simplewiki-20171103.topic_model_1_128.embedding.npy.gz', 'rb') as f:
            self._page_id_embeddings = tf.constant(np.load(f), dtype=tf.float32)
        
        super().__init__(hp)
    
    def _build_prediction_model_internal(self):
        # Layer normalization
        # -------------------

        def layer_norm(x, scope, reuse=None, epsilon=1e-6):
            with tf.variable_scope(scope, reuse=reuse):
                num_units = self._hp.embedding_size
                scale = tf.get_variable(
                    "layer_norm_scale", [num_units], initializer=tf.ones_initializer())
                bias = tf.get_variable(
                    "layer_norm_bias", [num_units], initializer=tf.zeros_initializer())
                result = layer_norm_compute(x, epsilon, scale, bias)
                return result

        def layer_norm_compute(x, epsilon, scale, bias):
            # TODO: incorporate length into layer normalization?
            mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)
            variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True)
            norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
            return norm_x * scale + bias
        
        # Embeddings
        # ----------
        
        with tf.variable_scope('embeddings'):
            input_page_ids_embedded = tf.nn.embedding_lookup(self._page_id_embeddings,
                                                             self._input_page_ids)
            input_page_ids_embedded = tf.tile(input_page_ids_embedded,
                                              [1, self._hp.max_sequence_length, 1])
            
            input_sequence_embeddings = tf.get_variable('input_sequence_embeddings', 
                                                        (self._hp.vocab_size, self._hp.embedding_size))
            input_sequences_embedded = tf.nn.embedding_lookup(input_sequence_embeddings, 
                                                              self._input_sequences)

            input_position_embeddings = tf.get_variable('input_position_embeddings', 
                                                        (self._hp.max_sequence_length, self._hp.embedding_size))
            input_positions_embedded = tf.nn.embedding_lookup(input_position_embeddings, self._input_positions)

            sequence_mask = tf.sequence_mask(self._input_lengths,
                                             self._hp.max_sequence_length,
                                             dtype = tf.float32)
            sequence_mask = tf.expand_dims(sequence_mask, 2)
            
            input_concat = tf.concat([input_sequences_embedded, 
                                      input_positions_embedded, 
                                      input_page_ids_embedded], 
                                     axis=-1)
            
            input_combined = tf.layers.dense(input_concat, 
                                             self._hp.embedding_size, 
                                             activation=tf.nn.relu, 
                                             name='input_combined')
            input_combined = tf.layers.dropout(input_combined,
                                               rate=self._hp.dropout_rate,
                                               training=self._is_training)
            input_combined = layer_norm(input_combined,
                                        scope='input_combined')
            input_combined *= sequence_mask

        # Attention
        # ---------

        def attention_layer(A):
            A_T = tf.transpose(A, perm=[0, 2, 1])
            scaled_logits = tf.matmul(A, A_T) / tf.sqrt(tf.cast(tf.shape(A)[-1], tf.float32))
            result = tf.matmul(tf.nn.softmax(scaled_logits), A)
            result = tf.layers.dropout(result, rate = self._hp.dropout_rate, training = self._is_training)
            return result

        # Feed-forward
        # ------------

        def feed_forward_layer(A, scope, reuse=None):
            with tf.variable_scope(scope, reuse=reuse):
                A = tf.layers.dense(A, self._hp.ff_size, activation=tf.nn.relu, name='fc1')
                A = tf.layers.dense(A, self._hp.embedding_size, name='fc2')
                A = tf.layers.dropout(A, rate = self._hp.dropout_rate, training = self._is_training)
                return A

        # Layers
        # ------

        def combined_layer(A, scope, reuse=None):
            with tf.variable_scope(scope, reuse=reuse):
                A = layer_norm(A + attention_layer(A), scope='attention_norm')
                A = layer_norm(A + feed_forward_layer(A, 'ff'), scope='ff_norm')
                A *= sequence_mask
                return A

        with tf.variable_scope('layers'):
            layer = input_combined
            for i in range(self._hp.num_layers):
                layer = combined_layer(layer, 'layer_%d' % i)
            
        # Softmax
        # -------

        with tf.variable_scope('softmax'):
            output_logits = tf.layers.dense(layer, 2, name = 'softmax')
        
        return output_logits

In [5]:
sess = mw_er.reset_tf(sess)
model = AttentionModel(hp)
model.dump_statistics()

parameters for "embeddings/input_sequence_embeddings:0": 1280000
parameters for "embeddings/input_position_embeddings:0": 5120
parameters for "embeddings/input_combined/kernel:0": 49152
parameters for "embeddings/input_combined/bias:0": 128
parameters for "embeddings/input_combined/layer_norm_scale:0": 128
parameters for "embeddings/input_combined/layer_norm_bias:0": 128
parameters for "layers/layer_0/attention_norm/layer_norm_scale:0": 128
parameters for "layers/layer_0/attention_norm/layer_norm_bias:0": 128
parameters for "layers/layer_0/ff/fc1/kernel:0": 32768
parameters for "layers/layer_0/ff/fc1/bias:0": 256
parameters for "layers/layer_0/ff/fc2/kernel:0": 32768
parameters for "layers/layer_0/ff/fc2/bias:0": 128
parameters for "layers/layer_0/ff_norm/layer_norm_scale:0": 128
parameters for "layers/layer_0/ff_norm/layer_norm_bias:0": 128
parameters for "layers/layer_1/attention_norm/layer_norm_scale:0": 128
parameters for "layers/layer_1/attention_norm/layer_norm_bias:0": 128
param

In [None]:
sess.run(tf.global_variables_initializer())

In [None]:
num_epochs = 100

for epoch in range(num_epochs):
    model.evaluate_dataset(sess,
                           '../data/simplewiki/simplewiki-20171103.entity_recognition.train.tfrecords',
                           header='train %d' % epoch,
                           train=True,
                           show_progress=True)
    model.evaluate_dataset(sess,
                           '../data/simplewiki/simplewiki-20171103.entity_recognition.dev.tfrecords',
                           header='dev %d' % epoch,
                           train=False,
                           show_progress=False)


train 0 (1294): loss=0.159197, precision=0.690282, recall=0.266831, F1=0.384883
dev 0 (1294): loss=0.14953, precision=0.750021, recall=0.30455, F1=0.433198



train 1 (2588): loss=0.142806, precision=0.7279, recall=0.338825, F1=0.462407
dev 1 (2588): loss=0.144777, precision=0.741856, recall=0.351263, F1=0.476776



train 2 (3882): loss=0.137332, precision=0.734383, recall=0.369122, F1=0.491302
dev 2 (3882): loss=0.137493, precision=0.702803, recall=0.407833, F1=0.516148



train 3 (5176): loss=0.13349, precision=0.741135, recall=0.388612, F1=0.509873
dev 3 (5176): loss=0.134626, precision=0.718207, recall=0.406224, F1=0.518934



train 4 (6470): loss=0.130632, precision=0.747488, recall=0.403033, F1=0.523697
dev 4 (6470): loss=0.133772, precision=0.710924, recall=0.424466, F1=0.531558



train 5 (7764): loss=0.128472, precision=0.75219, recall=0.414223, F1=0.534243
dev 5 (7764): loss=0.132043, precision=0.735763, recall=0.406827, F1=0.523947



train 6 (9058): loss=0.12655, precision=0.755369, recall=0.423556, F1=0.542767
dev 6 (9058): loss=0.13195, precision=0.752404, recall=0.393548, F1=0.516788
