In [1]:
import tensorflow as tf
import modules.mediawiki as mw
from tensorflow.python.client import timeline
from tqdm import tqdm

In [2]:
# No TensorFlow session exists yet. This placeholder is later handed to
# mw.reset_tf(), whose return value becomes the session actually used.
sess = None

In [3]:
class HyperParameters:
    """Fixed settings for the model and its input pipeline.

    Every setting is a plain class attribute, so it can be read from the
    class itself or from any instance.
    """

    learning_rate = 0.001             # learning rate handed to Adam

    max_sequence_length = 40          # upper bound on symbols per input sequence

    # vocabulary size; every symbol id is expected to lie in range(vocab_size)
    vocab_size = 10000

    embedding_size = 128              # dimensionality of the input embeddings

    ff_size = 128                     # hidden units in the feed-forward layer

    num_layers = 1                    # count of combined (attention + feed forward) layers

    dropout_rate = 0.1                # dropout rate

    pipeline_batch_size = 1024        # sequences per batch

    pipeline_num_parallel_calls = 4   # parsing threads in the data pipeline

    # prefetch size of the data pipeline, derived from the batch size
    pipeline_prefetch_size = 16 * pipeline_batch_size

    pipeline_shuffle_size = 10000     # size of the shuffle buffer

# Hyperparameter instance that is passed to the model constructor.
hp = HyperParameters()

In [4]:
class AttentionModel(mw.BaseModel):
    """Per-position 2-way classifier built from parameter-free self-attention.

    The network sums symbol, position and word-ending embeddings, runs them
    through `hp.num_layers` blocks of (self-attention, feed-forward), each
    wrapped in a residual connection followed by layer normalization, and
    finally projects every position to 2 logits.

    The tensors `self._input_sequences`, `self._input_positions`,
    `self._input_word_endings`, `self._input_lengths` and the flag
    `self._is_training` are read here but not defined here; they are
    presumably created by `mw.BaseModel` -- confirm there.
    """

    def __init__(self, hp):
        """Forward the hyperparameters `hp` to the `mw.BaseModel` constructor."""
        super().__init__(hp)

    def _build_prediction_model_internal(self):
        """Build the prediction graph and return the output logits.

        Returns:
            The tensor produced by the final dense layer; its last dimension
            has size 2.
        """
        # Embeddings
        # ----------

        with tf.variable_scope('embeddings'):
            # one embedding table per input feature; all share embedding_size
            # so that they can be summed element-wise below
            input_sequence_embeddings = tf.get_variable('input_sequence_embeddings',
                                                        (self._hp.vocab_size, self._hp.embedding_size))
            input_sequences_embedded = tf.nn.embedding_lookup(input_sequence_embeddings,
                                                              self._input_sequences)

            input_position_embeddings = tf.get_variable('input_position_embeddings',
                                                        (self._hp.max_sequence_length, self._hp.embedding_size))
            input_positions_embedded = tf.nn.embedding_lookup(input_position_embeddings, self._input_positions)

            # the word-ending feature is looked up in a table with 2 rows
            input_word_ending_embeddings = tf.get_variable('input_word_ending_embeddings',
                                                           (2, self._hp.embedding_size))
            input_word_endings_embedded = tf.nn.embedding_lookup(input_word_ending_embeddings,
                                                                 self._input_word_endings)

            # 1.0 for positions inside each sequence, 0.0 for padding; the extra
            # trailing axis lets the mask broadcast over the embedding dimension
            # (assumes self._input_lengths is a 1-D batch of lengths -- TODO confirm)
            sequence_mask = tf.sequence_mask(self._input_lengths,
                                             self._hp.max_sequence_length,
                                             dtype = tf.float32)
            sequence_mask = tf.expand_dims(sequence_mask, 2)

            input_combined_embedded = tf.add_n([input_sequences_embedded,
                                                input_positions_embedded,
                                                input_word_endings_embedded])
            # zero out padded positions before dropout
            input_combined_embedded *= sequence_mask
            input_combined_embedded = tf.layers.dropout(input_combined_embedded,
                                                        rate = self._hp.dropout_rate,
                                                        training = self._is_training)

        # Layer normalization
        # -------------------

        def layer_norm(x, scope, reuse=None, epsilon=1e-6):
            """Normalize `x` over its last axis with a learned scale and bias.

            The scale and bias variables live under `scope` and have
            `embedding_size` elements each.
            """
            with tf.variable_scope(scope, reuse=reuse):
                num_units = self._hp.embedding_size
                scale = tf.get_variable(
                    "layer_norm_scale", [num_units], initializer=tf.ones_initializer())
                bias = tf.get_variable(
                    "layer_norm_bias", [num_units], initializer=tf.zeros_initializer())
                result = layer_norm_compute(x, epsilon, scale, bias)
                return result

        def layer_norm_compute(x, epsilon, scale, bias):
            """Return `(x - mean) / sqrt(variance + epsilon) * scale + bias`.

            Mean and variance are taken over the last axis of `x`.
            """
            # TODO: incorporate length into layer normalization?
            # NOTE(review): `keep_dims` is the older spelling of `keepdims`; it is
            # left unchanged because the TensorFlow version in use is not visible here.
            mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)
            variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True)
            norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
            return norm_x * scale + bias

        # Attention
        # ---------

        # Additive bias for the attention logits: 0.0 for real key positions and
        # a large negative value for padded ones. Transposing the mask moves the
        # sequence axis last, so the bias broadcasts over the query axis.
        attention_bias = (1.0 - tf.transpose(sequence_mask, perm=[0, 2, 1])) * -1e9

        def attention_layer(A):
            """Scaled dot-product self-attention with queries = keys = values = `A`.

            Padded key positions are excluded from the softmax through
            `attention_bias`; without it, their all-zero rows produce a logit of
            0 and would take a share of every attention distribution.
            """
            A_T = tf.transpose(A, perm=[0, 2, 1])
            scaled_logits = tf.matmul(A, A_T) / tf.sqrt(tf.cast(tf.shape(A)[-1], tf.float32))
            # push the logits of padded keys far down so softmax gives them ~0 weight
            scaled_logits += attention_bias
            result = tf.matmul(tf.nn.softmax(scaled_logits), A)
            result = tf.layers.dropout(result, rate = self._hp.dropout_rate, training = self._is_training)
            return result

        # Feed-forward
        # ------------

        def feed_forward_layer(A, scope, reuse=None):
            """Apply dense(ff_size, relu), dense(embedding_size), then dropout."""
            with tf.variable_scope(scope, reuse=reuse):
                A = tf.layers.dense(A, self._hp.ff_size, activation=tf.nn.relu, name='fc1')
                # project back to embedding_size so the residual sum in
                # combined_layer is shape-compatible
                A = tf.layers.dense(A, self._hp.embedding_size, name='fc2')
                A = tf.layers.dropout(A, rate = self._hp.dropout_rate, training = self._is_training)
                return A

        # Layers
        # ------

        def combined_layer(A, scope, reuse=None):
            """One block: attention and feed-forward, each as residual + layer norm."""
            with tf.variable_scope(scope, reuse=reuse):
                A = layer_norm(A + attention_layer(A), scope='attention_norm')
                A = layer_norm(A + feed_forward_layer(A, 'ff'), scope='ff_norm')
                # re-zero padded positions (layer norm adds a bias to them)
                A *= sequence_mask
                return A

        with tf.variable_scope('layers'):
            layer = input_combined_embedded
            for i in range(self._hp.num_layers):
                layer = combined_layer(layer, 'layer_%d' % i)

        # Softmax
        # -------

        with tf.variable_scope('softmax'):
            # only the logits are produced here; no softmax op is applied
            output_logits = tf.layers.dense(layer, 2, name = 'softmax')

        return output_logits

In [5]:
# Obtain the session to use from mw.reset_tf (it receives the previous handle,
# which is None on the first run), construct the model from the
# hyperparameters, and print its per-variable parameter counts.
# NOTE(review): reset_tf presumably also resets the default graph -- confirm
# in modules.mediawiki.
sess = mw.reset_tf(sess)
model = AttentionModel(hp)
model.dump_statistics()

parameters for "embeddings/input_sequence_embeddings:0": 1280000
parameters for "embeddings/input_position_embeddings:0": 5120
parameters for "embeddings/input_word_ending_embeddings:0": 256
parameters for "layers/layer_0/attention_norm/layer_norm_scale:0": 128
parameters for "layers/layer_0/attention_norm/layer_norm_bias:0": 128
parameters for "layers/layer_0/ff/fc1/kernel:0": 16384
parameters for "layers/layer_0/ff/fc1/bias:0": 128
parameters for "layers/layer_0/ff/fc2/kernel:0": 16384
parameters for "layers/layer_0/ff/fc2/bias:0": 128
parameters for "layers/layer_0/ff_norm/layer_norm_scale:0": 128
parameters for "layers/layer_0/ff_norm/layer_norm_bias:0": 128
parameters for "softmax/softmax/kernel:0": 256
parameters for "softmax/softmax/bias:0": 2
total parameters: 1319170


In [6]:
# Run the initializer op for all global variables in the current session.
sess.run(tf.global_variables_initializer())

In [None]:
num_epochs = 100

# One entry per pass made in every epoch:
# (tfrecords path, header template, train flag, show_progress flag)
epoch_passes = (
    ('../data/simplewiki/simplewiki-20171103.entity_recognition.train.tfrecords',
     'train %d', True, True),
    ('../data/simplewiki/simplewiki-20171103.entity_recognition.dev.tfrecords',
     'dev %d', False, False),
)

for epoch in range(num_epochs):
    # the train pass runs first, then the dev pass, exactly once each per epoch
    for records_path, header_template, is_train_pass, with_progress in epoch_passes:
        model.evaluate_dataset(sess,
                               records_path,
                               header=header_template % epoch,
                               train=is_train_pass,
                               show_progress=with_progress)

16583215it [00:33, 483301.68it/s]