In [1]:
import tensorflow as tf
import modules.mediawiki_er as mw_er
import numpy as np
import gzip
from tensorflow.python.client import timeline
from tqdm import tqdm

In [2]:
sess = None

In [3]:
class HyperParameters():
    # adam learning rate
    learning_rate = 1e-3
    
    # maximum number of symbols in an input sequence
    max_sequence_length = 40

    # number of symbols in vocabulary
    # (symbols are expected to be in range(vocab_size))
    vocab_size = 10000

    # number of dimensions in input embeddings
    embedding_size = 128
    
    # number of hidden units in ff layer 
    ff_size = 256
    
    # number of combined (attention + feed forward) layers
    num_layers = 4
    
    # dropout rate
    dropout_rate = 0.1
    
    # number of sequences per batch
    pipeline_batch_size = 1024
    
    # number of parsing threads in data pipeline
    pipeline_num_parallel_calls = 4
    
    # size of prefetch in data pipeline
    pipeline_prefetch_size = pipeline_batch_size * 16
    
    # shuffle buffer size
    pipeline_shuffle_size = 10000

hp = HyperParameters()

In [4]:
class AttentionModel(mw_er.BaseModel):
    def __init__(self, hp):
#         with gzip.open('../data/simplewiki/simplewiki-20171103.topic_model_1_128.embedding.npy.gz', 'rb') as f:
#             self._page_id_embeddings = tf.constant(np.load(f), dtype=tf.float32)
        
        super().__init__(hp)
    
    def _build_prediction_model_internal(self):
        # Layer normalization
        # -------------------

        def layer_norm(x, scope, reuse=None, epsilon=1e-6):
            with tf.variable_scope(scope, reuse=reuse):
                num_units = self._hp.embedding_size
                scale = tf.get_variable(
                    "layer_norm_scale", [num_units], initializer=tf.ones_initializer())
                bias = tf.get_variable(
                    "layer_norm_bias", [num_units], initializer=tf.zeros_initializer())
                result = layer_norm_compute(x, epsilon, scale, bias)
                return result

        def layer_norm_compute(x, epsilon, scale, bias):
            # TODO: incorporate length into layer normalization?
            mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)
            variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True)
            norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
            return norm_x * scale + bias
        
        # Embeddings
        # ----------
        
        with tf.variable_scope('embeddings'):
#             input_page_ids_embedded = tf.nn.embedding_lookup(self._page_id_embeddings,
#                                                              self._input_page_ids)
#             input_page_ids_embedded = tf.tile(input_page_ids_embedded,
#                                               [1, self._hp.max_sequence_length, 1])
            
            input_sequence_embeddings = tf.get_variable('input_sequence_embeddings', 
                                                        (self._hp.vocab_size, self._hp.embedding_size))
            input_sequences_embedded = tf.nn.embedding_lookup(input_sequence_embeddings, 
                                                              self._input_sequences)

            input_position_embeddings = tf.get_variable('input_position_embeddings', 
                                                        (self._hp.max_sequence_length, self._hp.embedding_size))
            input_positions_embedded = tf.nn.embedding_lookup(input_position_embeddings, self._input_positions)

            sequence_mask = tf.sequence_mask(self._input_lengths,
                                             self._hp.max_sequence_length,
                                             dtype = tf.float32)
            sequence_mask = tf.expand_dims(sequence_mask, 2)
            
#             input_concat = tf.concat([input_sequences_embedded, 
#                                       input_positions_embedded, 
#                                       input_page_ids_embedded], 
#                                      axis=-1)
            input_concat = tf.concat([input_sequences_embedded, 
                                      input_positions_embedded], 
                                     axis=-1)
            
            input_combined = tf.layers.dense(input_concat, 
                                             self._hp.embedding_size, 
                                             activation=tf.nn.relu, 
                                             name='input_combined')
            input_combined = tf.layers.dropout(input_combined,
                                               rate=self._hp.dropout_rate,
                                               training=self._is_training)
            input_combined = layer_norm(input_combined,
                                        scope='input_combined')
            input_combined *= sequence_mask

        # Attention
        # ---------

        def attention_layer(A):
            A_T = tf.transpose(A, perm=[0, 2, 1])
            scaled_logits = tf.matmul(A, A_T) / tf.sqrt(tf.cast(tf.shape(A)[-1], tf.float32))
            result = tf.matmul(tf.nn.softmax(scaled_logits), A)
            result = tf.layers.dropout(result, rate = self._hp.dropout_rate, training = self._is_training)
            return result

        # Feed-forward
        # ------------

        def feed_forward_layer(A, scope, reuse=None):
            with tf.variable_scope(scope, reuse=reuse):
                A = tf.layers.dense(A, self._hp.ff_size, activation=tf.nn.relu, name='fc1')
                A = tf.layers.dense(A, self._hp.embedding_size, name='fc2')
                A = tf.layers.dropout(A, rate = self._hp.dropout_rate, training = self._is_training)
                return A

        # Layers
        # ------

        def combined_layer(A, scope, reuse=None):
            with tf.variable_scope(scope, reuse=reuse):
                A = layer_norm(A + attention_layer(A), scope='attention_norm')
                A = layer_norm(A + feed_forward_layer(A, 'ff'), scope='ff_norm')
                A *= sequence_mask
                return A

        with tf.variable_scope('layers'):
            layer = input_combined
            for i in range(self._hp.num_layers):
                layer = combined_layer(layer, 'layer_%d' % i)
            
        # Softmax
        # -------

        with tf.variable_scope('softmax'):
            output_logits = tf.layers.dense(layer, 2, name = 'softmax')
        
        return output_logits

In [5]:
sess = mw_er.reset_tf(sess)
model = AttentionModel(hp)
model.dump_statistics()

parameters for "embeddings/input_sequence_embeddings:0": 1280000
parameters for "embeddings/input_position_embeddings:0": 5120
parameters for "embeddings/input_combined/kernel:0": 32768
parameters for "embeddings/input_combined/bias:0": 128
parameters for "embeddings/input_combined/layer_norm_scale:0": 128
parameters for "embeddings/input_combined/layer_norm_bias:0": 128
parameters for "layers/layer_0/attention_norm/layer_norm_scale:0": 128
parameters for "layers/layer_0/attention_norm/layer_norm_bias:0": 128
parameters for "layers/layer_0/ff/fc1/kernel:0": 32768
parameters for "layers/layer_0/ff/fc1/bias:0": 256
parameters for "layers/layer_0/ff/fc2/kernel:0": 32768
parameters for "layers/layer_0/ff/fc2/bias:0": 128
parameters for "layers/layer_0/ff_norm/layer_norm_scale:0": 128
parameters for "layers/layer_0/ff_norm/layer_norm_bias:0": 128
parameters for "layers/layer_1/attention_norm/layer_norm_scale:0": 128
parameters for "layers/layer_1/attention_norm/layer_norm_bias:0": 128
param

In [6]:
sess.run(tf.global_variables_initializer())

In [7]:
num_epochs = 100

for epoch in range(num_epochs):
    model.evaluate_dataset(sess,
                           '../data/simplewiki/simplewiki-20171103.entity_recognition.train.tfrecords',
                           header='train %d' % epoch,
                           train=True,
                           show_progress=True)
    model.evaluate_dataset(sess,
                           '../data/simplewiki/simplewiki-20171103.entity_recognition.dev.tfrecords',
                           header='dev %d' % epoch,
                           train=False,
                           show_progress=False)


train 0 (1294): loss=0.161792, precision=0.679477, recall=0.254492, F1=0.370294
dev 0 (1294): loss=0.159657, precision=0.555457, recall=0.439154, F1=0.490505



train 1 (2588): loss=0.14513, precision=0.722012, recall=0.32609, F1=0.449271
dev 1 (2588): loss=0.148754, precision=0.633007, recall=0.379431, F1=0.474463



train 2 (3882): loss=0.13959, precision=0.73059, recall=0.354431, F1=0.477306
dev 2 (3882): loss=0.141333, precision=0.774503, recall=0.314543, F1=0.447391



train 3 (5176): loss=0.135603, precision=0.737728, recall=0.374208, F1=0.496546
dev 3 (5176): loss=0.13728, precision=0.704097, recall=0.38436, F1=0.497267



train 4 (6470): loss=0.132764, precision=0.743637, recall=0.38874, F1=0.510574
dev 4 (6470): loss=0.134943, precision=0.752187, recall=0.369069, F1=0.495175



train 5 (7764): loss=0.130305, precision=0.748738, recall=0.401389, F1=0.522612
dev 5 (7764): loss=0.137743, precision=0.825066, recall=0.301767, F1=0.441907



train 6 (9058): loss=0.128431, precision=0.752496, recall=0.410904, F1=0.531552
dev 6 (9058): loss=0.133407, precision=0.737931, recall=0.398276, F1=0.517336



train 7 (10352): loss=0.126717, precision=0.754465, recall=0.420651, F1=0.540145
dev 7 (10352): loss=0.132731, precision=0.725277, recall=0.410952, F1=0.524637



train 8 (11646): loss=0.125242, precision=0.758628, recall=0.428532, F1=0.547687
dev 8 (11646): loss=0.133137, precision=0.710079, recall=0.424533, F1=0.531375



train 9 (12940): loss=0.123929, precision=0.76003, recall=0.435393, F1=0.553631
dev 9 (12940): loss=0.134817, precision=0.667766, recall=0.468697, F1=0.550796



train 10 (14234): loss=0.122753, precision=0.761939, recall=0.441956, F1=0.559424
dev 10 (14234): loss=0.133953, precision=0.682807, recall=0.457731, F1=0.548061



train 11 (15528): loss=0.121654, precision=0.764691, recall=0.448288, F1=0.565223
dev 11 (15528): loss=0.134003, precision=0.682363, recall=0.458234, F1=0.548278



train 12 (16822): loss=0.120622, precision=0.765749, recall=0.453816, F1=0.56989
dev 12 (16822): loss=0.135664, precision=0.656288, recall=0.484189, F1=0.557254



train 13 (18116): loss=0.119726, precision=0.767562, recall=0.458778, F1=0.574295
dev 13 (18116): loss=0.135363, precision=0.670304, recall=0.473827, F1=0.555195



train 14 (19410): loss=0.118833, precision=0.768997, recall=0.463328, F1=0.578253
dev 14 (19410): loss=0.133735, precision=0.690313, recall=0.452601, F1=0.546736



train 15 (20704): loss=0.118081, precision=0.769753, recall=0.467556, F1=0.58175
dev 15 (20704): loss=0.135126, precision=0.666194, recall=0.472419, F1=0.552817



train 16 (21998): loss=0.117315, precision=0.770956, recall=0.471973, F1=0.585505
dev 16 (21998): loss=0.133478, precision=0.686055, recall=0.458804, F1=0.549875



train 17 (23292): loss=0.116579, precision=0.77306, recall=0.476103, F1=0.589284
dev 17 (23292): loss=0.133834, precision=0.677485, recall=0.468294, F1=0.553793



train 18 (24586): loss=0.115959, precision=0.773423, recall=0.479261, F1=0.591804
dev 18 (24586): loss=0.133607, precision=0.690648, recall=0.451963, F1=0.546376



train 19 (25880): loss=0.115387, precision=0.774432, recall=0.482528, F1=0.594586
dev 19 (25880): loss=0.13477, precision=0.665508, recall=0.475303, F1=0.554549



train 20 (27174): loss=0.114712, precision=0.775049, recall=0.485942, F1=0.597353
dev 20 (27174): loss=0.132676, precision=0.720884, recall=0.423426, F1=0.533494



train 21 (28468): loss=0.114094, precision=0.776279, recall=0.489591, F1=0.600471
dev 21 (28468): loss=0.132975, precision=0.704879, recall=0.435063, F1=0.538039



train 22 (29762): loss=0.113576, precision=0.776874, recall=0.492108, F1=0.60254
dev 22 (29762): loss=0.132698, precision=0.716366, recall=0.427283, F1=0.535288



train 23 (31056): loss=0.113096, precision=0.778262, recall=0.494762, F1=0.604944
dev 23 (31056): loss=0.133288, precision=0.714732, recall=0.429161, F1=0.5363



train 24 (32350): loss=0.112585, precision=0.779109, recall=0.497645, F1=0.607352
dev 24 (32350): loss=0.134384, precision=0.696695, recall=0.448141, F1=0.545436



train 25 (33644): loss=0.112061, precision=0.780206, recall=0.500228, F1=0.609607
dev 25 (33644): loss=0.133334, precision=0.707635, recall=0.433252, F1=0.537449



train 26 (34938): loss=0.111617, precision=0.780244, recall=0.502922, F1=0.611615
dev 26 (34938): loss=0.13308, precision=0.72439, recall=0.418296, F1=0.530346



train 27 (36232): loss=0.11123, precision=0.780948, recall=0.504739, F1=0.613174
dev 27 (36232): loss=0.133926, precision=0.700538, recall=0.445491, F1=0.544635



train 28 (37526): loss=0.110753, precision=0.782479, recall=0.50704, F1=0.615343
dev 28 (37526): loss=0.133764, precision=0.712828, recall=0.428591, F1=0.535319



train 29 (38820): loss=0.110484, precision=0.782839, recall=0.50809, F1=0.616227
dev 29 (38820): loss=0.133095, precision=0.721594, recall=0.421884, F1=0.532461



train 30 (40114): loss=0.110163, precision=0.783293, recall=0.510069, F1=0.617822
dev 30 (40114): loss=0.133489, precision=0.722974, recall=0.426545, F1=0.536539



train 31 (41408): loss=0.109744, precision=0.783726, recall=0.512415, F1=0.619675
dev 31 (41408): loss=0.13417, precision=0.724909, recall=0.42071, F1=0.532422



train 32 (42702): loss=0.109248, precision=0.784642, recall=0.515446, F1=0.622174
dev 32 (42702): loss=0.134404, precision=0.723066, recall=0.424902, F1=0.535262



train 33 (43996): loss=0.108867, precision=0.785802, recall=0.517186, F1=0.623806
dev 33 (43996): loss=0.134808, precision=0.702329, recall=0.444888, F1=0.544723



train 34 (45290): loss=0.108487, precision=0.786853, recall=0.519519, F1=0.625833
dev 34 (45290): loss=0.136342, precision=0.722391, recall=0.423125, F1=0.533666



train 35 (46584): loss=0.108194, precision=0.78675, recall=0.521031, F1=0.626896
dev 35 (46584): loss=0.134664, precision=0.73205, recall=0.419503, F1=0.533362



train 36 (47878): loss=0.107821, precision=0.787561, recall=0.522566, F1=0.628264
dev 36 (47878): loss=0.135096, precision=0.724882, recall=0.418799, F1=0.530882



train 37 (49172): loss=0.107555, precision=0.788213, recall=0.523796, F1=0.62936
dev 37 (49172): loss=0.14406, precision=0.603179, recall=0.533114, F1=0.565987



train 38 (50466): loss=0.108344, precision=0.785793, recall=0.520149, F1=0.625953
dev 38 (50466): loss=0.134549, precision=0.732268, recall=0.413366, F1=0.528432



train 39 (51760): loss=0.10712, precision=0.789111, recall=0.526281, F1=0.631438
dev 39 (51760): loss=0.135816, precision=0.711431, recall=0.435767, F1=0.540479



train 40 (53054): loss=0.106755, precision=0.789926, recall=0.52829, F1=0.633143
dev 40 (53054): loss=0.135466, precision=0.722229, recall=0.422085, F1=0.532794



train 41 (54348): loss=0.106425, precision=0.79041, recall=0.529954, F1=0.634493
dev 41 (54348): loss=0.136275, precision=0.709024, recall=0.436337, F1=0.54022



train 42 (55642): loss=0.106216, precision=0.789977, recall=0.531631, F1=0.635553
dev 42 (55642): loss=0.135581, precision=0.724316, recall=0.417525, F1=0.529706



train 43 (56936): loss=0.105948, precision=0.790808, recall=0.532653, F1=0.636553
dev 43 (56936): loss=0.136794, precision=0.715704, recall=0.423024, F1=0.531751



train 44 (58230): loss=0.105721, precision=0.791176, recall=0.533983, F1=0.637621
dev 44 (58230): loss=0.137585, precision=0.70713, recall=0.436001, F1=0.539413



train 45 (59524): loss=0.105359, precision=0.792318, recall=0.536001, F1=0.639429
dev 45 (59524): loss=0.137305, precision=0.710646, recall=0.434023, F1=0.53891



train 46 (60818): loss=0.10521, precision=0.792354, recall=0.536899, F1=0.64008
dev 46 (60818): loss=0.136562, precision=0.720865, recall=0.419402, F1=0.530283



train 47 (62112): loss=0.105068, precision=0.792482, recall=0.537866, F1=0.640808
dev 47 (62112): loss=0.138217, precision=0.730794, recall=0.418195, F1=0.531971



train 48 (63406): loss=0.104896, precision=0.792923, recall=0.538365, F1=0.641307
dev 48 (63406): loss=0.137306, precision=0.72152, recall=0.419034, F1=0.530165



train 49 (64700): loss=0.104799, precision=0.792281, recall=0.539334, F1=0.641783
dev 49 (64700): loss=0.136156, precision=0.68986, recall=0.45582, F1=0.548935



train 50 (65994): loss=0.104534, precision=0.792839, recall=0.540597, F1=0.64286
dev 50 (65994): loss=0.136964, precision=0.711902, recall=0.432045, F1=0.537741



train 51 (67288): loss=0.104359, precision=0.793833, recall=0.541311, F1=0.643692
dev 51 (67288): loss=0.137501, precision=0.693528, recall=0.449918, F1=0.545773



train 52 (68582): loss=0.104079, precision=0.794751, recall=0.542599, F1=0.644904
dev 52 (68582): loss=0.137327, precision=0.700881, recall=0.445357, F1=0.544638



train 53 (69876): loss=0.103716, precision=0.795081, recall=0.544108, F1=0.646078
dev 53 (69876): loss=0.13808, precision=0.698906, recall=0.447805, F1=0.545863



train 54 (71170): loss=0.103531, precision=0.794732, recall=0.545457, F1=0.646912
dev 54 (71170): loss=0.137678, precision=0.713826, recall=0.431273, F1=0.53769



train 55 (72464): loss=0.103451, precision=0.795677, recall=0.54608, F1=0.647663
dev 55 (72464): loss=0.138626, precision=0.709679, recall=0.437645, F1=0.541412



train 56 (73758): loss=0.103274, precision=0.79549, recall=0.546902, F1=0.648179
dev 56 (73758): loss=0.137885, precision=0.706978, recall=0.445391, F1=0.546494



train 57 (75052): loss=0.103035, precision=0.796036, recall=0.548096, F1=0.649199
dev 57 (75052): loss=0.13833, precision=0.701553, recall=0.449784, F1=0.548141



train 58 (76346): loss=0.102889, precision=0.796781, recall=0.548589, F1=0.649792
dev 58 (76346): loss=0.139691, precision=0.696316, recall=0.446263, F1=0.543927


KeyboardInterrupt: 

In [8]:
sess.run(model._dataset_iterator.initializer, feed_dict={
    model._dataset_filenames: ['../data/simplewiki/simplewiki-20171103.entity_recognition.train.tfrecords']
})

In [12]:
with open('../data/simplewiki/simplewiki-20171103.vocab_30k.txt', 'rt', encoding='utf-8') as f:
    id_to_word_30k = [line.strip() for line in f]

In [19]:
(input_page_ids,
 input_para_ids,
 input_sentence_ids,
 input_sequences,
 input_lengths,
 input_positions,
 target_sequences) = sess.run((model._input_page_ids,
                               model._input_para_ids,
                               model._input_sentence_ids,
                               model._input_sequences,
                               model._input_lengths,
                               model._input_positions,
                               model._target_sequences))

In [30]:
for page_ids, para_ids, sent_ids, seqs, seq_len, pos, targs in zip(input_page_ids, input_para_ids, input_sentence_ids, input_sequences, input_lengths, input_positions, target_sequences):
    strs = []
    for word_id, targ_id in zip(seqs[:seq_len], targs):
        if targ_id:
            strs.append('*' + id_to_word_30k[word_id].upper() + '*')
        else:
            strs.append(id_to_word_30k[word_id])
    print(' '.join(strs))

|-
<UNK> song has vocals from <UNK> .
<UNK> combination of <UNK> henderson charts , his strong clarinet playing , and his band that practiced well made him a rising star in <UNK> <UNK> .
\begin { bmatrix }
earlier , witnesses saw a man dressed in blue jeans and a black jacket with a head scarf arrive at <UNK> national war memorial in a car , carrying a <UNK> shotgun .
scent marks are what animals leave when they are marking their *TERRITORY* .
<UNK> parasites live off <UNK> host .
<UNK> <UNK> was once among <UNK> most threatened *WATERFOWL* *SPECIES* around <UNK> world .
__notoc__
according to popular *BRITISH* folklore , <UNK> tradition once had a sinister twist , in that <UNK> may queen was put to death once <UNK> festivities were over .
human beings , their faculties developed in balanced fashion , able to see <UNK> system of
<UNK> line was <UNK> probably <UNK> written by ken <UNK> during one of <UNK> episode 's <UNK> sessions , although none of those present on <UNK> episode 's dvd