In [1]:
import tensorflow as tf
import numpy as np
import math
import gzip
import json
import datetime
import shutil
from tqdm import tqdm

In [2]:
def reset_tf():
    global sess
    sess.close()
    tf.reset_default_graph()
    tf.set_random_seed(0)
    sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))

In [3]:
sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))

In [4]:
class HyperParameters():
    # maximum number of symbols in an input sequence
    max_sequence_length = 40

    # number of symbols in vocabulary
    # (symbols are expected to be in range(vocab_size))
    vocab_size = 10000

    # number of dimensions in input embeddings
    embedding_size = 256
    
    # number of sequences per batch
    batch_size = 256
    
    # number of target classes
    num_target_classes = 2
    
    # number of combined (attention + feed forward) layers
    num_layers = 4
    
    # number of parsing threads in data pipeline
    dataset_pipeline_parallel_calls = 4
    
    # size of prefetch in data pipeline
    dataset_pipeline_prefetch = batch_size * 16
    
    # shuffle buffer size
    dataset_pipeline_shuffle_buffer_size = 10000
    
    # dropout rate
    dropout_rate = 0.1

hp = HyperParameters()

In [5]:
def parse_example(example_proto, max_sequence_length=hp.max_sequence_length):
    features = {
        'inputs': tf.VarLenFeature(tf.int64),
        'word_endings': tf.VarLenFeature(tf.int64),
        'targets': tf.VarLenFeature(tf.int64)
    }
    
    parsed = tf.parse_single_example(example_proto, features)
    
    def convert_and_pad(sparse_tensor):
        result = tf.sparse_tensor_to_dense(sparse_tensor)
        # TODO: properly ignore elements which are too large (right now we just clip)
        result = result[:max_sequence_length]
        result = tf.pad(result, [[0, max_sequence_length - tf.shape(result)[0]]])
        return result
    
    return (convert_and_pad(parsed['inputs']),
            tf.shape(parsed['inputs'])[0],
            convert_and_pad(parsed['word_endings']),
            convert_and_pad(parsed['targets']))

In [6]:
reset_tf()

# Data pipeline
# -------------

dataset_filenames = tf.placeholder(tf.string, shape=[None])

dataset = tf.data.TFRecordDataset(dataset_filenames)
dataset = dataset.map(parse_example, 
                      num_parallel_calls = hp.dataset_pipeline_parallel_calls)
dataset = dataset.shuffle(hp.dataset_pipeline_shuffle_buffer_size)
dataset = dataset.prefetch(hp.dataset_pipeline_prefetch)
dataset = dataset.batch(hp.batch_size)

dataset_iterator = dataset.make_initializable_iterator()
input_sequences_it, input_lengths_it, input_word_endings_it, target_sequences_it = dataset_iterator.get_next()

# Placeholders
# ------------

input_sequences = input_sequences_it
input_lengths = input_lengths_it
input_word_endings = input_word_endings_it
target_sequences = target_sequences_it

# TODO: figure out how to use placeholder_with_default
# input_sequences = tf.placeholder_with_default(input_sequences_it, 
#                                               shape = (hp.batch_size, hp.max_sequence_length), 
#                                               name = 'input_sequences')
# input_lengths = tf.placeholder_with_default(input_lengths_it,
#                                             shape = (hp.batch_size),
#                                             name = 'input_lengths')
# input_word_endings = tf.placeholder_with_default(input_word_endings_it,
#                                                  shape = (hp.batch_size, hp.max_sequence_length),
#                                                  name = 'input_word_endings')
# target_sequences = tf.placeholder_with_default(target_sequences_it,
#                                                shape = (hp.batch_size, hp.max_sequence_length),
#                                                name = 'target_sequences')

# sequences of input positions (not a placeholder)
input_positions = tf.range(hp.max_sequence_length, dtype=tf.int32)
input_positions = tf.tile(input_positions, [tf.shape(input_sequences)[0]])
input_positions = tf.reshape(input_positions, 
                             (tf.shape(input_sequences)[0], hp.max_sequence_length), 
                             name = 'input_positions')

is_training = tf.placeholder(tf.bool)

# Embeddings
# ----------

# sequences of input embeddings w/ shape:
#   (hp.batch_size, hp.max_sequence_length, hp.embedding_size)
input_sequence_embeddings = tf.get_variable('input_sequence_embeddings', 
                                            (hp.vocab_size, hp.embedding_size))
input_sequences_embedded = tf.nn.embedding_lookup(input_sequence_embeddings, 
                                                  input_sequences,
                                                  name = 'input_sequences_embedded')

# sequences of input position embeddings w/ shape:
#   (hp.batch_size, hp.max_sequence_length, hp.embedding_size)
input_position_embeddings = tf.get_variable('input_position_embeddings', 
                                            (hp.max_sequence_length, hp.embedding_size))
input_positions_embedded = tf.nn.embedding_lookup(input_position_embeddings, input_positions)

# sequences of word ending embeddings w/ shape:
#   (hp.batch_size, hp.max_sequence_length, hp.embedding_size)
input_word_ending_embeddings = tf.get_variable('input_word_ending_embeddings',
                                               (2, hp.embedding_size))
input_word_endings_embedded = tf.nn.embedding_lookup(input_word_ending_embeddings, 
                                                     input_word_endings,
                                                     name = 'input_word_endings_embedded')

# Sequence mask
# -------------

sequence_mask = tf.sequence_mask(input_lengths,
                                 hp.max_sequence_length,
                                 dtype = tf.float32)
# expand dimensions to support broadcasting
expanded_sequence_mask = tf.expand_dims(sequence_mask, 
                                        2, 
                                        name = 'sequence')

input_combined_embedded = tf.add_n([input_sequences_embedded, 
                                    input_positions_embedded, 
                                    input_word_endings_embedded])
# TODO: is this necessary?
input_combined_embedded = tf.multiply(input_combined_embedded,
                                      expanded_sequence_mask)
input_combined_embedded = tf.layers.dropout(input_combined_embedded,
                                            rate = hp.dropout_rate,
                                            training = is_training)

# Layer normalization
# -------------------

def layer_norm(x, num_units, scope, reuse=None, epsilon=1e-6):
    with tf.variable_scope(scope, reuse=reuse):
        scale = tf.get_variable(
            "layer_norm_scale", [num_units], initializer=tf.ones_initializer())
        bias = tf.get_variable(
            "layer_norm_bias", [num_units], initializer=tf.zeros_initializer())
        result = layer_norm_compute(x, epsilon, scale, bias)
        return result

def layer_norm_compute(x, epsilon, scale, bias):
    # TODO: incorporate length into layer normalization?
    mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)
    variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True)
    norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
    return norm_x * scale + bias
    
# Attention
# ---------

def attention_layer(A, dropout_rate, is_training):
    A_T = tf.transpose(A, perm=[0, 2, 1])
    scaled_logits = tf.matmul(A, A_T) / tf.sqrt(tf.cast(tf.shape(A)[-1], tf.float32))
    result = tf.matmul(tf.nn.softmax(scaled_logits), A)
    result = tf.layers.dropout(result,
                               rate = dropout_rate,
                               training = is_training)
    return result

# Feed-forward
# ------------

def feed_forward_layer(A, num_units, dropout_rate, is_training, scope, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        A = tf.layers.dense(A, num_units, activation=tf.nn.relu, name='fc1')
        A = tf.layers.dense(A, num_units, name='fc2')
        A = tf.layers.dropout(A, rate = dropout_rate, training = is_training)
        return A

# Layers
# ------

def combined_layer(A, num_units, mask, dropout_rate, is_training, scope, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        A = layer_norm(A + attention_layer(A, dropout_rate, is_training), num_units, scope='attention_norm')
        A = layer_norm(A + feed_forward_layer(A, num_units, dropout_rate, is_training, 'ff'), num_units, scope='ff_norm')
        A *= mask
        return A
    
layer = input_combined_embedded
for i in range(hp.num_layers):
    layer = combined_layer(layer, 
                           hp.embedding_size, 
                           expanded_sequence_mask, 
                           hp.dropout_rate,
                           is_training,
                           'layer_%d' % i)

# Softmax
# -------

output_logits = tf.layers.dense(layer, hp.num_target_classes, name='softmax')

# TODO: is the softmax here really necessary?
output_sequences = tf.nn.softmax(output_logits)
output_sequences = tf.argmax(output_sequences, axis=-1)
output_sequences *= tf.cast(sequence_mask, tf.int64)

# Loss
# ----

losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_sequences,
                                                        logits=output_logits)
losses *= sequence_mask

total_loss = tf.reduce_sum(losses)
total_input_length = tf.reduce_sum(input_lengths)
mean_loss  = total_loss / tf.cast(total_input_length, tf.float32)

true_positives = tf.reduce_sum(output_sequences * target_sequences)
false_positives = tf.reduce_sum(tf.maximum(output_sequences - target_sequences, 0))
false_negatives = tf.reduce_sum(tf.maximum(target_sequences - output_sequences, 0))

# Training
# --------

global_step = tf.Variable(0, name='global_step', trainable=False)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
train_op = optimizer.minimize(mean_loss, global_step=global_step)

# Summaries
# ---------

tf.summary.scalar('mean_loss', mean_loss)

merged_summaries = tf.summary.merge_all()

In [7]:
total_parameters = 0
for variable in tf.trainable_variables():
    # shape is an array of tf.Dimension
    shape = variable.get_shape()
    variable_parameters = 1
    for dim in shape:
        variable_parameters *= dim.value
    print('parameters for "%s": %d' % (variable.name, variable_parameters))
    total_parameters += variable_parameters
print('total parameters: %d' % total_parameters)

parameters for "input_sequence_embeddings:0": 2560000
parameters for "input_position_embeddings:0": 10240
parameters for "input_word_ending_embeddings:0": 512
parameters for "layer_0/attention_norm/layer_norm_scale:0": 256
parameters for "layer_0/attention_norm/layer_norm_bias:0": 256
parameters for "layer_0/ff/fc1/kernel:0": 65536
parameters for "layer_0/ff/fc1/bias:0": 256
parameters for "layer_0/ff/fc2/kernel:0": 65536
parameters for "layer_0/ff/fc2/bias:0": 256
parameters for "layer_0/ff_norm/layer_norm_scale:0": 256
parameters for "layer_0/ff_norm/layer_norm_bias:0": 256
parameters for "layer_1/attention_norm/layer_norm_scale:0": 256
parameters for "layer_1/attention_norm/layer_norm_bias:0": 256
parameters for "layer_1/ff/fc1/kernel:0": 65536
parameters for "layer_1/ff/fc1/bias:0": 256
parameters for "layer_1/ff/fc2/kernel:0": 65536
parameters for "layer_1/ff/fc2/bias:0": 256
parameters for "layer_1/ff_norm/layer_norm_scale:0": 256
parameters for "layer_1/ff_norm/layer_norm_bias:0

In [8]:
sess.run(tf.global_variables_initializer())

In [9]:
def evaluate_dataset(filename, header='results', train=False, show_progress=True):
    cum_loss = 0
    cum_input_length = 0
    
    cum_true_positives = 0
    cum_false_positives = 0
    cum_false_negatives = 0
    
    sess.run(dataset_iterator.initializer, feed_dict={
        dataset_filenames: [filename]
    })
    
    start = datetime.datetime.now()
    
    if show_progress:
        progress = tqdm()
        
    while True:
        try:
            (_,
             curr_loss, 
             curr_input_length, 
             curr_true_positives,
             curr_false_positives,
             curr_false_negatives) = sess.run((train_op if train else [],
                                               total_loss,
                                               total_input_length,
                                               true_positives,
                                               false_positives,
                                               false_negatives),
                                              feed_dict = { is_training: train })
        except tf.errors.OutOfRangeError:
            break

        if show_progress:
            progress.update(curr_input_length)

        cum_loss += curr_loss
        cum_input_length += curr_input_length
        cum_true_positives += curr_true_positives
        cum_false_positives += curr_false_positives
        cum_false_negatives += curr_false_negatives
    
    if show_progress:
        progress.close()

    finish = datetime.datetime.now()
    elapsed = (finish - start).total_seconds()
    
    precision = cum_true_positives / (cum_true_positives + cum_false_positives)
    recall = cum_true_positives / (cum_true_positives + cum_false_negatives)
    F1 = 2 * (precision * recall) / (precision + recall)
    
    print('%s: loss=%g, elapsed=%gs, precision=%g, recall=%g, F1=%g' % (header,
                                                                        cum_loss/cum_input_length, 
                                                                        elapsed, 
                                                                        precision, 
                                                                        recall, 
                                                                        F1))

In [10]:
# builder = tf.saved_model.builder.SavedModelBuilder('../models/simplewiki/attention_1_layer')
# builder.add_meta_graph_and_variables(sess, ['training'])

num_epochs = 100

for epoch in range(num_epochs):
    result = evaluate_dataset('../data/simplewiki/simplewiki-20171103.entity_recognition.train.tfrecords',
                              header='train %d' % epoch,
                              train=True,
                              show_progress=True)
    result = evaluate_dataset('../data/simplewiki/simplewiki-20171103.entity_recognition.dev.tfrecords',
                              header='dev %d' % epoch,
                              train=False,
                              show_progress=False)
    #builder.save()

23718333it [03:38, 108717.36it/s]


train 0: loss=0.191152, elapsed=218.166s, precision=0.672429, recall=0.224368, F1=0.336468


0it [00:00, ?it/s]

dev 0: loss=0.177852, elapsed=2.79058s, precision=0.702729, recall=0.304435, F1=0.424827


23718333it [03:35, 110149.53it/s]


train 1: loss=0.170946, elapsed=215.331s, precision=0.702876, recall=0.314277, F1=0.434345


0it [00:00, ?it/s]

dev 1: loss=0.172337, elapsed=2.37324s, precision=0.757393, recall=0.2814, F1=0.410343


23718333it [03:34, 110809.13it/s]


train 2: loss=0.162695, elapsed=214.05s, precision=0.713418, recall=0.352873, F1=0.47219


0it [00:00, ?it/s]

dev 2: loss=0.16899, elapsed=2.40012s, precision=0.742294, recall=0.323627, F1=0.45074


23718333it [03:41, 107034.53it/s]


train 3: loss=0.157263, elapsed=221.598s, precision=0.720954, recall=0.382372, F1=0.499712


0it [00:00, ?it/s]

dev 3: loss=0.166897, elapsed=3.04232s, precision=0.734349, recall=0.34687, F1=0.471179


23718333it [04:06, 117165.23it/s]


train 4: loss=0.153232, elapsed=246.206s, precision=0.727101, recall=0.405301, F1=0.520477


0it [00:00, ?it/s]

dev 4: loss=0.165216, elapsed=2.7766s, precision=0.717166, recall=0.379622, F1=0.496453


23718333it [03:54, 119578.98it/s]


train 5: loss=0.150061, elapsed=234.352s, precision=0.731925, recall=0.423674, F1=0.536687


0it [00:00, ?it/s]

dev 5: loss=0.165303, elapsed=2.66284s, precision=0.707554, recall=0.393895, F1=0.506064


23718333it [03:36, 109607.72it/s]


train 6: loss=0.14732, elapsed=216.396s, precision=0.735668, recall=0.440205, F1=0.550816


0it [00:00, ?it/s]

dev 6: loss=0.166099, elapsed=2.34004s, precision=0.687513, recall=0.416633, F1=0.518846


23718333it [03:35, 110248.30it/s]


train 7: loss=0.145014, elapsed=215.139s, precision=0.739568, recall=0.45469, F1=0.563152


0it [00:00, ?it/s]

dev 7: loss=0.166568, elapsed=2.33115s, precision=0.667898, recall=0.43374, F1=0.525933


23718333it [03:34, 110784.14it/s]


train 8: loss=0.142883, elapsed=214.098s, precision=0.74302, recall=0.467166, F1=0.573654


0it [00:00, ?it/s]

dev 8: loss=0.166791, elapsed=2.34984s, precision=0.658678, recall=0.448169, F1=0.533405


23718333it [03:38, 108684.46it/s]


train 9: loss=0.141016, elapsed=218.234s, precision=0.74573, recall=0.478877, F1=0.583229


0it [00:00, ?it/s]

dev 9: loss=0.169518, elapsed=2.18395s, precision=0.641747, recall=0.466492, F1=0.540262


23718333it [03:37, 109203.67it/s]


train 10: loss=0.13935, elapsed=217.195s, precision=0.748947, recall=0.488417, F1=0.591254


0it [00:00, ?it/s]

dev 10: loss=0.170255, elapsed=2.05523s, precision=0.643202, recall=0.468265, F1=0.541966


23718333it [03:35, 109990.30it/s]


train 11: loss=0.137789, elapsed=215.642s, precision=0.751528, recall=0.497534, F1=0.598707


0it [00:00, ?it/s]

dev 11: loss=0.170559, elapsed=2.18298s, precision=0.63364, recall=0.485284, F1=0.549627


23718333it [03:35, 109979.81it/s]


train 12: loss=0.136261, elapsed=215.664s, precision=0.754856, recall=0.506144, F1=0.605973


0it [00:00, ?it/s]

dev 12: loss=0.171171, elapsed=2.20526s, precision=0.63565, recall=0.480138, F1=0.547057


23718333it [03:35, 110262.85it/s]


train 13: loss=0.134944, elapsed=215.11s, precision=0.757279, recall=0.513634, F1=0.612102


0it [00:00, ?it/s]

dev 13: loss=0.172007, elapsed=2.18706s, precision=0.63433, recall=0.482207, F1=0.547906


23718333it [03:36, 109733.85it/s]


train 14: loss=0.133634, elapsed=216.147s, precision=0.759387, recall=0.520943, F1=0.617961


0it [00:00, ?it/s]

dev 14: loss=0.173189, elapsed=2.21151s, precision=0.625803, recall=0.497679, F1=0.554436


23718333it [03:36, 109437.33it/s]


train 15: loss=0.132435, elapsed=216.731s, precision=0.762155, recall=0.527335, F1=0.623364


0it [00:00, ?it/s]

dev 15: loss=0.174302, elapsed=2.30836s, precision=0.624429, recall=0.494046, F1=0.551638


23718333it [03:29, 113379.96it/s]


train 16: loss=0.13127, elapsed=209.195s, precision=0.764291, recall=0.533421, F1=0.62832


0it [00:00, ?it/s]

dev 16: loss=0.179509, elapsed=2.30229s, precision=0.612078, recall=0.510961, F1=0.556967


23718333it [03:28, 124378.71it/s]


train 17: loss=0.130353, elapsed=208.401s, precision=0.76662, recall=0.538633, F1=0.632716


0it [00:00, ?it/s]

dev 17: loss=0.174942, elapsed=2.30234s, precision=0.621262, recall=0.503086, F1=0.555963


23718333it [03:28, 124262.88it/s]


train 18: loss=0.129304, elapsed=208.297s, precision=0.768551, recall=0.543683, F1=0.63685


0it [00:00, ?it/s]

dev 18: loss=0.179084, elapsed=2.29795s, precision=0.613844, recall=0.507675, F1=0.555734


23718333it [03:28, 113682.68it/s]


train 19: loss=0.128386, elapsed=208.639s, precision=0.770213, recall=0.548354, F1=0.640619


0it [00:00, ?it/s]

dev 19: loss=0.175883, elapsed=2.30901s, precision=0.623463, recall=0.498149, F1=0.553805


23718333it [03:28, 122722.50it/s]


train 20: loss=0.127557, elapsed=208.576s, precision=0.772103, recall=0.55265, F1=0.6442


0it [00:00, ?it/s]

dev 20: loss=0.178124, elapsed=2.30607s, precision=0.630287, recall=0.487527, F1=0.549791


23718333it [03:28, 113791.44it/s]


train 21: loss=0.126736, elapsed=208.44s, precision=0.773591, recall=0.556791, F1=0.647526


0it [00:00, ?it/s]

dev 21: loss=0.175091, elapsed=2.31081s, precision=0.63746, recall=0.483146, F1=0.549678


23718333it [03:28, 123340.98it/s]


train 22: loss=0.125854, elapsed=208.446s, precision=0.775703, recall=0.561503, F1=0.651447


0it [00:00, ?it/s]

dev 22: loss=0.17925, elapsed=2.26526s, precision=0.631879, recall=0.49123, F1=0.552747


23718333it [03:28, 124064.32it/s]


train 23: loss=0.125196, elapsed=208.397s, precision=0.776697, recall=0.564884, F1=0.654069


0it [00:00, ?it/s]

dev 23: loss=0.180702, elapsed=2.34658s, precision=0.606261, recall=0.518488, F1=0.55895


23718333it [03:28, 123540.78it/s]


train 24: loss=0.124535, elapsed=208.37s, precision=0.777738, recall=0.56731, F1=0.656064


0it [00:00, ?it/s]

dev 24: loss=0.181901, elapsed=2.31333s, precision=0.60392, recall=0.5212, F1=0.559519


23718333it [03:28, 123937.55it/s]


train 25: loss=0.123805, elapsed=208.707s, precision=0.779503, recall=0.570755, F1=0.658993


0it [00:00, ?it/s]

dev 25: loss=0.182244, elapsed=2.3008s, precision=0.613943, recall=0.50352, F1=0.553276


23718333it [03:28, 124109.71it/s]


train 26: loss=0.123176, elapsed=208.453s, precision=0.780947, recall=0.574601, F1=0.662069


0it [00:00, ?it/s]

dev 26: loss=0.184369, elapsed=2.32449s, precision=0.592683, recall=0.522435, F1=0.555346


23718333it [03:28, 122267.02it/s]


train 27: loss=0.122615, elapsed=208.499s, precision=0.781473, recall=0.577107, F1=0.663919


0it [00:00, ?it/s]

dev 27: loss=0.186102, elapsed=2.29294s, precision=0.618672, recall=0.494341, F1=0.549562


23718333it [03:28, 124148.42it/s]


train 28: loss=0.12204, elapsed=208.35s, precision=0.782731, recall=0.579598, F1=0.66602


0it [00:00, ?it/s]

dev 28: loss=0.186386, elapsed=2.30033s, precision=0.610127, recall=0.507762, F1=0.554258


23718333it [03:28, 113780.46it/s]


train 29: loss=0.121423, elapsed=208.46s, precision=0.784155, recall=0.582536, F1=0.668474


0it [00:00, ?it/s]

dev 29: loss=0.188612, elapsed=2.32519s, precision=0.613598, recall=0.502512, F1=0.552527


23718333it [03:28, 121628.36it/s]


train 30: loss=0.120967, elapsed=208.4s, precision=0.785361, recall=0.584762, F1=0.670376


0it [00:00, ?it/s]

dev 30: loss=0.188263, elapsed=2.30714s, precision=0.603005, recall=0.511343, F1=0.553404


23718333it [03:28, 124191.66it/s]


train 31: loss=0.120458, elapsed=208.306s, precision=0.786675, recall=0.58669, F1=0.672122


0it [00:00, ?it/s]

dev 31: loss=0.190228, elapsed=2.2915s, precision=0.595597, recall=0.527737, F1=0.559617


23718333it [03:28, 124381.87it/s]


train 32: loss=0.119818, elapsed=208.479s, precision=0.787247, recall=0.589743, F1=0.674331


0it [00:00, ?it/s]

dev 32: loss=0.192314, elapsed=2.29593s, precision=0.589593, recall=0.530657, F1=0.558575


23718333it [03:28, 124337.89it/s]


train 33: loss=0.119429, elapsed=208.413s, precision=0.788038, recall=0.591639, F1=0.675859


0it [00:00, ?it/s]

dev 33: loss=0.190663, elapsed=2.30339s, precision=0.606122, recall=0.510509, F1=0.554222


23718333it [03:28, 123624.60it/s]


train 34: loss=0.119027, elapsed=208.49s, precision=0.788664, recall=0.593513, F1=0.677312


0it [00:00, ?it/s]

dev 34: loss=0.187146, elapsed=2.31344s, precision=0.644763, recall=0.472159, F1=0.545124


23718333it [03:28, 124977.24it/s]


train 35: loss=0.118513, elapsed=208.432s, precision=0.790307, recall=0.595496, F1=0.679208


0it [00:00, ?it/s]

dev 35: loss=0.190395, elapsed=2.32561s, precision=0.608275, recall=0.509361, F1=0.554441


23718333it [03:28, 113745.23it/s]


train 36: loss=0.118056, elapsed=208.525s, precision=0.790788, recall=0.598142, F1=0.681105


0it [00:00, ?it/s]

dev 36: loss=0.191317, elapsed=2.33472s, precision=0.600433, recall=0.516437, F1=0.555276


23718333it [03:28, 123930.81it/s]


train 37: loss=0.117747, elapsed=208.395s, precision=0.791503, recall=0.599202, F1=0.682057


0it [00:00, ?it/s]

dev 37: loss=0.194088, elapsed=2.31826s, precision=0.599138, recall=0.517271, F1=0.555203


23718333it [03:28, 124035.66it/s]


train 38: loss=0.117262, elapsed=208.47s, precision=0.792214, recall=0.601119, F1=0.683562


0it [00:00, ?it/s]

dev 38: loss=0.194459, elapsed=2.28394s, precision=0.598132, recall=0.521009, F1=0.556913


23718333it [03:28, 123803.24it/s]


train 39: loss=0.116907, elapsed=208.609s, precision=0.793152, recall=0.60299, F1=0.68512


0it [00:00, ?it/s]

dev 39: loss=0.194083, elapsed=2.31261s, precision=0.606787, recall=0.505415, F1=0.551481


23718333it [03:28, 113797.59it/s]


train 40: loss=0.116511, elapsed=208.428s, precision=0.794394, recall=0.603678, F1=0.686028


0it [00:00, ?it/s]

dev 40: loss=0.191537, elapsed=2.30779s, precision=0.607438, recall=0.505398, F1=0.55174


23718333it [03:28, 123815.76it/s]


train 41: loss=0.116245, elapsed=208.306s, precision=0.794086, recall=0.605618, F1=0.687164


0it [00:00, ?it/s]

dev 41: loss=0.193813, elapsed=2.2922s, precision=0.611712, recall=0.506093, F1=0.553913


23718333it [03:28, 113771.73it/s]


train 42: loss=0.115911, elapsed=208.476s, precision=0.794912, recall=0.606952, F1=0.688331


0it [00:00, ?it/s]

dev 42: loss=0.193274, elapsed=2.33583s, precision=0.602131, recall=0.507032, F1=0.550504


23718333it [03:28, 124396.29it/s]


train 43: loss=0.115376, elapsed=208.754s, precision=0.79633, recall=0.609573, F1=0.690548


0it [00:00, ?it/s]

dev 43: loss=0.19597, elapsed=2.2911s, precision=0.584801, recall=0.530553, F1=0.556358


23718333it [03:28, 113874.48it/s]


train 44: loss=0.115067, elapsed=208.292s, precision=0.796838, recall=0.610084, F1=0.691066


0it [00:00, ?it/s]

dev 44: loss=0.196495, elapsed=2.3452s, precision=0.586112, recall=0.527493, F1=0.55526


23718333it [03:28, 113771.20it/s]


train 45: loss=0.114838, elapsed=208.477s, precision=0.796869, recall=0.611703, F1=0.692116


0it [00:00, ?it/s]

dev 45: loss=0.19873, elapsed=2.34368s, precision=0.581265, recall=0.535368, F1=0.557373


23718333it [03:28, 113833.29it/s]


train 46: loss=0.114385, elapsed=208.363s, precision=0.798715, recall=0.613061, F1=0.693681


0it [00:00, ?it/s]

dev 46: loss=0.199481, elapsed=2.2813s, precision=0.578313, recall=0.536359, F1=0.556547


23718333it [03:28, 113724.89it/s]


train 47: loss=0.114093, elapsed=208.562s, precision=0.798847, recall=0.614631, F1=0.694735


0it [00:00, ?it/s]

dev 47: loss=0.19728, elapsed=2.32746s, precision=0.579427, recall=0.537907, F1=0.557895


23718333it [03:28, 113645.73it/s]


train 48: loss=0.113708, elapsed=208.707s, precision=0.799248, recall=0.616402, F1=0.696017


0it [00:00, ?it/s]

dev 48: loss=0.198743, elapsed=2.29831s, precision=0.579103, recall=0.532239, F1=0.554683


23718333it [03:28, 123872.99it/s]


train 49: loss=0.113494, elapsed=208.489s, precision=0.800505, recall=0.616851, F1=0.696779


0it [00:00, ?it/s]

dev 49: loss=0.200338, elapsed=2.31094s, precision=0.598007, recall=0.516472, F1=0.554257


23014221it [03:27, 102272.68it/s]

KeyboardInterrupt: 

23019781it [03:40, 102272.68it/s]