In [1]:
import tensorflow as tf
import numpy as np
from tqdm import tqdm

In [2]:
def reset_tf():
    global sess
    sess.close()
    tf.reset_default_graph()
    tf.set_random_seed(0)
    sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))

In [3]:
sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))

In [4]:
class HyperParameters():
    # maximum number of symbols in an input sequence
    max_sequence_length = 40

    # number of symbols in vocabulary
    # (symbols are expected to be in range(vocab_size))
    vocab_size = 10000

    # number of dimensions in input embeddings
    embedding_size = 256
    
    # number of dimensions in hidden state
    rnn_hidden_state_size = 256
    
    # feed-forward hidden state size
    ff_hidden_state_size = 512
    
    # number of sequences per batch
    batch_size = 256
    
    # dropout rate
    dropout_rate = 0.1
    
    # number of parsing threads in data pipeline
    dataset_pipeline_parallel_calls = 4
    
    # size of prefetch in data pipeline
    dataset_pipeline_prefetch = batch_size * 16
    
    # shuffle buffer size
    dataset_pipeline_shuffle_buffer_size = 10000

hp = HyperParameters()

In [5]:
def parse_example(example_proto, max_sequence_length=hp.max_sequence_length):
    features = {
        'inputs': tf.VarLenFeature(tf.int64),
        'word_endings': tf.VarLenFeature(tf.int64),
        'targets': tf.VarLenFeature(tf.int64)
    }
    
    parsed = tf.parse_single_example(example_proto, features)
    
    def convert_and_pad(sparse_tensor):
        result = tf.sparse_tensor_to_dense(sparse_tensor)
        # TODO: properly ignore elements which are too large (right now we just clip)
        result = result[:max_sequence_length]
        result = tf.pad(result, [[0, max_sequence_length - tf.shape(result)[0]]])
        return result
    
    return (convert_and_pad(parsed['inputs']),
            tf.shape(parsed['inputs'])[0],
            convert_and_pad(parsed['word_endings']),
            convert_and_pad(parsed['targets']))

In [6]:
reset_tf()

# Data pipeline
# -------------

dataset_filenames = tf.placeholder(tf.string, shape=[None])

dataset = tf.data.TFRecordDataset(dataset_filenames)
dataset = dataset.map(parse_example, 
                      num_parallel_calls = hp.dataset_pipeline_parallel_calls)
dataset = dataset.shuffle(hp.dataset_pipeline_shuffle_buffer_size)
dataset = dataset.prefetch(hp.dataset_pipeline_prefetch)
dataset = dataset.batch(hp.batch_size)

dataset_iterator = dataset.make_initializable_iterator()
(input_sequences_it,
 input_lengths_it,
 input_word_endings_it,
 target_sequences_it) = dataset_iterator.get_next()

# Placeholders
# ------------

input_sequences = tf.placeholder_with_default(input_sequences_it,
                                              shape = [None, hp.max_sequence_length],
                                              name = 'input_sequences')
input_lengths = tf.placeholder_with_default(input_lengths_it,
                                            shape = [None],
                                            name = 'input_lengths')
input_word_endings = tf.placeholder_with_default(input_word_endings_it,
                                                 shape = [None, hp.max_sequence_length],
                                                 name = 'input_word_endings')
target_sequences = tf.placeholder_with_default(target_sequences_it,
                                               shape = [None, hp.max_sequence_length],
                                               name = 'target_sequences')

# sequences of input positions (not a placeholder)
input_positions = tf.range(hp.max_sequence_length, dtype=tf.int32)
input_positions = tf.tile(input_positions, [tf.shape(input_sequences)[0]])
input_positions = tf.reshape(input_positions, 
                             (tf.shape(input_sequences)[0], hp.max_sequence_length), 
                             name = 'input_positions')

is_training = tf.placeholder(tf.bool)


# Embeddings
# ----------

input_sequence_embeddings = tf.get_variable('input_sequence_embeddings', 
                                            (hp.vocab_size, hp.embedding_size))
input_sequences_embedded = tf.nn.embedding_lookup(input_sequence_embeddings, 
                                                  input_sequences,
                                                  name = 'input_sequences_embedded')

input_position_embeddings = tf.get_variable('input_position_embeddings', 
                                            (hp.max_sequence_length, hp.embedding_size))
input_positions_embedded = tf.nn.embedding_lookup(input_position_embeddings, input_positions)

input_word_ending_embeddings = tf.get_variable('input_word_ending_embeddings',
                                               (2, hp.embedding_size))
input_word_endings_embedded = tf.nn.embedding_lookup(input_word_ending_embeddings, 
                                                     input_word_endings,
                                                     name = 'input_word_endings_embedded')

input_combined_embedded = tf.add_n([input_sequences_embedded, 
                                    input_positions_embedded, 
                                    input_word_endings_embedded])
input_combined_embedded = tf.layers.dropout(input_combined_embedded,
                                            rate = hp.dropout_rate,
                                            training = is_training)

# RNNs
# ----

dropout_keep_prob = tf.cond(is_training,
                            lambda: tf.constant(1.0),
                            lambda: tf.constant(1.0 - hp.dropout_rate))

fw_rnn_cell = tf.nn.rnn_cell.GRUCell(hp.rnn_hidden_state_size)
fw_rnn_cell = tf.nn.rnn_cell.DropoutWrapper(fw_rnn_cell,
                                            input_keep_prob = dropout_keep_prob,
                                            output_keep_prob = dropout_keep_prob)

bw_rnn_cell = tf.nn.rnn_cell.GRUCell(hp.rnn_hidden_state_size)
bw_rnn_cell = tf.nn.rnn_cell.DropoutWrapper(bw_rnn_cell,
                                            input_keep_prob = dropout_keep_prob,
                                            output_keep_prob = dropout_keep_prob)

rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(fw_rnn_cell,
                                                 bw_rnn_cell,
                                                 input_combined_embedded,
                                                 sequence_length = input_lengths,
                                                 dtype = tf.float32)
rnn_outputs = tf.concat(rnn_outputs, 2)

# Feed-forward
# ------------

# TODO: more layers here?
feed_forward = tf.layers.dense(rnn_outputs,
                               hp.ff_hidden_state_size,
                               activation = tf.nn.relu,
                               name = 'feed_forward')
feed_forward = tf.layers.dropout(feed_forward,
                                 rate = hp.dropout_rate,
                                 training = is_training)


# Mask
# ----

sequence_mask = tf.sequence_mask(input_lengths,
                                 hp.max_sequence_length,
                                 dtype = tf.bool)


# Softmax
# -------

output_logits = tf.layers.dense(feed_forward, 2, name='softmax')

# TODO: is the softmax here really necessary?
output_sequences = tf.nn.softmax(output_logits)
output_sequences = tf.argmax(output_sequences, axis = -1)
output_sequences *= tf.cast(sequence_mask, tf.int64)

# Loss
# ----

losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = target_sequences,
                                                        logits = output_logits)
losses *= tf.cast(sequence_mask, tf.float32)

total_loss = tf.reduce_sum(losses)
total_input_length = tf.reduce_sum(input_lengths)
mean_loss  = total_loss / tf.cast(total_input_length, tf.float32)

true_positives = tf.reduce_sum(output_sequences * target_sequences)
false_positives = tf.reduce_sum(tf.maximum(output_sequences - target_sequences, 0))
false_negatives = tf.reduce_sum(tf.maximum(target_sequences - output_sequences, 0))

# Training
# --------

global_step = tf.Variable(0, name='global_step', trainable=False)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
train_op = optimizer.minimize(mean_loss, global_step=global_step)

# Summary
# -------

total_parameters = 0
for variable in tf.trainable_variables():
    # shape is an array of tf.Dimension
    shape = variable.get_shape()
    variable_parameters = 1
    for dim in shape:
        variable_parameters *= dim.value
    print('parameters for "%s": %d' % (variable.name, variable_parameters))
    total_parameters += variable_parameters
print('total parameters: %d' % total_parameters)

parameters for "input_sequence_embeddings:0": 2560000
parameters for "input_position_embeddings:0": 10240
parameters for "input_word_ending_embeddings:0": 512
parameters for "bidirectional_rnn/fw/gru_cell/gates/kernel:0": 262144
parameters for "bidirectional_rnn/fw/gru_cell/gates/bias:0": 512
parameters for "bidirectional_rnn/fw/gru_cell/candidate/kernel:0": 131072
parameters for "bidirectional_rnn/fw/gru_cell/candidate/bias:0": 256
parameters for "bidirectional_rnn/bw/gru_cell/gates/kernel:0": 262144
parameters for "bidirectional_rnn/bw/gru_cell/gates/bias:0": 512
parameters for "bidirectional_rnn/bw/gru_cell/candidate/kernel:0": 131072
parameters for "bidirectional_rnn/bw/gru_cell/candidate/bias:0": 256
parameters for "feed_forward/kernel:0": 262144
parameters for "feed_forward/bias:0": 512
parameters for "softmax/kernel:0": 1024
parameters for "softmax/bias:0": 2
total parameters: 3622402


In [7]:
def evaluate_dataset(filename, header='results', train=False, show_progress=True):
    cum_loss = 0
    cum_input_length = 0
    
    cum_true_positives = 0
    cum_false_positives = 0
    cum_false_negatives = 0
    
    sess.run(dataset_iterator.initializer, feed_dict={
        dataset_filenames: [filename]
    })
    
    if show_progress:
        progress = tqdm()
        
    while True:
        try:
            (_,
             curr_loss, 
             curr_input_length, 
             curr_true_positives,
             curr_false_positives,
             curr_false_negatives) = sess.run((train_op if train else [],
                                               total_loss,
                                               total_input_length,
                                               true_positives,
                                               false_positives,
                                               false_negatives),
                                              feed_dict = { is_training: train })
        except tf.errors.OutOfRangeError:
            break

        if show_progress:
            progress.update(curr_input_length)

        cum_loss += curr_loss
        cum_input_length += curr_input_length
        cum_true_positives += curr_true_positives
        cum_false_positives += curr_false_positives
        cum_false_negatives += curr_false_negatives
    
    if show_progress:
        progress.close()

    precision = cum_true_positives / (cum_true_positives + cum_false_positives)
    recall = cum_true_positives / (cum_true_positives + cum_false_negatives)
    F1 = 2 * (precision * recall) / (precision + recall)
    
    print('%s: loss=%g, precision=%g, recall=%g, F1=%g' % (header,
                                                           cum_loss/cum_input_length, 
                                                           precision, 
                                                           recall, 
                                                           F1))

In [None]:
sess.run(tf.global_variables_initializer())

In [None]:
num_epochs = 100

for epoch in range(num_epochs):
    result = evaluate_dataset('../data/simplewiki/simplewiki-20171103.entity_recognition.train.tfrecords',
                              header='train %d' % epoch,
                              train=True,
                              show_progress=True)
    result = evaluate_dataset('../data/simplewiki/simplewiki-20171103.entity_recognition.dev.tfrecords',
                              header='dev %d' % epoch,
                              train=False,
                              show_progress=False)

23718333it [04:39, 84757.77it/s] 


train 0: loss=0.166985, precision=0.701448, recall=0.333386, F1=0.451963


0it [00:00, ?it/s]

dev 0: loss=0.163846, precision=0.777298, recall=0.300349, F1=0.433279


23718333it [04:22, 90373.30it/s] 


train 1: loss=0.15313, precision=0.725064, recall=0.404923, F1=0.519643


0it [00:00, ?it/s]

dev 1: loss=0.157979, precision=0.730166, recall=0.3904, F1=0.508773


23718333it [04:13, 93558.51it/s] 


train 2: loss=0.147236, precision=0.736967, recall=0.435679, F1=0.547618


0it [00:00, ?it/s]

dev 2: loss=0.156589, precision=0.72778, recall=0.403838, F1=0.519443


23718333it [04:10, 94571.83it/s] 


train 3: loss=0.142314, precision=0.746003, recall=0.461487, F1=0.570225


0it [00:00, ?it/s]

dev 3: loss=0.156557, precision=0.707631, recall=0.437842, F1=0.540965


23718333it [04:09, 95168.25it/s] 


train 4: loss=0.137961, precision=0.754986, recall=0.48534, F1=0.590852


0it [00:00, ?it/s]

dev 4: loss=0.157917, precision=0.689128, recall=0.458165, F1=0.550398


23718333it [04:08, 95304.86it/s] 


train 5: loss=0.13282, precision=0.765011, recall=0.511791, F1=0.613292


0it [00:00, ?it/s]

dev 5: loss=0.160341, precision=0.683092, recall=0.460303, F1=0.549992


23718333it [04:08, 95419.21it/s] 


train 6: loss=0.13005, precision=0.771252, recall=0.525087, F1=0.624797


0it [00:00, ?it/s]

dev 6: loss=0.162775, precision=0.673233, recall=0.46637, F1=0.551026


23718333it [04:08, 95428.81it/s] 


train 7: loss=0.126893, precision=0.777464, recall=0.541115, F1=0.638108


0it [00:00, ?it/s]

dev 7: loss=0.164984, precision=0.677227, recall=0.462285, F1=0.549484


23718333it [04:08, 95413.82it/s] 


train 8: loss=0.124738, precision=0.782073, recall=0.552399, F1=0.647472


0it [00:00, ?it/s]

dev 8: loss=0.16409, precision=0.669475, recall=0.471064, F1=0.553011


23718333it [04:08, 95456.64it/s] 


train 9: loss=0.122373, precision=0.786348, recall=0.563903, F1=0.656802


0it [00:00, ?it/s]

dev 9: loss=0.165288, precision=0.667674, recall=0.47235, F1=0.553279


23718333it [04:08, 95437.27it/s] 


train 10: loss=0.120855, precision=0.789439, recall=0.571339, F1=0.662911


0it [00:00, ?it/s]

dev 10: loss=0.169176, precision=0.664376, recall=0.468873, F1=0.549761


23718333it [04:08, 95359.66it/s] 


train 11: loss=0.119521, precision=0.792041, recall=0.57768, F1=0.668087


0it [00:00, ?it/s]

dev 11: loss=0.170174, precision=0.666285, recall=0.464788, F1=0.547588


23718333it [04:08, 95373.60it/s] 


train 12: loss=0.117297, precision=0.796467, recall=0.587422, F1=0.676156


0it [00:00, ?it/s]

dev 12: loss=0.170753, precision=0.663979, recall=0.47388, F1=0.55305


23718333it [04:08, 95454.72it/s] 


train 13: loss=0.116077, precision=0.798201, recall=0.593398, F1=0.680729


0it [00:00, ?it/s]

dev 13: loss=0.173182, precision=0.661426, recall=0.469551, F1=0.549213


23718333it [04:08, 95321.76it/s] 


train 14: loss=0.11548, precision=0.799365, recall=0.596576, F1=0.683241


0it [00:00, ?it/s]

dev 14: loss=0.173429, precision=0.664465, recall=0.473845, F1=0.553195


23718333it [04:09, 95162.61it/s] 


train 15: loss=0.11462, precision=0.801129, recall=0.601278, F1=0.686964


0it [00:00, ?it/s]

dev 15: loss=0.17376, precision=0.664581, recall=0.463415, F1=0.54606


23718333it [04:08, 95315.73it/s] 


train 16: loss=0.113739, precision=0.802275, recall=0.6048, F1=0.68968


0it [00:00, ?it/s]

dev 16: loss=0.174748, precision=0.652199, recall=0.478035, F1=0.551698


23718333it [04:10, 94856.42it/s] 


train 17: loss=0.111992, precision=0.805573, recall=0.612821, F1=0.6961


0it [00:00, ?it/s]

dev 17: loss=0.17467, precision=0.644614, recall=0.4885, F1=0.555803


23718333it [04:08, 95391.45it/s] 


train 18: loss=0.110395, precision=0.808361, recall=0.619301, F1=0.701313


0it [00:00, ?it/s]

dev 18: loss=0.175401, precision=0.650733, recall=0.47687, F1=0.550398


23718333it [04:08, 95296.16it/s] 


train 19: loss=0.10928, precision=0.81064, recall=0.625211, F1=0.705952


0it [00:00, ?it/s]

dev 19: loss=0.177881, precision=0.633372, recall=0.495836, F1=0.556228


23718333it [04:08, 95415.83it/s] 


train 20: loss=0.108294, precision=0.812461, recall=0.62898, F1=0.709043


0it [00:00, ?it/s]

dev 20: loss=0.178441, precision=0.633572, recall=0.505311, F1=0.562219


23718333it [04:08, 95401.78it/s] 


train 21: loss=0.107241, precision=0.815018, recall=0.633614, F1=0.712958


0it [00:00, ?it/s]

dev 21: loss=0.178902, precision=0.644973, recall=0.487909, F1=0.555553


23718333it [04:08, 95418.65it/s] 


train 22: loss=0.106635, precision=0.815607, recall=0.636242, F1=0.714845


0it [00:00, ?it/s]

dev 22: loss=0.178093, precision=0.645326, recall=0.486449, F1=0.554736


23718333it [04:08, 95427.77it/s] 


train 23: loss=0.106449, precision=0.816215, recall=0.637426, F1=0.715825


0it [00:00, ?it/s]

dev 23: loss=0.180715, precision=0.63952, recall=0.488309, F1=0.553778


23718333it [04:09, 95235.99it/s] 


train 24: loss=0.105331, precision=0.817934, recall=0.642061, F1=0.719404


0it [00:00, ?it/s]

dev 24: loss=0.18362, precision=0.636785, recall=0.494811, F1=0.556891


23718333it [04:08, 95307.35it/s] 


train 25: loss=0.105876, precision=0.81676, recall=0.639498, F1=0.717341


0it [00:00, ?it/s]

dev 25: loss=0.178752, precision=0.632977, recall=0.495593, F1=0.555923


23718333it [04:08, 95388.16it/s] 


train 26: loss=0.105683, precision=0.816822, recall=0.641006, F1=0.718312


0it [00:00, ?it/s]

dev 26: loss=0.181935, precision=0.636727, recall=0.492707, F1=0.555535


23718333it [04:08, 95337.03it/s] 


train 27: loss=0.106646, precision=0.814877, recall=0.636728, F1=0.714871


0it [00:00, ?it/s]

dev 27: loss=0.181296, precision=0.647585, recall=0.479652, F1=0.551109


23718333it [04:08, 95290.23it/s] 


train 28: loss=0.106161, precision=0.815518, recall=0.638781, F1=0.71641


0it [00:00, ?it/s]

dev 28: loss=0.183078, precision=0.638891, recall=0.486518, F1=0.55239


23718333it [04:10, 94804.94it/s] 


train 29: loss=0.105212, precision=0.817753, recall=0.642603, F1=0.719674


0it [00:00, ?it/s]

dev 29: loss=0.185162, precision=0.642317, recall=0.479825, F1=0.549306


23718333it [04:09, 95206.12it/s] 


train 30: loss=0.105771, precision=0.81585, recall=0.640381, F1=0.717544


0it [00:00, ?it/s]

dev 30: loss=0.186464, precision=0.639685, recall=0.479061, F1=0.547842


23718333it [04:09, 95227.58it/s] 


train 31: loss=0.106827, precision=0.813507, recall=0.636084, F1=0.713938


0it [00:00, ?it/s]

dev 31: loss=0.185132, precision=0.654055, recall=0.471811, F1=0.548183


23718333it [04:09, 95207.40it/s] 


train 32: loss=0.105785, precision=0.815672, recall=0.640595, F1=0.71761


0it [00:00, ?it/s]

dev 32: loss=0.186371, precision=0.643258, recall=0.478191, F1=0.548577


23718333it [04:08, 95308.60it/s] 


train 33: loss=0.105267, precision=0.816822, recall=0.643249, F1=0.719718


0it [00:00, ?it/s]

dev 33: loss=0.183821, precision=0.652554, recall=0.472385, F1=0.548042


23718333it [04:08, 95318.11it/s] 


train 34: loss=0.104493, precision=0.818492, recall=0.647023, F1=0.722726


0it [00:00, ?it/s]

dev 34: loss=0.184114, precision=0.636125, recall=0.480608, F1=0.547538


23718333it [04:09, 95021.10it/s] 


train 35: loss=0.103887, precision=0.819437, recall=0.648283, F1=0.723881


0it [00:00, ?it/s]

dev 35: loss=0.188456, precision=0.656377, recall=0.466127, F1=0.545129


23718333it [04:09, 95229.32it/s] 


train 36: loss=0.104309, precision=0.818493, recall=0.647002, F1=0.722713


0it [00:00, ?it/s]

dev 36: loss=0.184156, precision=0.634838, recall=0.489126, F1=0.552537


23718333it [04:08, 95359.46it/s] 


train 37: loss=0.103874, precision=0.818799, recall=0.648972, F1=0.724061


0it [00:00, ?it/s]

dev 37: loss=0.188961, precision=0.634378, recall=0.487162, F1=0.551108


23718333it [04:08, 95322.52it/s] 


train 38: loss=0.103345, precision=0.819485, recall=0.651167, F1=0.725694


0it [00:00, ?it/s]

dev 38: loss=0.189267, precision=0.650531, recall=0.463502, F1=0.541316


23718333it [04:09, 95235.20it/s] 


train 39: loss=0.10357, precision=0.819314, recall=0.650426, F1=0.725167


0it [00:00, ?it/s]

dev 39: loss=0.185919, precision=0.642537, recall=0.483285, F1=0.551648


23718333it [04:08, 95255.94it/s] 


train 40: loss=0.103595, precision=0.819653, recall=0.650096, F1=0.725094


0it [00:00, ?it/s]

dev 40: loss=0.185958, precision=0.649841, recall=0.47581, F1=0.549372


23718333it [04:09, 95250.32it/s] 


train 41: loss=0.103745, precision=0.818674, recall=0.649897, F1=0.724587


0it [00:00, ?it/s]

dev 41: loss=0.186373, precision=0.637553, recall=0.479547, F1=0.547376


23718333it [04:08, 95272.38it/s] 


train 42: loss=0.104189, precision=0.818138, recall=0.647485, F1=0.722876


0it [00:00, ?it/s]

dev 42: loss=0.191498, precision=0.654708, recall=0.461902, F1=0.541659


23718333it [04:08, 95260.54it/s] 


train 43: loss=0.105205, precision=0.816093, recall=0.643965, F1=0.719883


0it [00:00, ?it/s]

dev 43: loss=0.186359, precision=0.637884, recall=0.485441, F1=0.551318


23718333it [04:09, 95039.84it/s] 


train 44: loss=0.105139, precision=0.815333, recall=0.64362, F1=0.719372


0it [00:00, ?it/s]

dev 44: loss=0.187063, precision=0.641728, recall=0.469847, F1=0.542498


23718333it [04:09, 95234.48it/s] 


train 45: loss=0.105831, precision=0.814523, recall=0.640787, F1=0.717285


0it [00:00, ?it/s]

dev 45: loss=0.185802, precision=0.637673, recall=0.484815, F1=0.550836


23718333it [04:09, 95246.48it/s] 


train 46: loss=0.106355, precision=0.813437, recall=0.638646, F1=0.715521


0it [00:00, ?it/s]

dev 46: loss=0.186134, precision=0.639452, recall=0.480364, F1=0.548608


23718333it [04:09, 95126.74it/s] 


train 47: loss=0.105902, precision=0.813926, recall=0.640322, F1=0.716762


0it [00:00, ?it/s]

dev 47: loss=0.187906, precision=0.633963, recall=0.478191, F1=0.545168


23718333it [04:09, 95205.41it/s] 


train 48: loss=0.105291, precision=0.815542, recall=0.643355, F1=0.719287


0it [00:00, ?it/s]

dev 48: loss=0.185183, precision=0.639192, recall=0.476835, F1=0.546204


23718333it [04:09, 95158.47it/s] 


train 49: loss=0.106053, precision=0.814239, recall=0.640478, F1=0.716981


0it [00:00, ?it/s]

dev 49: loss=0.183132, precision=0.646533, recall=0.471916, F1=0.545593


23718333it [04:09, 95107.47it/s] 


train 50: loss=0.107939, precision=0.810317, recall=0.632393, F1=0.710384


0it [00:00, ?it/s]

dev 50: loss=0.182128, precision=0.64335, recall=0.471672, F1=0.544295


23718333it [04:09, 95117.99it/s] 


train 51: loss=0.107059, precision=0.811653, recall=0.635679, F1=0.712968


0it [00:00, ?it/s]

dev 51: loss=0.184172, precision=0.629897, recall=0.484137, F1=0.547481


23718333it [04:09, 95164.37it/s] 


train 52: loss=0.107213, precision=0.811124, recall=0.634993, F1=0.712333


0it [00:00, ?it/s]

dev 52: loss=0.183883, precision=0.644973, recall=0.47016, F1=0.543864


23718333it [04:09, 95148.84it/s] 


train 53: loss=0.1064, precision=0.813463, recall=0.639112, F1=0.715824


0it [00:00, ?it/s]

dev 53: loss=0.18827, precision=0.634926, recall=0.482207, F1=0.548128


23718333it [04:09, 95169.68it/s] 


train 54: loss=0.106742, precision=0.812098, recall=0.637059, F1=0.714007


0it [00:00, ?it/s]

dev 54: loss=0.183684, precision=0.639552, recall=0.48591, F1=0.552244


23718333it [04:09, 95175.90it/s] 


train 55: loss=0.106466, precision=0.812908, recall=0.638593, F1=0.715284


0it [00:00, ?it/s]

dev 55: loss=0.185309, precision=0.642329, recall=0.476105, F1=0.546865


23718333it [04:09, 95151.87it/s] 


train 56: loss=0.106887, precision=0.812331, recall=0.63688, F1=0.713985


0it [00:00, ?it/s]

dev 56: loss=0.187657, precision=0.633516, recall=0.478504, F1=0.545206


23718333it [04:09, 95088.69it/s] 


train 57: loss=0.107181, precision=0.811814, recall=0.636495, F1=0.713543


0it [00:00, ?it/s]

dev 57: loss=0.185041, precision=0.647217, recall=0.469534, F1=0.54424


23718333it [04:09, 94894.55it/s] 


train 58: loss=0.106277, precision=0.812989, recall=0.639453, F1=0.715854


0it [00:00, ?it/s]

dev 58: loss=0.187604, precision=0.641275, recall=0.476662, F1=0.546849


23718333it [04:09, 95127.47it/s] 


train 59: loss=0.105694, precision=0.814142, recall=0.641855, F1=0.717805


0it [00:00, ?it/s]

dev 59: loss=0.18914, precision=0.633257, recall=0.482624, F1=0.547774


23718333it [04:09, 95122.13it/s] 


train 60: loss=0.105968, precision=0.813535, recall=0.640471, F1=0.716703


0it [00:00, ?it/s]

dev 60: loss=0.186023, precision=0.641958, recall=0.476957, F1=0.547292


23718333it [04:09, 95145.35it/s] 


train 61: loss=0.106478, precision=0.812617, recall=0.637755, F1=0.714645


0it [00:00, ?it/s]

dev 61: loss=0.184513, precision=0.658276, recall=0.457347, F1=0.539718


23718333it [04:16, 92610.74it/s] 


train 62: loss=0.107474, precision=0.810831, recall=0.634501, F1=0.71191


0it [00:00, ?it/s]

dev 62: loss=0.187137, precision=0.637214, recall=0.479547, F1=0.547251


12191419it [02:08, 94210.27it/s] 