In [1]:
import numpy as np
import os
import signal
import json

from parlai.core.agents import create_agent, create_agent_from_shared, get_agent_module
from parlai.core.worlds import create_task
from parlai.core.params import ParlaiParser
from parlai.core.utils import Timer, round_sigfigs, warn_once
from parlai.core.logs import TensorboardLogger
from parlai.scripts.build_dict import build_dict, setup_args as setup_dict_args
from parlai.core.distributed_utils import (
    sync_object, is_primary_worker, all_gather_list, is_distributed, num_workers
)
from parlai.scripts.build_pytorch_data import get_pyt_dict_file
from parlai.scripts.train_model import *

In [2]:
task = "personachat"
model = "seq2seq"
batch_size = 36
lr = 1e-2
hidden_size = 128
args = f"""-m parlai.scripts.train_model -m {model} 
           -t {task} 
           -mf '/tmp/model' 
           -bs {batch_size} 
           -lr {lr} 
           -hs {hidden_size}"""

In [3]:
# Create dictionary of arguments
opt = setup_args().parse_args(args.split())

[ Main ParlAI Arguments: ] 
[  batchsize: 36 ]
[  datapath: C:\Users\snowy\Documents\6.864\final-project\Persona-Chat-6.864\data ]
[  datatype: train ]
[  download_path: C:\Users\snowy\Documents\6.864\final-project\Persona-Chat-6.864\downloads ]
[  hide_labels: False ]
[  image_mode: raw ]
[  multitask_weights: [1] ]
[  numthreads: 1 ]
[  show_advanced_args: False ]
[  task: personachat ]
[ ParlAI Model Arguments: ] 
[  dict_class: parlai.core.dict:DictionaryAgent ]
[  init_model: None ]
[  model: seq2seq ]
[  model_file: '/tmp/model' ]
[ Training Loop Arguments: ] 
[  dict_build_first: True ]
[  display_examples: False ]
[  eval_batchsize: None ]
[  evaltask: None ]
[  load_from_checkpoint: False ]
[  max_train_time: -1 ]
[  num_epochs: -1 ]
[  save_after_valid: False ]
[  save_every_n_secs: -1 ]
[  validation_cutoff: 1.0 ]
[  validation_every_n_epochs: -1 ]
[  validation_every_n_secs: -1 ]
[  validation_max_exs: -1 ]
[  validation_metric: accuracy ]
[  validation_metric_mode: None ]


In [4]:
# Agent is essentially an object for our 'Seq2Seq' model
agent = create_agent(opt)
# World is a BatchWorld object that stores the data and task at hand
# It contains a list (worlds) of DialogPartnerWorld objects
world = create_task(opt, agent)

[ no model with opt yet at: '/tmp/model'(.opt) ]
Dictionary: loading dictionary from '/tmp/model'.dict
[ num words =  18745 ]
[ Using CUDA ]




[ Loading existing model params from '/tmp/model' ]
[creating task(s): personachat]
[loading fbdialog data:C:\Users\snowy\Documents\6.864\final-project\Persona-Chat-6.864\data\Persona-Chat\personachat\train_self_original.txt]


  warn_once("LR scheduler is different from saved. Starting fresh!")


In [5]:
len(world.worlds) # = batch_size

36

Below I show how we run matches with "worlds" and "agents". I have made print statements inside parlai.core.worlds and modules to show what's happening internally. 

It seems agent 0 is simply data extraction and agent 1 is the model (receives observation from agent 0). The data is ordered such that batch 1 is persona + first dialogue line and batch two just provides the next observed lines (but as you can see from the input lines being printed for agent 1 the previous dialogue are still in the sequence). I'm pretty sure everything is saved in agent.batch_observations.

You can look in data/train_both_original.txt for the dialogue. 

In [6]:
# This is how we loop through batches
with world:
    b = 0
    while True:
        print("Batch =", b)
        b += 1
        # do one example / batch of examples
        world.parley()
        
        if b > 6:
            break

Batch = 0
Batch = 1
Batch = 2
Batch = 3
Batch = 4
Batch = 5
Batch = 6


In [7]:
# This is where dialogue is stored (same as above but concatenated)
print(world.batch_observations[1][0]['text'])

your persona: i was born in kansas.
your persona: i work on a farm.
your persona: i know how to drive a truck.
your persona: i love chickens.
your persona: i eat eggs for breakfast.
hi how are you today
hi , i just love the farm life !
hi how are you today
i am fine thank you . i cant wait for breakfast !
what will you eat then
i am certainly having eggs ! hopefully my chickens will have eggs .
nice let me have some fresh eggs
i raise the chickens on my farm here in kansas .
nice where do you work
i work right here at home on the family farm in kansas .
do you listen to music
all the time in my truck that i know how to drive .
what kind of truck is it


# Training Models

In [8]:
# All the exciting stuff happens in parlai.scripts.train_model, essentially we can just modify
# parameters in opt (batch size, lr, etc) and/or rewrite the training loop.
trainer = TrainLoop(opt)

[ building dictionary first... ]
[ dictionary already built .]
[ no model with opt yet at: '/tmp/model'(.opt) ]
Dictionary: loading dictionary from '/tmp/model'.dict
[ num words =  18745 ]
[ Using CUDA ]
[ Loading existing model params from '/tmp/model' ]
[creating task(s): personachat]
[loading fbdialog data:C:\Users\snowy\Documents\6.864\final-project\Persona-Chat-6.864\data\Persona-Chat\personachat\train_self_original.txt]
[ training... ]


In [11]:
# We should probably delete the internal print statements ;) 
trainer.train()

[ time:30592.0s total_exs:8719308 epochs:132.68 ] {'exs': 612, 'lr': 0.01, 'num_updates': 182660, 'loss': 6.146, 'token_acc': 0.2338, 'nll_loss': 6.146, 'ppl': 466.8}
[ time:30594.0s total_exs:8720172 epochs:132.69 ] {'exs': 864, 'lr': 0.01, 'num_updates': 182684, 'loss': 141.5, 'token_acc': 0.2485, 'nll_loss': 5.897, 'ppl': 363.8}
[ time:30596.0s total_exs:8721036 epochs:132.7 ] {'exs': 864, 'lr': 0.01, 'num_updates': 182708, 'loss': 140.5, 'token_acc': 0.2506, 'nll_loss': 5.853, 'ppl': 348.4}
[ time:30598.0s total_exs:8721864 epochs:132.71 ] {'exs': 828, 'lr': 0.01, 'num_updates': 182731, 'loss': 133.6, 'token_acc': 0.2512, 'nll_loss': 5.808, 'ppl': 333.1}
[ time:30600.0s total_exs:8722692 epochs:132.73 ] {'exs': 828, 'lr': 0.01, 'num_updates': 182754, 'loss': 134.4, 'token_acc': 0.253, 'nll_loss': 5.841, 'ppl': 344.2}
[ time:30602.0s total_exs:8723556 epochs:132.74 ] {'exs': 864, 'lr': 0.01, 'num_updates': 182778, 'loss': 140.6, 'token_acc': 0.2505, 'nll_loss': 5.857, 'ppl': 349.5}


[ time:30694.0s total_exs:8762616 epochs:133.33 ] {'exs': 864, 'lr': 0.01, 'num_updates': 183863, 'loss': 138.8, 'token_acc': 0.2591, 'nll_loss': 5.783, 'ppl': 324.6}
[ time:30696.0s total_exs:8763516 epochs:133.35 ] {'exs': 900, 'lr': 0.01, 'num_updates': 183888, 'loss': 148.3, 'token_acc': 0.2467, 'nll_loss': 5.938, 'ppl': 379.2}
[ time:30698.0s total_exs:8764416 epochs:133.36 ] {'exs': 900, 'lr': 0.01, 'num_updates': 183913, 'loss': 145.7, 'token_acc': 0.2584, 'nll_loss': 5.829, 'ppl': 340.0}
[ time:30701.0s total_exs:8765280 epochs:133.38 ] {'exs': 864, 'lr': 0.01, 'num_updates': 183937, 'loss': 139.9, 'token_acc': 0.2519, 'nll_loss': 5.837, 'ppl': 342.6}
[ time:30703.0s total_exs:8766180 epochs:133.39 ] {'exs': 900, 'lr': 0.01, 'num_updates': 183962, 'loss': 144.3, 'token_acc': 0.2583, 'nll_loss': 5.777, 'ppl': 322.8}
[ time:30705.0s total_exs:8767080 epochs:133.4 ] {'exs': 900, 'lr': 0.01, 'num_updates': 183987, 'loss': 150.0, 'token_acc': 0.24, 'nll_loss': 5.999, 'ppl': 403.1}
[

[ time:30796.0s total_exs:8806860 epochs:134.01 ] {'exs': 864, 'lr': 0.01, 'num_updates': 185092, 'loss': 142.1, 'token_acc': 0.2484, 'nll_loss': 5.921, 'ppl': 372.6}
[ time:30799.0s total_exs:8807796 epochs:134.02 ] {'exs': 936, 'lr': 0.01, 'num_updates': 185118, 'loss': 151.6, 'token_acc': 0.2556, 'nll_loss': 5.832, 'ppl': 341.1}
[ time:30801.0s total_exs:8808660 epochs:134.04 ] {'exs': 864, 'lr': 0.01, 'num_updates': 185142, 'loss': 140.0, 'token_acc': 0.253, 'nll_loss': 5.832, 'ppl': 340.9}
[ time:30803.0s total_exs:8809596 epochs:134.05 ] {'exs': 936, 'lr': 0.01, 'num_updates': 185168, 'loss': 149.8, 'token_acc': 0.2598, 'nll_loss': 5.765, 'ppl': 318.8}
[ time:30805.0s total_exs:8810424 epochs:134.06 ] {'exs': 828, 'lr': 0.01, 'num_updates': 185191, 'loss': 134.7, 'token_acc': 0.2484, 'nll_loss': 5.858, 'ppl': 350.0}
[ time:30807.0s total_exs:8811288 epochs:134.08 ] {'exs': 864, 'lr': 0.01, 'num_updates': 185215, 'loss': 141.3, 'token_acc': 0.2509, 'nll_loss': 5.89, 'ppl': 361.3}


[ time:30899.0s total_exs:8851068 epochs:134.68 ] {'exs': 936, 'lr': 0.01, 'num_updates': 186320, 'loss': 152.4, 'token_acc': 0.2516, 'nll_loss': 5.861, 'ppl': 351.2}
[ time:30901.0s total_exs:8852004 epochs:134.69 ] {'exs': 936, 'lr': 0.01, 'num_updates': 186346, 'loss': 151.8, 'token_acc': 0.2541, 'nll_loss': 5.839, 'ppl': 343.6}
[ time:30903.0s total_exs:8852868 epochs:134.71 ] {'exs': 864, 'lr': 0.01, 'num_updates': 186370, 'loss': 138.6, 'token_acc': 0.2525, 'nll_loss': 5.773, 'ppl': 321.5}
[ time:30905.0s total_exs:8853732 epochs:134.72 ] {'exs': 864, 'lr': 0.01, 'num_updates': 186394, 'loss': 141.3, 'token_acc': 0.2496, 'nll_loss': 5.889, 'ppl': 361.0}
[ time:30907.0s total_exs:8854632 epochs:134.73 ] {'exs': 900, 'lr': 0.01, 'num_updates': 186419, 'loss': 147.0, 'token_acc': 0.2555, 'nll_loss': 5.881, 'ppl': 358.1}
[ time:30909.0s total_exs:8855532 epochs:134.75 ] {'exs': 900, 'lr': 0.01, 'num_updates': 186444, 'loss': 149.1, 'token_acc': 0.2435, 'nll_loss': 5.966, 'ppl': 389.8

[ time:31001.0s total_exs:8895492 epochs:135.36 ] {'exs': 900, 'lr': 0.01, 'num_updates': 187554, 'loss': 146.5, 'token_acc': 0.2547, 'nll_loss': 5.862, 'ppl': 351.4}
[ time:31003.0s total_exs:8896356 epochs:135.37 ] {'exs': 864, 'lr': 0.01, 'num_updates': 187578, 'loss': 142.8, 'token_acc': 0.2444, 'nll_loss': 5.948, 'ppl': 382.9}
[ time:31005.0s total_exs:8897256 epochs:135.38 ] {'exs': 900, 'lr': 0.01, 'num_updates': 187603, 'loss': 145.6, 'token_acc': 0.2561, 'nll_loss': 5.824, 'ppl': 338.2}
[ time:31007.0s total_exs:8898120 epochs:135.4 ] {'exs': 864, 'lr': 0.01, 'num_updates': 187627, 'loss': 142.6, 'token_acc': 0.2414, 'nll_loss': 5.944, 'ppl': 381.5}
[ time:31009.0s total_exs:8898984 epochs:135.41 ] {'exs': 864, 'lr': 0.01, 'num_updates': 187651, 'loss': 140.5, 'token_acc': 0.247, 'nll_loss': 5.856, 'ppl': 349.4}
[ time:31011.0s total_exs:8899884 epochs:135.42 ] {'exs': 900, 'lr': 0.01, 'num_updates': 187676, 'loss': 146.4, 'token_acc': 0.2517, 'nll_loss': 5.858, 'ppl': 350.0}


[ time:31103.0s total_exs:8939664 epochs:136.03 ] {'exs': 864, 'lr': 0.01, 'num_updates': 188781, 'loss': 141.5, 'token_acc': 0.2379, 'nll_loss': 5.897, 'ppl': 363.8}
[ time:31105.0s total_exs:8940564 epochs:136.04 ] {'exs': 900, 'lr': 0.01, 'num_updates': 188806, 'loss': 146.2, 'token_acc': 0.252, 'nll_loss': 5.848, 'ppl': 346.5}
[ time:31107.0s total_exs:8941464 epochs:136.06 ] {'exs': 900, 'lr': 0.01, 'num_updates': 188831, 'loss': 145.6, 'token_acc': 0.2582, 'nll_loss': 5.824, 'ppl': 338.4}
[ time:31109.0s total_exs:8942328 epochs:136.07 ] {'exs': 864, 'lr': 0.01, 'num_updates': 188855, 'loss': 140.1, 'token_acc': 0.2547, 'nll_loss': 5.836, 'ppl': 342.4}
[ time:31112.0s total_exs:8943264 epochs:136.08 ] {'exs': 936, 'lr': 0.01, 'num_updates': 188881, 'loss': 154.2, 'token_acc': 0.2401, 'nll_loss': 5.931, 'ppl': 376.5}
[ time:31114.0s total_exs:8944164 epochs:136.1 ] {'exs': 900, 'lr': 0.01, 'num_updates': 188906, 'loss': 148.7, 'token_acc': 0.2494, 'nll_loss': 5.95, 'ppl': 383.6}
[

[ time:31205.0s total_exs:8983908 epochs:136.7 ] {'exs': 900, 'lr': 0.01, 'num_updates': 190010, 'loss': 145.6, 'token_acc': 0.2494, 'nll_loss': 5.822, 'ppl': 337.7}
[ time:31207.0s total_exs:8984808 epochs:136.72 ] {'exs': 900, 'lr': 0.01, 'num_updates': 190035, 'loss': 145.4, 'token_acc': 0.2515, 'nll_loss': 5.822, 'ppl': 337.6}
[ time:31209.0s total_exs:8985672 epochs:136.73 ] {'exs': 864, 'lr': 0.01, 'num_updates': 190059, 'loss': 141.0, 'token_acc': 0.2446, 'nll_loss': 5.88, 'ppl': 357.7}
[ time:31211.0s total_exs:8986608 epochs:136.74 ] {'exs': 936, 'lr': 0.01, 'num_updates': 190085, 'loss': 153.5, 'token_acc': 0.2443, 'nll_loss': 5.904, 'ppl': 366.4}
[ time:31213.0s total_exs:8987436 epochs:136.76 ] {'exs': 828, 'lr': 0.01, 'num_updates': 190108, 'loss': 134.0, 'token_acc': 0.2542, 'nll_loss': 5.827, 'ppl': 339.3}
[ time:31216.0s total_exs:8988336 epochs:136.77 ] {'exs': 900, 'lr': 0.01, 'num_updates': 190133, 'loss': 144.7, 'token_acc': 0.2601, 'nll_loss': 5.789, 'ppl': 326.6}


[ time:31307.0s total_exs:9028008 epochs:137.37 ] {'exs': 864, 'lr': 0.01, 'num_updates': 191235, 'loss': 141.4, 'token_acc': 0.2449, 'nll_loss': 5.894, 'ppl': 362.8}
[ time:31309.0s total_exs:9028944 epochs:137.39 ] {'exs': 936, 'lr': 0.01, 'num_updates': 191261, 'loss': 151.4, 'token_acc': 0.2552, 'nll_loss': 5.825, 'ppl': 338.7}
[ time:31312.0s total_exs:9029772 epochs:137.4 ] {'exs': 828, 'lr': 0.01, 'num_updates': 191284, 'loss': 133.7, 'token_acc': 0.2527, 'nll_loss': 5.816, 'ppl': 335.7}
[ time:31314.0s total_exs:9030636 epochs:137.41 ] {'exs': 864, 'lr': 0.01, 'num_updates': 191308, 'loss': 139.8, 'token_acc': 0.2572, 'nll_loss': 5.827, 'ppl': 339.5}
[ time:31316.0s total_exs:9031536 epochs:137.43 ] {'exs': 900, 'lr': 0.01, 'num_updates': 191333, 'loss': 145.8, 'token_acc': 0.251, 'nll_loss': 5.831, 'ppl': 340.8}
[ time:31318.0s total_exs:9032400 epochs:137.44 ] {'exs': 864, 'lr': 0.01, 'num_updates': 191357, 'loss': 140.5, 'token_acc': 0.2511, 'nll_loss': 5.858, 'ppl': 349.9}


[ time:31410.0s total_exs:9072288 epochs:138.05 ] {'exs': 936, 'lr': 0.01, 'num_updates': 192465, 'loss': 152.6, 'token_acc': 0.2553, 'nll_loss': 5.869, 'ppl': 353.9}
[ time:31412.0s total_exs:9073188 epochs:138.06 ] {'exs': 900, 'lr': 0.01, 'num_updates': 192490, 'loss': 147.6, 'token_acc': 0.25, 'nll_loss': 5.906, 'ppl': 367.3}
[ time:31414.0s total_exs:9074052 epochs:138.07 ] {'exs': 864, 'lr': 0.01, 'num_updates': 192514, 'loss': 141.1, 'token_acc': 0.2506, 'nll_loss': 5.882, 'ppl': 358.7}
[ time:31416.0s total_exs:9074952 epochs:138.09 ] {'exs': 900, 'lr': 0.01, 'num_updates': 192539, 'loss': 146.2, 'token_acc': 0.2523, 'nll_loss': 5.848, 'ppl': 346.7}
[ time:31418.0s total_exs:9075816 epochs:138.1 ] {'exs': 864, 'lr': 0.01, 'num_updates': 192563, 'loss': 141.1, 'token_acc': 0.2474, 'nll_loss': 5.881, 'ppl': 358.3}
[ time:31420.0s total_exs:9076716 epochs:138.11 ] {'exs': 900, 'lr': 0.01, 'num_updates': 192588, 'loss': 148.7, 'token_acc': 0.2446, 'nll_loss': 5.946, 'ppl': 382.1}
[

[ time:31512.0s total_exs:9116676 epochs:138.72 ] {'exs': 828, 'lr': 0.01, 'num_updates': 193698, 'loss': 134.5, 'token_acc': 0.2487, 'nll_loss': 5.851, 'ppl': 347.7}
[ time:31514.0s total_exs:9117540 epochs:138.74 ] {'exs': 864, 'lr': 0.01, 'num_updates': 193722, 'loss': 138.7, 'token_acc': 0.2539, 'nll_loss': 5.781, 'ppl': 324.2}
[ time:31516.0s total_exs:9118404 epochs:138.75 ] {'exs': 864, 'lr': 0.01, 'num_updates': 193746, 'loss': 139.1, 'token_acc': 0.2546, 'nll_loss': 5.795, 'ppl': 328.8}
[ time:31518.0s total_exs:9119232 epochs:138.76 ] {'exs': 828, 'lr': 0.01, 'num_updates': 193769, 'loss': 134.4, 'token_acc': 0.2518, 'nll_loss': 5.845, 'ppl': 345.5}
[ time:31520.0s total_exs:9120168 epochs:138.78 ] {'exs': 936, 'lr': 0.01, 'num_updates': 193795, 'loss': 152.9, 'token_acc': 0.2511, 'nll_loss': 5.884, 'ppl': 359.1}
[ time:31522.0s total_exs:9121068 epochs:138.79 ] {'exs': 900, 'lr': 0.01, 'num_updates': 193820, 'loss': 145.3, 'token_acc': 0.248, 'nll_loss': 5.812, 'ppl': 334.3}

[ time:31614.0s total_exs:9161280 epochs:139.4 ] {'exs': 900, 'lr': 0.01, 'num_updates': 194937, 'loss': 145.8, 'token_acc': 0.2495, 'nll_loss': 5.831, 'ppl': 340.8}
[ time:31616.0s total_exs:9162180 epochs:139.41 ] {'exs': 900, 'lr': 0.01, 'num_updates': 194962, 'loss': 146.4, 'token_acc': 0.2472, 'nll_loss': 5.856, 'ppl': 349.2}
[ time:31618.0s total_exs:9163116 epochs:139.43 ] {'exs': 936, 'lr': 0.01, 'num_updates': 194988, 'loss': 152.5, 'token_acc': 0.2469, 'nll_loss': 5.867, 'ppl': 353.1}
[ time:31620.0s total_exs:9164016 epochs:139.44 ] {'exs': 900, 'lr': 0.01, 'num_updates': 195013, 'loss': 146.7, 'token_acc': 0.244, 'nll_loss': 5.868, 'ppl': 353.6}
[ time:31622.0s total_exs:9164916 epochs:139.46 ] {'exs': 900, 'lr': 0.01, 'num_updates': 195038, 'loss': 144.4, 'token_acc': 0.2559, 'nll_loss': 5.776, 'ppl': 322.4}
[ time:31624.0s total_exs:9165852 epochs:139.47 ] {'exs': 936, 'lr': 0.01, 'num_updates': 195064, 'loss': 152.6, 'token_acc': 0.246, 'nll_loss': 5.868, 'ppl': 353.4}
[

[ time:31716.0s total_exs:9206028 epochs:140.08 ] {'exs': 900, 'lr': 0.01, 'num_updates': 196180, 'loss': 146.0, 'token_acc': 0.2518, 'nll_loss': 5.843, 'ppl': 344.9}
[ time:31718.0s total_exs:9206928 epochs:140.1 ] {'exs': 900, 'lr': 0.01, 'num_updates': 196205, 'loss': 145.2, 'token_acc': 0.2491, 'nll_loss': 5.81, 'ppl': 333.7}
[ time:31720.0s total_exs:9207828 epochs:140.11 ] {'exs': 900, 'lr': 0.01, 'num_updates': 196230, 'loss': 143.3, 'token_acc': 0.257, 'nll_loss': 5.733, 'ppl': 308.8}
[ time:31722.0s total_exs:9208692 epochs:140.12 ] {'exs': 864, 'lr': 0.01, 'num_updates': 196254, 'loss': 137.1, 'token_acc': 0.2592, 'nll_loss': 5.71, 'ppl': 302.0}
[ time:31724.0s total_exs:9209520 epochs:140.13 ] {'exs': 828, 'lr': 0.01, 'num_updates': 196277, 'loss': 135.5, 'token_acc': 0.237, 'nll_loss': 5.895, 'ppl': 363.3}
[ time:31726.0s total_exs:9210456 epochs:140.15 ] {'exs': 936, 'lr': 0.01, 'num_updates': 196303, 'loss': 153.3, 'token_acc': 0.2446, 'nll_loss': 5.897, 'ppl': 363.8}
[ t

[ time:31818.0s total_exs:9250056 epochs:140.75 ] {'exs': 936, 'lr': 0.01, 'num_updates': 197403, 'loss': 150.6, 'token_acc': 0.255, 'nll_loss': 5.796, 'ppl': 329.1}
[ time:31820.0s total_exs:9250884 epochs:140.76 ] {'exs': 828, 'lr': 0.01, 'num_updates': 197426, 'loss': 135.1, 'token_acc': 0.2475, 'nll_loss': 5.875, 'ppl': 356.1}
[ time:31822.0s total_exs:9251784 epochs:140.78 ] {'exs': 900, 'lr': 0.01, 'num_updates': 197451, 'loss': 144.8, 'token_acc': 0.2568, 'nll_loss': 5.792, 'ppl': 327.8}
[ time:31824.0s total_exs:9252648 epochs:140.79 ] {'exs': 864, 'lr': 0.01, 'num_updates': 197475, 'loss': 138.3, 'token_acc': 0.2577, 'nll_loss': 5.764, 'ppl': 318.5}
[ time:31826.0s total_exs:9253476 epochs:140.8 ] {'exs': 828, 'lr': 0.01, 'num_updates': 197498, 'loss': 136.9, 'token_acc': 0.2426, 'nll_loss': 5.956, 'ppl': 385.9}
[ time:31828.0s total_exs:9254340 epochs:140.82 ] {'exs': 864, 'lr': 0.01, 'num_updates': 197522, 'loss': 141.0, 'token_acc': 0.2434, 'nll_loss': 5.876, 'ppl': 356.4}


[ time:31920.0s total_exs:9293904 epochs:141.42 ] {'exs': 864, 'lr': 0.01, 'num_updates': 198621, 'loss': 140.1, 'token_acc': 0.2526, 'nll_loss': 5.838, 'ppl': 343.0}
[ time:31922.0s total_exs:9294732 epochs:141.43 ] {'exs': 828, 'lr': 0.01, 'num_updates': 198644, 'loss': 133.3, 'token_acc': 0.2456, 'nll_loss': 5.802, 'ppl': 331.0}
[ time:31924.0s total_exs:9295596 epochs:141.44 ] {'exs': 864, 'lr': 0.01, 'num_updates': 198668, 'loss': 140.3, 'token_acc': 0.254, 'nll_loss': 5.845, 'ppl': 345.7}
[ time:31926.0s total_exs:9296496 epochs:141.46 ] {'exs': 900, 'lr': 0.01, 'num_updates': 198693, 'loss': 146.3, 'token_acc': 0.2512, 'nll_loss': 5.854, 'ppl': 348.5}
[ time:31928.0s total_exs:9297432 epochs:141.47 ] {'exs': 936, 'lr': 0.01, 'num_updates': 198719, 'loss': 149.1, 'token_acc': 0.257, 'nll_loss': 5.735, 'ppl': 309.4}
[ time:31930.0s total_exs:9298260 epochs:141.49 ] {'exs': 828, 'lr': 0.01, 'num_updates': 198742, 'loss': 133.8, 'token_acc': 0.2529, 'nll_loss': 5.82, 'ppl': 337.0}
[

[ time:32022.0s total_exs:9338256 epochs:142.09 ] {'exs': 900, 'lr': 0.01, 'num_updates': 199853, 'loss': 145.9, 'token_acc': 0.2516, 'nll_loss': 5.839, 'ppl': 343.5}
[ time:32024.0s total_exs:9339156 epochs:142.11 ] {'exs': 900, 'lr': 0.01, 'num_updates': 199878, 'loss': 143.9, 'token_acc': 0.2539, 'nll_loss': 5.761, 'ppl': 317.6}
[ time:32026.0s total_exs:9339984 epochs:142.12 ] {'exs': 828, 'lr': 0.01, 'num_updates': 199901, 'loss': 134.2, 'token_acc': 0.2526, 'nll_loss': 5.837, 'ppl': 342.8}
[ time:32028.0s total_exs:9340812 epochs:142.13 ] {'exs': 828, 'lr': 0.01, 'num_updates': 199924, 'loss': 133.7, 'token_acc': 0.2501, 'nll_loss': 5.816, 'ppl': 335.8}
[ time:32030.0s total_exs:9341712 epochs:142.15 ] {'exs': 900, 'lr': 0.01, 'num_updates': 199949, 'loss': 145.0, 'token_acc': 0.2543, 'nll_loss': 5.802, 'ppl': 330.8}
[ time:32032.0s total_exs:9342576 epochs:142.16 ] {'exs': 864, 'lr': 0.01, 'num_updates': 199973, 'loss': 139.6, 'token_acc': 0.2517, 'nll_loss': 5.818, 'ppl': 336.4

[ time:32124.0s total_exs:9382680 epochs:142.77 ] {'exs': 936, 'lr': 0.01, 'num_updates': 201087, 'loss': 151.7, 'token_acc': 0.2478, 'nll_loss': 5.835, 'ppl': 342.2}
[ time:32126.0s total_exs:9383616 epochs:142.78 ] {'exs': 936, 'lr': 0.01, 'num_updates': 201113, 'loss': 152.2, 'token_acc': 0.2512, 'nll_loss': 5.858, 'ppl': 350.2}
[ time:32129.0s total_exs:9384552 epochs:142.8 ] {'exs': 936, 'lr': 0.01, 'num_updates': 201139, 'loss': 151.1, 'token_acc': 0.2521, 'nll_loss': 5.814, 'ppl': 335.0}
[ time:32131.0s total_exs:9385452 epochs:142.81 ] {'exs': 900, 'lr': 0.01, 'num_updates': 201164, 'loss': 145.8, 'token_acc': 0.2557, 'nll_loss': 5.833, 'ppl': 341.3}
[ time:32133.0s total_exs:9386352 epochs:142.83 ] {'exs': 900, 'lr': 0.01, 'num_updates': 201189, 'loss': 146.6, 'token_acc': 0.2463, 'nll_loss': 5.865, 'ppl': 352.6}
[ time:32135.0s total_exs:9387216 epochs:142.84 ] {'exs': 864, 'lr': 0.01, 'num_updates': 201213, 'loss': 137.8, 'token_acc': 0.255, 'nll_loss': 5.745, 'ppl': 312.7}


[ time:32226.0s total_exs:9427392 epochs:143.45 ] {'exs': 900, 'lr': 0.01, 'num_updates': 202329, 'loss': 143.8, 'token_acc': 0.254, 'nll_loss': 5.755, 'ppl': 315.6}
[ time:32228.0s total_exs:9428292 epochs:143.46 ] {'exs': 900, 'lr': 0.01, 'num_updates': 202354, 'loss': 146.5, 'token_acc': 0.248, 'nll_loss': 5.862, 'ppl': 351.4}
[ time:32231.0s total_exs:9429192 epochs:143.48 ] {'exs': 900, 'lr': 0.01, 'num_updates': 202379, 'loss': 147.6, 'token_acc': 0.2393, 'nll_loss': 5.902, 'ppl': 365.8}
[ time:32233.0s total_exs:9430092 epochs:143.49 ] {'exs': 900, 'lr': 0.01, 'num_updates': 202404, 'loss': 144.6, 'token_acc': 0.2537, 'nll_loss': 5.785, 'ppl': 325.4}
[ time:32235.0s total_exs:9430992 epochs:143.5 ] {'exs': 900, 'lr': 0.01, 'num_updates': 202429, 'loss': 143.6, 'token_acc': 0.2565, 'nll_loss': 5.743, 'ppl': 312.1}
[ time:32237.0s total_exs:9431856 epochs:143.52 ] {'exs': 864, 'lr': 0.01, 'num_updates': 202453, 'loss': 141.6, 'token_acc': 0.2471, 'nll_loss': 5.901, 'ppl': 365.4}
[

[ time:32329.0s total_exs:9471852 epochs:144.13 ] {'exs': 864, 'lr': 0.01, 'num_updates': 203564, 'loss': 139.8, 'token_acc': 0.249, 'nll_loss': 5.826, 'ppl': 339.0}
[ time:32331.0s total_exs:9472752 epochs:144.14 ] {'exs': 900, 'lr': 0.01, 'num_updates': 203589, 'loss': 146.4, 'token_acc': 0.2513, 'nll_loss': 5.855, 'ppl': 348.8}
[ time:32333.0s total_exs:9473616 epochs:144.15 ] {'exs': 864, 'lr': 0.01, 'num_updates': 203613, 'loss': 142.2, 'token_acc': 0.2422, 'nll_loss': 5.93, 'ppl': 376.1}
[ time:32335.0s total_exs:9474408 epochs:144.17 ] {'exs': 792, 'lr': 0.01, 'num_updates': 203635, 'loss': 128.2, 'token_acc': 0.2512, 'nll_loss': 5.827, 'ppl': 339.4}
[ time:32337.0s total_exs:9475308 epochs:144.18 ] {'exs': 900, 'lr': 0.01, 'num_updates': 203660, 'loss': 148.0, 'token_acc': 0.2418, 'nll_loss': 5.922, 'ppl': 373.2}
[ time:32339.0s total_exs:9476244 epochs:144.19 ] {'exs': 936, 'lr': 0.01, 'num_updates': 203686, 'loss': 152.8, 'token_acc': 0.2477, 'nll_loss': 5.877, 'ppl': 356.7}


[ time:32431.0s total_exs:9516240 epochs:144.8 ] {'exs': 900, 'lr': 0.01, 'num_updates': 204797, 'loss': 148.5, 'token_acc': 0.2405, 'nll_loss': 5.938, 'ppl': 379.4}
[ time:32433.0s total_exs:9517140 epochs:144.82 ] {'exs': 900, 'lr': 0.01, 'num_updates': 204822, 'loss': 143.5, 'token_acc': 0.2548, 'nll_loss': 5.738, 'ppl': 310.5}
[ time:32435.0s total_exs:9518076 epochs:144.83 ] {'exs': 936, 'lr': 0.01, 'num_updates': 204848, 'loss': 149.1, 'token_acc': 0.2555, 'nll_loss': 5.735, 'ppl': 309.6}
[ time:32437.0s total_exs:9518976 epochs:144.84 ] {'exs': 900, 'lr': 0.01, 'num_updates': 204873, 'loss': 145.9, 'token_acc': 0.2537, 'nll_loss': 5.838, 'ppl': 343.1}
[ time:32439.0s total_exs:9519840 epochs:144.86 ] {'exs': 864, 'lr': 0.01, 'num_updates': 204897, 'loss': 140.0, 'token_acc': 0.2514, 'nll_loss': 5.834, 'ppl': 341.8}
[ time:32441.0s total_exs:9520776 epochs:144.87 ] {'exs': 936, 'lr': 0.01, 'num_updates': 204923, 'loss': 151.3, 'token_acc': 0.2529, 'nll_loss': 5.819, 'ppl': 336.6}

[ time:32533.0s total_exs:9560916 epochs:145.48 ] {'exs': 936, 'lr': 0.01, 'num_updates': 206038, 'loss': 150.8, 'token_acc': 0.2491, 'nll_loss': 5.8, 'ppl': 330.2}
[ time:32535.0s total_exs:9561744 epochs:145.49 ] {'exs': 828, 'lr': 0.01, 'num_updates': 206061, 'loss': 135.0, 'token_acc': 0.2447, 'nll_loss': 5.871, 'ppl': 354.7}
[ time:32537.0s total_exs:9562608 epochs:145.51 ] {'exs': 864, 'lr': 0.01, 'num_updates': 206085, 'loss': 137.8, 'token_acc': 0.2555, 'nll_loss': 5.745, 'ppl': 312.6}
[ time:32539.0s total_exs:9563472 epochs:145.52 ] {'exs': 864, 'lr': 0.01, 'num_updates': 206109, 'loss': 140.4, 'token_acc': 0.2509, 'nll_loss': 5.849, 'ppl': 346.9}
[ time:32541.0s total_exs:9564336 epochs:145.53 ] {'exs': 864, 'lr': 0.01, 'num_updates': 206133, 'loss': 139.0, 'token_acc': 0.2522, 'nll_loss': 5.796, 'ppl': 328.9}
[ time:32543.0s total_exs:9565164 epochs:145.55 ] {'exs': 828, 'lr': 0.01, 'num_updates': 206156, 'loss': 134.2, 'token_acc': 0.2467, 'nll_loss': 5.838, 'ppl': 343.3}


[ time:32635.0s total_exs:9605196 epochs:146.16 ] {'exs': 900, 'lr': 0.01, 'num_updates': 207268, 'loss': 146.1, 'token_acc': 0.2581, 'nll_loss': 5.839, 'ppl': 343.5}
[ time:32637.0s total_exs:9606060 epochs:146.17 ] {'exs': 864, 'lr': 0.01, 'num_updates': 207292, 'loss': 139.6, 'token_acc': 0.2548, 'nll_loss': 5.819, 'ppl': 336.8}
[ time:32639.0s total_exs:9606996 epochs:146.18 ] {'exs': 936, 'lr': 0.01, 'num_updates': 207318, 'loss': 151.6, 'token_acc': 0.2529, 'nll_loss': 5.833, 'ppl': 341.3}
[ time:32641.0s total_exs:9607896 epochs:146.2 ] {'exs': 900, 'lr': 0.01, 'num_updates': 207343, 'loss': 144.6, 'token_acc': 0.2554, 'nll_loss': 5.787, 'ppl': 325.9}
[ time:32643.0s total_exs:9608796 epochs:146.21 ] {'exs': 900, 'lr': 0.01, 'num_updates': 207368, 'loss': 145.8, 'token_acc': 0.2512, 'nll_loss': 5.835, 'ppl': 342.1}
[ time:32645.0s total_exs:9609696 epochs:146.22 ] {'exs': 900, 'lr': 0.01, 'num_updates': 207393, 'loss': 144.8, 'token_acc': 0.255, 'nll_loss': 5.791, 'ppl': 327.4}


[ time:32737.0s total_exs:9649764 epochs:146.83 ] {'exs': 936, 'lr': 0.01, 'num_updates': 208506, 'loss': 150.3, 'token_acc': 0.2479, 'nll_loss': 5.782, 'ppl': 324.4}
[ time:32739.0s total_exs:9650592 epochs:146.85 ] {'exs': 828, 'lr': 0.01, 'num_updates': 208529, 'loss': 133.0, 'token_acc': 0.2515, 'nll_loss': 5.782, 'ppl': 324.3}
[ time:32741.0s total_exs:9651456 epochs:146.86 ] {'exs': 864, 'lr': 0.01, 'num_updates': 208553, 'loss': 141.0, 'token_acc': 0.2478, 'nll_loss': 5.877, 'ppl': 356.9}
[ time:32743.0s total_exs:9652356 epochs:146.87 ] {'exs': 900, 'lr': 0.01, 'num_updates': 208578, 'loss': 146.3, 'token_acc': 0.2427, 'nll_loss': 5.852, 'ppl': 348.0}
[ time:32745.0s total_exs:9653256 epochs:146.89 ] {'exs': 900, 'lr': 0.01, 'num_updates': 208603, 'loss': 144.9, 'token_acc': 0.2548, 'nll_loss': 5.8, 'ppl': 330.2}
[ time:32747.0s total_exs:9654156 epochs:146.9 ] {'exs': 900, 'lr': 0.01, 'num_updates': 208628, 'loss': 145.6, 'token_acc': 0.2537, 'nll_loss': 5.825, 'ppl': 338.6}
[

[ time:32839.0s total_exs:9694260 epochs:147.51 ] {'exs': 864, 'lr': 0.01, 'num_updates': 209742, 'loss': 138.4, 'token_acc': 0.2574, 'nll_loss': 5.765, 'ppl': 319.0}
[ time:32842.0s total_exs:9695124 epochs:147.52 ] {'exs': 864, 'lr': 0.01, 'num_updates': 209766, 'loss': 140.7, 'token_acc': 0.2453, 'nll_loss': 5.867, 'ppl': 353.2}
[ time:32844.0s total_exs:9696060 epochs:147.54 ] {'exs': 936, 'lr': 0.01, 'num_updates': 209792, 'loss': 149.4, 'token_acc': 0.2614, 'nll_loss': 5.745, 'ppl': 312.6}
[ time:32846.0s total_exs:9696924 epochs:147.55 ] {'exs': 864, 'lr': 0.01, 'num_updates': 209816, 'loss': 138.9, 'token_acc': 0.2525, 'nll_loss': 5.787, 'ppl': 326.1}
[ time:32848.0s total_exs:9697788 epochs:147.56 ] {'exs': 864, 'lr': 0.01, 'num_updates': 209840, 'loss': 137.5, 'token_acc': 0.2575, 'nll_loss': 5.73, 'ppl': 307.9}
[ time:32850.0s total_exs:9698688 epochs:147.58 ] {'exs': 900, 'lr': 0.01, 'num_updates': 209865, 'loss': 143.5, 'token_acc': 0.2546, 'nll_loss': 5.745, 'ppl': 312.6}

[ time:32941.0s total_exs:9738540 epochs:148.18 ] {'exs': 864, 'lr': 0.01, 'num_updates': 210972, 'loss': 139.0, 'token_acc': 0.2497, 'nll_loss': 5.789, 'ppl': 326.8}
[ time:32943.0s total_exs:9739404 epochs:148.2 ] {'exs': 864, 'lr': 0.01, 'num_updates': 210996, 'loss': 138.4, 'token_acc': 0.2563, 'nll_loss': 5.768, 'ppl': 319.8}
[ time:32945.0s total_exs:9740304 epochs:148.21 ] {'exs': 900, 'lr': 0.01, 'num_updates': 211021, 'loss': 144.7, 'token_acc': 0.2535, 'nll_loss': 5.79, 'ppl': 327.1}
[ time:32948.0s total_exs:9741168 epochs:148.22 ] {'exs': 864, 'lr': 0.01, 'num_updates': 211045, 'loss': 137.1, 'token_acc': 0.2605, 'nll_loss': 5.719, 'ppl': 304.5}
[ time:32950.0s total_exs:9742068 epochs:148.24 ] {'exs': 900, 'lr': 0.01, 'num_updates': 211070, 'loss': 146.9, 'token_acc': 0.2458, 'nll_loss': 5.874, 'ppl': 355.7}
[ time:32952.0s total_exs:9742968 epochs:148.25 ] {'exs': 900, 'lr': 0.01, 'num_updates': 211095, 'loss': 146.3, 'token_acc': 0.2488, 'nll_loss': 5.852, 'ppl': 348.1}


[ time:33043.0s total_exs:9783288 epochs:148.87 ] {'exs': 900, 'lr': 0.01, 'num_updates': 212215, 'loss': 145.8, 'token_acc': 0.2542, 'nll_loss': 5.835, 'ppl': 342.1}
[ time:33045.0s total_exs:9784152 epochs:148.88 ] {'exs': 864, 'lr': 0.01, 'num_updates': 212239, 'loss': 138.7, 'token_acc': 0.2486, 'nll_loss': 5.779, 'ppl': 323.6}
[ time:33047.0s total_exs:9785052 epochs:148.89 ] {'exs': 900, 'lr': 0.01, 'num_updates': 212264, 'loss': 144.4, 'token_acc': 0.2517, 'nll_loss': 5.779, 'ppl': 323.4}
[ time:33049.0s total_exs:9785952 epochs:148.91 ] {'exs': 900, 'lr': 0.01, 'num_updates': 212289, 'loss': 148.8, 'token_acc': 0.2414, 'nll_loss': 5.953, 'ppl': 385.0}
[ time:33051.0s total_exs:9786816 epochs:148.92 ] {'exs': 864, 'lr': 0.01, 'num_updates': 212313, 'loss': 139.5, 'token_acc': 0.2519, 'nll_loss': 5.813, 'ppl': 334.7}
[ time:33053.0s total_exs:9787716 epochs:148.93 ] {'exs': 900, 'lr': 0.01, 'num_updates': 212338, 'loss': 146.2, 'token_acc': 0.2524, 'nll_loss': 5.85, 'ppl': 347.3}

[ time:33145.0s total_exs:9827496 epochs:149.54 ] {'exs': 900, 'lr': 0.01, 'num_updates': 213443, 'loss': 146.8, 'token_acc': 0.2491, 'nll_loss': 5.875, 'ppl': 355.9}
[ time:33147.0s total_exs:9828396 epochs:149.55 ] {'exs': 900, 'lr': 0.01, 'num_updates': 213468, 'loss': 144.4, 'token_acc': 0.2559, 'nll_loss': 5.776, 'ppl': 322.5}
[ time:33150.0s total_exs:9829296 epochs:149.57 ] {'exs': 900, 'lr': 0.01, 'num_updates': 213493, 'loss': 143.6, 'token_acc': 0.2559, 'nll_loss': 5.749, 'ppl': 314.0}
[ time:33152.0s total_exs:9830088 epochs:149.58 ] {'exs': 792, 'lr': 0.01, 'num_updates': 213515, 'loss': 126.6, 'token_acc': 0.2535, 'nll_loss': 5.759, 'ppl': 317.0}
[ time:33154.0s total_exs:9830988 epochs:149.59 ] {'exs': 900, 'lr': 0.01, 'num_updates': 213540, 'loss': 147.2, 'token_acc': 0.2484, 'nll_loss': 5.888, 'ppl': 360.8}
[ time:33156.0s total_exs:9831888 epochs:149.6 ] {'exs': 900, 'lr': 0.01, 'num_updates': 213565, 'loss': 145.2, 'token_acc': 0.2539, 'nll_loss': 5.807, 'ppl': 332.6}

[ time:33247.0s total_exs:9872136 epochs:150.22 ] {'exs': 900, 'lr': 0.01, 'num_updates': 214683, 'loss': 145.8, 'token_acc': 0.2441, 'nll_loss': 5.835, 'ppl': 342.1}
[ time:33249.0s total_exs:9873036 epochs:150.23 ] {'exs': 900, 'lr': 0.01, 'num_updates': 214708, 'loss': 143.7, 'token_acc': 0.2594, 'nll_loss': 5.747, 'ppl': 313.3}
[ time:33251.0s total_exs:9873936 epochs:150.24 ] {'exs': 900, 'lr': 0.01, 'num_updates': 214733, 'loss': 144.5, 'token_acc': 0.2553, 'nll_loss': 5.779, 'ppl': 323.5}
[ time:33253.0s total_exs:9874836 epochs:150.26 ] {'exs': 900, 'lr': 0.01, 'num_updates': 214758, 'loss': 144.9, 'token_acc': 0.253, 'nll_loss': 5.801, 'ppl': 330.6}
[ time:33255.0s total_exs:9875772 epochs:150.27 ] {'exs': 936, 'lr': 0.01, 'num_updates': 214784, 'loss': 149.3, 'token_acc': 0.257, 'nll_loss': 5.74, 'ppl': 311.2}
[ time:33257.0s total_exs:9876672 epochs:150.29 ] {'exs': 900, 'lr': 0.01, 'num_updates': 214809, 'loss': 144.7, 'token_acc': 0.2542, 'nll_loss': 5.793, 'ppl': 327.9}
[

[ time:33349.0s total_exs:9916380 epochs:150.89 ] {'exs': 936, 'lr': 0.01, 'num_updates': 215912, 'loss': 150.3, 'token_acc': 0.2558, 'nll_loss': 5.781, 'ppl': 323.9}
[ time:33351.0s total_exs:9917280 epochs:150.9 ] {'exs': 900, 'lr': 0.01, 'num_updates': 215937, 'loss': 144.2, 'token_acc': 0.2568, 'nll_loss': 5.766, 'ppl': 319.3}
[ time:33353.0s total_exs:9918180 epochs:150.92 ] {'exs': 900, 'lr': 0.01, 'num_updates': 215962, 'loss': 146.5, 'token_acc': 0.2456, 'nll_loss': 5.858, 'ppl': 349.9}
[ time:33355.0s total_exs:9919080 epochs:150.93 ] {'exs': 900, 'lr': 0.01, 'num_updates': 215987, 'loss': 144.4, 'token_acc': 0.254, 'nll_loss': 5.776, 'ppl': 322.4}
[ time:33357.0s total_exs:9919980 epochs:150.95 ] {'exs': 900, 'lr': 0.01, 'num_updates': 216012, 'loss': 145.8, 'token_acc': 0.2534, 'nll_loss': 5.834, 'ppl': 341.8}
[ time:33359.0s total_exs:9920844 epochs:150.96 ] {'exs': 864, 'lr': 0.01, 'num_updates': 216036, 'loss': 138.6, 'token_acc': 0.2533, 'nll_loss': 5.775, 'ppl': 322.3}


[ time:33451.0s total_exs:9960696 epochs:151.56 ] {'exs': 864, 'lr': 0.01, 'num_updates': 217143, 'loss': 137.6, 'token_acc': 0.259, 'nll_loss': 5.737, 'ppl': 310.1}
[ time:33453.0s total_exs:9961560 epochs:151.58 ] {'exs': 864, 'lr': 0.01, 'num_updates': 217167, 'loss': 139.8, 'token_acc': 0.2452, 'nll_loss': 5.826, 'ppl': 339.2}
[ time:33455.0s total_exs:9962460 epochs:151.59 ] {'exs': 900, 'lr': 0.01, 'num_updates': 217192, 'loss': 144.7, 'token_acc': 0.2487, 'nll_loss': 5.79, 'ppl': 327.1}
[ time:33458.0s total_exs:9963360 epochs:151.61 ] {'exs': 900, 'lr': 0.01, 'num_updates': 217217, 'loss': 144.5, 'token_acc': 0.2557, 'nll_loss': 5.785, 'ppl': 325.4}
[ time:33460.0s total_exs:9964260 epochs:151.62 ] {'exs': 900, 'lr': 0.01, 'num_updates': 217242, 'loss': 145.7, 'token_acc': 0.2502, 'nll_loss': 5.831, 'ppl': 340.6}
[ time:33462.0s total_exs:9965160 epochs:151.63 ] {'exs': 900, 'lr': 0.01, 'num_updates': 217267, 'loss': 146.4, 'token_acc': 0.2508, 'nll_loss': 5.859, 'ppl': 350.2}


[ time:33553.0s total_exs:10004976 epochs:152.24 ] {'exs': 900, 'lr': 0.01, 'num_updates': 218373, 'loss': 144.8, 'token_acc': 0.2476, 'nll_loss': 5.793, 'ppl': 328.1}
[ time:33556.0s total_exs:10005840 epochs:152.25 ] {'exs': 864, 'lr': 0.01, 'num_updates': 218397, 'loss': 138.7, 'token_acc': 0.2435, 'nll_loss': 5.78, 'ppl': 323.7}
[ time:33558.0s total_exs:10006704 epochs:152.27 ] {'exs': 864, 'lr': 0.01, 'num_updates': 218421, 'loss': 139.4, 'token_acc': 0.2486, 'nll_loss': 5.807, 'ppl': 332.7}
[ time:33560.0s total_exs:10007568 epochs:152.28 ] {'exs': 864, 'lr': 0.01, 'num_updates': 218445, 'loss': 139.1, 'token_acc': 0.2508, 'nll_loss': 5.797, 'ppl': 329.3}
[ time:33562.0s total_exs:10008468 epochs:152.29 ] {'exs': 900, 'lr': 0.01, 'num_updates': 218470, 'loss': 146.7, 'token_acc': 0.2482, 'nll_loss': 5.87, 'ppl': 354.4}
[ time:33564.0s total_exs:10009368 epochs:152.31 ] {'exs': 900, 'lr': 0.01, 'num_updates': 218495, 'loss': 145.1, 'token_acc': 0.2488, 'nll_loss': 5.804, 'ppl': 3

[ time:33654.0s total_exs:10048608 epochs:152.9 ] {'exs': 900, 'lr': 0.01, 'num_updates': 219585, 'loss': 144.5, 'token_acc': 0.2522, 'nll_loss': 5.782, 'ppl': 324.3}
[ time:33656.0s total_exs:10049436 epochs:152.92 ] {'exs': 828, 'lr': 0.01, 'num_updates': 219608, 'loss': 134.1, 'token_acc': 0.2453, 'nll_loss': 5.831, 'ppl': 340.7}
[ time:33658.0s total_exs:10050336 epochs:152.93 ] {'exs': 900, 'lr': 0.01, 'num_updates': 219633, 'loss': 143.1, 'token_acc': 0.2624, 'nll_loss': 5.721, 'ppl': 305.1}
[ time:33660.0s total_exs:10051272 epochs:152.94 ] {'exs': 936, 'lr': 0.01, 'num_updates': 219659, 'loss': 152.6, 'token_acc': 0.2456, 'nll_loss': 5.87, 'ppl': 354.3}
[ time:33662.0s total_exs:10052136 epochs:152.96 ] {'exs': 864, 'lr': 0.01, 'num_updates': 219683, 'loss': 138.6, 'token_acc': 0.256, 'nll_loss': 5.773, 'ppl': 321.4}
[ time:33664.0s total_exs:10053036 epochs:152.97 ] {'exs': 900, 'lr': 0.01, 'num_updates': 219708, 'loss': 146.2, 'token_acc': 0.2445, 'nll_loss': 5.848, 'ppl': 34

[ time:33754.0s total_exs:10092348 epochs:153.57 ] {'exs': 936, 'lr': 0.01, 'num_updates': 220800, 'loss': 151.4, 'token_acc': 0.2512, 'nll_loss': 5.822, 'ppl': 337.8}
[ time:33756.0s total_exs:10093248 epochs:153.58 ] {'exs': 900, 'lr': 0.01, 'num_updates': 220825, 'loss': 143.8, 'token_acc': 0.2562, 'nll_loss': 5.751, 'ppl': 314.6}
[ time:33758.0s total_exs:10094076 epochs:153.59 ] {'exs': 828, 'lr': 0.01, 'num_updates': 220848, 'loss': 133.1, 'token_acc': 0.2589, 'nll_loss': 5.789, 'ppl': 326.8}
[ time:33760.0s total_exs:10094940 epochs:153.61 ] {'exs': 864, 'lr': 0.01, 'num_updates': 220872, 'loss': 136.5, 'token_acc': 0.2609, 'nll_loss': 5.689, 'ppl': 295.5}
[ time:33762.0s total_exs:10095840 epochs:153.62 ] {'exs': 900, 'lr': 0.01, 'num_updates': 220897, 'loss': 145.5, 'token_acc': 0.2482, 'nll_loss': 5.817, 'ppl': 336.1}
[ time:33764.0s total_exs:10096776 epochs:153.64 ] {'exs': 936, 'lr': 0.01, 'num_updates': 220923, 'loss': 152.6, 'token_acc': 0.2452, 'nll_loss': 5.873, 'ppl':

[ time:33854.0s total_exs:10135260 epochs:154.22 ] {'exs': 864, 'lr': 0.01, 'num_updates': 221992, 'loss': 139.6, 'token_acc': 0.2499, 'nll_loss': 5.816, 'ppl': 335.6}
[ time:33856.0s total_exs:10136088 epochs:154.23 ] {'exs': 828, 'lr': 0.01, 'num_updates': 222015, 'loss': 133.5, 'token_acc': 0.2553, 'nll_loss': 5.805, 'ppl': 332.0}
[ time:33858.0s total_exs:10136916 epochs:154.25 ] {'exs': 828, 'lr': 0.01, 'num_updates': 222038, 'loss': 135.8, 'token_acc': 0.2494, 'nll_loss': 5.903, 'ppl': 366.1}
[ time:33860.0s total_exs:10137780 epochs:154.26 ] {'exs': 864, 'lr': 0.01, 'num_updates': 222062, 'loss': 139.8, 'token_acc': 0.2602, 'nll_loss': 5.825, 'ppl': 338.5}
[ time:33862.0s total_exs:10138644 epochs:154.27 ] {'exs': 864, 'lr': 0.01, 'num_updates': 222086, 'loss': 141.3, 'token_acc': 0.2485, 'nll_loss': 5.888, 'ppl': 360.7}
[ time:33864.0s total_exs:10139544 epochs:154.29 ] {'exs': 900, 'lr': 0.01, 'num_updates': 222111, 'loss': 145.4, 'token_acc': 0.2505, 'nll_loss': 5.816, 'ppl':

[ time:33954.0s total_exs:10177920 epochs:154.87 ] {'exs': 900, 'lr': 0.01, 'num_updates': 223177, 'loss': 142.1, 'token_acc': 0.2612, 'nll_loss': 5.681, 'ppl': 293.2}
[ time:33956.0s total_exs:10178784 epochs:154.88 ] {'exs': 864, 'lr': 0.01, 'num_updates': 223201, 'loss': 139.2, 'token_acc': 0.2525, 'nll_loss': 5.8, 'ppl': 330.2}
[ time:33958.0s total_exs:10179648 epochs:154.9 ] {'exs': 864, 'lr': 0.01, 'num_updates': 223225, 'loss': 138.2, 'token_acc': 0.2554, 'nll_loss': 5.761, 'ppl': 317.7}
[ time:33960.0s total_exs:10180512 epochs:154.91 ] {'exs': 864, 'lr': 0.01, 'num_updates': 223249, 'loss': 140.2, 'token_acc': 0.2448, 'nll_loss': 5.841, 'ppl': 344.0}
[ time:33962.0s total_exs:10181376 epochs:154.92 ] {'exs': 864, 'lr': 0.01, 'num_updates': 223273, 'loss': 139.5, 'token_acc': 0.2519, 'nll_loss': 5.811, 'ppl': 334.0}
[ time:33964.0s total_exs:10182312 epochs:154.94 ] {'exs': 936, 'lr': 0.01, 'num_updates': 223299, 'loss': 149.3, 'token_acc': 0.2529, 'nll_loss': 5.747, 'ppl': 31

[ time:34054.0s total_exs:10221372 epochs:155.53 ] {'exs': 900, 'lr': 0.01, 'num_updates': 224384, 'loss': 143.1, 'token_acc': 0.256, 'nll_loss': 5.727, 'ppl': 307.2}
[ time:34056.0s total_exs:10222272 epochs:155.55 ] {'exs': 900, 'lr': 0.01, 'num_updates': 224409, 'loss': 145.3, 'token_acc': 0.2498, 'nll_loss': 5.815, 'ppl': 335.1}
[ time:34058.0s total_exs:10223172 epochs:155.56 ] {'exs': 900, 'lr': 0.01, 'num_updates': 224434, 'loss': 143.3, 'token_acc': 0.2561, 'nll_loss': 5.733, 'ppl': 308.8}
[ time:34060.0s total_exs:10224072 epochs:155.57 ] {'exs': 900, 'lr': 0.01, 'num_updates': 224459, 'loss': 144.5, 'token_acc': 0.2538, 'nll_loss': 5.784, 'ppl': 325.1}
[ time:34062.0s total_exs:10224936 epochs:155.59 ] {'exs': 864, 'lr': 0.01, 'num_updates': 224483, 'loss': 141.4, 'token_acc': 0.2532, 'nll_loss': 5.889, 'ppl': 361.1}
[ time:34064.0s total_exs:10225836 epochs:155.6 ] {'exs': 900, 'lr': 0.01, 'num_updates': 224508, 'loss': 147.6, 'token_acc': 0.2428, 'nll_loss': 5.905, 'ppl': 3

[ time:34154.0s total_exs:10264752 epochs:156.19 ] {'exs': 864, 'lr': 0.01, 'num_updates': 225589, 'loss': 139.7, 'token_acc': 0.2503, 'nll_loss': 5.823, 'ppl': 338.1}
[ time:34156.0s total_exs:10265616 epochs:156.2 ] {'exs': 864, 'lr': 0.01, 'num_updates': 225613, 'loss': 139.2, 'token_acc': 0.2498, 'nll_loss': 5.8, 'ppl': 330.3}
[ time:34158.0s total_exs:10266480 epochs:156.22 ] {'exs': 864, 'lr': 0.01, 'num_updates': 225637, 'loss': 138.5, 'token_acc': 0.2579, 'nll_loss': 5.77, 'ppl': 320.4}
[ time:34160.0s total_exs:10267380 epochs:156.23 ] {'exs': 900, 'lr': 0.01, 'num_updates': 225662, 'loss': 145.0, 'token_acc': 0.2516, 'nll_loss': 5.805, 'ppl': 331.8}
[ time:34162.0s total_exs:10268280 epochs:156.25 ] {'exs': 900, 'lr': 0.01, 'num_updates': 225687, 'loss': 143.5, 'token_acc': 0.2536, 'nll_loss': 5.74, 'ppl': 311.1}
[ time:34164.0s total_exs:10269180 epochs:156.26 ] {'exs': 900, 'lr': 0.01, 'num_updates': 225712, 'loss': 144.3, 'token_acc': 0.2551, 'nll_loss': 5.775, 'ppl': 322.

[ time:34254.0s total_exs:10308240 epochs:156.85 ] {'exs': 936, 'lr': 0.01, 'num_updates': 226797, 'loss': 150.4, 'token_acc': 0.2497, 'nll_loss': 5.787, 'ppl': 325.9}
[ time:34256.0s total_exs:10309140 epochs:156.87 ] {'exs': 900, 'lr': 0.01, 'num_updates': 226822, 'loss': 145.0, 'token_acc': 0.2484, 'nll_loss': 5.804, 'ppl': 331.5}
[ time:34258.0s total_exs:10310004 epochs:156.88 ] {'exs': 864, 'lr': 0.01, 'num_updates': 226846, 'loss': 139.2, 'token_acc': 0.2513, 'nll_loss': 5.798, 'ppl': 329.7}
[ time:34260.0s total_exs:10310868 epochs:156.89 ] {'exs': 864, 'lr': 0.01, 'num_updates': 226870, 'loss': 138.5, 'token_acc': 0.2557, 'nll_loss': 5.773, 'ppl': 321.7}
[ time:34262.0s total_exs:10311768 epochs:156.91 ] {'exs': 900, 'lr': 0.01, 'num_updates': 226895, 'loss': 143.5, 'token_acc': 0.2568, 'nll_loss': 5.741, 'ppl': 311.4}
[ time:34264.0s total_exs:10312668 epochs:156.92 ] {'exs': 900, 'lr': 0.01, 'num_updates': 226920, 'loss': 147.0, 'token_acc': 0.2426, 'nll_loss': 5.878, 'ppl':

[ time:34354.0s total_exs:10351152 epochs:157.51 ] {'exs': 828, 'lr': 0.01, 'num_updates': 227989, 'loss': 134.0, 'token_acc': 0.2438, 'nll_loss': 5.829, 'ppl': 340.1}
[ time:34356.0s total_exs:10351944 epochs:157.52 ] {'exs': 792, 'lr': 0.01, 'num_updates': 228011, 'loss': 130.8, 'token_acc': 0.2355, 'nll_loss': 5.947, 'ppl': 382.5}
[ time:34358.0s total_exs:10352772 epochs:157.53 ] {'exs': 828, 'lr': 0.01, 'num_updates': 228034, 'loss': 131.9, 'token_acc': 0.2579, 'nll_loss': 5.734, 'ppl': 309.1}
[ time:34361.0s total_exs:10353636 epochs:157.54 ] {'exs': 864, 'lr': 0.01, 'num_updates': 228058, 'loss': 139.6, 'token_acc': 0.2495, 'nll_loss': 5.814, 'ppl': 334.9}
[ time:34363.0s total_exs:10354464 epochs:157.56 ] {'exs': 828, 'lr': 0.01, 'num_updates': 228081, 'loss': 132.3, 'token_acc': 0.2553, 'nll_loss': 5.751, 'ppl': 314.6}
[ time:34365.0s total_exs:10355328 epochs:157.57 ] {'exs': 864, 'lr': 0.01, 'num_updates': 228105, 'loss': 139.9, 'token_acc': 0.2523, 'nll_loss': 5.828, 'ppl':

[ time:34454.0s total_exs:10394172 epochs:158.16 ] {'exs': 900, 'lr': 0.01, 'num_updates': 229184, 'loss': 144.5, 'token_acc': 0.2474, 'nll_loss': 5.776, 'ppl': 322.5}
[ time:34456.0s total_exs:10395108 epochs:158.18 ] {'exs': 936, 'lr': 0.01, 'num_updates': 229210, 'loss': 149.4, 'token_acc': 0.2574, 'nll_loss': 5.748, 'ppl': 313.4}
[ time:34458.0s total_exs:10396008 epochs:158.19 ] {'exs': 900, 'lr': 0.01, 'num_updates': 229235, 'loss': 144.0, 'token_acc': 0.248, 'nll_loss': 5.76, 'ppl': 317.3}
[ time:34460.0s total_exs:10396908 epochs:158.2 ] {'exs': 900, 'lr': 0.01, 'num_updates': 229260, 'loss': 146.6, 'token_acc': 0.25, 'nll_loss': 5.863, 'ppl': 351.9}
[ time:34462.0s total_exs:10397808 epochs:158.22 ] {'exs': 900, 'lr': 0.01, 'num_updates': 229285, 'loss': 142.8, 'token_acc': 0.252, 'nll_loss': 5.712, 'ppl': 302.5}
[ time:34464.0s total_exs:10398708 epochs:158.23 ] {'exs': 900, 'lr': 0.01, 'num_updates': 229310, 'loss': 145.3, 'token_acc': 0.2542, 'nll_loss': 5.81, 'ppl': 333.5}

[ time:34554.0s total_exs:10437876 epochs:158.83 ] {'exs': 900, 'lr': 0.01, 'num_updates': 230398, 'loss': 145.3, 'token_acc': 0.2546, 'nll_loss': 5.812, 'ppl': 334.3}
[ time:34556.0s total_exs:10438776 epochs:158.84 ] {'exs': 900, 'lr': 0.01, 'num_updates': 230423, 'loss': 146.1, 'token_acc': 0.2472, 'nll_loss': 5.845, 'ppl': 345.4}
[ time:34558.0s total_exs:10439640 epochs:158.85 ] {'exs': 864, 'lr': 0.01, 'num_updates': 230447, 'loss': 137.8, 'token_acc': 0.2545, 'nll_loss': 5.743, 'ppl': 312.1}
[ time:34560.0s total_exs:10440576 epochs:158.87 ] {'exs': 936, 'lr': 0.01, 'num_updates': 230473, 'loss': 149.5, 'token_acc': 0.252, 'nll_loss': 5.752, 'ppl': 314.7}
[ time:34562.0s total_exs:10441476 epochs:158.88 ] {'exs': 900, 'lr': 0.01, 'num_updates': 230498, 'loss': 144.5, 'token_acc': 0.2542, 'nll_loss': 5.785, 'ppl': 325.3}
[ time:34564.0s total_exs:10442412 epochs:158.89 ] {'exs': 936, 'lr': 0.01, 'num_updates': 230524, 'loss': 150.5, 'token_acc': 0.2486, 'nll_loss': 5.789, 'ppl': 

[ time:34655.0s total_exs:10481832 epochs:159.49 ] {'exs': 900, 'lr': 0.01, 'num_updates': 231619, 'loss': 143.1, 'token_acc': 0.2579, 'nll_loss': 5.727, 'ppl': 306.9}
[ time:34657.0s total_exs:10482732 epochs:159.51 ] {'exs': 900, 'lr': 0.01, 'num_updates': 231644, 'loss': 141.9, 'token_acc': 0.2648, 'nll_loss': 5.68, 'ppl': 293.1}
[ time:34659.0s total_exs:10483596 epochs:159.52 ] {'exs': 864, 'lr': 0.01, 'num_updates': 231668, 'loss': 137.9, 'token_acc': 0.2539, 'nll_loss': 5.752, 'ppl': 314.7}
[ time:34661.0s total_exs:10484460 epochs:159.53 ] {'exs': 864, 'lr': 0.01, 'num_updates': 231692, 'loss': 137.8, 'token_acc': 0.2614, 'nll_loss': 5.746, 'ppl': 312.9}
[ time:34663.0s total_exs:10485324 epochs:159.55 ] {'exs': 864, 'lr': 0.01, 'num_updates': 231716, 'loss': 137.5, 'token_acc': 0.2597, 'nll_loss': 5.731, 'ppl': 308.3}
[ time:34665.0s total_exs:10486260 epochs:159.56 ] {'exs': 936, 'lr': 0.01, 'num_updates': 231742, 'loss': 149.2, 'token_acc': 0.2584, 'nll_loss': 5.736, 'ppl': 

[ time:34755.0s total_exs:10525608 epochs:160.16 ] {'exs': 864, 'lr': 0.01, 'num_updates': 232835, 'loss': 140.1, 'token_acc': 0.2507, 'nll_loss': 5.835, 'ppl': 342.1}
[ time:34757.0s total_exs:10526508 epochs:160.17 ] {'exs': 900, 'lr': 0.01, 'num_updates': 232860, 'loss': 144.8, 'token_acc': 0.2487, 'nll_loss': 5.79, 'ppl': 327.0}
[ time:34759.0s total_exs:10527408 epochs:160.19 ] {'exs': 900, 'lr': 0.01, 'num_updates': 232885, 'loss': 146.0, 'token_acc': 0.246, 'nll_loss': 5.842, 'ppl': 344.6}
[ time:34761.0s total_exs:10528272 epochs:160.2 ] {'exs': 864, 'lr': 0.01, 'num_updates': 232909, 'loss': 138.2, 'token_acc': 0.2611, 'nll_loss': 5.759, 'ppl': 317.0}
[ time:34763.0s total_exs:10529208 epochs:160.22 ] {'exs': 936, 'lr': 0.01, 'num_updates': 232935, 'loss': 150.2, 'token_acc': 0.2508, 'nll_loss': 5.777, 'ppl': 322.7}
[ time:34765.0s total_exs:10530036 epochs:160.23 ] {'exs': 828, 'lr': 0.01, 'num_updates': 232958, 'loss': 130.9, 'token_acc': 0.2651, 'nll_loss': 5.691, 'ppl': 29

[ time:34855.0s total_exs:10569420 epochs:160.83 ] {'exs': 900, 'lr': 0.01, 'num_updates': 234052, 'loss': 143.5, 'token_acc': 0.2559, 'nll_loss': 5.736, 'ppl': 309.9}
[ time:34857.0s total_exs:10570284 epochs:160.84 ] {'exs': 864, 'lr': 0.01, 'num_updates': 234076, 'loss': 139.7, 'token_acc': 0.2453, 'nll_loss': 5.825, 'ppl': 338.5}
[ time:34859.0s total_exs:10571184 epochs:160.85 ] {'exs': 900, 'lr': 0.01, 'num_updates': 234101, 'loss': 144.1, 'token_acc': 0.2485, 'nll_loss': 5.766, 'ppl': 319.3}
[ time:34861.0s total_exs:10572084 epochs:160.87 ] {'exs': 900, 'lr': 0.01, 'num_updates': 234126, 'loss': 145.6, 'token_acc': 0.248, 'nll_loss': 5.823, 'ppl': 337.8}
[ time:34863.0s total_exs:10573020 epochs:160.88 ] {'exs': 936, 'lr': 0.01, 'num_updates': 234152, 'loss': 150.7, 'token_acc': 0.2554, 'nll_loss': 5.797, 'ppl': 329.3}
[ time:34865.0s total_exs:10573884 epochs:160.9 ] {'exs': 864, 'lr': 0.01, 'num_updates': 234176, 'loss': 139.6, 'token_acc': 0.2467, 'nll_loss': 5.82, 'ppl': 33

[ time:34955.0s total_exs:10612908 epochs:161.49 ] {'exs': 864, 'lr': 0.01, 'num_updates': 235260, 'loss': 138.0, 'token_acc': 0.2512, 'nll_loss': 5.752, 'ppl': 314.8}
[ time:34957.0s total_exs:10613772 epochs:161.5 ] {'exs': 864, 'lr': 0.01, 'num_updates': 235284, 'loss': 138.0, 'token_acc': 0.2484, 'nll_loss': 5.75, 'ppl': 314.2}
[ time:34959.0s total_exs:10614636 epochs:161.52 ] {'exs': 864, 'lr': 0.01, 'num_updates': 235308, 'loss': 139.3, 'token_acc': 0.2534, 'nll_loss': 5.804, 'ppl': 331.5}
[ time:34961.0s total_exs:10615572 epochs:161.53 ] {'exs': 936, 'lr': 0.01, 'num_updates': 235334, 'loss': 151.1, 'token_acc': 0.2524, 'nll_loss': 5.813, 'ppl': 334.7}
[ time:34963.0s total_exs:10616400 epochs:161.54 ] {'exs': 828, 'lr': 0.01, 'num_updates': 235357, 'loss': 133.4, 'token_acc': 0.2529, 'nll_loss': 5.8, 'ppl': 330.4}
[ time:34965.0s total_exs:10617264 epochs:161.56 ] {'exs': 864, 'lr': 0.01, 'num_updates': 235381, 'loss': 138.9, 'token_acc': 0.2557, 'nll_loss': 5.788, 'ppl': 326

[ time:35055.0s total_exs:10656360 epochs:162.15 ] {'exs': 864, 'lr': 0.01, 'num_updates': 236467, 'loss': 136.8, 'token_acc': 0.2563, 'nll_loss': 5.7, 'ppl': 298.9}
[ time:35057.0s total_exs:10657260 epochs:162.16 ] {'exs': 900, 'lr': 0.01, 'num_updates': 236492, 'loss': 144.0, 'token_acc': 0.2543, 'nll_loss': 5.76, 'ppl': 317.5}
[ time:35059.0s total_exs:10658196 epochs:162.18 ] {'exs': 936, 'lr': 0.01, 'num_updates': 236518, 'loss': 150.1, 'token_acc': 0.2554, 'nll_loss': 5.773, 'ppl': 321.6}
[ time:35061.0s total_exs:10659096 epochs:162.19 ] {'exs': 900, 'lr': 0.01, 'num_updates': 236543, 'loss': 143.5, 'token_acc': 0.2569, 'nll_loss': 5.742, 'ppl': 311.6}
[ time:35063.0s total_exs:10659924 epochs:162.2 ] {'exs': 828, 'lr': 0.01, 'num_updates': 236566, 'loss': 132.8, 'token_acc': 0.2629, 'nll_loss': 5.772, 'ppl': 321.2}
[ time:35065.0s total_exs:10660752 epochs:162.22 ] {'exs': 828, 'lr': 0.01, 'num_updates': 236589, 'loss': 131.8, 'token_acc': 0.2578, 'nll_loss': 5.733, 'ppl': 309

[ time:35155.0s total_exs:10699740 epochs:162.81 ] {'exs': 936, 'lr': 0.01, 'num_updates': 237672, 'loss': 150.4, 'token_acc': 0.2512, 'nll_loss': 5.786, 'ppl': 325.6}
[ time:35157.0s total_exs:10700640 epochs:162.82 ] {'exs': 900, 'lr': 0.01, 'num_updates': 237697, 'loss': 144.9, 'token_acc': 0.2532, 'nll_loss': 5.802, 'ppl': 330.9}
[ time:35159.0s total_exs:10701504 epochs:162.84 ] {'exs': 864, 'lr': 0.01, 'num_updates': 237721, 'loss': 136.0, 'token_acc': 0.265, 'nll_loss': 5.666, 'ppl': 288.9}
[ time:35161.0s total_exs:10702368 epochs:162.85 ] {'exs': 864, 'lr': 0.01, 'num_updates': 237745, 'loss': 139.5, 'token_acc': 0.2471, 'nll_loss': 5.815, 'ppl': 335.4}
[ time:35163.0s total_exs:10703268 epochs:162.86 ] {'exs': 900, 'lr': 0.01, 'num_updates': 237770, 'loss': 145.0, 'token_acc': 0.2431, 'nll_loss': 5.798, 'ppl': 329.7}
[ time:35165.0s total_exs:10704132 epochs:162.88 ] {'exs': 864, 'lr': 0.01, 'num_updates': 237794, 'loss': 139.9, 'token_acc': 0.2518, 'nll_loss': 5.829, 'ppl': 

[ time:35255.0s total_exs:10743156 epochs:163.47 ] {'exs': 900, 'lr': 0.01, 'num_updates': 238878, 'loss': 143.6, 'token_acc': 0.259, 'nll_loss': 5.744, 'ppl': 312.4}
[ time:35257.0s total_exs:10744020 epochs:163.48 ] {'exs': 864, 'lr': 0.01, 'num_updates': 238902, 'loss': 138.6, 'token_acc': 0.2467, 'nll_loss': 5.778, 'ppl': 323.3}
[ time:35259.0s total_exs:10744848 epochs:163.5 ] {'exs': 828, 'lr': 0.01, 'num_updates': 238925, 'loss': 132.7, 'token_acc': 0.2566, 'nll_loss': 5.774, 'ppl': 321.9}
[ time:35261.0s total_exs:10745748 epochs:163.51 ] {'exs': 900, 'lr': 0.01, 'num_updates': 238950, 'loss': 143.2, 'token_acc': 0.2526, 'nll_loss': 5.728, 'ppl': 307.2}
[ time:35264.0s total_exs:10746612 epochs:163.52 ] {'exs': 864, 'lr': 0.01, 'num_updates': 238974, 'loss': 136.5, 'token_acc': 0.2583, 'nll_loss': 5.69, 'ppl': 295.9}
[ time:35266.0s total_exs:10747512 epochs:163.54 ] {'exs': 900, 'lr': 0.01, 'num_updates': 238999, 'loss': 143.1, 'token_acc': 0.2609, 'nll_loss': 5.726, 'ppl': 30

[ time:35355.0s total_exs:10786392 epochs:164.13 ] {'exs': 864, 'lr': 0.01, 'num_updates': 240079, 'loss': 139.4, 'token_acc': 0.257, 'nll_loss': 5.808, 'ppl': 333.0}
[ time:35357.0s total_exs:10787292 epochs:164.14 ] {'exs': 900, 'lr': 0.01, 'num_updates': 240104, 'loss': 143.6, 'token_acc': 0.259, 'nll_loss': 5.748, 'ppl': 313.5}
[ time:35359.0s total_exs:10788228 epochs:164.16 ] {'exs': 936, 'lr': 0.01, 'num_updates': 240130, 'loss': 149.0, 'token_acc': 0.2552, 'nll_loss': 5.737, 'ppl': 310.0}
[ time:35361.0s total_exs:10789128 epochs:164.17 ] {'exs': 900, 'lr': 0.01, 'num_updates': 240155, 'loss': 145.3, 'token_acc': 0.2445, 'nll_loss': 5.812, 'ppl': 334.2}
[ time:35363.0s total_exs:10790064 epochs:164.18 ] {'exs': 936, 'lr': 0.01, 'num_updates': 240181, 'loss': 151.3, 'token_acc': 0.254, 'nll_loss': 5.82, 'ppl': 336.9}
[ time:35365.0s total_exs:10790928 epochs:164.2 ] {'exs': 864, 'lr': 0.01, 'num_updates': 240205, 'loss': 138.5, 'token_acc': 0.2545, 'nll_loss': 5.772, 'ppl': 321.

[ time:35455.0s total_exs:10830060 epochs:164.79 ] {'exs': 900, 'lr': 0.01, 'num_updates': 241292, 'loss': 143.5, 'token_acc': 0.2563, 'nll_loss': 5.748, 'ppl': 313.5}
[ time:35457.0s total_exs:10830960 epochs:164.81 ] {'exs': 900, 'lr': 0.01, 'num_updates': 241317, 'loss': 143.0, 'token_acc': 0.2564, 'nll_loss': 5.723, 'ppl': 305.7}
[ time:35459.0s total_exs:10831896 epochs:164.82 ] {'exs': 936, 'lr': 0.01, 'num_updates': 241343, 'loss': 150.5, 'token_acc': 0.2547, 'nll_loss': 5.785, 'ppl': 325.4}
[ time:35461.0s total_exs:10832760 epochs:164.83 ] {'exs': 864, 'lr': 0.01, 'num_updates': 241367, 'loss': 135.1, 'token_acc': 0.2644, 'nll_loss': 5.627, 'ppl': 277.8}
[ time:35463.0s total_exs:10833624 epochs:164.85 ] {'exs': 864, 'lr': 0.01, 'num_updates': 241391, 'loss': 138.5, 'token_acc': 0.2424, 'nll_loss': 5.771, 'ppl': 320.9}
[ time:35465.0s total_exs:10834452 epochs:164.86 ] {'exs': 828, 'lr': 0.01, 'num_updates': 241414, 'loss': 131.6, 'token_acc': 0.2506, 'nll_loss': 5.726, 'ppl':

[ time:35555.0s total_exs:10873476 epochs:165.45 ] {'exs': 900, 'lr': 0.01, 'num_updates': 242498, 'loss': 144.8, 'token_acc': 0.2546, 'nll_loss': 5.789, 'ppl': 326.6}
[ time:35557.0s total_exs:10874340 epochs:165.47 ] {'exs': 864, 'lr': 0.01, 'num_updates': 242522, 'loss': 137.3, 'token_acc': 0.2575, 'nll_loss': 5.72, 'ppl': 305.0}
[ time:35559.0s total_exs:10875240 epochs:165.48 ] {'exs': 900, 'lr': 0.01, 'num_updates': 242547, 'loss': 146.5, 'token_acc': 0.2381, 'nll_loss': 5.861, 'ppl': 351.1}
[ time:35561.0s total_exs:10876140 epochs:165.49 ] {'exs': 900, 'lr': 0.01, 'num_updates': 242572, 'loss': 146.6, 'token_acc': 0.2397, 'nll_loss': 5.863, 'ppl': 351.9}
[ time:35563.0s total_exs:10877040 epochs:165.51 ] {'exs': 900, 'lr': 0.01, 'num_updates': 242597, 'loss': 148.7, 'token_acc': 0.2414, 'nll_loss': 5.949, 'ppl': 383.2}
[ time:35565.0s total_exs:10877904 epochs:165.52 ] {'exs': 864, 'lr': 0.01, 'num_updates': 242621, 'loss': 137.6, 'token_acc': 0.2537, 'nll_loss': 5.735, 'ppl': 

[ time:35655.0s total_exs:10916748 epochs:166.11 ] {'exs': 900, 'lr': 0.01, 'num_updates': 243700, 'loss': 145.1, 'token_acc': 0.2518, 'nll_loss': 5.803, 'ppl': 331.3}
[ time:35657.0s total_exs:10917648 epochs:166.13 ] {'exs': 900, 'lr': 0.01, 'num_updates': 243725, 'loss': 144.7, 'token_acc': 0.2526, 'nll_loss': 5.788, 'ppl': 326.5}
[ time:35659.0s total_exs:10918548 epochs:166.14 ] {'exs': 900, 'lr': 0.01, 'num_updates': 243750, 'loss': 143.3, 'token_acc': 0.2561, 'nll_loss': 5.732, 'ppl': 308.7}
[ time:35661.0s total_exs:10919412 epochs:166.15 ] {'exs': 864, 'lr': 0.01, 'num_updates': 243774, 'loss': 138.1, 'token_acc': 0.2518, 'nll_loss': 5.757, 'ppl': 316.4}
[ time:35663.0s total_exs:10920312 epochs:166.17 ] {'exs': 900, 'lr': 0.01, 'num_updates': 243799, 'loss': 143.1, 'token_acc': 0.2534, 'nll_loss': 5.721, 'ppl': 305.3}
[ time:35665.0s total_exs:10921176 epochs:166.18 ] {'exs': 864, 'lr': 0.01, 'num_updates': 243823, 'loss': 138.5, 'token_acc': 0.2606, 'nll_loss': 5.77, 'ppl': 

[ time:35755.0s total_exs:10960164 epochs:166.77 ] {'exs': 864, 'lr': 0.01, 'num_updates': 244906, 'loss': 139.7, 'token_acc': 0.2465, 'nll_loss': 5.824, 'ppl': 338.2}
[ time:35757.0s total_exs:10961064 epochs:166.79 ] {'exs': 900, 'lr': 0.01, 'num_updates': 244931, 'loss': 144.7, 'token_acc': 0.2525, 'nll_loss': 5.787, 'ppl': 326.0}
[ time:35759.0s total_exs:10961964 epochs:166.8 ] {'exs': 900, 'lr': 0.01, 'num_updates': 244956, 'loss': 142.1, 'token_acc': 0.263, 'nll_loss': 5.688, 'ppl': 295.2}
[ time:35761.0s total_exs:10962828 epochs:166.81 ] {'exs': 864, 'lr': 0.01, 'num_updates': 244980, 'loss': 136.9, 'token_acc': 0.257, 'nll_loss': 5.703, 'ppl': 299.8}
[ time:35763.0s total_exs:10963728 epochs:166.83 ] {'exs': 900, 'lr': 0.01, 'num_updates': 245005, 'loss': 144.9, 'token_acc': 0.255, 'nll_loss': 5.797, 'ppl': 329.3}
[ time:35765.0s total_exs:10964592 epochs:166.84 ] {'exs': 864, 'lr': 0.01, 'num_updates': 245029, 'loss': 138.8, 'token_acc': 0.2533, 'nll_loss': 5.784, 'ppl': 325

[ time:35855.0s total_exs:11003832 epochs:167.44 ] {'exs': 900, 'lr': 0.01, 'num_updates': 246119, 'loss': 146.4, 'token_acc': 0.2461, 'nll_loss': 5.856, 'ppl': 349.5}
[ time:35857.0s total_exs:11004732 epochs:167.45 ] {'exs': 900, 'lr': 0.01, 'num_updates': 246144, 'loss': 144.2, 'token_acc': 0.2516, 'nll_loss': 5.767, 'ppl': 319.7}
[ time:35859.0s total_exs:11005596 epochs:167.46 ] {'exs': 864, 'lr': 0.01, 'num_updates': 246168, 'loss': 140.5, 'token_acc': 0.251, 'nll_loss': 5.859, 'ppl': 350.3}
[ time:35861.0s total_exs:11006424 epochs:167.48 ] {'exs': 828, 'lr': 0.01, 'num_updates': 246191, 'loss': 132.1, 'token_acc': 0.2572, 'nll_loss': 5.742, 'ppl': 311.7}
[ time:35864.0s total_exs:11007324 epochs:167.49 ] {'exs': 900, 'lr': 0.01, 'num_updates': 246216, 'loss': 145.6, 'token_acc': 0.2486, 'nll_loss': 5.826, 'ppl': 338.9}
[ time:35866.0s total_exs:11008152 epochs:167.5 ] {'exs': 828, 'lr': 0.01, 'num_updates': 246239, 'loss': 130.3, 'token_acc': 0.2611, 'nll_loss': 5.667, 'ppl': 2

[ time:35955.0s total_exs:11047032 epochs:168.09 ] {'exs': 864, 'lr': 0.01, 'num_updates': 247319, 'loss': 136.4, 'token_acc': 0.261, 'nll_loss': 5.684, 'ppl': 294.0}
[ time:35957.0s total_exs:11047932 epochs:168.11 ] {'exs': 900, 'lr': 0.01, 'num_updates': 247344, 'loss': 145.0, 'token_acc': 0.2523, 'nll_loss': 5.802, 'ppl': 331.1}
[ time:35959.0s total_exs:11048796 epochs:168.12 ] {'exs': 864, 'lr': 0.01, 'num_updates': 247368, 'loss': 135.1, 'token_acc': 0.2644, 'nll_loss': 5.629, 'ppl': 278.3}
[ time:35961.0s total_exs:11049696 epochs:168.14 ] {'exs': 900, 'lr': 0.01, 'num_updates': 247393, 'loss': 141.4, 'token_acc': 0.2641, 'nll_loss': 5.654, 'ppl': 285.5}
[ time:35963.0s total_exs:11050632 epochs:168.15 ] {'exs': 936, 'lr': 0.01, 'num_updates': 247419, 'loss': 147.2, 'token_acc': 0.2626, 'nll_loss': 5.659, 'ppl': 286.9}
[ time:35965.0s total_exs:11051460 epochs:168.16 ] {'exs': 828, 'lr': 0.01, 'num_updates': 247442, 'loss': 130.9, 'token_acc': 0.2578, 'nll_loss': 5.693, 'ppl': 

[ time:36055.0s total_exs:11090340 epochs:168.75 ] {'exs': 864, 'lr': 0.01, 'num_updates': 248522, 'loss': 138.0, 'token_acc': 0.2542, 'nll_loss': 5.749, 'ppl': 314.0}
[ time:36057.0s total_exs:11091204 epochs:168.77 ] {'exs': 864, 'lr': 0.01, 'num_updates': 248546, 'loss': 137.7, 'token_acc': 0.2507, 'nll_loss': 5.742, 'ppl': 311.6}
[ time:36060.0s total_exs:11092104 epochs:168.78 ] {'exs': 900, 'lr': 0.01, 'num_updates': 248571, 'loss': 144.9, 'token_acc': 0.2533, 'nll_loss': 5.792, 'ppl': 327.6}
[ time:36062.0s total_exs:11092968 epochs:168.79 ] {'exs': 864, 'lr': 0.01, 'num_updates': 248595, 'loss': 137.0, 'token_acc': 0.2526, 'nll_loss': 5.708, 'ppl': 301.3}
[ time:36064.0s total_exs:11093832 epochs:168.81 ] {'exs': 864, 'lr': 0.01, 'num_updates': 248619, 'loss': 136.4, 'token_acc': 0.2594, 'nll_loss': 5.683, 'ppl': 293.9}
[ time:36066.0s total_exs:11094732 epochs:168.82 ] {'exs': 900, 'lr': 0.01, 'num_updates': 248644, 'loss': 141.6, 'token_acc': 0.2594, 'nll_loss': 5.669, 'ppl':

[ time:36156.0s total_exs:11133468 epochs:169.41 ] {'exs': 864, 'lr': 0.01, 'num_updates': 249720, 'loss': 136.4, 'token_acc': 0.2602, 'nll_loss': 5.683, 'ppl': 293.9}
[ time:36158.0s total_exs:11134332 epochs:169.42 ] {'exs': 864, 'lr': 0.01, 'num_updates': 249744, 'loss': 136.4, 'token_acc': 0.2547, 'nll_loss': 5.684, 'ppl': 294.1}
[ time:36160.0s total_exs:11135232 epochs:169.44 ] {'exs': 900, 'lr': 0.01, 'num_updates': 249769, 'loss': 144.0, 'token_acc': 0.2577, 'nll_loss': 5.761, 'ppl': 317.6}
[ time:36162.0s total_exs:11136060 epochs:169.45 ] {'exs': 828, 'lr': 0.01, 'num_updates': 249792, 'loss': 132.5, 'token_acc': 0.2489, 'nll_loss': 5.764, 'ppl': 318.6}
[ time:36164.0s total_exs:11136960 epochs:169.46 ] {'exs': 900, 'lr': 0.01, 'num_updates': 249817, 'loss': 142.1, 'token_acc': 0.2557, 'nll_loss': 5.685, 'ppl': 294.6}
[ time:36166.0s total_exs:11137860 epochs:169.48 ] {'exs': 900, 'lr': 0.01, 'num_updates': 249842, 'loss': 142.7, 'token_acc': 0.2588, 'nll_loss': 5.711, 'ppl':

[ time:36256.0s total_exs:11176884 epochs:170.07 ] {'exs': 828, 'lr': 0.01, 'num_updates': 250926, 'loss': 131.6, 'token_acc': 0.2588, 'nll_loss': 5.723, 'ppl': 305.8}
[ time:36258.0s total_exs:11177784 epochs:170.08 ] {'exs': 900, 'lr': 0.01, 'num_updates': 250951, 'loss': 144.5, 'token_acc': 0.2511, 'nll_loss': 5.784, 'ppl': 325.1}
[ time:36260.0s total_exs:11178612 epochs:170.1 ] {'exs': 828, 'lr': 0.01, 'num_updates': 250974, 'loss': 130.2, 'token_acc': 0.2606, 'nll_loss': 5.662, 'ppl': 287.8}
[ time:36262.0s total_exs:11179512 epochs:170.11 ] {'exs': 900, 'lr': 0.01, 'num_updates': 250999, 'loss': 146.3, 'token_acc': 0.2416, 'nll_loss': 5.851, 'ppl': 347.5}
[ time:36264.0s total_exs:11180376 epochs:170.12 ] {'exs': 864, 'lr': 0.01, 'num_updates': 251023, 'loss': 139.2, 'token_acc': 0.2538, 'nll_loss': 5.805, 'ppl': 331.9}
[ time:36266.0s total_exs:11181312 epochs:170.14 ] {'exs': 936, 'lr': 0.01, 'num_updates': 251049, 'loss': 149.7, 'token_acc': 0.2543, 'nll_loss': 5.757, 'ppl': 

[ time:36356.0s total_exs:11220372 epochs:170.73 ] {'exs': 864, 'lr': 0.01, 'num_updates': 252134, 'loss': 138.3, 'token_acc': 0.2533, 'nll_loss': 5.765, 'ppl': 318.8}
[ time:36358.0s total_exs:11221272 epochs:170.75 ] {'exs': 900, 'lr': 0.01, 'num_updates': 252159, 'loss': 142.7, 'token_acc': 0.2593, 'nll_loss': 5.708, 'ppl': 301.1}
[ time:36360.0s total_exs:11222172 epochs:170.76 ] {'exs': 900, 'lr': 0.01, 'num_updates': 252184, 'loss': 143.3, 'token_acc': 0.2555, 'nll_loss': 5.736, 'ppl': 309.9}
[ time:36362.0s total_exs:11223036 epochs:170.77 ] {'exs': 864, 'lr': 0.01, 'num_updates': 252208, 'loss': 136.3, 'token_acc': 0.2551, 'nll_loss': 5.683, 'ppl': 293.8}
[ time:36364.0s total_exs:11223972 epochs:170.79 ] {'exs': 936, 'lr': 0.01, 'num_updates': 252234, 'loss': 150.0, 'token_acc': 0.2472, 'nll_loss': 5.768, 'ppl': 319.9}
[ time:36366.0s total_exs:11224836 epochs:170.8 ] {'exs': 864, 'lr': 0.01, 'num_updates': 252258, 'loss': 137.6, 'token_acc': 0.2568, 'nll_loss': 5.735, 'ppl': 

[ time:36456.0s total_exs:11264040 epochs:171.4 ] {'exs': 900, 'lr': 0.01, 'num_updates': 253347, 'loss': 142.2, 'token_acc': 0.2618, 'nll_loss': 5.689, 'ppl': 295.6}
[ time:36458.0s total_exs:11264940 epochs:171.41 ] {'exs': 900, 'lr': 0.01, 'num_updates': 253372, 'loss': 143.3, 'token_acc': 0.2579, 'nll_loss': 5.733, 'ppl': 309.0}
[ time:36460.0s total_exs:11265804 epochs:171.42 ] {'exs': 864, 'lr': 0.01, 'num_updates': 253396, 'loss': 137.7, 'token_acc': 0.2511, 'nll_loss': 5.734, 'ppl': 309.3}
[ time:36462.0s total_exs:11266668 epochs:171.44 ] {'exs': 864, 'lr': 0.01, 'num_updates': 253420, 'loss': 136.9, 'token_acc': 0.2589, 'nll_loss': 5.706, 'ppl': 300.8}
[ time:36464.0s total_exs:11267568 epochs:171.45 ] {'exs': 900, 'lr': 0.01, 'num_updates': 253445, 'loss': 140.8, 'token_acc': 0.262, 'nll_loss': 5.636, 'ppl': 280.4}
[ time:36466.0s total_exs:11268504 epochs:171.46 ] {'exs': 936, 'lr': 0.01, 'num_updates': 253471, 'loss': 148.6, 'token_acc': 0.2529, 'nll_loss': 5.716, 'ppl': 3

[ time:36556.0s total_exs:11307456 epochs:172.06 ] {'exs': 900, 'lr': 0.01, 'num_updates': 254553, 'loss': 142.7, 'token_acc': 0.2561, 'nll_loss': 5.707, 'ppl': 300.9}
[ time:36558.0s total_exs:11308320 epochs:172.07 ] {'exs': 864, 'lr': 0.01, 'num_updates': 254577, 'loss': 135.7, 'token_acc': 0.2589, 'nll_loss': 5.658, 'ppl': 286.5}
[ time:36560.0s total_exs:11309220 epochs:172.08 ] {'exs': 900, 'lr': 0.01, 'num_updates': 254602, 'loss': 142.0, 'token_acc': 0.2561, 'nll_loss': 5.678, 'ppl': 292.4}
[ time:36562.0s total_exs:11310120 epochs:172.1 ] {'exs': 900, 'lr': 0.01, 'num_updates': 254627, 'loss': 143.0, 'token_acc': 0.2533, 'nll_loss': 5.719, 'ppl': 304.7}
[ time:36564.0s total_exs:11311020 epochs:172.11 ] {'exs': 900, 'lr': 0.01, 'num_updates': 254652, 'loss': 142.2, 'token_acc': 0.2561, 'nll_loss': 5.69, 'ppl': 295.9}
[ time:36566.0s total_exs:11311884 epochs:172.13 ] {'exs': 864, 'lr': 0.01, 'num_updates': 254676, 'loss': 138.2, 'token_acc': 0.2502, 'nll_loss': 5.755, 'ppl': 3

[ time:36656.0s total_exs:11351232 epochs:172.72 ] {'exs': 900, 'lr': 0.01, 'num_updates': 255769, 'loss': 142.6, 'token_acc': 0.2552, 'nll_loss': 5.703, 'ppl': 299.7}
[ time:36658.0s total_exs:11352132 epochs:172.74 ] {'exs': 900, 'lr': 0.01, 'num_updates': 255794, 'loss': 142.2, 'token_acc': 0.2567, 'nll_loss': 5.688, 'ppl': 295.3}
[ time:36660.0s total_exs:11353032 epochs:172.75 ] {'exs': 900, 'lr': 0.01, 'num_updates': 255819, 'loss': 145.4, 'token_acc': 0.2525, 'nll_loss': 5.819, 'ppl': 336.6}
[ time:36662.0s total_exs:11353932 epochs:172.76 ] {'exs': 900, 'lr': 0.01, 'num_updates': 255844, 'loss': 143.6, 'token_acc': 0.2515, 'nll_loss': 5.748, 'ppl': 313.6}
[ time:36664.0s total_exs:11354832 epochs:172.78 ] {'exs': 900, 'lr': 0.01, 'num_updates': 255869, 'loss': 142.6, 'token_acc': 0.2535, 'nll_loss': 5.709, 'ppl': 301.5}
[ time:36666.0s total_exs:11355732 epochs:172.79 ] {'exs': 900, 'lr': 0.01, 'num_updates': 255894, 'loss': 140.9, 'token_acc': 0.2633, 'nll_loss': 5.638, 'ppl':

[ time:36756.0s total_exs:11394720 epochs:173.39 ] {'exs': 900, 'lr': 0.01, 'num_updates': 256977, 'loss': 141.3, 'token_acc': 0.2603, 'nll_loss': 5.655, 'ppl': 285.7}
[ time:36758.0s total_exs:11395620 epochs:173.4 ] {'exs': 900, 'lr': 0.01, 'num_updates': 257002, 'loss': 142.7, 'token_acc': 0.2583, 'nll_loss': 5.707, 'ppl': 301.1}
[ time:36760.0s total_exs:11396520 epochs:173.41 ] {'exs': 900, 'lr': 0.01, 'num_updates': 257027, 'loss': 140.3, 'token_acc': 0.2613, 'nll_loss': 5.612, 'ppl': 273.8}
[ time:36762.0s total_exs:11397384 epochs:173.43 ] {'exs': 864, 'lr': 0.01, 'num_updates': 257051, 'loss': 137.7, 'token_acc': 0.2546, 'nll_loss': 5.739, 'ppl': 310.9}
[ time:36764.0s total_exs:11398320 epochs:173.44 ] {'exs': 936, 'lr': 0.01, 'num_updates': 257077, 'loss': 149.0, 'token_acc': 0.2538, 'nll_loss': 5.733, 'ppl': 309.0}
[ time:36766.0s total_exs:11399184 epochs:173.45 ] {'exs': 864, 'lr': 0.01, 'num_updates': 257101, 'loss': 134.9, 'token_acc': 0.2572, 'nll_loss': 5.624, 'ppl': 

[ time:36856.0s total_exs:11438496 epochs:174.05 ] {'exs': 900, 'lr': 0.01, 'num_updates': 258193, 'loss': 144.1, 'token_acc': 0.2485, 'nll_loss': 5.766, 'ppl': 319.2}
[ time:36858.0s total_exs:11439396 epochs:174.07 ] {'exs': 900, 'lr': 0.01, 'num_updates': 258218, 'loss': 143.3, 'token_acc': 0.2582, 'nll_loss': 5.729, 'ppl': 307.6}
[ time:36860.0s total_exs:11440296 epochs:174.08 ] {'exs': 900, 'lr': 0.01, 'num_updates': 258243, 'loss': 144.2, 'token_acc': 0.2479, 'nll_loss': 5.771, 'ppl': 320.7}
[ time:36862.0s total_exs:11441196 epochs:174.09 ] {'exs': 900, 'lr': 0.01, 'num_updates': 258268, 'loss': 143.0, 'token_acc': 0.2581, 'nll_loss': 5.722, 'ppl': 305.6}
[ time:36864.0s total_exs:11442096 epochs:174.11 ] {'exs': 900, 'lr': 0.01, 'num_updates': 258293, 'loss': 143.9, 'token_acc': 0.2536, 'nll_loss': 5.756, 'ppl': 316.2}
[ time:36866.0s total_exs:11442996 epochs:174.12 ] {'exs': 900, 'lr': 0.01, 'num_updates': 258318, 'loss': 144.4, 'token_acc': 0.2558, 'nll_loss': 5.775, 'ppl':

[ time:36956.0s total_exs:11482344 epochs:174.72 ] {'exs': 864, 'lr': 0.01, 'num_updates': 259411, 'loss': 137.1, 'token_acc': 0.2561, 'nll_loss': 5.716, 'ppl': 303.6}
[ time:36958.0s total_exs:11483208 epochs:174.73 ] {'exs': 864, 'lr': 0.01, 'num_updates': 259435, 'loss': 138.0, 'token_acc': 0.2519, 'nll_loss': 5.75, 'ppl': 314.1}
[ time:36960.0s total_exs:11484108 epochs:174.75 ] {'exs': 900, 'lr': 0.01, 'num_updates': 259460, 'loss': 144.6, 'token_acc': 0.2509, 'nll_loss': 5.782, 'ppl': 324.3}
[ time:36962.0s total_exs:11484972 epochs:174.76 ] {'exs': 864, 'lr': 0.01, 'num_updates': 259484, 'loss': 136.1, 'token_acc': 0.2562, 'nll_loss': 5.674, 'ppl': 291.3}
[ time:36964.0s total_exs:11485836 epochs:174.77 ] {'exs': 864, 'lr': 0.01, 'num_updates': 259508, 'loss': 138.5, 'token_acc': 0.2531, 'nll_loss': 5.773, 'ppl': 321.4}
[ time:36966.0s total_exs:11486736 epochs:174.79 ] {'exs': 900, 'lr': 0.01, 'num_updates': 259533, 'loss': 143.4, 'token_acc': 0.2535, 'nll_loss': 5.74, 'ppl': 3

[ time:37056.0s total_exs:11526048 epochs:175.38 ] {'exs': 936, 'lr': 0.01, 'num_updates': 260625, 'loss': 148.1, 'token_acc': 0.2603, 'nll_loss': 5.696, 'ppl': 297.8}
[ time:37058.0s total_exs:11526948 epochs:175.4 ] {'exs': 900, 'lr': 0.01, 'num_updates': 260650, 'loss': 141.2, 'token_acc': 0.2622, 'nll_loss': 5.646, 'ppl': 283.2}
[ time:37060.0s total_exs:11527848 epochs:175.41 ] {'exs': 900, 'lr': 0.01, 'num_updates': 260675, 'loss': 143.9, 'token_acc': 0.2539, 'nll_loss': 5.76, 'ppl': 317.5}
[ time:37062.0s total_exs:11528748 epochs:175.42 ] {'exs': 900, 'lr': 0.01, 'num_updates': 260700, 'loss': 141.6, 'token_acc': 0.2659, 'nll_loss': 5.67, 'ppl': 289.9}
[ time:37064.0s total_exs:11529612 epochs:175.44 ] {'exs': 864, 'lr': 0.01, 'num_updates': 260724, 'loss': 137.2, 'token_acc': 0.2503, 'nll_loss': 5.718, 'ppl': 304.3}
[ time:37066.0s total_exs:11530476 epochs:175.45 ] {'exs': 864, 'lr': 0.01, 'num_updates': 260748, 'loss': 138.3, 'token_acc': 0.2524, 'nll_loss': 5.764, 'ppl': 31

[ time:37156.0s total_exs:11569752 epochs:176.05 ] {'exs': 936, 'lr': 0.01, 'num_updates': 261839, 'loss': 145.0, 'token_acc': 0.2666, 'nll_loss': 5.58, 'ppl': 265.0}
[ time:37158.0s total_exs:11570508 epochs:176.06 ] {'exs': 756, 'lr': 0.01, 'num_updates': 261860, 'loss': 119.9, 'token_acc': 0.2504, 'nll_loss': 5.71, 'ppl': 301.9}
[ time:37160.0s total_exs:11571336 epochs:176.07 ] {'exs': 828, 'lr': 0.01, 'num_updates': 261883, 'loss': 129.8, 'token_acc': 0.2561, 'nll_loss': 5.644, 'ppl': 282.6}
[ time:37162.0s total_exs:11572200 epochs:176.09 ] {'exs': 864, 'lr': 0.01, 'num_updates': 261907, 'loss': 136.9, 'token_acc': 0.2549, 'nll_loss': 5.705, 'ppl': 300.3}
[ time:37164.0s total_exs:11573100 epochs:176.1 ] {'exs': 900, 'lr': 0.01, 'num_updates': 261932, 'loss': 142.0, 'token_acc': 0.2608, 'nll_loss': 5.68, 'ppl': 292.8}
[ time:37166.0s total_exs:11573928 epochs:176.11 ] {'exs': 828, 'lr': 0.01, 'num_updates': 261955, 'loss': 131.8, 'token_acc': 0.2531, 'nll_loss': 5.73, 'ppl': 308.

[ time:37256.0s total_exs:11612664 epochs:176.7 ] {'exs': 864, 'lr': 0.01, 'num_updates': 263031, 'loss': 137.1, 'token_acc': 0.2574, 'nll_loss': 5.719, 'ppl': 304.6}
[ time:37258.0s total_exs:11613528 epochs:176.71 ] {'exs': 864, 'lr': 0.01, 'num_updates': 263055, 'loss': 135.0, 'token_acc': 0.2608, 'nll_loss': 5.627, 'ppl': 277.8}
[ time:37260.0s total_exs:11614356 epochs:176.73 ] {'exs': 828, 'lr': 0.01, 'num_updates': 263078, 'loss': 129.1, 'token_acc': 0.2691, 'nll_loss': 5.617, 'ppl': 275.0}
[ time:37262.0s total_exs:11615256 epochs:176.74 ] {'exs': 900, 'lr': 0.01, 'num_updates': 263103, 'loss': 141.6, 'token_acc': 0.2629, 'nll_loss': 5.667, 'ppl': 289.2}
[ time:37265.0s total_exs:11616120 epochs:176.75 ] {'exs': 864, 'lr': 0.01, 'num_updates': 263127, 'loss': 137.6, 'token_acc': 0.2509, 'nll_loss': 5.737, 'ppl': 310.0}
[ time:37267.0s total_exs:11617056 epochs:176.77 ] {'exs': 936, 'lr': 0.01, 'num_updates': 263153, 'loss': 149.5, 'token_acc': 0.2596, 'nll_loss': 5.751, 'ppl': 

[ time:37356.0s total_exs:11655900 epochs:177.36 ] {'exs': 900, 'lr': 0.01, 'num_updates': 264232, 'loss': 141.7, 'token_acc': 0.2528, 'nll_loss': 5.67, 'ppl': 289.9}
[ time:37358.0s total_exs:11656800 epochs:177.37 ] {'exs': 900, 'lr': 0.01, 'num_updates': 264257, 'loss': 141.8, 'token_acc': 0.2646, 'nll_loss': 5.674, 'ppl': 291.3}
[ time:37361.0s total_exs:11657664 epochs:177.39 ] {'exs': 864, 'lr': 0.01, 'num_updates': 264281, 'loss': 138.8, 'token_acc': 0.2508, 'nll_loss': 5.784, 'ppl': 325.1}
[ time:37363.0s total_exs:11658528 epochs:177.4 ] {'exs': 864, 'lr': 0.01, 'num_updates': 264305, 'loss': 137.3, 'token_acc': 0.2502, 'nll_loss': 5.717, 'ppl': 304.0}
[ time:37365.0s total_exs:11659428 epochs:177.41 ] {'exs': 900, 'lr': 0.01, 'num_updates': 264330, 'loss': 143.1, 'token_acc': 0.2577, 'nll_loss': 5.724, 'ppl': 306.1}
[ time:37367.0s total_exs:11660328 epochs:177.43 ] {'exs': 900, 'lr': 0.01, 'num_updates': 264355, 'loss': 142.4, 'token_acc': 0.2587, 'nll_loss': 5.7, 'ppl': 298

[ time:37457.0s total_exs:11699676 epochs:178.03 ] {'exs': 900, 'lr': 0.01, 'num_updates': 265448, 'loss': 138.7, 'token_acc': 0.2709, 'nll_loss': 5.551, 'ppl': 257.6}
[ time:37459.0s total_exs:11700576 epochs:178.04 ] {'exs': 900, 'lr': 0.01, 'num_updates': 265473, 'loss': 145.6, 'token_acc': 0.25, 'nll_loss': 5.821, 'ppl': 337.4}
[ time:37461.0s total_exs:11701476 epochs:178.05 ] {'exs': 900, 'lr': 0.01, 'num_updates': 265498, 'loss': 143.0, 'token_acc': 0.2536, 'nll_loss': 5.722, 'ppl': 305.6}
[ time:37463.0s total_exs:11702376 epochs:178.07 ] {'exs': 900, 'lr': 0.01, 'num_updates': 265523, 'loss': 143.8, 'token_acc': 0.254, 'nll_loss': 5.752, 'ppl': 314.8}
[ time:37465.0s total_exs:11703240 epochs:178.08 ] {'exs': 864, 'lr': 0.01, 'num_updates': 265547, 'loss': 137.4, 'token_acc': 0.2554, 'nll_loss': 5.728, 'ppl': 307.3}
[ time:37467.0s total_exs:11704104 epochs:178.09 ] {'exs': 864, 'lr': 0.01, 'num_updates': 265571, 'loss': 137.3, 'token_acc': 0.2553, 'nll_loss': 5.722, 'ppl': 30

[ time:37557.0s total_exs:11743308 epochs:178.69 ] {'exs': 900, 'lr': 0.01, 'num_updates': 266660, 'loss': 140.7, 'token_acc': 0.2628, 'nll_loss': 5.632, 'ppl': 279.1}
[ time:37559.0s total_exs:11744244 epochs:178.7 ] {'exs': 936, 'lr': 0.01, 'num_updates': 266686, 'loss': 146.6, 'token_acc': 0.261, 'nll_loss': 5.645, 'ppl': 282.8}
[ time:37561.0s total_exs:11745180 epochs:178.72 ] {'exs': 936, 'lr': 0.01, 'num_updates': 266712, 'loss': 148.4, 'token_acc': 0.2591, 'nll_loss': 5.708, 'ppl': 301.3}
[ time:37563.0s total_exs:11746080 epochs:178.73 ] {'exs': 900, 'lr': 0.01, 'num_updates': 266737, 'loss': 144.3, 'token_acc': 0.2542, 'nll_loss': 5.773, 'ppl': 321.7}
[ time:37565.0s total_exs:11746980 epochs:178.75 ] {'exs': 900, 'lr': 0.01, 'num_updates': 266762, 'loss': 143.2, 'token_acc': 0.2547, 'nll_loss': 5.729, 'ppl': 307.6}
[ time:37567.0s total_exs:11747844 epochs:178.76 ] {'exs': 864, 'lr': 0.01, 'num_updates': 266786, 'loss': 138.5, 'token_acc': 0.2534, 'nll_loss': 5.771, 'ppl': 3

[ time:37657.0s total_exs:11786760 epochs:179.35 ] {'exs': 900, 'lr': 0.01, 'num_updates': 267867, 'loss': 142.1, 'token_acc': 0.2558, 'nll_loss': 5.685, 'ppl': 294.3}
[ time:37659.0s total_exs:11787660 epochs:179.36 ] {'exs': 900, 'lr': 0.01, 'num_updates': 267892, 'loss': 142.6, 'token_acc': 0.2532, 'nll_loss': 5.704, 'ppl': 300.0}
[ time:37661.0s total_exs:11788560 epochs:179.38 ] {'exs': 900, 'lr': 0.01, 'num_updates': 267917, 'loss': 139.5, 'token_acc': 0.2696, 'nll_loss': 5.583, 'ppl': 265.9}
[ time:37663.0s total_exs:11789424 epochs:179.39 ] {'exs': 864, 'lr': 0.01, 'num_updates': 267941, 'loss': 136.0, 'token_acc': 0.2581, 'nll_loss': 5.672, 'ppl': 290.6}
[ time:37665.0s total_exs:11790324 epochs:179.41 ] {'exs': 900, 'lr': 0.01, 'num_updates': 267966, 'loss': 143.9, 'token_acc': 0.2521, 'nll_loss': 5.757, 'ppl': 316.4}
[ time:37667.0s total_exs:11791188 epochs:179.42 ] {'exs': 864, 'lr': 0.01, 'num_updates': 267990, 'loss': 135.0, 'token_acc': 0.262, 'nll_loss': 5.627, 'ppl': 

[ time:37757.0s total_exs:11830536 epochs:180.02 ] {'exs': 936, 'lr': 0.01, 'num_updates': 269083, 'loss': 146.5, 'token_acc': 0.2588, 'nll_loss': 5.638, 'ppl': 281.0}
[ time:37759.0s total_exs:11831436 epochs:180.03 ] {'exs': 900, 'lr': 0.01, 'num_updates': 269108, 'loss': 141.1, 'token_acc': 0.2554, 'nll_loss': 5.642, 'ppl': 282.1}
[ time:37761.0s total_exs:11832336 epochs:180.04 ] {'exs': 900, 'lr': 0.01, 'num_updates': 269133, 'loss': 140.5, 'token_acc': 0.2662, 'nll_loss': 5.622, 'ppl': 276.5}
[ time:37763.0s total_exs:11833200 epochs:180.06 ] {'exs': 864, 'lr': 0.01, 'num_updates': 269157, 'loss': 139.8, 'token_acc': 0.2472, 'nll_loss': 5.829, 'ppl': 340.1}
[ time:37765.0s total_exs:11834064 epochs:180.07 ] {'exs': 864, 'lr': 0.01, 'num_updates': 269181, 'loss': 138.2, 'token_acc': 0.2505, 'nll_loss': 5.758, 'ppl': 316.8}
[ time:37767.0s total_exs:11834964 epochs:180.08 ] {'exs': 900, 'lr': 0.01, 'num_updates': 269206, 'loss': 141.5, 'token_acc': 0.2579, 'nll_loss': 5.668, 'ppl':

[ time:37858.0s total_exs:11873952 epochs:180.68 ] {'exs': 900, 'lr': 0.01, 'num_updates': 270289, 'loss': 140.5, 'token_acc': 0.2629, 'nll_loss': 5.62, 'ppl': 276.0}
[ time:37860.0s total_exs:11874852 epochs:180.69 ] {'exs': 900, 'lr': 0.01, 'num_updates': 270314, 'loss': 142.5, 'token_acc': 0.2588, 'nll_loss': 5.699, 'ppl': 298.7}
[ time:37862.0s total_exs:11875752 epochs:180.71 ] {'exs': 900, 'lr': 0.01, 'num_updates': 270339, 'loss': 139.6, 'token_acc': 0.2666, 'nll_loss': 5.585, 'ppl': 266.5}
[ time:37864.0s total_exs:11876616 epochs:180.72 ] {'exs': 864, 'lr': 0.01, 'num_updates': 270363, 'loss': 138.6, 'token_acc': 0.2473, 'nll_loss': 5.774, 'ppl': 321.7}
[ time:37866.0s total_exs:11877516 epochs:180.73 ] {'exs': 900, 'lr': 0.01, 'num_updates': 270388, 'loss': 142.7, 'token_acc': 0.2527, 'nll_loss': 5.707, 'ppl': 300.9}
[ time:37868.0s total_exs:11878416 epochs:180.75 ] {'exs': 900, 'lr': 0.01, 'num_updates': 270413, 'loss': 144.2, 'token_acc': 0.2524, 'nll_loss': 5.767, 'ppl': 

[ time:37957.0s total_exs:11917656 epochs:181.34 ] {'exs': 828, 'lr': 0.01, 'num_updates': 271503, 'loss': 130.7, 'token_acc': 0.2568, 'nll_loss': 5.687, 'ppl': 294.9}
[ time:37959.0s total_exs:11918520 epochs:181.36 ] {'exs': 864, 'lr': 0.01, 'num_updates': 271527, 'loss': 135.9, 'token_acc': 0.2614, 'nll_loss': 5.664, 'ppl': 288.3}
[ time:37961.0s total_exs:11919456 epochs:181.37 ] {'exs': 936, 'lr': 0.01, 'num_updates': 271553, 'loss': 148.0, 'token_acc': 0.2593, 'nll_loss': 5.692, 'ppl': 296.6}
[ time:37963.0s total_exs:11920320 epochs:181.38 ] {'exs': 864, 'lr': 0.01, 'num_updates': 271577, 'loss': 139.4, 'token_acc': 0.2487, 'nll_loss': 5.807, 'ppl': 332.5}
[ time:37966.0s total_exs:11921184 epochs:181.4 ] {'exs': 864, 'lr': 0.01, 'num_updates': 271601, 'loss': 137.2, 'token_acc': 0.2589, 'nll_loss': 5.721, 'ppl': 305.3}
[ time:37968.0s total_exs:11922084 epochs:181.41 ] {'exs': 900, 'lr': 0.01, 'num_updates': 271626, 'loss': 144.9, 'token_acc': 0.2476, 'nll_loss': 5.799, 'ppl': 

[ time:38057.0s total_exs:11961072 epochs:182.0 ] {'exs': 864, 'lr': 0.01, 'num_updates': 272709, 'loss': 136.0, 'token_acc': 0.2547, 'nll_loss': 5.669, 'ppl': 289.7}
[ time:38059.0s total_exs:11961936 epochs:182.02 ] {'exs': 864, 'lr': 0.01, 'num_updates': 272733, 'loss': 137.8, 'token_acc': 0.2536, 'nll_loss': 5.743, 'ppl': 311.9}
[ time:38061.0s total_exs:11962836 epochs:182.03 ] {'exs': 900, 'lr': 0.01, 'num_updates': 272758, 'loss': 142.0, 'token_acc': 0.256, 'nll_loss': 5.686, 'ppl': 294.7}
[ time:38063.0s total_exs:11963772 epochs:182.04 ] {'exs': 936, 'lr': 0.01, 'num_updates': 272784, 'loss': 148.1, 'token_acc': 0.2536, 'nll_loss': 5.698, 'ppl': 298.4}
[ time:38065.0s total_exs:11964672 epochs:182.06 ] {'exs': 900, 'lr': 0.01, 'num_updates': 272809, 'loss': 143.2, 'token_acc': 0.2557, 'nll_loss': 5.727, 'ppl': 307.1}
[ time:38067.0s total_exs:11965500 epochs:182.07 ] {'exs': 828, 'lr': 0.01, 'num_updates': 272832, 'loss': 131.0, 'token_acc': 0.2594, 'nll_loss': 5.696, 'ppl': 2

[ time:38157.0s total_exs:12004524 epochs:182.66 ] {'exs': 864, 'lr': 0.01, 'num_updates': 273916, 'loss': 135.3, 'token_acc': 0.2663, 'nll_loss': 5.637, 'ppl': 280.7}
[ time:38159.0s total_exs:12005388 epochs:182.68 ] {'exs': 864, 'lr': 0.01, 'num_updates': 273940, 'loss': 139.5, 'token_acc': 0.2474, 'nll_loss': 5.818, 'ppl': 336.3}
[ time:38161.0s total_exs:12006216 epochs:182.69 ] {'exs': 828, 'lr': 0.01, 'num_updates': 273963, 'loss': 131.7, 'token_acc': 0.2513, 'nll_loss': 5.726, 'ppl': 306.7}
[ time:38163.0s total_exs:12007152 epochs:182.7 ] {'exs': 936, 'lr': 0.01, 'num_updates': 273989, 'loss': 148.8, 'token_acc': 0.253, 'nll_loss': 5.724, 'ppl': 306.1}
[ time:38165.0s total_exs:12008016 epochs:182.72 ] {'exs': 864, 'lr': 0.01, 'num_updates': 274013, 'loss': 137.5, 'token_acc': 0.2557, 'nll_loss': 5.727, 'ppl': 306.9}
[ time:38167.0s total_exs:12008916 epochs:182.73 ] {'exs': 900, 'lr': 0.01, 'num_updates': 274038, 'loss': 142.1, 'token_acc': 0.2633, 'nll_loss': 5.683, 'ppl': 2

[ time:38257.0s total_exs:12048120 epochs:183.33 ] {'exs': 900, 'lr': 0.01, 'num_updates': 275127, 'loss': 142.0, 'token_acc': 0.2562, 'nll_loss': 5.679, 'ppl': 292.8}
[ time:38259.0s total_exs:12049056 epochs:183.34 ] {'exs': 936, 'lr': 0.01, 'num_updates': 275153, 'loss': 147.1, 'token_acc': 0.2587, 'nll_loss': 5.66, 'ppl': 287.2}
[ time:38261.0s total_exs:12049956 epochs:183.36 ] {'exs': 900, 'lr': 0.01, 'num_updates': 275178, 'loss': 140.1, 'token_acc': 0.2563, 'nll_loss': 5.604, 'ppl': 271.5}
[ time:38263.0s total_exs:12050820 epochs:183.37 ] {'exs': 864, 'lr': 0.01, 'num_updates': 275202, 'loss': 135.3, 'token_acc': 0.2638, 'nll_loss': 5.639, 'ppl': 281.0}
[ time:38265.0s total_exs:12051720 epochs:183.38 ] {'exs': 900, 'lr': 0.01, 'num_updates': 275227, 'loss': 143.0, 'token_acc': 0.2531, 'nll_loss': 5.719, 'ppl': 304.5}
[ time:38267.0s total_exs:12052656 epochs:183.4 ] {'exs': 936, 'lr': 0.01, 'num_updates': 275253, 'loss': 148.3, 'token_acc': 0.2584, 'nll_loss': 5.706, 'ppl': 3

({'exs': 7801,
  'accuracy': 0.0007691,
  'f1': 0.1531,
  'bleu': 0.001849,
  'lr': 0.01,
  'num_updates': 276197,
  'loss': 1222.0,
  'token_acc': 0.2774,
  'nll_loss': 5.55,
  'ppl': 257.1},
 {'exs': 7512,
  'accuracy': 0.0007987,
  'f1': 0.1587,
  'bleu': 0.002091,
  'lr': 0.01,
  'num_updates': 276197,
  'loss': 1169.0,
  'token_acc': 0.2822,
  'nll_loss': 5.509,
  'ppl': 247.0})

In [12]:
import transformers

In [None]:
transformers.AutoTokenizer.from_pretrained

In [13]:
transformers.Seq2SeqTrainer

transformers.trainer_seq2seq.Seq2SeqTrainer