In [1]:
import os
import sys

# Make modules in the parent directory importable. The path is made absolute
# because a relative '..' entry is resolved against the *current* working
# directory at import time, and a later cell changes that directory.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:  # keep sys.path free of duplicates on re-run
    sys.path.append(_parent_dir)

In [2]:
import os
# Display the current working directory before it is changed in the next cell.
os.getcwd()

'/mnt/3B1D7BDC2D2641B0/Documents/GIT/SQUAD_NLP/pythonbooks'

In [3]:
# Switch to the parent of the directory the kernel started in, so the relative
# './data/...' and './save/' paths used below resolve from there.
# The starting directory is remembered on first execution; re-running this cell
# therefore lands in the same place instead of climbing one level per run.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, '..'))
os.getcwd()

'/mnt/3B1D7BDC2D2641B0/Documents/GIT/SQUAD_NLP'

# Хочется вытащить все в питонбук, потому что из консоли работать неудобно

# Обучение

In [4]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as sched
import torch.utils.data as data
import util

from args import get_train_args
from collections import OrderedDict
from json import dumps
from models_att import BiDAF_attDCA
from tensorboardX import SummaryWriter
from tqdm import tqdm
from ujson import load as json_load
from util import collate_fn, SQuAD

In [5]:
# Hand-built configuration object: a bare instance whose attributes are read
# by main() and evaluate(). vars(train_args) yields the fields set here.
train_args = type('', (), {})() # empty "structure" where you can add fields

# Input files (paths are relative to the current working directory)
train_args.train_record_file = './data/train.npz'
train_args.dev_record_file = './data/dev.npz'
train_args.test_record_file = './data/test.npz'
train_args.word_emb_file = './data/word_emb.json'
train_args.char_emb_file = './data/char_emb.json'
train_args.train_eval_file = './data/train_eval.json'
train_args.dev_eval_file = './data/dev_eval.json'
train_args.test_eval_file = './data/test_eval.json'

train_args.name = 'train'       # Run name; main() passes it to util.get_save_dir and util.get_logger.
train_args.max_ans_len = 15     # Maximum length of a predicted answer.
train_args.num_workers = 4      # Number of sub-processes to use per data loader.
train_args.save_dir = './save/' # Base directory for saving information.
train_args.batch_size = 64      # Batch size per GPU. main() multiplies it by the
                                #    number of GPU ids it finds.
train_args.use_squad_v2 = True  # Whether to use SQuAD 2.0 (unanswerable) questions.
train_args.hidden_size = 100    # Number of features in encoder hidden layers
train_args.num_visuals = 10     # Number of examples to visualize in TensorBoard.
train_args.load_path = None     # Path to load as a model checkpoint.

train_args.eval_steps = 50000   # Steps between successive evaluations; main() counts
                                #    steps in training examples, not in batches.
train_args.lr = 0.5             # learning rate
train_args.l2_wd = 0.0          # L2 weight decay.
train_args.num_epochs = 30      # Number of epochs for which to train. Negative means forever.
train_args.drop_prob = 0.2      # Probability of zeroing an activation in dropout layers.
train_args.metric_name = 'F1'   # choices=('NLL', 'EM', 'F1'), name of dev metric to determine best checkpoint.
train_args.max_checkpoints = 5  # Maximum number of checkpoints to keep on disk.
train_args.max_grad_norm = 5.0  # Maximum gradient norm for gradient clipping.
train_args.seed = 224           # Random seed
train_args.ema_decay = 0.999    # Decay rate for exponential moving average of parameters.

if train_args.metric_name == 'NLL':
    # Best checkpoint is the one that minimizes negative log-likelihood
    train_args.maximize_metric = False
elif train_args.metric_name in ('EM', 'F1'):
    # Best checkpoint is the one that maximizes EM or F1
    train_args.maximize_metric = True
else:
    # Fail here rather than much later inside main(), where maximize_metric
    # would simply be missing.
    raise ValueError('Unrecognized metric name: "{}"'
                     .format(train_args.metric_name))

In [6]:
# from train.py

def main(args):
    """Train BiDAF_attDCA, evaluating on the dev set and checkpointing periodically.

    Args:
        args: Attribute container with the training configuration. The fields
            read here are save_dir, name, seed, word_emb_file, hidden_size,
            drop_prob, load_path, ema_decay, max_checkpoints, metric_name,
            maximize_metric, lr, l2_wd, train_record_file, dev_record_file,
            dev_eval_file, use_squad_v2, batch_size, num_workers, eval_steps,
            num_epochs, max_grad_norm, max_ans_len and num_visuals.

    Side effects:
        ``args`` is modified in place: ``save_dir`` is replaced by the value
        returned from ``util.get_save_dir``, ``gpu_ids`` is added, and
        ``batch_size`` is multiplied by ``max(1, len(gpu_ids))``. Build a fresh
        ``args`` object before calling ``main`` a second time, otherwise these
        changes compound.

    Notes:
        ``step`` counts training examples, not batches (it grows by the batch
        size each iteration), and ``eval_steps`` is measured in the same unit.
        The loop runs until ``epoch == args.num_epochs``, so a negative
        ``num_epochs`` never terminates.
    """
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    try:
        device, args.gpu_ids = util.get_available_devices()
        log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
        args.batch_size *= max(1, len(args.gpu_ids))

        # Set random seed
        log.info('Using random seed {}...'.format(args.seed))
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

        # Get embeddings
        log.info('Loading embeddings...')
        word_vectors = util.torch_from_json(args.word_emb_file)

        # Get model
        log.info('Building model...')
        model = BiDAF_attDCA(word_vectors=word_vectors,
                             hidden_size=args.hidden_size,
                             drop_prob=args.drop_prob)
        model = nn.DataParallel(model, args.gpu_ids)
        if args.load_path:
            log.info('Loading checkpoint from {}...'.format(args.load_path))
            model, step = util.load_model(model, args.load_path, args.gpu_ids)
        else:
            step = 0
        model = model.to(device)
        model.train()
        ema = util.EMA(model, args.ema_decay)

        # Get saver
        saver = util.CheckpointSaver(args.save_dir,
                                     max_checkpoints=args.max_checkpoints,
                                     metric_name=args.metric_name,
                                     maximize_metric=args.maximize_metric,
                                     log=log)

        # Get optimizer and scheduler
        optimizer = optim.Adadelta(model.parameters(), args.lr,
                                   weight_decay=args.l2_wd)
        scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

        # Get data loader
        log.info('Building dataset...')
        train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
        train_loader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       num_workers=args.num_workers,
                                       collate_fn=collate_fn)
        dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
        dev_loader = data.DataLoader(dev_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=False,
                                     num_workers=args.num_workers,
                                     collate_fn=collate_fn)

        # Train
        log.info('Training...')
        steps_till_eval = args.eval_steps
        # When resuming from a checkpoint, start from the epoch implied by the
        # number of examples already seen.
        epoch = step // len(train_dataset)
        while epoch != args.num_epochs:
            epoch += 1
            log.info('Starting epoch {}...'.format(epoch))
            with torch.enable_grad(), \
                    tqdm(total=len(train_loader.dataset)) as progress_bar:
                # cc_idxs / qc_idxs are part of each batch but are not fed to
                # this model.
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    batch_size = cw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()
                    scheduler.step(step // batch_size)
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch,
                                             NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info('Evaluating at step {}...'.format(step))
                        # NOTE(review): presumably swaps the EMA-averaged
                        # weights in for evaluation/saving and restores the raw
                        # weights afterwards -- confirm against util.EMA.
                        ema.assign(model)
                        results, pred_dict = evaluate(model, dev_loader, device,
                                                      args.dev_eval_file,
                                                      args.max_ans_len,
                                                      args.use_squad_v2)
                        saver.save(step, model, results[args.metric_name], device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                                for k, v in results.items())
                        log.info('Dev {}'.format(results_str))

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in results.items():
                            tbx.add_scalar('dev/{}'.format(k), v, step)
                        util.visualize(tbx,
                                       pred_dict=pred_dict,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
    finally:
        # Flush and release the TensorBoard event file even when training is
        # interrupted (e.g. kernel interrupt) or fails with an exception.
        tbx.close()

In [7]:
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    """Score ``model`` on every batch of ``data_loader`` without gradients.

    The model is put into eval mode for the pass and switched back to train
    mode before the metrics are computed.

    Args:
        model: Callable as ``model(cw_idxs, qw_idxs)`` returning a pair of
            log-probability tensors (start, end).
        data_loader: Yields 7-tuples
            ``(cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids)``; its
            ``dataset`` length sizes the progress bar.
        device: Device the index and label tensors are moved to.
        eval_file: Path of the JSON file holding the gold answers.
        max_len: Forwarded to ``util.discretize``.
        use_squad_v2: Forwarded to the ``util`` helpers; also controls whether
            the ``'AvNA'`` entry is reported.

    Returns:
        ``(results, pred_dict)`` where ``results`` is an ``OrderedDict`` with
        keys ``'NLL'``, ``'F1'``, ``'EM'`` (plus ``'AvNA'`` when
        ``use_squad_v2`` is true) and ``pred_dict`` accumulates the predictions
        returned by ``util.convert_tokens``.
    """
    pred_dict = {}
    nll_meter = util.AverageMeter()

    model.eval()
    with open(eval_file, 'r') as gold_fh:
        gold_dict = json_load(gold_fh)

    num_examples = len(data_loader.dataset)
    with torch.no_grad(), tqdm(total=num_examples) as progress_bar:
        for batch in data_loader:
            # The character-index tensors are carried in the batch but unused.
            cw_idxs, _cc_idxs, qw_idxs, _qc_idxs, y1, y2, ids = batch

            # Move inputs to the target device
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            n_in_batch = cw_idxs.size(0)

            # Forward pass and summed start/end NLL
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1 = y1.to(device)
            y2 = y2.to(device)
            start_nll = F.nll_loss(log_p1, y1)
            end_nll = F.nll_loss(log_p2, y2)
            batch_nll = (start_nll + end_nll).item()
            nll_meter.update(batch_nll, n_in_batch)

            # Turn log-probabilities into discrete start/end predictions
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           max_len, use_squad_v2)

            # Progress reporting
            progress_bar.update(n_in_batch)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            batch_preds, _ = util.convert_tokens(gold_dict,
                                                 ids.tolist(),
                                                 starts.tolist(),
                                                 ends.tolist(),
                                                 use_squad_v2)
            pred_dict.update(batch_preds)

    model.train()

    scores = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results = OrderedDict()
    results['NLL'] = nll_meter.avg
    results['F1'] = scores['F1']
    results['EM'] = scores['EM']
    if use_squad_v2:
        results['AvNA'] = scores['AvNA']

    return results, pred_dict

In [8]:
# Launch training. NOTE: main() reassigns train_args.save_dir and multiplies
# train_args.batch_size in place, so re-run the configuration cell before
# executing this cell again.
main(train_args)

[05.01.19 16:00:50] Args: {
    "batch_size": 64,
    "char_emb_file": "./data/char_emb.json",
    "dev_eval_file": "./data/dev_eval.json",
    "dev_record_file": "./data/dev.npz",
    "drop_prob": 0.2,
    "ema_decay": 0.999,
    "eval_steps": 50000,
    "gpu_ids": [
        0
    ],
    "hidden_size": 100,
    "l2_wd": 0.0,
    "load_path": null,
    "lr": 0.5,
    "max_ans_len": 15,
    "max_checkpoints": 5,
    "max_grad_norm": 5.0,
    "maximize_metric": true,
    "metric_name": "F1",
    "name": "train",
    "num_epochs": 30,
    "num_visuals": 10,
    "num_workers": 4,
    "save_dir": "./save/train/train-58",
    "seed": 224,
    "test_eval_file": "./data/test_eval.json",
    "test_record_file": "./data/test.npz",
    "train_eval_file": "./data/train_eval.json",
    "train_record_file": "./data/train.npz",
    "use_squad_v2": true,
    "word_emb_file": "./data/word_emb.json"
}
[05.01.19 16:00:50] Using random seed 224...
[05.01.19 16:00:50] Loading embeddings...
[05.01.19 16:00:

  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 16:01:55] Training...
[05.01.19 16:01:55] Starting epoch 1...


 39%|███▊      | 50048/129922 [05:05<08:19, 159.88it/s, NLL=6.71, epoch=1]

[05.01.19 16:07:00] Evaluating at step 50048...



  self.num_layers, self.dropout, self.training, self.bidirectional)

  1%|          | 64/5951 [00:00<00:44, 132.72it/s][A
  1%|          | 64/5951 [00:00<00:44, 132.72it/s, NLL=4.96][A
  2%|▏         | 128/5951 [00:00<00:43, 132.72it/s, NLL=5]  [A
  3%|▎         | 192/5951 [00:00<00:31, 180.49it/s, NLL=5][A
  3%|▎         | 192/5951 [00:00<00:31, 180.49it/s, NLL=4.9][A
  4%|▍         | 256/5951 [00:00<00:31, 180.49it/s, NLL=4.96][A
  5%|▌         | 320/5951 [00:00<00:23, 239.30it/s, NLL=4.96][A
  5%|▌         | 320/5951 [00:00<00:23, 239.30it/s, NLL=4.97][A
  6%|▋         | 384/5951 [00:00<00:23, 239.30it/s, NLL=4.96][A
  8%|▊         | 448/5951 [00:00<00:22, 239.30it/s, NLL=5]   [A
  9%|▊         | 512/5951 [00:00<00:17, 319.25it/s, NLL=5][A
  9%|▊         | 512/5951 [00:00<00:17, 319.25it/s, NLL=4.97][A
 10%|▉         | 576/5951 [00:00<00:16, 319.25it/s, NLL=4.97][A
 11%|█         | 640/5951 [00:00<00:13, 403.88it/s, NLL=4.97][A
 11%|█         | 640/5951 [00:00<00:13, 

 88%|████████▊ | 5248/5951 [00:05<00:00, 1139.98it/s, NLL=5.34][A
 88%|████████▊ | 5248/5951 [00:05<00:00, 1139.98it/s, NLL=5.34][A
 89%|████████▉ | 5312/5951 [00:05<00:00, 1139.98it/s, NLL=5.33][A
 90%|█████████ | 5376/5951 [00:05<00:00, 1114.32it/s, NLL=5.33][A
 90%|█████████ | 5376/5951 [00:05<00:00, 1114.32it/s, NLL=5.34][A
 91%|█████████▏| 5440/5951 [00:05<00:00, 1114.32it/s, NLL=5.34][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1144.64it/s, NLL=5.34][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1144.64it/s, NLL=5.33][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1144.64it/s, NLL=5.33][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1146.69it/s, NLL=5.33][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1146.69it/s, NLL=5.34][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1146.69it/s, NLL=5.34][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1115.66it/s, NLL=5.34][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1115.66it/s, NLL=5.35][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1115.66it/s, NLL=5.35

[05.01.19 16:07:09] Saved checkpoint: ./save/train/train-58/step_50048.pth.tar


 39%|███▊      | 50048/129922 [05:14<08:19, 159.88it/s, NLL=6.71, epoch=1]

[05.01.19 16:07:09] New best checkpoint at step 50048...
[05.01.19 16:07:09] Dev NLL: 05.36, F1: 52.19, EM: 52.19, AvNA: 52.14
[05.01.19 16:07:09] Visualizing in TensorBoard...


 77%|███████▋  | 100096/129922 [10:14<02:56, 168.79it/s, NLL=6.13, epoch=1]

[05.01.19 16:12:09] Evaluating at step 100096...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:41, 140.35it/s][A
  1%|          | 64/5951 [00:00<00:41, 140.35it/s, NLL=4.73][A
  2%|▏         | 128/5951 [00:00<00:41, 140.35it/s, NLL=4.65][A
  3%|▎         | 192/5951 [00:00<00:30, 190.17it/s, NLL=4.65][A
  3%|▎         | 192/5951 [00:00<00:30, 190.17it/s, NLL=4.55][A
  4%|▍         | 256/5951 [00:00<00:29, 190.17it/s, NLL=4.65][A
  5%|▌         | 320/5951 [00:00<00:29, 190.17it/s, NLL=4.71][A
  6%|▋         | 384/5951 [00:00<00:21, 256.62it/s, NLL=4.71][A
  6%|▋         | 384/5951 [00:00<00:21, 256.62it/s, NLL=4.73][A
  8%|▊         | 448/5951 [00:00<00:21, 256.62it/s, NLL=4.78][A
  9%|▊         | 512/5951 [00:00<00:21, 256.62it/s, NLL=4.76][A
 10%|▉         | 576/5951 [00:00<00:15, 338.49it/s, NLL=4.76][A
 10%|▉         | 576/5951 [00:00<00:15, 338.49it/s, NLL=4.78][A
 11%|█         | 640/5951 [00:00<00:15, 338.49it/s, NLL=4.76][A
 12%|█▏        | 704/5951 [00:00<00:12, 427.26it/s, NLL=4

 94%|█████████▎| 5568/5951 [00:05<00:00, 1168.03it/s, NLL=5.11][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1168.03it/s, NLL=5.11][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1168.03it/s, NLL=5.11][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1170.31it/s, NLL=5.11][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1170.31it/s, NLL=5.12][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1170.31it/s, NLL=5.13][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1029.65it/s, NLL=5.13][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1029.65it/s, NLL=5.13][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1029.65it/s, NLL=5.13][A
100%|██████████| 5951/5951 [00:05<00:00, 963.49it/s, NLL=5.13] [A
 77%|███████▋  | 100096/129922 [10:22<02:56, 168.79it/s, NLL=6.13, epoch=1]

[05.01.19 16:12:17] Saved checkpoint: ./save/train/train-58/step_100096.pth.tar
[05.01.19 16:12:17] Dev NLL: 05.13, F1: 51.55, EM: 51.55, AvNA: 51.71
[05.01.19 16:12:17] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:21<00:00, 162.04it/s, NLL=7.01, epoch=1]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 16:15:17] Starting epoch 2...


 16%|█▌        | 20224/129922 [02:02<10:44, 170.18it/s, NLL=5.83, epoch=2]

[05.01.19 16:17:19] Evaluating at step 150146...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:51, 113.74it/s][A
  1%|          | 64/5951 [00:00<00:51, 113.74it/s, NLL=4.35][A
  2%|▏         | 128/5951 [00:00<00:51, 113.74it/s, NLL=4.27][A
  3%|▎         | 192/5951 [00:00<00:37, 154.45it/s, NLL=4.27][A
  3%|▎         | 192/5951 [00:00<00:37, 154.45it/s, NLL=4.13][A
  4%|▍         | 256/5951 [00:00<00:36, 154.45it/s, NLL=4.33][A
  5%|▌         | 320/5951 [00:00<00:36, 154.45it/s, NLL=4.45][A
  6%|▋         | 384/5951 [00:00<00:26, 210.29it/s, NLL=4.45][A
  6%|▋         | 384/5951 [00:00<00:26, 210.29it/s, NLL=4.5] [A
  8%|▊         | 448/5951 [00:00<00:26, 210.29it/s, NLL=4.58][A
  9%|▊         | 512/5951 [00:00<00:20, 271.68it/s, NLL=4.58][A
  9%|▊         | 512/5951 [00:00<00:20, 271.68it/s, NLL=4.58][A
 10%|▉         | 576/5951 [00:01<00:19, 271.68it/s, NLL=4.6] [A
 11%|█         | 640/5951 [00:01<00:15, 346.08it/s, NLL=4.6][A
 11%|█         | 640/5951 [00:01<00:15, 346.08it/s, NLL=4.

 94%|█████████▎| 5568/5951 [00:05<00:00, 1243.21it/s, NLL=4.81][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1243.21it/s, NLL=4.8] [A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1243.21it/s, NLL=4.81][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1128.07it/s, NLL=4.81][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1128.07it/s, NLL=4.82][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1128.07it/s, NLL=4.83][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1080.38it/s, NLL=4.83][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1080.38it/s, NLL=4.82][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1080.38it/s, NLL=4.83][A
100%|██████████| 5951/5951 [00:05<00:00, 1080.38it/s, NLL=4.83][A
 16%|█▌        | 20224/129922 [02:10<10:44, 170.18it/s, NLL=5.83, epoch=2]

[05.01.19 16:17:27] Saved checkpoint: ./save/train/train-58/step_150146.pth.tar
[05.01.19 16:17:27] Dev NLL: 04.83, F1: 50.39, EM: 50.26, AvNA: 51.81
[05.01.19 16:17:27] Visualizing in TensorBoard...


 54%|█████▍    | 70272/129922 [07:10<05:46, 172.22it/s, NLL=5.37, epoch=2] 

[05.01.19 16:22:28] Evaluating at step 200194...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:45, 129.18it/s][A
  1%|          | 64/5951 [00:00<00:45, 129.18it/s, NLL=3.98][A
  2%|▏         | 128/5951 [00:00<00:45, 129.18it/s, NLL=3.86][A
  3%|▎         | 192/5951 [00:00<00:32, 175.67it/s, NLL=3.86][A
  3%|▎         | 192/5951 [00:00<00:32, 175.67it/s, NLL=3.72][A
  4%|▍         | 256/5951 [00:00<00:32, 175.67it/s, NLL=3.92][A
  5%|▌         | 320/5951 [00:00<00:32, 175.67it/s, NLL=4.09][A
  6%|▋         | 384/5951 [00:00<00:23, 237.45it/s, NLL=4.09][A
  6%|▋         | 384/5951 [00:00<00:23, 237.45it/s, NLL=4.17][A
  8%|▊         | 448/5951 [00:00<00:23, 237.45it/s, NLL=4.26][A
  9%|▊         | 512/5951 [00:00<00:22, 237.45it/s, NLL=4.26][A
 10%|▉         | 576/5951 [00:00<00:17, 314.48it/s, NLL=4.26][A
 10%|▉         | 576/5951 [00:00<00:17, 314.48it/s, NLL=4.27][A
 11%|█         | 640/5951 [00:00<00:16, 314.48it/s, NLL=4.26][A
 12%|█▏        | 704/5951 [00:01<00:13, 400.32it/s, NLL=4

 95%|█████████▍| 5632/5951 [00:05<00:00, 1243.70it/s, NLL=4.41][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1231.67it/s, NLL=4.41][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1231.67it/s, NLL=4.41][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1231.67it/s, NLL=4.42][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1156.82it/s, NLL=4.42][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1156.82it/s, NLL=4.42][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1156.82it/s, NLL=4.43][A
100%|██████████| 5951/5951 [00:05<00:00, 1156.82it/s, NLL=4.43][A
 54%|█████▍    | 70272/129922 [07:17<05:46, 172.22it/s, NLL=5.37, epoch=2]

[05.01.19 16:22:35] Saved checkpoint: ./save/train/train-58/step_200194.pth.tar
[05.01.19 16:22:35] Dev NLL: 04.43, F1: 50.22, EM: 49.77, AvNA: 52.85
[05.01.19 16:22:35] Visualizing in TensorBoard...


 93%|█████████▎| 120320/129922 [12:17<01:01, 156.10it/s, NLL=4.74, epoch=2]

[05.01.19 16:27:35] Evaluating at step 250242...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:45, 128.57it/s][A
  1%|          | 64/5951 [00:00<00:45, 128.57it/s, NLL=3.54][A
  2%|▏         | 128/5951 [00:00<00:45, 128.57it/s, NLL=3.47][A
  3%|▎         | 192/5951 [00:00<00:32, 175.35it/s, NLL=3.47][A
  3%|▎         | 192/5951 [00:00<00:32, 175.35it/s, NLL=3.34][A
  4%|▍         | 256/5951 [00:00<00:32, 175.35it/s, NLL=3.52][A
  5%|▌         | 320/5951 [00:00<00:32, 175.35it/s, NLL=3.67][A
  6%|▋         | 384/5951 [00:00<00:23, 237.29it/s, NLL=3.67][A
  6%|▋         | 384/5951 [00:00<00:23, 237.29it/s, NLL=3.77][A
  8%|▊         | 448/5951 [00:00<00:23, 237.29it/s, NLL=3.82][A
  9%|▊         | 512/5951 [00:00<00:22, 237.29it/s, NLL=3.85][A
 10%|▉         | 576/5951 [00:00<00:17, 313.39it/s, NLL=3.85][A
 10%|▉         | 576/5951 [00:00<00:17, 313.39it/s, NLL=3.86][A
 11%|█         | 640/5951 [00:00<00:16, 313.39it/s, NLL=3.85][A
 12%|█▏        | 704/5951 [00:01<00:13, 401.95it/s, NLL=3

 94%|█████████▎| 5568/5951 [00:05<00:00, 1084.80it/s, NLL=3.94][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1084.80it/s, NLL=3.95][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1097.22it/s, NLL=3.95][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1097.22it/s, NLL=3.96][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1097.22it/s, NLL=3.96][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1053.20it/s, NLL=3.96][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1053.20it/s, NLL=3.97][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1053.20it/s, NLL=3.97][A
100%|██████████| 5951/5951 [00:05<00:00, 1053.20it/s, NLL=3.98][A
 93%|█████████▎| 120320/129922 [12:25<01:01, 156.10it/s, NLL=4.74, epoch=2]

[05.01.19 16:27:42] Saved checkpoint: ./save/train/train-58/step_250242.pth.tar
[05.01.19 16:27:42] Dev NLL: 03.98, F1: 48.90, EM: 47.35, AvNA: 54.58
[05.01.19 16:27:42] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:23<00:00, 161.68it/s, NLL=3.34, epoch=2]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 16:28:40] Starting epoch 3...


 31%|███       | 40448/129922 [04:06<08:26, 176.61it/s, NLL=4.67, epoch=3]

[05.01.19 16:32:47] Evaluating at step 300292...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:43, 134.84it/s][A
  1%|          | 64/5951 [00:00<00:43, 134.84it/s, NLL=3.44][A
  2%|▏         | 128/5951 [00:00<00:43, 134.84it/s, NLL=3.33][A
  3%|▎         | 192/5951 [00:00<00:31, 183.26it/s, NLL=3.33][A
  3%|▎         | 192/5951 [00:00<00:31, 183.26it/s, NLL=3.22][A
  4%|▍         | 256/5951 [00:00<00:31, 183.26it/s, NLL=3.41][A
  5%|▌         | 320/5951 [00:00<00:30, 183.26it/s, NLL=3.54][A
  6%|▋         | 384/5951 [00:00<00:22, 247.35it/s, NLL=3.54][A
  6%|▋         | 384/5951 [00:00<00:22, 247.35it/s, NLL=3.65][A
  8%|▊         | 448/5951 [00:00<00:22, 247.35it/s, NLL=3.65][A
  9%|▊         | 512/5951 [00:00<00:21, 247.35it/s, NLL=3.67][A
 10%|▉         | 576/5951 [00:00<00:16, 320.17it/s, NLL=3.67][A
 10%|▉         | 576/5951 [00:00<00:16, 320.17it/s, NLL=3.69][A
 11%|█         | 640/5951 [00:01<00:16, 320.17it/s, NLL=3.66][A
 12%|█▏        | 704/5951 [00:01<00:13, 386.52it/s, NLL=3

 92%|█████████▏| 5504/5951 [00:05<00:00, 1164.10it/s, NLL=3.68][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1164.10it/s, NLL=3.68][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1164.10it/s, NLL=3.69][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1150.73it/s, NLL=3.69][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1150.73it/s, NLL=3.69][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1150.73it/s, NLL=3.7] [A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1105.06it/s, NLL=3.7][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1105.06it/s, NLL=3.71][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1105.06it/s, NLL=3.71][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1104.24it/s, NLL=3.71][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1104.24it/s, NLL=3.71][A
100%|██████████| 5951/5951 [00:06<00:00, 1104.24it/s, NLL=3.72][A
 31%|███       | 40448/129922 [04:14<08:26, 176.61it/s, NLL=4.67, epoch=3]

[05.01.19 16:32:55] Saved checkpoint: ./save/train/train-58/step_300292.pth.tar
[05.01.19 16:32:55] Removed checkpoint: ./save/train/train-58/step_250242.pth.tar
[05.01.19 16:32:55] Dev NLL: 03.72, F1: 49.58, EM: 47.22, AvNA: 56.34
[05.01.19 16:32:55] Visualizing in TensorBoard...


 70%|██████▉   | 90496/129922 [09:16<04:09, 158.15it/s, NLL=4.12, epoch=3] 

[05.01.19 16:37:57] Evaluating at step 350340...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:44, 132.27it/s][A
  1%|          | 64/5951 [00:00<00:44, 132.27it/s, NLL=3.52][A
  2%|▏         | 128/5951 [00:00<00:44, 132.27it/s, NLL=3.29][A
  3%|▎         | 192/5951 [00:00<00:31, 179.99it/s, NLL=3.29][A
  3%|▎         | 192/5951 [00:00<00:31, 179.99it/s, NLL=3.2] [A
  4%|▍         | 256/5951 [00:00<00:31, 179.99it/s, NLL=3.39][A
  5%|▌         | 320/5951 [00:00<00:23, 241.31it/s, NLL=3.39][A
  5%|▌         | 320/5951 [00:00<00:23, 241.31it/s, NLL=3.5] [A
  6%|▋         | 384/5951 [00:00<00:23, 241.31it/s, NLL=3.57][A
  8%|▊         | 448/5951 [00:00<00:22, 241.31it/s, NLL=3.55][A
  9%|▊         | 512/5951 [00:00<00:16, 320.02it/s, NLL=3.55][A
  9%|▊         | 512/5951 [00:00<00:16, 320.02it/s, NLL=3.54][A
 10%|▉         | 576/5951 [00:00<00:16, 320.02it/s, NLL=3.57][A
 11%|█         | 640/5951 [00:01<00:13, 392.48it/s, NLL=3.57][A
 11%|█         | 640/5951 [00:01<00:13, 392.48it/s, NLL=3

 92%|█████████▏| 5504/5951 [00:05<00:00, 848.90it/s, NLL=3.5] [A
 94%|█████████▎| 5568/5951 [00:05<00:00, 942.10it/s, NLL=3.5][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 942.10it/s, NLL=3.51][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 942.10it/s, NLL=3.51][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 989.00it/s, NLL=3.51][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 989.00it/s, NLL=3.52][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 989.00it/s, NLL=3.52][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 982.98it/s, NLL=3.52][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 982.98it/s, NLL=3.53][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 982.98it/s, NLL=3.53][A
100%|██████████| 5951/5951 [00:06<00:00, 982.98it/s, NLL=3.54][A
 70%|██████▉   | 90496/129922 [09:25<04:09, 158.15it/s, NLL=4.12, epoch=3]

[05.01.19 16:38:06] Saved checkpoint: ./save/train/train-58/step_350340.pth.tar
[05.01.19 16:38:06] Removed checkpoint: ./save/train/train-58/step_300292.pth.tar
[05.01.19 16:38:06] Dev NLL: 03.54, F1: 51.41, EM: 48.63, AvNA: 58.56
[05.01.19 16:38:06] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:24<00:00, 161.53it/s, NLL=1.56, epoch=3]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 16:42:05] Starting epoch 4...


  8%|▊         | 10624/129922 [01:07<12:35, 157.81it/s, NLL=3.6, epoch=4] 

[05.01.19 16:43:12] Evaluating at step 400390...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:42, 138.84it/s][A
  1%|          | 64/5951 [00:00<00:42, 138.84it/s, NLL=3.5][A
  2%|▏         | 128/5951 [00:00<00:41, 138.84it/s, NLL=3.21][A
  3%|▎         | 192/5951 [00:00<00:30, 186.11it/s, NLL=3.21][A
  3%|▎         | 192/5951 [00:00<00:30, 186.11it/s, NLL=3.1] [A
  4%|▍         | 256/5951 [00:00<00:30, 186.11it/s, NLL=3.27][A
  5%|▌         | 320/5951 [00:00<00:23, 243.88it/s, NLL=3.27][A
  5%|▌         | 320/5951 [00:00<00:23, 243.88it/s, NLL=3.37][A
  6%|▋         | 384/5951 [00:00<00:22, 243.88it/s, NLL=3.43][A
  8%|▊         | 448/5951 [00:00<00:17, 314.59it/s, NLL=3.43][A
  8%|▊         | 448/5951 [00:00<00:17, 314.59it/s, NLL=3.4] [A
  9%|▊         | 512/5951 [00:00<00:17, 314.59it/s, NLL=3.4][A
 10%|▉         | 576/5951 [00:01<00:13, 397.59it/s, NLL=3.4][A
 10%|▉         | 576/5951 [00:01<00:13, 397.59it/s, NLL=3.44][A
 11%|█         | 640/5951 [00:01<00:13, 397.59it/s, NLL=3.42

 92%|█████████▏| 5504/5951 [00:05<00:00, 1164.26it/s, NLL=3.36][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1164.26it/s, NLL=3.37][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1164.26it/s, NLL=3.38][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1143.19it/s, NLL=3.38][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1143.19it/s, NLL=3.38][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1143.19it/s, NLL=3.39][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1111.37it/s, NLL=3.39][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1111.37it/s, NLL=3.4] [A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1111.37it/s, NLL=3.4][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1131.55it/s, NLL=3.4][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1131.55it/s, NLL=3.41][A
100%|██████████| 5951/5951 [00:06<00:00, 1131.55it/s, NLL=3.41][A
  8%|▊         | 10624/129922 [01:15<12:35, 157.81it/s, NLL=3.6, epoch=4]

[05.01.19 16:43:20] Saved checkpoint: ./save/train/train-58/step_400390.pth.tar


  8%|▊         | 10624/129922 [01:16<12:35, 157.81it/s, NLL=3.6, epoch=4]

[05.01.19 16:43:21] New best checkpoint at step 400390...
[05.01.19 16:43:21] Removed checkpoint: ./save/train/train-58/step_200194.pth.tar
[05.01.19 16:43:21] Dev NLL: 03.41, F1: 52.38, EM: 49.62, AvNA: 59.87
[05.01.19 16:43:21] Visualizing in TensorBoard...


 47%|████▋     | 60672/129922 [06:21<07:08, 161.49it/s, NLL=3.7, epoch=4]  

[05.01.19 16:48:27] Evaluating at step 450438...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<01:11, 81.81it/s][A
  1%|          | 64/5951 [00:00<01:11, 81.81it/s, NLL=3.51][A
  2%|▏         | 128/5951 [00:00<01:11, 81.81it/s, NLL=3.23][A
  3%|▎         | 192/5951 [00:00<00:50, 113.38it/s, NLL=3.23][A
  3%|▎         | 192/5951 [00:00<00:50, 113.38it/s, NLL=3.07][A
  4%|▍         | 256/5951 [00:00<00:50, 113.38it/s, NLL=3.24][A
  5%|▌         | 320/5951 [00:00<00:49, 113.38it/s, NLL=3.34][A
  6%|▋         | 384/5951 [00:01<00:35, 156.52it/s, NLL=3.34][A
  6%|▋         | 384/5951 [00:01<00:35, 156.52it/s, NLL=3.4] [A
  8%|▊         | 448/5951 [00:01<00:35, 156.52it/s, NLL=3.35][A
  9%|▊         | 512/5951 [00:01<00:34, 156.52it/s, NLL=3.34][A
 10%|▉         | 576/5951 [00:01<00:25, 211.79it/s, NLL=3.34][A
 10%|▉         | 576/5951 [00:01<00:25, 211.79it/s, NLL=3.38][A
 11%|█         | 640/5951 [00:01<00:25, 211.79it/s, NLL=3.35][A
 12%|█▏        | 704/5951 [00:01<00:18, 279.65it/s, NLL=3.35

 91%|█████████▏| 5440/5951 [00:05<00:00, 1018.71it/s, NLL=3.26][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1018.71it/s, NLL=3.26][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1052.88it/s, NLL=3.26][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1052.88it/s, NLL=3.28][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1052.88it/s, NLL=3.28][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1027.86it/s, NLL=3.28][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1027.86it/s, NLL=3.29][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1027.86it/s, NLL=3.3] [A
 98%|█████████▊| 5824/5951 [00:06<00:00, 878.58it/s, NLL=3.3] [A
 98%|█████████▊| 5824/5951 [00:06<00:00, 878.58it/s, NLL=3.3][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 878.58it/s, NLL=3.31][A
100%|██████████| 5951/5951 [00:06<00:00, 881.32it/s, NLL=3.31][A
 47%|████▋     | 60672/129922 [06:30<07:08, 161.49it/s, NLL=3.7, epoch=4]

[05.01.19 16:48:36] Saved checkpoint: ./save/train/train-58/step_450438.pth.tar


 47%|████▋     | 60672/129922 [06:31<07:08, 161.49it/s, NLL=3.7, epoch=4]

[05.01.19 16:48:37] New best checkpoint at step 450438...
[05.01.19 16:48:37] Removed checkpoint: ./save/train/train-58/step_150146.pth.tar
[05.01.19 16:48:37] Dev NLL: 03.32, F1: 53.35, EM: 50.61, AvNA: 60.83
[05.01.19 16:48:37] Visualizing in TensorBoard...


 85%|████████▌ | 110720/129922 [11:34<02:03, 155.68it/s, NLL=4.33, epoch=4]

[05.01.19 16:53:40] Evaluating at step 500486...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:44, 132.20it/s][A
  1%|          | 64/5951 [00:00<00:44, 132.20it/s, NLL=3.47][A
  2%|▏         | 128/5951 [00:00<00:44, 132.20it/s, NLL=3.17][A
  3%|▎         | 192/5951 [00:00<00:32, 179.37it/s, NLL=3.17][A
  3%|▎         | 192/5951 [00:00<00:32, 179.37it/s, NLL=3.04][A
  4%|▍         | 256/5951 [00:00<00:31, 179.37it/s, NLL=3.21][A
  5%|▌         | 320/5951 [00:00<00:31, 179.37it/s, NLL=3.29][A
  6%|▋         | 384/5951 [00:00<00:22, 242.57it/s, NLL=3.29][A
  6%|▋         | 384/5951 [00:00<00:22, 242.57it/s, NLL=3.34][A
  8%|▊         | 448/5951 [00:00<00:22, 242.57it/s, NLL=3.3] [A
  9%|▊         | 512/5951 [00:00<00:22, 242.57it/s, NLL=3.28][A
 10%|▉         | 576/5951 [00:00<00:16, 318.76it/s, NLL=3.28][A
 10%|▉         | 576/5951 [00:00<00:16, 318.76it/s, NLL=3.33][A
 11%|█         | 640/5951 [00:00<00:16, 318.76it/s, NLL=3.29][A
 12%|█▏        | 704/5951 [00:01<00:13, 401.51it/s, NLL=3

 92%|█████████▏| 5504/5951 [00:05<00:00, 1118.32it/s, NLL=3.2][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1118.32it/s, NLL=3.21][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1118.32it/s, NLL=3.22][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1119.38it/s, NLL=3.22][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1119.38it/s, NLL=3.22][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1119.38it/s, NLL=3.24][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1078.08it/s, NLL=3.24][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1078.08it/s, NLL=3.24][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1078.08it/s, NLL=3.25][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1095.24it/s, NLL=3.25][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1095.24it/s, NLL=3.25][A
100%|██████████| 5951/5951 [00:06<00:00, 1095.24it/s, NLL=3.26][A
 85%|████████▌ | 110720/129922 [11:42<02:03, 155.68it/s, NLL=4.33, epoch=4]

[05.01.19 16:53:48] Saved checkpoint: ./save/train/train-58/step_500486.pth.tar


 85%|████████▌ | 110720/129922 [11:43<02:03, 155.68it/s, NLL=4.33, epoch=4]

[05.01.19 16:53:49] New best checkpoint at step 500486...
[05.01.19 16:53:49] Removed checkpoint: ./save/train/train-58/step_350340.pth.tar
[05.01.19 16:53:49] Dev NLL: 03.26, F1: 53.79, EM: 50.90, AvNA: 61.20
[05.01.19 16:53:49] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:41<00:00, 158.15it/s, NLL=1.63, epoch=4]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 16:55:46] Starting epoch 5...


 24%|██▎       | 30848/129922 [03:06<09:10, 179.83it/s, NLL=4.09, epoch=5]

[05.01.19 16:58:52] Evaluating at step 550536...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:44, 132.63it/s][A
  1%|          | 64/5951 [00:00<00:44, 132.63it/s, NLL=3.48][A
  2%|▏         | 128/5951 [00:00<00:43, 132.63it/s, NLL=3.15][A
  3%|▎         | 192/5951 [00:00<00:31, 180.00it/s, NLL=3.15][A
  3%|▎         | 192/5951 [00:00<00:31, 180.00it/s, NLL=3.01][A
  4%|▍         | 256/5951 [00:00<00:31, 180.00it/s, NLL=3.18][A
  5%|▌         | 320/5951 [00:00<00:31, 180.00it/s, NLL=3.26][A
  6%|▋         | 384/5951 [00:00<00:23, 241.95it/s, NLL=3.26][A
  6%|▋         | 384/5951 [00:00<00:23, 241.95it/s, NLL=3.3] [A
  8%|▊         | 448/5951 [00:00<00:22, 241.95it/s, NLL=3.25][A
  9%|▊         | 512/5951 [00:00<00:17, 314.42it/s, NLL=3.25][A
  9%|▊         | 512/5951 [00:00<00:17, 314.42it/s, NLL=3.22][A
 10%|▉         | 576/5951 [00:00<00:17, 314.42it/s, NLL=3.27][A
 11%|█         | 640/5951 [00:01<00:13, 384.34it/s, NLL=3.27][A
 11%|█         | 640/5951 [00:01<00:13, 384.34it/s, NLL=3

 91%|█████████▏| 5440/5951 [00:05<00:00, 1082.10it/s, NLL=3.15][A
 91%|█████████▏| 5440/5951 [00:05<00:00, 1082.10it/s, NLL=3.16][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1082.10it/s, NLL=3.16][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1108.68it/s, NLL=3.16][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1108.68it/s, NLL=3.18][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1108.68it/s, NLL=3.18][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1028.97it/s, NLL=3.18][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1028.97it/s, NLL=3.2] [A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1028.97it/s, NLL=3.2][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 998.65it/s, NLL=3.2] [A
 98%|█████████▊| 5824/5951 [00:06<00:00, 998.65it/s, NLL=3.21][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 998.65it/s, NLL=3.21][A
100%|██████████| 5951/5951 [00:06<00:00, 1064.29it/s, NLL=3.21][A
 24%|██▎       | 30848/129922 [03:14<09:10, 179.83it/s, NLL=4.09, epoch=5]

[05.01.19 16:59:01] Saved checkpoint: ./save/train/train-58/step_550536.pth.tar


 24%|██▎       | 30848/129922 [03:15<09:10, 179.83it/s, NLL=4.09, epoch=5]

[05.01.19 16:59:01] New best checkpoint at step 550536...
[05.01.19 16:59:01] Removed checkpoint: ./save/train/train-58/step_100096.pth.tar
[05.01.19 16:59:01] Dev NLL: 03.22, F1: 54.57, EM: 51.62, AvNA: 61.84
[05.01.19 16:59:01] Visualizing in TensorBoard...


 62%|██████▏   | 80896/129922 [08:21<05:19, 153.30it/s, NLL=3.01, epoch=5] 

[05.01.19 17:04:08] Evaluating at step 600584...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<01:20, 73.31it/s][A
  1%|          | 64/5951 [00:00<01:20, 73.31it/s, NLL=3.37][A
  2%|▏         | 128/5951 [00:00<01:19, 73.31it/s, NLL=3.1][A
  3%|▎         | 192/5951 [00:00<00:56, 101.59it/s, NLL=3.1][A
  3%|▎         | 192/5951 [00:01<00:56, 101.59it/s, NLL=2.96][A
  4%|▍         | 256/5951 [00:01<00:56, 101.59it/s, NLL=3.12][A
  5%|▌         | 320/5951 [00:01<00:40, 138.21it/s, NLL=3.12][A
  5%|▌         | 320/5951 [00:01<00:40, 138.21it/s, NLL=3.22][A
  6%|▋         | 384/5951 [00:01<00:40, 138.21it/s, NLL=3.25][A
  8%|▊         | 448/5951 [00:01<00:29, 185.81it/s, NLL=3.25][A
  8%|▊         | 448/5951 [00:01<00:29, 185.81it/s, NLL=3.19][A
  9%|▊         | 512/5951 [00:01<00:29, 185.81it/s, NLL=3.17][A
 10%|▉         | 576/5951 [00:01<00:22, 239.69it/s, NLL=3.17][A
 10%|▉         | 576/5951 [00:01<00:22, 239.69it/s, NLL=3.2] [A
 11%|█         | 640/5951 [00:01<00:22, 239.69it/s, NLL=3.17]

 90%|█████████ | 5376/5951 [00:06<00:00, 1176.42it/s, NLL=3.1] [A
 91%|█████████▏| 5440/5951 [00:06<00:00, 1176.42it/s, NLL=3.1][A
 92%|█████████▏| 5504/5951 [00:06<00:00, 1195.93it/s, NLL=3.1][A
 92%|█████████▏| 5504/5951 [00:06<00:00, 1195.93it/s, NLL=3.11][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 1195.93it/s, NLL=3.13][A
 95%|█████████▍| 5632/5951 [00:06<00:00, 1153.49it/s, NLL=3.13][A
 95%|█████████▍| 5632/5951 [00:06<00:00, 1153.49it/s, NLL=3.13][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 1153.49it/s, NLL=3.14][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1099.43it/s, NLL=3.14][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1099.43it/s, NLL=3.15][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1099.43it/s, NLL=3.16][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1110.12it/s, NLL=3.16][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1110.12it/s, NLL=3.16][A
100%|██████████| 5951/5951 [00:06<00:00, 1110.12it/s, NLL=3.17][A
 62%|██████▏   | 80896/129922 [08:31<05:19, 153.30it/s, NLL=3.01

[05.01.19 17:04:17] Saved checkpoint: ./save/train/train-58/step_600584.pth.tar


 62%|██████▏   | 80896/129922 [08:31<05:19, 153.30it/s, NLL=3.01, epoch=5]

[05.01.19 17:04:18] New best checkpoint at step 600584...
[05.01.19 17:04:18] Removed checkpoint: ./save/train/train-58/step_50048.pth.tar
[05.01.19 17:04:18] Dev NLL: 03.17, F1: 55.20, EM: 52.31, AvNA: 62.36
[05.01.19 17:04:18] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:33<00:00, 159.74it/s, NLL=3.9, epoch=5] 
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 17:09:20] Starting epoch 6...


  1%|          | 1024/129922 [00:07<13:42, 156.81it/s, NLL=3.46, epoch=6]

[05.01.19 17:09:27] Evaluating at step 650634...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:51, 114.25it/s][A
  1%|          | 64/5951 [00:00<00:51, 114.25it/s, NLL=3.34][A
  2%|▏         | 128/5951 [00:00<00:50, 114.25it/s, NLL=3.04][A
  3%|▎         | 192/5951 [00:00<00:36, 155.83it/s, NLL=3.04][A
  3%|▎         | 192/5951 [00:00<00:36, 155.83it/s, NLL=2.92][A
  4%|▍         | 256/5951 [00:00<00:36, 155.83it/s, NLL=3.08][A
  5%|▌         | 320/5951 [00:00<00:36, 155.83it/s, NLL=3.18][A
  6%|▋         | 384/5951 [00:00<00:26, 211.84it/s, NLL=3.18][A
  6%|▋         | 384/5951 [00:00<00:26, 211.84it/s, NLL=3.2] [A
  8%|▊         | 448/5951 [00:00<00:25, 211.84it/s, NLL=3.14][A
  9%|▊         | 512/5951 [00:00<00:25, 211.84it/s, NLL=3.13][A
 10%|▉         | 576/5951 [00:00<00:19, 281.68it/s, NLL=3.13][A
 10%|▉         | 576/5951 [00:00<00:19, 281.68it/s, NLL=3.15][A
 11%|█         | 640/5951 [00:01<00:18, 281.68it/s, NLL=3.12][A
 12%|█▏        | 704/5951 [00:01<00:14, 350.80it/s, NLL=3

 92%|█████████▏| 5504/5951 [00:05<00:00, 1123.77it/s, NLL=3.05][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1123.77it/s, NLL=3.06][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1123.77it/s, NLL=3.08][A
 95%|█████████▍| 5632/5951 [00:06<00:00, 1072.22it/s, NLL=3.08][A
 95%|█████████▍| 5632/5951 [00:06<00:00, 1072.22it/s, NLL=3.08][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 1072.22it/s, NLL=3.09][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1025.70it/s, NLL=3.09][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1025.70it/s, NLL=3.1] [A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1025.70it/s, NLL=3.1][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1045.05it/s, NLL=3.1][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1045.05it/s, NLL=3.11][A
100%|██████████| 5951/5951 [00:06<00:00, 1045.05it/s, NLL=3.12][A
  1%|          | 1024/129922 [00:15<13:42, 156.81it/s, NLL=3.46, epoch=6]

[05.01.19 17:09:35] Saved checkpoint: ./save/train/train-58/step_650634.pth.tar


  1%|          | 1024/129922 [00:16<13:42, 156.81it/s, NLL=3.46, epoch=6]

[05.01.19 17:09:36] New best checkpoint at step 650634...
[05.01.19 17:09:36] Removed checkpoint: ./save/train/train-58/step_400390.pth.tar
[05.01.19 17:09:36] Dev NLL: 03.12, F1: 55.62, EM: 52.87, AvNA: 62.46
[05.01.19 17:09:36] Visualizing in TensorBoard...


 39%|███▉      | 51072/129922 [05:19<07:41, 170.81it/s, NLL=3.46, epoch=6]

[05.01.19 17:14:39] Evaluating at step 700682...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:43, 133.85it/s][A
  1%|          | 64/5951 [00:00<00:43, 133.85it/s, NLL=3.44][A
  2%|▏         | 128/5951 [00:00<00:43, 133.85it/s, NLL=3.06][A
  3%|▎         | 192/5951 [00:00<00:31, 182.48it/s, NLL=3.06][A
  3%|▎         | 192/5951 [00:00<00:31, 182.48it/s, NLL=2.91][A
  4%|▍         | 256/5951 [00:00<00:31, 182.48it/s, NLL=3.08][A
  5%|▌         | 320/5951 [00:00<00:22, 245.56it/s, NLL=3.08][A
  5%|▌         | 320/5951 [00:00<00:22, 245.56it/s, NLL=3.17][A
  6%|▋         | 384/5951 [00:00<00:22, 245.56it/s, NLL=3.19][A
  8%|▊         | 448/5951 [00:00<00:22, 245.56it/s, NLL=3.14][A
  9%|▊         | 512/5951 [00:00<00:16, 327.04it/s, NLL=3.14][A
  9%|▊         | 512/5951 [00:00<00:16, 327.04it/s, NLL=3.13][A
 10%|▉         | 576/5951 [00:00<00:16, 327.04it/s, NLL=3.15][A
 11%|█         | 640/5951 [00:00<00:12, 411.53it/s, NLL=3.15][A
 11%|█         | 640/5951 [00:00<00:12, 411.53it/s, NLL=3

 91%|█████████▏| 5440/5951 [00:05<00:00, 971.52it/s, NLL=3.03][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 971.52it/s, NLL=3.04][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1035.72it/s, NLL=3.04][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1035.72it/s, NLL=3.06][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1035.72it/s, NLL=3.07][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 1053.77it/s, NLL=3.07][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 1053.77it/s, NLL=3.08][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1053.77it/s, NLL=3.08][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1009.69it/s, NLL=3.08][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1009.69it/s, NLL=3.09][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1009.69it/s, NLL=3.1] [A
100%|██████████| 5951/5951 [00:06<00:00, 1059.62it/s, NLL=3.1][A
 39%|███▉      | 51072/129922 [05:27<07:41, 170.81it/s, NLL=3.46, epoch=6]

[05.01.19 17:14:47] Saved checkpoint: ./save/train/train-58/step_700682.pth.tar


 39%|███▉      | 51072/129922 [05:28<07:41, 170.81it/s, NLL=3.46, epoch=6]

[05.01.19 17:14:48] New best checkpoint at step 700682...
[05.01.19 17:14:48] Removed checkpoint: ./save/train/train-58/step_450438.pth.tar
[05.01.19 17:14:48] Dev NLL: 03.11, F1: 55.97, EM: 53.18, AvNA: 62.71
[05.01.19 17:14:48] Visualizing in TensorBoard...


 78%|███████▊  | 101120/129922 [10:34<03:04, 156.35it/s, NLL=3.88, epoch=6]

[05.01.19 17:19:54] Evaluating at step 750730...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<01:10, 82.93it/s][A
  1%|          | 64/5951 [00:00<01:10, 82.93it/s, NLL=3.32][A
  2%|▏         | 128/5951 [00:00<01:10, 82.93it/s, NLL=2.97][A
  3%|▎         | 192/5951 [00:00<00:50, 113.88it/s, NLL=2.97][A
  3%|▎         | 192/5951 [00:00<00:50, 113.88it/s, NLL=2.85][A
  4%|▍         | 256/5951 [00:00<00:50, 113.88it/s, NLL=3.02][A
  5%|▌         | 320/5951 [00:01<00:36, 154.87it/s, NLL=3.02][A
  5%|▌         | 320/5951 [00:01<00:36, 154.87it/s, NLL=3.13][A
  6%|▋         | 384/5951 [00:01<00:35, 154.87it/s, NLL=3.16][A
  8%|▊         | 448/5951 [00:01<00:26, 205.40it/s, NLL=3.16][A
  8%|▊         | 448/5951 [00:01<00:26, 205.40it/s, NLL=3.11][A
  9%|▊         | 512/5951 [00:01<00:26, 205.40it/s, NLL=3.09][A
 10%|▉         | 576/5951 [00:01<00:20, 260.84it/s, NLL=3.09][A
 10%|▉         | 576/5951 [00:01<00:20, 260.84it/s, NLL=3.12][A
 11%|█         | 640/5951 [00:01<00:20, 260.84it/s, NLL=3.08

 90%|█████████ | 5376/5951 [00:06<00:00, 841.53it/s, NLL=2.99][A
 91%|█████████▏| 5440/5951 [00:06<00:00, 847.26it/s, NLL=2.99][A
 91%|█████████▏| 5440/5951 [00:06<00:00, 847.26it/s, NLL=3]   [A
 92%|█████████▏| 5504/5951 [00:06<00:00, 847.26it/s, NLL=3.01][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 917.26it/s, NLL=3.01][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 917.26it/s, NLL=3.02][A
 95%|█████████▍| 5632/5951 [00:06<00:00, 917.26it/s, NLL=3.03][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 954.58it/s, NLL=3.03][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 954.58it/s, NLL=3.04][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 954.58it/s, NLL=3.04][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 953.15it/s, NLL=3.04][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 953.15it/s, NLL=3.05][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 953.15it/s, NLL=3.06][A
100%|██████████| 5951/5951 [00:06<00:00, 976.77it/s, NLL=3.06][A
 78%|███████▊  | 101120/129922 [10:43<03:04, 156.35it/s, NLL=3.88, epoch=6]

[05.01.19 17:20:04] Saved checkpoint: ./save/train/train-58/step_750730.pth.tar


 78%|███████▊  | 101120/129922 [10:44<03:04, 156.35it/s, NLL=3.88, epoch=6]

[05.01.19 17:20:04] New best checkpoint at step 750730...
[05.01.19 17:20:04] Removed checkpoint: ./save/train/train-58/step_500486.pth.tar
[05.01.19 17:20:04] Dev NLL: 03.07, F1: 56.40, EM: 53.67, AvNA: 63.18
[05.01.19 17:20:04] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:43<00:00, 157.78it/s, NLL=3.71, epoch=6]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 17:23:03] Starting epoch 7...


 16%|█▋        | 21248/129922 [02:08<10:15, 176.70it/s, NLL=3.24, epoch=7]

[05.01.19 17:25:12] Evaluating at step 800780...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:45, 128.33it/s][A
  1%|          | 64/5951 [00:00<00:45, 128.33it/s, NLL=3.1][A
  2%|▏         | 128/5951 [00:00<00:45, 128.33it/s, NLL=2.78][A
  3%|▎         | 192/5951 [00:00<00:32, 175.09it/s, NLL=2.78][A
  3%|▎         | 192/5951 [00:00<00:32, 175.09it/s, NLL=2.71][A
  4%|▍         | 256/5951 [00:00<00:32, 175.09it/s, NLL=2.9] [A
  5%|▌         | 320/5951 [00:00<00:32, 175.09it/s, NLL=3.02][A
  6%|▋         | 384/5951 [00:00<00:23, 237.95it/s, NLL=3.02][A
  6%|▋         | 384/5951 [00:00<00:23, 237.95it/s, NLL=3.07][A
  8%|▊         | 448/5951 [00:00<00:23, 237.95it/s, NLL=3.03][A
  9%|▊         | 512/5951 [00:00<00:22, 237.95it/s, NLL=3.01][A
 10%|▉         | 576/5951 [00:00<00:17, 313.52it/s, NLL=3.01][A
 10%|▉         | 576/5951 [00:00<00:17, 313.52it/s, NLL=3.05][A
 11%|█         | 640/5951 [00:00<00:16, 313.52it/s, NLL=3.01][A
 12%|█▏        | 704/5951 [00:01<00:13, 400.11it/s, NLL=3.

 95%|█████████▍| 5632/5951 [00:05<00:00, 1036.02it/s, NLL=3.02][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1036.02it/s, NLL=3.02][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1036.02it/s, NLL=3.03][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 938.60it/s, NLL=3.03] [A
 97%|█████████▋| 5760/5951 [00:05<00:00, 938.60it/s, NLL=3.04][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 938.60it/s, NLL=3.04][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 909.44it/s, NLL=3.04][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 909.44it/s, NLL=3.05][A
100%|██████████| 5951/5951 [00:05<00:00, 909.44it/s, NLL=3.06][A
 16%|█▋        | 21248/129922 [02:16<10:15, 176.70it/s, NLL=3.24, epoch=7]

[05.01.19 17:25:20] Saved checkpoint: ./save/train/train-58/step_800780.pth.tar
[05.01.19 17:25:20] Removed checkpoint: ./save/train/train-58/step_550536.pth.tar
[05.01.19 17:25:20] Dev NLL: 03.06, F1: 56.18, EM: 53.35, AvNA: 62.80
[05.01.19 17:25:20] Visualizing in TensorBoard...


 55%|█████▍    | 71296/129922 [07:21<05:43, 170.66it/s, NLL=2.95, epoch=7] 

[05.01.19 17:30:25] Evaluating at step 850828...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:48, 121.54it/s][A
  1%|          | 64/5951 [00:00<00:48, 121.54it/s, NLL=3.18][A
  2%|▏         | 128/5951 [00:00<00:47, 121.54it/s, NLL=2.81][A
  3%|▎         | 192/5951 [00:00<00:34, 164.73it/s, NLL=2.81][A
  3%|▎         | 192/5951 [00:00<00:34, 164.73it/s, NLL=2.75][A
  4%|▍         | 256/5951 [00:00<00:34, 164.73it/s, NLL=2.91][A
  5%|▌         | 320/5951 [00:00<00:34, 164.73it/s, NLL=3.02][A
  6%|▋         | 384/5951 [00:00<00:24, 223.55it/s, NLL=3.02][A
  6%|▋         | 384/5951 [00:00<00:24, 223.55it/s, NLL=3.05][A
  8%|▊         | 448/5951 [00:00<00:24, 223.55it/s, NLL=3.01][A
  9%|▊         | 512/5951 [00:00<00:24, 223.55it/s, NLL=2.99][A
 10%|▉         | 576/5951 [00:00<00:18, 295.47it/s, NLL=2.99][A
 10%|▉         | 576/5951 [00:00<00:18, 295.47it/s, NLL=3.03][A
 11%|█         | 640/5951 [00:01<00:17, 295.47it/s, NLL=2.98][A
 12%|█▏        | 704/5951 [00:01<00:13, 379.65it/s, NLL=2

 94%|█████████▎| 5568/5951 [00:05<00:00, 1227.10it/s, NLL=3][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1227.10it/s, NLL=3.02][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1227.10it/s, NLL=3.02][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1203.70it/s, NLL=3.02][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1203.70it/s, NLL=3.03][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1203.70it/s, NLL=3.03][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1119.51it/s, NLL=3.03][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1119.51it/s, NLL=3.04][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1119.51it/s, NLL=3.05][A
100%|██████████| 5951/5951 [00:05<00:00, 1119.51it/s, NLL=3.06][A
 55%|█████▍    | 71296/129922 [07:29<05:43, 170.66it/s, NLL=2.95, epoch=7]

[05.01.19 17:30:33] Saved checkpoint: ./save/train/train-58/step_850828.pth.tar
[05.01.19 17:30:33] Removed checkpoint: ./save/train/train-58/step_600584.pth.tar
[05.01.19 17:30:33] Dev NLL: 03.06, F1: 56.37, EM: 53.67, AvNA: 62.93
[05.01.19 17:30:33] Visualizing in TensorBoard...


 93%|█████████▎| 121344/129922 [12:33<00:52, 163.87it/s, NLL=3.49, epoch=7]

[05.01.19 17:35:36] Evaluating at step 900876...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:59, 99.62it/s][A
  1%|          | 64/5951 [00:00<00:59, 99.62it/s, NLL=3.2][A
  2%|▏         | 128/5951 [00:00<00:58, 99.62it/s, NLL=2.84][A
  3%|▎         | 192/5951 [00:00<00:42, 136.68it/s, NLL=2.84][A
  3%|▎         | 192/5951 [00:00<00:42, 136.68it/s, NLL=2.8] [A
  4%|▍         | 256/5951 [00:00<00:41, 136.68it/s, NLL=2.94][A
  5%|▌         | 320/5951 [00:00<00:30, 182.59it/s, NLL=2.94][A
  5%|▌         | 320/5951 [00:00<00:30, 182.59it/s, NLL=3.05][A
  6%|▋         | 384/5951 [00:00<00:30, 182.59it/s, NLL=3.07][A
  8%|▊         | 448/5951 [00:01<00:22, 244.65it/s, NLL=3.07][A
  8%|▊         | 448/5951 [00:01<00:22, 244.65it/s, NLL=3.03][A
  9%|▊         | 512/5951 [00:01<00:22, 244.65it/s, NLL=3]   [A
 10%|▉         | 576/5951 [00:01<00:17, 313.44it/s, NLL=3][A
 10%|▉         | 576/5951 [00:01<00:17, 313.44it/s, NLL=3.04][A
 11%|█         | 640/5951 [00:01<00:16, 313.44it/s, NLL=2.99][A

 92%|█████████▏| 5504/5951 [00:05<00:00, 1201.03it/s, NLL=3][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1168.96it/s, NLL=3][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1168.96it/s, NLL=3.02][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1168.96it/s, NLL=3.02][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1145.57it/s, NLL=3.02][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1145.57it/s, NLL=3.03][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1145.57it/s, NLL=3.03][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1083.48it/s, NLL=3.03][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1083.48it/s, NLL=3.03][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1083.48it/s, NLL=3.04][A
100%|██████████| 5951/5951 [00:05<00:00, 1083.48it/s, NLL=3.05][A
 93%|█████████▎| 121344/129922 [12:41<00:52, 163.87it/s, NLL=3.49, epoch=7]

[05.01.19 17:35:45] Saved checkpoint: ./save/train/train-58/step_900876.pth.tar


 93%|█████████▎| 121344/129922 [12:42<00:52, 163.87it/s, NLL=3.49, epoch=7]

[05.01.19 17:35:45] New best checkpoint at step 900876...
[05.01.19 17:35:45] Removed checkpoint: ./save/train/train-58/step_650634.pth.tar
[05.01.19 17:35:45] Dev NLL: 03.05, F1: 56.73, EM: 53.86, AvNA: 63.32
[05.01.19 17:35:45] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:36<00:00, 159.22it/s, NLL=2.55, epoch=7]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 17:36:39] Starting epoch 8...


 32%|███▏      | 41472/129922 [04:13<09:06, 161.85it/s, NLL=3.42, epoch=8]

[05.01.19 17:40:53] Evaluating at step 950926...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<01:12, 80.76it/s][A
  1%|          | 64/5951 [00:00<01:12, 80.76it/s, NLL=3.17][A
  2%|▏         | 128/5951 [00:00<01:12, 80.76it/s, NLL=2.82][A
  3%|▎         | 192/5951 [00:00<00:51, 111.17it/s, NLL=2.82][A
  3%|▎         | 192/5951 [00:00<00:51, 111.17it/s, NLL=2.84][A
  4%|▍         | 256/5951 [00:00<00:51, 111.17it/s, NLL=2.98][A
  5%|▌         | 320/5951 [00:01<00:50, 111.17it/s, NLL=3.08][A
  6%|▋         | 384/5951 [00:01<00:36, 153.58it/s, NLL=3.08][A
  6%|▋         | 384/5951 [00:01<00:36, 153.58it/s, NLL=3.1] [A
  8%|▊         | 448/5951 [00:01<00:35, 153.58it/s, NLL=3.05][A
  9%|▊         | 512/5951 [00:01<00:35, 153.58it/s, NLL=3.02][A
 10%|▉         | 576/5951 [00:01<00:25, 208.08it/s, NLL=3.02][A
 10%|▉         | 576/5951 [00:01<00:25, 208.08it/s, NLL=3.05][A
 11%|█         | 640/5951 [00:01<00:25, 208.08it/s, NLL=3]   [A
 12%|█▏        | 704/5951 [00:01<00:18, 276.47it/s, NLL=3][

 94%|█████████▎| 5568/5951 [00:05<00:00, 1126.05it/s, NLL=2.99][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1126.05it/s, NLL=3]   [A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1126.05it/s, NLL=3][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1141.56it/s, NLL=3][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1141.56it/s, NLL=3.01][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1141.56it/s, NLL=3.02][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1026.25it/s, NLL=3.02][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1026.25it/s, NLL=3.02][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1026.25it/s, NLL=3.02][A
100%|██████████| 5951/5951 [00:06<00:00, 1083.35it/s, NLL=3.02][A
 32%|███▏      | 41472/129922 [04:21<09:06, 161.85it/s, NLL=3.42, epoch=8]

[05.01.19 17:41:01] Saved checkpoint: ./save/train/train-58/step_950926.pth.tar
[05.01.19 17:41:01] Removed checkpoint: ./save/train/train-58/step_700682.pth.tar
[05.01.19 17:41:01] Dev NLL: 03.04, F1: 56.68, EM: 53.97, AvNA: 63.33
[05.01.19 17:41:01] Visualizing in TensorBoard...


 70%|███████   | 91520/129922 [09:28<03:59, 160.27it/s, NLL=2.67, epoch=8] 

[05.01.19 17:46:08] Evaluating at step 1000974...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:42, 137.00it/s][A
  1%|          | 64/5951 [00:00<00:42, 137.00it/s, NLL=3.14][A
  2%|▏         | 128/5951 [00:00<00:42, 137.00it/s, NLL=2.8][A
  3%|▎         | 192/5951 [00:00<00:30, 186.08it/s, NLL=2.8][A
  3%|▎         | 192/5951 [00:00<00:30, 186.08it/s, NLL=2.82][A
  4%|▍         | 256/5951 [00:00<00:30, 186.08it/s, NLL=2.96][A
  5%|▌         | 320/5951 [00:00<00:30, 186.08it/s, NLL=3.07][A
  6%|▋         | 384/5951 [00:00<00:22, 249.91it/s, NLL=3.07][A
  6%|▋         | 384/5951 [00:00<00:22, 249.91it/s, NLL=3.08][A
  8%|▊         | 448/5951 [00:00<00:22, 249.91it/s, NLL=3.04][A
  9%|▊         | 512/5951 [00:00<00:21, 249.91it/s, NLL=3]   [A
 10%|▉         | 576/5951 [00:00<00:16, 328.75it/s, NLL=3][A
 10%|▉         | 576/5951 [00:00<00:16, 328.75it/s, NLL=3.04][A
 11%|█         | 640/5951 [00:00<00:16, 328.75it/s, NLL=2.98][A
 12%|█▏        | 704/5951 [00:01<00:13, 391.94it/s, NLL=2.98]

 94%|█████████▎| 5568/5951 [00:05<00:00, 1135.47it/s, NLL=2.98][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1006.75it/s, NLL=2.98][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1006.75it/s, NLL=2.98][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1006.75it/s, NLL=2.99][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1014.85it/s, NLL=2.99][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1014.85it/s, NLL=3]   [A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1014.85it/s, NLL=3][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1060.28it/s, NLL=3][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1060.28it/s, NLL=3.01][A
100%|██████████| 5951/5951 [00:05<00:00, 1060.28it/s, NLL=3.02][A
 70%|███████   | 91520/129922 [09:36<03:59, 160.27it/s, NLL=2.67, epoch=8]

[05.01.19 17:46:16] Saved checkpoint: ./save/train/train-58/step_1000974.pth.tar


 70%|███████   | 91520/129922 [09:37<03:59, 160.27it/s, NLL=2.67, epoch=8]

[05.01.19 17:46:16] New best checkpoint at step 1000974...
[05.01.19 17:46:16] Removed checkpoint: ./save/train/train-58/step_800780.pth.tar
[05.01.19 17:46:16] Dev NLL: 03.02, F1: 57.42, EM: 54.56, AvNA: 63.91
[05.01.19 17:46:16] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:33<00:00, 159.64it/s, NLL=1.6, epoch=8] 
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 17:50:13] Starting epoch 9...


  9%|▉         | 11648/129922 [01:11<11:21, 173.61it/s, NLL=2.67, epoch=9]

[05.01.19 17:51:24] Evaluating at step 1051024...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:44, 133.20it/s][A
  1%|          | 64/5951 [00:00<00:44, 133.20it/s, NLL=3.3][A
  2%|▏         | 128/5951 [00:00<00:43, 133.20it/s, NLL=2.92][A
  3%|▎         | 192/5951 [00:00<00:31, 181.13it/s, NLL=2.92][A
  3%|▎         | 192/5951 [00:00<00:31, 181.13it/s, NLL=2.9] [A
  4%|▍         | 256/5951 [00:00<00:31, 181.13it/s, NLL=3.02][A
  5%|▌         | 320/5951 [00:00<00:31, 181.13it/s, NLL=3.13][A
  6%|▋         | 384/5951 [00:00<00:22, 245.27it/s, NLL=3.13][A
  6%|▋         | 384/5951 [00:00<00:22, 245.27it/s, NLL=3.14][A
  8%|▊         | 448/5951 [00:00<00:22, 245.27it/s, NLL=3.08][A
  9%|▊         | 512/5951 [00:00<00:22, 245.27it/s, NLL=3.04][A
 10%|▉         | 576/5951 [00:00<00:16, 324.36it/s, NLL=3.04][A
 10%|▉         | 576/5951 [00:00<00:16, 324.36it/s, NLL=3.07][A
 11%|█         | 640/5951 [00:00<00:16, 324.36it/s, NLL=3.01][A
 12%|█▏        | 704/5951 [00:00<00:12, 412.61it/s, NLL=3.

 92%|█████████▏| 5504/5951 [00:06<00:00, 1125.37it/s, NLL=2.96][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 1168.05it/s, NLL=2.96][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 1168.05it/s, NLL=2.97][A
 95%|█████████▍| 5632/5951 [00:06<00:00, 1168.05it/s, NLL=2.97][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 1157.16it/s, NLL=2.97][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 1157.16it/s, NLL=2.98][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1157.16it/s, NLL=2.98][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1106.73it/s, NLL=2.98][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1106.73it/s, NLL=2.99][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1106.73it/s, NLL=2.99][A
100%|██████████| 5951/5951 [00:06<00:00, 1106.73it/s, NLL=3]   [A
  9%|▉         | 11648/129922 [01:19<11:21, 173.61it/s, NLL=2.67, epoch=9]

[05.01.19 17:51:33] Saved checkpoint: ./save/train/train-58/step_1051024.pth.tar


  9%|▉         | 11648/129922 [01:20<11:21, 173.61it/s, NLL=2.67, epoch=9]

[05.01.19 17:51:34] New best checkpoint at step 1051024...
[05.01.19 17:51:34] Removed checkpoint: ./save/train/train-58/step_850828.pth.tar
[05.01.19 17:51:34] Dev NLL: 03.00, F1: 57.44, EM: 54.58, AvNA: 63.94
[05.01.19 17:51:34] Visualizing in TensorBoard...


 47%|████▋     | 61696/129922 [06:27<06:51, 165.99it/s, NLL=3.14, epoch=9] 

[05.01.19 17:56:40] Evaluating at step 1101072...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<01:11, 82.13it/s][A
  1%|          | 64/5951 [00:00<01:11, 82.13it/s, NLL=3.51][A
  2%|▏         | 128/5951 [00:00<01:10, 82.13it/s, NLL=3.07][A
  3%|▎         | 192/5951 [00:00<00:50, 113.64it/s, NLL=3.07][A
  3%|▎         | 192/5951 [00:00<00:50, 113.64it/s, NLL=3.04][A
  4%|▍         | 256/5951 [00:00<00:50, 113.64it/s, NLL=3.14][A
  5%|▌         | 320/5951 [00:00<00:49, 113.64it/s, NLL=3.23][A
  6%|▋         | 384/5951 [00:01<00:35, 157.03it/s, NLL=3.23][A
  6%|▋         | 384/5951 [00:01<00:35, 157.03it/s, NLL=3.22][A
  8%|▊         | 448/5951 [00:01<00:35, 157.03it/s, NLL=3.15][A
  9%|▊         | 512/5951 [00:01<00:25, 211.73it/s, NLL=3.15][A
  9%|▊         | 512/5951 [00:01<00:25, 211.73it/s, NLL=3.1] [A
 10%|▉         | 576/5951 [00:01<00:25, 211.73it/s, NLL=3.14][A
 11%|█         | 640/5951 [00:01<00:19, 273.39it/s, NLL=3.14][A
 11%|█         | 640/5951 [00:01<00:19, 273.39it/s, NLL=3.07

 92%|█████████▏| 5504/5951 [00:05<00:00, 1123.32it/s, NLL=2.96][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1123.32it/s, NLL=2.97][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 983.18it/s, NLL=2.97] [A
 95%|█████████▍| 5632/5951 [00:05<00:00, 983.18it/s, NLL=2.97][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 983.18it/s, NLL=2.98][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 879.23it/s, NLL=2.98][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 879.23it/s, NLL=2.98][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 879.23it/s, NLL=2.99][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 952.88it/s, NLL=2.99][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 952.88it/s, NLL=2.99][A
100%|██████████| 5951/5951 [00:06<00:00, 952.88it/s, NLL=3]   [A
 47%|████▋     | 61696/129922 [06:36<06:51, 165.99it/s, NLL=3.14, epoch=9]

[05.01.19 17:56:49] Saved checkpoint: ./save/train/train-58/step_1101072.pth.tar


 47%|████▋     | 61696/129922 [06:37<06:51, 165.99it/s, NLL=3.14, epoch=9]

[05.01.19 17:56:50] New best checkpoint at step 1101072...
[05.01.19 17:56:50] Removed checkpoint: ./save/train/train-58/step_750730.pth.tar
[05.01.19 17:56:50] Dev NLL: 03.00, F1: 57.62, EM: 54.85, AvNA: 64.12
[05.01.19 17:56:50] Visualizing in TensorBoard...


 86%|████████▌ | 111744/129922 [11:41<01:47, 168.56it/s, NLL=2.94, epoch=9]

[05.01.19 18:01:54] Evaluating at step 1151120...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:42, 137.38it/s][A
  1%|          | 64/5951 [00:00<00:42, 137.38it/s, NLL=3.67][A
  2%|▏         | 128/5951 [00:00<00:42, 137.38it/s, NLL=3.15][A
  3%|▎         | 192/5951 [00:00<00:30, 186.25it/s, NLL=3.15][A
  3%|▎         | 192/5951 [00:00<00:30, 186.25it/s, NLL=3.11][A
  4%|▍         | 256/5951 [00:00<00:30, 186.25it/s, NLL=3.19][A
  5%|▌         | 320/5951 [00:00<00:22, 248.08it/s, NLL=3.19][A
  5%|▌         | 320/5951 [00:00<00:22, 248.08it/s, NLL=3.26][A
  6%|▋         | 384/5951 [00:00<00:22, 248.08it/s, NLL=3.22][A
  8%|▊         | 448/5951 [00:00<00:22, 248.08it/s, NLL=3.15][A
  9%|▊         | 512/5951 [00:00<00:16, 329.51it/s, NLL=3.15][A
  9%|▊         | 512/5951 [00:00<00:16, 329.51it/s, NLL=3.1] [A
 10%|▉         | 576/5951 [00:00<00:16, 329.51it/s, NLL=3.12][A
 11%|█         | 640/5951 [00:00<00:12, 414.57it/s, NLL=3.12][A
 11%|█         | 640/5951 [00:00<00:12, 414.57it/s, NLL=3

 94%|█████████▎| 5568/5951 [00:05<00:00, 1056.93it/s, NLL=2.95][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1056.93it/s, NLL=2.96][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1056.93it/s, NLL=2.96][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1081.58it/s, NLL=2.96][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1081.58it/s, NLL=2.97][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1081.58it/s, NLL=2.97][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1058.01it/s, NLL=2.97][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1058.01it/s, NLL=2.98][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1058.01it/s, NLL=2.98][A
100%|██████████| 5951/5951 [00:05<00:00, 1058.01it/s, NLL=2.99][A
 86%|████████▌ | 111744/129922 [11:48<01:47, 168.56it/s, NLL=2.94, epoch=9]

[05.01.19 18:02:02] Saved checkpoint: ./save/train/train-58/step_1151120.pth.tar


 86%|████████▌ | 111744/129922 [11:49<01:47, 168.56it/s, NLL=2.94, epoch=9]

[05.01.19 18:02:02] New best checkpoint at step 1151120...
[05.01.19 18:02:02] Removed checkpoint: ./save/train/train-58/step_950926.pth.tar
[05.01.19 18:02:02] Dev NLL: 02.99, F1: 58.18, EM: 55.42, AvNA: 64.56
[05.01.19 18:02:02] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:40<00:00, 158.39it/s, NLL=3.11, epoch=9]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 18:03:53] Starting epoch 10...


 25%|██▍       | 31872/129922 [03:15<10:02, 162.75it/s, NLL=3.85, epoch=10]

[05.01.19 18:07:09] Evaluating at step 1201170...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:43, 135.87it/s][A
  1%|          | 64/5951 [00:00<00:43, 135.87it/s, NLL=3.71][A
  2%|▏         | 128/5951 [00:00<00:42, 135.87it/s, NLL=3.17][A
  3%|▎         | 192/5951 [00:00<00:31, 183.51it/s, NLL=3.17][A
  3%|▎         | 192/5951 [00:00<00:31, 183.51it/s, NLL=3.11][A
  4%|▍         | 256/5951 [00:00<00:31, 183.51it/s, NLL=3.2] [A
  5%|▌         | 320/5951 [00:00<00:30, 183.51it/s, NLL=3.25][A
  6%|▋         | 384/5951 [00:00<00:22, 248.26it/s, NLL=3.25][A
  6%|▋         | 384/5951 [00:00<00:22, 248.26it/s, NLL=3.21][A
  8%|▊         | 448/5951 [00:00<00:22, 248.26it/s, NLL=3.14][A
  9%|▊         | 512/5951 [00:00<00:21, 248.26it/s, NLL=3.09][A
 10%|▉         | 576/5951 [00:00<00:16, 325.58it/s, NLL=3.09][A
 10%|▉         | 576/5951 [00:00<00:16, 325.58it/s, NLL=3.11][A
 11%|█         | 640/5951 [00:00<00:16, 325.58it/s, NLL=3.03][A
 12%|█▏        | 704/5951 [00:01<00:13, 392.30it/s, NLL=3

 90%|█████████ | 5376/5951 [00:05<00:00, 1146.08it/s, NLL=2.92][A
 91%|█████████▏| 5440/5951 [00:05<00:00, 1146.08it/s, NLL=2.92][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1119.94it/s, NLL=2.92][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1119.94it/s, NLL=2.94][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1119.94it/s, NLL=2.96][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1079.70it/s, NLL=2.96][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1079.70it/s, NLL=2.96][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1079.70it/s, NLL=2.97][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1045.59it/s, NLL=2.97][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1045.59it/s, NLL=2.97][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1045.59it/s, NLL=2.97][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1054.77it/s, NLL=2.97][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 1054.77it/s, NLL=2.98][A
100%|██████████| 5951/5951 [00:06<00:00, 1054.77it/s, NLL=2.99][A
 25%|██▍       | 31872/129922 [03:23<10:02, 162.75it/s, NLL=3.

[05.01.19 18:07:17] Saved checkpoint: ./save/train/train-58/step_1201170.pth.tar


 25%|██▍       | 31872/129922 [03:24<10:02, 162.75it/s, NLL=3.85, epoch=10]

[05.01.19 18:07:18] New best checkpoint at step 1201170...
[05.01.19 18:07:18] Removed checkpoint: ./save/train/train-58/step_900876.pth.tar
[05.01.19 18:07:18] Dev NLL: 02.99, F1: 58.69, EM: 55.91, AvNA: 64.95
[05.01.19 18:07:18] Visualizing in TensorBoard...


 63%|██████▎   | 81920/129922 [08:29<05:00, 159.74it/s, NLL=3.01, epoch=10] 

[05.01.19 18:12:22] Evaluating at step 1251218...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:51, 114.46it/s][A
  1%|          | 64/5951 [00:00<00:51, 114.46it/s, NLL=4][A
  2%|▏         | 128/5951 [00:00<00:50, 114.46it/s, NLL=3.37][A
  3%|▎         | 192/5951 [00:00<00:37, 154.76it/s, NLL=3.37][A
  3%|▎         | 192/5951 [00:00<00:37, 154.76it/s, NLL=3.21][A
  4%|▍         | 256/5951 [00:00<00:36, 154.76it/s, NLL=3.25][A
  5%|▌         | 320/5951 [00:00<00:26, 209.33it/s, NLL=3.25][A
  5%|▌         | 320/5951 [00:00<00:26, 209.33it/s, NLL=3.28][A
  6%|▋         | 384/5951 [00:00<00:26, 209.33it/s, NLL=3.24][A
  8%|▊         | 448/5951 [00:00<00:20, 272.27it/s, NLL=3.24][A
  8%|▊         | 448/5951 [00:00<00:20, 272.27it/s, NLL=3.17][A
  9%|▊         | 512/5951 [00:01<00:19, 272.27it/s, NLL=3.11][A
 10%|▉         | 576/5951 [00:01<00:15, 354.17it/s, NLL=3.11][A
 10%|▉         | 576/5951 [00:01<00:15, 354.17it/s, NLL=3.14][A
 11%|█         | 640/5951 [00:01<00:14, 354.17it/s, NLL=3.06

 90%|█████████ | 5376/5951 [00:06<00:00, 830.60it/s, NLL=2.96][A
 91%|█████████▏| 5440/5951 [00:06<00:00, 719.08it/s, NLL=2.96][A
 91%|█████████▏| 5440/5951 [00:06<00:00, 719.08it/s, NLL=2.97][A
 92%|█████████▏| 5504/5951 [00:06<00:00, 719.08it/s, NLL=2.98][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 772.03it/s, NLL=2.98][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 772.03it/s, NLL=3]   [A
 95%|█████████▍| 5632/5951 [00:06<00:00, 772.03it/s, NLL=3][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 794.52it/s, NLL=3][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 794.52it/s, NLL=3.01][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 794.52it/s, NLL=3.01][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 805.45it/s, NLL=3.01][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 805.45it/s, NLL=3.01][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 805.45it/s, NLL=3.02][A
100%|██████████| 5951/5951 [00:06<00:00, 885.73it/s, NLL=3.02][A
 63%|██████▎   | 81920/129922 [08:38<05:00, 159.74it/s, NLL=3.01, epoch=10]

[05.01.19 18:12:31] Saved checkpoint: ./save/train/train-58/step_1251218.pth.tar
[05.01.19 18:12:31] Removed checkpoint: ./save/train/train-58/step_1000974.pth.tar
[05.01.19 18:12:31] Dev NLL: 03.03, F1: 58.23, EM: 55.27, AvNA: 64.76
[05.01.19 18:12:31] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:32<00:00, 159.85it/s, NLL=5.4, epoch=10] 
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 18:17:26] Starting epoch 11...


  2%|▏         | 2048/129922 [00:13<12:57, 164.42it/s, NLL=3.15, epoch=11]

[05.01.19 18:17:39] Evaluating at step 1301268...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:50, 115.80it/s][A
  1%|          | 64/5951 [00:00<00:50, 115.80it/s, NLL=4.17][A
  2%|▏         | 128/5951 [00:00<00:50, 115.80it/s, NLL=3.55][A
  3%|▎         | 192/5951 [00:00<00:36, 157.81it/s, NLL=3.55][A
  3%|▎         | 192/5951 [00:00<00:36, 157.81it/s, NLL=3.45][A
  4%|▍         | 256/5951 [00:00<00:36, 157.81it/s, NLL=3.43][A
  5%|▌         | 320/5951 [00:00<00:35, 157.81it/s, NLL=3.42][A
  6%|▋         | 384/5951 [00:00<00:25, 214.75it/s, NLL=3.42][A
  6%|▋         | 384/5951 [00:00<00:25, 214.75it/s, NLL=3.36][A
  8%|▊         | 448/5951 [00:00<00:25, 214.75it/s, NLL=3.28][A
  9%|▊         | 512/5951 [00:00<00:25, 214.75it/s, NLL=3.19][A
 10%|▉         | 576/5951 [00:00<00:18, 284.77it/s, NLL=3.19][A
 10%|▉         | 576/5951 [00:00<00:18, 284.77it/s, NLL=3.21][A
 11%|█         | 640/5951 [00:01<00:18, 284.77it/s, NLL=3.13][A
 12%|█▏        | 704/5951 [00:01<00:14, 366.23it/s, NLL=3

 91%|█████████▏| 5440/5951 [00:05<00:00, 1095.82it/s, NLL=2.97][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1095.82it/s, NLL=2.99][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1153.33it/s, NLL=2.99][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1153.33it/s, NLL=3.01][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1153.33it/s, NLL=3]   [A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1051.34it/s, NLL=3][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1051.34it/s, NLL=3.01][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1051.34it/s, NLL=3.01][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 914.70it/s, NLL=3.01] [A
 98%|█████████▊| 5824/5951 [00:06<00:00, 914.70it/s, NLL=3.01][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 914.70it/s, NLL=3.02][A
100%|██████████| 5951/5951 [00:06<00:00, 914.70it/s, NLL=3.03][A
  2%|▏         | 2048/129922 [00:21<12:57, 164.42it/s, NLL=3.15, epoch=11]

[05.01.19 18:17:48] Saved checkpoint: ./save/train/train-58/step_1301268.pth.tar
[05.01.19 18:17:48] Removed checkpoint: ./save/train/train-58/step_1051024.pth.tar
[05.01.19 18:17:48] Dev NLL: 03.03, F1: 58.29, EM: 55.34, AvNA: 64.68
[05.01.19 18:17:48] Visualizing in TensorBoard...


 40%|████      | 52096/129922 [05:25<07:47, 166.62it/s, NLL=3.82, epoch=11]

[05.01.19 18:22:52] Evaluating at step 1351316...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:50, 116.25it/s][A
  1%|          | 64/5951 [00:00<00:50, 116.25it/s, NLL=4.18][A
  2%|▏         | 128/5951 [00:00<00:50, 116.25it/s, NLL=3.47][A
  3%|▎         | 192/5951 [00:00<00:36, 158.44it/s, NLL=3.47][A
  3%|▎         | 192/5951 [00:00<00:36, 158.44it/s, NLL=3.35][A
  4%|▍         | 256/5951 [00:00<00:35, 158.44it/s, NLL=3.37][A
  5%|▌         | 320/5951 [00:00<00:35, 158.44it/s, NLL=3.39][A
  6%|▋         | 384/5951 [00:00<00:25, 215.73it/s, NLL=3.39][A
  6%|▋         | 384/5951 [00:00<00:25, 215.73it/s, NLL=3.35][A
  8%|▊         | 448/5951 [00:00<00:25, 215.73it/s, NLL=3.28][A
  9%|▊         | 512/5951 [00:00<00:18, 287.33it/s, NLL=3.28][A
  9%|▊         | 512/5951 [00:00<00:18, 287.33it/s, NLL=3.19][A
 10%|▉         | 576/5951 [00:00<00:18, 287.33it/s, NLL=3.22][A
 11%|█         | 640/5951 [00:01<00:14, 364.98it/s, NLL=3.22][A
 11%|█         | 640/5951 [00:01<00:14, 364.98it/s, NLL=3

 91%|█████████▏| 5440/5951 [00:05<00:00, 1132.49it/s, NLL=2.98][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1090.19it/s, NLL=2.98][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1090.19it/s, NLL=2.99][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1090.19it/s, NLL=3.01][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1080.97it/s, NLL=3.01][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1080.97it/s, NLL=3.01][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1080.97it/s, NLL=3.02][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 925.01it/s, NLL=3.02] [A
 97%|█████████▋| 5760/5951 [00:05<00:00, 925.01it/s, NLL=3.02][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 925.01it/s, NLL=3.02][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 868.17it/s, NLL=3.02][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 868.17it/s, NLL=3.02][A
100%|██████████| 5951/5951 [00:06<00:00, 868.17it/s, NLL=3.03][A
 40%|████      | 52096/129922 [05:33<07:47, 166.62it/s, NLL=3.82, epoch=11]

[05.01.19 18:23:00] Saved checkpoint: ./save/train/train-58/step_1351316.pth.tar
[05.01.19 18:23:00] Removed checkpoint: ./save/train/train-58/step_1101072.pth.tar
[05.01.19 18:23:00] Dev NLL: 03.03, F1: 58.43, EM: 55.54, AvNA: 64.71
[05.01.19 18:23:00] Visualizing in TensorBoard...


 79%|███████▊  | 102144/129922 [10:41<02:41, 172.31it/s, NLL=3.17, epoch=11]

[05.01.19 18:28:07] Evaluating at step 1401364...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:43, 136.03it/s][A
  1%|          | 64/5951 [00:00<00:43, 136.03it/s, NLL=4.05][A
  2%|▏         | 128/5951 [00:00<00:42, 136.03it/s, NLL=3.25][A
  3%|▎         | 192/5951 [00:00<00:31, 184.18it/s, NLL=3.25][A
  3%|▎         | 192/5951 [00:00<00:31, 184.18it/s, NLL=3.15][A
  4%|▍         | 256/5951 [00:00<00:30, 184.18it/s, NLL=3.22][A
  5%|▌         | 320/5951 [00:00<00:30, 184.18it/s, NLL=3.28][A
  6%|▋         | 384/5951 [00:00<00:22, 247.73it/s, NLL=3.28][A
  6%|▋         | 384/5951 [00:00<00:22, 247.73it/s, NLL=3.27][A
  8%|▊         | 448/5951 [00:00<00:22, 247.73it/s, NLL=3.2] [A
  9%|▊         | 512/5951 [00:00<00:21, 247.73it/s, NLL=3.13][A
 10%|▉         | 576/5951 [00:00<00:16, 325.27it/s, NLL=3.13][A
 10%|▉         | 576/5951 [00:00<00:16, 325.27it/s, NLL=3.16][A
 11%|█         | 640/5951 [00:00<00:16, 325.27it/s, NLL=3.08][A
 12%|█▏        | 704/5951 [00:01<00:12, 411.77it/s, NLL=3

 94%|█████████▎| 5568/5951 [00:05<00:00, 1181.21it/s, NLL=2.99][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1181.21it/s, NLL=2.99][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1164.63it/s, NLL=2.99][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1164.63it/s, NLL=3]   [A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1164.63it/s, NLL=3][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1089.24it/s, NLL=3][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1089.24it/s, NLL=3][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1089.24it/s, NLL=3.01][A
100%|██████████| 5951/5951 [00:05<00:00, 1089.24it/s, NLL=3.02][A
 79%|███████▊  | 102144/129922 [10:48<02:41, 172.31it/s, NLL=3.17, epoch=11]

[05.01.19 18:28:15] Saved checkpoint: ./save/train/train-58/step_1401364.pth.tar
[05.01.19 18:28:15] Removed checkpoint: ./save/train/train-58/step_1151120.pth.tar
[05.01.19 18:28:15] Dev NLL: 03.02, F1: 58.49, EM: 55.57, AvNA: 64.64
[05.01.19 18:28:15] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:38<00:00, 158.80it/s, NLL=3.38, epoch=11]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 18:31:04] Starting epoch 12...


 17%|█▋        | 22272/129922 [02:15<10:18, 173.98it/s, NLL=2.77, epoch=12]

[05.01.19 18:33:20] Evaluating at step 1451414...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:44, 133.55it/s][A
  1%|          | 64/5951 [00:00<00:44, 133.55it/s, NLL=4.68][A
  2%|▏         | 128/5951 [00:00<00:43, 133.55it/s, NLL=3.65][A
  3%|▎         | 192/5951 [00:00<00:31, 181.97it/s, NLL=3.65][A
  3%|▎         | 192/5951 [00:00<00:31, 181.97it/s, NLL=3.49][A
  4%|▍         | 256/5951 [00:00<00:31, 181.97it/s, NLL=3.45][A
  5%|▌         | 320/5951 [00:00<00:30, 181.97it/s, NLL=3.44][A
  6%|▋         | 384/5951 [00:00<00:22, 246.79it/s, NLL=3.44][A
  6%|▋         | 384/5951 [00:00<00:22, 246.79it/s, NLL=3.41][A
  8%|▊         | 448/5951 [00:00<00:22, 246.79it/s, NLL=3.32][A
  9%|▊         | 512/5951 [00:00<00:22, 246.79it/s, NLL=3.23][A
 10%|▉         | 576/5951 [00:00<00:16, 325.41it/s, NLL=3.23][A
 10%|▉         | 576/5951 [00:00<00:16, 325.41it/s, NLL=3.25][A
 11%|█         | 640/5951 [00:00<00:16, 325.41it/s, NLL=3.15][A
 12%|█▏        | 704/5951 [00:00<00:12, 413.22it/s, NLL=3

 94%|█████████▎| 5568/5951 [00:05<00:00, 1228.56it/s, NLL=2.97][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1228.56it/s, NLL=2.99][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1228.56it/s, NLL=2.99][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1203.45it/s, NLL=2.99][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1203.45it/s, NLL=2.99][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1203.45it/s, NLL=2.99][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1134.76it/s, NLL=2.99][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1134.76it/s, NLL=3]   [A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1134.76it/s, NLL=3][A
100%|██████████| 5951/5951 [00:05<00:00, 1134.76it/s, NLL=3.01][A
 17%|█▋        | 22272/129922 [02:23<10:18, 173.98it/s, NLL=2.77, epoch=12]

[05.01.19 18:33:27] Saved checkpoint: ./save/train/train-58/step_1451414.pth.tar


 17%|█▋        | 22272/129922 [02:23<10:18, 173.98it/s, NLL=2.77, epoch=12]

[05.01.19 18:33:28] New best checkpoint at step 1451414...
[05.01.19 18:33:28] Removed checkpoint: ./save/train/train-58/step_1251218.pth.tar
[05.01.19 18:33:28] Dev NLL: 03.01, F1: 58.96, EM: 55.94, AvNA: 65.20
[05.01.19 18:33:28] Visualizing in TensorBoard...


 56%|█████▌    | 72320/129922 [07:29<05:59, 160.02it/s, NLL=2.54, epoch=12] 

[05.01.19 18:38:34] Evaluating at step 1501462...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:50, 115.73it/s][A
  1%|          | 64/5951 [00:00<00:50, 115.73it/s, NLL=4.78][A
  2%|▏         | 128/5951 [00:00<00:50, 115.73it/s, NLL=3.77][A
  3%|▎         | 192/5951 [00:00<00:36, 156.54it/s, NLL=3.77][A
  3%|▎         | 192/5951 [00:00<00:36, 156.54it/s, NLL=3.67][A
  4%|▍         | 256/5951 [00:00<00:36, 156.54it/s, NLL=3.62][A
  5%|▌         | 320/5951 [00:00<00:27, 206.47it/s, NLL=3.62][A
  5%|▌         | 320/5951 [00:00<00:27, 206.47it/s, NLL=3.56][A
  6%|▋         | 384/5951 [00:00<00:26, 206.47it/s, NLL=3.51][A
  8%|▊         | 448/5951 [00:00<00:20, 270.33it/s, NLL=3.51][A
  8%|▊         | 448/5951 [00:00<00:20, 270.33it/s, NLL=3.41][A
  9%|▊         | 512/5951 [00:01<00:20, 270.33it/s, NLL=3.3] [A
 10%|▉         | 576/5951 [00:01<00:15, 350.13it/s, NLL=3.3][A
 10%|▉         | 576/5951 [00:01<00:15, 350.13it/s, NLL=3.32][A
 11%|█         | 640/5951 [00:01<00:15, 350.13it/s, NLL=3.

 91%|█████████▏| 5440/5951 [00:05<00:00, 948.47it/s, NLL=2.95][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 889.70it/s, NLL=2.95][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 889.70it/s, NLL=2.96][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 889.70it/s, NLL=2.98][A
 95%|█████████▍| 5632/5951 [00:06<00:00, 945.81it/s, NLL=2.98][A
 95%|█████████▍| 5632/5951 [00:06<00:00, 945.81it/s, NLL=2.98][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 945.81it/s, NLL=2.99][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 958.25it/s, NLL=2.99][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 958.25it/s, NLL=2.99][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 958.25it/s, NLL=2.99][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 998.81it/s, NLL=2.99][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 998.81it/s, NLL=2.99][A
100%|██████████| 5951/5951 [00:06<00:00, 998.81it/s, NLL=3.01][A
 56%|█████▌    | 72320/129922 [07:38<05:59, 160.02it/s, NLL=2.54, epoch=12]

[05.01.19 18:38:43] Saved checkpoint: ./save/train/train-58/step_1501462.pth.tar
[05.01.19 18:38:43] Removed checkpoint: ./save/train/train-58/step_1301268.pth.tar
[05.01.19 18:38:43] Dev NLL: 03.01, F1: 58.88, EM: 55.87, AvNA: 65.25
[05.01.19 18:38:43] Visualizing in TensorBoard...


 94%|█████████▍| 122368/129922 [12:46<00:44, 170.87it/s, NLL=2.93, epoch=12]

[05.01.19 18:43:50] Evaluating at step 1551510...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:43, 134.17it/s][A
  1%|          | 64/5951 [00:00<00:43, 134.17it/s, NLL=4.61][A
  2%|▏         | 128/5951 [00:00<00:43, 134.17it/s, NLL=3.68][A
  3%|▎         | 192/5951 [00:00<00:31, 182.87it/s, NLL=3.68][A
  3%|▎         | 192/5951 [00:00<00:31, 182.87it/s, NLL=3.57][A
  4%|▍         | 256/5951 [00:00<00:31, 182.87it/s, NLL=3.55][A
  5%|▌         | 320/5951 [00:00<00:30, 182.87it/s, NLL=3.49][A
  6%|▋         | 384/5951 [00:00<00:22, 247.85it/s, NLL=3.49][A
  6%|▋         | 384/5951 [00:00<00:22, 247.85it/s, NLL=3.45][A
  8%|▊         | 448/5951 [00:00<00:22, 247.85it/s, NLL=3.35][A
  9%|▊         | 512/5951 [00:00<00:21, 247.85it/s, NLL=3.25][A
 10%|▉         | 576/5951 [00:00<00:16, 326.62it/s, NLL=3.25][A
 10%|▉         | 576/5951 [00:00<00:16, 326.62it/s, NLL=3.27][A
 11%|█         | 640/5951 [00:00<00:16, 326.62it/s, NLL=3.17][A
 12%|█▏        | 704/5951 [00:00<00:12, 418.18it/s, NLL=3

 95%|█████████▍| 5632/5951 [00:05<00:00, 1193.81it/s, NLL=2.94][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1176.22it/s, NLL=2.94][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1176.22it/s, NLL=2.95][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1176.22it/s, NLL=2.95][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1120.45it/s, NLL=2.95][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1120.45it/s, NLL=2.95][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1120.45it/s, NLL=2.96][A
100%|██████████| 5951/5951 [00:05<00:00, 1120.45it/s, NLL=2.97][A
 94%|█████████▍| 122368/129922 [12:53<00:44, 170.87it/s, NLL=2.93, epoch=12]

[05.01.19 18:43:58] Saved checkpoint: ./save/train/train-58/step_1551510.pth.tar


 94%|█████████▍| 122368/129922 [12:54<00:44, 170.87it/s, NLL=2.93, epoch=12]

[05.01.19 18:43:58] New best checkpoint at step 1551510...
[05.01.19 18:43:58] Removed checkpoint: ./save/train/train-58/step_1351316.pth.tar
[05.01.19 18:43:58] Dev NLL: 02.97, F1: 59.09, EM: 56.06, AvNA: 65.48
[05.01.19 18:43:58] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:40<00:00, 158.44it/s, NLL=1.29, epoch=12]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 18:44:44] Starting epoch 13...


 33%|███▎      | 42496/129922 [04:15<08:53, 164.01it/s, NLL=3.13, epoch=13]

[05.01.19 18:49:00] Evaluating at step 1601560...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:47, 123.07it/s][A
  1%|          | 64/5951 [00:00<00:47, 123.07it/s, NLL=4.38][A
  2%|▏         | 128/5951 [00:00<00:47, 123.07it/s, NLL=3.48][A
  3%|▎         | 192/5951 [00:00<00:34, 167.95it/s, NLL=3.48][A
  3%|▎         | 192/5951 [00:00<00:34, 167.95it/s, NLL=3.42][A
  4%|▍         | 256/5951 [00:00<00:33, 167.95it/s, NLL=3.42][A
  5%|▌         | 320/5951 [00:00<00:33, 167.95it/s, NLL=3.41][A
  6%|▋         | 384/5951 [00:00<00:24, 228.61it/s, NLL=3.41][A
  6%|▋         | 384/5951 [00:00<00:24, 228.61it/s, NLL=3.39][A
  8%|▊         | 448/5951 [00:00<00:24, 228.61it/s, NLL=3.3] [A
  9%|▊         | 512/5951 [00:00<00:23, 228.61it/s, NLL=3.27][A
 10%|▉         | 576/5951 [00:00<00:17, 302.62it/s, NLL=3.27][A
 10%|▉         | 576/5951 [00:00<00:17, 302.62it/s, NLL=3.29][A
 11%|█         | 640/5951 [00:00<00:17, 302.62it/s, NLL=3.19][A
 12%|█▏        | 704/5951 [00:01<00:13, 386.63it/s, NLL=3

 94%|█████████▎| 5568/5951 [00:05<00:00, 1207.09it/s, NLL=2.97][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1182.39it/s, NLL=2.97][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1182.39it/s, NLL=2.97][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1182.39it/s, NLL=2.98][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1128.49it/s, NLL=2.98][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1128.49it/s, NLL=2.98][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1128.49it/s, NLL=2.98][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1155.03it/s, NLL=2.98][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1155.03it/s, NLL=2.98][A
100%|██████████| 5951/5951 [00:05<00:00, 1155.03it/s, NLL=2.99][A
 33%|███▎      | 42496/129922 [04:23<08:53, 164.01it/s, NLL=3.13, epoch=13]

[05.01.19 18:49:07] Saved checkpoint: ./save/train/train-58/step_1601560.pth.tar


 33%|███▎      | 42496/129922 [04:23<08:53, 164.01it/s, NLL=3.13, epoch=13]

[05.01.19 18:49:08] New best checkpoint at step 1601560...
[05.01.19 18:49:08] Removed checkpoint: ./save/train/train-58/step_1401364.pth.tar
[05.01.19 18:49:08] Dev NLL: 02.99, F1: 59.34, EM: 56.24, AvNA: 65.55
[05.01.19 18:49:08] Visualizing in TensorBoard...


 71%|███████   | 92544/129922 [09:26<04:02, 153.90it/s, NLL=2.55, epoch=13] 

[05.01.19 18:54:11] Evaluating at step 1651608...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:41, 140.38it/s][A
  1%|          | 64/5951 [00:00<00:41, 140.38it/s, NLL=4.49][A
  2%|▏         | 128/5951 [00:00<00:41, 140.38it/s, NLL=3.48][A
  3%|▎         | 192/5951 [00:00<00:30, 190.69it/s, NLL=3.48][A
  3%|▎         | 192/5951 [00:00<00:30, 190.69it/s, NLL=3.49][A
  4%|▍         | 256/5951 [00:00<00:29, 190.69it/s, NLL=3.48][A
  5%|▌         | 320/5951 [00:00<00:29, 190.69it/s, NLL=3.46][A
  6%|▋         | 384/5951 [00:00<00:21, 257.40it/s, NLL=3.46][A
  6%|▋         | 384/5951 [00:00<00:21, 257.40it/s, NLL=3.42][A
  8%|▊         | 448/5951 [00:00<00:21, 257.40it/s, NLL=3.34][A
  9%|▊         | 512/5951 [00:00<00:21, 257.40it/s, NLL=3.35][A
 10%|▉         | 576/5951 [00:00<00:15, 339.13it/s, NLL=3.35][A
 10%|▉         | 576/5951 [00:00<00:15, 339.13it/s, NLL=3.37][A
 11%|█         | 640/5951 [00:00<00:15, 339.13it/s, NLL=3.26][A
 12%|█▏        | 704/5951 [00:00<00:12, 429.91it/s, NLL=3

 94%|█████████▎| 5568/5951 [00:05<00:00, 1187.23it/s, NLL=2.97][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1187.23it/s, NLL=2.97][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1197.98it/s, NLL=2.97][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1197.98it/s, NLL=2.98][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1197.98it/s, NLL=2.98][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1139.99it/s, NLL=2.98][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1139.99it/s, NLL=2.98][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1139.99it/s, NLL=2.98][A
100%|██████████| 5951/5951 [00:05<00:00, 1139.99it/s, NLL=2.99][A
 71%|███████   | 92544/129922 [09:34<04:02, 153.90it/s, NLL=2.55, epoch=13]

[05.01.19 18:54:18] Saved checkpoint: ./save/train/train-58/step_1651608.pth.tar
[05.01.19 18:54:18] Removed checkpoint: ./save/train/train-58/step_1201170.pth.tar
[05.01.19 18:54:18] Dev NLL: 02.99, F1: 59.26, EM: 56.26, AvNA: 65.65
[05.01.19 18:54:18] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:21<00:00, 162.06it/s, NLL=2.95, epoch=13]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 18:58:06] Starting epoch 14...


 10%|▉         | 12672/129922 [01:17<11:58, 163.23it/s, NLL=2.46, epoch=14]

[05.01.19 18:59:23] Evaluating at step 1701658...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:44, 132.52it/s][A
  1%|          | 64/5951 [00:00<00:44, 132.52it/s, NLL=4.76][A
  2%|▏         | 128/5951 [00:00<00:43, 132.52it/s, NLL=3.63][A
  3%|▎         | 192/5951 [00:00<00:32, 176.20it/s, NLL=3.63][A
  3%|▎         | 192/5951 [00:00<00:32, 176.20it/s, NLL=3.66][A
  4%|▍         | 256/5951 [00:00<00:32, 176.20it/s, NLL=3.6] [A
  5%|▌         | 320/5951 [00:00<00:31, 176.20it/s, NLL=3.56][A
  6%|▋         | 384/5951 [00:00<00:23, 238.55it/s, NLL=3.56][A
  6%|▋         | 384/5951 [00:00<00:23, 238.55it/s, NLL=3.48][A
  8%|▊         | 448/5951 [00:00<00:23, 238.55it/s, NLL=3.39][A
  9%|▊         | 512/5951 [00:00<00:22, 238.55it/s, NLL=3.35][A
 10%|▉         | 576/5951 [00:00<00:17, 314.95it/s, NLL=3.35][A
 10%|▉         | 576/5951 [00:00<00:17, 314.95it/s, NLL=3.37][A
 11%|█         | 640/5951 [00:01<00:16, 314.95it/s, NLL=3.26][A
 12%|█▏        | 704/5951 [00:01<00:13, 402.00it/s, NLL=3

 94%|█████████▎| 5568/5951 [00:05<00:00, 1164.13it/s, NLL=2.95][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1164.13it/s, NLL=2.97][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1164.13it/s, NLL=2.97][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1169.50it/s, NLL=2.97][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1169.50it/s, NLL=2.98][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1169.50it/s, NLL=2.98][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1110.52it/s, NLL=2.98][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1110.52it/s, NLL=2.98][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1110.52it/s, NLL=2.98][A
100%|██████████| 5951/5951 [00:05<00:00, 1086.44it/s, NLL=2.98][A
 10%|▉         | 12672/129922 [01:24<11:58, 163.23it/s, NLL=2.46, epoch=14]

[05.01.19 18:59:31] Saved checkpoint: ./save/train/train-58/step_1701658.pth.tar
[05.01.19 18:59:31] Removed checkpoint: ./save/train/train-58/step_1501462.pth.tar
[05.01.19 18:59:31] Dev NLL: 02.99, F1: 59.00, EM: 56.07, AvNA: 65.54
[05.01.19 18:59:31] Visualizing in TensorBoard...


 48%|████▊     | 62720/129922 [06:36<06:44, 166.21it/s, NLL=2.98, epoch=14] 

[05.01.19 19:04:42] Evaluating at step 1751706...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:48, 120.38it/s][A
  1%|          | 64/5951 [00:00<00:48, 120.38it/s, NLL=4.34][A
  2%|▏         | 128/5951 [00:00<00:48, 120.38it/s, NLL=3.36][A
  3%|▎         | 192/5951 [00:00<00:35, 164.45it/s, NLL=3.36][A
  3%|▎         | 192/5951 [00:00<00:35, 164.45it/s, NLL=3.49][A
  4%|▍         | 256/5951 [00:00<00:34, 164.45it/s, NLL=3.46][A
  5%|▌         | 320/5951 [00:00<00:26, 214.59it/s, NLL=3.46][A
  5%|▌         | 320/5951 [00:00<00:26, 214.59it/s, NLL=3.45][A
  6%|▋         | 384/5951 [00:00<00:25, 214.59it/s, NLL=3.39][A
  8%|▊         | 448/5951 [00:00<00:19, 285.76it/s, NLL=3.39][A
  8%|▊         | 448/5951 [00:00<00:19, 285.76it/s, NLL=3.3] [A
  9%|▊         | 512/5951 [00:00<00:19, 285.76it/s, NLL=3.26][A
 10%|▉         | 576/5951 [00:01<00:14, 370.01it/s, NLL=3.26][A
 10%|▉         | 576/5951 [00:01<00:14, 370.01it/s, NLL=3.28][A
 11%|█         | 640/5951 [00:01<00:14, 370.01it/s, NLL=3

 90%|█████████ | 5376/5951 [00:05<00:00, 1003.48it/s, NLL=2.92][A
 91%|█████████▏| 5440/5951 [00:05<00:00, 1072.74it/s, NLL=2.92][A
 91%|█████████▏| 5440/5951 [00:05<00:00, 1072.74it/s, NLL=2.93][A
 92%|█████████▏| 5504/5951 [00:06<00:00, 1072.74it/s, NLL=2.94][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 1105.32it/s, NLL=2.94][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 1105.32it/s, NLL=2.96][A
 95%|█████████▍| 5632/5951 [00:06<00:00, 1105.32it/s, NLL=2.96][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 960.82it/s, NLL=2.96] [A
 96%|█████████▌| 5696/5951 [00:06<00:00, 960.82it/s, NLL=2.97][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 960.82it/s, NLL=2.97][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 943.94it/s, NLL=2.97][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 943.94it/s, NLL=2.97][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 943.94it/s, NLL=2.98][A
100%|██████████| 5951/5951 [00:06<00:00, 979.93it/s, NLL=2.98][A
 48%|████▊     | 62720/129922 [06:44<06:44, 166.21it/s, NLL=2.98, ep

[05.01.19 19:04:51] Saved checkpoint: ./save/train/train-58/step_1751706.pth.tar


 48%|████▊     | 62720/129922 [06:45<06:44, 166.21it/s, NLL=2.98, epoch=14]

[05.01.19 19:04:52] New best checkpoint at step 1751706...
[05.01.19 19:04:52] Removed checkpoint: ./save/train/train-58/step_1451414.pth.tar
[05.01.19 19:04:52] Dev NLL: 02.99, F1: 59.44, EM: 56.36, AvNA: 65.80
[05.01.19 19:04:52] Visualizing in TensorBoard...


 87%|████████▋ | 112768/129922 [11:51<01:48, 157.49it/s, NLL=1.73, epoch=14]

[05.01.19 19:09:57] Evaluating at step 1801754...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:49, 119.50it/s][A
  1%|          | 64/5951 [00:00<00:49, 119.50it/s, NLL=4.4][A
  2%|▏         | 128/5951 [00:00<00:48, 119.50it/s, NLL=3.39][A
  3%|▎         | 192/5951 [00:00<00:35, 162.94it/s, NLL=3.39][A
  3%|▎         | 192/5951 [00:00<00:35, 162.94it/s, NLL=3.47][A
  4%|▍         | 256/5951 [00:00<00:34, 162.94it/s, NLL=3.45][A
  5%|▌         | 320/5951 [00:00<00:25, 219.35it/s, NLL=3.45][A
  5%|▌         | 320/5951 [00:00<00:25, 219.35it/s, NLL=3.42][A
  6%|▋         | 384/5951 [00:00<00:25, 219.35it/s, NLL=3.36][A
  8%|▊         | 448/5951 [00:00<00:25, 219.35it/s, NLL=3.27][A
  9%|▊         | 512/5951 [00:00<00:18, 292.38it/s, NLL=3.27][A
  9%|▊         | 512/5951 [00:00<00:18, 292.38it/s, NLL=3.29][A
 10%|▉         | 576/5951 [00:00<00:18, 292.38it/s, NLL=3.31][A
 11%|█         | 640/5951 [00:01<00:14, 373.49it/s, NLL=3.31][A
 11%|█         | 640/5951 [00:01<00:14, 373.49it/s, NLL=3.

 91%|█████████▏| 5440/5951 [00:05<00:00, 1054.87it/s, NLL=2.9][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1085.07it/s, NLL=2.9][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1085.07it/s, NLL=2.92][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1085.07it/s, NLL=2.94][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1084.14it/s, NLL=2.94][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1084.14it/s, NLL=2.93][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1084.14it/s, NLL=2.94][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1020.17it/s, NLL=2.94][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1020.17it/s, NLL=2.94][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1020.17it/s, NLL=2.95][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1051.31it/s, NLL=2.95][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1051.31it/s, NLL=2.95][A
100%|██████████| 5951/5951 [00:05<00:00, 1051.31it/s, NLL=2.96][A
 87%|████████▋ | 112768/129922 [11:59<01:48, 157.49it/s, NLL=1.73, epoch=14]

[05.01.19 19:10:05] Saved checkpoint: ./save/train/train-58/step_1801754.pth.tar


 87%|████████▋ | 112768/129922 [12:00<01:48, 157.49it/s, NLL=1.73, epoch=14]

[05.01.19 19:10:06] New best checkpoint at step 1801754...
[05.01.19 19:10:06] Removed checkpoint: ./save/train/train-58/step_1701658.pth.tar
[05.01.19 19:10:06] Dev NLL: 02.96, F1: 59.90, EM: 56.95, AvNA: 66.26
[05.01.19 19:10:06] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:44<00:00, 157.51it/s, NLL=0.997, epoch=14]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 19:11:51] Starting epoch 15...


 25%|██▌       | 32896/129922 [03:21<10:47, 149.75it/s, NLL=2.19, epoch=15]

[05.01.19 19:15:13] Evaluating at step 1851804...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:54, 107.93it/s][A
  1%|          | 64/5951 [00:00<00:54, 107.93it/s, NLL=4.28][A
  2%|▏         | 128/5951 [00:00<00:53, 107.93it/s, NLL=3.38][A
  3%|▎         | 192/5951 [00:00<00:39, 147.61it/s, NLL=3.38][A
  3%|▎         | 192/5951 [00:00<00:39, 147.61it/s, NLL=3.45][A
  4%|▍         | 256/5951 [00:00<00:38, 147.61it/s, NLL=3.42][A
  5%|▌         | 320/5951 [00:00<00:38, 147.61it/s, NLL=3.4] [A
  6%|▋         | 384/5951 [00:00<00:27, 201.27it/s, NLL=3.4][A
  6%|▋         | 384/5951 [00:00<00:27, 201.27it/s, NLL=3.34][A
  8%|▊         | 448/5951 [00:00<00:27, 201.27it/s, NLL=3.26][A
  9%|▊         | 512/5951 [00:00<00:20, 269.03it/s, NLL=3.26][A
  9%|▊         | 512/5951 [00:00<00:20, 269.03it/s, NLL=3.25][A
 10%|▉         | 576/5951 [00:01<00:19, 269.03it/s, NLL=3.26][A
 11%|█         | 640/5951 [00:01<00:15, 346.13it/s, NLL=3.26][A
 11%|█         | 640/5951 [00:01<00:15, 346.13it/s, NLL=3.

 90%|█████████ | 5376/5951 [00:05<00:00, 985.54it/s, NLL=2.89][A
 91%|█████████▏| 5440/5951 [00:05<00:00, 985.54it/s, NLL=2.9] [A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1055.72it/s, NLL=2.9][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1055.72it/s, NLL=2.91][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1055.72it/s, NLL=2.94][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1067.88it/s, NLL=2.94][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1067.88it/s, NLL=2.93][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 1067.88it/s, NLL=2.94][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1058.60it/s, NLL=2.94][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1058.60it/s, NLL=2.94][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 1058.60it/s, NLL=2.95][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 893.31it/s, NLL=2.95] [A
 99%|█████████▉| 5888/5951 [00:06<00:00, 893.31it/s, NLL=2.95][A
100%|██████████| 5951/5951 [00:06<00:00, 893.31it/s, NLL=2.96][A
 25%|██▌       | 32896/129922 [03:30<10:47, 149.75it/s, NLL=2.19, e

[05.01.19 19:15:21] Saved checkpoint: ./save/train/train-58/step_1851804.pth.tar


 25%|██▌       | 32896/129922 [03:31<10:47, 149.75it/s, NLL=2.19, epoch=15]

[05.01.19 19:15:22] New best checkpoint at step 1851804...
[05.01.19 19:15:22] Removed checkpoint: ./save/train/train-58/step_1551510.pth.tar
[05.01.19 19:15:22] Dev NLL: 02.96, F1: 60.19, EM: 57.20, AvNA: 66.49
[05.01.19 19:15:22] Visualizing in TensorBoard...


 64%|██████▍   | 82944/129922 [08:42<04:47, 163.25it/s, NLL=2.55, epoch=15] 

[05.01.19 19:20:33] Evaluating at step 1901852...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:49, 118.57it/s][A
  1%|          | 64/5951 [00:00<00:49, 118.57it/s, NLL=4.07][A
  2%|▏         | 128/5951 [00:00<00:49, 118.57it/s, NLL=3.23][A
  3%|▎         | 192/5951 [00:00<00:35, 161.46it/s, NLL=3.23][A
  3%|▎         | 192/5951 [00:00<00:35, 161.46it/s, NLL=3.35][A
  4%|▍         | 256/5951 [00:00<00:35, 161.46it/s, NLL=3.35][A
  5%|▌         | 320/5951 [00:00<00:25, 218.60it/s, NLL=3.35][A
  5%|▌         | 320/5951 [00:00<00:25, 218.60it/s, NLL=3.35][A
  6%|▋         | 384/5951 [00:00<00:25, 218.60it/s, NLL=3.3] [A
  8%|▊         | 448/5951 [00:00<00:25, 218.60it/s, NLL=3.23][A
  9%|▊         | 512/5951 [00:00<00:18, 291.46it/s, NLL=3.23][A
  9%|▊         | 512/5951 [00:00<00:18, 291.46it/s, NLL=3.16][A
 10%|▉         | 576/5951 [00:00<00:18, 291.46it/s, NLL=3.18][A
 11%|█         | 640/5951 [00:01<00:14, 367.21it/s, NLL=3.18][A
 11%|█         | 640/5951 [00:01<00:14, 367.21it/s, NLL=3

 91%|█████████▏| 5440/5951 [00:05<00:00, 1099.98it/s, NLL=2.91][A
 91%|█████████▏| 5440/5951 [00:05<00:00, 1099.98it/s, NLL=2.91][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1099.98it/s, NLL=2.93][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1112.85it/s, NLL=2.93][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1112.85it/s, NLL=2.96][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1112.85it/s, NLL=2.96][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 1103.06it/s, NLL=2.96][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 1103.06it/s, NLL=2.97][A
 97%|█████████▋| 5760/5951 [00:06<00:00, 1103.06it/s, NLL=2.97][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 968.68it/s, NLL=2.97] [A
 98%|█████████▊| 5824/5951 [00:06<00:00, 968.68it/s, NLL=2.97][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 968.68it/s, NLL=2.98][A
100%|██████████| 5951/5951 [00:06<00:00, 1035.41it/s, NLL=2.98][A
 64%|██████▍   | 82944/129922 [08:50<04:47, 163.25it/s, NLL=2.55, epoch=15]

[05.01.19 19:20:41] Saved checkpoint: ./save/train/train-58/step_1901852.pth.tar
[05.01.19 19:20:41] Removed checkpoint: ./save/train/train-58/step_1651608.pth.tar
[05.01.19 19:20:41] Dev NLL: 02.99, F1: 60.15, EM: 57.08, AvNA: 66.61
[05.01.19 19:20:41] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:41<00:00, 158.25it/s, NLL=2.97, epoch=15]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 19:25:32] Starting epoch 16...


  2%|▏         | 3072/129922 [00:19<12:28, 169.40it/s, NLL=3, epoch=16]   

[05.01.19 19:25:51] Evaluating at step 1951902...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:47, 123.03it/s][A
  1%|          | 64/5951 [00:00<00:47, 123.03it/s, NLL=3.8][A
  2%|▏         | 128/5951 [00:00<00:47, 123.03it/s, NLL=3.07][A
  3%|▎         | 192/5951 [00:00<00:34, 167.83it/s, NLL=3.07][A
  3%|▎         | 192/5951 [00:00<00:34, 167.83it/s, NLL=3.24][A
  4%|▍         | 256/5951 [00:00<00:33, 167.83it/s, NLL=3.27][A
  5%|▌         | 320/5951 [00:00<00:33, 167.83it/s, NLL=3.3] [A
  6%|▋         | 384/5951 [00:00<00:24, 226.64it/s, NLL=3.3][A
  6%|▋         | 384/5951 [00:00<00:24, 226.64it/s, NLL=3.26][A
  8%|▊         | 448/5951 [00:00<00:24, 226.64it/s, NLL=3.19][A
  9%|▊         | 512/5951 [00:00<00:18, 300.59it/s, NLL=3.19][A
  9%|▊         | 512/5951 [00:00<00:18, 300.59it/s, NLL=3.12][A
 10%|▉         | 576/5951 [00:00<00:17, 300.59it/s, NLL=3.14][A
 11%|█         | 640/5951 [00:01<00:13, 381.40it/s, NLL=3.14][A
 11%|█         | 640/5951 [00:01<00:13, 381.40it/s, NLL=3.0

 91%|█████████▏| 5440/5951 [00:05<00:00, 1039.64it/s, NLL=2.92][A
 91%|█████████▏| 5440/5951 [00:05<00:00, 1039.64it/s, NLL=2.92][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1039.64it/s, NLL=2.94][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1053.63it/s, NLL=2.94][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1053.63it/s, NLL=2.97][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1053.63it/s, NLL=2.97][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1022.63it/s, NLL=2.97][A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1022.63it/s, NLL=2.98][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1022.63it/s, NLL=2.98][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 993.91it/s, NLL=2.98] [A
 98%|█████████▊| 5824/5951 [00:05<00:00, 993.91it/s, NLL=2.98][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 993.91it/s, NLL=2.98][A
100%|██████████| 5951/5951 [00:06<00:00, 993.91it/s, NLL=2.99][A
  2%|▏         | 3072/129922 [00:27<12:28, 169.40it/s, NLL=3, epoch=16]

[05.01.19 19:25:59] Saved checkpoint: ./save/train/train-58/step_1951902.pth.tar
[05.01.19 19:25:59] Removed checkpoint: ./save/train/train-58/step_1601560.pth.tar
[05.01.19 19:25:59] Dev NLL: 02.99, F1: 59.74, EM: 56.70, AvNA: 66.41
[05.01.19 19:25:59] Visualizing in TensorBoard...


 41%|████      | 53120/129922 [05:37<07:54, 161.87it/s, NLL=2.83, epoch=16]

[05.01.19 19:31:10] Evaluating at step 2001950...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:53, 110.92it/s][A
  1%|          | 64/5951 [00:00<00:53, 110.92it/s, NLL=3.61][A
  2%|▏         | 128/5951 [00:00<00:52, 110.92it/s, NLL=2.97][A
  3%|▎         | 192/5951 [00:00<00:38, 150.09it/s, NLL=2.97][A
  3%|▎         | 192/5951 [00:00<00:38, 150.09it/s, NLL=3.12][A
  4%|▍         | 256/5951 [00:00<00:37, 150.09it/s, NLL=3.19][A
  5%|▌         | 320/5951 [00:00<00:28, 199.93it/s, NLL=3.19][A
  5%|▌         | 320/5951 [00:00<00:28, 199.93it/s, NLL=3.23][A
  6%|▋         | 384/5951 [00:00<00:27, 199.93it/s, NLL=3.21][A
  8%|▊         | 448/5951 [00:01<00:21, 258.72it/s, NLL=3.21][A
  8%|▊         | 448/5951 [00:01<00:21, 258.72it/s, NLL=3.16][A
  9%|▊         | 512/5951 [00:01<00:21, 258.72it/s, NLL=3.1] [A
 10%|▉         | 576/5951 [00:01<00:16, 322.64it/s, NLL=3.1][A
 10%|▉         | 576/5951 [00:01<00:16, 322.64it/s, NLL=3.14][A
 11%|█         | 640/5951 [00:01<00:16, 322.64it/s, NLL=3.

 90%|█████████ | 5376/5951 [00:05<00:00, 1033.81it/s, NLL=2.93][A
 91%|█████████▏| 5440/5951 [00:05<00:00, 1070.07it/s, NLL=2.93][A
 91%|█████████▏| 5440/5951 [00:05<00:00, 1070.07it/s, NLL=2.94][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1070.07it/s, NLL=2.96][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 1075.88it/s, NLL=2.96][A
 94%|█████████▎| 5568/5951 [00:06<00:00, 1075.88it/s, NLL=2.99][A
 95%|█████████▍| 5632/5951 [00:06<00:00, 1075.88it/s, NLL=2.99][A
 96%|█████████▌| 5696/5951 [00:06<00:00, 949.24it/s, NLL=2.99] [A
 96%|█████████▌| 5696/5951 [00:06<00:00, 949.24it/s, NLL=3]   [A
 97%|█████████▋| 5760/5951 [00:06<00:00, 949.24it/s, NLL=3][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 861.48it/s, NLL=3][A
 98%|█████████▊| 5824/5951 [00:06<00:00, 861.48it/s, NLL=3][A
 99%|█████████▉| 5888/5951 [00:06<00:00, 861.48it/s, NLL=3.01][A
100%|██████████| 5951/5951 [00:06<00:00, 861.48it/s, NLL=3.02][A
 41%|████      | 53120/129922 [05:46<07:54, 161.87it/s, NLL=2.83, epoch=16]

[05.01.19 19:31:18] Saved checkpoint: ./save/train/train-58/step_2001950.pth.tar
[05.01.19 19:31:18] Removed checkpoint: ./save/train/train-58/step_1751706.pth.tar
[05.01.19 19:31:18] Dev NLL: 03.02, F1: 59.50, EM: 56.38, AvNA: 66.41
[05.01.19 19:31:18] Visualizing in TensorBoard...


 79%|███████▉  | 103168/129922 [10:55<02:51, 156.40it/s, NLL=2.2, epoch=16] 

[05.01.19 19:36:27] Evaluating at step 2051998...



  0%|          | 0/5951 [00:00<?, ?it/s][A
  1%|          | 64/5951 [00:00<00:52, 111.22it/s][A
  1%|          | 64/5951 [00:00<00:52, 111.22it/s, NLL=3.66][A
  2%|▏         | 128/5951 [00:00<00:39, 147.25it/s, NLL=3.66][A
  2%|▏         | 128/5951 [00:00<00:39, 147.25it/s, NLL=3.03][A
  3%|▎         | 192/5951 [00:00<00:39, 147.25it/s, NLL=3.16][A
  4%|▍         | 256/5951 [00:00<00:28, 200.32it/s, NLL=3.16][A
  4%|▍         | 256/5951 [00:00<00:28, 200.32it/s, NLL=3.23][A
  5%|▌         | 320/5951 [00:00<00:28, 200.32it/s, NLL=3.26][A
  6%|▋         | 384/5951 [00:00<00:27, 200.32it/s, NLL=3.22][A
  8%|▊         | 448/5951 [00:00<00:20, 268.23it/s, NLL=3.22][A
  8%|▊         | 448/5951 [00:00<00:20, 268.23it/s, NLL=3.15][A
  9%|▊         | 512/5951 [00:00<00:20, 268.23it/s, NLL=3.1] [A
 10%|▉         | 576/5951 [00:01<00:15, 346.83it/s, NLL=3.1][A
 10%|▉         | 576/5951 [00:01<00:15, 346.83it/s, NLL=3.14][A
 11%|█         | 640/5951 [00:01<00:15, 346.83it/s, NLL=3.

 92%|█████████▏| 5504/5951 [00:05<00:00, 1129.78it/s, NLL=2.96][A
 92%|█████████▏| 5504/5951 [00:05<00:00, 1129.78it/s, NLL=2.97][A
 94%|█████████▎| 5568/5951 [00:05<00:00, 1129.78it/s, NLL=3.01][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1113.22it/s, NLL=3.01][A
 95%|█████████▍| 5632/5951 [00:05<00:00, 1113.22it/s, NLL=3]   [A
 96%|█████████▌| 5696/5951 [00:05<00:00, 1113.22it/s, NLL=3.01][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1095.84it/s, NLL=3.01][A
 97%|█████████▋| 5760/5951 [00:05<00:00, 1095.84it/s, NLL=3.01][A
 98%|█████████▊| 5824/5951 [00:05<00:00, 1095.84it/s, NLL=3.01][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1106.63it/s, NLL=3.01][A
 99%|█████████▉| 5888/5951 [00:05<00:00, 1106.63it/s, NLL=3.02][A
100%|██████████| 5951/5951 [00:06<00:00, 1106.63it/s, NLL=3.03][A
 79%|███████▉  | 103168/129922 [11:07<02:51, 156.40it/s, NLL=2.2, epoch=16]

[05.01.19 19:36:39] Saved checkpoint: ./save/train/train-58/step_2051998.pth.tar
[05.01.19 19:36:39] Removed checkpoint: ./save/train/train-58/step_2051998.pth.tar
[05.01.19 19:36:39] Dev NLL: 03.03, F1: 59.31, EM: 56.24, AvNA: 66.12
[05.01.19 19:36:39] Visualizing in TensorBoard...


100%|██████████| 129922/129922 [13:55<00:00, 155.53it/s, NLL=3.59, epoch=16]
  0%|          | 0/129922 [00:00<?, ?it/s]

[05.01.19 19:39:27] Starting epoch 17...


 18%|█▊        | 23296/129922 [02:26<10:09, 174.81it/s, NLL=2.19, epoch=17]

[05.01.19 19:41:54] Evaluating at step 2102048...



  0%|          | 0/5951 [00:00<?, ?it/s][A


OSError: [Errno 12] Cannot allocate memory

In [9]:
# Sanity check: build the training dataset and its DataLoader, then pull a
# single batch and show the shape of the context character-index tensor.
train_dataset = SQuAD(train_args.train_record_file, train_args.use_squad_v2)
train_loader = data.DataLoader(
    train_dataset,
    batch_size=train_args.batch_size,
    shuffle=True,
    num_workers=train_args.num_workers,
    collate_fn=collate_fn,
)

# Only the first batch is needed; an empty loader simply prints nothing.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids = first_batch
    print(cc_idxs.shape)

torch.Size([64, 353, 16])


In [5]:
# Ask the project's util module which compute devices are available.
# The recorded output below shows a (torch device, list of GPU ids) pair --
# NOTE(review): confirm the exact return contract against util.py.
util.get_available_devices()

(device(type='cuda', index=0), [0])