In [1]:
!nvidia-smi

Mon May 10 00:32:16 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
import os, pickle, time, random, logging, json, gc, warnings
from datetime import datetime
from pathlib import Path
from tqdm import tqdm

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AdamW, BertConfig, BertTokenizer
from transformers import get_linear_schedule_with_warmup
from transformers import EncoderDecoderModel
from transformers import BertModel

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate import bleu
import spacy

warnings.simplefilter("ignore")

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
bert_model = 'bert-large-uncased'
model_path = Path('./models')
data_path = Path('.')
bert_path = model_path / bert_model

In [4]:
if not (model_path / bert_model / 'config.json').is_file():
    BertModel.from_pretrained(bert_model).save_pretrained(model_path / bert_model)

In [22]:
class HP:
    def __init__(self):
        self.num_workers = 0
        self.pin_memory = False
        self.batch_size = 4
        self.weight_decay = 0.001
        self.lr = 0.05
        self.adam_lr = 0.001
        self.momentum = 0.9
        self.decoder_hidden_size = 512
        self.decoder_input_size = 512
        self.attention_hidden_size = 512
        self.max_input_length = 512
        self.n_layers = 1
        self.clip = 1
        self.dropout = 1
        self.n_epochs = 4
        self.max_epochs = 32
        self.patience = 3
        self.mb = 32
        self.checkpoint = None
        self.encoder_trained = False
        
        conf_file = json.load(open((bert_path / 'config.json'), 'r'))
        self.bert_hidden_size = conf_file['hidden_size']
        self.bert_vocab_size = conf_file['vocab_size']
        
        
hp = HP()

In [6]:
class Attention(nn.Module):
    def __init__(self, n_h_enc, n_h_dec, n_h_attention):
        super().__init__()
        self.attn = nn.Linear(n_h_enc + n_h_dec, n_h_attention)
        self.v = nn.Parameter(torch.rand(n_h_attention), requires_grad=True)
        
    def forward(self, key, queries):
        batch_size, src_len = queries.shape[:2]
        
        key = key.unsqueeze(1).repeat(1, src_len, 1)
        if key.size(0) != queries.size(0) or key.size(1) != queries.size(1):
            print(key.shape, queries.shape)
        kitten = torch.cat((key, queries), dim=2)
        energy = torch.tanh(self.attn(kitten))
        
        v = self.v.repeat(batch_size, 1).unsqueeze(2)
        
        attn = torch.bmm(energy, v).squeeze(2)
        
        return F.softmax(attn, dim=1)

class Decoder(nn.Module):
    def __init__(self, out_dims, emb_dims, n_h_enc, n_h_dec, n_layers, dropout, attn):
        super().__init__()
        self.out_dims = out_dims
        self.emb_dims = emb_dims
        self.n_h_enc = n_h_enc
        self.n_h_dec = n_h_dec
        self.n_layers = n_layers
        self.attn = attn
        
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(out_dims, emb_dims)
        
        self.rnn = nn.GRU(emb_dims, n_h_dec, batch_first=True, num_layers=n_layers, dropout=dropout)

        self.out_dense = nn.Linear(n_h_enc + n_h_dec, out_dims)
        
    def forward(self, src, queries, hidden):
        src = src.unsqueeze(1)
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        
        out, hidden = self.rnn(embedded, hidden)
        out = out.squeeze()
        
        a = self.attn(out, queries)
        a = a.unsqueeze(1)
        weighted = torch.bmm(a, queries)
        
        weighted = weighted.squeeze(1)
        
        out = self.out_dense(torch.cat([out, weighted], dim=1))
        
        return out, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, encoder_trained):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.encoder_trained = encoder_trained
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        input_ids, token_ids, attn_mask = src
        
        if self.encoder_trained:
            bert_hs = self.encoder(input_ids, token_type_ids=token_ids, attention_mask=attn_mask)
        else:
            with torch.no_grad():
                bert_hs = self.encoder(input_ids, token_type_ids=token_ids, attention_mask=attn_mask)
        
        bert_encodings = bert_hs[0]
        
        batch_size, max_len = trg.shape[:2]
        
        trg_vocab_size = self.decoder.out_dims
        
        outputs = torch.zeros(batch_size, max_len, trg_vocab_size).to(device)
        
        out = trg[:, 0]
        
        hidden = torch.zeros(self.decoder.n_layers, out.shape[0], self.decoder.n_h_dec).to(device)
        
        for t in range(1, max_len):
            out, hidden = self.decoder(out, bert_encodings, hidden)
            outputs[:, t] = out
            teacher_force = random.random() < teacher_forcing_ratio
            out = (trg[:, t] if teacher_force else out.max(1)[1])
        
        return outputs

In [7]:
# redo below
def enable_reproducibility(seed=69):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

def no_grad(model):
    for p in model.parameters():
        p.requires_grad = False
    return model

def no_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def no_parameters(model):
    return sum(p.numel() for p in model.parameters())

def model_size(model):
    return sum(p.element_size() * p.nelement() for p in model.parameters())

def save_checkpoint(name, epoch, model, optimizer, valid_loss, train_loss):
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'valid_loss': valid_loss,
            'train_loss': train_loss,
            }, name)

def load_checkpoint(filename):
    checkpoint = torch.load(filename)
    return checkpoint['epoch'], checkpoint['model_state_dict'],\
           checkpoint['optimizer_state_dict'], checkpoint['valid_loss'], checkpoint['train_loss']

def epoch_time(start_time, end_time):
    elapsed_secs = end_time - start_time
    elapsed_mins = elapsed_secs / 60
    return elapsed_mins, elapsed_secs

In [8]:
pw_criterion = nn.CrossEntropyLoss(ignore_index=0)

def train(model, dataloader, optimizer, criterion, clip):
    log = logging.getLogger(__name__)
    model.train()

    epoch_loss = 0

    start = time.time()
    for i, (input_, output_) in tqdm(enumerate(dataloader), total=len(dataloader)):

        input_data, input_length = input_
        output_data, output_length = output_

        optimizer.zero_grad()
        
        prediction = model([x.to(device) for x in input_data],  output_data.to(device))

        trg_sent_len = prediction.size(1)

        prediction = prediction[:, 1:].contiguous().view(-1, prediction.shape[-1])
        output_data = output_data[:, 1:].contiguous().view(-1)  # Find a way to avoid calling contiguous

        with torch.no_grad():
            pw_loss = pw_criterion(prediction,  output_data.to(device))

        loss = criterion(prediction,  output_data.to(device))

        # reshape to [trg sent len - 1, batch size]
        loss = loss.view(-1, trg_sent_len - 1)
        loss = loss.sum(1)
        loss = loss.mean(0)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.decoder.parameters(), clip)
        optimizer.step()

        if i % int(len(dataloader) * 0.1) == int(len(dataloader) * 0.1) - 1:
            print('Batch {}: Sentence Loss: {:.3f}, Word Loss: {:.3f}'
                  .format(i, loss.item(), pw_loss.item()))
            start = time.time()

        epoch_loss += pw_loss.item()

    return epoch_loss / len(dataloader)

In [37]:
tokenizer = BertTokenizer.from_pretrained(bert_model)

def bleu_score(prediction, ground_truth):
    prediction = prediction.max(2)[1]
    acc_bleu = 0

    for x, y in zip(ground_truth, prediction):
        x = tokenizer.convert_ids_to_tokens(x.tolist())
        y = tokenizer.convert_ids_to_tokens(y.tolist())
        idx1 = x.index('[PAD]') if '[PAD]' in x else len(x)
        idx2 = y.index('[SEP]') if '[SEP]' in y else len(y)

        acc_bleu += bleu([x[1:idx1 - 1]], y[1:idx2 - 1], smoothing_function=SmoothingFunction().method4)
    return acc_bleu / prediction.size(0)

def eval(model, dataloader, criterion):
    log = logging.getLogger(__name__)
    model.eval()

    epoch_loss = 0
    epoch_bleu = 0

    with torch.no_grad():

        for i, (input_, output_) in tqdm(enumerate(dataloader), total=len(dataloader)):

            input_data, input_length = input_
            output_data, output_length = output_

            prediction = model([x.to(device) for x in input_data], output_data.to(device), 0)  # turn off teacher forcing

            sample_t = tokenizer.convert_ids_to_tokens(output_data[0].tolist())
            sample_p = tokenizer.convert_ids_to_tokens(prediction[0].max(1)[1].tolist())
            idx1 = sample_t.index('[PAD]') if '[PAD]' in sample_t else len(sample_t)
            idx2 = sample_p.index('[SEP]') if '[SEP]' in sample_p else len(sample_p)

            bleu_i = bleu_score(prediction, output_data.to(device))

            trg_sent_len = prediction.size(1)
            # trg = [trg sent len, batch size]
            # output = [trg sent len, batch size, output dim]

            prediction = prediction[:, 1:].contiguous().view(-1, prediction.shape[-1])
            output_data = output_data[:, 1:].contiguous().view(-1)  # Find a way to avoid calling contiguous

            # trg = [(trg sent len - 1) * batch size]
            # output = [(trg sent len - 1) * batch size, output dim]

            pw_loss = pw_criterion(prediction, output_data.to(device))

            loss = criterion(prediction, output_data.to(device))
            loss = loss.view(-1, trg_sent_len - 1)
            loss = loss.sum(1)
            loss = loss.mean(0)

            if i % int(len(dataloader) * 0.1) == int(len(dataloader) * 0.1) - 1:
                print('Batch {}: Sentence Loss: {:.3f}, Word Loss: {:.3f}, BLEU score {:.4f}'.format(i, loss.item(), pw_loss.item(), bleu_i))

            epoch_loss += pw_loss.item()
            epoch_bleu += bleu_i

        return epoch_loss / len(dataloader), epoch_bleu / len(dataloader)

In [10]:
class BertDataset(Dataset):
    def __init__(self, json_path, bert_model):
        all_data = json.load(open(json_path, 'r'))
        input, output = self._extract_data(all_data)
        self.data = self._tokenize_data(input, output, bert_model)

    def _extract_data(self, all_data):
        input, output = [], []
        for data in all_data:
            input.append((data['passages'], data['responses']))
            output.append(data['clues'])
        input, output = input[:-1], output[:-1]
        return input, output

    def _tokenize_data(self, input, output, bert_model):
        tokenizer = BertTokenizer.from_pretrained(bert_model)
        
        data = tokenizer.batch_encode_plus(input, pad_to_max_length=True, return_tensors='pt')
        out_dict = tokenizer.batch_encode_plus(output, pad_to_max_length=True, return_tensors='pt')
        
        data['output_ids'] = out_dict['input_ids']
        data['output_len'] = out_dict['attention_mask'].sum(dim=1)
        data['input_len'] = data['attention_mask'].sum(dim=1)
        
        idx = (data['input_len'] <= hp.max_input_length)
        in_m = max(data['input_len'][idx])
        out_m = max(data['output_len'][idx])
        
        data['input_ids'] = data['input_ids'][idx, :in_m]
        data['attention_mask'] = data['attention_mask'][idx, :in_m]
        data['token_type_ids'] = data['token_type_ids'][idx, :in_m]
        data['input_len'] = data['input_len'][idx]
        
        data['output_ids'] = data['output_ids'][idx, :out_m]
        data['output_len'] = data['output_len'][idx]
        
        return data
    
    def __len__(self):
        return self.data['input_ids'].shape[0]
    
    def __getitem__(self, idx):
        return (((self.data['input_ids'][idx],
                self.data['attention_mask'][idx],
                self.data['token_type_ids'][idx]),
                self.data['input_len'][idx]),
                (self.data['output_ids'][idx],
                self.data['output_len'][idx])
                )

In [11]:
train_set = BertDataset('data_train.json', bert_model)
dev_set = BertDataset('data_dev.json', bert_model)

Token indices sequence length is longer than the specified maximum sequence length for this model (1241 > 512). Running this sequence through the model will result in indexing errors


In [12]:
train_loader = DataLoader(train_set, batch_size=hp.batch_size, shuffle=True,
                          num_workers=hp.num_workers, pin_memory=hp.pin_memory
                         )
dev_loader = DataLoader(dev_set, batch_size=hp.batch_size, shuffle=True,
                        num_workers=hp.num_workers, pin_memory=hp.pin_memory
                       )

In [13]:
attn = Attention(hp.bert_hidden_size, hp.decoder_hidden_size, hp.attention_hidden_size)
decoder = Decoder(hp.bert_vocab_size, hp.decoder_input_size, hp.bert_hidden_size,
                  hp.decoder_hidden_size, hp.n_layers, hp.dropout, attn)
encoder = BertModel.from_pretrained(bert_path)

In [31]:
model = Seq2Seq(encoder, decoder, hp.encoder_trained)

In [32]:
# optimizer = optim.SGD(decoder.parameters(), weight_decay=hp.weight_decay, lr=hp.lr, momentum=hp.momentum)
optimizer = optim.Adam(decoder.parameters(), lr=hp.adam_lr, weight_decay=hp.weight_decay)
criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='none')

In [33]:
model.to(device);

In [34]:
train_losses, dev_losses = [], []

In [35]:
last_dev_loss = None
patience = hp.patience

In [36]:
model_save_dir = Path('models')
model_save_dir.is_dir()

True

In [None]:
for epoch in range(hp.max_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, hp.clip)
    dev_loss, bleu_score_i = eval(model, dev_loader, criterion)
    print('End of epoch {}. Train loss is {:.3f}, Dev loss is {:.3f}, Bleu is {}'
          .format(epoch+1, train_loss, dev_loss, bleu_score_i))
    train_losses.append(train_loss)
    dev_losses.append(dev_loss)
    if epoch > 0:
        if (dev_loss - last_dev_loss) > -0.0001:
            patience -= 1
            if patience <= 0: break
        else:
            patience = hp.patience
    last_dev_loss = dev_loss
if model_save_dir.is_dir():
    torch.save(model, 'models/best_model_big_BERT.pt')

 10%|▉         | 143/1435 [00:47<07:09,  3.01it/s]

Batch 142: Sentence Loss: 204.704, Word Loss: 7.512


 20%|█▉        | 286/1435 [01:35<06:31,  2.93it/s]

Batch 285: Sentence Loss: 193.343, Word Loss: 6.967


 30%|██▉       | 429/1435 [02:22<05:36,  2.99it/s]

Batch 428: Sentence Loss: 165.109, Word Loss: 7.957


 40%|███▉      | 572/1435 [03:10<04:48,  2.99it/s]

Batch 571: Sentence Loss: 154.835, Word Loss: 7.553


 50%|████▉     | 715/1435 [03:58<04:00,  2.99it/s]

Batch 714: Sentence Loss: 256.669, Word Loss: 7.180


 60%|█████▉    | 858/1435 [04:46<03:12,  2.99it/s]

Batch 857: Sentence Loss: 191.470, Word Loss: 7.294


 70%|██████▉   | 1001/1435 [05:34<02:25,  2.99it/s]

Batch 1000: Sentence Loss: 210.764, Word Loss: 7.527


 80%|███████▉  | 1144/1435 [06:21<01:37,  2.99it/s]

Batch 1143: Sentence Loss: 158.813, Word Loss: 7.387


 90%|████████▉ | 1287/1435 [07:09<00:49,  2.99it/s]

Batch 1286: Sentence Loss: 171.911, Word Loss: 8.186


100%|█████████▉| 1430/1435 [07:57<00:01,  2.98it/s]

Batch 1429: Sentence Loss: 169.690, Word Loss: 7.221


100%|██████████| 1435/1435 [07:59<00:00,  2.99it/s]
 10%|█         | 36/359 [00:05<00:46,  6.96it/s]

Batch 34: Sentence Loss: 120.728, Word Loss: 6.439, BLEU score 0.0024


 20%|█▉        | 71/359 [00:10<00:41,  6.96it/s]

Batch 69: Sentence Loss: 160.949, Word Loss: 7.574, BLEU score 0.0063


 30%|██▉       | 106/359 [00:15<00:36,  6.95it/s]

Batch 104: Sentence Loss: 145.000, Word Loss: 7.532, BLEU score 0.0071


 39%|███▉      | 141/359 [00:20<00:31,  6.95it/s]

Batch 139: Sentence Loss: 171.674, Word Loss: 7.228, BLEU score 0.0086


 49%|████▉     | 176/359 [00:25<00:26,  6.95it/s]

Batch 174: Sentence Loss: 243.871, Word Loss: 7.931, BLEU score 0.0070


 59%|█████▉    | 211/359 [00:30<00:21,  6.95it/s]

Batch 209: Sentence Loss: 117.441, Word Loss: 7.577, BLEU score 0.0061


 69%|██████▊   | 246/359 [00:35<00:16,  6.93it/s]

Batch 244: Sentence Loss: 145.149, Word Loss: 7.953, BLEU score 0.0077


 78%|███████▊  | 281/359 [00:40<00:11,  6.92it/s]

Batch 279: Sentence Loss: 154.702, Word Loss: 7.735, BLEU score 0.0024


 88%|████████▊ | 316/359 [00:45<00:06,  6.92it/s]

Batch 314: Sentence Loss: 136.146, Word Loss: 7.073, BLEU score 0.0036


 98%|█████████▊| 351/359 [00:50<00:01,  6.92it/s]

Batch 349: Sentence Loss: 213.037, Word Loss: 7.818, BLEU score 0.0038


100%|██████████| 359/359 [00:51<00:00,  6.95it/s]
  0%|          | 0/1435 [00:00<?, ?it/s]

End of epoch 1. Train loss is 7.380, Dev loss is 7.505, Bleu is 0.006095086225179486


 10%|▉         | 143/1435 [00:47<07:13,  2.98it/s]

Batch 142: Sentence Loss: 126.348, Word Loss: 7.897


 20%|█▉        | 286/1435 [01:35<06:24,  2.99it/s]

Batch 285: Sentence Loss: 137.158, Word Loss: 6.858


 30%|██▉       | 429/1435 [02:23<05:37,  2.98it/s]

Batch 428: Sentence Loss: 232.698, Word Loss: 7.387


 40%|███▉      | 572/1435 [03:11<04:49,  2.98it/s]

Batch 571: Sentence Loss: 151.279, Word Loss: 7.471


 50%|████▉     | 715/1435 [03:59<04:01,  2.98it/s]

Batch 714: Sentence Loss: 140.316, Word Loss: 6.762


 60%|█████▉    | 858/1435 [04:47<03:13,  2.98it/s]

Batch 857: Sentence Loss: 132.472, Word Loss: 7.570


 70%|██████▉   | 1001/1435 [05:35<02:25,  2.98it/s]

Batch 1000: Sentence Loss: 155.512, Word Loss: 8.079


 80%|███████▉  | 1144/1435 [06:23<01:37,  2.98it/s]

Batch 1143: Sentence Loss: 168.752, Word Loss: 7.181


 90%|████████▉ | 1287/1435 [07:11<00:49,  2.96it/s]

Batch 1286: Sentence Loss: 160.677, Word Loss: 7.561


100%|█████████▉| 1430/1435 [08:00<00:01,  2.98it/s]

Batch 1429: Sentence Loss: 133.173, Word Loss: 7.297


100%|██████████| 1435/1435 [08:01<00:00,  2.98it/s]
 10%|█         | 36/359 [00:05<00:46,  6.94it/s]

Batch 34: Sentence Loss: 130.273, Word Loss: 7.895, BLEU score 0.0052


 20%|█▉        | 71/359 [00:10<00:41,  6.94it/s]

Batch 69: Sentence Loss: 127.498, Word Loss: 6.892, BLEU score 0.0111


 30%|██▉       | 106/359 [00:15<00:36,  6.93it/s]

Batch 104: Sentence Loss: 184.049, Word Loss: 7.590, BLEU score 0.0072


 39%|███▉      | 141/359 [00:20<00:31,  6.94it/s]

Batch 139: Sentence Loss: 177.202, Word Loss: 8.055, BLEU score 0.0067


 49%|████▉     | 176/359 [00:25<00:26,  6.93it/s]

Batch 174: Sentence Loss: 148.445, Word Loss: 7.917, BLEU score 0.0084


 59%|█████▉    | 211/359 [00:30<00:21,  6.93it/s]

Batch 209: Sentence Loss: 223.312, Word Loss: 7.204, BLEU score 0.0105


 69%|██████▊   | 246/359 [00:35<00:16,  6.93it/s]

Batch 244: Sentence Loss: 200.751, Word Loss: 8.111, BLEU score 0.0076


 78%|███████▊  | 281/359 [00:40<00:11,  6.93it/s]

Batch 279: Sentence Loss: 125.791, Word Loss: 6.800, BLEU score 0.0197


 88%|████████▊ | 316/359 [00:45<00:06,  6.92it/s]

Batch 314: Sentence Loss: 145.882, Word Loss: 7.204, BLEU score 0.0065


 98%|█████████▊| 351/359 [00:50<00:01,  6.92it/s]

Batch 349: Sentence Loss: 157.135, Word Loss: 7.956, BLEU score 0.0060


100%|██████████| 359/359 [00:51<00:00,  6.94it/s]
  0%|          | 0/1435 [00:00<?, ?it/s]

End of epoch 2. Train loss is 7.357, Dev loss is 7.520, Bleu is 0.008543953052720306


 10%|▉         | 143/1435 [00:47<07:13,  2.98it/s]

Batch 142: Sentence Loss: 132.256, Word Loss: 7.348


 20%|█▉        | 286/1435 [01:35<06:25,  2.98it/s]

Batch 285: Sentence Loss: 228.997, Word Loss: 7.213


 30%|██▉       | 429/1435 [02:24<05:37,  2.98it/s]

Batch 428: Sentence Loss: 123.207, Word Loss: 7.040


 40%|███▉      | 572/1435 [03:12<04:49,  2.98it/s]

Batch 571: Sentence Loss: 159.389, Word Loss: 7.590


 50%|████▉     | 715/1435 [04:00<04:02,  2.97it/s]

Batch 714: Sentence Loss: 153.865, Word Loss: 7.327


 60%|█████▉    | 858/1435 [04:48<03:13,  2.98it/s]

Batch 857: Sentence Loss: 128.234, Word Loss: 7.543


 70%|██████▉   | 1001/1435 [05:36<02:26,  2.97it/s]

Batch 1000: Sentence Loss: 120.628, Word Loss: 6.993


 80%|███████▉  | 1144/1435 [06:24<01:37,  2.98it/s]

Batch 1143: Sentence Loss: 175.365, Word Loss: 7.015


 90%|████████▉ | 1287/1435 [07:12<00:49,  2.97it/s]

Batch 1286: Sentence Loss: 195.910, Word Loss: 7.324


100%|█████████▉| 1430/1435 [08:00<00:01,  2.98it/s]

Batch 1429: Sentence Loss: 123.852, Word Loss: 7.506


100%|██████████| 1435/1435 [08:01<00:00,  2.98it/s]
 10%|█         | 36/359 [00:05<00:46,  6.93it/s]

Batch 34: Sentence Loss: 181.115, Word Loss: 8.050, BLEU score 0.0025


 20%|█▉        | 71/359 [00:10<00:41,  6.94it/s]

Batch 69: Sentence Loss: 245.232, Word Loss: 7.663, BLEU score 0.0039


 30%|██▉       | 106/359 [00:15<00:36,  6.94it/s]

Batch 104: Sentence Loss: 161.189, Word Loss: 7.008, BLEU score 0.0129


 39%|███▉      | 141/359 [00:20<00:31,  6.94it/s]

Batch 139: Sentence Loss: 174.845, Word Loss: 7.686, BLEU score 0.0026


 49%|████▉     | 176/359 [00:25<00:26,  6.93it/s]

Batch 174: Sentence Loss: 185.673, Word Loss: 7.281, BLEU score 0.0093


 59%|█████▉    | 211/359 [00:30<00:21,  6.92it/s]

Batch 209: Sentence Loss: 157.934, Word Loss: 7.098, BLEU score 0.0043


 69%|██████▊   | 246/359 [00:35<00:16,  6.92it/s]

Batch 244: Sentence Loss: 160.977, Word Loss: 6.778, BLEU score 0.0078


 78%|███████▊  | 281/359 [00:40<00:11,  6.92it/s]

Batch 279: Sentence Loss: 156.771, Word Loss: 7.647, BLEU score 0.0039


 88%|████████▊ | 316/359 [00:45<00:06,  6.93it/s]

Batch 314: Sentence Loss: 167.257, Word Loss: 7.272, BLEU score 0.0025


 98%|█████████▊| 351/359 [00:50<00:01,  6.93it/s]

Batch 349: Sentence Loss: 161.093, Word Loss: 7.671, BLEU score 0.0121


100%|██████████| 359/359 [00:51<00:00,  6.94it/s]
  0%|          | 0/1435 [00:00<?, ?it/s]

End of epoch 3. Train loss is 7.357, Dev loss is 7.590, Bleu is 0.005622715845720784


 10%|▉         | 143/1435 [00:47<07:13,  2.98it/s]

Batch 142: Sentence Loss: 146.058, Word Loss: 7.490


 20%|█▉        | 286/1435 [01:35<06:25,  2.98it/s]

Batch 285: Sentence Loss: 220.818, Word Loss: 7.240


 30%|██▉       | 429/1435 [02:23<05:38,  2.97it/s]

Batch 428: Sentence Loss: 147.073, Word Loss: 7.447


 40%|███▉      | 572/1435 [03:11<04:49,  2.98it/s]

Batch 571: Sentence Loss: 157.417, Word Loss: 7.586


 50%|████▉     | 715/1435 [04:00<04:01,  2.98it/s]

Batch 714: Sentence Loss: 145.357, Word Loss: 7.091


 60%|█████▉    | 858/1435 [04:48<03:13,  2.98it/s]

Batch 857: Sentence Loss: 169.961, Word Loss: 7.009


 70%|██████▉   | 1001/1435 [05:36<02:25,  2.98it/s]

Batch 1000: Sentence Loss: 160.360, Word Loss: 7.919


 80%|███████▉  | 1144/1435 [06:23<01:37,  2.98it/s]

Batch 1143: Sentence Loss: 130.974, Word Loss: 6.893


 90%|████████▉ | 1287/1435 [07:11<00:49,  2.98it/s]

Batch 1286: Sentence Loss: 135.492, Word Loss: 6.860


100%|█████████▉| 1430/1435 [07:59<00:01,  2.98it/s]

Batch 1429: Sentence Loss: 203.683, Word Loss: 7.544


100%|██████████| 1435/1435 [08:01<00:00,  2.98it/s]
 10%|█         | 36/359 [00:05<00:46,  6.92it/s]

Batch 34: Sentence Loss: 156.152, Word Loss: 7.711, BLEU score 0.0100


 20%|█▉        | 71/359 [00:10<00:41,  6.91it/s]

Batch 69: Sentence Loss: 139.993, Word Loss: 8.116, BLEU score 0.0136


 30%|██▉       | 106/359 [00:15<00:36,  6.93it/s]

Batch 104: Sentence Loss: 162.638, Word Loss: 8.032, BLEU score 0.0000


 39%|███▉      | 141/359 [00:20<00:31,  6.92it/s]

Batch 139: Sentence Loss: 170.849, Word Loss: 7.855, BLEU score 0.0098


 49%|████▉     | 176/359 [00:25<00:26,  6.91it/s]

Batch 174: Sentence Loss: 160.325, Word Loss: 7.457, BLEU score 0.0070


 59%|█████▉    | 211/359 [00:30<00:21,  6.92it/s]

Batch 209: Sentence Loss: 159.578, Word Loss: 7.979, BLEU score 0.0061


 69%|██████▊   | 246/359 [00:35<00:16,  6.92it/s]

Batch 244: Sentence Loss: 201.934, Word Loss: 7.693, BLEU score 0.0073


 78%|███████▊  | 281/359 [00:40<00:11,  6.92it/s]

Batch 279: Sentence Loss: 192.170, Word Loss: 7.391, BLEU score 0.0138


 88%|████████▊ | 316/359 [00:45<00:06,  6.91it/s]

Batch 314: Sentence Loss: 145.631, Word Loss: 7.104, BLEU score 0.0034


 98%|█████████▊| 351/359 [00:50<00:01,  6.91it/s]

Batch 349: Sentence Loss: 196.691, Word Loss: 8.195, BLEU score 0.0049


100%|██████████| 359/359 [00:51<00:00,  6.92it/s]
  0%|          | 0/1435 [00:00<?, ?it/s]

End of epoch 4. Train loss is 7.348, Dev loss is 7.500, Bleu is 0.008447350934053086


 10%|▉         | 143/1435 [00:48<07:13,  2.98it/s]

Batch 142: Sentence Loss: 143.726, Word Loss: 7.011


 20%|█▉        | 286/1435 [01:36<06:25,  2.98it/s]

Batch 285: Sentence Loss: 164.940, Word Loss: 7.583


 30%|██▉       | 429/1435 [02:24<05:37,  2.98it/s]

Batch 428: Sentence Loss: 166.232, Word Loss: 7.227


 40%|███▉      | 572/1435 [03:12<04:49,  2.98it/s]

Batch 571: Sentence Loss: 155.854, Word Loss: 7.249


 50%|████▉     | 715/1435 [04:00<04:01,  2.98it/s]

Batch 714: Sentence Loss: 201.226, Word Loss: 7.187


 60%|█████▉    | 858/1435 [04:48<03:13,  2.98it/s]

Batch 857: Sentence Loss: 155.164, Word Loss: 7.217


 70%|██████▉   | 1001/1435 [05:36<02:25,  2.98it/s]

Batch 1000: Sentence Loss: 134.972, Word Loss: 7.012


 80%|███████▉  | 1144/1435 [06:24<01:37,  2.98it/s]

Batch 1143: Sentence Loss: 141.668, Word Loss: 7.556


 90%|████████▉ | 1287/1435 [07:12<00:49,  2.98it/s]

Batch 1286: Sentence Loss: 170.351, Word Loss: 7.173


100%|█████████▉| 1430/1435 [08:00<00:01,  2.98it/s]

Batch 1429: Sentence Loss: 163.898, Word Loss: 7.713


100%|██████████| 1435/1435 [08:01<00:00,  2.98it/s]
 10%|█         | 36/359 [00:05<00:46,  6.93it/s]

Batch 34: Sentence Loss: 125.533, Word Loss: 7.173, BLEU score 0.0059


 20%|█▉        | 71/359 [00:10<00:41,  6.95it/s]

Batch 69: Sentence Loss: 153.016, Word Loss: 7.556, BLEU score 0.0126


 30%|██▉       | 106/359 [00:15<00:36,  6.95it/s]

Batch 104: Sentence Loss: 134.976, Word Loss: 7.104, BLEU score 0.0107


 39%|███▉      | 141/359 [00:20<00:31,  6.94it/s]

Batch 139: Sentence Loss: 124.524, Word Loss: 7.325, BLEU score 0.0094


 49%|████▉     | 176/359 [00:25<00:26,  6.94it/s]

Batch 174: Sentence Loss: 164.187, Word Loss: 7.139, BLEU score 0.0111


 59%|█████▉    | 211/359 [00:30<00:21,  6.91it/s]

Batch 209: Sentence Loss: 196.582, Word Loss: 7.489, BLEU score 0.0054


 69%|██████▊   | 246/359 [00:35<00:16,  6.89it/s]

Batch 244: Sentence Loss: 167.783, Word Loss: 7.714, BLEU score 0.0079


 78%|███████▊  | 281/359 [00:40<00:11,  6.89it/s]

Batch 279: Sentence Loss: 136.641, Word Loss: 7.386, BLEU score 0.0039


 88%|████████▊ | 316/359 [00:45<00:06,  6.87it/s]

Batch 314: Sentence Loss: 198.413, Word Loss: 6.962, BLEU score 0.0099


 98%|█████████▊| 351/359 [00:50<00:01,  6.88it/s]

Batch 349: Sentence Loss: 139.297, Word Loss: 7.053, BLEU score 0.0016


100%|██████████| 359/359 [00:51<00:00,  6.92it/s]
  0%|          | 0/1435 [00:00<?, ?it/s]

End of epoch 5. Train loss is 7.352, Dev loss is 7.471, Bleu is 0.008515570209957652


 10%|▉         | 143/1435 [00:48<07:14,  2.97it/s]

Batch 142: Sentence Loss: 180.827, Word Loss: 7.233


 20%|█▉        | 286/1435 [01:36<06:26,  2.97it/s]

Batch 285: Sentence Loss: 172.206, Word Loss: 7.569


 30%|██▉       | 429/1435 [02:24<05:37,  2.98it/s]

Batch 428: Sentence Loss: 146.246, Word Loss: 7.134


 40%|███▉      | 572/1435 [03:12<04:50,  2.97it/s]

Batch 571: Sentence Loss: 147.179, Word Loss: 7.179


 50%|████▉     | 715/1435 [04:00<04:02,  2.97it/s]

Batch 714: Sentence Loss: 228.724, Word Loss: 7.261


 60%|█████▉    | 858/1435 [04:48<03:13,  2.99it/s]

Batch 857: Sentence Loss: 139.217, Word Loss: 7.525


 70%|██████▉   | 1001/1435 [05:36<02:25,  2.99it/s]

Batch 1000: Sentence Loss: 151.824, Word Loss: 8.319


 80%|███████▉  | 1144/1435 [06:24<01:37,  2.98it/s]

Batch 1143: Sentence Loss: 151.431, Word Loss: 7.667


 83%|████████▎ | 1186/1435 [06:38<01:23,  2.98it/s]