In [1]:
import pandas as pd
from datasets import load_dataset
import re
import unicodedata

import sentencepiece as spm

import math

import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm
from torch.optim import Adam
import torch.nn.functional as F
from torch.optim.lr_scheduler import LambdaLR

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os

In [2]:
df = pd.read_csv("jpn.txt", sep="\t",names=["English","Japanese"], index_col=2).drop_duplicates().reset_index(drop=True)

In [3]:
ja_en = df[["Japanese", "English"]].rename(columns={"Japanese":"japanese", "English":"english"})
ja_en = ja_en.sample(frac=1, random_state=7).reset_index(drop=True)

In [4]:
ja_en.head()

Unnamed: 0,japanese,english
0,馬は立ったまま寝るって本当ですか？,Is it true horses sleep while standing up?
1,リストからトムの名前を消しなさい。,Delete Tom's name from the list.
2,水割りにしてください。,I'd like whiskey and water.
3,それは表現しにくいですね。,That's hard to say.
4,どっちもいいね。,They are both good.


In [5]:
ja_en.tail()

Unnamed: 0,japanese,english
115486,今朝から雨が降ったりやんだりしている。,It's been raining on and off since this morning.
115487,紙がほしい。,I need some paper.
115488,私が彼女のこと毛嫌いしてるって、彼女は分かってないのよ。,She doesn't know I hate her.
115489,これは難しい状況ですね。,This is a difficult situation.
115490,一日中、雨は止むことなく降り続いた。,It rained continuously all day.


In [6]:
ja_en.shape

(115491, 2)

In [7]:
train_dataset = ja_en[:68501]
val_dataset = ja_en[68501:68702]

In [8]:
train_dataset.tail()

Unnamed: 0,japanese,english
68496,あのレースは八百長だった。,That race was fixed.
68497,歯磨きはどこ？,Where's the toothpaste?
68498,彼は銀行で働いています。,He works at a bank.
68499,トムはメアリーの肩にもたれた。,Tom leaned on Mary's shoulder.
68500,どうぞお立ち寄りください。,Please drop in on us.


In [9]:
val_dataset.head()

Unnamed: 0,japanese,english
68501,宿題は自分でやったの？,Did you do your homework by yourself?
68502,痛い目に遭わせるぞ。,Don't make me hurt you.
68503,落ちつけよ。,Please relax.
68504,今、トムは３歳よ。,Tom is now three years old.
68505,フランス語は便利よ。,French is useful.


In [10]:
def preprocess_the_ja_text(text):
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r'(https?://\S+)', r'', text)
    text = re.sub(r'(quote=\w+;?)', r'', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [11]:
def preprocess_the_en_text(text):
    text = unicodedata.normalize("NFKC", text)
    text = text.lower()
    text = re.sub(r'(https?://\S+)', r'', text)
    text = re.sub(r'(quote=\w+;?)', r'', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [15]:
ja_en["japanese"].to_csv("ja.txt", index=False, header=False)
ja_en["english"].to_csv("en.txt", index=False, header=False)

In [16]:
def clean_and_save(input_path, output_path, preprocessing_fn):
    with open(input_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        
    cleaned_lines = [preprocessing_fn(line) for line in lines if line.strip()]
    
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(cleaned_lines))
        
    print(f"Saved cleaned file → {output_path} ({len(cleaned_lines)} lines)")

In [17]:
clean_and_save("./ja.txt", "clean_ja.txt", preprocess_the_ja_text)
clean_and_save("./en.txt", "clean_en.txt", preprocess_the_en_text)

Saved cleaned file → clean_ja.txt (115491 lines)
Saved cleaned file → clean_en.txt (115492 lines)


In [18]:
spm.SentencePieceTrainer.train(input="clean_ja.txt",
                              model_prefix="ja_bpe",
                              vocab_size=8000,
                              model_type="bpe",
                              bos_id=1,
                              eos_id=2,
                              pad_id=0,
                              unk_id=3)

In [19]:
spm.SentencePieceTrainer.train(input="clean_en.txt",
                              model_prefix="en_bpe",
                              vocab_size=8000,
                              model_type="bpe",
                              bos_id=1,
                              eos_id=2,
                              pad_id=0,
                              unk_id=3)

In [12]:
ja_tokenizer = spm.SentencePieceProcessor()
ja_tokenizer.load("ja_bpe.model")

en_tokenizer = spm.SentencePieceProcessor()
en_tokenizer.load("en_bpe.model")

True

In [13]:
pad_token_id = ja_tokenizer.pad_id()

In [14]:
class TranslationDataset(Dataset):
    def __init__(self, dataset, ja_tokenizer, en_tokenizer, max_len=80):
        self.ja_samples = [preprocess_the_ja_text(s) for s in dataset["japanese"]]
        self.en_samples = [preprocess_the_en_text(s) for s in dataset["english"]]
        self.ja_tokenizer = ja_tokenizer
        self.en_tokenizer = en_tokenizer
        
        self.max_len = max_len

        self.ja_ids = [self.encode_text(ja, self.ja_tokenizer) for ja in self.ja_samples]
        self.en_ids = [self.encode_text(en, self.en_tokenizer) for en in self.en_samples]
        
        self.ja_ids = torch.stack(self.ja_ids)
        self.en_ids = torch.stack(self.en_ids)


    def encode_text(self, text, tokenizer):
        ids = tokenizer.encode(text)
        ids = [tokenizer.bos_id()] + ids + [tokenizer.eos_id()]
        if len(ids) < self.max_len:
            ids = ids + [tokenizer.pad_id()] * (self.max_len - len(ids))
        else:
            ids = ids[:self.max_len]
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        return self.ja_ids[idx], self.en_ids[idx]

    def __len__(self):
        return len(self.ja_ids)

    def ja_vocab_size(self):
        return len(self.ja_tokenizer)

    def en_vocab_size(self):
        return len(self.en_tokenizer)

In [15]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model=512, max_seq_len=80, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  
        
        self.register_buffer("pe", pe)
            
    def forward(self, x):
        seq_len = x.size(1)
        x = x * math.sqrt(self.d_model)
        x = x + self.pe[:, :seq_len, :].to(x.device)   
        return self.dropout(x)

In [16]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model=512, n_heads=8, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = int(d_model / n_heads)

        assert self.d_model % self.n_heads == 0, "d_model has to be divisible by n_heads."
        
        self.w_q = nn.Linear(self.d_model, self.d_model)
        self.w_k = nn.Linear(self.d_model, self.d_model)
        self.w_v = nn.Linear(self.d_model, self.d_model)
        self.out = nn.Linear(self.d_model, self.d_model)

        self.dropout = nn.Dropout(dropout)
        
    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)
        q_len = q.size(1)
        k_len = k.size(1)

        q = self.w_q(q)
        k = self.w_k(k)
        v = self.w_v(v)

        q = q.view(batch_size, q_len, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        k = k.view(batch_size, k_len, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        v = v.view(batch_size, k_len, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        attention_weights = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.head_dim)
        
        if mask is not None:
            attention_weights = attention_weights.masked_fill(mask == 0, float('-1e20'))
            
        attention_scores = F.softmax(attention_weights,dim=-1)
        attention_scores = self.dropout(attention_scores)
        context = torch.matmul(attention_scores, v)
        concat = context.permute(0, 2, 1, 3).contiguous().view(batch_size, q_len, self.n_heads*self.head_dim)

        x = self.out(concat)
        return x

In [17]:
class EncoderBlock(nn.Module):
    def __init__(self, d_model=512, n_heads=8, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.feedforward = nn.Sequential(
            nn.Linear(d_model, d_model*4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model*4, d_model)
        )
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)
        
    def forward(self, x, mask=None):
        attention_out = self.attention(x, x, x, mask)
        attention_residual_out = self.dropout1(attention_out) + x
        norm1_out = self.norm1(attention_residual_out)
        feedfwd_out = self.feedforward(norm1_out)
        feedfwd_residual_out = self.dropout2(feedfwd_out) + norm1_out
        output = self.norm2(feedfwd_residual_out)
        
        return output

In [18]:
class Encoder(nn.Module):
    def __init__(self, d_model=512, n_heads=8, dropout=0.1, n_layers=6):
        super().__init__()
        self.n_layers = n_layers
        self.layers = nn.ModuleList([
            EncoderBlock(d_model, n_heads, dropout=dropout)
            for i in range(n_layers)
        ])
        
    def forward(self, x, ja_mask=None):
        for layer in self.layers:
            x = layer(x, ja_mask)
        return x

In [19]:
class DecoderBlock(nn.Module):
    def __init__(self, d_model=512, n_heads=8, dropout=0.1, mask=None):
        super().__init__()
        self.masked_attention = MultiHeadAttention(d_model, n_heads)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)
        
        self.attention = MultiHeadAttention(d_model, n_heads)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)
        
        self.feedforward = nn.Sequential(
            nn.Linear(d_model, d_model*4), 
            nn.ReLU(),
            nn.Linear(d_model*4, d_model)
        )
        self.dropout3 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(d_model)
        
    def forward(self, x, enc_out, ja_mask=None, en_mask=None):
        masked_attention_out = self.masked_attention(x, x, x, mask=en_mask)
        masked_attention_residual_out = self.dropout1(masked_attention_out) + x
        norm1_out = self.norm1(masked_attention_residual_out)
        
        attention_out = self.attention(norm1_out, enc_out, enc_out, mask=ja_mask)
        attention_residual_out = self.dropout2(attention_out) + norm1_out
        norm2_out = self.norm2(attention_residual_out)
        
        feedfwd_out = self.feedforward(norm2_out)
        feedfwd_residual_out = self.dropout3(feedfwd_out) + norm2_out
        output = self.norm3(feedfwd_residual_out)
        
        return output

In [20]:
class Decoder(nn.Module):
    def __init__(self, d_model=512, n_heads=8, dropout=0.1, n_layers=6):
        super().__init__()
        self.n_layers = n_layers
        self.layers = nn.ModuleList([
            DecoderBlock(d_model, n_heads, dropout=dropout)
            for i in range(n_layers)
        ])
    def forward(self, x, enc_out, en_mask=None, ja_mask=None):
        for layer in self.layers:
            x = layer(x, enc_out, en_mask=en_mask, ja_mask=ja_mask)
        return x

In [21]:
class Transformer(nn.Module):
    def __init__(self, ja_vocab_size, en_vocab_size, d_model=512, max_seq_len=80, n_heads=8, dropout=0.1, n_layers=6):
        super().__init__()
        self.d_model = d_model
        self.ja_embedding = nn.Embedding(ja_vocab_size, d_model, padding_idx=pad_token_id)
        self.en_embedding = nn.Embedding(en_vocab_size, d_model, padding_idx=pad_token_id)
        
        self.positional_encoder = PositionalEncoder(d_model, max_seq_len)

        self.dropout = nn.Dropout(dropout)
        
        self.encoder = Encoder(d_model=d_model, n_heads=n_heads, dropout=dropout, n_layers=n_layers)
        self.decoder = Decoder(d_model=d_model, n_heads=n_heads, dropout=dropout, n_layers=n_layers)
        
        self.out_proj = nn.Linear(d_model, en_vocab_size, bias=False)

        self.out_proj.weight = self.en_embedding.weight
        
    def generate_mask(self, ja_ids, dec_input, device):
        device = dec_input.device
        ja_mask = (ja_ids != 0).unsqueeze(1).unsqueeze(2)
        en_mask =  (dec_input != 0).unsqueeze(1).unsqueeze(2)
        seq_len = dec_input.size(1)
        causal_mask = torch.triu(torch.ones((seq_len, seq_len), device=device), diagonal=1).bool()
        causal_mask = ~causal_mask
        causal_mask = causal_mask.unsqueeze(0).unsqueeze(1)
        en_mask = en_mask & causal_mask
        return ja_mask, en_mask
        
    def forward(self, ja_ids, dec_input, ja_mask=None, en_mask=None):
        ja_mask, en_mask = self.generate_mask(ja_ids, dec_input, device)
        ja_embedding = self.dropout(self.positional_encoder(self.ja_embedding(ja_ids) * math.sqrt(self.d_model)))
        en_embedding = self.dropout(self.positional_encoder(self.en_embedding(dec_input) * math.sqrt(self.d_model)))
        encoder_out = self.encoder(ja_embedding, ja_mask)
        decoder_out = self.decoder(en_embedding, encoder_out, en_mask=en_mask, ja_mask=ja_mask)
        out = self.out_proj(decoder_out)
        return out

In [22]:
train_data = TranslationDataset(train_dataset, ja_tokenizer, en_tokenizer, max_len=80)
train_loader = DataLoader(train_data, batch_size=128, num_workers=0, shuffle=True, drop_last=False)

val_data = TranslationDataset(val_dataset, ja_tokenizer, en_tokenizer, max_len=80)
val_loader = DataLoader(val_data, batch_size=128, num_workers=0, shuffle=True, drop_last=False)

In [23]:
JA_VOCAB_SIZE = train_data.ja_vocab_size()
EN_VOCAB_SIZE = train_data.en_vocab_size()
D_MODEL = 512
N_HEADS = 8
N_LAYERS = 6
MAX_SEQ_LEN = 80
DROPOUT = 0.1
N_EPOCHS = 100
SAVE_EPOCH = 10
LR = 1e-4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
CHECKPOINT_DIR = "./checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

In [25]:
print(str(JA_VOCAB_SIZE) + ",", EN_VOCAB_SIZE)
print(len(train_dataset))

8000, 8000
68501


In [26]:
def save_checkpoint(epoch, model, optimizer, loss, path):
    checkpoint = {
        "epoch" : epoch,
        "model_state" : model.state_dict(),
        "optimizer_state" : optimizer.state_dict(),
        "loss" : loss
    }
    torch.save(checkpoint, path)
    print(f"Saved checkpoint: {path}")

In [27]:
def load_checkpoint(path, model, optimizer=None, device="cpu"):
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint["model_state"])
    if optimizer is not None and "optimizer_state" in checkpoint:
        optimizer.load_state_dict(checkpoint["optimizer_state"])
    print(f"Loaded checkpoint from epoch {checkpoint['epoch']} with loss {checkpoint['loss']:.4f}")
    return checkpoint["epoch"], checkpoint["loss"]

In [28]:
model = Transformer(d_model=D_MODEL,
                    ja_vocab_size=JA_VOCAB_SIZE,
                    en_vocab_size=EN_VOCAB_SIZE,
                    max_seq_len=MAX_SEQ_LEN, 
                    n_heads=N_HEADS,
                    dropout=DROPOUT,
                   n_layers=N_LAYERS).to(device)
writer = SummaryWriter()
optimizer = Adam(model.parameters(), lr=LR, betas=(0.9, 0.98), eps=1e-9)
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)

In [29]:
def greedy_decode(model, ja_ids, tokenizer, max_len=80, device="cpu"):
    model.eval()
    with torch.no_grad():
        if ja_ids.dim() == 1:
            ja_ids = src_ids.unsqueeze(0)
        ja_ids = ja_ids.to(device)

        bos_id = ja_tokenizer.bos_id()
        eos_id = ja_tokenizer.eos_id()

        dec_input = torch.tensor([[bos_id]], device=device)

        for _ in range(max_len):
            outputs = model(ja_ids, dec_input)   # [1, tgt_len, vocab_size]
            next_token = outputs[:, -1, :].argmax(-1)  # [1]
            dec_input = torch.cat([dec_input, next_token.unsqueeze(1)], dim=1)

            if next_token.item() == eos_id:
                break

    return dec_input[0].cpu().tolist()[1:-1] if dec_input[0, -1].item() == eos_id else dec_input[0].cpu().tolist()[1:]

In [30]:
def train_model(model, train_loader, val_loader, ja_tokenizer, en_tokenizer, 
                device, N_EPOCHS=100, lr=1e-4, SAVE_EPOCH=5, CHECKPOINT_DIR="./checkpoints"):
    for epoch in range(1, N_EPOCHS+1):
        model.train()
        epoch_loss = 0
        train_iterator = tqdm(train_loader, desc=f"EPOCH: {epoch}/{N_EPOCHS}")

        for batch_idx, (ja_ids, en_ids) in enumerate(train_iterator):
            ja_ids = ja_ids.to(device)
            en_ids = en_ids.to(device)

            dec_input = en_ids[:, :-1]
            target = en_ids[:, 1:].contiguous().view(-1)

            outputs = model(ja_ids, dec_input)     # [B, T, V]
            outputs = outputs.view(-1, outputs.size(-1))

            loss = criterion(outputs, target)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            epoch_loss += loss.item()
            train_iterator.set_postfix(loss=loss.item())

        avg_loss = epoch_loss / len(train_loader)
        perplexity = torch.exp(torch.tensor(avg_loss))
        print(f"Epoch {epoch} | Train Loss: {avg_loss:.4f} | PPL: {perplexity:.4f}")

        model.eval()
        val_loss = 0
        preds, refs = [], []
        with torch.no_grad():
            for ja_ids, en_ids in val_loader:
                ja_ids = ja_ids.to(device)
                en_ids = en_ids.to(device)

                dec_input = en_ids[:, :-1]
                target = en_ids[:, 1:].contiguous().view(-1)

                outputs = model(ja_ids, dec_input)
                outputs = outputs.view(-1, outputs.size(-1))

                loss = criterion(outputs, target)
                val_loss += loss.item()

                out_ids = greedy_decode(model, ja_ids[0:1], en_tokenizer, device=device)
                pred = en_tokenizer.decode(out_ids)
                ref = en_tokenizer.decode(en_ids[0].cpu().tolist())
                preds.append(pred.split())
                refs.append([ref.split()])

        avg_val_loss = val_loss / len(val_loader)
        val_ppl = torch.exp(torch.tensor(avg_val_loss))
        print(f"Epoch {epoch} | Val Loss: {avg_val_loss:.4f} | Val PPL: {val_ppl:.4f}")

        smoothing = SmoothingFunction().method1
        bleu_scores = [sentence_bleu(r, p, smoothing_function=smoothing) for p, r in zip(preds, refs)]
        bleu = sum(bleu_scores) / len(bleu_scores)
        print(f"Validation BLEU: {bleu:.4f}")

        print("\n A comparison between sample predictions and references:")
        for i in range(min(3, len(preds))):
            print(f"PRED {i+1}: {' '.join(preds[i])}")
            print(f"REF  {i+1}: {' '.join(refs[i][0])}")
            print("—" * 60)

        if epoch % SAVE_EPOCH == 0:
            save_path = os.path.join(CHECKPOINT_DIR, f"epoch_{epoch}.pth")
            torch.save({
                "epoch": epoch,
                "model_state": model.state_dict(),
                "optim_state": optimizer.state_dict(),
                "loss": avg_loss,
            }, save_path)
            print(f"Saved checkpoint: {save_path}")

In [31]:
train_model(model, train_loader, val_loader, ja_tokenizer, en_tokenizer, 
                device, N_EPOCHS=N_EPOCHS, lr=LR, SAVE_EPOCH=SAVE_EPOCH, CHECKPOINT_DIR=CHECKPOINT_DIR)

EPOCH: 1/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 16.5277 | PPL: 15061879.0000
Epoch 1 | Val Loss: 9.5867 | Val PPL: 14570.4248
Validation BLEU: 0.0000

 A comparison between sample predictions and references:
PRED 1: i don't want to be.
REF  1: there's no mistake about it.
————————————————————————————————————————————————————————————
PRED 2: i don't want to be a little.
REF  2: it seems less crowded during the week.
————————————————————————————————————————————————————————————


EPOCH: 2/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 2 | Train Loss: 8.7622 | PPL: 6387.8657
Epoch 2 | Val Loss: 6.3673 | Val PPL: 582.4880
Validation BLEU: 0.0011

 A comparison between sample predictions and references:
PRED 1: tom is a a a a a job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job job
REF  1: tom likes country music.
————————————————————————————————————————————————————————————
PRED 2: are you going to see you?
REF  2: may i come again?
————————————————————————————————————————————————————————————


EPOCH: 3/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 3 | Train Loss: 6.6779 | PPL: 794.6091
Epoch 3 | Val Loss: 5.1143 | Val PPL: 166.3812
Validation BLEU: 0.0387

 A comparison between sample predictions and references:
PRED 1: there is no one of the table.
REF  1: there is a doll in the box.
————————————————————————————————————————————————————————————
PRED 2: there is no one in 1 ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ .
REF  2: children who are sixteen years old or younger may not enter the theater.
————————————————————————————————————————————————————————————


EPOCH: 4/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 4 | Train Loss: 5.4849 | PPL: 241.0336
Epoch 4 | Val Loss: 4.4838 | Val PPL: 88.5719
Validation BLEU: 0.0512

 A comparison between sample predictions and references:
PRED 1: thank you yesterday yesterday yesterday.
REF  1: thanks for yesterday.
————————————————————————————————————————————————————————————
PRED 2: i study study night.
REF  2: i was up all night studying.
————————————————————————————————————————————————————————————


EPOCH: 5/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 5 | Train Loss: 4.7986 | PPL: 121.3433
Epoch 5 | Val Loss: 4.5425 | Val PPL: 93.9241
Validation BLEU: 0.1790

 A comparison between sample predictions and references:
PRED 1: french is a french.
REF  1: french is useful.
————————————————————————————————————————————————————————————
PRED 2: i think that's true.
REF  2: i think it's true.
————————————————————————————————————————————————————————————


EPOCH: 6/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 6 | Train Loss: 4.2924 | PPL: 73.1420
Epoch 6 | Val Loss: 3.8311 | Val PPL: 46.1128
Validation BLEU: 0.0331

 A comparison between sample predictions and references:
PRED 1: tom is a bad bad bad bad bad bad bad bad bad city.
REF  1: tom isn't good with basic arithmetic.
————————————————————————————————————————————————————————————
PRED 2: he is not not a little of his parents.
REF  2: there isn't any hope of his success.
————————————————————————————————————————————————————————————


EPOCH: 7/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 7 | Train Loss: 3.8583 | PPL: 47.3840
Epoch 7 | Val Loss: 3.6691 | Val PPL: 39.2154
Validation BLEU: 0.0546

 A comparison between sample predictions and references:
PRED 1: i think it's a here.
REF  1: i think that we'd better leave early.
————————————————————————————————————————————————————————————
PRED 2: both of them are both of them.
REF  2: would the two of you quit bickering?
————————————————————————————————————————————————————————————


EPOCH: 8/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 8 | Train Loss: 3.5628 | PPL: 35.2603
Epoch 8 | Val Loss: 3.1929 | Val PPL: 24.3595
Validation BLEU: 0.0807

 A comparison between sample predictions and references:
PRED 1: i was home.
REF  1: the house was in flames.
————————————————————————————————————————————————————————————
PRED 2: i haven't seen college.
REF  2: i haven't gone to college yet.
————————————————————————————————————————————————————————————


EPOCH: 9/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 9 | Train Loss: 3.2660 | PPL: 26.2070
Epoch 9 | Val Loss: 3.2440 | Val PPL: 25.6356
Validation BLEU: 0.0698

 A comparison between sample predictions and references:
PRED 1: i forgot i forgot to homework.
REF  1: i forgot we had homework.
————————————————————————————————————————————————————————————
PRED 2: tom is bad at eight.
REF  2: tom isn't good with basic arithmetic.
————————————————————————————————————————————————————————————


EPOCH: 10/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 10 | Train Loss: 3.0142 | PPL: 20.3730
Epoch 10 | Val Loss: 3.1770 | Val PPL: 23.9738
Validation BLEU: 0.0000

 A comparison between sample predictions and references:
PRED 1: i was 10 by train.
REF  1: the train will come in at platform ten.
————————————————————————————————————————————————————————————
PRED 2: i need more.
REF  2: try harder.
————————————————————————————————————————————————————————————
Saved checkpoint: ./checkpoints\epoch_10.pth


EPOCH: 11/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 11 | Train Loss: 2.8211 | PPL: 16.7956
Epoch 11 | Val Loss: 2.9900 | Val PPL: 19.8865
Validation BLEU: 0.5107

 A comparison between sample predictions and references:
PRED 1: why are you laughing?
REF  1: why are you laughing?
————————————————————————————————————————————————————————————
PRED 2: i'm very busy.
REF  2: i'm, as you know, a very busy person.
————————————————————————————————————————————————————————————


EPOCH: 12/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 12 | Train Loss: 2.6177 | PPL: 13.7042
Epoch 12 | Val Loss: 2.7758 | Val PPL: 16.0513
Validation BLEU: 0.0629

 A comparison between sample predictions and references:
PRED 1: i tree in laundry.
REF  1: bring the washing in.
————————————————————————————————————————————————————————————
PRED 2: this juice is very good.
REF  2: this steak is very juicy.
————————————————————————————————————————————————————————————


EPOCH: 13/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 13 | Train Loss: 2.4637 | PPL: 11.7483
Epoch 13 | Val Loss: 2.7347 | Val PPL: 15.4051
Validation BLEU: 0.0638

 A comparison between sample predictions and references:
PRED 1: i have a lot of things to do.
REF  1: i have something to give you.
————————————————————————————————————————————————————————————
PRED 2: the game won't team.
REF  2: our team won the game.
————————————————————————————————————————————————————————————


EPOCH: 14/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 14 | Train Loss: 2.3125 | PPL: 10.0992
Epoch 14 | Val Loss: 2.6977 | Val PPL: 14.8455
Validation BLEU: 0.0335

 A comparison between sample predictions and references:
PRED 1: tom took off his seat.
REF  1: tom was away from his desk.
————————————————————————————————————————————————————————————
PRED 2: i've been working here for a few years ago.
REF  2: it was just ⁇ years ago that i visited new zealand for the first time.
————————————————————————————————————————————————————————————


EPOCH: 15/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 15 | Train Loss: 2.1693 | PPL: 8.7522
Epoch 15 | Val Loss: 2.6098 | Val PPL: 13.5964
Validation BLEU: 0.0803

 A comparison between sample predictions and references:
PRED 1: have you had breakfast yet?
REF  1: have you eaten breakfast yet?
————————————————————————————————————————————————————————————
PRED 2: what's the article article article article for me?
REF  2: don't read my diary.
————————————————————————————————————————————————————————————


EPOCH: 16/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 16 | Train Loss: 2.0428 | PPL: 7.7124
Epoch 16 | Val Loss: 2.5835 | Val PPL: 13.2432
Validation BLEU: 0.0326

 A comparison between sample predictions and references:
PRED 1: she's me a liar she's me.
REF  1: she accused me of being a liar.
————————————————————————————————————————————————————————————
PRED 2: you're my family, will you?
REF  2: i am sending you a picture of my family.
————————————————————————————————————————————————————————————


EPOCH: 17/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 17 | Train Loss: 1.9346 | PPL: 6.9216
Epoch 17 | Val Loss: 2.6312 | Val PPL: 13.8898
Validation BLEU: 0.0248

 A comparison between sample predictions and references:
PRED 1: when the people do the right, we'll be able to spend the right someday.
REF  1: what you spend time doing in your childhood affects the rest of your life.
————————————————————————————————————————————————————————————
PRED 2: help me, i'd rather stay very happy.
REF  2: i'd be very grateful if you could help me.
————————————————————————————————————————————————————————————


EPOCH: 18/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 18 | Train Loss: 1.8217 | PPL: 6.1827
Epoch 18 | Val Loss: 2.5196 | Val PPL: 12.4240
Validation BLEU: 0.0942

 A comparison between sample predictions and references:
PRED 1: i had a reading novels novels.
REF  1: i was absorbed in reading a novel.
————————————————————————————————————————————————————————————
PRED 2: i thought i might know where tom went mary was.
REF  2: i thought tom might know where mary lives.
————————————————————————————————————————————————————————————


EPOCH: 19/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 19 | Train Loss: 1.7139 | PPL: 5.5504
Epoch 19 | Val Loss: 2.4691 | Val PPL: 11.8120
Validation BLEU: 0.0369

 A comparison between sample predictions and references:
PRED 1: we finally finally finally finally got my real.
REF  1: my wish has finally come true.
————————————————————————————————————————————————————————————
PRED 2: please feel free to eat yourself.
REF  2: please have some cookies.
————————————————————————————————————————————————————————————


EPOCH: 20/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 20 | Train Loss: 1.6072 | PPL: 4.9888
Epoch 20 | Val Loss: 2.4702 | Val PPL: 11.8247
Validation BLEU: 0.3380

 A comparison between sample predictions and references:
PRED 1: thanks for yesterday.
REF  1: thanks for yesterday.
————————————————————————————————————————————————————————————
PRED 2: have you already had breakfast?
REF  2: have you eaten breakfast yet?
————————————————————————————————————————————————————————————
Saved checkpoint: ./checkpoints\epoch_20.pth


EPOCH: 21/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 21 | Train Loss: 1.5212 | PPL: 4.5778
Epoch 21 | Val Loss: 2.3997 | Val PPL: 11.0194
Validation BLEU: 0.5176

 A comparison between sample predictions and references:
PRED 1: our turn is next.
REF  1: our turn is next.
————————————————————————————————————————————————————————————
PRED 2: where do you spend your summer vacation?
REF  2: where are you going to spend the summer holidays?
————————————————————————————————————————————————————————————


EPOCH: 22/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 22 | Train Loss: 1.4368 | PPL: 4.2074
Epoch 22 | Val Loss: 2.4527 | Val PPL: 11.6193
Validation BLEU: 0.5052

 A comparison between sample predictions and references:
PRED 1: when i spend the right time, i've seen my right spend spend spend spend the people when i've seen a right time.
REF  1: what you spend time doing in your childhood affects the rest of your life.
————————————————————————————————————————————————————————————
PRED 2: the hotel burned down.
REF  2: the hotel burned down.
————————————————————————————————————————————————————————————


EPOCH: 23/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 23 | Train Loss: 1.3520 | PPL: 3.8652
Epoch 23 | Val Loss: 2.4671 | Val PPL: 11.7886
Validation BLEU: 0.3605

 A comparison between sample predictions and references:
PRED 1: we went for a walk.
REF  1: we went for a stroll.
————————————————————————————————————————————————————————————
PRED 2: i will take thery off.
REF  2: i'll take back what i said.
————————————————————————————————————————————————————————————


EPOCH: 24/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 24 | Train Loss: 1.2738 | PPL: 3.5743
Epoch 24 | Val Loss: 2.3126 | Val PPL: 10.1010
Validation BLEU: 0.0607

 A comparison between sample predictions and references:
PRED 1: calm down.
REF  1: please relax.
————————————————————————————————————————————————————————————
PRED 2: his wife is a hold of his wife.
REF  2: that man is skinny, but his wife is fat.
————————————————————————————————————————————————————————————


EPOCH: 25/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 25 | Train Loss: 1.1971 | PPL: 3.3105
Epoch 25 | Val Loss: 2.3662 | Val PPL: 10.6570
Validation BLEU: 0.7541

 A comparison between sample predictions and references:
PRED 1: tom isn't watching tv at all.
REF  1: tom isn't watching tv now.
————————————————————————————————————————————————————————————
PRED 2: may i come again?
REF  2: may i come again?
————————————————————————————————————————————————————————————


EPOCH: 26/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 26 | Train Loss: 1.1286 | PPL: 3.0912
Epoch 26 | Val Loss: 2.3413 | Val PPL: 10.3948
Validation BLEU: 0.5432

 A comparison between sample predictions and references:
PRED 1: bring the laundry in the laundry.
REF  1: bring the washing in.
————————————————————————————————————————————————————————————
PRED 2: how many brothers do you have?
REF  2: how many brothers do you have?
————————————————————————————————————————————————————————————


EPOCH: 27/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 27 | Train Loss: 1.0621 | PPL: 2.8925
Epoch 27 | Val Loss: 2.3605 | Val PPL: 10.5965
Validation BLEU: 0.1684

 A comparison between sample predictions and references:
PRED 1: there is a in the box.
REF  1: there is a doll in the box.
————————————————————————————————————————————————————————————
PRED 2: i cut my hair cut as a haircut as i get some transfer.
REF  2: i tried to cut my bangs myself, but i cut off too much.
————————————————————————————————————————————————————————————


EPOCH: 28/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 28 | Train Loss: 1.0051 | PPL: 2.7322
Epoch 28 | Val Loss: 2.4029 | Val PPL: 11.0557
Validation BLEU: 0.0721

 A comparison between sample predictions and references:
PRED 1: i'll give you ai-up.
REF  1: i have something to give you.
————————————————————————————————————————————————————————————
PRED 2: put on your uniform.
REF  2: put your uniform on.
————————————————————————————————————————————————————————————


EPOCH: 29/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 29 | Train Loss: 0.9446 | PPL: 2.5718
Epoch 29 | Val Loss: 2.3281 | Val PPL: 10.2586
Validation BLEU: 0.6429

 A comparison between sample predictions and references:
PRED 1: you can't live without water.
REF  1: you couldn't live without water.
————————————————————————————————————————————————————————————
PRED 2: our team won the game.
REF  2: our team won the game.
————————————————————————————————————————————————————————————


EPOCH: 30/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 30 | Train Loss: 0.8921 | PPL: 2.4403
Epoch 30 | Val Loss: 2.4195 | Val PPL: 11.2406
Validation BLEU: 0.1123

 A comparison between sample predictions and references:
PRED 1: the qu invited us to the thr.
REF  1: the queen was gracious enough to invite us.
————————————————————————————————————————————————————————————
PRED 2: tom committed suicide.
REF  2: tom has committed suicide.
————————————————————————————————————————————————————————————
Saved checkpoint: ./checkpoints\epoch_30.pth


EPOCH: 31/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 31 | Train Loss: 0.8374 | PPL: 2.3104
Epoch 31 | Val Loss: 2.5398 | Val PPL: 12.6772
Validation BLEU: 0.1446

 A comparison between sample predictions and references:
PRED 1: he didn't show up after all.
REF  1: he didn't turn up after all.
————————————————————————————————————————————————————————————
PRED 2: if you'd help me, i'd very happy.
REF  2: i'd be very grateful if you could help me.
————————————————————————————————————————————————————————————


EPOCH: 32/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 32 | Train Loss: 0.7894 | PPL: 2.2021
Epoch 32 | Val Loss: 2.3948 | Val PPL: 10.9660
Validation BLEU: 0.6028

 A comparison between sample predictions and references:
PRED 1: where are you going?
REF  1: where are you going?
————————————————————————————————————————————————————————————
PRED 2: could you help me do my homework?
REF  2: will you help me with my homework?
————————————————————————————————————————————————————————————


EPOCH: 33/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 33 | Train Loss: 0.7420 | PPL: 2.1001
Epoch 33 | Val Loss: 2.5611 | Val PPL: 12.9496
Validation BLEU: 0.5000

 A comparison between sample predictions and references:
PRED 1: what a article!
REF  1: don't read my diary.
————————————————————————————————————————————————————————————
PRED 2: how many brothers do you have?
REF  2: how many brothers do you have?
————————————————————————————————————————————————————————————


EPOCH: 34/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 34 | Train Loss: 0.7003 | PPL: 2.0143
Epoch 34 | Val Loss: 2.3393 | Val PPL: 10.3741
Validation BLEU: 1.0000

 A comparison between sample predictions and references:
PRED 1: i thought tom might know where mary lives.
REF  1: i thought tom might know where mary lives.
————————————————————————————————————————————————————————————
PRED 2: tom is making a paper airplane.
REF  2: tom is making a paper airplane.
————————————————————————————————————————————————————————————


EPOCH: 35/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 35 | Train Loss: 0.6642 | PPL: 1.9430
Epoch 35 | Val Loss: 2.4863 | Val PPL: 12.0164
Validation BLEU: 0.5000

 A comparison between sample predictions and references:
PRED 1: our team won the game.
REF  1: our team won the game.
————————————————————————————————————————————————————————————
PRED 2: you should've big child.
REF  2: i'm sure the children are getting big.
————————————————————————————————————————————————————————————


EPOCH: 36/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 36 | Train Loss: 0.6252 | PPL: 1.8686
Epoch 36 | Val Loss: 2.4692 | Val PPL: 11.8132
Validation BLEU: 0.0182

 A comparison between sample predictions and references:
PRED 1: you've cut my hair cut out once in the front of the bar.
REF  1: i tried to cut my bangs myself, but i cut off too much.
————————————————————————————————————————————————————————————
PRED 2: i've run short of money.
REF  2: our money ran out.
————————————————————————————————————————————————————————————


EPOCH: 37/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 37 | Train Loss: 0.5881 | PPL: 1.8006
Epoch 37 | Val Loss: 2.4857 | Val PPL: 12.0091
Validation BLEU: 0.2601

 A comparison between sample predictions and references:
PRED 1: tom isn't poor.
REF  1: tom is poor.
————————————————————————————————————————————————————————————
PRED 2: will you come to my party?
REF  2: would you like to come to my party?
————————————————————————————————————————————————————————————


EPOCH: 38/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 38 | Train Loss: 0.5602 | PPL: 1.7509
Epoch 38 | Val Loss: 2.4118 | Val PPL: 11.1542
Validation BLEU: 0.1306

 A comparison between sample predictions and references:
PRED 1: finally, she began my heart.
REF  1: my wish has finally come true.
————————————————————————————————————————————————————————————
PRED 2: bring the laundry in the laundry.
REF  2: bring in the laundry.
————————————————————————————————————————————————————————————


EPOCH: 39/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 39 | Train Loss: 0.5306 | PPL: 1.7000
Epoch 39 | Val Loss: 2.4235 | Val PPL: 11.2856
Validation BLEU: 0.0101

 A comparison between sample predictions and references:
PRED 1: tom wasclclcled.
REF  1: tom was away from his desk.
————————————————————————————————————————————————————————————
PRED 2: we will get the tag par ev evac arguing.
REF  2: give him an inch and he'll take a yard.
————————————————————————————————————————————————————————————


EPOCH: 40/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 40 | Train Loss: 0.5050 | PPL: 1.6570
Epoch 40 | Val Loss: 2.4444 | Val PPL: 11.5232
Validation BLEU: 0.1428

 A comparison between sample predictions and references:
PRED 1: the lake was frozen to be freezing.
REF  1: the lake was frozen.
————————————————————————————————————————————————————————————
PRED 2: tom is three years old now.
REF  2: tom is now three years old.
————————————————————————————————————————————————————————————
Saved checkpoint: ./checkpoints\epoch_40.pth


EPOCH: 41/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 41 | Train Loss: 0.4842 | PPL: 1.6228
Epoch 41 | Val Loss: 2.5890 | Val PPL: 13.3166
Validation BLEU: 0.5676

 A comparison between sample predictions and references:
PRED 1: our team won the game.
REF  1: our team won the game.
————————————————————————————————————————————————————————————
PRED 2: what's your secret?
REF  2: what's the secret?
————————————————————————————————————————————————————————————


EPOCH: 42/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 42 | Train Loss: 0.4599 | PPL: 1.5839
Epoch 42 | Val Loss: 2.4819 | Val PPL: 11.9645
Validation BLEU: 0.0781

 A comparison between sample predictions and references:
PRED 1: there are a lot ofiated in the box.
REF  1: there is a doll in the box.
————————————————————————————————————————————————————————————
PRED 2: you can do anything.
REF  2: anything is ok with me.
————————————————————————————————————————————————————————————


EPOCH: 43/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 43 | Train Loss: 0.4397 | PPL: 1.5522
Epoch 43 | Val Loss: 2.5504 | Val PPL: 12.8126
Validation BLEU: 0.0370

 A comparison between sample predictions and references:
PRED 1: i'd really gentle my arm, so i wouldn't have the whole body.
REF  1: i have a very sore arm where you hit me.
————————————————————————————————————————————————————————————
PRED 2: she got up with her eggs.
REF  2: she ran away with the eggs.
————————————————————————————————————————————————————————————


EPOCH: 44/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 44 | Train Loss: 0.4234 | PPL: 1.5271
Epoch 44 | Val Loss: 2.3696 | Val PPL: 10.6928
Validation BLEU: 0.1949

 A comparison between sample predictions and references:
PRED 1: don't underestimate me.
REF  1: you shouldn't underestimate me.
————————————————————————————————————————————————————————————
PRED 2: i can't speak french, too.
REF  2: i can't speak french or english.
————————————————————————————————————————————————————————————


EPOCH: 45/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 45 | Train Loss: 0.4068 | PPL: 1.5019
Epoch 45 | Val Loss: 2.6322 | Val PPL: 13.9040
Validation BLEU: 0.1581

 A comparison between sample predictions and references:
PRED 1: she's dieting.
REF  1: she's dieting.
————————————————————————————————————————————————————————————
PRED 2: you'll be a large man for you.
REF  2: you always try to blame somebody else.
————————————————————————————————————————————————————————————


EPOCH: 46/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 46 | Train Loss: 0.3902 | PPL: 1.4772
Epoch 46 | Val Loss: 2.6526 | Val PPL: 14.1910
Validation BLEU: 0.7686

 A comparison between sample predictions and references:
PRED 1: our team won the game.
REF  1: our team won the game.
————————————————————————————————————————————————————————————
PRED 2: hurry up or you'll be late.
REF  2: hurry up, or you'll be late.
————————————————————————————————————————————————————————————


EPOCH: 47/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 47 | Train Loss: 0.3737 | PPL: 1.4532
Epoch 47 | Val Loss: 2.4674 | Val PPL: 11.7914
Validation BLEU: 0.5165

 A comparison between sample predictions and references:
PRED 1: the quot us invited us to the king.
REF  1: the queen was gracious enough to invite us.
————————————————————————————————————————————————————————————
PRED 2: i wanted to play tag.
REF  2: i wanted to play tag.
————————————————————————————————————————————————————————————


EPOCH: 48/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 48 | Train Loss: 0.3599 | PPL: 1.4332
Epoch 48 | Val Loss: 2.5883 | Val PPL: 13.3065
Validation BLEU: 0.0401

 A comparison between sample predictions and references:
PRED 1: i'll pay the day before yesterday.
REF  1: give him an inch and he'll take a yard.
————————————————————————————————————————————————————————————
PRED 2: this book is written.
REF  2: this book could be useful to you.
————————————————————————————————————————————————————————————


EPOCH: 49/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 49 | Train Loss: 0.3500 | PPL: 1.4191
Epoch 49 | Val Loss: 2.5468 | Val PPL: 12.7664
Validation BLEU: 0.1946

 A comparison between sample predictions and references:
PRED 1: how many siblings do you have?
REF  1: how many brothers do you have?
————————————————————————————————————————————————————————————
PRED 2: we speaks french.
REF  2: we speak french.
————————————————————————————————————————————————————————————


EPOCH: 50/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 50 | Train Loss: 0.3357 | PPL: 1.3989
Epoch 50 | Val Loss: 2.6132 | Val PPL: 13.6426
Validation BLEU: 0.2375

 A comparison between sample predictions and references:
PRED 1: he didn't say when he'll come back.
REF  1: he didn't specify when he would return.
————————————————————————————————————————————————————————————
PRED 2: this coffee tastes good.
REF  2: this coffee tastes bitter.
————————————————————————————————————————————————————————————
Saved checkpoint: ./checkpoints\epoch_50.pth


EPOCH: 51/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 51 | Train Loss: 0.3252 | PPL: 1.3842
Epoch 51 | Val Loss: 2.5622 | Val PPL: 12.9639
Validation BLEU: 0.5817

 A comparison between sample predictions and references:
PRED 1: she ran away when she was away.
REF  1: she ran away with the eggs.
————————————————————————————————————————————————————————————
PRED 2: she accused me of being a liar.
REF  2: she accused me of being a liar.
————————————————————————————————————————————————————————————


EPOCH: 52/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 52 | Train Loss: 0.3161 | PPL: 1.3718
Epoch 52 | Val Loss: 2.6035 | Val PPL: 13.5113
Validation BLEU: 0.1716

 A comparison between sample predictions and references:
PRED 1: i'm interested in doing that.
REF  1: i'm interested in that, too.
————————————————————————————————————————————————————————————
PRED 2: he didn't know when he'll be back.
REF  2: he didn't specify when he would return.
————————————————————————————————————————————————————————————


EPOCH: 53/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 53 | Train Loss: 0.3060 | PPL: 1.3580
Epoch 53 | Val Loss: 2.6012 | Val PPL: 13.4797
Validation BLEU: 0.0394

 A comparison between sample predictions and references:
PRED 1: he's studying on the farmer.
REF  1: he is studying agriculture.
————————————————————————————————————————————————————————————
PRED 2: more and more think i could think he loves more.
REF  2: the more i think of it, the less i like it.
————————————————————————————————————————————————————————————


EPOCH: 54/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 54 | Train Loss: 0.2969 | PPL: 1.3456
Epoch 54 | Val Loss: 2.7002 | Val PPL: 14.8828
Validation BLEU: 0.5919

 A comparison between sample predictions and references:
PRED 1: there are someiated in the box.
REF  1: there is a doll in the box.
————————————————————————————————————————————————————————————
PRED 2: i thought you wanted tom to marry you.
REF  2: i thought you wanted tom to marry you.
————————————————————————————————————————————————————————————


EPOCH: 55/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 55 | Train Loss: 0.2902 | PPL: 1.3367
Epoch 55 | Val Loss: 2.6362 | Val PPL: 13.9603
Validation BLEU: 0.0466

 A comparison between sample predictions and references:
PRED 1: calm down.
REF  1: please relax.
————————————————————————————————————————————————————————————
PRED 2: when the old man became very young, he wasoo visited me.
REF  2: the old man often looks back on his youth.
————————————————————————————————————————————————————————————


EPOCH: 56/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 56 | Train Loss: 0.2824 | PPL: 1.3264
Epoch 56 | Val Loss: 2.7425 | Val PPL: 15.5261
Validation BLEU: 0.1270

 A comparison between sample predictions and references:
PRED 1: it's a long time to make it's going to rain.
REF  1: rice is grown in rainy regions.
————————————————————————————————————————————————————————————
PRED 2: i'll take back everything i said.
REF  2: i'll take back what i said.
————————————————————————————————————————————————————————————


EPOCH: 57/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 57 | Train Loss: 0.2757 | PPL: 1.3175
Epoch 57 | Val Loss: 2.6260 | Val PPL: 13.8185
Validation BLEU: 0.1064

 A comparison between sample predictions and references:
PRED 1: this house is my aunt's.
REF  1: this house belongs to my uncle.
————————————————————————————————————————————————————————————
PRED 2: more and more than i think it, the more i hate less i think.
REF  2: the more i think of it, the less i like it.
————————————————————————————————————————————————————————————


EPOCH: 58/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 58 | Train Loss: 0.2671 | PPL: 1.3062
Epoch 58 | Val Loss: 2.6965 | Val PPL: 14.8271
Validation BLEU: 0.6330

 A comparison between sample predictions and references:
PRED 1: the girl was kind enough to show me the way to the museum.
REF  1: the girl was kind enough to show me the way to the museum.
————————————————————————————————————————————————————————————
PRED 2: one can't live without water.
REF  2: you couldn't live without water.
————————————————————————————————————————————————————————————


EPOCH: 59/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 59 | Train Loss: 0.2613 | PPL: 1.2987
Epoch 59 | Val Loss: 2.5941 | Val PPL: 13.3847
Validation BLEU: 0.0325

 A comparison between sample predictions and references:
PRED 1: did you enjoy it?
REF  1: was it interesting?
————————————————————————————————————————————————————————————
PRED 2: i think we should leave early this quickly.
REF  2: i think that we'd better leave early.
————————————————————————————————————————————————————————————


EPOCH: 60/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 60 | Train Loss: 0.2551 | PPL: 1.2906
Epoch 60 | Val Loss: 2.6994 | Val PPL: 14.8704
Validation BLEU: 0.7812

 A comparison between sample predictions and references:
PRED 1: she has been busy since yesterday.
REF  1: she has been busy since yesterday.
————————————————————————————————————————————————————————————
PRED 2: today is saturday.
REF  2: today is saturday.
————————————————————————————————————————————————————————————
Saved checkpoint: ./checkpoints\epoch_60.pth


EPOCH: 61/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 61 | Train Loss: 0.2502 | PPL: 1.2843
Epoch 61 | Val Loss: 2.5965 | Val PPL: 13.4163
Validation BLEU: 0.2937

 A comparison between sample predictions and references:
PRED 1: tom's illness resulted from eating too much.
REF  1: the reason tom got sick was because he overate.
————————————————————————————————————————————————————————————
PRED 2: he was absent from school because he was sick.
REF  2: he was absent from school because of illness.
————————————————————————————————————————————————————————————


EPOCH: 62/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 62 | Train Loss: 0.2452 | PPL: 1.2779
Epoch 62 | Val Loss: 2.5491 | Val PPL: 12.7958
Validation BLEU: 0.3033

 A comparison between sample predictions and references:
PRED 1: today is saturday.
REF  1: today is saturday.
————————————————————————————————————————————————————————————
PRED 2: this is just over an hour.
REF  2: i've been laughing at that for over an hour now.
————————————————————————————————————————————————————————————


EPOCH: 63/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 63 | Train Loss: 0.2384 | PPL: 1.2693
Epoch 63 | Val Loss: 2.7330 | Val PPL: 15.3789
Validation BLEU: 0.1009

 A comparison between sample predictions and references:
PRED 1: i studied all night long.
REF  1: i was up all night studying.
————————————————————————————————————————————————————————————
PRED 2: he didn't know when he was back.
REF  2: he didn't specify when he would return.
————————————————————————————————————————————————————————————


EPOCH: 64/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 64 | Train Loss: 0.2334 | PPL: 1.2629
Epoch 64 | Val Loss: 2.6341 | Val PPL: 13.9308
Validation BLEU: 0.0909

 A comparison between sample predictions and references:
PRED 1: i'm sorry, but i have a pain in my arm.
REF  1: i have a very sore arm where you hit me.
————————————————————————————————————————————————————————————
PRED 2: it seems that the train is broken.
REF  2: the train seems to be late.
————————————————————————————————————————————————————————————


EPOCH: 65/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 65 | Train Loss: 0.2294 | PPL: 1.2578
Epoch 65 | Val Loss: 2.6389 | Val PPL: 13.9978
Validation BLEU: 0.0214

 A comparison between sample predictions and references:
PRED 1: i've run short of money.
REF  1: our money ran out.
————————————————————————————————————————————————————————————
PRED 2: it seems that the crowded is not for you are very crowded.
REF  2: it seems less crowded during the week.
————————————————————————————————————————————————————————————


EPOCH: 66/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 66 | Train Loss: 0.2250 | PPL: 1.2523
Epoch 66 | Val Loss: 2.6347 | Val PPL: 13.9388
Validation BLEU: 0.0173

 A comparison between sample predictions and references:
PRED 1: the big boy has grown up.
REF  1: i'm sure the children are getting big.
————————————————————————————————————————————————————————————
PRED 2: i try to pay the candy by one day.
REF  2: give him an inch and he'll take a yard.
————————————————————————————————————————————————————————————


EPOCH: 67/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 67 | Train Loss: 0.2206 | PPL: 1.2468
Epoch 67 | Val Loss: 2.6344 | Val PPL: 13.9343
Validation BLEU: 0.0669

 A comparison between sample predictions and references:
PRED 1: fallen rocks blocked the road.
REF  1: the road was blocked by fallen rocks.
————————————————————————————————————————————————————————————
PRED 2: his story is really not mine.
REF  2: his story must be true.
————————————————————————————————————————————————————————————


EPOCH: 68/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 68 | Train Loss: 0.2173 | PPL: 1.2427
Epoch 68 | Val Loss: 2.7711 | Val PPL: 15.9755
Validation BLEU: 0.0736

 A comparison between sample predictions and references:
PRED 1: i went home and was at work.
REF  1: having finished my work, i went home.
————————————————————————————————————————————————————————————
PRED 2: i think i should leave early here.
REF  2: i think that we'd better leave early.
————————————————————————————————————————————————————————————


EPOCH: 69/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 69 | Train Loss: 0.2148 | PPL: 1.2397
Epoch 69 | Val Loss: 2.6992 | Val PPL: 14.8684
Validation BLEU: 0.3344

 A comparison between sample predictions and references:
PRED 1: i'm going to the party while i'll have a web page.
REF  1: for further information, see page 1 ⁇ .
————————————————————————————————————————————————————————————
PRED 2: what made her do that.
REF  2: what made her do that?
————————————————————————————————————————————————————————————


EPOCH: 70/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 70 | Train Loss: 0.2112 | PPL: 1.2352
Epoch 70 | Val Loss: 2.5724 | Val PPL: 13.0968
Validation BLEU: 0.0953

 A comparison between sample predictions and references:
PRED 1: don't smoke here.
REF  1: smoking is not permitted here.
————————————————————————————————————————————————————————————
PRED 2: it's fine with me.
REF  2: anything is ok with me.
————————————————————————————————————————————————————————————
Saved checkpoint: ./checkpoints\epoch_70.pth


EPOCH: 71/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 71 | Train Loss: 0.2059 | PPL: 1.2286
Epoch 71 | Val Loss: 2.7098 | Val PPL: 15.0262
Validation BLEU: 0.4115

 A comparison between sample predictions and references:
PRED 1: you can't live without water.
REF  1: you couldn't live without water.
————————————————————————————————————————————————————————————
PRED 2: stay calm and do your best.
REF  2: stay calm, and do your best.
————————————————————————————————————————————————————————————


EPOCH: 72/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 72 | Train Loss: 0.2032 | PPL: 1.2254
Epoch 72 | Val Loss: 2.6959 | Val PPL: 14.8194
Validation BLEU: 0.6270

 A comparison between sample predictions and references:
PRED 1: where did you buy this car?
REF  1: where did you buy this car?
————————————————————————————————————————————————————————————
PRED 2: he didn't show up after all.
REF  2: he didn't turn up after all.
————————————————————————————————————————————————————————————


EPOCH: 73/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 73 | Train Loss: 0.2011 | PPL: 1.2227
Epoch 73 | Val Loss: 2.7966 | Val PPL: 16.3886
Validation BLEU: 0.2930

 A comparison between sample predictions and references:
PRED 1: stay calm and do your best.
REF  1: stay calm, and do your best.
————————————————————————————————————————————————————————————
PRED 2: give me a politician.
REF  2: i have something to give you.
————————————————————————————————————————————————————————————


EPOCH: 74/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 74 | Train Loss: 0.1977 | PPL: 1.2186
Epoch 74 | Val Loss: 2.6973 | Val PPL: 14.8396
Validation BLEU: 0.3693

 A comparison between sample predictions and references:
PRED 1: he wasn't able to press his music.
REF  1: because some urgent business came up, he wasn't able to go to the concert.
————————————————————————————————————————————————————————————
PRED 2: he was absent from school because he was sick.
REF  2: he was absent from school because of illness.
————————————————————————————————————————————————————————————


EPOCH: 75/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 75 | Train Loss: 0.1946 | PPL: 1.2149
Epoch 75 | Val Loss: 2.7321 | Val PPL: 15.3647
Validation BLEU: 0.0204

 A comparison between sample predictions and references:
PRED 1: i'm still insects.
REF  1: there are insects everywhere.
————————————————————————————————————————————————————————————
PRED 2: he's studying family of a farmer.
REF  2: he is studying agriculture.
————————————————————————————————————————————————————————————


EPOCH: 76/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 76 | Train Loss: 0.1901 | PPL: 1.2094
Epoch 76 | Val Loss: 2.7079 | Val PPL: 14.9982
Validation BLEU: 0.2891

 A comparison between sample predictions and references:
PRED 1: stay calm and do your best.
REF  1: stay calm, and do your best.
————————————————————————————————————————————————————————————
PRED 2: tom is bad at multaf sleep.
REF  2: tom isn't good with basic arithmetic.
————————————————————————————————————————————————————————————


EPOCH: 77/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 77 | Train Loss: 0.1870 | PPL: 1.2056
Epoch 77 | Val Loss: 2.6368 | Val PPL: 13.9690
Validation BLEU: 0.4784

 A comparison between sample predictions and references:
PRED 1: today is saturday.
REF  1: today is saturday.
————————————————————————————————————————————————————————————
PRED 2: he is more patient than his brother.
REF  2: his brother is more patient than he is.
————————————————————————————————————————————————————————————


EPOCH: 78/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 78 | Train Loss: 0.1857 | PPL: 1.2041
Epoch 78 | Val Loss: 2.7042 | Val PPL: 14.9418
Validation BLEU: 0.7151

 A comparison between sample predictions and references:
PRED 1: do you have a cell phone, tom?
REF  1: do you have a cell phone, tom?
————————————————————————————————————————————————————————————
PRED 2: do you think that would help?
REF  2: do you think that will be useful?
————————————————————————————————————————————————————————————


EPOCH: 79/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 79 | Train Loss: 0.1825 | PPL: 1.2003
Epoch 79 | Val Loss: 2.6584 | Val PPL: 14.2738
Validation BLEU: 0.0714

 A comparison between sample predictions and references:
PRED 1: you should go by train.
REF  1: it would be better if you went by train.
————————————————————————————————————————————————————————————
PRED 2: tom bought a lot of bread.
REF  2: tom bought rolls.
————————————————————————————————————————————————————————————


EPOCH: 80/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 80 | Train Loss: 0.1803 | PPL: 1.1975
Epoch 80 | Val Loss: 2.7174 | Val PPL: 15.1410
Validation BLEU: 0.3190

 A comparison between sample predictions and references:
PRED 1: i was impolite.
REF  1: i was rude.
————————————————————————————————————————————————————————————
PRED 2: he is my life.
REF  2: he is my classmate.
————————————————————————————————————————————————————————————
Saved checkpoint: ./checkpoints\epoch_80.pth


EPOCH: 81/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 81 | Train Loss: 0.1773 | PPL: 1.1940
Epoch 81 | Val Loss: 2.7123 | Val PPL: 15.0640
Validation BLEU: 0.6988

 A comparison between sample predictions and references:
PRED 1: i was absorbed in reading a novel.
REF  1: i was absorbed in reading a novel.
————————————————————————————————————————————————————————————
PRED 2: tom is a zebra.
REF  2: tom is a magician.
————————————————————————————————————————————————————————————


EPOCH: 82/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 82 | Train Loss: 0.1752 | PPL: 1.1915
Epoch 82 | Val Loss: 2.7533 | Val PPL: 15.6946
Validation BLEU: 0.1582

 A comparison between sample predictions and references:
PRED 1: it's really cool here.
REF  1: it's nice and cool here.
————————————————————————————————————————————————————————————
PRED 2: where is the cafeteria?
REF  2: where's the cafeteria?
————————————————————————————————————————————————————————————


EPOCH: 83/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 83 | Train Loss: 0.1740 | PPL: 1.1900
Epoch 83 | Val Loss: 2.8191 | Val PPL: 16.7624
Validation BLEU: 0.9046

 A comparison between sample predictions and references:
PRED 1: she has been busy since yesterday.
REF  1: she has been busy since yesterday.
————————————————————————————————————————————————————————————
PRED 2: do you wash your hands before eating?
REF  2: do you wash your hands before meals?
————————————————————————————————————————————————————————————


EPOCH: 84/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 84 | Train Loss: 0.1711 | PPL: 1.1866
Epoch 84 | Val Loss: 2.6745 | Val PPL: 14.5050
Validation BLEU: 0.0841

 A comparison between sample predictions and references:
PRED 1: the weather is good today.
REF  1: it's nice today.
————————————————————————————————————————————————————————————
PRED 2: we've been laughed for over an hour.
REF  2: i've been laughing at that for over an hour now.
————————————————————————————————————————————————————————————


EPOCH: 85/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 85 | Train Loss: 0.1694 | PPL: 1.1846
Epoch 85 | Val Loss: 2.7582 | Val PPL: 15.7717
Validation BLEU: 0.0695

 A comparison between sample predictions and references:
PRED 1: what's that strange noise?
REF  1: i wonder what that strange sound is.
————————————————————————————————————————————————————————————
PRED 2: i went home and went home last train.
REF  2: having finished my work, i went home.
————————————————————————————————————————————————————————————


EPOCH: 86/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 86 | Train Loss: 0.1688 | PPL: 1.1839
Epoch 86 | Val Loss: 2.7812 | Val PPL: 16.1376
Validation BLEU: 0.5131

 A comparison between sample predictions and references:
PRED 1: could you help me with my homework?
REF  1: will you help me with my homework?
————————————————————————————————————————————————————————————
PRED 2: i like both dogs and cats.
REF  2: i like both cats and dogs.
————————————————————————————————————————————————————————————


EPOCH: 87/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 87 | Train Loss: 0.1656 | PPL: 1.1801
Epoch 87 | Val Loss: 2.7523 | Val PPL: 15.6794
Validation BLEU: 0.5290

 A comparison between sample predictions and references:
PRED 1: tom took his seat.
REF  1: tom was away from his desk.
————————————————————————————————————————————————————————————
PRED 2: bring in the laundry.
REF  2: bring in the laundry.
————————————————————————————————————————————————————————————


EPOCH: 88/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 88 | Train Loss: 0.1635 | PPL: 1.1776
Epoch 88 | Val Loss: 2.5551 | Val PPL: 12.8726
Validation BLEU: 0.6988

 A comparison between sample predictions and references:
PRED 1: generally speaking, the climate of japan is mild.
REF  1: generally speaking, the climate of japan is mild.
————————————————————————————————————————————————————————————
PRED 2: the house was silent.
REF  2: the house was quiet.
————————————————————————————————————————————————————————————


EPOCH: 89/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 89 | Train Loss: 0.1621 | PPL: 1.1760
Epoch 89 | Val Loss: 2.8213 | Val PPL: 16.7994
Validation BLEU: 0.5253

 A comparison between sample predictions and references:
PRED 1: this desk is broken.
REF  1: this desk is broken.
————————————————————————————————————————————————————————————
PRED 2: i don't know why he did not work.
REF  2: i have no idea why he quit his job suddenly.
————————————————————————————————————————————————————————————


EPOCH: 90/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 90 | Train Loss: 0.1607 | PPL: 1.1743
Epoch 90 | Val Loss: 2.7768 | Val PPL: 16.0674
Validation BLEU: 0.1671

 A comparison between sample predictions and references:
PRED 1: he doesn't eat anything except fruit.
REF  1: he eats nothing but fruit.
————————————————————————————————————————————————————————————
PRED 2: you can't live without water.
REF  2: you couldn't live without water.
————————————————————————————————————————————————————————————
Saved checkpoint: ./checkpoints\epoch_90.pth


EPOCH: 91/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 91 | Train Loss: 0.1580 | PPL: 1.1711
Epoch 91 | Val Loss: 2.7992 | Val PPL: 16.4320
Validation BLEU: 0.3190

 A comparison between sample predictions and references:
PRED 1: i was impolite.
REF  1: i was rude.
————————————————————————————————————————————————————————————
PRED 2: he is my life.
REF  2: he is my classmate.
————————————————————————————————————————————————————————————


EPOCH: 92/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 92 | Train Loss: 0.1571 | PPL: 1.1701
Epoch 92 | Val Loss: 2.8666 | Val PPL: 17.5779
Validation BLEU: 0.7143

 A comparison between sample predictions and references:
PRED 1: let's conserve our limited water resources.
REF  1: let's conserve our limited water supply.
————————————————————————————————————————————————————————————
PRED 2: we went for a walk.
REF  2: we went for a stroll.
————————————————————————————————————————————————————————————


EPOCH: 93/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 93 | Train Loss: 0.1553 | PPL: 1.1680
Epoch 93 | Val Loss: 2.9018 | Val PPL: 18.2067
Validation BLEU: 0.1828

 A comparison between sample predictions and references:
PRED 1: why me, may i take my advice?
REF  1: how come you aren't taking me?
————————————————————————————————————————————————————————————
PRED 2: he makes frequent visits to japan on business.
REF  2: he often goes to japan on business.
————————————————————————————————————————————————————————————


EPOCH: 94/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 94 | Train Loss: 0.1534 | PPL: 1.1657
Epoch 94 | Val Loss: 2.8232 | Val PPL: 16.8307
Validation BLEU: 0.5221

 A comparison between sample predictions and references:
PRED 1: she helped her mother help, but not speak to her mother.
REF  1: she assisted her mother in caring for the baby.
————————————————————————————————————————————————————————————
PRED 2: i think it is a mere coincidence.
REF  2: i think it is a mere coincidence.
————————————————————————————————————————————————————————————


EPOCH: 95/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 95 | Train Loss: 0.1522 | PPL: 1.1644
Epoch 95 | Val Loss: 2.8248 | Val PPL: 16.8584
Validation BLEU: 0.0269

 A comparison between sample predictions and references:
PRED 1: calm down.
REF  1: please relax.
————————————————————————————————————————————————————————————
PRED 2: i was on my house.
REF  2: the house was in flames.
————————————————————————————————————————————————————————————


EPOCH: 96/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 96 | Train Loss: 0.1508 | PPL: 1.1628
Epoch 96 | Val Loss: 2.8380 | Val PPL: 17.0823
Validation BLEU: 0.0849

 A comparison between sample predictions and references:
PRED 1: he didn't know for sure when he'll come back.
REF  1: he didn't specify when he would return.
————————————————————————————————————————————————————————————
PRED 2: what are you laughing at?
REF  2: why are you laughing?
————————————————————————————————————————————————————————————


EPOCH: 97/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 97 | Train Loss: 0.1487 | PPL: 1.1603
Epoch 97 | Val Loss: 2.9787 | Val PPL: 19.6618
Validation BLEU: 0.0907

 A comparison between sample predictions and references:
PRED 1: can i talk with you for a moment?
REF  1: may i speak to you a minute?
————————————————————————————————————————————————————————————
PRED 2: the train will arrive on time.
REF  2: the train will come in at platform ten.
————————————————————————————————————————————————————————————


EPOCH: 98/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 98 | Train Loss: 0.1490 | PPL: 1.1607
Epoch 98 | Val Loss: 2.7270 | Val PPL: 15.2865
Validation BLEU: 0.0893

 A comparison between sample predictions and references:
PRED 1: that noise's what he's so strange.
REF  1: i wonder what that strange sound is.
————————————————————————————————————————————————————————————
PRED 2: tom is a man of the professional kit.
REF  2: tom is a real man.
————————————————————————————————————————————————————————————


EPOCH: 99/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 99 | Train Loss: 0.1480 | PPL: 1.1596
Epoch 99 | Val Loss: 2.8920 | Val PPL: 18.0290
Validation BLEU: 0.5568

 A comparison between sample predictions and references:
PRED 1: his story isn't really boring.
REF  1: his story must be true.
————————————————————————————————————————————————————————————
PRED 2: our team won the game.
REF  2: our team won the game.
————————————————————————————————————————————————————————————


EPOCH: 100/100:   0%|          | 0/536 [00:00<?, ?it/s]

Epoch 100 | Train Loss: 0.1459 | PPL: 1.1571
Epoch 100 | Val Loss: 2.8688 | Val PPL: 17.6162
Validation BLEU: 0.6270

 A comparison between sample predictions and references:
PRED 1: i haven't been to college yet.
REF  1: i haven't gone to college yet.
————————————————————————————————————————————————————————————
PRED 2: she has been busy since yesterday.
REF  2: she has been busy since yesterday.
————————————————————————————————————————————————————————————
Saved checkpoint: ./checkpoints\epoch_100.pth


In [32]:
torch.save(model.state_dict(), 'outputs/model_state.pth')

In [33]:
torch.save(model, 'outputs/ja_en_transformer_translator_model.pth')

In [37]:
model.load_state_dict(torch.load("outputs/model_state.pth", map_location=device))
model.eval()

Transformer(
  (ja_embedding): Embedding(8000, 512, padding_idx=0)
  (en_embedding): Embedding(8000, 512, padding_idx=0)
  (positional_encoder): PositionalEncoder(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (attention): MultiHeadAttention(
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (out): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout1): Dropout(p=0.1, inplace=False)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feedforward): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Dropout(p=0.1, inplac

In [37]:
checkpoint = torch.load("checkpoints/epoch_50.pth", map_location=device)
model.load_state_dict(checkpoint["model_state"])
optimizer.load_state_dict(checkpoint["optimizer_state"])
epoch = checkpoint["epoch"]
train_loss = checkpoint["loss"]

In [15]:
model.eval()
val_loss = 0.0
n = 0
with torch.no_grad():
    for ja_ids, en_ids in val_loader:
        ja_ids = ja_ids.to(device)
        en_ids = en_ids.to(device)
        dec_input = en_ids[:, :-1]
        target = en_ids[:, 1:].contiguous().view(-1)
        outputs = model(ja_ids, dec_input)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = criterion(outputs, target)
        val_loss += loss.item() * ja_ids.size(0)
        n += ja_ids.size(0)

val_loss /= n
val_ppl = math.exp(val_loss)
print(f"Validation Loss: {val_loss:.4f} | Perplexity: {val_ppl:.2f}")

NameError: name 'val_loader' is not defined

In [51]:
def evaluate_samples(model, val_loader, tokenizer, device, num_samples=10):
    model.eval()
    shown = 0

    with torch.no_grad():
        for ja_ids, en_ids in val_loader:
            ja_ids = ja_ids.to(device)
            en_ids = en_ids.to(device)

            # Loop over batch sentences
            for j in range(ja_ids.size(0)):
                src = ja_ids[j].unsqueeze(0)   # [1, seq_len]
                ref = en_ids[j].tolist()

                # Decode prediction
                pred_ids = greedy_decode(model, src, tokenizer, max_len=80, device=device)
                pred_sentence = tokenizer.decode(pred_ids)

                # Decode reference
                ref_sentence = tokenizer.decode(
                    [t for t in ref if t not in [tokenizer.pad_id(), tokenizer.bos_id(), tokenizer.eos_id()]]
                )

                print("SRC:", src[0][:20].tolist(), "...")  # print first 20 tokens as a check
                print("PRED:", pred_sentence)
                print("REF: ", ref_sentence)
                print("—" * 80)

                shown += 1
                if shown >= num_samples:
                    return

In [52]:
evaluate_samples(model, val_loader, en_tokenizer, device, num_samples=10)

SRC: [1, 27, 1363, 135, 963, 5982, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ...
PRED: i'm going to go.
REF:  have a good look at this picture.
————————————————————————————————————————————————————————————————————————————————
SRC: [1, 2166, 6886, 273, 6005, 6421, 6037, 2082, 5982, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ...
PRED: i'm going to go.
REF:  don't get involved with that guy.
————————————————————————————————————————————————————————————————————————————————
SRC: [1, 528, 5993, 3565, 1515, 403, 6005, 538, 285, 3754, 88, 5982, 2, 0, 0, 0, 0, 0, 0, 0] ...
PRED: i'm going to go.
REF:  either you or i have to go there.
————————————————————————————————————————————————————————————————————————————————
SRC: [1, 2870, 6082, 6999, 5986, 2257, 606, 6012, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ...
PRED: i'm going to go.
REF:  have you ever had food poisoning?
————————————————————————————————————————————————————————————————————————————————
SRC: [1, 23, 2971, 609, 3968, 5982, 2, 0, 0, 0, 0, 0, 0, 0, 0, 

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# -----------------
# 1. Tiny dummy dataset (10 pairs)
# -----------------
tiny_data = [
    ("私は学生です", "I am a student."),
    ("彼は先生です", "He is a teacher."),
    ("猫が好きです", "I like cats."),
    ("犬が好きです", "I like dogs."),
    ("彼女は学生です", "She is a student."),
    ("私は日本人です", "I am Japanese."),
    ("これはペンです", "This is a pen."),
    ("あれは本です", "That is a book."),
    ("ありがとう", "Thank you."),
    ("さようなら", "Goodbye."),
]

# Replace with your tokenizer / vocab
src_tokenizer = lambda x: [ord(c) % 100 for c in x]   # dummy encoder
tgt_tokenizer = lambda x: [ord(c) % 100 for c in x]

class TinyDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs
    def __len__(self): return len(self.pairs)
    def __getitem__(self, idx):
        ja, en = self.pairs[idx]
        return torch.tensor(src_tokenizer(ja)), torch.tensor(tgt_tokenizer(en))

def collate(batch):
    srcs, tgts = zip(*batch)
    srcs = nn.utils.rnn.pad_sequence(srcs, batch_first=True, padding_value=0)
    tgts = nn.utils.rnn.pad_sequence(tgts, batch_first=True, padding_value=0)
    return srcs, tgts

train_loader = DataLoader(TinyDataset(tiny_data), batch_size=2, shuffle=True, collate_fn=collate)

# -----------------
# 2. Model (your Transformer here)
# -----------------
# Assume you already have Seq2SeqTransformer(enc, dec)
model = Transformer(d_model=D_MODEL,
                    ja_vocab_size=JA_VOCAB_SIZE,
                    en_vocab_size=EN_VOCAB_SIZE,
                    max_seq_len=MAX_SEQ_LEN, 
                    n_heads=N_HEADS,
                    dropout=DROPOUT,
                   n_layers=N_LAYERS)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# -----------------
# 3. Training loop
# -----------------
for epoch in range(10):
    total_loss = 0
    for src, tgt in train_loader:
        optimizer.zero_grad()
        # teacher forcing: input = tgt[:, :-1], target = tgt[:, 1:]
        out = model(src, tgt[:, :-1])
        out = out.reshape(-1, out.size(-1))
        loss = criterion(out, tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# -----------------
# 4. Test decoding
# -----------------
def greedy_decode(model, src, max_len=20):
    src = src.unsqueeze(0)
    tgt = torch.tensor([[1]])  # BOS token id
    for _ in range(max_len):
        out = model(src, tgt)
        next_word = out[0, -1].argmax().unsqueeze(0).unsqueeze(0)
        tgt = torch.cat([tgt, next_word], dim=1)
        if next_word.item() == 2:  # EOS token id
            break
    return tgt

with torch.no_grad():
    for ja, en in tiny_data[:5]:
        src = torch.tensor(src_tokenizer(ja))
        pred = greedy_decode(model, src)
        print("SRC:", ja)
        print("REF:", en)
        print("PRED:", pred.tolist())
        print("---")

Epoch 1, Loss: 267.5431
Epoch 2, Loss: 46.9154
Epoch 3, Loss: 31.8220
Epoch 4, Loss: 29.3930
Epoch 5, Loss: 28.4512
Epoch 6, Loss: 25.8560
Epoch 7, Loss: 26.2965
Epoch 8, Loss: 27.1346
Epoch 9, Loss: 24.2319
Epoch 10, Loss: 23.7523
SRC: 私は学生です
REF: I am a student.
PRED: [[1, 46, 15, 17, 99, 21, 16, 17, 32, 99, 10, 15, 46, 10, 10, 46, 46, 46, 16, 11, 46]]
---
SRC: 彼は先生です
REF: He is a teacher.
PRED: [[1, 97, 10, 46, 5, 32, 98, 32, 32, 7, 97, 10, 17, 46, 46, 5, 46, 32, 5, 46, 5]]
---
SRC: 猫が好きです
REF: I like cats.
PRED: [[1, 5, 46, 7, 46, 15, 97, 9, 46, 15, 32, 97, 32, 46, 32, 11, 15, 15, 46, 46, 97]]
---
SRC: 犬が好きです
REF: I like dogs.
PRED: [[1, 46, 15, 46, 11, 46, 16, 15, 5, 15, 12, 1, 10, 46, 32, 8, 8, 15, 15, 97, 46]]
---
SRC: 彼女は学生です
REF: She is a student.
PRED: [[1, 1, 15, 10, 46, 8, 99, 16, 15, 16, 97, 4, 99, 17, 46, 15, 15, 16, 1, 5, 46]]
---
