In [7]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import math
import nltk
import time
import re
import string
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import sacrebleu
from torch.optim.lr_scheduler import _LRScheduler, CosineAnnealingLR, ReduceLROnPlateau
import os
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu

torch.manual_seed(42)

<torch._C.Generator at 0x14307390350>

In [None]:
DATE = "23Maret-NoKeyMask-embffn5122048Layer6H4"

In [11]:
# Fungsi untuk membaca dataset JSON
def load_dataset(json_file):
    with open(f"data2/{json_file}", "r", encoding="utf-8") as f:
        data = json.load(f)
    return [entry["text"] for entry in data], [entry["label"] for entry in data]

In [12]:
# Extract text and labels
src_texts, tgt_texts = load_dataset("train.json")
val_src_texts, val_tgt_texts = load_dataset("validation.json")
test_src_texts, test_tgt_texts = load_dataset("test.json")

In [13]:
# Tokenizers
src_tokenizer = get_tokenizer("basic_english")
tgt_tokenizer = get_tokenizer("basic_english")

In [14]:
def clean_text(text):
    """Menghapus tanda baca dan mengonversi ke huruf kecil."""
    text = text.lower()  # Ubah ke huruf kecil
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)  # Hapus tanda baca
    text = re.sub(r"\s+", " ", text).strip()  # Hapus spasi berlebih
    return text

In [15]:
# Build vocabularies
def tokenize_cleaned(text, tokenizer):
    return tokenizer(text)

def yield_tokens(texts, tokenizer):
    for text in texts:
        yield tokenizer(text)

src_vocab = build_vocab_from_iterator(yield_tokens(src_texts, src_tokenizer), specials=["<pad>", "<sos>", "<eos>", "<unk>"])
tgt_vocab = build_vocab_from_iterator(yield_tokens(tgt_texts, tgt_tokenizer), specials=["<pad>", "<sos>", "<eos>", "<unk>"])

src_vocab.set_default_index(src_vocab["<unk>"])
tgt_vocab.set_default_index(tgt_vocab["<unk>"])

In [16]:
pad_idx = src_vocab.get_stoi()["<pad>"]
sos_idx = src_vocab.get_stoi()["<sos>"]
eos_idx = src_vocab.get_stoi()["<eos>"]
unk_idx = src_vocab.get_stoi()["<unk>"]

In [17]:
print(f"Jumlah token dalam vocabulary sumber (Javanese): {len(src_vocab)}")
print(f"Jumlah token dalam vocabulary target (Indonesian): {len(tgt_vocab)}")

Jumlah token dalam vocabulary sumber (Javanese): 23876
Jumlah token dalam vocabulary target (Indonesian): 15229


In [18]:
# Convert text to tensor
def text_to_tensor(text, vocab, tokenizer):
    tokens = [vocab["<sos>"]] + [vocab[token] for token in tokenizer(text)] + [vocab["<eos>"]]
    return torch.tensor(tokens, dtype=torch.long)

# Convert tensor to text
def tensor_to_text(tensor, vocab):
    special_tokens = {"<sos>", "<eos>", "<pad>"}  # Token yang ingin dihapus
    tokens = vocab.lookup_tokens(tensor.tolist())  # Konversi tensor ke token
    filtered_tokens = [token for token in tokens if token not in special_tokens]  # Hapus token spesial
    return " ".join(filtered_tokens)

In [19]:
# Dataset class
class TranslationDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, src_vocab, tgt_vocab, src_tokenizer, tgt_tokenizer):
        self.src_texts = [clean_text(text) for text in src_texts]
        self.tgt_texts = [clean_text(text) for text in tgt_texts]
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_tensor = text_to_tensor(self.src_texts[idx], self.src_vocab, self.src_tokenizer)
        tgt_tensor = text_to_tensor(self.tgt_texts[idx], self.tgt_vocab, self.tgt_tokenizer)
        return src_tensor, tgt_tensor

# DataLoader with padding
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=src_vocab["<pad>"], batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=tgt_vocab["<pad>"], batch_first=True)
    return src_batch, tgt_batch


In [20]:
batch_size=64

In [21]:
dataset = TranslationDataset(src_texts, tgt_texts, src_vocab, tgt_vocab, src_tokenizer, tgt_tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
val_dataset = TranslationDataset(val_src_texts, val_tgt_texts, src_vocab, tgt_vocab, src_tokenizer, tgt_tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataset = TranslationDataset(test_src_texts, test_tgt_texts, src_vocab, tgt_vocab, src_tokenizer, tgt_tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [22]:
for src_batch, tgt_batch in dataloader:
    print(src_batch.shape, tgt_batch.shape)
    print(tensor_to_text(src_batch[0], src_vocab))
    print(tensor_to_text(tgt_batch[0], tgt_vocab))
    break

torch.Size([64, 60]) torch.Size([64, 61])
sampun ngantos namanipun yonatan kabusek saking tedhak turunipun dhimas dawud ugi manawi pangeran yehuwah badhe males ukum dhateng mengsah mengsahipun dhimas dawud
janganlah nama yonatan terhapus dari keturunan daud melainkan kiranya tuhan menuntut balas dari pada musuh musuh daud


In [23]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        return x + self.pe[:, : x.size(1), :]


In [None]:
class TransformerMT(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_size=256, num_heads=8, num_layers=4, ff_dim=512, max_len=128, dropout=0.1):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embed_size)
        self.pos_encoding = PositionalEncoding(embed_size, max_len)
        self.dropout = nn.Dropout(dropout)

        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads, dim_feedforward=ff_dim, dropout=dropout, batch_first=True), 
            num_layers=num_layers,
            norm = nn.LayerNorm(embed_size)
        )

        # self.enc_norm = nn.LayerNorm(embed_size)

        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_size, nhead=num_heads, dim_feedforward=ff_dim, dropout=dropout, batch_first=True),
            num_layers=num_layers,
            norm = nn.LayerNorm(embed_size)
        )

        # self.dec_norm = nn.LayerNorm(embed_size)

        self.fc_out = nn.Linear(embed_size, tgt_vocab_size)
        self.src_pad_idx = src_vocab["<pad>"]
        self.tgt_pad_idx = tgt_vocab["<pad>"]

    def generate_key_padding_mask(self, seq, pad_idx):
        mask = (seq == pad_idx).float()  # Mask posisi padding
        return mask.masked_fill(mask == 1, float('-inf')).masked_fill(mask == 0, float(0.0))

    def generate_subsequent_mask(self, size):
        mask = torch.triu(torch.ones(size, size), diagonal=1)
        mask = mask.masked_fill(mask == 1, float('-inf')).masked_fill(mask == 0, float(0.0))
        return mask

    def forward(self, src, tgt, return_encoder=False, return_attention=False):
        src_emb = self.dropout(self.pos_encoding(self.src_embedding(src)))
        tgt_emb = self.dropout(self.pos_encoding(self.tgt_embedding(tgt)))

        src_key_padding_mask = None #self.generate_key_padding_mask(src, self.src_pad_idx).to(src.device)
        tgt_key_padding_mask = None #self.generate_key_padding_mask(tgt, self.tgt_pad_idx).to(tgt.device)


        tgt_mask = self.generate_subsequent_mask(tgt.size(1)).to(tgt.device)

        enc_output = self.encoder(src_emb, src_key_padding_mask=src_key_padding_mask)

        dec_output = self.decoder(
            tgt_emb, enc_output, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask, 
            memory_key_padding_mask=src_key_padding_mask
        )
        if return_attention:
            attn_scores = self.decoder.layers[0].multihead_attn.attn_output_weights
            return self.fc_out(dec_output), attn_scores

        if return_encoder:
            return self.fc_out(dec_output), enc_output
        else:
            return self.fc_out(dec_output)


In [26]:
class TransformerLRScheduler:
    def __init__(self, d_model=512, warmup_steps=4000):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.current_step = 0

    def step(self, optimizer):
        """Update learning rate based on current step."""
        self.current_step += 1
        lr = self.compute_lr()
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    def compute_lr(self):
        """Compute learning rate at current step."""
        scale = self.d_model ** -0.5
        step_factor = min(self.current_step ** -0.5, self.current_step * self.warmup_steps ** -1.5)
        return scale * step_factor

In [27]:
embed_size=512
num_heads=4
num_layers=6
ff_dim=2048
max_len=128
dropout = 0.3
warmup_steps=500

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerMT(len(src_vocab), len(tgt_vocab), embed_size, num_heads, num_layers, ff_dim, max_len, dropout)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001) #lr=0.001
scheduler = TransformerLRScheduler(d_model=embed_size)

best_val_loss = float("inf")
best_bleu_score = float("-inf")

In [29]:
@torch.no_grad()
def evaluate_bleu(model, dataloader, device, src_vocab, tgt_vocab, max_len=64):
    model.eval()
    hypotheses = []
    references = []

    with tqdm(total=len(dataloader), desc="Evaluating BLEU") as pbar:
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)

            for i in range(src.size(0)):
                src_input = src[i].unsqueeze(0)  # Tambahkan batch dimensi
                enc_output = model.encoder(model.dropout(model.pos_encoding(model.src_embedding(src_input))))

                tgt_tokens = [tgt_vocab["<sos>"]]  # Mulai dengan token <sos>

                for _ in range(max_len):
                    tgt_tensor = torch.tensor(tgt_tokens, dtype=torch.long, device=device).unsqueeze(0)  # (1, len)
                    tgt_emb = model.dropout(model.pos_encoding(model.tgt_embedding(tgt_tensor)))

                    tgt_mask = model.generate_subsequent_mask(tgt_tensor.size(1)).to(device)

                    dec_output = model.decoder(tgt_emb, enc_output, tgt_mask=tgt_mask)
                    next_token_logits = model.fc_out(dec_output[:, -1, :])  # Ambil token terakhir
                    next_token = next_token_logits.argmax(dim=-1).item()

                    if next_token == tgt_vocab["<eos>"]:  # Stop jika mencapai <eos>
                        break

                    tgt_tokens.append(next_token)

                pred_text = tensor_to_text(torch.tensor(tgt_tokens).detach().cpu(), tgt_vocab)  # Detach & pindahkan ke CPU
                ref_text = tensor_to_text(tgt[i].detach().cpu(), tgt_vocab)  # Pindahkan ke CPU

                hypotheses.append(pred_text)
                references.append(ref_text)

            pbar.update(1)

    # Hitung BLEU score dengan sacreBLEU
    bleu_score = sacrebleu.corpus_bleu(hypotheses, [references]).score
    return bleu_score

In [30]:
for src, tgt in dataloader:
    source = src.to(device)
    targer = tgt.to(device)
    break

In [None]:
# from torchview import draw_graph

# model_graph = draw_graph(model, input_data=(src, tgt), expand_nested=True)
# model_graph.visual_graph

In [31]:
def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trained_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params, trained_params

total_params, trained_params = count_parameters(model)

print(f"Total Parameters    : {total_params}")
print(f"Trained Parameters  : {trained_params}")

Total Parameters    : 71974781
Trained Parameters  : 71974781


In [32]:
@torch.no_grad()
def greedy_decode(model, src, src_vocab, tgt_vocab, device, max_len=64):
    model.eval()
    src = src.unsqueeze(0).to(device)  # Tambahkan batch dimensi

    with torch.no_grad():
        enc_output = model.enc_norm(model.encoder(model.dropout(model.pos_encoding(model.src_embedding(src)))))

    tgt_tokens = [tgt_vocab["<sos>"]]  # Mulai dengan token <sos>

    with tqdm(total=max_len, desc="Decoding", leave=False) as pbar:
        for _ in range(max_len):
            tgt_tensor = torch.tensor(tgt_tokens, dtype=torch.long, device=device).unsqueeze(0)  # (1, len)
            tgt_emb = model.dropout(model.pos_encoding(model.tgt_embedding(tgt_tensor)))

            tgt_mask = model.generate_subsequent_mask(tgt_tensor.size(1)).to(device)

            with torch.no_grad():
                dec_output = model.dec_norm(model.decoder(tgt_emb, enc_output, tgt_mask=tgt_mask))
                next_token_logits = model.fc_out(dec_output[:, -1, :])  # Ambil token terakhir
                next_token = next_token_logits.argmax(dim=-1).item()

            if next_token == tgt_vocab["<eos>"]:  # Stop jika mencapai <eos>
                break

            tgt_tokens.append(next_token)
            pbar.update(1)  # Update progress bar setiap iterasi

    return tensor_to_text(torch.tensor(tgt_tokens, dtype=torch.long, device=device), tgt_vocab)

In [33]:
@torch.no_grad()
def beam_search_decode(model, src, src_vocab=src_vocab, tgt_vocab=tgt_vocab, device=device, beam_size=5, max_len=64, temperature=1.0):
    model.eval()
    src = src.unsqueeze(0).to(device)  # Tambahkan batch dimensi

    with torch.no_grad():
        enc_output = model.enc_norm(model.encoder(model.dropout(model.pos_encoding(model.src_embedding(src)))))

    # Inisialisasi beam search dengan (score, token sequence)
    beams = [(0, [tgt_vocab["<sos>"]])]  # Log probabilitas awal = 0

    with tqdm(total=max_len, desc="Beam Decoding", leave=False) as pbar:
        for _ in range(max_len):
            all_candidates = []

            for score, tgt_tokens in beams:
                tgt_tensor = torch.tensor(tgt_tokens, dtype=torch.long, device=device).unsqueeze(0)  # (1, len)
                tgt_emb = model.dropout(model.pos_encoding(model.tgt_embedding(tgt_tensor)))

                tgt_mask = model.generate_subsequent_mask(tgt_tensor.size(1)).to(device)

                with torch.no_grad():
                    dec_output = model.dec_norm(model.decoder(tgt_emb, enc_output, tgt_mask=tgt_mask))
                    logits = model.fc_out(dec_output[:, -1, :])  # Ambil token terakhir

                # Terapkan temperature scaling
                logits = logits / temperature
                probs = F.softmax(logits, dim=-1)  # Konversi ke probabilitas
                log_probs = torch.log(probs + 1e-9)  # Hindari log(0)

                # Ambil top-k kandidat
                top_log_probs, top_indices = log_probs.topk(beam_size)

                for i in range(beam_size):
                    next_token = top_indices[0, i].item()
                    new_score = score + top_log_probs[0, i].item()  # Akumulasi log probabilitas
                    new_sequence = tgt_tokens + [next_token]
                    all_candidates.append((new_score, new_sequence))

            # Pilih `beam_size` terbaik berdasarkan skor tertinggi
            beams = sorted(all_candidates, key=lambda x: x[0], reverse=True)[:beam_size]

            # Cek apakah semua beam sudah mencapai <eos>
            if all(next_token == tgt_vocab["<eos>"] for _, seq in beams for next_token in [seq[-1]]):
                break

            pbar.update(1)  # Update progress bar setiap iterasi

    # Ambil sequence dengan skor tertinggi
    best_sequence = max(beams, key=lambda x: x[0])[1]

    return tensor_to_text(torch.tensor(best_sequence, dtype=torch.long, device=device), tgt_vocab)

In [None]:
@torch.no_grad()
def beam_search_decode_text(model, src_text, src_vocab, tgt_vocab, device, beam_size=5, max_len=64, temperature=1.0):
    """
    Beam Search Decoding untuk input teks.

    Args:
        model: Model Transformer yang telah dilatih.
        src_text (str): Kalimat input dalam bentuk teks.
        src_vocab (dict): Kamus subword ke indeks untuk bahasa sumber.
        tgt_vocab (dict): Kamus subword ke indeks untuk bahasa target.
        device: Perangkat (CPU/GPU).
        beam_size (int): Jumlah kandidat yang dijaga selama decoding.
        max_len (int): Panjang maksimum output terjemahan.
        temperature (float): Faktor untuk scaling logits (default=1.0).

    Returns:
        str: Hasil terjemahan dalam bentuk teks.
    """

    model.eval()

    # Konversi teks sumber menjadi tensor indeks
    src_tokens = text_to_tensor(src_text, src_vocab).to(device).unsqueeze(0)  # Tambahkan batch dimensi

    with torch.no_grad():
        enc_output = model.enc_norm(model.encoder(model.dropout(model.pos_encoding(model.src_embedding(src_tokens)))))

    # Inisialisasi beam search dengan (score, token sequence)
    beams = [(0, [tgt_vocab["<sos>"]])]  # Log probabilitas awal = 0

    with tqdm(total=max_len, desc="Beam Decoding", leave=False) as pbar:
        for _ in range(max_len):
            all_candidates = []

            for score, tgt_tokens in beams:
                tgt_tensor = torch.tensor(tgt_tokens, dtype=torch.long, device=device).unsqueeze(0)  # (1, len)
                tgt_emb = model.dropout(model.pos_encoding(model.tgt_embedding(tgt_tensor)))

                tgt_mask = model.generate_subsequent_mask(tgt_tensor.size(1)).to(device)

                with torch.no_grad():
                    dec_output = model.dec_norm(model.decoder(tgt_emb, enc_output, tgt_mask=tgt_mask))
                    logits = model.fc_out(dec_output[:, -1, :])  # Ambil token terakhir

                # Terapkan temperature scaling
                logits = logits / temperature
                probs = F.softmax(logits, dim=-1)  # Konversi ke probabilitas
                log_probs = torch.log(probs + 1e-9)  # Hindari log(0)

                # Ambil top-k kandidat
                top_log_probs, top_indices = log_probs.topk(beam_size)

                for i in range(beam_size):
                    next_token = top_indices[0, i].item()
                    new_score = score + top_log_probs[0, i].item()  # Akumulasi log probabilitas
                    new_sequence = tgt_tokens + [next_token]
                    all_candidates.append((new_score, new_sequence))

            # Pilih `beam_size` terbaik berdasarkan skor tertinggi
            beams = sorted(all_candidates, key=lambda x: x[0], reverse=True)[:beam_size]

            # Cek apakah semua beam sudah mencapai <eos>
            if all(next_token == tgt_vocab["<eos>"] for _, seq in beams for next_token in [seq[-1]]):
                break

            pbar.update(1)  # Update progress bar setiap iterasi

    # Ambil sequence dengan skor tertinggi
    best_sequence = max(beams, key=lambda x: x[0])[1]

    return tensor_to_text(torch.tensor(best_sequence, dtype=torch.long, device=device), tgt_vocab)


In [35]:
def train(model, dataloader, val_dataloader, optimizer, scheduler, num_epochs, device, src_vocab, tgt_vocab, save_dir="models"):
    os.makedirs(save_dir, exist_ok=True)  # Pastikan folder penyimpanan ada
    
    train_losses = []
    val_losses = []
    bleu_scores = []
    learning_rates = []  # Menyimpan learning rate
    best_bleu = 0  # BLEU score terbaik

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for src, tgt in progress_bar:
            src, tgt = src.to(device), tgt.to(device)

            tgt_input = tgt[:, :-1]  # Input untuk decoder
            tgt_output = tgt[:, 1:]  # Target untuk loss

            optimizer.zero_grad()

            # Forward pass
            logits = model(src, tgt_input)
            logits = logits.reshape(-1, logits.shape[-1])
            tgt_output = tgt_output.reshape(-1)

            loss = F.cross_entropy(logits, tgt_output, ignore_index=0, label_smoothing=0.1)
            loss.backward()
            optimizer.step()

            # Update learning rate
            if scheduler is not None:
                scheduler.step(optimizer)

            epoch_loss += loss.item()
            progress_bar.set_postfix(loss=f"{loss.item():.4f}")

        avg_train_loss = epoch_loss / len(dataloader)
        train_losses.append(avg_train_loss)

        # === Evaluasi (Validasi Loss) ===
        model.eval()
        val_loss = 0

        with torch.no_grad():
            for src, tgt in val_dataloader:
                src, tgt = src.to(device), tgt.to(device)
                tgt_input = tgt[:, :-1]
                tgt_output = tgt[:, 1:]

                logits = model(src, tgt_input)
                loss = F.cross_entropy(logits.reshape(-1, logits.shape[-1]), tgt_output.reshape(-1), ignore_index=0)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_dataloader)
        val_losses.append(avg_val_loss)

        # === Evaluasi BLEU Setiap 20 Epoch ===
        bleu_score = None
        if (epoch + 1) % 20 == 0:  
            bleu_score = evaluate_bleu(model, val_dataloader, device, src_vocab, tgt_vocab)
            bleu_scores.append(bleu_score)
            print(f"\n[Epoch {epoch+1}] BLEU: {bleu_score:.2f}")

            # Simpan model terbaik berdasarkan BLEU
            if bleu_score > best_bleu:
                best_bleu = bleu_score
                torch.save(model.state_dict(), os.path.join(save_dir, f"{DATE}_best_model.pth"))
                print(f"🔥 Model terbaik disimpan dengan BLEU: {bleu_score:.2f}")

        # Simpan learning rate
        if scheduler is not None:
            lr = optimizer.param_groups[0]['lr']  # Ambil learning rate terbaru
            learning_rates.append(lr)

        print(f"\nEpoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}, ")
        print(f"Learning Rate: {learning_rates[-1]:.6f}")

        # Simpan model setiap 10 epoch
        if (epoch + 1) % 60 == 0:
            torch.save(model.state_dict(), os.path.join(save_dir, f"{DATE}_model_epoch_{epoch+1}.pth"))
            print(f"✅ Model disimpan: {DATE}_model_epoch_{epoch+1}.pth")

    return train_losses, val_losses, bleu_scores, learning_rates


In [31]:
history = train(model, dataloader, val_dataloader, optimizer, scheduler=scheduler, num_epochs=120, device=device, src_vocab=src_vocab, tgt_vocab=tgt_vocab)

Epoch 1/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=6.8441]



Epoch 1: Train Loss = 7.9032, Val Loss = 6.5143, 
Learning Rate: 0.000064


Epoch 2/120: 100%|██████████| 364/364 [01:23<00:00,  4.36it/s, loss=6.1901]



Epoch 2: Train Loss = 6.6574, Val Loss = 5.7581, 
Learning Rate: 0.000127


Epoch 3/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=5.5475]



Epoch 3: Train Loss = 6.0667, Val Loss = 5.1495, 
Learning Rate: 0.000191


Epoch 4/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=4.9685]



Epoch 4: Train Loss = 5.5981, Val Loss = 4.6833, 
Learning Rate: 0.000254


Epoch 5/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=4.4867]



Epoch 5: Train Loss = 5.2258, Val Loss = 4.3104, 
Learning Rate: 0.000318


Epoch 6/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=3.9264]



Epoch 6: Train Loss = 4.9220, Val Loss = 4.0266, 
Learning Rate: 0.000382


Epoch 7/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=3.6419]



Epoch 7: Train Loss = 4.6679, Val Loss = 3.8351, 
Learning Rate: 0.000445


Epoch 8/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=3.3265]



Epoch 8: Train Loss = 4.4568, Val Loss = 3.6657, 
Learning Rate: 0.000509


Epoch 9/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=3.0245]



Epoch 9: Train Loss = 4.2775, Val Loss = 3.5595, 
Learning Rate: 0.000572


Epoch 10/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=2.8784]



Epoch 10: Train Loss = 4.1387, Val Loss = 3.4521, 
Learning Rate: 0.000636


Epoch 11/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=2.7471]



Epoch 11: Train Loss = 4.0255, Val Loss = 3.3587, 
Learning Rate: 0.000698


Epoch 12/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=2.6636]



Epoch 12: Train Loss = 3.9122, Val Loss = 3.2817, 
Learning Rate: 0.000669


Epoch 13/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=2.5312]



Epoch 13: Train Loss = 3.7618, Val Loss = 3.1377, 
Learning Rate: 0.000642


Epoch 14/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=2.3476]



Epoch 14: Train Loss = 3.6248, Val Loss = 3.0483, 
Learning Rate: 0.000619


Epoch 15/120: 100%|██████████| 364/364 [01:23<00:00,  4.36it/s, loss=2.3512]



Epoch 15: Train Loss = 3.4943, Val Loss = 2.9870, 
Learning Rate: 0.000598


Epoch 16/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=2.1370]



Epoch 16: Train Loss = 3.3838, Val Loss = 2.9266, 
Learning Rate: 0.000579


Epoch 17/120: 100%|██████████| 364/364 [01:23<00:00,  4.36it/s, loss=2.0803]



Epoch 17: Train Loss = 3.2844, Val Loss = 2.8818, 
Learning Rate: 0.000562


Epoch 18/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=2.0840]



Epoch 18: Train Loss = 3.1994, Val Loss = 2.8378, 
Learning Rate: 0.000546


Epoch 19/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=1.9765]



Epoch 19: Train Loss = 3.1212, Val Loss = 2.8272, 
Learning Rate: 0.000531


Epoch 20/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.9459]
Evaluating BLEU: 100%|██████████| 49/49 [06:59<00:00,  8.56s/it]



[Epoch 20] BLEU: 21.34
🔥 Model terbaik disimpan dengan BLEU: 21.34

Epoch 20: Train Loss = 3.0528, Val Loss = 2.7840, 
Learning Rate: 0.000518


Epoch 21/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.8666]



Epoch 21: Train Loss = 2.9871, Val Loss = 2.7609, 
Learning Rate: 0.000505


Epoch 22/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.8594]



Epoch 22: Train Loss = 2.9247, Val Loss = 2.7371, 
Learning Rate: 0.000494


Epoch 23/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.8885]



Epoch 23: Train Loss = 2.8691, Val Loss = 2.7196, 
Learning Rate: 0.000483


Epoch 24/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.9287]



Epoch 24: Train Loss = 2.8186, Val Loss = 2.7122, 
Learning Rate: 0.000473


Epoch 25/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.8309]



Epoch 25: Train Loss = 2.7725, Val Loss = 2.6890, 
Learning Rate: 0.000463


Epoch 26/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.7704]



Epoch 26: Train Loss = 2.7231, Val Loss = 2.6941, 
Learning Rate: 0.000454


Epoch 27/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.8424]



Epoch 27: Train Loss = 2.6816, Val Loss = 2.6933, 
Learning Rate: 0.000446


Epoch 28/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.7739]



Epoch 28: Train Loss = 2.6440, Val Loss = 2.6774, 
Learning Rate: 0.000438


Epoch 29/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.7418]



Epoch 29: Train Loss = 2.6034, Val Loss = 2.6682, 
Learning Rate: 0.000430


Epoch 30/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.8013]



Epoch 30: Train Loss = 2.5694, Val Loss = 2.6603, 
Learning Rate: 0.000423


Epoch 31/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.7233]



Epoch 31: Train Loss = 2.5395, Val Loss = 2.6653, 
Learning Rate: 0.000416


Epoch 32/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.7290]



Epoch 32: Train Loss = 2.5062, Val Loss = 2.6525, 
Learning Rate: 0.000409


Epoch 33/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.7338]



Epoch 33: Train Loss = 2.4756, Val Loss = 2.6462, 
Learning Rate: 0.000403


Epoch 34/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.7389]



Epoch 34: Train Loss = 2.4457, Val Loss = 2.6451, 
Learning Rate: 0.000397


Epoch 35/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.7415]



Epoch 35: Train Loss = 2.4221, Val Loss = 2.6441, 
Learning Rate: 0.000392


Epoch 36/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.7261]



Epoch 36: Train Loss = 2.3932, Val Loss = 2.6529, 
Learning Rate: 0.000386


Epoch 37/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.6883]



Epoch 37: Train Loss = 2.3705, Val Loss = 2.6532, 
Learning Rate: 0.000381


Epoch 38/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.7454]



Epoch 38: Train Loss = 2.3457, Val Loss = 2.6642, 
Learning Rate: 0.000376


Epoch 39/120: 100%|██████████| 364/364 [01:23<00:00,  4.39it/s, loss=1.7354]



Epoch 39: Train Loss = 2.3241, Val Loss = 2.6647, 
Learning Rate: 0.000371


Epoch 40/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.6843]
Evaluating BLEU: 100%|██████████| 49/49 [07:09<00:00,  8.77s/it]



[Epoch 40] BLEU: 26.28
🔥 Model terbaik disimpan dengan BLEU: 26.28

Epoch 40: Train Loss = 2.3006, Val Loss = 2.6568, 
Learning Rate: 0.000366


Epoch 41/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.7048]



Epoch 41: Train Loss = 2.2826, Val Loss = 2.6634, 
Learning Rate: 0.000362


Epoch 42/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.7202]



Epoch 42: Train Loss = 2.2590, Val Loss = 2.6665, 
Learning Rate: 0.000357


Epoch 43/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6701]



Epoch 43: Train Loss = 2.2448, Val Loss = 2.6692, 
Learning Rate: 0.000353


Epoch 44/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.7505]



Epoch 44: Train Loss = 2.2251, Val Loss = 2.6541, 
Learning Rate: 0.000349


Epoch 45/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6498]



Epoch 45: Train Loss = 2.2082, Val Loss = 2.6700, 
Learning Rate: 0.000345


Epoch 46/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.7087]



Epoch 46: Train Loss = 2.1895, Val Loss = 2.6663, 
Learning Rate: 0.000342


Epoch 47/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6858]



Epoch 47: Train Loss = 2.1781, Val Loss = 2.6684, 
Learning Rate: 0.000338


Epoch 48/120: 100%|██████████| 364/364 [01:23<00:00,  4.39it/s, loss=1.6742]



Epoch 48: Train Loss = 2.1588, Val Loss = 2.6686, 
Learning Rate: 0.000334


Epoch 49/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6471]



Epoch 49: Train Loss = 2.1438, Val Loss = 2.6726, 
Learning Rate: 0.000331


Epoch 50/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6699]



Epoch 50: Train Loss = 2.1313, Val Loss = 2.6790, 
Learning Rate: 0.000328


Epoch 51/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.6693]



Epoch 51: Train Loss = 2.1179, Val Loss = 2.6813, 
Learning Rate: 0.000324


Epoch 52/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6570]



Epoch 52: Train Loss = 2.1045, Val Loss = 2.6828, 
Learning Rate: 0.000321


Epoch 53/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6260]



Epoch 53: Train Loss = 2.0911, Val Loss = 2.6905, 
Learning Rate: 0.000318


Epoch 54/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6721]



Epoch 54: Train Loss = 2.0789, Val Loss = 2.6810, 
Learning Rate: 0.000315


Epoch 55/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6675]



Epoch 55: Train Loss = 2.0622, Val Loss = 2.6803, 
Learning Rate: 0.000312


Epoch 56/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6110]



Epoch 56: Train Loss = 2.0498, Val Loss = 2.6840, 
Learning Rate: 0.000310


Epoch 57/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6560]



Epoch 57: Train Loss = 2.0409, Val Loss = 2.6923, 
Learning Rate: 0.000307


Epoch 58/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.5704]



Epoch 58: Train Loss = 2.0314, Val Loss = 2.6944, 
Learning Rate: 0.000304


Epoch 59/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.6071]



Epoch 59: Train Loss = 2.0196, Val Loss = 2.6996, 
Learning Rate: 0.000302


Epoch 60/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6463]
Evaluating BLEU: 100%|██████████| 49/49 [07:09<00:00,  8.77s/it]



[Epoch 60] BLEU: 27.45
🔥 Model terbaik disimpan dengan BLEU: 27.45

Epoch 60: Train Loss = 2.0084, Val Loss = 2.6996, 
Learning Rate: 0.000299
✅ Model disimpan: 23Maret-NoKeyMask-embffn5122048Layer6H4_model_epoch_60.pth


Epoch 61/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.6573]



Epoch 61: Train Loss = 1.9984, Val Loss = 2.6937, 
Learning Rate: 0.000297


Epoch 62/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.6134]



Epoch 62: Train Loss = 1.9891, Val Loss = 2.7103, 
Learning Rate: 0.000294


Epoch 63/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.6101]



Epoch 63: Train Loss = 1.9792, Val Loss = 2.7171, 
Learning Rate: 0.000292


Epoch 64/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.6007]



Epoch 64: Train Loss = 1.9729, Val Loss = 2.7226, 
Learning Rate: 0.000290


Epoch 65/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.5698]



Epoch 65: Train Loss = 1.9635, Val Loss = 2.7038, 
Learning Rate: 0.000287


Epoch 66/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.5863]



Epoch 66: Train Loss = 1.9584, Val Loss = 2.7093, 
Learning Rate: 0.000285


Epoch 67/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5839]



Epoch 67: Train Loss = 1.9466, Val Loss = 2.7086, 
Learning Rate: 0.000283


Epoch 68/120: 100%|██████████| 364/364 [01:22<00:00,  4.39it/s, loss=1.5911]



Epoch 68: Train Loss = 1.9380, Val Loss = 2.7167, 
Learning Rate: 0.000281


Epoch 69/120: 100%|██████████| 364/364 [01:23<00:00,  4.39it/s, loss=1.6304]



Epoch 69: Train Loss = 1.9302, Val Loss = 2.7143, 
Learning Rate: 0.000279


Epoch 70/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=1.5805]



Epoch 70: Train Loss = 1.9216, Val Loss = 2.7122, 
Learning Rate: 0.000277


Epoch 71/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5947]



Epoch 71: Train Loss = 1.9137, Val Loss = 2.7133, 
Learning Rate: 0.000275


Epoch 72/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5832]



Epoch 72: Train Loss = 1.9073, Val Loss = 2.7282, 
Learning Rate: 0.000273


Epoch 73/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.6244]



Epoch 73: Train Loss = 1.9002, Val Loss = 2.7166, 
Learning Rate: 0.000271


Epoch 74/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5993]



Epoch 74: Train Loss = 1.8933, Val Loss = 2.7051, 
Learning Rate: 0.000269


Epoch 75/120: 100%|██████████| 364/364 [01:23<00:00,  4.36it/s, loss=1.5824]



Epoch 75: Train Loss = 1.8852, Val Loss = 2.7251, 
Learning Rate: 0.000267


Epoch 76/120: 100%|██████████| 364/364 [01:23<00:00,  4.36it/s, loss=1.6053]



Epoch 76: Train Loss = 1.8793, Val Loss = 2.7258, 
Learning Rate: 0.000266


Epoch 77/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5757]



Epoch 77: Train Loss = 1.8744, Val Loss = 2.7368, 
Learning Rate: 0.000264


Epoch 78/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.6320]



Epoch 78: Train Loss = 1.8655, Val Loss = 2.7194, 
Learning Rate: 0.000262


Epoch 79/120: 100%|██████████| 364/364 [01:23<00:00,  4.36it/s, loss=1.5765]



Epoch 79: Train Loss = 1.8617, Val Loss = 2.7349, 
Learning Rate: 0.000261


Epoch 80/120: 100%|██████████| 364/364 [01:23<00:00,  4.36it/s, loss=1.5302]
Evaluating BLEU: 100%|██████████| 49/49 [08:11<00:00, 10.02s/it]



[Epoch 80] BLEU: 28.27
🔥 Model terbaik disimpan dengan BLEU: 28.27

Epoch 80: Train Loss = 1.8538, Val Loss = 2.7202, 
Learning Rate: 0.000259


Epoch 81/120: 100%|██████████| 364/364 [01:23<00:00,  4.34it/s, loss=1.5542]



Epoch 81: Train Loss = 1.8482, Val Loss = 2.7334, 
Learning Rate: 0.000257


Epoch 82/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5516]



Epoch 82: Train Loss = 1.8468, Val Loss = 2.7086, 
Learning Rate: 0.000256


Epoch 83/120: 100%|██████████| 364/364 [01:23<00:00,  4.34it/s, loss=1.5426]



Epoch 83: Train Loss = 1.8409, Val Loss = 2.7221, 
Learning Rate: 0.000254


Epoch 84/120: 100%|██████████| 364/364 [01:23<00:00,  4.34it/s, loss=1.5891]



Epoch 84: Train Loss = 1.8319, Val Loss = 2.7335, 
Learning Rate: 0.000253


Epoch 85/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5566]



Epoch 85: Train Loss = 1.8283, Val Loss = 2.7342, 
Learning Rate: 0.000251


Epoch 86/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5271]



Epoch 86: Train Loss = 1.8237, Val Loss = 2.7247, 
Learning Rate: 0.000250


Epoch 87/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5790]



Epoch 87: Train Loss = 1.8169, Val Loss = 2.7347, 
Learning Rate: 0.000248


Epoch 88/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5394]



Epoch 88: Train Loss = 1.8137, Val Loss = 2.7269, 
Learning Rate: 0.000247


Epoch 89/120: 100%|██████████| 364/364 [01:23<00:00,  4.34it/s, loss=1.5338]



Epoch 89: Train Loss = 1.8065, Val Loss = 2.7358, 
Learning Rate: 0.000246


Epoch 90/120: 100%|██████████| 364/364 [01:23<00:00,  4.34it/s, loss=1.5480]



Epoch 90: Train Loss = 1.8020, Val Loss = 2.7437, 
Learning Rate: 0.000244


Epoch 91/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5775]



Epoch 91: Train Loss = 1.7989, Val Loss = 2.7325, 
Learning Rate: 0.000243


Epoch 92/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5243]



Epoch 92: Train Loss = 1.7929, Val Loss = 2.7417, 
Learning Rate: 0.000242


Epoch 93/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5243]



Epoch 93: Train Loss = 1.7887, Val Loss = 2.7487, 
Learning Rate: 0.000240


Epoch 94/120: 100%|██████████| 364/364 [01:23<00:00,  4.34it/s, loss=1.5411]



Epoch 94: Train Loss = 1.7830, Val Loss = 2.7333, 
Learning Rate: 0.000239


Epoch 95/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5665]



Epoch 95: Train Loss = 1.7829, Val Loss = 2.7370, 
Learning Rate: 0.000238


Epoch 96/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5894]



Epoch 96: Train Loss = 1.7791, Val Loss = 2.7824, 
Learning Rate: 0.000236


Epoch 97/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5239]



Epoch 97: Train Loss = 1.7732, Val Loss = 2.7522, 
Learning Rate: 0.000235


Epoch 98/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5478]



Epoch 98: Train Loss = 1.7671, Val Loss = 2.7432, 
Learning Rate: 0.000234


Epoch 99/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5726]



Epoch 99: Train Loss = 1.7637, Val Loss = 2.7521, 
Learning Rate: 0.000233


Epoch 100/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5475]
Evaluating BLEU: 100%|██████████| 49/49 [08:09<00:00, 10.00s/it]



[Epoch 100] BLEU: 28.85
🔥 Model terbaik disimpan dengan BLEU: 28.85

Epoch 100: Train Loss = 1.7611, Val Loss = 2.7416, 
Learning Rate: 0.000232


Epoch 101/120: 100%|██████████| 364/364 [01:23<00:00,  4.36it/s, loss=1.5224]



Epoch 101: Train Loss = 1.7567, Val Loss = 2.7596, 
Learning Rate: 0.000230


Epoch 102/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5276]



Epoch 102: Train Loss = 1.7509, Val Loss = 2.7506, 
Learning Rate: 0.000229


Epoch 103/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5423]



Epoch 103: Train Loss = 1.7555, Val Loss = 2.7531, 
Learning Rate: 0.000228


Epoch 104/120: 100%|██████████| 364/364 [01:23<00:00,  4.35it/s, loss=1.5505]



Epoch 104: Train Loss = 1.7464, Val Loss = 2.7494, 
Learning Rate: 0.000227


Epoch 105/120: 100%|██████████| 364/364 [01:23<00:00,  4.36it/s, loss=1.5662]



Epoch 105: Train Loss = 1.7419, Val Loss = 2.7541, 
Learning Rate: 0.000226


Epoch 106/120: 100%|██████████| 364/364 [01:23<00:00,  4.36it/s, loss=1.5562]



Epoch 106: Train Loss = 1.7395, Val Loss = 2.7451, 
Learning Rate: 0.000225


Epoch 107/120: 100%|██████████| 364/364 [01:23<00:00,  4.36it/s, loss=1.5379]



Epoch 107: Train Loss = 1.7351, Val Loss = 2.7606, 
Learning Rate: 0.000224


Epoch 108/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5086]



Epoch 108: Train Loss = 1.7321, Val Loss = 2.7558, 
Learning Rate: 0.000223


Epoch 109/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5745]



Epoch 109: Train Loss = 1.7281, Val Loss = 2.7489, 
Learning Rate: 0.000222


Epoch 110/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5079]



Epoch 110: Train Loss = 1.7238, Val Loss = 2.7496, 
Learning Rate: 0.000221


Epoch 111/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5015]



Epoch 111: Train Loss = 1.7225, Val Loss = 2.7609, 
Learning Rate: 0.000220


Epoch 112/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5458]



Epoch 112: Train Loss = 1.7208, Val Loss = 2.7560, 
Learning Rate: 0.000219


Epoch 113/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=1.5337]



Epoch 113: Train Loss = 1.7160, Val Loss = 2.7647, 
Learning Rate: 0.000218


Epoch 114/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5336]



Epoch 114: Train Loss = 1.7141, Val Loss = 2.7602, 
Learning Rate: 0.000217


Epoch 115/120: 100%|██████████| 364/364 [01:23<00:00,  4.37it/s, loss=1.5179]



Epoch 115: Train Loss = 1.7107, Val Loss = 2.7719, 
Learning Rate: 0.000216


Epoch 116/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5279]



Epoch 116: Train Loss = 1.7088, Val Loss = 2.7604, 
Learning Rate: 0.000215


Epoch 117/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5015]



Epoch 117: Train Loss = 1.7043, Val Loss = 2.7631, 
Learning Rate: 0.000214


Epoch 118/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5289]



Epoch 118: Train Loss = 1.7013, Val Loss = 2.7754, 
Learning Rate: 0.000213


Epoch 119/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5083]



Epoch 119: Train Loss = 1.6979, Val Loss = 2.7693, 
Learning Rate: 0.000212


Epoch 120/120: 100%|██████████| 364/364 [01:23<00:00,  4.38it/s, loss=1.5387]
Evaluating BLEU: 100%|██████████| 49/49 [07:02<00:00,  8.63s/it]



[Epoch 120] BLEU: 29.26
🔥 Model terbaik disimpan dengan BLEU: 29.26

Epoch 120: Train Loss = 1.6977, Val Loss = 2.7651, 
Learning Rate: 0.000211
✅ Model disimpan: 23Maret-NoKeyMask-embffn5122048Layer6H4_model_epoch_120.pth


In [33]:
import pickle
# Specify the file path where you want to save the data
file_path = f'Run-{DATE}.pkl'

with open(file_path, 'wb') as file:
    pickle.dump(history, file)

In [40]:
@torch.no_grad()
def beam_search_dataloader(model, dataloader, src_vocab, tgt_vocab, device, beam_size=5, max_len=64, temperature=1.0, output_file="beam_translations.txt"):
    model.eval()
    translations = []
    references = []

    with open(output_file, "w", encoding="utf-8") as f:
        for src, tgt in tqdm(dataloader, desc="Beam Search Decoding"):
            src, tgt = src.to(device), tgt.to(device)
            batch_translations = []

            for i in range(src.size(0)):
                src_i = src[i].unsqueeze(0)  # Ambil satu contoh
                enc_output = model.encoder(model.dropout(model.pos_encoding(model.src_embedding(src_i)))).to(device)

                # Inisialisasi beam search dengan (score, token sequence)
                beams = [(0, [tgt_vocab["<sos>"]])]

                for _ in range(max_len):
                    all_candidates = []
                    for score, tgt_tokens in beams:
                        if tgt_tokens[-1] == tgt_vocab["<eos>"]:
                            all_candidates.append((score, tgt_tokens))
                            continue

                        tgt_tensor = torch.tensor(tgt_tokens, dtype=torch.long, device=device).unsqueeze(0)
                        tgt_emb = model.dropout(model.pos_encoding(model.tgt_embedding(tgt_tensor)))
                        tgt_mask = model.generate_subsequent_mask(tgt_tensor.size(1)).to(device)

                        dec_output = model.decoder(tgt_emb, enc_output, tgt_mask=tgt_mask)
                        logits = model.fc_out(dec_output[:, -1, :])  # Tetap di GPU

                        logits = logits / temperature
                        probs = F.softmax(logits, dim=-1)
                        log_probs = torch.log(probs + 1e-9)

                        top_log_probs, top_indices = log_probs.topk(beam_size)
                        for j in range(beam_size):
                            next_token = top_indices[0, j].item()
                            new_score = score + top_log_probs[0, j].item()
                            new_sequence = tgt_tokens + [next_token]
                            all_candidates.append((new_score, new_sequence))

                    beams = sorted(all_candidates, key=lambda x: x[0], reverse=True)[:beam_size]
                    if all(seq[-1] == tgt_vocab["<eos>"] for _, seq in beams):
                        break

                best_sequence = max(beams, key=lambda x: x[0])[1]
                best_sequence = [tok for tok in best_sequence if tok != tgt_vocab["<sos>"]]  # Hapus <sos>
                translation = tensor_to_text(torch.tensor(best_sequence, dtype=torch.long), tgt_vocab)
                batch_translations.append(translation)

            translations.extend(batch_translations)
            references.extend([tensor_to_text(t, tgt_vocab) for t in tgt])

            for trans in batch_translations:
                f.write(trans + "\n")

    print(f"Terjemahan berhasil disimpan di '{output_file}'")
    return translations, references

In [36]:
model.load_state_dict(torch.load(f"models/{DATE}_best_model.pth", weights_only=True))

<All keys matched successfully>

In [35]:
translations = beam_search_dataloader(
    model, val_dataloader, src_vocab, tgt_vocab, device, beam_size=3, temperature=0.7, output_file=f"{DATE}_beam_translations.txt"
)

Beam Search Decoding: 100%|██████████| 49/49 [20:48<00:00, 25.49s/it]

Terjemahan berhasil disimpan di '23Maret-NoKeyMask-embffn5122048Layer6H4_beam_translations.txt'





In [41]:
translations = beam_search_dataloader(
    model, test_dataloader, src_vocab, tgt_vocab, device, beam_size=3, temperature=0.7, output_file=f"{DATE}_beam_translations-test.txt"
)

  return torch._transformer_encoder_layer_fwd(
Beam Search Decoding: 100%|██████████| 73/73 [1:05:50<00:00, 54.12s/it]

Terjemahan berhasil disimpan di '23Maret-NoKeyMask-embffn5122048Layer6H4_beam_translations-test.txt'





In [38]:
hypothesis = translations[0]
reference = [[text] for text in translations[1]]

print(sacrebleu.corpus_bleu(hypothesis, reference).score)

47.24583568309559


In [42]:
hypothesis = translations[0]
reference = [[text] for text in translations[1]]

print(sacrebleu.corpus_bleu(hypothesis, reference).score)

39.33172938363535


In [36]:
def bleu_by_sentence(hypothesis, references):
    """
    Menghitung skor BLEU untuk satu hipotesis terhadap satu atau lebih referensi menggunakan sacrebleu.
    
    Parameters:
        hypothesis (str): Kalimat hipotesis.
        references (list of str): Daftar kalimat referensi.
    
    Returns:
        float: Skor BLEU
    """
    bleu_score = sacrebleu.sentence_bleu(hypothesis, references).score
    return bleu_score

# Contoh penggunaan
hypothesis = "the cat is on the mat"
reference = ["there is a cat on the mat", "the cat sits on the mat"]

In [42]:
model

TransformerMT(
  (src_embedding): Embedding(23876, 512)
  (tgt_embedding): Embedding(15229, 512)
  (pos_encoding): PositionalEncoding()
  (dropout): Dropout(p=0.3, inplace=False)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): M