In [None]:
    # Source padding mask (batch_size=1): True wherever the source id equals <pad>.
    # NOTE(review): this cell is an indented fragment that reads `src_ids` and
    # `src_stoi` before any visible cell defines them - confirm where it belongs.
    src_key_padding_mask = (src_ids == src_stoi['<pad>'])
{
    "cells": [
        {
            "cell_type": "markdown",
            "id": "#VSC-1b097b09",
            "metadata": {
                "language": "markdown"
            },
            "source": [
                "",
                "# Transformer-based English–French Translation",
                "",
                "Notebook ini merupakan implementasi eksplorasi model **Transformer** untuk penerjemahan otomatis dari Bahasa Inggris ke Bahasa Prancis.",
                "",
                "Tujuan utama:",
                "- Melatih model Transformer selama **1 epoch** dengan **batch-size maksimal 100**",
                "- Menunjukkan proses **Text Preprocessing**, **Definisi Arsitektur Transformer**, **Training**, dan **Inference**",
                "- Menampilkan metrik: `TrainLoss`, `ValLoss`, dan `ValAcc` di setiap akhir batch.",
                "",
                "Bobot penilaian:",
                "| Aspek | Bobot |",
                "|-------|-------|",
                "| Data Preparation (Text Preprocessing) | 20% |",
                "| Definisi Class Transformer | 25% |",
                "| Proses Training (TrainLoss, ValLoss, ValAcc) | 35% |",
                "| Inference Translation | 20% |",
                ""
            ]
        },
        {
            "cell_type": "markdown",
            "id": "#VSC-16b419c0",
            "metadata": {
                "language": "markdown"
            },
            "source": [
                "## Persiapan Lingkungan"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-43b30d09",
            "metadata": {
                "language": "python"
            },
            "source": [
                "",
                "!pip install torch pandas numpy",
                "import torch, pandas as pd, numpy as np, random, math, re, os",
                "from collections import Counter",
                "from torch import nn",
                "from torch.utils.data import Dataset, DataLoader",
                "",
                "DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')",
                "print('Running on', DEVICE)",
                ""
            ]
        },
        {
            "cell_type": "markdown",
            "id": "#VSC-e7c78cc8",
            "metadata": {
                "language": "markdown"
            },
            "source": [
                "## 1. Data Preparation & Text Preprocessing (20%)"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-88b4fc9a",
            "metadata": {
                "language": "python"
            },
            "source": [
                "",
                "# Dataset diambil dari file small_vocab_en.csv dan small_vocab_fr.csv",
                "en_path = 'small_vocab_en.csv'",
                "fr_path = 'small_vocab_fr.csv'",
                "",
                "# Baca setiap baris sebagai satu teks utuh",
                "with open(en_path, 'r', encoding='utf-8') as f:",
                "    src_texts = [line.strip() for line in f if line.strip()]",
                "",
                "with open(fr_path, 'r', encoding='utf-8') as f:",
                "    tgt_texts = [line.strip() for line in f if line.strip()]",
                "",
                "print(f\"Contoh data Inggris: {src_texts[0]}\")",
                "print(f\"Contoh data Prancis: {tgt_texts[0]}\")",
                "",
                "def clean_text(text):",
                "    text = text.lower()",
                "    text = re.sub(r\"[^a-zâêôàèçùé'\\-\\.\\,\\?\\!\\s]\", ' ', text)",
                "    return re.sub(r'\\s+', ' ', text).strip()",
                "",
                "def tokenize(text):",
                "    return text.split()",
                "",
                "src_tokens = [tokenize(clean_text(s)) for s in src_texts]",
                "tgt_tokens = [tokenize(clean_text(t)) for t in tgt_texts]",
                "",
                "# Split train/val",
                "data = list(zip(src_tokens, tgt_tokens))",
                "random.shuffle(data)",
                "split = int(0.9 * len(data))",
                "train, val = data[:split], data[split:]",
                "",
                "PAD, BOS, EOS, UNK = '<pad>', '<s>', '</s>', '<unk>'",
                "",
                "def build_vocab(sentences):",
                "    counter = Counter(t for s in sentences for t in s)",
                "    vocab = [PAD, BOS, EOS, UNK] + [t for t, _ in counter.most_common()]",
                "    stoi = {t: i for i, t in enumerate(vocab)}",
                "    itos = {i: t for t, i in stoi.items()}",
                "    return stoi, itos",
                "",
                "src_stoi, src_itos = build_vocab([s for s, _ in train])",
                "tgt_stoi, tgt_itos = build_vocab([t for _, t in train])",
                "",
                "print('Vocab sizes -> src:', len(src_stoi), '| tgt:', len(tgt_stoi))"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "#VSC-b2803b6f",
            "metadata": {
                "language": "markdown"
            },
            "source": [
                "## 2. Definisi Arsitektur Transformer (25%)"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-9b3b5e12",
            "metadata": {
                "language": "python"
            },
            "source": [
                "import torch",
                "import torch.nn as nn",
                "import math",
                "",
                "# ==========================================",
                "# Positional Encoding",
                "# ==========================================",
                "class PositionalEncoding(nn.Module):",
                "    def __init__(self, d_model, max_len=5000):",
                "        super().__init__()",
                "        pe = torch.zeros(max_len, d_model)",
                "        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)",
                "        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))",
                "        pe[:, 0::2] = torch.sin(position * div_term)",
                "        pe[:, 1::2] = torch.cos(position * div_term)",
                "        pe = pe.unsqueeze(0)",
                "        self.register_buffer('pe', pe)",
                "",
                "    def forward(self, x):",
                "        x = x + self.pe[:, :x.size(1)]",
                "        return x",
                "",
                "",
                "# ==========================================",
                "# Transformer Model untuk Translation",
                "# ==========================================",
                "class TransformerMT(nn.Module):",
                "    def __init__(self, src_vocab, tgt_vocab, d_model=128, nhead=4, num_layers=2, dim_ff=512, dropout=0.1):",
                "        super().__init__()",
                "        self.src_embed = nn.Embedding(src_vocab, d_model)",
                "        self.tgt_embed = nn.Embedding(tgt_vocab, d_model)",
                "        self.pos_enc = PositionalEncoding(d_model)",
                "        self.transformer = nn.Transformer(",
                "            d_model=d_model,",
                "            nhead=nhead,",
                "            num_encoder_layers=num_layers,",
                "            num_decoder_layers=num_layers,",
                "            dim_feedforward=dim_ff,",
                "            dropout=dropout,",
                "            batch_first=True",
                "        )",
                "        self.fc_out = nn.Linear(d_model, tgt_vocab)",
                "",
                "    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None, tgt_mask=None):",
                "        \"\"\"Forward with optional masks:\",",
                ",",
                ",",
                "\"\"",
                "        src = self.pos_enc(self.src_embed(src))",
                "        tgt = self.pos_enc(self.tgt_embed(tgt))",
                "        out = self.transformer(src, tgt,\",",
                ",",
                ",",
                ",",
                ",",
                ",",
                "",
                ": ",
                ",",
                ": ",
                ",",
                ": {",
                ": ",
                "",
                ": [",
                ",",
                ",",
                ",",
                ",",
                ",",
                ","
            ]
        },
        {
            "cell_type": "markdown",
            "id": "#VSC-01cb1b65",
            "metadata": {
                "language": "markdown"
            },
            "source": [
                "## 3. Proses Training (35%)"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-e2475665",
            "metadata": {
                "language": "python"
            },
            "source": [
                "from tqdm import tqdm",
                "",
                "def accuracy_fn(y_pred, y_true, pad_idx):",
                "    pred_tokens = y_pred.argmax(dim=-1)",
                "    mask = y_true != pad_idx",
                "    correct = (pred_tokens == y_true) & mask",
                "    return correct.sum().float() / mask.sum().float()",
                "",
                "EPOCHS = 1",
                "for epoch in range(EPOCHS):",
                "    model.train()",
                "    total_loss = 0",
                "    print(f\"\\nEpoch {epoch+1}/{EPOCHS}\")",
                "    for i, (src, tgt_in, tgt_out) in enumerate(tqdm(train_dl)):",
                "        src, tgt_in, tgt_out = src.to(device), tgt_in.to(device), tgt_out.to(device)",
                "        optimizer.zero_grad()",
                "        # buat mask: subsequent mask untuk decoder dan padding mask untuk src/tgt",
                "        tgt_mask = model.transformer.generate_square_subsequent_mask(tgt_in.size(1)).to(device)",
                "        src_key_padding_mask = (src == src_stoi['<pad>'])",
                "        tgt_key_padding_mask = (tgt_in == tgt_stoi['<pad>'])",
                "        output = model(src, tgt_in, src_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask, tgt_mask=tgt_mask)",
                "        loss = criterion(output.view(-1, output.size(-1)), tgt_out.view(-1))",
                "        loss.backward()",
                "        optimizer.step()",
                "        total_loss += loss.item()",
                "        if (i+1) % 1 == 0:",
                "            print(f\"Batch {i+1}/{len(train_dl)} - TrainLoss: {loss.item():.4f}\")",
                "",
                "    # Validation",
                "    model.eval()",
                "    val_loss, val_acc = 0, 0",
                "    with torch.no_grad():",
                "        for src, tgt_in, tgt_out in val_dl:",
                "            src, tgt_in, tgt_out = src.to(device), tgt_in.to(device), tgt_out.to(device)",
                "            # buat mask untuk validasi juga",
                "            tgt_mask = model.transformer.generate_square_subsequent_mask(tgt_in.size(1)).to(device)",
                "            src_key_padding_mask = (src == src_stoi['<pad>'])",
                "            tgt_key_padding_mask = (tgt_in == tgt_stoi['<pad>'])",
                "            output = model(src, tgt_in, src_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask, tgt_mask=tgt_mask)",
                "            loss = criterion(output.view(-1, output.size(-1)), tgt_out.view(-1))",
                "            val_loss += loss.item()",
                "            # Perbaikan: gunakan pad index target saat menghitung akurasi",
                "            val_acc += accuracy_fn(output, tgt_out, tgt_stoi['<pad>']).item()",
                "    val_loss /= len(val_dl)",
                "    val_acc /= len(val_dl)",
                "    print(f\"ValLoss: {val_loss:.4f}, ValAcc: {val_acc*100:.2f}%\")",
                ""
            ]
        },
        {
            "cell_type": "markdown",
            "id": "#VSC-6f03225b",
            "metadata": {
                "language": "markdown"
            },
            "source": [
                "## 4. Inference Translation (20%)"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-96de842a",
            "metadata": {
                "language": "python"
            },
            "source": [
                "def translate_sentence(model, sentence, src_stoi, tgt_stoi, tgt_itos, max_len=20):",
                "    model.eval()",
                "    tokens = [w.lower() for w in sentence.split()]",
                "    src_ids = torch.tensor([[src_stoi.get(t, src_stoi['<unk>']) for t in tokens]], device=device)",
                "    tgt_input = torch.tensor([[tgt_stoi['<s>']]], device=device)",
                "",
                "    for _ in range(max_len):",
                "        out = model(src_ids, tgt_input)",
                "        next_token = out[:, -1].argmax(dim=-1).unsqueeze(0)",
                "        tgt_input = torch.cat([tgt_input, next_token], dim=1)",
                "        if next_token.item() == tgt_stoi['</s>']:",
                "            break",
                "",
                "    translated = [tgt_itos[idx.item()] for idx in tgt_input[0]]",
                "    return ' '.join(translated[1:-1])  # hilangkan <s> dan </s>",
                "",
                "# Contoh uji terjemahan (satu contoh)",
                "test_sentence = src_texts[0]",
                "print(\"English :\", test_sentence)",
                "print(\"French (predicted):\", translate_sentence(model, test_sentence, src_stoi, tgt_stoi, tgt_itos))",
                "",
                "# ==========================================",
                "# Evaluasi greedy (autoregressive) pada validation set",
                "# ==========================================",
                "def greedy_decode(tokens, src_stoi, tgt_stoi, tgt_itos, max_len=20):",
                "    model.eval()",
                "    src_ids = torch.tensor([[src_stoi.get(t, src_stoi['<unk>']) for t in tokens]], device=device)",
                "    tgt = torch.tensor([[tgt_stoi['<s>']]], device=device)",
                "    for _ in range(max_len):",
                "        out = model(src_ids, tgt)",
                "        next_tok = out[0, -1].argmax().item()",
                "        tgt = torch.cat([tgt, torch.tensor([[next_tok]], device=device)], dim=1)",
                "        if next_tok == tgt_stoi['</s>']: break",
                "    return [tgt_itos[i] for i in tgt[0].tolist()][1:-1]",
                "",
                "def eval_greedy(val_pairs, n_samples=200):",
                "    # n_samples untuk menjalankan evaluasi cepat; set None untuk semua",
                "    total_sent, exact_match, token_correct, token_total = 0,0,0,0",
                "    samples = val_pairs if n_samples is None else val_pairs[:n_samples]",
                "    for src_tokens, tgt_tokens in samples:",
                "        pred = greedy_decode(src_tokens, src_stoi, tgt_stoi, tgt_itos, max_len=MAX_LEN)",
                "        total_sent += 1",
                "        if pred == tgt_tokens: exact_match += 1",
                "        # token-level overlap (up to min len)",
                "        m = min(len(pred), len(tgt_tokens))",
                "        for i in range(m):",
                "            if pred[i] == tgt_tokens[i]: token_correct += 1",
                "            token_total += 1",
                "    print(f'Greedy Exact Match: {exact_match}/{total_sent} = {exact_match/total_sent:.4f}')",
                "    if token_total>0:",
                "        print(f'Greedy Token Accuracy (overlap): {token_correct}/{token_total} = {token_correct/token_total:.4f}')",
                "    else:",
                "        print('No token comparisons performed (empty preds?).')",
                "",
                "# Jalankan evaluasi greedy cepat (200 contoh)",
                "eval_greedy(val, n_samples=200)",
                ""
            ]
        },
        {
            "cell_type": "markdown",
            "id": "#VSC-0742d772",
            "metadata": {
                "language": "markdown"
            },
            "source": [
                "",
                "## 5. Kesimpulan",
                "",
                "Eksperimen ini menunjukkan implementasi dasar Transformer untuk penerjemahan Bahasa Inggris ke Bahasa Prancis.",
                "",
                "- Data telah dibersihkan dan ditokenisasi secara sederhana.",
                "- Arsitektur Transformer telah dibangun dari nol dengan PyTorch.",
                "- Proses training menampilkan *TrainLoss*, *ValLoss*, dan *ValAcc* tiap batch.",
                "- Model berhasil melakukan inferensi dengan pendekatan *greedy decoding*.",
                "",
                "Selanjutnya, model dapat diperluas dengan peningkatan jumlah epoch, mekanisme perhatian visualisasi, dan evaluasi BLEU score.",
                ""
            ]
        }
    ]
}


# Transformer-based English–French Translation

Notebook ini merupakan implementasi eksplorasi model **Transformer** untuk penerjemahan otomatis dari Bahasa Inggris ke Bahasa Prancis.

Tujuan utama:
- Melatih model Transformer selama **1 epoch** dengan **batch-size maksimal 100**
- Menunjukkan proses **Text Preprocessing**, **Definisi Arsitektur Transformer**, **Training**, dan **Inference**
- Menampilkan metrik: `TrainLoss`, `ValLoss`, dan `ValAcc` di setiap akhir batch.

Bobot penilaian:
| Aspek | Bobot |
|-------|-------|
| Data Preparation (Text Preprocessing) | 20% |
| Definisi Class Transformer | 25% |
| Proses Training (TrainLoss, ValLoss, ValAcc) | 35% |
| Inference Translation | 20% |


## Persiapan Lingkungan

In [None]:

!pip install torch pandas numpy
import torch, pandas as pd, numpy as np, random, math, re, os
from collections import Counter
from torch import nn
from torch.utils.data import Dataset, DataLoader

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Running on', DEVICE)


## 1. Data Preparation & Text Preprocessing (20%)

In [None]:

# The dataset is read from small_vocab_en.csv and small_vocab_fr.csv.
en_path = 'small_vocab_en.csv'
fr_path = 'small_vocab_fr.csv'


def read_sentences(path):
    """Return the non-blank lines of `path`, each stripped of surrounding whitespace."""
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


# Every line is treated as one whole sentence.
src_texts = read_sentences(en_path)
tgt_texts = read_sentences(fr_path)

# The two lists are paired by position further down; a length mismatch would be
# silently truncated by zip() and would pair sentences with the wrong translation.
if len(src_texts) != len(tgt_texts):
    raise ValueError(
        f"Parallel corpus is misaligned: {en_path} has {len(src_texts)} sentences "
        f"but {fr_path} has {len(tgt_texts)}"
    )
if not src_texts:
    raise ValueError(f"No sentences found in {en_path} / {fr_path}")

print(f"Contoh data Inggris: {src_texts[0]}")
print(f"Contoh data Prancis: {tgt_texts[0]}")

def clean_text(text):
    """Lower-case `text` and blank out everything except letters and basic punctuation.

    Kept characters: ASCII a-z, the French accented letters and ligatures
    (à â ä ç é è ê ë î ï ô ö ù û ü ÿ œ æ), apostrophe, hyphen, and . , ? !
    Any other character (digits included) is replaced by a space; runs of
    whitespace are then collapsed to one space and the ends are trimmed.

    Args:
        text: raw sentence.

    Returns:
        The normalised sentence as a single string.
    """
    import unicodedata

    # Compose "e" + combining accent into "é" first; otherwise the combining
    # mark is not in the whitelist and the accent would be thrown away.
    text = unicodedata.normalize('NFC', text).lower()
    # The whitelist has to cover every letter French uses: a missing one
    # (e.g. the "û" of "août") is blanked and splits the word in two.
    text = re.sub(r"[^a-zàâäçéèêëîïôöùûüÿœæ'\-\.\,\?\!\s]", ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

def tokenize(text):
    """Split `text` on runs of whitespace and return the resulting list of tokens."""
    tokens = text.split()
    return tokens

# Clean and tokenize both sides; position i in each list still refers to the same pair.
src_tokens = [tokenize(clean_text(s)) for s in src_texts]
tgt_tokens = [tokenize(clean_text(t)) for t in tgt_texts]

# 90/10 train/validation split over the shuffled sentence pairs.
# A dedicated, seeded RNG gives the same split on every run, so validation
# numbers stay comparable and re-running this cell does not reshuffle the data.
SPLIT_SEED = 42
data = list(zip(src_tokens, tgt_tokens))
random.Random(SPLIT_SEED).shuffle(data)
split = int(0.9 * len(data))
train, val = data[:split], data[split:]

# Special tokens: padding, sequence start, sequence end, out-of-vocabulary.
PAD, BOS, EOS, UNK = '<pad>', '<s>', '</s>', '<unk>'

def build_vocab(sentences, specials=('<pad>', '<s>', '</s>', '<unk>'), min_freq=1):
    """Build token <-> index lookup tables from tokenized sentences.

    Args:
        sentences: iterable of token lists.
        specials: reserved tokens, placed at indices 0..len(specials)-1 in the
            given order. The default yields '<pad>'=0, '<s>'=1, '</s>'=2, '<unk>'=3.
        min_freq: tokens that occur fewer than this many times are left out.

    Returns:
        (stoi, itos): `stoi` maps token -> index and `itos` maps index -> token.
        Regular tokens follow the specials in descending frequency order.
    """
    counter = Counter(t for s in sentences for t in s)
    reserved = set(specials)
    vocab = list(specials)
    # A corpus token that collides with a special must not be appended a second
    # time: the duplicate would overwrite the reserved index in `stoi` and leave
    # a hole in `itos`, so len(stoi) would no longer cover every index.
    vocab += [t for t, c in counter.most_common() if c >= min_freq and t not in reserved]
    stoi = {t: i for i, t in enumerate(vocab)}
    itos = {i: t for t, i in stoi.items()}
    return stoi, itos

# Both vocabularies are fitted on the training pairs only.
src_stoi, src_itos = build_vocab([source for source, _target in train])
tgt_stoi, tgt_itos = build_vocab([target for _source, target in train])

print(
    'Vocab sizes -> src:', len(src_stoi),
    '| tgt:', len(tgt_stoi),
)

## 2. Definisi Arsitektur Transformer (25%)

In [None]:
import torch
import torch.nn as nn
import math

# ==========================================
# Positional Encoding
# ==========================================
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    A table of shape (1, max_len, d_model) is precomputed once and stored as a
    buffer named `pe`. Even feature columns hold sines and odd columns hold
    cosines of the position scaled by geometrically spaced frequencies.

    Args:
        d_model: size of the feature dimension (even or odd).
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one odd column fewer than even columns,
        # so the frequency vector is trimmed to the number of odd columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return `x` plus the encodings for its first x.size(1) positions.

        Dimension 1 of `x` is used as the sequence axis; the leading batch
        dimension of the table is 1, so it broadcasts over dimension 0 of `x`.
        """
        x = x + self.pe[:, :x.size(1)]
        return x


# ==========================================
# Transformer Model untuk Translation
# ==========================================
class TransformerMT(nn.Module):
    """Encoder-decoder translation model wrapping `nn.Transformer`.

    Pipeline: token ids -> embedding + sinusoidal positions -> nn.Transformer
    (batch_first=True) -> linear projection onto the target vocabulary.

    Args:
        src_vocab: number of entries in the source embedding table.
        tgt_vocab: number of entries in the target embedding table; also the
            size of the output projection.
        d_model: embedding / model width.
        nhead: number of attention heads.
        num_layers: number of encoder layers and of decoder layers.
        dim_ff: feed-forward width inside each layer.
        dropout: dropout probability passed to nn.Transformer.
    """

    def __init__(self, src_vocab, tgt_vocab, d_model=128, nhead=4, num_layers=2, dim_ff=512, dropout=0.1):
        super().__init__()
        self.src_embed = nn.Embedding(src_vocab, d_model)
        self.tgt_embed = nn.Embedding(tgt_vocab, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab)

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None, tgt_mask=None):
        """Run the model and return logits over the target vocabulary.

        NOTE(review): the original body of this method was unreadable in the
        source; it is rebuilt from the signature and from how the training and
        inference cells call the model - confirm against the original notebook.

        Args:
            src: source token ids, batch-first.
            tgt: decoder-input token ids, batch-first.
            src_key_padding_mask: optional mask marking padded source positions.
            tgt_key_padding_mask: optional mask marking padded target positions.
            tgt_mask: optional attention mask for the decoder self-attention
                (the callers pass a square subsequent mask).

        Returns:
            Tensor whose last dimension has size `tgt_vocab`, one score vector
            per target position.
        """
        src = self.pos_enc(self.src_embed(src))
        tgt = self.pos_enc(self.tgt_embed(tgt))
        out = self.transformer(
            src,
            tgt,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            # The encoder output keeps the source layout, so the same mask stops
            # the decoder's cross-attention from reading padded source positions.
            memory_key_padding_mask=src_key_padding_mask,
        )
        return self.fc_out(out)


# NOTE(review): whatever followed the class in this cell was unreadable in the
# source. Later cells expect `model`, `criterion`, `optimizer`, `train_dl`,
# `val_dl` and `MAX_LEN` to exist - restore their definitions from the original notebook.

## 3. Proses Training (35%)

In [None]:
from tqdm import tqdm

def accuracy_fn(y_pred, y_true, pad_idx):
    """Token-level accuracy over the non-padding positions.

    Args:
        y_pred: score tensor; the argmax over its last dimension is compared
            element-wise with `y_true`.
        y_true: tensor of reference token ids.
        pad_idx: id marking positions to ignore.

    Returns:
        0-dim float tensor: the fraction of non-pad positions whose predicted id
        equals the reference id, or 0 when there is no non-pad position at all.
    """
    pred_tokens = y_pred.argmax(dim=-1)
    mask = y_true != pad_idx
    correct = (pred_tokens == y_true) & mask
    # clamp() keeps an all-padding input from computing 0/0 = NaN, which would
    # otherwise poison the running validation average.
    return correct.sum().float() / mask.sum().clamp(min=1).float()

EPOCHS = 1
LOG_EVERY = 1  # print TrainLoss every N batches (1 = after every batch)

# Use the device the model's weights are on, so batches and weights always agree.
device = next(model.parameters()).device
# The padding ids never change inside the loops; look them up once.
src_pad_idx = src_stoi['<pad>']
tgt_pad_idx = tgt_stoi['<pad>']

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    for i, (src, tgt_in, tgt_out) in enumerate(tqdm(train_dl)):
        src, tgt_in, tgt_out = src.to(device), tgt_in.to(device), tgt_out.to(device)
        optimizer.zero_grad()
        # Masks: a square subsequent mask for the decoder plus padding masks for src/tgt.
        tgt_mask = model.transformer.generate_square_subsequent_mask(tgt_in.size(1)).to(device)
        src_key_padding_mask = (src == src_pad_idx)
        tgt_key_padding_mask = (tgt_in == tgt_pad_idx)
        output = model(src, tgt_in, src_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask, tgt_mask=tgt_mask)
        # reshape() rather than view(): it also accepts non-contiguous targets.
        loss = criterion(output.reshape(-1, output.size(-1)), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if (i+1) % LOG_EVERY == 0:
            print(f"Batch {i+1}/{len(train_dl)} - TrainLoss: {loss.item():.4f}")
    if len(train_dl) > 0:
        print(f"Epoch {epoch+1} mean TrainLoss: {total_loss / len(train_dl):.4f}")

    # Validation
    model.eval()
    val_loss, val_acc = 0.0, 0.0
    with torch.no_grad():
        for src, tgt_in, tgt_out in val_dl:
            src, tgt_in, tgt_out = src.to(device), tgt_in.to(device), tgt_out.to(device)
            # Same masks as in training.
            tgt_mask = model.transformer.generate_square_subsequent_mask(tgt_in.size(1)).to(device)
            src_key_padding_mask = (src == src_pad_idx)
            tgt_key_padding_mask = (tgt_in == tgt_pad_idx)
            output = model(src, tgt_in, src_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask, tgt_mask=tgt_mask)
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_out.reshape(-1))
            val_loss += loss.item()
            # Accuracy ignores positions holding the target-side padding id.
            val_acc += accuracy_fn(output, tgt_out, tgt_pad_idx).item()
    # Averages are per batch; max() avoids dividing by zero on an empty loader.
    n_val_batches = max(len(val_dl), 1)
    val_loss /= n_val_batches
    val_acc /= n_val_batches
    print(f"ValLoss: {val_loss:.4f}, ValAcc: {val_acc*100:.2f}%")


## 4. Inference Translation (20%)

In [None]:
def translate_sentence(model, sentence, src_stoi, tgt_stoi, tgt_itos, max_len=20):
    """Greedily translate one source sentence and return the target text.

    The sentence is lower-cased and split on whitespace; unknown words map to
    '<unk>'. Decoding starts from '<s>' and appends the highest-scoring token
    at each step until '</s>' is produced or `max_len` tokens were generated.

    NOTE(review): the training text additionally goes through clean_text();
    input containing characters that clean_text() blanks out will therefore be
    tokenized differently here than during training.

    Args:
        model: callable as model(src_ids, tgt_ids, tgt_mask=...) returning
            per-position scores over the target vocabulary; must own at least
            one parameter (its device is used for all tensors).
        sentence: raw source sentence.
        src_stoi: source token -> id mapping containing '<unk>'.
        tgt_stoi: target token -> id mapping containing '<s>' and '</s>'.
        tgt_itos: target id -> token mapping.
        max_len: maximum number of tokens to generate.

    Returns:
        The generated tokens joined by spaces, without '<s>' and '</s>'.
        An empty string if the sentence contains no tokens.
    """
    model.eval()
    device = next(model.parameters()).device
    tokens = [w.lower() for w in sentence.split()]
    if not tokens:
        return ''
    unk_id = src_stoi['<unk>']
    src_ids = torch.tensor([[src_stoi.get(t, unk_id) for t in tokens]], device=device)
    bos_id, eos_id = tgt_stoi['<s>'], tgt_stoi['</s>']
    generated = [bos_id]

    with torch.no_grad():  # inference only: no autograd graph needed
        for _ in range(max_len):
            tgt_input = torch.tensor([generated], device=device)
            size = tgt_input.size(1)
            # Same causal mask as in training (-inf above the diagonal), so the
            # decoder sees the prefix under the conditions it was trained with.
            tgt_mask = torch.triu(torch.full((size, size), float('-inf'), device=device), diagonal=1)
            out = model(src_ids, tgt_input, tgt_mask=tgt_mask)
            next_token = out[0, -1].argmax().item()
            if next_token == eos_id:
                break
            generated.append(next_token)

    # '</s>' is never appended, so only the leading '<s>' has to be removed;
    # a translation cut off by max_len keeps its last real token.
    return ' '.join(tgt_itos[idx] for idx in generated[1:])

# Smoke test: translate a single corpus sentence and show it below its source.
test_sentence = src_texts[0]
print("English :", test_sentence)
print(
    "French (predicted):",
    translate_sentence(model, test_sentence, src_stoi, tgt_stoi, tgt_itos),
)

# ==========================================
# Evaluasi greedy (autoregressive) pada validation set
# ==========================================
def greedy_decode(tokens, src_stoi, tgt_stoi, tgt_itos, max_len=20):
    """Greedily decode a translation for an already tokenized source sentence.

    Uses the notebook-level `model`. Decoding starts from '<s>' and appends the
    highest-scoring token at each step until '</s>' is produced or `max_len`
    tokens were generated.

    Args:
        tokens: list of source tokens; unknown ones map to '<unk>'.
        src_stoi: source token -> id mapping containing '<unk>'.
        tgt_stoi: target token -> id mapping containing '<s>' and '</s>'.
        tgt_itos: target id -> token mapping.
        max_len: maximum number of tokens to generate.

    Returns:
        List of generated target tokens without '<s>' and '</s>'; an empty list
        when `tokens` is empty.
    """
    model.eval()
    if not tokens:
        return []
    device = next(model.parameters()).device
    unk_id = src_stoi['<unk>']
    src_ids = torch.tensor([[src_stoi.get(t, unk_id) for t in tokens]], device=device)
    bos_id, eos_id = tgt_stoi['<s>'], tgt_stoi['</s>']
    generated = [bos_id]
    with torch.no_grad():  # inference only: no autograd graph needed
        for _ in range(max_len):
            tgt = torch.tensor([generated], device=device)
            size = tgt.size(1)
            # Same causal mask as in training (-inf above the diagonal).
            tgt_mask = torch.triu(torch.full((size, size), float('-inf'), device=device), diagonal=1)
            out = model(src_ids, tgt, tgt_mask=tgt_mask)
            next_tok = out[0, -1].argmax().item()
            if next_tok == eos_id:
                break
            generated.append(next_tok)
    # '</s>' is never appended, so only '<s>' is stripped; an output cut off by
    # max_len keeps its last real token.
    return [tgt_itos[i] for i in generated[1:]]

def eval_greedy(val_pairs, n_samples=200):
    """Print exact-match and token-overlap accuracy of greedy decoding.

    Args:
        val_pairs: sequence of (source_tokens, target_tokens) pairs.
        n_samples: evaluate only the first `n_samples` pairs for a quick run;
            pass None to evaluate all of them.

    Prints the number of exactly matching sentences and the position-wise token
    accuracy, where each pair is compared only up to the shorter of prediction
    and reference. Returns None.

    NOTE(review): reads `src_stoi`, `tgt_stoi`, `tgt_itos` and `MAX_LEN` from
    the notebook's global scope; `MAX_LEN` is not defined in the visible cells.
    """
    samples = val_pairs if n_samples is None else val_pairs[:n_samples]
    if not samples:
        # Nothing to score; the ratio below would otherwise divide by zero.
        print('No validation samples to evaluate.')
        return
    total_sent, exact_match, token_correct, token_total = 0, 0, 0, 0
    for src_tokens, tgt_tokens in samples:
        pred = greedy_decode(src_tokens, src_stoi, tgt_stoi, tgt_itos, max_len=MAX_LEN)
        total_sent += 1
        if pred == tgt_tokens:
            exact_match += 1
        # Token-level overlap: zip() stops at the shorter of the two sequences.
        token_total += min(len(pred), len(tgt_tokens))
        token_correct += sum(1 for p, t in zip(pred, tgt_tokens) if p == t)
    print(f'Greedy Exact Match: {exact_match}/{total_sent} = {exact_match/total_sent:.4f}')
    if token_total>0:
        print(f'Greedy Token Accuracy (overlap): {token_correct}/{token_total} = {token_correct/token_total:.4f}')
    else:
        print('No token comparisons performed (empty preds?).')

# Run a quick greedy evaluation on the first 200 validation pairs
eval_greedy(val, n_samples=200)



## 5. Kesimpulan

Eksperimen ini menunjukkan implementasi dasar Transformer untuk penerjemahan Bahasa Inggris ke Bahasa Prancis.

- Data telah dibersihkan dan ditokenisasi secara sederhana.
- Arsitektur Transformer dibangun dengan PyTorch di atas modul bawaan `nn.Transformer`, dilengkapi lapisan embedding dan *positional encoding* sendiri.
- Proses training menampilkan *TrainLoss* tiap batch, serta *ValLoss* dan *ValAcc* di akhir setiap epoch.
- Model berhasil melakukan inferensi dengan pendekatan *greedy decoding*.

Selanjutnya, model dapat dikembangkan dengan menambah jumlah epoch, memvisualisasikan mekanisme perhatian (*attention*), dan mengevaluasi hasil terjemahan dengan skor BLEU.
