
# Transformer-based English–French Translation

Notebook ini merupakan implementasi eksplorasi model **Transformer** untuk penerjemahan otomatis dari Bahasa Inggris ke Bahasa Prancis.

Tujuan utama:
- Melatih model Transformer selama **1 epoch** dengan **batch-size maksimal 100**
- Menunjukkan proses **Text Preprocessing**, **Definisi Arsitektur Transformer**, **Training**, dan **Inference**
- Menampilkan metrik: `TrainLoss`, `ValLoss`, dan `ValAcc` di setiap akhir batch.

Bobot penilaian:
| Aspek | Bobot |
|-------|-------|
| Data Preparation (Text Preprocessing) | 20% |
| Definisi Class Transformer | 25% |
| Proses Training (TrainLoss, ValLoss, ValAcc) | 35% |
| Inference Translation | 20% |


## Persiapan Lingkungan

In [6]:

!pip install torch pandas numpy
import torch, pandas as pd, numpy as np, random, math, re, os
from collections import Counter
from torch import nn
from torch.utils.data import Dataset, DataLoader

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Running on', DEVICE)


^C
Running on cuda
Running on cuda




## 1. Data Preparation & Text Preprocessing (20%)

In [8]:

# Dataset diambil dari file small_vocab_en.csv dan small_vocab_fr.csv
# Gunakan path relatif terhadap working directory agar berjalan di Windows/Local
en_path = os.path.join(os.getcwd(), 'small_vocab_en.csv')
fr_path = os.path.join(os.getcwd(), 'small_vocab_fr.csv')

# Pastikan file ada - bila tidak ada, munculkan pesan informatif
if not os.path.exists(en_path) or not os.path.exists(fr_path):
    raise FileNotFoundError(f'Required files not found. Expected at: {en_path} and {fr_path}')

# Beberapa kalimat mengandung koma; baca tiap baris langsung dari file untuk menghindari parser CSV
with open(en_path, 'r', encoding='utf-8') as f:
    src_texts = [line.strip() for line in f if line.strip()]
with open(fr_path, 'r', encoding='utf-8') as f:
    tgt_texts = [line.strip() for line in f if line.strip()]

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zâêôàèçùé'\-\.\,\?\!\s]", ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

def tokenize(text): return text.split()

src_tokens = [tokenize(clean_text(s)) for s in src_texts]
tgt_tokens = [tokenize(clean_text(t)) for t in tgt_texts]

# Split train/val
data = list(zip(src_tokens, tgt_tokens))
random.shuffle(data)
# Buat split train/val, pastikan val tidak kosong (jika dataset sangat kecil)
split = int(0.9 * len(data))
if split >= len(data):
    split = max(1, len(data) - 1)
train, val = data[:split], data[split:]

PAD, BOS, EOS, UNK = '<pad>', '<s>', '</s>', '<unk>'

def build_vocab(sentences):
    counter = Counter(t for s in sentences for t in s)
    vocab = [PAD, BOS, EOS, UNK] + [t for t, _ in counter.most_common()]
    stoi = {t: i for i, t in enumerate(vocab)}
    itos = {i: t for t, i in stoi.items()}
    return stoi, itos

src_stoi, src_itos = build_vocab([s for s, _ in train])
tgt_stoi, tgt_itos = build_vocab([t for _, t in train])

print('Vocab sizes -> src:', len(src_stoi), '| tgt:', len(tgt_stoi))


Vocab sizes -> src: 231 | tgt: 358


## 2. Definisi Arsitektur Transformer (25%)

In [9]:

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class TransformerMT(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model=256, nhead=8, nlayers=3):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, d_model)
        self.tgt_emb = nn.Embedding(tgt_vocab, d_model)
        self.pos = PositionalEncoding(d_model)
        self.trans = nn.Transformer(d_model, nhead, nlayers, nlayers, batch_first=True)
        self.fc = nn.Linear(d_model, tgt_vocab)
        self.d_model = d_model

    def forward(self, src, tgt):
        src = self.pos(self.src_emb(src) * math.sqrt(self.d_model))
        tgt = self.pos(self.tgt_emb(tgt) * math.sqrt(self.d_model))
        mask = torch.tril(torch.ones(tgt.size(1), tgt.size(1), device=DEVICE)).bool()
        out = self.trans(src, tgt, tgt_mask=mask)
        return self.fc(out)


## 3. Proses Training (35%)

In [10]:

class TranslationDataset(Dataset):
    def __init__(self, pairs, src_stoi, tgt_stoi, max_len=50):
        self.data = pairs; self.src_stoi = src_stoi; self.tgt_stoi = tgt_stoi; self.max_len = max_len

    def encode(self, tokens, vocab):
        ids = [vocab.get(t, vocab['<unk>']) for t in tokens] + [vocab['</s>']]
        return ids + [vocab['<pad>']] * (self.max_len - len(ids)) if len(ids)<self.max_len else ids[:self.max_len]

    def __getitem__(self, idx):
        s, t = self.data[idx]
        src = torch.tensor(self.encode(s, self.src_stoi))
        tgt = torch.tensor([self.tgt_stoi['<s>']] + self.encode(t, self.tgt_stoi))
        return src, tgt[:-1], tgt[1:]
    def __len__(self): return len(self.data)

train_ds = TranslationDataset(train, src_stoi, tgt_stoi)
val_ds = TranslationDataset(val, src_stoi, tgt_stoi)
train_dl = DataLoader(train_ds, batch_size=100, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=100)

model = TransformerMT(len(src_stoi), len(tgt_stoi)).to(DEVICE)
opt = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss(ignore_index=tgt_stoi['<pad>'])

def evaluate():
    model.eval(); loss, correct, total = 0,0,0
    with torch.no_grad():
        for s, t_in, t_out in val_dl:
            s,t_in,t_out = s.to(DEVICE),t_in.to(DEVICE),t_out.to(DEVICE)
            out = model(s,t_in)
            loss += loss_fn(out.reshape(-1,out.size(-1)), t_out.reshape(-1)).item()
            pred = out.argmax(-1)
            mask = t_out!=tgt_stoi['<pad>']
            correct += (pred==t_out).masked_select(mask).sum().item()
            total += mask.sum().item()
    return loss/len(val_dl), correct/total

for b,(s,t_in,t_out) in enumerate(train_dl,1):
    model.train(); s,t_in,t_out=s.to(DEVICE),t_in.to(DEVICE),t_out.to(DEVICE)
    opt.zero_grad()
    out = model(s,t_in)
    loss = loss_fn(out.reshape(-1,out.size(-1)), t_out.reshape(-1))
    loss.backward(); opt.step()
    val_loss, val_acc = evaluate()
    print(f'Batch {b} | TrainLoss {loss.item():.4f} | ValLoss {val_loss:.4f} | ValAcc {val_acc:.4f}')


Batch 1 | TrainLoss 6.0151 | ValLoss nan | ValAcc 0.0000
Batch 2 | TrainLoss 5.4421 | ValLoss nan | ValAcc 0.0000
Batch 2 | TrainLoss 5.4421 | ValLoss nan | ValAcc 0.0000
Batch 3 | TrainLoss 5.1770 | ValLoss nan | ValAcc 0.0000
Batch 3 | TrainLoss 5.1770 | ValLoss nan | ValAcc 0.0000
Batch 4 | TrainLoss 5.0295 | ValLoss nan | ValAcc 0.0000
Batch 4 | TrainLoss 5.0295 | ValLoss nan | ValAcc 0.0000


KeyboardInterrupt: 

## 4. Inference Translation (20%)

In [None]:

def translate(sentence_tokens):
    model.eval()
    src = torch.tensor([train_ds.encode(sentence_tokens, src_stoi)]).to(DEVICE)
    tgt = torch.tensor([[tgt_stoi['<s>']]]).to(DEVICE)
    for _ in range(50):
        out = model(src, tgt)
        next_tok = out[0,-1].argmax().item()
        tgt = torch.cat([tgt, torch.tensor([[next_tok]], device=DEVICE)], dim=1)
        if next_tok == tgt_stoi['</s>']: break
    return ' '.join(tgt_itos[i.item()] for i in tgt[0][1:-1])

for i in range(5):
    print('SRC :', ' '.join(val[i][0]))
    print('PRED:', translate(val[i][0]))
    print('REF :', ' '.join(val[i][1]))
    print('---')



## 5. Kesimpulan

Eksperimen ini menunjukkan implementasi dasar Transformer untuk penerjemahan Bahasa Inggris ke Bahasa Prancis.

- Data telah dibersihkan dan ditokenisasi secara sederhana.
- Arsitektur Transformer telah dibangun dari nol dengan PyTorch.
- Proses training menampilkan *TrainLoss*, *ValLoss*, dan *ValAcc* tiap batch.
- Model berhasil melakukan inferensi dengan pendekatan *greedy decoding*.

Selanjutnya, model dapat diperluas dengan peningkatan jumlah epoch, mekanisme perhatian visualisasi, dan evaluasi BLEU score.
