Install required libraries : 


`!pip install "numpy<2" --upgrade `


`!pip install pandas torch scikit-learn streamlit`


In [None]:
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import math
import random
import time

### A Translation transformer

In [9]:
%%time
# ---------------------------
# Toy Dataset: English → French
# ---------------------------
pairs = [
    ("hello", "bonjour"),
    ("world", "monde"),
    ("je", "I"),
    ("ingenieur", "engineer"),
    ("sac", "bag"),
    ("cup", "tasse"),
]

# Build vocab
src_vocab = {"<pad>":0, "<sos>":1, "<eos>":2} # Initialize Source vocabulary with special tokens
tgt_vocab = {"<pad>":0, "<sos>":1, "<eos>":2} # Initialize Target vocabulary with special tokens

# Create & Populate vocabularies with characters from dataset (both source and target) => char-level tokenization & reference for embedding vectors
for src, tgt in pairs:
    for ch in src:
        if ch not in src_vocab: src_vocab[ch] = len(src_vocab)
    for ch in tgt:
        if ch not in tgt_vocab: tgt_vocab[ch] = len(tgt_vocab)

# Inverse vocabularies for decoding dictionaries (id → char)
inv_src_vocab = {v:k for k,v in src_vocab.items()}
inv_tgt_vocab = {v:k for k,v in tgt_vocab.items()}

# Encoding and Padding functions
def encode(text, vocab):
    return [vocab["<sos>"]] + [vocab[ch] for ch in text] + [vocab["<eos>"]]

def pad(seq, max_len):
    return seq + [0]*(max_len-len(seq))

# Parameter: Maximum sequence lengths to control the input & output sizes (adding 2 for <sos> and <eos>)
src_max_len = max(len(s) for s,_ in pairs)+2 # Control the maximum length of the source sequences
tgt_max_len = max(len(t) for _,t in pairs)+2 # Control the maximum length of the target sequences

# Prepare dataset: Encode and pad all sequences
data = []
for src, tgt in pairs:
    src_ids = pad(encode(src, src_vocab), src_max_len)
    tgt_ids = pad(encode(tgt, tgt_vocab), tgt_max_len)
    data.append((src_ids, tgt_ids))

# ---------------------------
# Model Components 
# ---------------------------
# d_model: Dimension of embedding vectors
# num_heads: Number of attention heads
# num_layers: Number of encoder and decoder layers
# d_ff: Dimension of feed-forward network

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.q_linear = nn.Linear(d_model, d_model) # initialize linear layers for query, key, value
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)
    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)
        q = self.q_linear(q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn, v)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
        return self.out(output)

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=256):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
    def forward(self, x):
        return self.linear2(torch.relu(self.linear1(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
    def forward(self, x, mask=None):
        x = self.norm1(x + self.attn(x, x, x, mask))
        return self.norm2(x + self.ff(x))

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        x = self.norm1(x + self.self_attn(x, x, x, tgt_mask))
        x = self.norm2(x + self.cross_attn(x, enc_output, enc_output, src_mask))
        return self.norm3(x + self.ff(x))

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=128, num_heads=4, num_layers=2, d_ff=256):
        super().__init__()
        self.src_embed = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
    def forward(self, src, tgt):
        src = self.pos_enc(self.src_embed(src))
        tgt = self.pos_enc(self.tgt_embed(tgt))
        for layer in self.encoder_layers:
            src = layer(src)
        enc_output = src
        for layer in self.decoder_layers:
            tgt = layer(tgt, enc_output)
        return self.fc_out(tgt)

# ---------------------------
# Training Loop
# ---------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Transformer(len(src_vocab), len(tgt_vocab)).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 200
for epoch in range(epochs):
    total_loss = 0
    for src_ids, tgt_ids in data:
        src_tensor = torch.tensor([src_ids], device=device)
        tgt_tensor = torch.tensor([tgt_ids], device=device)
        optimizer.zero_grad()
        output = model(src_tensor, tgt_tensor[:, :-1])  # predict next token
        loss = criterion(output.view(-1, len(tgt_vocab)), tgt_tensor[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# ---------------------------
# Test Prediction
# ---------------------------
def predict(word):
    src_ids = pad(encode(word, src_vocab), src_max_len) # Encode and pad input word (normalize length)
    tgt_ids = [tgt_vocab["<sos>"]] # Initialize target sequence with <sos> (predicting from scratch)
    for _ in range(tgt_max_len-1):
        tgt_tensor = torch.tensor([tgt_ids], device=device)
        src_tensor = torch.tensor([src_ids], device=device)
        output = model(src_tensor, tgt_tensor)
        next_token = output[0, -1].argmax().item()
        if next_token == tgt_vocab["<eos>"]: # Stop if <eos> is predicted
            break
        tgt_ids.append(next_token)
    return "".join(inv_tgt_vocab[i] for i in tgt_ids[1:]) # Decode predicted tokens



Epoch 0, Loss: 19.0265
Epoch 20, Loss: 0.4420
Epoch 40, Loss: 0.0753
Epoch 60, Loss: 0.0275
Epoch 80, Loss: 0.0182
Epoch 100, Loss: 0.0132
Epoch 120, Loss: 0.0101
Epoch 140, Loss: 0.0080
Epoch 160, Loss: 0.0065
Epoch 180, Loss: 0.0054
CPU times: user 1min 33s, sys: 6.58 s, total: 1min 40s
Wall time: 17.5 s


In [14]:
print("Translate 'hello' ->", predict("cup"))

Translate 'hello' -> tase


### Explanations

**✅ What is tgt_vocab and why these tokens?**


- `<pad>`: Padding token 
Used to make all sequences the same length by filling empty positions with zeros.


- `<sos>`: Start-of-sequence token 
Indicates the beginning of the target sentence during decoding.


- `<eos>`: End-of-sequence token 
Marks the end of the sentence so the model knows when to stop generating.


These special tokens are essential for sequence models because:

- Padding ensures batches have uniform shape.
- Start and end tokens help the model learn where sentences begin and end.

---

**✅ How vocabularies are built**

We start with special tokens.
Then we add characters from the dataset (English and Pig Latin words).
Each character gets a unique integer ID.

src_vocab might look like:
```{"<pad>":0, "<sos>":1, "<eos>":2, "h":3, "e":4, "l":5, "o":6, "w":7, ...}```

tgt_vocab might look like:
```{"<pad>":0, "<sos>":1, "<eos>":2, "e":3, "l":4, "o":5, "h":6, "a":7, "y":8, ...}```

---

**✅ Encoding and Padding**


- encode("hello", src_vocab) → [1, 3, 4, 5, 5, 6, 2]
(`<sos>` + h,e,l,l,o + `<eos>`)


- pad([1,3,4,5,5,6,2], 10) → [1,3,4,5,5,6,2,0,0,0]
Adds `<pad>` tokens to reach length 10.

---

**✅ Why do we need this?**

Transformers work with fixed-length tensors for batching. Padding ensures:

- All sequences have the same length.
- The model ignores padding during attention (using masks).

---

**✅ Training Loop Summary**

For each pair (English → French):

- Encode and pad both source and target.
- Feed source and target (except last token) into the model.
- Predict next token and compute loss against target shifted by one position.
- Repeat for multiple epochs.

