In [1]:
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import math
import random

In [None]:


# ---------------------------
# Toy Dataset: English → French
# ---------------------------
pairs = [
    ("hello", "bonjour"),
    ("world", "monde"),
    ("transformer", "transformateur"),
    ("attention", "attention"),
    ("python", "python"),
    ("model", "modèle"),
]

# Build vocab
src_vocab = {"<pad>":0, "<sos>":1, "<eos>":2}
tgt_vocab = {"<pad>":0, "<sos>":1, "<eos>":2}

for src, tgt in pairs:
    for ch in src:
        if ch not in src_vocab: src_vocab[ch] = len(src_vocab)
    for ch in tgt:
        if ch not in tgt_vocab: tgt_vocab[ch] = len(tgt_vocab)

inv_src_vocab = {v:k for k,v in src_vocab.items()}
inv_tgt_vocab = {v:k for k,v in tgt_vocab.items()}

def encode(text, vocab):
    return [vocab["<sos>"]] + [vocab[ch] for ch in text] + [vocab["<eos>"]]

def pad(seq, max_len):
    return seq + [0]*(max_len-len(seq))

src_max_len = max(len(s) for s,_ in pairs)+2
tgt_max_len = max(len(t) for _,t in pairs)+2

data = []
for src, tgt in pairs:
    src_ids = pad(encode(src, src_vocab), src_max_len)
    tgt_ids = pad(encode(tgt, tgt_vocab), tgt_max_len)
    data.append((src_ids, tgt_ids))

# ---------------------------
# Model Components 
# ---------------------------
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)
    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)
        q = self.q_linear(q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        output = torch.matmul(attn, v)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
        return self.out(output)

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=256):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
    def forward(self, x):
        return self.linear2(torch.relu(self.linear1(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
    def forward(self, x, mask=None):
        x = self.norm1(x + self.attn(x, x, x, mask))
        return self.norm2(x + self.ff(x))

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        x = self.norm1(x + self.self_attn(x, x, x, tgt_mask))
        x = self.norm2(x + self.cross_attn(x, enc_output, enc_output, src_mask))
        return self.norm3(x + self.ff(x))

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=128, num_heads=4, num_layers=2, d_ff=256):
        super().__init__()
        self.src_embed = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
    def forward(self, src, tgt):
        src = self.pos_enc(self.src_embed(src))
        tgt = self.pos_enc(self.tgt_embed(tgt))
        for layer in self.encoder_layers:
            src = layer(src)
        enc_output = src
        for layer in self.decoder_layers:
            tgt = layer(tgt, enc_output)
        return self.fc_out(tgt)

# ---------------------------
# Training Loop
# ---------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Transformer(len(src_vocab), len(tgt_vocab)).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 200
for epoch in range(epochs):
    total_loss = 0
    for src_ids, tgt_ids in data:
        src_tensor = torch.tensor([src_ids], device=device)
        tgt_tensor = torch.tensor([tgt_ids], device=device)
        optimizer.zero_grad()
        output = model(src_tensor, tgt_tensor[:, :-1])  # predict next token
        loss = criterion(output.view(-1, len(tgt_vocab)), tgt_tensor[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# ---------------------------
# Test Prediction
# ---------------------------
def predict(word):
    src_ids = pad(encode(word, src_vocab), src_max_len)
    tgt_ids = [tgt_vocab["<sos>"]]
    for _ in range(tgt_max_len-1):
        tgt_tensor = torch.tensor([tgt_ids], device=device)
        src_tensor = torch.tensor([src_ids], device=device)
        output = model(src_tensor, tgt_tensor)
        next_token = output[0, -1].argmax().item()
        if next_token == tgt_vocab["<eos>"]:
            break
        tgt_ids.append(next_token)
    return "".join(inv_tgt_vocab[i] for i in tgt_ids[1:])




Epoch 0, Loss: 18.6754
Epoch 20, Loss: 0.1884
Epoch 40, Loss: 0.0635
Epoch 60, Loss: 0.0248
Epoch 80, Loss: 0.0162
Epoch 100, Loss: 0.0116
Epoch 120, Loss: 0.0089
Epoch 140, Loss: 0.0070
Epoch 160, Loss: 0.0057
Epoch 180, Loss: 0.0047


In [11]:
print("Translate 'hello' ->", predict("hello"))

Translate 'hello' -> bonjon


✅ What is tgt_vocab and why these tokens?


<pad>: Padding token (index 0)
Used to make all sequences the same length by filling empty positions with zeros.


<sos>: Start-of-sequence token (index 1)
Indicates the beginning of the target sentence during decoding.


<eos>: End-of-sequence token (index 2)
Marks the end of the sentence so the model knows when to stop generating.


These special tokens are essential for sequence models because:

Padding ensures batches have uniform shape.
Start and end tokens help the model learn where sentences begin and end.

✅ How vocabularies are built

We start with special tokens.
Then we add characters from the dataset (English and Pig Latin words).
Each character gets a unique integer ID.

```

src_vocab might look like:
{"<pad>":0, "<sos>":1, "<eos>":2, "h":3, "e":4, "l":5, "o":6, "w":7, ...}

tgt_vocab might look like:
{"<pad>":0, "<sos>":1, "<eos>":2, "e":3, "l":4, "o":5, "h":6, "a":7, "y":8, ...}
```


✅ Encoding and Padding


encode("hello", src_vocab) → [1, 3, 4, 5, 5, 6, 2]
(<sos> + h,e,l,l,o + <eos>)


pad([1,3,4,5,5,6,2], 10) → [1,3,4,5,5,6,2,0,0,0]
Adds <pad> tokens to reach length 10.

✅ Why do we need this?
Transformers work with fixed-length tensors for batching. Padding ensures:

All sequences have the same length.
The model ignores padding during attention (using masks).


✅ Training Loop Summary

For each pair (English → Pig Latin):

Encode and pad both source and target.
Feed source and target (except last token) into the model.
Predict next token and compute loss against target shifted by one position.


Repeat for multiple epochs.



Pre-trained model 

In [12]:
# English → French Transformer Training with HuggingFace

## Install dependencies
# !pip install datasets transformers torch sentencepiece

import torch
from torch import nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer

# ---------------------------
# 1. Load Dataset
# ---------------------------
dataset = load_dataset('opus_books', 'en-fr')
train_data = dataset['train']

# ---------------------------
# 2. Tokenizer
# ---------------------------
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fr')

src_lang = 'en'
tgt_lang = 'fr'

max_length = 64

def tokenize_batch(batch):
    inputs = tokenizer(batch['translation'][src_lang], truncation=True, padding='max_length', max_length=max_length)
    targets = tokenizer(batch['translation'][tgt_lang], truncation=True, padding='max_length', max_length=max_length)
    inputs['labels'] = targets['input_ids']
    return inputs

train_data = train_data.map(tokenize_batch, batched=True)
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# ---------------------------
# 3. DataLoader
# ---------------------------
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

# ---------------------------
# 4. Define Transformer Model
# ---------------------------
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-en-fr')
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# ---------------------------
# 5. Training Loop
# ---------------------------
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(1):  # For demo, 1 epoch
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss/len(train_loader):.4f}")

# ---------------------------
# 6. Inference
# ---------------------------
model.eval()
example = "Hello, how are you?"
inputs = tokenizer(example, return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_length=40)
print("English:", example)
print("French:", tokenizer.decode(outputs[0], skip_special_tokens=True))


  from .autonotebook import tqdm as notebook_tqdm
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /datasets/opus_books/resolve/main/README.md (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000027AB66D21E0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 20603ca1-ab5a-46ab-aa9f-3e7fa0d8234a)')' thrown while requesting HEAD https://huggingface.co/datasets/opus_books/resolve/main/README.md
Retrying in 1s [Retry 1/5].
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /datasets/opus_books/resolve/main/README.md (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000027AB66A57C0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: e92fe33f-4031-48a7-bf30-c1ea70c96992)')' thrown while requesting HEAD https://huggingface.co/datasets/opus_books/resolve/main/README.m

KeyboardInterrupt: 

New model 

In [None]:

# %% [markdown]
# # Micro Story Generator - Train a Tiny Transformer From Scratch
# This script trains a character-level Transformer language model on a tiny text corpus.
# It demonstrates tokenization, positional encoding, attention, and autoregressive generation.

# %%
import math, random, time
import torch
import torch.nn as nn
import torch.nn.functional as F

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# -----------------------------
# 1) Tiny corpus (training text)
# -----------------------------
# We use a small text block for quick training.
# Students can add more lines during class to see how the model adapts.
TRAIN_TEXT = """
Once upon a time, in a distant valley, a small robot learned to write.
It wrote tiny stories about stars, rain, and quiet forests.
Sometimes the stories were silly; sometimes they were wise.
Students loved reading them and adding their own twists.
"""
extra_lines = [
    "The moon blinked.",
    "A fox carried a poem.",
    "She discovered a new word: luminal."
]
TRAIN_TEXT += "\n".join(extra_lines)

# -----------------------------
# 2) Character-level tokenizer
# -----------------------------
# We build a vocabulary of all unique characters in TRAIN_TEXT.
# stoi: maps character -> integer ID
# itos: maps integer ID -> character
chars = sorted(list(set(TRAIN_TEXT)))
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for ch,i in stoi.items()}
vocab_size = len(chars)
print("Vocab size (chars):", vocab_size)

# Encode: convert string to tensor of IDs
# Decode: convert tensor of IDs back to string
def encode(s): return torch.tensor([stoi[c] for c in s], dtype=torch.long)
def decode(t): return "".join(itos[int(i)] for i in t)

# Convert entire training text to tensor
data = encode(TRAIN_TEXT)
print("Data length:", len(data))

# -----------------------------
# 3) Create training batches
# -----------------------------
# We train the model to predict the next character given previous ones.
# Each batch contains BATCH_SIZE sequences of length SEQ_LEN.
SEQ_LEN = 128
BATCH_SIZE = 64
def get_batch():
    # Random starting positions for sequences
    ix = torch.randint(0, len(data)-SEQ_LEN-1, (BATCH_SIZE,))
    x = torch.stack([data[i:i+SEQ_LEN] for i in ix])       # input sequence
    y = torch.stack([data[i+1:i+SEQ_LEN+1] for i in ix])   # target sequence (shifted by 1)
    return x.to(DEVICE), y.to(DEVICE)

# -----------------------------
# 4) Positional encoding
# -----------------------------
# Adds position information to embeddings so the model knows token order.
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0)/d_model))
        pe[:,0::2] = torch.sin(position*div_term)
        pe[:,1::2] = torch.cos(position*div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # shape [1, max_len, d_model]
    def forward(self, x):
        # Add positional encoding to input embeddings
        T = x.size(1)
        return x + self.pe[:,:T,:]

# -----------------------------
# 5) Multi-head attention
# -----------------------------
# Splits embeddings into multiple heads for parallel attention.
# Uses a causal mask to prevent looking ahead (autoregressive).
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.q = nn.Linear(d_model, d_model)
        self.k = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, d_model)
        self.o = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        B, T, D = x.shape
        H, Dh = self.num_heads, self.d_k

        # Project input to Q, K, V and split into heads
        q = self.q(x).view(B, T, H, Dh).transpose(1, 2)
        k = self.k(x).view(B, T, H, Dh).transpose(1, 2)
        v = self.v(x).view(B, T, H, Dh).transpose(1, 2)

        # Compute scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(Dh)
        if mask is not None:
            scores = scores.masked_fill(~mask, float("-inf"))
        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, v)

        # Combine heads back
        out = out.transpose(1, 2).contiguous().view(B, T, D)
        return self.o(out)

# Causal mask: lower-triangular matrix to block future tokens
def causal_mask(T):
    return torch.tril(torch.ones(T, T, dtype=torch.bool, device=DEVICE))

# -----------------------------
# 6) Feed-forward block
# -----------------------------
# Applies two linear layers with ReLU for non-linearity.
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
    def forward(self, x): return self.net(x)

# -----------------------------
# 7) Transformer block
# -----------------------------
# Combines attention and feed-forward with LayerNorm and residual connections.
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)

    def forward(self, x, mask):
        x = x + self.attn(self.ln1(x), mask)
        x = x + self.ff(self.ln2(x))
        return x

# -----------------------------
# 8) Full Transformer LM
# -----------------------------
# Embedding + positional encoding + N transformer blocks + final linear head.
class TinyTransformerLM(nn.Module):
    def __init__(self, vocab_size, d_model=128, num_layers=2, num_heads=4, d_ff=256, max_len=1024):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model, max_len)
        self.blocks = nn.ModuleList([TransformerBlock(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, idx):
        x = self.embed(idx)
        x = self.pos(x)
        m = causal_mask(idx.size(1))
        for blk in self.blocks:
            x = blk(x, m)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits

    # Generation: autoregressive sampling
    @torch.no_grad()
    def generate(self, prompt_ids, max_new_tokens=100, temperature=1.0, top_k=None):
        x = prompt_ids.unsqueeze(0).to(DEVICE)
        for _ in range(max_new_tokens):
            logits = self.forward(x)[:, -1, :] / max(temperature, 1e-6)
            if top_k is not None:
                values, indices = torch.topk(logits, top_k)
                probs = torch.zeros_like(logits).scatter_(1, indices, torch.softmax(values, dim=-1))
            else:
                probs = torch.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            x = torch.cat([x, next_id], dim=1)
        return x[0].detach().cpu()

# -----------------------------
# 9) Instantiate model & optimizer
# -----------------------------
model = TinyTransformerLM(vocab_size=vocab_size, d_model=128, num_layers=2, num_heads=4, d_ff=256).to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# -----------------------------
# 10) Training loop
# -----------------------------
# We train for a few epochs to keep runtime short.
EPOCHS = 3
PRINT_EVERY = 200

model.train()
start = time.time()
for epoch in range(1, EPOCHS+1):
    total_loss, steps = 0.0, 0
    for _ in range(800):  # number of mini-batches per epoch
        x, y = get_batch()
        logits = model(x)
        loss = F.cross_entropy(logits.view(-1, vocab_size), y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        steps += 1
        if steps % PRINT_EVERY == 0:
            print(f"Epoch {epoch} | Step {steps} | Loss {total_loss/steps:.4f}")
    print(f"Epoch {epoch} done. Avg loss: {total_loss/steps:.4f}")

elapsed = time.time() - start
print(f"Training finished in {elapsed:.1f}s on {DEVICE}")

# -----------------------------
# 11) Generation
# -----------------------------
# After training, generate text from a seed prompt.
model.eval()
seed = "Once upon"
prompt_ids = encode(seed)
gen_ids = model.generate(prompt_ids, max_new_tokens=200, temperature=0.9, top_k=20)
print("\n--- Generated text ---")
print(decode(gen_ids))


Device: cpu
Vocab size (chars): 35
Data length: 324
Epoch 1 | Step 200 | Loss 1.6210
Epoch 1 | Step 400 | Loss 0.9021
Epoch 1 | Step 600 | Loss 0.6161
Epoch 1 | Step 800 | Loss 0.4692
Epoch 1 done. Avg loss: 0.4692
Epoch 2 | Step 200 | Loss 0.0241
Epoch 2 | Step 400 | Loss 0.0472
Epoch 2 | Step 600 | Loss 0.0392
Epoch 2 | Step 800 | Loss 0.0348
Epoch 2 done. Avg loss: 0.0348
Epoch 3 | Step 200 | Loss 0.0210
Epoch 3 | Step 400 | Loss 0.0208
Epoch 3 | Step 600 | Loss 0.0207
Epoch 3 | Step 800 | Loss 0.0206
Epoch 3 done. Avg loss: 0.0206
Training finished in 659.1s on cpu

--- Generated text ---
Once upon a time, in a distant valley, a small robot learned to write.
It wrote tiny stories about stars, rain, and quiet forests.
Somes , abomes abomes abomestststan, llllllomed loobomed lomestouin, fo, sthes
