# Assignment 3 — Question 1: English → French Machine Translation (Stacked RNNs)

This notebook gives you a clean, **ready-to-run** template for Q1. It builds and compares **stacked (2-layer) SimpleRNN, LSTM, and GRU** sequence-to-sequence models on the provided *Small_vocab_en* → *Small_vocab_fr* dataset.

**What you’ll get out of this notebook**  
- Reusable data pipeline (read → clean → tokenize → pad)  
- Three seq2seq models: 2-layer SimpleRNN, 2-layer LSTM, 2-layer GRU  
- Training with early stopping and LR scheduling  
- Side-by-side metrics (loss, token accuracy, BLEU)  
- **10 example translations from each model** ready to paste into your report  

> ⚠️ Put your `Small_vocab_en` and `Small_vocab_fr` text files in a local folder (e.g., `./data/`).  
> Each line should be one sentence. We’ll wrap the French targets with `<sos>` and `<eos>` tokens for decoding.


In [None]:
# Environment & Imports
import os, re, random, math, json, itertools, time
from collections import Counter
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers as KL, models as KM, callbacks as KC
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Optional: for BLEU
# If NLTK (or its BLEU helpers) cannot be imported, `nltk` is bound to None so
# that later cells can test `nltk is not None` before computing BLEU.
try:
    import nltk
    from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
except Exception as e:
    print("nltk not available; BLEU will be skipped until installed. You can run:")
    print("!pip install nltk && python -m nltk.downloader punkt")
    nltk = None

# Seed the Python, NumPy and TensorFlow random generators with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Show which TensorFlow version this notebook is running against.
print(tf.__version__)

## 1) Paths & Hyperparameters

Update `DATA_DIR` if needed. You can start with small settings and scale up once it runs.


In [None]:
# ---- Configure paths ----
DATA_DIR = "./data"           # put your text files here
EN_FILE  = os.path.join(DATA_DIR, "Small_vocab_en")  # one sentence per line (English)
FR_FILE  = os.path.join(DATA_DIR, "Small_vocab_fr")  # one sentence per line (French)

# ---- Training hyperparameters ----
# The same values are passed to all three model variants built below.
BATCH_SIZE = 128     # mini-batch size given to model.fit
EPOCHS = 30          # upper bound on epochs; early stopping may end training sooner
EMBED_DIM = 128      # width of the source embedding
UNITS = 256          # units in every recurrent layer
ENCODER_LAYERS = 2   # stacked depth
DECODER_LAYERS = 2   # stacked depth
DROPOUT = 0.2        # `dropout` argument of each recurrent layer
REC_DROPOUT = 0.2    # `recurrent_dropout` argument of each recurrent layer
VAL_SPLIT = 0.1      # fraction given to model.fit as validation_split

# For speed during debugging, you can set MAX_SAMPLES (None to use all)
# When set, only the first MAX_SAMPLES lines of each file are kept.
MAX_SAMPLES = None   # e.g., 10000

# Utility
def exists_all(*paths):
    """Return True only if every given path exists (True when called with no paths)."""
    for candidate in paths:
        if not os.path.exists(candidate):
            return False
    return True

# Report whether both corpus files can be found before any loading is attempted.
print("Data present? ", exists_all(EN_FILE, FR_FILE))

## 2) Load & Clean

We lowercase and keep basic punctuation. French targets get `<sos>` and `<eos>` markers.


In [None]:
# ---- Load raw text ----
def load_lines(path, max_samples=None):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        path: File to read.
        max_samples: If not None, the result is sliced to ``[:max_samples]``.

    Returns:
        A list with one stripped string per line of the file.
    """
    stripped = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            stripped.append(raw.strip())
    if max_samples is None:
        return stripped
    return stripped[:max_samples]

# Stop immediately, with a readable message, if either corpus file is missing.
assert exists_all(EN_FILE, FR_FILE), "Missing dataset files. Please place Small_vocab_en and Small_vocab_fr under DATA_DIR."

# Read both sides of the corpus (English first, then French) with the same cap.
en_lines, fr_lines = (load_lines(corpus, MAX_SAMPLES) for corpus in (EN_FILE, FR_FILE))
# Sentences are paired by line number, so the two files must be the same length.
assert len(en_lines) == len(fr_lines), "Mismatch between EN and FR line counts"

print(f"Loaded {len(en_lines):,} sentence pairs")

# ---- Basic clean: lower-case, keep letters, digits, space and basic punctuation ----
# `\w` is Unicode-aware for str patterns, so accented letters (e-acute, c-cedilla,
# ...) survive; an ASCII-only class would silently delete them from the French
# side. Underscore belongs to `\w` but is not wanted, hence the explicit `|_`.
_keep_re = re.compile(r"[^\w'?,.!\s-]|_", re.UNICODE)
_ws_re = re.compile(r"\s+")


def basic_clean(s: str) -> str:
    """Normalise one sentence for tokenisation.

    The text is lower-cased, every character other than a letter, digit,
    whitespace or one of ``' ? , . ! -`` is removed, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is stripped.
    Letters are matched Unicode-aware, so accented characters are preserved.

    Args:
        s: Raw sentence.

    Returns:
        The cleaned sentence (possibly empty).
    """
    s = _keep_re.sub('', s.lower())
    return _ws_re.sub(' ', s).strip()

# Apply the same normalisation to both sides of the corpus.
en_clean = list(map(basic_clean, en_lines))

# Add <sos> and <eos> to French targets
fr_clean = [f"<sos> {s} <eos>" for s in map(basic_clean, fr_lines)]

# Print the first cleaned pair as a quick sanity check.
print(en_clean[0])
print(fr_clean[0])

## 3) Tokenize & Pad

We use Keras `Tokenizer` to map words→ids and `pad_sequences` to align lengths.


In [None]:
# ---- Tokenize ----
# `filters=''` switches off the Tokenizer's own character filtering, so the
# punctuation retained by the cleaning step is kept.
en_tok = Tokenizer(filters='')
fr_tok = Tokenizer(filters='')

en_tok.fit_on_texts(en_clean)
fr_tok.fit_on_texts(fr_clean)

# Index 0 is reserved for padding, hence one extra slot per vocabulary.
en_vocab_size = 1 + len(en_tok.word_index)
fr_vocab_size = 1 + len(fr_tok.word_index)

# Words -> integer ids
en_seqs = en_tok.texts_to_sequences(en_clean)
fr_seqs = fr_tok.texts_to_sequences(fr_clean)

# Longest sentence on each side decides the padded length
max_len_en = max(map(len, en_seqs))
max_len_fr = max(map(len, fr_seqs))

print("Vocab sizes -> EN:{}, FR:{}".format(en_vocab_size, fr_vocab_size))
print("Max lens -> EN:{}, FR:{}".format(max_len_en, max_len_fr))

# Right-pad every sequence with zeros up to the per-language maximum
X = pad_sequences(en_seqs, maxlen=max_len_en, padding='post')
Y = pad_sequences(fr_seqs, maxlen=max_len_fr, padding='post')

# For sparse_categorical_crossentropy, Y needs a final dimension
Y_expanded = Y[..., np.newaxis]  # shape: (N, T_fr, 1)

print("X shape:", X.shape, "Y shape:", Y.shape, "Y_expanded:", Y_expanded.shape)

## 4) Build Stacked Seq2Seq Models

We implement a shared builder that picks `SimpleRNN`, `LSTM`, or `GRU`.  
**Encoder:** Embedding → N stacked recurrent layers (last returns a vector).  
**Decoder:** RepeatVector(T_fr) → N stacked recurrent layers (return sequences) → TimeDistributed(Dense(|V_fr|, softmax)).


In [None]:
# ---- Model Factory ----
def stacked_seq2seq(layer_type: str,
                    en_vocab: int, fr_vocab: int,
                    max_len_en: int, max_len_fr: int,
                    embed_dim: int = 128, units: int = 256,
                    enc_layers: int = 2, dec_layers: int = 2,
                    dropout: float = 0.2, rec_dropout: float = 0.2):
    """Build and compile a stacked encoder/decoder translation model.

    The encoder is an embedding (``mask_zero=True``) followed by ``enc_layers``
    recurrent layers; only the last one returns a single vector. That vector is
    repeated ``max_len_fr`` times and fed through ``dec_layers`` recurrent layers
    that all return sequences, followed by a per-timestep softmax over the
    French vocabulary.

    Args:
        layer_type: One of "SimpleRNN", "LSTM" or "GRU"; selects the Keras layer class.
        en_vocab: Source vocabulary size (input dimension of the embedding).
        fr_vocab: Target vocabulary size (width of the softmax).
        max_len_en: Length of the padded source sequences.
        max_len_fr: Length of the padded target sequences.
        embed_dim: Embedding width.
        units: Units in every recurrent layer.
        enc_layers: Number of encoder recurrent layers.
        dec_layers: Number of decoder recurrent layers.
        dropout: ``dropout`` argument of every recurrent layer.
        rec_dropout: ``recurrent_dropout`` argument of every recurrent layer.

    Returns:
        A compiled Keras model (Adam at 1e-3, sparse categorical cross-entropy,
        and a sparse categorical accuracy metric named ``tok_acc``).
    """
    assert layer_type in {"SimpleRNN", "LSTM", "GRU"}
    cell_cls = getattr(KL, layer_type)

    def recurrent(layer_name, emit_sequence):
        # Every recurrent layer shares width and dropout; only name and output mode vary.
        return cell_cls(units, return_sequences=emit_sequence, dropout=dropout,
                        recurrent_dropout=rec_dropout, name=layer_name)

    # Encoder: all but the last layer pass full sequences upward.
    source_ids = KL.Input(shape=(max_len_en,), name=f"{layer_type}_encoder_input")
    h = KL.Embedding(en_vocab, embed_dim, mask_zero=True, name=f"{layer_type}_src_embed")(source_ids)
    for depth in range(1, enc_layers):
        h = recurrent(f"{layer_type}_enc_{depth}", True)(h)
    h = recurrent(f"{layer_type}_enc_{enc_layers}", False)(h)

    # Bridge: present the sentence vector to the decoder at every target step.
    h = KL.RepeatVector(max_len_fr, name=f"{layer_type}_repeat")(h)

    # Decoder: every layer returns sequences.
    for depth in range(1, dec_layers + 1):
        h = recurrent(f"{layer_type}_dec_{depth}", True)(h)

    token_probs = KL.TimeDistributed(KL.Dense(fr_vocab, activation='softmax'), name=f"{layer_type}_classifier")(h)

    seq2seq = KM.Model(source_ids, token_probs, name=f"Stacked_{layer_type}_Seq2Seq")
    seq2seq.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
                    loss='sparse_categorical_crossentropy',
                    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='tok_acc')])
    return seq2seq

# Build the three variants in a fixed order (SimpleRNN, LSTM, GRU) with identical
# hyperparameters; only the recurrent layer class differs.
rnn_model, lstm_model, gru_model = (
    stacked_seq2seq(kind, en_vocab_size, fr_vocab_size, max_len_en, max_len_fr,
                    EMBED_DIM, UNITS, ENCODER_LAYERS, DECODER_LAYERS, DROPOUT, REC_DROPOUT)
    for kind in ("SimpleRNN", "LSTM", "GRU")
)

# Print each architecture for the report.
for m in (rnn_model, lstm_model, gru_model):
    m.summary(line_length=120)

## 5) Train All Three Models

We use the same hyperparameters across models for a fair comparison and employ early stopping + LR scheduling.


In [None]:
# ---- Callbacks ----
# Stop when val_loss has not improved for 5 epochs (restoring the best weights),
# and halve the learning rate after 2 epochs without improvement.
cbs = [
    KC.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    KC.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5, verbose=1)
]

# Train the three models one after another with identical settings.
history = {}
for name, model in [("rnn", rnn_model), ("lstm", lstm_model), ("gru", gru_model)]:
    print(f"\n=== Training {name.upper()} ===\n")
    hist = model.fit(
        X, Y_expanded,
        validation_split=VAL_SPLIT,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        shuffle=True,
        verbose=2,
        callbacks=cbs
    )
    # Store plain Python floats. Depending on the Keras version, the learning
    # rate logged by ReduceLROnPlateau is a NumPy scalar, which json.dump
    # rejects -- and that failure would only surface after all training is done.
    history[name] = {
        metric: [float(value) for value in values]
        for metric, values in hist.history.items()
    }

# Save histories for later plotting
with open("training_history_q1.json", "w") as f:
    json.dump(history, f, indent=2)

## 6) Decode Predictions & Evaluate

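We perform greedy decoding (argmax per timestep), strip `<sos>/<eos>`, and compute BLEU if `nltk` is available. Note that BLEU is computed over the full dataset, including the sentences the models were trained on, so treat it as an optimistic estimate.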


In [None]:
# ---- Utilities to decode ----
# Reverse lookups (token id -> word), built by inverting each tokenizer's word_index.
id2fr = {idx: w for w, idx in fr_tok.word_index.items()}
id2en = {idx: w for w, idx in en_tok.word_index.items()}

def seqs_to_texts_id2tok(seqs, id2tok):
    """Turn sequences of token ids back into space-joined strings.

    Id 0 (padding) is skipped, and ids that are missing from ``id2tok`` or map
    to an empty string are dropped.

    Args:
        seqs: Iterable of id sequences.
        id2tok: Mapping from token id to word.

    Returns:
        One string per input sequence.
    """
    def render(seq):
        words = (id2tok.get(tok_id, '') for tok_id in seq if tok_id != 0)
        return ' '.join(word for word in words if word)

    return [render(seq) for seq in seqs]

def greedy_decode(model, X_in, max_len_fr, id2fr, chunk_size: int = 4096):
    """Translate ``X_in`` by taking the most probable token at every timestep.

    Inputs are predicted ``chunk_size`` rows at a time and each chunk is reduced
    to token ids straight away, so the full (rows, timesteps, vocabulary)
    probability array for all of ``X_in`` is never held in memory at once.

    For every row, ``<sos>`` tokens are skipped and decoding stops at the first
    ``<eos>`` or at the first id that has no word in ``id2fr`` (which includes
    the padding id 0).

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns per-timestep
            probabilities with the vocabulary on the last axis.
        X_in: Padded source sequences; must support ``len()`` and row slicing.
        max_len_fr: Unused; kept so existing callers keep working.
        id2fr: Mapping from target token id to word.
        chunk_size: Number of rows sent to ``model.predict`` per call.

    Returns:
        A list with one space-joined translation per row of ``X_in``
        (empty list for empty input).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    texts = []
    for start in range(0, len(X_in), chunk_size):
        probs = model.predict(X_in[start:start + chunk_size], verbose=0)
        # tolist() yields built-in ints, which are cheaper dict keys than NumPy scalars.
        for row in probs.argmax(-1).tolist():
            toks = []
            for i in row:
                w = id2fr.get(i, '')
                if w == '<sos>':
                    continue
                if w == '<eos>' or w == '':
                    break
                toks.append(w)
            texts.append(' '.join(toks))
    return texts

# Build references (without <sos>/<eos>)
# Padding (id 0), the <sos>/<eos> markers and unknown ids are dropped. Each
# sentence's single reference is wrapped in its own list (nested list for corpus_bleu).
refs = [
    [[w for w in (id2fr.get(i, '') for i in row if i != 0)
      if w not in ('<sos>', '<eos>', '')]]
    for row in Y
]

def evaluate_model(model, name: str, sample_k: int = 10, sample_seed=None):
    """Decode all of ``X`` with ``model``, score it with BLEU and print sample translations.

    BLEU-2 and BLEU-4 (NLTK ``corpus_bleu`` with smoothing method 3) are computed
    against ``refs`` when NLTK is available. Note that ``X`` is the full dataset,
    i.e. it includes the sentences the model was fitted on.

    The printed samples are drawn from a dedicated, seeded generator rather than
    the global NumPy state, so every model evaluated with the same seed shows
    the same sentences and the outputs can be compared side by side.

    Args:
        model: Trained model to evaluate.
        name: Label used in the printed header.
        sample_k: Number of example translations to print (capped at ``len(X)``).
        sample_seed: Seed for choosing the examples; defaults to the notebook's ``SEED``.

    Returns:
        ``{"bleu2": ..., "bleu4": ...}``; both values are None when NLTK is unavailable.
    """
    preds = greedy_decode(model, X, max_len_fr, id2fr)

    # Compute BLEU if nltk is present
    bleu2 = bleu4 = None
    if nltk is not None:
        sf = SmoothingFunction().method3
        cand_tokens = [p.split() for p in preds]
        bleu2 = corpus_bleu(refs, cand_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=sf)
        bleu4 = corpus_bleu(refs, cand_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=sf)

    # Show samples: a private RNG keeps the selection identical across models.
    rng = np.random.RandomState(SEED if sample_seed is None else sample_seed)
    idxs = rng.choice(len(X), size=min(sample_k, len(X)), replace=False)
    print(f"\n---- {name.upper()} SAMPLE TRANSLATIONS ----\n")
    for i in idxs:
        en_txt = ' '.join([id2en.get(t, '') for t in X[i] if t != 0])
        # reference (detokenized), with the <sos>/<eos> markers removed
        ref_txt = ' '.join([id2fr.get(t, '') for t in Y[i] if t != 0])
        ref_txt = ref_txt.replace('<sos> ', '').replace(' <eos>', '')
        print("EN:", en_txt)
        print("GT:", ref_txt)
        print("PR:", preds[i])
        print('-'*80)

    return {"bleu2": bleu2, "bleu4": bleu4}

# Evaluate the three models in the same order they were trained.
scores = {
    name: evaluate_model(model, name, sample_k=10)
    for name, model in [("rnn", rnn_model), ("lstm", lstm_model), ("gru", gru_model)]
}

print("\nBLEU summary:")
print(json.dumps(scores, indent=2))

## 7) Plot Training Curves

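Loss and token-level accuracy, measured against the padded French target sequences. (These models do not use teacher forcing: the decoder's only input is the repeated encoder vector.)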

In [None]:
import json
import matplotlib.pyplot as plt

# Reload the per-model training histories from the JSON file written after
# training, so this cell can be run without retraining.
with open("training_history_q1.json", "r") as f:
    history = json.load(f)

def plot_metric(metric='loss'):
    """Plot one training metric for every model in ``history`` on a single figure.

    Each model contributes a solid training curve and, when ``val_<metric>`` is
    present in its history, a dashed validation curve.

    Args:
        metric: Key to read from each model's history (e.g. 'loss' or 'tok_acc').
    """
    val_key = f"val_{metric}"
    plt.figure()
    for run_name, curves in history.items():
        plt.plot(curves[metric], label=f"{run_name}-{metric}")
        if val_key not in curves:
            continue
        plt.plot(curves[val_key], linestyle='--', label=f"{run_name}-val_{metric}")
    plt.xlabel('epoch')
    plt.ylabel(metric)
    plt.title(metric)
    plt.grid(True)
    plt.legend()
    plt.show()

# One figure per metric: the loss, then the token accuracy metric ('tok_acc').
plot_metric('loss')
plot_metric('tok_acc')