In [18]:

import os
import re
import unicodedata
import random

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from nltk.translate.bleu_score import corpus_bleu

In [19]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [20]:
path_to_file = '/content/fra.txt'

In [21]:
with open(path_to_file, "r", encoding="utf-8") as f:
    lines = f.read().strip().split("\n")

In [22]:
NUM_SAMPLES = 10000
pairs_raw = lines[-NUM_SAMPLES:]
print("Using last", NUM_SAMPLES, "sentence pairs.")

Using last 10000 sentence pairs.


In [23]:
def unicode_to_ascii(s):
    # Normalize accents: "é" -> "e"
    return "".join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
    )

def preprocess_sentence(s):
    # Lowercase, strip, remove accents
    s = unicode_to_ascii(s.lower().strip())

    # Put spaces around punctuation we want to keep as tokens
    s = re.sub(r"([?.!,¿])", r" \1 ", s)

    # Remove anything that's not a letter, punctuation or space
    s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)

    # Collapse multiple spaces
    s = re.sub(r"\s+", " ", s).strip()

    return s

# Test the preprocessing
print(preprocess_sentence("I'm very happy!!! À bientôt."))


i m very happy ! ! ! a bientot .


In [24]:
# Building sentence pairs (English, French)
#         and add <start> / <end> tokens to French

eng_texts = []
fra_texts = []          # with <start> and <end>
fra_texts_target = []   # target sentences without <start> (for convenience)

for line in pairs_raw:
    eng, fra, *_ = line.split("\t")

    eng_clean = preprocess_sentence(eng)
    fra_clean = preprocess_sentence(fra)

    # Add special tokens for target language
    fra_in = "<start> " + fra_clean
    fra_out = fra_clean + " <end>"

    eng_texts.append(eng_clean)
    fra_texts.append(fra_in)
    fra_texts_target.append(fra_out)

print("Examples:")
for i in range(3):
    print(f"EN: {eng_texts[i]}")
    print(f"FR in : {fra_texts[i]}")
    print(f"FR out: {fra_texts_target[i]}")
    print("---")

num_sentences = len(eng_texts)
print("Total cleaned sentence pairs:", num_sentences)


Examples:
EN: even tom wasn t able to answer the teacher s questions .
FR in : <start> meme tom n a pas pu repondre aux questions du professeur .
FR out: meme tom n a pas pu repondre aux questions du professeur . <end>
---
EN: even if it costs , yen , i must buy the dictionary .
FR in : <start> je dois acheter ce dictionnaire , meme s il coute . yens .
FR out: je dois acheter ce dictionnaire , meme s il coute . yens . <end>
---
EN: even if she comes to see me , tell her i am not at home .
FR in : <start> meme si elle devait passer , dis lui que je ne suis pas a la maison .
FR out: meme si elle devait passer , dis lui que je ne suis pas a la maison . <end>
---
Total cleaned sentence pairs: 10000


In [25]:
# Tokenization and integer sequences

# Tokenizer for English
eng_tokenizer = Tokenizer(filters="", lower=False)  # data already lowercased
eng_tokenizer.fit_on_texts(eng_texts)
eng_sequences = eng_tokenizer.texts_to_sequences(eng_texts)

# Tokenizer for French
fra_tokenizer = Tokenizer(filters="", lower=False)
fra_tokenizer.fit_on_texts(fra_texts + fra_texts_target)  # include both in/out
fra_sequences_in = fra_tokenizer.texts_to_sequences(fra_texts)
fra_sequences_out = fra_tokenizer.texts_to_sequences(fra_texts_target)

# Vocabulary sizes (+1 for padding index 0)
num_encoder_tokens = len(eng_tokenizer.word_index) + 1
num_decoder_tokens = len(fra_tokenizer.word_index) + 1

print("English vocab size:", num_encoder_tokens)
print("French vocab size :", num_decoder_tokens)

# Find max sequence lengths
max_len_eng = max(len(seq) for seq in eng_sequences)
max_len_fra = max(len(seq) for seq in fra_sequences_in)

print("Max English length:", max_len_eng)
print("Max French length :", max_len_fra)


English vocab size: 7595
French vocab size : 10105
Max English length: 70
Max French length : 74


In [26]:
# ============================================================
# Cell 6: Padding and train-test split (80/20)
# ============================================================

# Pad sequences
encoder_input_data = pad_sequences(eng_sequences, maxlen=max_len_eng, padding="post")
decoder_input_data = pad_sequences(fra_sequences_in, maxlen=max_len_fra, padding="post")
decoder_target_data = pad_sequences(fra_sequences_out, maxlen=max_len_fra, padding="post")

print("encoder_input_data shape:", encoder_input_data.shape)
print("decoder_input_data shape:", decoder_input_data.shape)
print("decoder_target_data shape:", decoder_target_data.shape)

# Train/test split: first 80% train, last 20% test
train_size = int(0.8 * num_sentences)

encoder_input_train = encoder_input_data[:train_size]
decoder_input_train = decoder_input_data[:train_size]
decoder_target_train = decoder_target_data[:train_size]

encoder_input_test = encoder_input_data[train_size:]
decoder_input_test = decoder_input_data[train_size:]
decoder_target_test = decoder_target_data[train_size:]

print("Training samples:", encoder_input_train.shape[0])
print("Test samples     :", encoder_input_test.shape[0])

# For sparse_categorical_crossentropy, we need an extra dimension for targets
decoder_target_train = np.expand_dims(decoder_target_train, -1)
decoder_target_test = np.expand_dims(decoder_target_test, -1)
print("decoder_target_train shape (for sparse loss):", decoder_target_train.shape)


encoder_input_data shape: (10000, 70)
decoder_input_data shape: (10000, 74)
decoder_target_data shape: (10000, 74)
Training samples: 8000
Test samples     : 2000
decoder_target_train shape (for sparse loss): (8000, 74, 1)


In [27]:
# Helper dictionaries for word <-> index mapping

eng_index_to_word = {idx: w for w, idx in eng_tokenizer.word_index.items()}
fra_index_to_word = {idx: w for w, idx in fra_tokenizer.word_index.items()}

def decode_sequence_indices(indices, idx_to_word):
    words = []
    for idx in indices:
        if idx == 0:
            continue
        word = idx_to_word.get(idx, "")
        if word == "<end>":
            break
        words.append(word)
    return " ".join(words)

print("Example decoded English sentence from padded indices:")
print(decode_sequence_indices(encoder_input_data[0], eng_index_to_word))


Example decoded English sentence from padded indices:
even tom wasn t able to answer the teacher s questions .


In [28]:
# Function to build Seq2Seq model (Encoder-Decoder)

def build_seq2seq_model(latent_dim, embedding_dim=256):
    """
    Build a standard Encoder-Decoder LSTM model with given latent_dim.
    Returns:
        model, encoder_inputs, encoder_states,
        decoder_inputs, decoder_lstm, decoder_dense, decoder_embedding
    """
    # Encoder
    encoder_inputs = Input(shape=(max_len_eng,), name="encoder_inputs")
    encoder_embedding = Embedding(
        input_dim=num_encoder_tokens,
        output_dim=embedding_dim,
        mask_zero=True,
        name="encoder_embedding",
    )(encoder_inputs)
    encoder_lstm = LSTM(latent_dim, return_state=True, name="encoder_lstm")
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(max_len_fra,), name="decoder_inputs")
    decoder_embedding_layer = Embedding(
        input_dim=num_decoder_tokens,
        output_dim=embedding_dim,
        mask_zero=True,
        name="decoder_embedding",
    )
    decoder_embedding = decoder_embedding_layer(decoder_inputs)
    decoder_lstm = LSTM(
        latent_dim,
        return_sequences=True,
        return_state=True,
        name="decoder_lstm",
    )
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(num_decoder_tokens, activation="softmax", name="decoder_dense")
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(
        optimizer="rmsprop",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    return (
        model,
        encoder_inputs,
        encoder_states,
        decoder_inputs,
        decoder_lstm,
        decoder_dense,
        decoder_embedding_layer,
    )

# Quick sanity check
test_model, *_ = build_seq2seq_model(latent_dim=128)
test_model.summary()


In [29]:
# Build inference models (encoder_model, decoder_model)

def build_inference_models(
    encoder_inputs,
    encoder_states,
    decoder_inputs,
    decoder_lstm,
    decoder_dense,
    decoder_embedding_layer,
    latent_dim,
):
    # Encoder model for inference
    encoder_model = Model(encoder_inputs, encoder_states)

    # Decoder setup for inference
    # We need new input tensors for hidden states
    decoder_state_input_h = Input(shape=(latent_dim,), name="decoder_state_input_h")
    decoder_state_input_c = Input(shape=(latent_dim,), name="decoder_state_input_c")
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    # Decoder input (one token at a time)
    decoder_single_input = Input(shape=(1,), name="decoder_single_input")
    dec_emb2 = decoder_embedding_layer(decoder_single_input)

    decoder_outputs2, state_h2, state_c2 = decoder_lstm(
        dec_emb2, initial_state=decoder_states_inputs
    )
    decoder_states2 = [state_h2, state_c2]

    decoder_outputs2 = decoder_dense(decoder_outputs2)

    decoder_model = Model(
        [decoder_single_input] + decoder_states_inputs,
        [decoder_outputs2] + decoder_states2,
    )

    return encoder_model, decoder_model

In [30]:
# Decoding function (greedy search)

start_token = fra_tokenizer.word_index["<start>"]
end_token = fra_tokenizer.word_index["<end>"]

def decode_sequence_greedy(input_seq, encoder_model, decoder_model, max_target_len=None):
    if max_target_len is None:
        max_target_len = max_len_fra

    states_value = encoder_model.predict(input_seq, verbose=0)

    target_seq = np.array([[start_token]], dtype="int32")
    decoded_tokens = []

    for _ in range(max_target_len):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        if sampled_token_index == 0:
            break

        sampled_word = fra_index_to_word.get(sampled_token_index, "")

        if sampled_word == "<end>":
            break

        decoded_tokens.append(sampled_word)
        target_seq = np.array([[sampled_token_index]], dtype="int32")
        states_value = [h, c]

    return " ".join(decoded_tokens)


In [31]:
import time

def train_and_evaluate(
    latent_dim,
    epochs=10,
    batch_size=64,
    eval_samples=200,       # how many test samples to use for BLEU
    max_decode_len=25       # max tokens to generate per sentence
):
    print("\n" + "=" * 60)
    print(f"Training model with latent_dim = {latent_dim}")
    print("=" * 60)

    t0 = time.time()
    print("[1] Building model...")

    (
        model,
        encoder_inputs,
        encoder_states,
        decoder_inputs,
        decoder_lstm,
        decoder_dense,
        decoder_embedding_layer,
    ) = build_seq2seq_model(latent_dim=latent_dim)

    print(f"[1] Model built in {time.time() - t0:.2f} sec")

    print("\n[2] Starting Training...")
    t1 = time.time()

    history = model.fit(
        [encoder_input_train, decoder_input_train],
        decoder_target_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
    )

    print(f"[2] Training finished in {time.time() - t1:.2f} sec")

    print("\n[3] Building inference models...")
    t2 = time.time()

    encoder_model, decoder_model = build_inference_models(
        encoder_inputs,
        encoder_states,
        decoder_inputs,
        decoder_lstm,
        decoder_dense,
        decoder_embedding_layer,
        latent_dim,
    )

    print(f"[3] Inference models built in {time.time() - t2:.2f} sec")

    # ------------ BLEU EVALUATION (LIGHT) ------------
    print("\n[4] Starting BLEU evaluation on subset...")
    t3 = time.time()

    references = []
    candidates = []

    total_test = encoder_input_test.shape[0]
    num_test_samples = min(eval_samples, total_test)
    print(f"[4] Using {num_test_samples} / {total_test} test samples for BLEU")

    for i in range(num_test_samples):
        if i % 50 == 0:
            print(f"  Decoding sample {i}/{num_test_samples} ...")

        input_seq = encoder_input_test[i : i + 1]

        decoded_sentence = decode_sequence_greedy(
            input_seq,
            encoder_model,
            decoder_model,
            max_target_len=max_decode_len,   # <-- shorter decoding
        )

        ref_indices = decoder_target_test[i].squeeze()
        ref_tokens = []
        for idx in ref_indices:
            if idx == 0:
                continue
            word = fra_index_to_word.get(idx, "")
            if word == "<end>" or word == "<start>":
                continue
            ref_tokens.append(word)

        references.append([ref_tokens])
        candidates.append(decoded_sentence.split())

    print(f"[4] BLEU loop completed in {time.time() - t3:.2f} sec")

    bleu_score = corpus_bleu(references, candidates)
    print(f"\nBLEU score for latent_dim = {latent_dim}: {bleu_score:.4f}")

    # ------------ QUALITATIVE EXAMPLES (FEW) ------------
    print("\n[5] Generating qualitative examples...")
    t4 = time.time()

    for i in range(3):   # show only 3 instead of 5
        print(f"  Example {i+1}/3")
        idx = random.randint(0, total_test - 1)
        input_seq = encoder_input_test[idx : idx + 1]
        decoded_sentence = decode_sequence_greedy(
            input_seq,
            encoder_model,
            decoder_model,
            max_target_len=max_decode_len,
        )

        eng_original = decode_sequence_indices(
            encoder_input_test[idx], eng_index_to_word
        )
        true_fra = decode_sequence_indices(
            decoder_input_test[idx], fra_index_to_word
        )

        print("-" * 40)
        print("EN (input)   :", eng_original)
        print("FR (true)    :", true_fra)
        print("FR (decoded) :", decoded_sentence)

    print(f"[5] Examples finished in {time.time() - t4:.2f} sec")

    total_time = time.time() - t0
    print("\n[ DONE ] Total time =", round(total_time, 2), "seconds")
    print("="*60)

    return model, encoder_model, decoder_model, bleu_score


In [32]:
model_128, enc_128, dec_128, bleu_128 = train_and_evaluate(
    latent_dim=128,
    epochs=10,
    batch_size=64,
    eval_samples=200,     # subset of test set
    max_decode_len=25     # cap decoded length
)

print("BLEU (128 units):", bleu_128)


Training model with latent_dim = 128
[1] Building model...
[1] Model built in 0.05 sec

[2] Starting Training...
Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 108ms/step - accuracy: 0.0148 - loss: 7.9695 - val_accuracy: 0.0135 - val_loss: 6.1341
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 104ms/step - accuracy: 0.0142 - loss: 6.0271 - val_accuracy: 0.0190 - val_loss: 6.0368
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 111ms/step - accuracy: 0.0183 - loss: 5.9223 - val_accuracy: 0.0202 - val_loss: 5.9870
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 110ms/step - accuracy: 0.0218 - loss: 5.8557 - val_accuracy: 0.0294 - val_loss: 5.8994
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 109ms/step - accuracy: 0.0318 - loss: 5.7370 - val_accuracy: 0.0343 - val_loss: 5.8182
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


----------------------------------------
EN (input)   : i don t think that i want to answer any more of your questions right now .
FR (true)    : <start> je ne pense pas avoir envie de repondre a plus de vos questions pour le moment .
FR (decoded) : je ne ne que je ne ne que je ne ne que je ne ne que je ne ne a de .
  Example 2/3
----------------------------------------
EN (input)   : don t get so irritated . rushing things will cost you more time in the end .
FR (true)    : <start> ne t agace pas comme cela . se precipiter te fera perdre du temps en fin de compte .
FR (decoded) : je ne ne que je ne ne que je ne ne que je ne ne a de a de .
  Example 3/3
----------------------------------------
EN (input)   : a more experienced lawyer would have dealt with the case in a different way .
FR (true)    : <start> un avocat plus experimente aurait traite l affaire differemment .
FR (decoded) : je ne ne a de a de a de a de a de a de a de a de a de a .
[5] Examples finished in 6.28 sec

[ DONE 

In [33]:
results = {}
for latent_dim in [128, 256, 512]:
    model, enc, dec, bleu = train_and_evaluate(
        latent_dim=latent_dim,
        epochs=10,
        batch_size=64,
        eval_samples=200,
        max_decode_len=25,
    )
    results[latent_dim] = bleu

print("\nSummary of BLEU scores:")
for d, b in results.items():
    print(f"latent_dim = {d}: BLEU = {b:.4f}")


Training model with latent_dim = 128
[1] Building model...
[1] Model built in 0.06 sec

[2] Starting Training...
Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 111ms/step - accuracy: 0.0142 - loss: 7.9307 - val_accuracy: 0.0135 - val_loss: 6.1548
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 109ms/step - accuracy: 0.0136 - loss: 6.0599 - val_accuracy: 0.0135 - val_loss: 6.0998
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 110ms/step - accuracy: 0.0141 - loss: 5.9920 - val_accuracy: 0.0189 - val_loss: 6.0493
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 104ms/step - accuracy: 0.0195 - loss: 5.9099 - val_accuracy: 0.0334 - val_loss: 5.9080
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 102ms/step - accuracy: 0.0320 - loss: 5.7508 - val_accuracy: 0.0339 - val_loss: 5.8144
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [39]:
for i,j in results.items():
  print('latent_dim:',i,' '*3,'BLEU:',j)

latent_dim: 128     BLEU: 4.1761439764183904e-79
latent_dim: 256     BLEU: 4.6017578836795294e-79
latent_dim: 512     BLEU: 5.827555451330451e-79


Discuss how sequence length affect performance?

Sequence length has a strong negative effect on Seq2Seq performance. Longer sentences make it harder for the encoder to compress all information into a fixed-length vector, and the decoder struggles to generate long, coherent outputs. Errors compound over time, and BLEU-4 severely penalizes mismatches on long sequences. As a result, our model produces repetitive or generic patterns and ends up with near-zero BLEU scores across all hidden sizes. Increasing the LSTM size (128 → 256 → 512) provides minor improvement, but does not overcome the limitations imposed by long input and output sequences in a vanilla Seq2Seq architecture without attention.