In [None]:
#  %pip install torchinfo

Note: you may need to restart the kernel to use updated packages.


In [27]:
# %pip install numpy==2.0
# %pip install "numpy<2.0"



Collecting numpy==2.0
  Using cached numpy-2.0.0-cp310-cp310-macosx_14_0_arm64.whl.metadata (60 kB)
Using cached numpy-2.0.0-cp310-cp310-macosx_14_0_arm64.whl (5.2 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-2.0.0
Note: you may need to restart the kernel to use updated packages.
Collecting numpy<2.0
  Using cached numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl (14.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.0
    Uninstalling numpy-2.0.0:
      Successfully uninstalled numpy-2.0.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.

In [28]:
import torch                                                # PyTorch main package for tensor operations and deep learning
import torch.nn as nn                                       # PyTorch module for building neural network layers
import torch.optim as optim                                 # PyTorch module for optimization algorithms (e.g., Adam, SGD)
import sacrebleu                                            # Library for calculating BLEU score (translation quality metric)
from torchtext.data.utils import get_tokenizer              # Utility to get tokenizers for text preprocessing
from torchtext.vocab import build_vocab_from_iterator       # Function to build vocabulary from tokenized data
from torchtext.datasets import Multi30k                     # Multi30k dataset for English-German/French translation tasks
from typing import Tuple                                    # Type hinting for functions that return tuples
import spacy                                                # NLP library for tokenization and linguistic features
import warnings
warnings.filterwarnings("ignore")
import numpy as np



In [29]:
print(torch.__version__)                #check the version of pytorch

2.1.0


In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        """
        Initializes the MultiHeadSelfAttention module.

        Args:
        d_model (int): Total dimension of the model.
        num_heads (int): Number of attention heads.
        """
        super(MultiHeadSelfAttention, self).__init__()

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # Ensure the model dimension is divisible by number of heads
        assert self.head_dim * num_heads == d_model, "d_model must be divisible by num_heads"

        # Linear layers to project input into Q, K, V
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        # Final linear layer after concatenating attention output from all heads
        self.wo = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """
        Forward pass for MultiHeadSelfAttention.

        Args:
        q (Tensor): Query tensor of shape (batch_size, seq_length, d_model).
        k (Tensor): Key tensor of shape (batch_size, seq_length, d_model).
        v (Tensor): Value tensor of shape (batch_size, seq_length, d_model).
        mask (Tensor, optional): Mask tensor. Defaults to None.

        Returns:
        Tensor: Output tensor of shape (batch_size, seq_length, d_model).
        """
        batch_size = q.size(0)

        # Apply linear layers and split into multiple heads
        # Output shape: (batch_size, num_heads, seq_length, head_dim)
        q = self.wq(q).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.wk(k).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.wv(v).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)  # shape: (batch_size, num_heads, seq_length, seq_length)

        if mask is not None:
            # Apply mask (e.g., for causal or padding attention)
            attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))

        attn_weights = F.softmax(attn_scores, dim=-1)  # Normalize over keys
        attn_output = torch.matmul(attn_weights, v)    # shape: (batch_size, num_heads, seq_length, head_dim)

        # Concatenate heads
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        # Final linear projection
        out = self.wo(attn_output)

        return out


### Multi-head self-attention splits the input into multiple attention "heads" to learn different patterns.
### Queries, Keys, and Values are projected from the same input using linear layers.
### Scaled dot-product attention is applied to compute weighted representations.
### All heads are concatenated and passed through a final linear projection to get the final output.

# Position- wise Feed Forward Networks

In [31]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout = 0.1):
        """
        Initialize the PositionwiseFeedForward module.
        
        Args:
        d_model (int): The dimensionality of the input.
        d_ff (int): The dimensionality of the hidden layer in the feed-forward network.
        dropout (float, optional): The dropout probability. Defaults to 0.1.
        """
        super(PositionwiseFeedForward,self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self,x):
        """
        Forward pass for PositionwiseFeedForward.
        
        Args:
        x (Tensor): The input tensor of shape (batch_size, seq_length, d_model).
        
        Returns:
        Tensor: The output tensor of shape (batch_size, seq_length, d_model).
        """

        out = self.linear1(x)
        out = F.relu(out)
        out = self.dropout(out)
        out = self.linear2(out)

        return out



# Add and Norm

In [32]:
class AddNorm(nn.Module):
    def __init__(self, d_model, eps =1e-6):
        """
        Initialize the AddNorm module.

        Args:
        d_model (int): The dimensionality of the input.
        eps (float, optional): A small constant for numerical stability. Defaults to 1e-6.
        """
        super(AddNorm,self).__init__()
        self.norm = nn.LayerNorm(d_model,eps = eps)

    
    def forward(self,x,residual):
        """
        Forward pass for AddNorm.

        Args:
        x (Tensor): The input tensor of shape (batch_size, seq_length, d_model).
        residual (Tensor): The residual tensor of the same shape as the input tensor.

        Returns:
        Tensor: The output tensor of shape (batch_size, seq_length, d_model).
        """

        out = x +residual
        out = self.norm(out)

        return out


#  Positional Encoding

In [33]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_len, dropout = 0.1):
        """
        Initialize the PositionalEncoding module.

        Args:
        d_model (int): The dimensionality of the input.
        max_seq_len (int): The maximum length of the input sequence.
        dropout (float, optional): The dropout probability. Defaults to 0.1.
        """

        super(PositionalEncoding,self).__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_seq_len,d_model)
        position = torch.arange(0, max_seq_len, dtype = torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model,2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:,0::2] = torch.sin(position * div_term)
        pe[:,1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)


    def forward(self, x):
        """
        Forward pass for PositionalEncoding.

        Args:
        x (Tensor): The input tensor of shape (batch_size, seq_length, d_model).

        Returns:
        Tensor: The output tensor of shape (batch_size, seq_length, d_model).
        """

        x = x + self.pe[:, :x.size(1), :]
        x = self.dropout(x)

        return x
            



## Encoder Block

The Encoder Block in the Transformer architecture consists of the following layers:

- Multi-Head Self-Attention layer
- Add & Norm (Residual connection and Layer Normalization)
- Position-wise Feed-Forward Network layer
- Add & Norm (Residual connection and Layer Normalization)

In the Transformer, multiple encoder blocks (6 according to the paper) are stacked on top of each other to form the complete encoder module.

In [34]:
class EncoderBlock(nn.Module):
    def __init__(self,d_model,num_heads,d_ff, dropout = 0.1):
        """
        Initialize the EncoderBlock module.

        Args:
        d_model (int): The dimensionality of the input.
        num_heads (int): The number of attention heads.
        d_ff (int): The dimensionality of the hidden layer in the feed-forward network.
        dropout (float, optional): The dropout probability. Defaults to 0.1.
        """

        super(EncoderBlock,self).__init__()
        self.self_attn = MultiHeadSelfAttention(d_model,num_heads)
        self.norm1 = AddNorm(d_model)
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm2 = AddNorm(d_model)


    def forward(self, x , mask = None):
        x1 = self.self_attn(x,x,x,mask)
        x = self.norm1(x,x1)
        x1 = self.ffn(x)
        x = self.norm2(x,x1)

        return x


## Decoder

The Decoder Block in the Transformer architecture consists of the following layers:

- Masked Multi-Head Self-Attention layer
- Add & Norm (Residual connection and Layer Normalization)
- Encoder-Decoder Multi-Head Attention layer
- Add & Norm (Residual connection and Layer Normalization)
- Position-wise Feed-Forward Network layer
- Add & Norm (Residual connection and Layer Normalization)

In the Transformer, multiple decoder blocks are stacked on top of each other to form the complete decoder module.

In [35]:

class DecoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        """
        Initializes the DecoderBlock module.

        Args:
        d_model (int): The dimensionality of the input.
        num_heads (int): The number of attention heads.
        d_ff (int): The dimensionality of the hidden layer in the feed-forward network.
        dropout (float): The dropout probability.
        """
        super(DecoderBlock, self).__init__()

        # 1. Masked Multi-Head Self-Attention (causal)
        self.self_attn = MultiHeadSelfAttention(d_model, num_heads)
        self.norm1 = AddNorm(d_model)

        # 2. Encoder-Decoder Multi-Head Attention
        self.enc_dec_attn = MultiHeadSelfAttention(d_model, num_heads)
        self.norm2 = AddNorm(d_model)

        # 3. Feed Forward Network
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm3 = AddNorm(d_model)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """
        Forward pass for DecoderBlock.

        Args:
        x (Tensor): Target sequence input (batch_size, tgt_seq_len, d_model)
        enc_output (Tensor): Encoder output (batch_size, src_seq_len, d_model)
        src_mask (Tensor, optional): Encoder mask. Shape (batch_size, 1, 1, src_seq_len)
        tgt_mask (Tensor, optional): Decoder mask. Shape (batch_size, 1, tgt_seq_len, tgt_seq_len)

        Returns:
        Tensor: Output tensor (batch_size, tgt_seq_len, d_model)
        """
        # 1. Masked Self-Attention with target mask
        attn1 = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x, attn1)  # Residual + LayerNorm

        # 2. Encoder-Decoder Attention with source mask
        attn2 = self.enc_dec_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x, attn2)

        # 3. Feed Forward Network
        ffn_out = self.ffn(x)
        x = self.norm3(x, ffn_out)

        return x


## The Transformer

Now that we have implemented all the building blocks, let's assemble the complete Transformer architecture.

We initialize the following components:

- Source and target embedding layers
- Positional encoding module
- Encoder and decoder layer stacks
- Final linear layer to produce the probability distribution over the target vocabulary

In the forward method, we first pass the source and target input tensors through their respective embedding layers and add the positional encoding. Then, we pass the source input through each encoder layer sequentially, followed by passing the target input and encoder output through each decoder layer sequentially. Finally, we apply the linear layer to produce the output tensor with shape (batch_size, tgt_seq_length, tgt_vocab_size).

In [36]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, D_MODEL, num_heads, d_ff, max_seq_len, num_layers, dropout=0.1):
        """
        Initialize the Transformer module.

        Args:
        src_vocab_size (int): The size of the source vocabulary.
        tgt_vocab_size (int): The size of the target vocabulary.
        d_model (int): The dimensionality of the embedding
        num_heads (int): The number of attention heads.
        d_ff (int): The dimensionality of the hidden layer in the feed-forward network.
        max_seq_len (int): The maximum length of the input sequence.
        num_layers (int): The number of layers in the encoder and decoder.
        dropout (float, optional): The dropout probability. Defaults to 0.1.
        """
        super(Transformer, self).__init__()

        self.src_embedding = nn.Embedding(src_vocab_size, D_MODEL)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, D_MODEL)
        self.pos_encoding = PositionalEncoding(D_MODEL, max_seq_len, dropout)

        self.encoder_layers = nn.ModuleList([EncoderBlock(D_MODEL, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderBlock(D_MODEL, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(D_MODEL, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """
        Forward pass for Transformer.

        Args:
        src (Tensor): The source input tensor of shape (batch_size, src_seq_length).
        tgt (Tensor): The target input tensor of shape (batch_size, tgt_seq_length).
        src_mask (Tensor, optional): The source mask tensor for ignoring certain elements. Defaults to None.
        tgt_mask (Tensor, optional): The target mask tensor for ignoring certain elements. Defaults to None.

        Returns:
        Tensor: The output tensor of shape (batch_size, tgt_seq_length, tgt_vocab_size).
        """
        src = self.src_embedding(src)
        src = self.pos_encoding(src)

        tgt = self.tgt_embedding(tgt)
        tgt = self.pos_encoding(tgt)

        for layer in self.encoder_layers:
            src = layer(src, src_mask)

        for layer in self.decoder_layers:
            tgt = layer(tgt, src, src_mask, tgt_mask)

        out = self.fc(tgt)

        return out

In [37]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

def visualize_transformer(num_encoder_layers, num_decoder_layers, num_heads, d_model):
    """
    Visualizes a high-level architecture of the Transformer model.

    Args:
        num_encoder_layers (int): Number of encoder layers.
        num_decoder_layers (int): Number of decoder layers.
        num_heads (int): Number of attention heads.
        d_model (int): Embedding dimension.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.axis('off')

    # Encoder blocks
    encoder_x = 1
    encoder_y_start = 1
    encoder_height = 0.6
    encoder_gap = 0.15
    for i in range(num_encoder_layers):
        rect = mpatches.FancyBboxPatch(
            (encoder_x, encoder_y_start + i * (encoder_height + encoder_gap)),
            1.2, encoder_height,
            boxstyle="round,pad=0.02",
            edgecolor='navy', facecolor='#cce5ff', linewidth=2
        )
        ax.add_patch(rect)
        ax.text(encoder_x + 0.6, encoder_y_start + i * (encoder_height + encoder_gap) + encoder_height/2,
                f'Encoder Layer {i+1}\n(Multi-Head x{num_heads})', ha='center', va='center', fontsize=10)

    # Decoder blocks
    decoder_x = 5
    decoder_y_start = 1
    decoder_height = 0.6
    decoder_gap = 0.15
    for i in range(num_decoder_layers):
        rect = mpatches.FancyBboxPatch(
            (decoder_x, decoder_y_start + i * (decoder_height + decoder_gap)),
            1.2, decoder_height,
            boxstyle="round,pad=0.02",
            edgecolor='darkgreen', facecolor='#d4edda', linewidth=2
        )
        ax.add_patch(rect)
        ax.text(decoder_x + 0.6, decoder_y_start + i * (decoder_height + decoder_gap) + decoder_height/2,
                f'Decoder Layer {i+1}\n(Multi-Head x{num_heads})', ha='center', va='center', fontsize=10)

    # Embedding and Positional Encoding
    ax.text(encoder_x - 0.7, encoder_y_start + (num_encoder_layers * (encoder_height + encoder_gap))/2,
            'Input\nEmbedding\n+\nPositional\nEncoding', ha='center', va='center', fontsize=11, bbox=dict(boxstyle="round", fc="#f8d7da", ec="crimson"))
    ax.text(decoder_x - 0.7, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2,
            'Output\nEmbedding\n+\nPositional\nEncoding', ha='center', va='center', fontsize=11, bbox=dict(boxstyle="round", fc="#fff3cd", ec="#856404"))

    # Output Linear Layer
    ax.text(decoder_x + 2.2, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2,
            'Linear\n& Softmax', ha='center', va='center', fontsize=11, bbox=dict(boxstyle="round", fc="#d1ecf1", ec="#0c5460"))

    # Arrows: Input -> Encoder
    ax.annotate('', xy=(encoder_x, encoder_y_start + (num_encoder_layers * (encoder_height + encoder_gap))/2),
                xytext=(encoder_x - 0.2, encoder_y_start + (num_encoder_layers * (encoder_height + encoder_gap))/2),
                arrowprops=dict(facecolor='black', arrowstyle='->', lw=2))

    # Arrows: Encoder -> Decoder (cross attention)
    ax.annotate('', xy=(encoder_x + 1.2, encoder_y_start + (num_encoder_layers * (encoder_height + encoder_gap))/2),
                xytext=(decoder_x, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2),
                arrowprops=dict(facecolor='gray', arrowstyle='->', lw=2, linestyle='dashed'))

    # Arrows: Output Embedding -> Decoder
    ax.annotate('', xy=(decoder_x, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2),
                xytext=(decoder_x - 0.2, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2),
                arrowprops=dict(facecolor='black', arrowstyle='->', lw=2))

    # Arrows: Decoder -> Linear
    ax.annotate('', xy=(decoder_x + 1.2, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2),
                xytext=(decoder_x + 2.0, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2),
                arrowprops=dict(facecolor='black', arrowstyle='->', lw=2))

    # Title and legend
    ax.set_title(f"Transformer Architecture\n(Encoder Layers: {num_encoder_layers}, Decoder Layers: {num_decoder_layers}, Heads: {num_heads}, d_model: {d_model})", fontsize=14, pad=20)
    plt.xlim(0, 8)
    plt.ylim(0, 3 + max(num_encoder_layers, num_decoder_layers) * (encoder_height + encoder_gap) / 2)
    plt.tight_layout()
    plt.show()

# Example usage:
visualize_transformer(num_encoder_layers=6, num_decoder_layers=6, num_heads=8, d_model=512)


ModuleNotFoundError: No module named 'matplotlib'

This image illustrates the **Transformer Architecture**, a foundational model used in many modern NLP systems such as BERT, GPT, and T5. It visualizes how information flows through the **encoder-decoder structure** of a transformer.

Let’s break it down **intuitively and step by step**:

---

## 🧠 High-Level Summary

* The **left side** is the **Encoder**, which reads and understands the input.
* The **right side** is the **Decoder**, which generates the output (like a translated sentence).
* In between, the encoder and decoder **communicate** using attention mechanisms.
* The **transformer uses 8 attention heads** and **6 stacked layers** in both encoder and decoder, with a model dimension (`d_model`) of 512.

---

## 🔴 Step 1: Input Embedding + Positional Encoding

* Each word/token in the input sequence is **converted to a dense vector (embedding)**.
* Since transformers don’t understand order, **positional encoding** is added to inject the notion of word order (e.g., who comes first in the sentence).

📦 Example:

> `"I love transformers"` → becomes a matrix of shape `(seq_len, 512)`.

---

## 🔵 Encoder Stack (Left Side)

There are **6 identical encoder layers**, each with:

1. **Multi-head self-attention**
   → Every word looks at every other word (including itself) to understand context.

   > e.g., “bank” can mean money or river — attention helps disambiguate it by context.

2. **Feedforward network**
   → A small neural network to refine each word’s representation.

3. **Residual connections + LayerNorm**
   → Helps in stabilizing training and preserving input signals.

📌 Output: A context-enriched representation for each word.

---

## 🟡 Output Embedding + Positional Encoding

This is for the **decoder input** (often previous tokens during training or inference).

* The decoder also needs **positional info**.
* It uses **shifted right** sequences during training (i.e., we don't feed the full output at once, only up to the current word).

---

## 🟢 Decoder Stack (Right Side)

Also has **6 layers**, and each contains:

1. **Masked Multi-head self-attention**
   → Each position can only attend to previous tokens (to prevent cheating during generation).

2. **Encoder-Decoder attention**
   → The decoder attends to encoder outputs — this is how the decoder knows what the input meant.

3. **Feedforward network**
   → Like in the encoder, applies transformation to each position.

Each layer builds a **richer representation** of the output sequence being generated.

---

## 🔷 Final Step: Linear + Softmax

* After decoder layers, the final output goes through a **linear layer** followed by **softmax** to predict the next word.
* This output is a probability distribution over the vocabulary.

---

## 🔁 Example: English to French Translation

```text
Input: "I love transformers"
↓
Encoder processes this and creates contextual embeddings
↓
Decoder begins with: "<start>" token
↓
Decoder predicts "J'"
↓
Then uses "J'" + context to predict "aime"
↓
Repeats until "<end>" is generated
```

---

## ⚙️ Config Summary (from diagram):

* `Encoder Layers: 6`
* `Decoder Layers: 6`
* `Heads: 8` (each layer has 8 attention heads)
* `d_model: 512` (embedding size)



In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import Multi30k
from collections import Counter
from tqdm import tqdm
import spacy
import sys
import subprocess

def ensure_spacy_model(model_name):
    try:
        spacy.load(model_name)
    except OSError:
        print(f"Downloading spaCy model '{model_name}'...")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])

# Ensure required spaCy models are installed
ensure_spacy_model("en_core_web_sm")
ensure_spacy_model("de_core_news_sm")

en_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
de_tokenizer = get_tokenizer("spacy", language="de_core_news_sm")

def tokenize_en(text):
    return en_tokenizer(str(text))

def tokenize_de(text):
    return de_tokenizer(str(text))

# --- FIXED DATA LOADING ---
train_iter = Multi30k(split='train', language_pair=('en', 'de'))
train_data_en = []
train_data_de = []
for en, de in train_iter:
    train_data_en.append(en)
    train_data_de.append(de)
# --- END FIX ---

class VOCAB:
    def __init__(self, tokenizer, min_freq=2, data=None, special_tokens=['<pad>', '<sos>', '<eos>', '<unk>']):
        self.tokenizer = tokenizer
        self.min_freq = min_freq
        self.special_tokens = special_tokens
        self.build_vocab(data)

    def build_vocab(self, data):
        counter = Counter()
        for text in tqdm(data, desc="Building vocab"):
            tokens = self.tokenizer(text)
            counter.update(tokens)
        tokens = [token for token, freq in counter.items() if freq >= self.min_freq and token not in self.special_tokens]
        tokens = self.special_tokens + tokens
        self.stoi = {token: index for index, token in enumerate(tokens)}
        self.itos = tokens

    def __len__(self):
        return len(self.stoi)

    def __getitem__(self, token):
        return self.stoi.get(token, self.stoi['<unk>'])

EN_VOCAB = VOCAB(tokenize_en, min_freq=1, data=train_data_en)
DE_VOCAB = VOCAB(tokenize_de, min_freq=1, data=train_data_de)
print("\nVocab Size English", len(EN_VOCAB))
print("\nVocab Size German", len(DE_VOCAB))

Building vocab: 100%|██████████| 29001/29001 [00:00<00:00, 36164.77it/s]
Building vocab: 100%|██████████| 29001/29001 [00:01<00:00, 23170.36it/s]


Vocab Size English 10837

Vocab Size German 19214






This code prepares bilingual text data (English ↔ German) for a machine translation model using the **Multi30k dataset**.

Here's what it does:

* ✅ **Loads English–German sentence pairs** using `torchtext`.
* 🧠 **Tokenizes** the sentences using **spaCy**, a language-aware tokenizer that handles grammar, punctuation, and morphology.
* 📦 **Builds vocabularies** for both languages:

  * Assigns each token a unique index.
  * Includes special tokens like `<pad>`, `<sos>`, `<eos>`, and `<unk>`.
  * Filters out rare words (optional via `min_freq`).
* 🔢 The final output is a mapping from words → numbers, which is essential for training neural networks.

This setup forms the **first step in building a translation model**—converting raw text into something a neural model can understand and learn from.


In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, en_data, de_data, src_tokenizer, tgt_tokenizer, src_vocab, tgt_vocab):
        """
        Initialize the dataset with source (English) and target (German) data,
        along with their respective tokenizers and vocabularies.
        """
        self.en_data = en_data  # List of English sentences
        self.de_data = de_data  # List of German sentences
        self.src_tokenizer = src_tokenizer  # Tokenizer for English
        self.tgt_tokenizer = tgt_tokenizer  # Tokenizer for German
        self.src_vocab = src_vocab  # English vocabulary (token -> index)
        self.tgt_vocab = tgt_vocab  # German vocabulary (token -> index)

    def __getitem__(self, index):
        """
        Process a single (English, German) sentence pair:
        - Tokenize
        - Convert tokens to indices
        - Add <sos> and <eos> special tokens
        - Return tensors for both source and target
        """
        src_txt, tgt_txt = self.en_data[index], self.de_data[index]

        # Tokenize and convert to indices using vocab
        src_tokens = [self.src_vocab[token] for token in self.src_tokenizer(src_txt)]
        tgt_tokens = [self.tgt_vocab[token] for token in self.tgt_tokenizer(tgt_txt)]

        # Add <sos> and <eos> tokens around the sequences
        src_tokens = [self.src_vocab['<sos>']] + src_tokens + [self.src_vocab['<eos>']]
        tgt_tokens = [self.tgt_vocab['<sos>']] + tgt_tokens + [self.tgt_vocab['<eos>']]

        # Convert to PyTorch tensors
        src_tensor = torch.LongTensor(src_tokens)
        tgt_tensor = torch.LongTensor(tgt_tokens)

        return src_tensor, tgt_tensor

    def __len__(self):
        """
        Return the total number of sentence pairs in the dataset.
        """
        assert len(self.en_data) == len(self.de_data)  # Ensure aligned data
        return len(self.en_data)

    def collate_fn(self, batch):
        """
        Custom function to pad batches of variable-length sequences:
        - Pads all source and target tensors in the batch to the same length
        - Uses <pad> token index from vocab
        - Returns two padded tensors (src_batch, tgt_batch)
        """
        src_tensors, tgt_tensors = zip(*batch)  # Unzip list of (src, tgt) pairs

        # Pad sequences to match longest in batch (for batching)
        src_tensors = torch.nn.utils.rnn.pad_sequence(
            src_tensors, padding_value=self.src_vocab['<pad>'], batch_first=True
        )
        tgt_tensors = torch.nn.utils.rnn.pad_sequence(
            tgt_tensors, padding_value=self.tgt_vocab['<pad>'], batch_first=True
        )

        return src_tensors, tgt_tensors


In [12]:
# Use the correct Multi30k data folder and file names as per the multi30k_data folder and the 2016 split

import gzip

def read_gzipped_lines(filepath):
    with gzip.open(filepath, "rt", encoding="utf-8") as f:
        return [line.strip() for line in f]

# Paths to gzipped files in the multi30k_data folder
train_en_file = "multi30k/train.en.gz"
train_de_file = "multi30k/train.de.gz"
val_en_file   = "multi30k/val.en.gz"
val_de_file   = "multi30k/val.de.gz"
test_en_file  = "multi30k/test_2016_flickr.en.gz"
test_de_file  = "multi30k/test_2016_flickr.de.gz"

# Read the data from gzipped files
train_data_en = read_gzipped_lines(train_en_file)
train_data_de = read_gzipped_lines(train_de_file)
val_data_en   = read_gzipped_lines(val_en_file)
val_data_de   = read_gzipped_lines(val_de_file)
test_data_en  = read_gzipped_lines(test_en_file)
test_data_de  = read_gzipped_lines(test_de_file)

train_dataset = TranslationDataset(train_data_en, train_data_de, tokenize_en, tokenize_de, EN_VOCAB, DE_VOCAB)
val_dataset = TranslationDataset(val_data_en, val_data_de, tokenize_en, tokenize_de, EN_VOCAB, DE_VOCAB)
test_dataset = TranslationDataset(test_data_en, test_data_de, tokenize_en, tokenize_de, EN_VOCAB, DE_VOCAB)

BATCH_SIZE = 128

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=train_dataset.collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=val_dataset.collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=test_dataset.collate_fn)

In [13]:
train_dataset[0]

(tensor([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  2]),
 tensor([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,  2]))

In [14]:
' '.join([EN_VOCAB.itos[i] for i in train_dataset[0][0]]), ' '.join([DE_VOCAB.itos[i] for i in train_dataset[0][1]])

('<sos> Two young , White males are outside near many bushes . <eos>',
 '<sos> Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche . <eos>')

In [15]:
test_data_en[0], test_data_de[0]

('A man in an orange hat starring at something.',
 'Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.')

In [18]:
' '.join([EN_VOCAB.itos[i] for i in test_dataset[0][0]]), ' '.join([DE_VOCAB.itos[i] for i in test_dataset[0][1]])

('<sos> A man in an orange hat starring at something . <eos>',
 '<sos> Ein Mann mit einem orangefarbenen Hut , der etwas <unk> . <eos>')

# Define Model and associated parameters

In [16]:
#Define Hyperparameters
# Define Hyper Parameters
NUM_EPOCHS      = 20
D_MODEL         = 256
ATTN_HEADS      = 8
NUM_LAYERS      = 3
FEEDFORWARD_DIM = 512
DROPOUT         = 0.1
MAX_SEQ_LEN     = 150
SRC_VOCAB_SIZE  = len(EN_VOCAB)
TGT_VOCAB_SIZE  = len(DE_VOCAB)
LR              = 0

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
    
class NoamScheduler:
    def __init__(self,optimizer,d_model, warmup_steps = 4000):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.current_step = 0

    def step(self):
        self.current_step += 1
        lr = self.learning_rate()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def learning_rate(self):
        step = self.current_step
        # Add a small epsilon to avoid division by zero if step is 0
        return (self.d_model ** -0.5) * min((step + 1e-9) ** -0.5, step * self.warmup_steps ** -1.5)

In [18]:
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
model = Transformer(SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, D_MODEL, ATTN_HEADS, FEEDFORWARD_DIM, MAX_SEQ_LEN, NUM_LAYERS, DROPOUT).to(DEVICE)
# optimizer = Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, betas=(0.9, 0.98), eps=1e-9, weight_decay=5e-2)
warmup_steps = 2 * len(train_dataloader)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.9)
# scheduler = LambdaLR(optimizer, lr_lambda=lambda step: (D_MODEL ** -0.5) * min((step + 1) ** -0.5, (step + 1) * warmup_steps ** -1.5), verbose=True)
scheduler = NoamScheduler(optimizer, d_model=D_MODEL, warmup_steps=warmup_steps)
criterion = torch.nn.CrossEntropyLoss(ignore_index=DE_VOCAB['<pad>'], label_smoothing=0.1)
scaler = torch.cuda.amp.GradScaler()

In [19]:
# Install nltk if not already installed
import sys
import subprocess

try:
    import nltk
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])
    import nltk

import sacrebleu
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

def generate_tgt_mask(tgt, pad_idx):
    seq_len = tgt.size(1)
    no_future_mask = torch.tril(torch.ones((seq_len, seq_len), device=DEVICE)).bool()
    pad_mask = (tgt != pad_idx).unsqueeze(1).unsqueeze(2)
    combined_mask = pad_mask & no_future_mask
    return combined_mask

def generate_src_mask(src, pad_idx):
    mask = (src != pad_idx).unsqueeze(1).unsqueeze(2)
    return mask

def calculate_bleu(tgt_output, output):
    tgt_output = tgt_output.cpu().numpy()
    output = output.cpu().numpy()

    refs = []
    hyps = []

    for tgt, pred in zip(tgt_output, output):
        ref = ' '.join([DE_VOCAB.itos[t] for t in tgt if t not in (DE_VOCAB['<pad>'], DE_VOCAB['<eos>'], DE_VOCAB['<sos>'])])
        hyp = ' '.join([DE_VOCAB.itos[t] for t in pred if t not in (DE_VOCAB['<pad>'], DE_VOCAB['<eos>'], DE_VOCAB['<sos>'])])

        refs.append(ref)
        hyps.append(hyp)

    bleu = sacrebleu.corpus_bleu(hyps, [refs], force=True).score
    return bleu

In [20]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True,
                     leave=False, position=0, desc='Train')

    for i, (src, tgt) in enumerate(dataloader):
        src, tgt = src.to(device), tgt.to(device)
        # print("Src", src.shape, "Tgt", tgt.shape)
        src_mask = generate_src_mask(src, EN_VOCAB['<pad>'])
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        tgt_mask = generate_tgt_mask(tgt_input, DE_VOCAB['<pad>'])

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():

            output = model(src, tgt_input, src_mask, tgt_mask)

            loss = criterion(output.reshape(-1, output.size(2)), tgt_output.reshape(-1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        batch_bar.set_postfix(
            loss="{:.04f}".format(total_loss / (i + 1)),
            lr="{:.09f}".format(float(optimizer.param_groups[0]['lr'])))
        batch_bar.update()

    return total_loss / len(dataloader)

def validate_epoch(model, dataloader, criterion, DEVICE):
    model.eval()
    epoch_loss = 0
    epoch_bleu_score = 0

    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True,
                     leave=False, position=0, desc='Validate')

    with torch.no_grad():
        for i, (src, tgt) in enumerate(dataloader):
            src, tgt = src.to(DEVICE), tgt.to(DEVICE)

            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            src_mask = generate_src_mask(src, EN_VOCAB['<pad>'])
            tgt_mask = generate_tgt_mask(tgt_input, DE_VOCAB['<pad>'])

            with torch.cuda.amp.autocast():
                output = model(src, tgt_input, src_mask, tgt_mask)
                loss = criterion(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))

            epoch_loss += loss.item()
            epoch_bleu_score += calculate_bleu(tgt_output, output.argmax(-1))

            batch_bar.set_postfix(
            loss="{:.04f}".format(epoch_loss / (i + 1)),
            bleu="{:.04f}".format(epoch_bleu_score / (i + 1)))

            batch_bar.update()

    # Normalize the loss and BLEU score by the number of validation samples
    epoch_loss /= len(dataloader)
    epoch_bleu_score /= len(dataloader)

    return epoch_loss, epoch_bleu_score

In [21]:
def inference(model, src, de_tokenizer):
    model.eval()

    src_mask = generate_src_mask(src, EN_VOCAB['<pad>'])

    # Initialize target input tensors with <sos> tokens
    tgt_input = torch.full((src.size(0), 1), DE_VOCAB['<sos>'], dtype=torch.long, device=DEVICE)

    # Create a flag for each sequence in the batch
    eos_flags = torch.zeros(src.size(0), dtype=torch.bool, device=DEVICE)

    # Perform inference for each target token
    with torch.no_grad():
        for _ in range(70):
            tgt_mask = generate_tgt_mask(tgt_input, DE_VOCAB['<pad>'])
            output = model(src, tgt_input, src_mask, tgt_mask)
            next_tokens = output.argmax(2)[:, -1].unsqueeze(1)
            tgt_input = torch.cat((tgt_input, next_tokens), dim=1)

            # Update the eos_flags for sequences that have generated <eos>
            eos_flags |= (next_tokens.squeeze() == DE_VOCAB['<eos>'])

            # Stop generating tokens if all sequences have generated <eos> or reached maximum length
            if torch.all(eos_flags):
                break

    # Convert target input tensors to translated sentences
    translated_sentences = []
    for i in range(tgt_input.size(0)):
        translated_tokens = []
        for token in tgt_input[i][1:]:
            if token == DE_VOCAB['<eos>']:
                break
            else:
                translated_tokens.append(DE_VOCAB.itos[token.item()])
        translated_sentence = ' '.join(translated_tokens)
        translated_sentences.append(translated_sentence)
    return translated_sentences

In [41]:
# Print a summary of the model architecture
print(model)


Transformer(
  (src_embedding): Embedding(10837, 256)
  (tgt_embedding): Embedding(19214, 256)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layers): ModuleList(
    (0-2): 3 x EncoderBlock(
      (self_attn): MultiHeadSelfAttention(
        (wq): Linear(in_features=256, out_features=256, bias=True)
        (wk): Linear(in_features=256, out_features=256, bias=True)
        (wv): Linear(in_features=256, out_features=256, bias=True)
        (wo): Linear(in_features=256, out_features=256, bias=True)
      )
      (norm1): AddNorm(
        (norm): LayerNorm((256,), eps=1e-06, elementwise_affine=True)
      )
      (ffn): PositionwiseFeedForward(
        (linear1): Linear(in_features=256, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=256, bias=True)
      )
      (norm2): AddNorm(
        (norm): LayerNorm((256,), eps=1e-06, elementwise_affine=True)
   

In [42]:
import numpy as np
import torch

# Make sure the following are defined before running:
# NUM_EPOCHS, model, optimizer, criterion, DEVICE, scheduler (optional)
# train_epoch, validate_epoch, train_dataloader, val_dataloader

def main():
    best_val_loss = float('inf')
    train_losses = []
    val_losses = []
    bleu_scores = []

    for epoch in range(1, NUM_EPOCHS + 1):
        print(f"\nEpoch {epoch}/{NUM_EPOCHS}")

        # ----- Training -----
        train_loss = train_epoch(model, train_dataloader, optimizer, criterion, DEVICE)
        train_losses.append(train_loss)

        # ----- Validation -----
        val_loss, bleu_score = validate_epoch(model, val_dataloader, criterion, DEVICE)
        val_losses.append(val_loss)
        bleu_scores.append(bleu_score)

        # ----- Print epoch summary -----
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | BLEU Score: {bleu_score:.4f}")

        # ----- Save the best model -----
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')
            print("✅ Best model saved.")

        # ----- Step the scheduler if available -----
        if 'scheduler' in locals() and scheduler is not None:
            scheduler.step()

    # Save loss/metric history
    np.save("train_losses.npy", np.array(train_losses))
    np.save("val_losses.npy", np.array(val_losses))
    np.save("bleu_scores.npy", np.array(bleu_scores))
    print(f"\nTraining complete. Best validation loss: {best_val_loss:.4f}")

if __name__ == "__main__":
    main()


Epoch 1/20


                                                                                     

Train Loss: 5.8258 | Val Loss: 4.2103 | BLEU Score: 6.6498
✅ Best model saved.

Epoch 2/20


                                                                                     

Train Loss: 3.8228 | Val Loss: 3.6152 | BLEU Score: 10.5960
✅ Best model saved.

Epoch 3/20


                                                                                     

Train Loss: 3.3153 | Val Loss: 3.3305 | BLEU Score: 14.9178
✅ Best model saved.

Epoch 4/20


                                                                                     

Train Loss: 2.9369 | Val Loss: 3.1557 | BLEU Score: 14.2710
✅ Best model saved.

Epoch 5/20


                                                                                     

Train Loss: 2.6769 | Val Loss: 3.0911 | BLEU Score: 16.9041
✅ Best model saved.

Epoch 6/20


                                                                                     

Train Loss: 2.4787 | Val Loss: 3.0350 | BLEU Score: 19.7898
✅ Best model saved.

Epoch 7/20


                                                                                     

Train Loss: 2.3186 | Val Loss: 3.0254 | BLEU Score: 20.5710
✅ Best model saved.

Epoch 8/20


                                                                                     

Train Loss: 2.1906 | Val Loss: 2.9974 | BLEU Score: 27.9643
✅ Best model saved.

Epoch 9/20


                                                                                     

Train Loss: 2.0868 | Val Loss: 3.0169 | BLEU Score: 19.5570

Epoch 10/20


                                                                                     

Train Loss: 2.0041 | Val Loss: 3.0241 | BLEU Score: 19.3396

Epoch 11/20


                                                                                     

Train Loss: 1.9401 | Val Loss: 3.0485 | BLEU Score: 19.6496

Epoch 12/20


                                                                                     

Train Loss: 1.8880 | Val Loss: 3.0643 | BLEU Score: 20.1703

Epoch 13/20


                                                                                     

Train Loss: 1.8431 | Val Loss: 3.0741 | BLEU Score: 20.1040

Epoch 14/20


                                                                                     

Train Loss: 1.8054 | Val Loss: 3.1011 | BLEU Score: 19.0616

Epoch 15/20


                                                                                     

Train Loss: 1.7747 | Val Loss: 3.1071 | BLEU Score: 17.7373

Epoch 16/20


                                                                                     

Train Loss: 1.7470 | Val Loss: 3.1435 | BLEU Score: 18.8883

Epoch 17/20


                                                                                     

Train Loss: 1.7211 | Val Loss: 3.1534 | BLEU Score: 18.3990

Epoch 18/20


                                                                                     

Train Loss: 1.7001 | Val Loss: 3.1793 | BLEU Score: 23.1467

Epoch 19/20


                                                                                     

Train Loss: 1.6813 | Val Loss: 3.1750 | BLEU Score: 28.2455

Epoch 20/20


                                                                                     

Train Loss: 1.6653 | Val Loss: 3.1796 | BLEU Score: 23.5873

Training complete. Best validation loss: 2.9974




In [44]:
import random
def evaluate_test_set_bleu(model, test_dataloader, de_tokenizer):
    translated_sentences = []
    ground_truth_sentences = []

    for batch in tqdm(test_dataloader, desc="Evaluating"):
        src, tgt_output = batch
        src, tgt = src.to(DEVICE), tgt_output.to(DEVICE)
        tgt_sentences = [' '.join([DE_VOCAB.itos[token.item()] for token in sequence if token.item() not in [DE_VOCAB['<pad>'], DE_VOCAB['<sos>'], DE_VOCAB['<eos>']]]) for sequence in tgt_output]

        translations = inference(model, src, de_tokenizer)
        translated_sentences.extend(translations)
        ground_truth_sentences.extend([[tgt] for tgt in tgt_sentences])

    rand_index = random.randint(0, len(test_dataset))
    print("\n\nExample Sentence and its Translation")
    print("Source Sentence in English               :", ' '.join([EN_VOCAB.itos[i] for i in test_dataset[rand_index][0] if EN_VOCAB.itos[i] not in ['<pad>', '<sos>', '<eos>']]))
    print("German have the truth          :", ground_truth_sentences[rand_index][0])
    print("Machine Translated Sentence in German    :", translated_sentences[rand_index])
    bleu_score = sacrebleu.corpus_bleu(translated_sentences, ground_truth_sentences)
    return bleu_score

# Usage example
test_bleu = evaluate_test_set_bleu(model, test_dataloader, de_tokenizer)
print("Test BLEU score:", test_bleu.score)

Evaluating: 100%|██████████| 8/8 [00:50<00:00,  6.32s/it]



Example Sentence and its Translation
Source Sentence in English               : A female performer with a violin plays on a street while a woman with a blue guitar looks on .
German have the truth          : Eine <unk> mit einer Violine spielt auf der Straße während eine Frau mit einer blauen Gitarre zusieht .
Machine Translated Sentence in German    : Eine Künstlerin spielt auf einer Straße mit einer Violine , während eine Frau mit einer blauen Gitarre zuschaut .
Test BLEU score: 54.91004867761124



