In [None]:
#  %pip install torchinfo

#comment it along with the code for building the intuition and articulative manners that's mean whenever I read code I am able to understand the structure and code 
# write a markdown for intuition and articulation what I did in this cell and how it is going to work and why it is important and when we have to use this 

Note: you may need to restart the kernel to use updated packages.


In [None]:
# %pip install numpy==2.0
# %pip install "numpy<2.0"



In [28]:
import torch                                                # PyTorch main package for tensor operations and deep learning
import torch.nn as nn                                       # PyTorch module for building neural network layers
import torch.optim as optim                                 # PyTorch module for optimization algorithms (e.g., Adam, SGD)
import sacrebleu                                            # Library for calculating BLEU score (translation quality metric)
from torchtext.data.utils import get_tokenizer              # Utility to get tokenizers for text preprocessing
from torchtext.vocab import build_vocab_from_iterator       # Function to build vocabulary from tokenized data
from torchtext.datasets import Multi30k                     # Multi30k dataset for English-German/French translation tasks
from typing import Tuple                                    # Type hinting for functions that return tuples
import spacy                                                # NLP library for tokenization and linguistic features
import warnings
warnings.filterwarnings("ignore")
import numpy as np



In [29]:
print(torch.__version__)                #check the version of pytorch

2.1.0


In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        """
        Initialize the Multi-Head Self-Attention mechanism.

        Args:
        d_model (int): Total dimension of the model (e.g., 512). This is the size of input embeddings.
        num_heads (int): Number of attention heads to split the model into.

        Concept:
        Instead of performing one large attention, we divide it into multiple 'heads'.
        Each head learns to focus on different parts of the input using smaller dimensional spaces (head_dim = d_model / num_heads).
        """
        super(MultiHeadSelfAttention, self).__init__()

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # Ensure d_model is divisible evenly among all heads
        assert self.head_dim * num_heads == d_model, "d_model must be divisible by num_heads"

        # Linear layers to project input into queries (Q), keys (K), and values (V)
        # These transform the input into three separate learned representations
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        # Final linear layer to project concatenated heads' output back to d_model
        self.wo = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """
        Forward pass for Multi-Head Self-Attention.

        Args:
        q (Tensor): Query tensor (batch_size, seq_len, d_model)
        k (Tensor): Key tensor (batch_size, seq_len, d_model)
        v (Tensor): Value tensor (batch_size, seq_len, d_model)
        mask (Tensor, optional): Attention mask to block certain positions (like padding or future tokens in decoding)

        Returns:
        Tensor: Output after applying multi-head attention (batch_size, seq_len, d_model)
        """
        batch_size = q.size(0)

        # Step 1: Project input into query, key, and value vectors
        # Then reshape each into multiple heads for parallel attention
        # Output shape: (batch_size, num_heads, seq_len, head_dim)
        q = self.wq(q).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.wk(k).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.wv(v).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        # Step 2: Compute scaled dot-product attention
        # Attention score = Q . K^T / sqrt(d_k)
        # This measures how much each word should attend to others
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        # Shape: (batch_size, num_heads, seq_len, seq_len)

        # Step 3 (Optional): Apply mask if provided
        # Useful to prevent attention to certain tokens (like padding or future tokens)
        if mask is not None:
            # Mask out unwanted positions with very negative value (-inf) so they become 0 after softmax
            attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))

        # Step 4: Normalize scores to probabilities using softmax
        attn_weights = F.softmax(attn_scores, dim=-1)
        # Shape: (batch_size, num_heads, seq_len, seq_len)

        # Step 5: Weighted sum of value vectors using attention weights
        attn_output = torch.matmul(attn_weights, v)
        # Shape: (batch_size, num_heads, seq_len, head_dim)

        # Step 6: Concatenate all heads' output together
        # Transpose and reshape back to (batch_size, seq_len, d_model)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        # Step 7: Final linear layer to mix the heads and produce the output
        out = self.wo(attn_output)

        return out


## 🔍 Multi-Head Self-Attention: Intuition, Role, and Importance

### 🧠 What I Did in This Cell
In this cell, I implemented the **Multi-Head Self-Attention (MHSA)** module from scratch using PyTorch. This block is one of the most critical components of the Transformer architecture. The goal of this module is to allow the model to attend to different parts of the sequence **in parallel**, capturing richer relationships between words or tokens.

Key steps involved:
- Projected the input embeddings into **Query (Q)**, **Key (K)**, and **Value (V)** vectors.
- Split the embedding into multiple "attention heads" to allow the model to learn different attention patterns.
- Performed **scaled dot-product attention** across each head.
- Applied optional **masking** for decoder/causal attention or padding.
- Concatenated the results from all heads and passed them through a final linear layer.

---

### 🎯 Intuition Behind Multi-Head Attention
Imagine reading a sentence like:
> "The animal didn't cross the road because it was too tired."

To understand what "it" refers to, your brain simultaneously considers:
- Recent nouns ("animal", "road")
- Verb context ("cross", "was")
- Grammatical cues

**Multi-head attention mimics this by allowing the model to look at the input from multiple perspectives at once.** Each head can focus on a different type of dependency — e.g., subject-verb, object-adjective, etc.

---

### ⚙️ How It Works Internally
1. The input tensor is projected into Q, K, and V vectors.
2. Each head computes attention using `Q × K^T / sqrt(d_k)` to get how much focus should be given to each token.
3. These scores are passed through a softmax to get attention weights.
4. We then take a weighted sum of the value vectors using those attention weights.
5. Finally, we concatenate all head outputs and project back to the model's original dimension.

---

### 🔥 Why This Is Important
- Enables the model to capture **contextual relationships** across all words in the input sequence, regardless of their position.
- Improves expressiveness over single-head attention by capturing **multiple types of interactions** in parallel.
- Crucial for **language understanding, translation, summarization**, and almost all modern NLP tasks.

Without multi-head attention, the model would be limited to a single way of attending, making it less flexible and less powerful.

---

### 🕰️ When To Use This
Use Multi-Head Self-Attention when:
- You're building a **Transformer-based model** (e.g., for NLP or Vision Transformers).
- You need to capture **complex, long-range dependencies** between sequence elements.
- You're implementing **machine translation, text classification, summarization, question answering**, etc.
- You want to build a model that treats every position in a sequence with **equal access** to other positions, rather than relying on sequential recurrence (like RNNs).

---

### 🧩 Key Parameters
- `d_model`: Dimension of the model (e.g., 512 or 768)
- `num_heads`: Number of parallel attention heads (e.g., 8 or 12)
- `mask`: Optional masking to restrict attention to certain positions (used heavily in the decoder)

---

In summary, this cell is not just a building block—it's the **core engine** behind the Transformer’s ability to “pay attention” to relevant context in a smart and parallelized way. 🚀


# Position- wise Feed Forward Networks

In [46]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout = 0.1):
        """
        Initialize the Position-wise Feed-Forward module.

        Args:
        - d_model (int): The input/output dimensionality of the model (e.g., 512).
        - d_ff (int): The dimensionality of the intermediate hidden layer (e.g., 2048).
        - dropout (float): Probability of dropout for regularization.

        📌 Intuition:
        This module is applied **independently to each position** (token) in the sequence.
        It acts like a mini fully-connected neural network that gives the model **non-linearity** and **depth**.
        """
        super(PositionwiseFeedForward, self).__init__()

        # First linear layer expands the dimension from d_model to d_ff
        self.linear1 = nn.Linear(d_model, d_ff)

        # Dropout helps prevent overfitting by randomly zeroing some of the activations
        self.dropout = nn.Dropout(dropout)

        # Second linear layer projects back from d_ff to d_model
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """
        Forward pass through the Position-wise Feed-Forward layer.

        Args:
        - x (Tensor): Input tensor with shape (batch_size, seq_length, d_model)

        Returns:
        - Tensor: Output tensor with the same shape (batch_size, seq_length, d_model)

        ⚙️ Step-by-step:
        1. Expand the dimensionality of each token's embedding (like a hidden layer).
        2. Apply ReLU non-linearity to introduce non-linear transformations.
        3. Apply dropout to add regularization.
        4. Project the representation back to the original d_model dimension.
        """
        
        # Step 1: Apply first linear layer
        out = self.linear1(x)

        # Step 2: Apply ReLU non-linearity element-wise
        out = F.relu(out)

        # Step 3: Apply dropout for regularization
        out = self.dropout(out)

        # Step 4: Project back to d_model dimensions
        out = self.linear2(out)

        return out



## ⚙️ Position-wise Feed-Forward Network (FFN)

### 🧠 What I Did in This Cell
In this cell, I implemented the **Position-wise Feed-Forward Network** — a critical component used inside every Transformer block. Unlike attention layers that operate across different positions in the sequence, this module operates **independently on each position** (i.e., token) in the input sequence.

What happens in this cell:
- I created a two-layer feed-forward neural network.
- The first layer expands the dimensionality from `d_model` to a larger `d_ff` (usually 2048).
- Applied ReLU activation to introduce non-linearity.
- Used dropout for regularization.
- Projected it back to the original `d_model` dimension.

---

### 🎯 Intuition Behind Position-wise FFN
In the Transformer architecture, self-attention allows tokens to **interact with each other** and exchange context. However, we still need a mechanism to allow each token to **process its own representation** and extract features **non-linearly**.

This is where the Position-wise FFN comes in:
- It acts like a **miniature MLP (Multi-Layer Perceptron)** applied to each token's embedding.
- This helps the model **refine and transform the information** that each token receives after attention.

The idea is: **"Let each token process its contextualized meaning individually."**

---

### ⚙️ How It Works
For each token embedding (i.e., a vector of size `d_model`), the FFN performs:
1. A linear transformation to a higher-dimensional space (`d_ff`).
2. A ReLU activation to add non-linearity.
3. Dropout to reduce overfitting.
4. Another linear transformation back to `d_model`.

This process happens **independently for every token** in the batch and sequence.

---

### 🔥 Why This Is Important
- **Adds Depth & Non-Linearity**: The Transformer would be too shallow and linear without this module.
- **Token-wise Learning**: While attention focuses on relationships between tokens, the FFN improves the **representation of each token individually**.
- **Flexible Transformation**: Gives the model more expressive power to capture complex patterns in token embeddings.
- **Weight Sharing**: The same FFN is applied at each position — saving memory while keeping consistency.

---

### 🕰️ When Do We Use This?
Use this module:
- As a **standard component inside each Transformer block**, right after the self-attention and residual connection.
- When building models like **BERT, GPT, T5, etc.**
- Any time you need to **refine token-level representations after attention**, especially in tasks like:
  - Machine translation
  - Text classification
  - Summarization
  - Question answering

---

In summary, this cell builds a simple yet powerful transformation layer that ensures each token is not just "context-aware" (via attention) but also **individually smart** and **non-linearly enhanced**. 🧩🚀


# Add and Norm

In [49]:

class AddNorm(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        """
        Initialize the AddNorm module.

        Args:
        - d_model (int): The dimensionality of the input tensor (e.g., 512 or 768).
        - eps (float): A small constant to ensure numerical stability in LayerNorm.

        📌 Intuition:
        This module implements the "Add & Norm" step used in Transformer blocks.
        It combines a **residual connection** with **layer normalization** to help the model train efficiently and stay stable.
        """
        super(AddNorm, self).__init__()

        # Layer normalization normalizes each feature vector (along last dim) to have mean 0 and variance 1
        # It helps with training deep models and accelerates convergence
        self.norm = nn.LayerNorm(d_model, eps=eps)

    def forward(self, x, residual):
        """
        Forward pass through Add & Norm.

        Args:
        - x (Tensor): Output tensor from the sublayer (e.g., self-attention or feed-forward).
                     Shape: (batch_size, seq_len, d_model)
        - residual (Tensor): The original input to the sublayer, which is added back (residual connection).
                             Same shape as x.

        Returns:
        - Tensor: Normalized output after adding the residual.
                  Shape: (batch_size, seq_len, d_model)

        ⚙️ Step-by-step:
        1. Add the residual connection: This helps prevent vanishing gradients and retains original information.
        2. Apply LayerNorm to stabilize the training and keep the scale of activations controlled.
        """

        # Step 1: Add input (residual connection)
        out = x + residual  # Preserve original signal + apply new transformation

        # Step 2: Normalize the result
        out = self.norm(out)

        return out


## 🔄 Add & Norm: Residual Connections with Layer Normalization

### 🧠 What I Did in This Cell
In this cell, I implemented the **AddNorm** module, which performs two critical operations found in every block of a Transformer:
1. **Residual (Skip) Connection**: Adds the original input (`residual`) back to the sublayer output (`x`).
2. **Layer Normalization**: Normalizes the combined output to stabilize the learning process.

This is known as the **"Add & Norm"** step in the original Transformer architecture and is used both after the self-attention block and after the feed-forward block.

---

### 🎯 Intuition Behind Add & Norm
Deep neural networks can suffer from vanishing gradients and unstable training, especially when stacked in many layers.

The solution:
- **Residual connections** help preserve the original input signal and make gradient flow easier, reducing the risk of degradation in deep networks.
- **Layer normalization** ensures that the outputs of each layer maintain a stable distribution (zero mean, unit variance), which helps the optimizer converge faster and makes training more stable.

Together, they allow each layer to:
- Learn a residual mapping instead of a full transformation.
- Avoid exploding or vanishing activations during training.

---

### ⚙️ How It Works Step-by-Step
1. **Residual Addition**:
   - `x + residual`: Combines the transformed representation (`x`) with the original input.
   - This helps the model learn the *change* needed instead of the full transformation.
2. **Layer Normalization**:
   - Normalizes each embedding (feature vector per token) across the last dimension.
   - Keeps values on a similar scale, improves convergence, and prevents instability.

---

### 🔥 Why This Is Important
- Prevents degradation in very deep models by simplifying learning.
- Speeds up convergence by keeping distributions stable.
- Maintains contextual signals from earlier layers, which is important in long sequences.
- Essential in virtually **all Transformer-based models**, including BERT, GPT, T5, and more.

Without "Add & Norm", the deep stacking of layers in the Transformer would result in poor gradient flow and unstable training.

---

### 🕰️ When Do We Use This?
Use AddNorm:
- After **multi-head self-attention** layers.
- After **position-wise feed-forward** layers.
- Whenever you want to **retain original features** while still benefiting from transformations.
- In any model that relies on **deep layer stacking** and **sequential context**, especially:
  - Transformers (NLP & Vision)
  - Encoders / Decoders
  - BERT, GPT, T5, ViT, etc.

---

In summary, this cell implements a powerful design pattern that supports both **stability** and **depth** in the Transformer architecture — helping the model learn efficiently without losing valuable contextual information. 🧩🚀


#  Positional Encoding

In [48]:
import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_len, dropout=0.1):
        """
        Initialize the PositionalEncoding module.

        Args:
        - d_model (int): Dimensionality of the embeddings (e.g., 512).
        - max_seq_len (int): Maximum length of the input sequence.
        - dropout (float): Dropout probability to prevent overfitting.

        📌 Intuition:
        Since Transformers don’t have any built-in notion of word order (unlike RNNs or CNNs),
        we inject information about token positions using **positional encodings**.
        These are fixed, learnable patterns added to the input embeddings to help the model 
        understand **"who came before/after whom"** in a sequence.
        """
        super(PositionalEncoding, self).__init__()

        # Dropout for regularization (optional but common in Transformer models)
        self.dropout = nn.Dropout(dropout)

        # Step 1: Initialize positional encoding matrix (shape: [max_seq_len, d_model])
        pe = torch.zeros(max_seq_len, d_model)

        # Step 2: Generate a list of positions [0, 1, 2, ..., max_seq_len-1] as a column vector
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)

        # Step 3: Compute the denominator term for sinusoidal frequency scaling
        # This ensures different frequencies for each dimension
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        # Step 4: Apply sine to even indices (2i) and cosine to odd indices (2i+1)
        pe[:, 0::2] = torch.sin(position * div_term)  # shape: [max_seq_len, d_model/2]
        pe[:, 1::2] = torch.cos(position * div_term)  # shape: [max_seq_len, d_model/2]

        # Step 5: Add a batch dimension to match input shape (1, max_seq_len, d_model)
        pe = pe.unsqueeze(0)  # shape: (1, max_seq_len, d_model)

        # Step 6: Register this as a buffer so it's saved with the model but not updated by gradients
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Forward pass for PositionalEncoding.

        Args:
        - x (Tensor): Input embeddings of shape (batch_size, seq_len, d_model)

        Returns:
        - Tensor: Output embeddings with positional information added.
        """

        # Step 7: Add positional encodings to input embeddings (broadcast across batch dimension)
        x = x + self.pe[:, :x.size(1), :]

        # Step 8: Apply dropout for regularization
        x = self.dropout(x)

        return x


## 📐 Positional Encoding: Giving Transformers a Sense of Order

### 🧠 What I Did in This Cell
In this cell, I implemented the **PositionalEncoding** module, which adds **position information** to input token embeddings in a Transformer model.

Unlike RNNs or CNNs, Transformers process all tokens **in parallel** and have **no built-in understanding of order**. To solve this, we inject position information using deterministic **sinusoidal positional encodings**, which are then **added** to the input embeddings.

This module:
- Precomputes a fixed positional encoding matrix using sine and cosine functions.
- Adds it to the token embeddings before feeding them into the attention mechanism.
- Applies dropout to prevent overfitting.

---

### 🎯 Intuition Behind Positional Encoding
Transformers treat all tokens equally — the word "dog" at position 1 is indistinguishable from "dog" at position 8.

But in language, **position matters**:
- “He ate before sleeping” ≠ “He slept before eating.”

So, we inject position information by **encoding each position as a unique vector** using sine and cosine functions. These vectors:
- Vary smoothly across positions.
- Allow the model to infer relative and absolute positions using mathematical properties.
- Are **not learned**, making them lightweight and generalizable.

---

### ⚙️ How It Works
1. Create a positional encoding matrix of shape `(max_seq_len, d_model)`.
2. For each position:
   - Use `sin(pos / 10000^(2i/d_model))` for even indices.
   - Use `cos(pos / 10000^(2i/d_model))` for odd indices.
3. Add this positional encoding to the input embeddings (element-wise addition).
4. Apply dropout for regularization.

This results in **embeddings that contain both the content (word meaning)** and **the position (order)** of each token in the sequence.

---

### 🔥 Why This Is Important
- **Injects positional information** into a model that otherwise has none.
- Maintains **parallelism** (unlike RNNs) by not using recurrence.
- Helps the model learn **sequence-aware attention** — critical for language tasks.
- The use of **sinusoids allows the model to generalize to longer sequences** than it was trained on.

Without positional encodings, a Transformer would treat a shuffled sentence the same as the original — which is undesirable.

---

### 🕰️ When Do We Use This?
Use PositionalEncoding:
- At the very beginning of a Transformer model, **right after word embeddings**.
- In both **encoder** and **decoder** parts of the architecture.
- In any model that uses **self-attention and parallel token processing** without recurrence.
- Common in:
  - Machine Translation
  - Summarization
  - Text Classification
  - Vision Transformers (ViT uses learnable positional encodings)

---

In summary, this cell equips the Transformer with a **sense of token order** — an essential component for making meaning from sequences. Without this, the model would be **powerful, but blind to structure**. 🧠📏⚡


In [50]:


class EncoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        """
        Initialize the EncoderBlock module.

        Args:
        - d_model (int): Dimensionality of input embeddings (e.g., 512).
        - num_heads (int): Number of attention heads in multi-head self-attention.
        - d_ff (int): Dimensionality of hidden layer in the feed-forward network.
        - dropout (float): Dropout rate to avoid overfitting.

        📌 Intuition:
        This block forms one layer of the Transformer encoder. Each block contains:
        1. Multi-head self-attention to capture contextual relationships.
        2. Add & Norm to stabilize and preserve gradients.
        3. Position-wise feed-forward network for token-wise transformation.
        4. Another Add & Norm to wrap the FFN.

        All of this is applied **in parallel** to every token in the input sequence.
        """
        super(EncoderBlock, self).__init__()

        # Multi-head self-attention sublayer
        self.self_attn = MultiHeadSelfAttention(d_model, num_heads)

        # LayerNorm + residual connection after attention
        self.norm1 = AddNorm(d_model)

        # Position-wise feed-forward sublayer
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)

        # LayerNorm + residual connection after feed-forward
        self.norm2 = AddNorm(d_model)

    def forward(self, x, mask=None):
        """
        Forward pass for EncoderBlock.

        Args:
        - x (Tensor): Input tensor of shape (batch_size, seq_len, d_model)
        - mask (Tensor, optional): Attention mask (e.g., for padding)

        Returns:
        - Tensor: Output tensor of same shape (batch_size, seq_len, d_model)

        ⚙️ Step-by-step:
        1. Apply self-attention across the sequence (tokens attend to each other).
        2. Add the result to the input (residual) and normalize.
        3. Feed through position-wise FFN for non-linearity and transformation.
        4. Add and normalize again.

        Each encoder block refines the representation by deepening and enriching token-level information.
        """

        # Step 1: Self-attention (each token attends to all others, including itself)
        x1 = self.self_attn(x, x, x, mask)

        # Step 2: Add & Norm (residual connection + layer normalization)
        x = self.norm1(x, x1)

        # Step 3: Position-wise feed-forward network
        x1 = self.ffn(x)

        # Step 4: Add & Norm again
        x = self.norm2(x, x1)

        return x


## 🔗 Transformer Encoder Block: Self-Attention + FFN + Add & Norm

### 🧠 What I Did in This Cell
In this cell, I implemented the core **EncoderBlock** used in the Transformer architecture.

This block is a composition of the following submodules:
1. **Multi-Head Self-Attention** – enables each token to attend to every other token in the sequence.
2. **Add & Norm** – adds a residual connection followed by layer normalization to stabilize training.
3. **Position-wise Feed-Forward Network** – applies a small neural network independently to each token.
4. **Another Add & Norm** – ensures stability and smooth gradient flow after the FFN.

This design is modular and stackable — multiple such blocks can be layered to form a deep encoder.

---

### 🎯 Intuition Behind EncoderBlock
Each token in a sentence should not be understood in isolation. It needs to gather context from other words.

The encoder block helps with this by:
- **First** letting each token "attend" to others via self-attention.
- **Then** using a feed-forward network to transform each token individually.
- **Both steps** are wrapped in residual connections and layer normalization to make deep learning feasible and efficient.

It's like:
> "Listen to everyone else, update your understanding, then process your own meaning — repeat."

---

### ⚙️ How It Works Step-by-Step
1. **Multi-Head Self-Attention**:
   - Computes attention across the sequence using multiple parallel attention heads.
   - Helps capture diverse contextual relationships (e.g., syntactic, semantic).

2. **Add & Norm 1**:
   - Adds the original input back to the attention output (residual connection).
   - Applies layer normalization to stabilize training and maintain scale.

3. **Feed-Forward Network**:
   - Applies a two-layer neural network to each token independently.
   - Introduces non-linearity and deeper transformations for each token.

4. **Add & Norm 2**:
   - Again uses a residual connection and normalization to retain information and improve gradient flow.

---

### 🔥 Why This Is Important
This block is the **building unit of the Transformer encoder**.
- Enables **contextual understanding** of each token with respect to others.
- Balances **global interaction (via attention)** and **local transformation (via FFN)**.
- The use of **residual connections + layer norm** helps train very deep models efficiently.

Without this block, the model would struggle to understand word relationships, learn deep patterns, or generalize across long sequences.

---

### 🕰️ When Do We Use This?
Use this EncoderBlock:
- As part of a **Transformer Encoder** (e.g., in BERT, T5, ViT).
- When you want to **extract rich representations** from an input sequence.
- In tasks like:
  - **Machine Translation** (encoder side)
  - **Text Classification**
  - **Summarization**
  - **Speech and Vision tasks** using Transformer-based architectures

---

In summary, this block captures the **essence of what makes Transformers powerful**: deep context-aware reasoning combined with stable and efficient training. 🧠⚡


In [51]:
class DecoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        """
        Initializes the DecoderBlock module.

        Args:
        - d_model (int): Dimensionality of input embeddings (e.g., 512).
        - num_heads (int): Number of attention heads.
        - d_ff (int): Hidden layer size for feed-forward network.
        - dropout (float): Dropout rate to avoid overfitting.

        📌 Intuition:
        The Decoder Block is designed to generate tokens one at a time during inference.
        It processes the already-generated output tokens (target sequence) by:
        1. Attending to previously generated tokens (via **masked self-attention**),
        2. Attending to the encoder’s outputs (via **encoder-decoder attention**),
        3. Transforming the token representations using a **position-wise feed-forward network**.
        """
        super(DecoderBlock, self).__init__()

        # 1️⃣ Masked Multi-Head Self-Attention: attends only to earlier positions in the target sequence
        self.self_attn = MultiHeadSelfAttention(d_model, num_heads)
        self.norm1 = AddNorm(d_model)

        # 2️⃣ Encoder-Decoder Attention: lets decoder attend to encoder outputs
        self.enc_dec_attn = MultiHeadSelfAttention(d_model, num_heads)
        self.norm2 = AddNorm(d_model)

        # 3️⃣ Feed-Forward Network: non-linear transformation per token
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm3 = AddNorm(d_model)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """
        Forward pass for DecoderBlock.

        Args:
        - x (Tensor): Target sequence input (batch_size, tgt_seq_len, d_model)
        - enc_output (Tensor): Encoder output (batch_size, src_seq_len, d_model)
        - src_mask (Tensor, optional): Encoder mask (e.g., padding)
        - tgt_mask (Tensor, optional): Decoder mask (e.g., causal mask)

        Returns:
        - Tensor: Output tensor (batch_size, tgt_seq_len, d_model)

        ⚙️ Step-by-step:
        1. Apply masked self-attention so decoder can only look at past tokens.
        2. Apply encoder-decoder attention so decoder can focus on relevant input tokens.
        3. Transform each token using feed-forward network.
        4. Apply residual connections + normalization after each sub-layer.
        """

        # 1️⃣ Masked Self-Attention: prevents attending to future tokens during training
        attn1 = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x, attn1)

        # 2️⃣ Encoder-Decoder Attention: decoder attends to encoder’s output to guide generation
        attn2 = self.enc_dec_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x, attn2)

        # 3️⃣ Feed-Forward Network: enrich each token’s features
        ffn_out = self.ffn(x)
        x = self.norm3(x, ffn_out)

        return x


## 🔁 Transformer Decoder Block: Intuition and Architecture

### 🧠 What I Did in This Cell
In this cell, I implemented a **Decoder Block**, which forms one layer of the Transformer decoder. Each decoder block is designed to **generate sequences step by step**, attending to both:
- The **already generated tokens** (via masked self-attention), and
- The **encoded input sequence** (via encoder-decoder attention).

The block consists of three main sublayers, each followed by an Add & Norm operation:
1. **Masked Multi-Head Self-Attention**
2. **Encoder-Decoder Multi-Head Attention**
3. **Feed-Forward Network**

Each sublayer is wrapped in a residual connection and normalized to ensure stability during deep training.

---

### 🎯 Intuition Behind the Decoder Block
The goal of a decoder block is to **predict the next token** in a sequence during training and inference. To do this effectively:
- It needs to **look at the previously generated tokens** — but **not future tokens** — hence the use of **masked self-attention**.
- It must also **refer back to the encoder output**, i.e., what it has "understood" from the input sentence.
- Finally, it transforms each token's features using a small neural network (FFN) to enhance the learned representations.

This design allows the model to generate coherent, contextual, and input-aware outputs in tasks like translation, summarization, and dialogue generation.

---

### ⚙️ How It Works (Step-by-Step)
1. **Masked Multi-Head Self-Attention**:
   - The decoder attends to previously generated tokens (causal mask prevents peeking ahead).
   - Ensures that predictions are made **autoregressively**.

2. **Add & Norm**:
   - Applies a residual connection (original input + attention output).
   - Followed by layer normalization to stabilize training.

3. **Encoder-Decoder Attention**:
   - Allows the decoder to attend to the **relevant parts of the input sequence**.
   - This cross-attention helps the decoder align input and output sequences.

4. **Second Add & Norm**:
   - Again, we apply a residual connection and normalization for stability.

5. **Feed-Forward Network (FFN)**:
   - Applies a two-layer MLP to each token individually.
   - Adds depth and non-linearity to the model.

6. **Final Add & Norm**:
   - Final stabilization before passing output to the next decoder block or output head.

---

### 🔥 Why This is Important
- **Masked attention enforces autoregressive behavior** during training and inference.
- **Cross-attention creates alignment** between input and output — essential for translation and sequence mapping tasks.
- **Residuals + normalization make training deep networks feasible and efficient**.
- **Layer stacking improves depth and expressiveness**, allowing the model to learn complex linguistic patterns.

---

### 🕰️ When Do We Use This?
Use a Decoder Block:
- In any **Transformer-based decoder model**, such as:
  - Machine translation (e.g., English to German)
  - Summarization
  - Dialogue generation
  - Image captioning (paired with vision encoders)
- In models like **GPT**, **T5**, **Transformer-Decoder-only** architectures, or **seq2seq** systems.

It’s a core module in **auto-regressive sequence generation**.

---

In short, this Decoder Block enables the Transformer to **generate tokens step-by-step**, learn from both **context and input**, and maintain **stability** across many layers — making it a cornerstone of modern NLP and sequence modeling. 🧱⚡🧠


## The Transformer

Now that we have implemented all the building blocks, let's assemble the complete Transformer architecture.

We initialize the following components:

- Source and target embedding layers
- Positional encoding module
- Encoder and decoder layer stacks
- Final linear layer to produce the probability distribution over the target vocabulary

In the forward method, we first pass the source and target input tensors through their respective embedding layers and add the positional encoding. Then, we pass the source input through each encoder layer sequentially, followed by passing the target input and encoder output through each decoder layer sequentially. Finally, we apply the linear layer to produce the output tensor with shape (batch_size, tgt_seq_length, tgt_vocab_size).

In [36]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, D_MODEL, num_heads, d_ff, max_seq_len, num_layers, dropout=0.1):
        """
        Initialize the Transformer module.

        Args:
        src_vocab_size (int): The size of the source vocabulary.
        tgt_vocab_size (int): The size of the target vocabulary.
        d_model (int): The dimensionality of the embedding
        num_heads (int): The number of attention heads.
        d_ff (int): The dimensionality of the hidden layer in the feed-forward network.
        max_seq_len (int): The maximum length of the input sequence.
        num_layers (int): The number of layers in the encoder and decoder.
        dropout (float, optional): The dropout probability. Defaults to 0.1.
        """
        super(Transformer, self).__init__()

        self.src_embedding = nn.Embedding(src_vocab_size, D_MODEL)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, D_MODEL)
        self.pos_encoding = PositionalEncoding(D_MODEL, max_seq_len, dropout)

        self.encoder_layers = nn.ModuleList([EncoderBlock(D_MODEL, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderBlock(D_MODEL, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(D_MODEL, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """
        Forward pass for Transformer.

        Args:
        src (Tensor): The source input tensor of shape (batch_size, src_seq_length).
        tgt (Tensor): The target input tensor of shape (batch_size, tgt_seq_length).
        src_mask (Tensor, optional): The source mask tensor for ignoring certain elements. Defaults to None.
        tgt_mask (Tensor, optional): The target mask tensor for ignoring certain elements. Defaults to None.

        Returns:
        Tensor: The output tensor of shape (batch_size, tgt_seq_length, tgt_vocab_size).
        """
        src = self.src_embedding(src)
        src = self.pos_encoding(src)

        tgt = self.tgt_embedding(tgt)
        tgt = self.pos_encoding(tgt)

        for layer in self.encoder_layers:
            src = layer(src, src_mask)

        for layer in self.decoder_layers:
            tgt = layer(tgt, src, src_mask, tgt_mask)

        out = self.fc(tgt)

        return out

## 🧠 Complete Transformer Architecture: Encoder-Decoder Design

### 🔧 What I Did in This Cell
In this cell, I built the **complete Transformer model** by integrating:
- **Embedding layers** for both source and target vocabularies.
- **Positional encoding** to inject order information.
- A stack of `EncoderBlock`s to process the input sequence.
- A stack of `DecoderBlock`s to generate the output sequence autoregressively.
- A final **linear layer** to project the decoder outputs into vocabulary logits for prediction.

This forms a **sequence-to-sequence (seq2seq)** model that can translate, summarize, or generate text.

---

### 🎯 Intuition Behind the Transformer Model
The Transformer model uses a pure **attention-based architecture** (no recurrence or convolution) to model sequences. The core idea is:
- The **encoder** reads the input (e.g., an English sentence) and generates a deep contextual representation.
- The **decoder** reads this representation while attending to its own generated tokens (e.g., in German) to produce the final output one token at a time.

Thanks to **self-attention**, **residual connections**, and **parallelism**, this architecture is extremely effective for language understanding and generation tasks.

---

### ⚙️ How the Transformer Works

1. **Embeddings + Positional Encoding**:
   - Input and output tokens are first converted into dense vectors via embedding layers.
   - Positional encoding is added to help the model understand token order.

2. **Encoder Stack**:
   - A series of `EncoderBlock`s process the source sequence.
   - Each block refines the token representation by capturing global dependencies via self-attention.

3. **Decoder Stack**:
   - A series of `DecoderBlock`s generate the output sequence.
   - Each block uses:
     - **Masked self-attention** to prevent looking at future tokens.
     - **Encoder-decoder attention** to align with the input.
     - **Feed-forward layers** for nonlinear transformation.

4. **Final Linear Layer**:
   - The decoder output (of shape `(batch_size, tgt_seq_len, d_model)`) is passed through a linear layer to convert it into **logits over the target vocabulary**.

---

### 🔥 Why This is Important

- The Transformer is the **foundation of modern NLP and generative AI models**.
- Enables **parallel computation** during training (unlike RNNs).
- Can model **long-range dependencies** more effectively than traditional sequence models.
- Scales well to very large datasets and model sizes (e.g., GPT, BERT, T5).
- General-purpose: works for text translation, summarization, question answering, and even vision and protein modeling.

---

### 🕰️ When to Use a Transformer

Use this architecture when you need:
- **Sequence-to-sequence modeling** (e.g., machine translation, text generation, dialogue systems).
- **Parallelism and speed** during training (compared to RNNs or LSTMs).
- **Flexibility** in modeling long-range context and complex relationships in data.
- A base for **pretraining** large language models (e.g., GPT, BERT, T5).

---

### ✅ Summary
This `Transformer` class is a modular, flexible, and powerful implementation of the original encoder-decoder Transformer from the "Attention is All You Need" paper. It serves as a foundation for building and customizing many state-of-the-art deep learning models for language, vision, and multimodal tasks. ⚙️🚀📚


In [1]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

def visualize_transformer(num_encoder_layers, num_decoder_layers, num_heads, d_model):
    """
    Visualizes a high-level architecture of the Transformer model.

    Args:
        num_encoder_layers (int): Number of encoder layers.
        num_decoder_layers (int): Number of decoder layers.
        num_heads (int): Number of attention heads.
        d_model (int): Embedding dimension.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.axis('off')

    # Encoder blocks
    encoder_x = 1
    encoder_y_start = 1
    encoder_height = 0.6
    encoder_gap = 0.15
    for i in range(num_encoder_layers):
        rect = mpatches.FancyBboxPatch(
            (encoder_x, encoder_y_start + i * (encoder_height + encoder_gap)),
            1.2, encoder_height,
            boxstyle="round,pad=0.02",
            edgecolor='navy', facecolor='#cce5ff', linewidth=2
        )
        ax.add_patch(rect)
        ax.text(encoder_x + 0.6, encoder_y_start + i * (encoder_height + encoder_gap) + encoder_height/2,
                f'Encoder Layer {i+1}\n(Multi-Head x{num_heads})', ha='center', va='center', fontsize=10)

    # Decoder blocks
    decoder_x = 5
    decoder_y_start = 1
    decoder_height = 0.6
    decoder_gap = 0.15
    for i in range(num_decoder_layers):
        rect = mpatches.FancyBboxPatch(
            (decoder_x, decoder_y_start + i * (decoder_height + decoder_gap)),
            1.2, decoder_height,
            boxstyle="round,pad=0.02",
            edgecolor='darkgreen', facecolor='#d4edda', linewidth=2
        )
        ax.add_patch(rect)
        ax.text(decoder_x + 0.6, decoder_y_start + i * (decoder_height + decoder_gap) + decoder_height/2,
                f'Decoder Layer {i+1}\n(Multi-Head x{num_heads})', ha='center', va='center', fontsize=10)

    # Embedding and Positional Encoding
    ax.text(encoder_x - 0.7, encoder_y_start + (num_encoder_layers * (encoder_height + encoder_gap))/2,
            'Input\nEmbedding\n+\nPositional\nEncoding', ha='center', va='center', fontsize=11, bbox=dict(boxstyle="round", fc="#f8d7da", ec="crimson"))
    ax.text(decoder_x - 0.7, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2,
            'Output\nEmbedding\n+\nPositional\nEncoding', ha='center', va='center', fontsize=11, bbox=dict(boxstyle="round", fc="#fff3cd", ec="#856404"))

    # Output Linear Layer
    ax.text(decoder_x + 2.2, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2,
            'Linear\n& Softmax', ha='center', va='center', fontsize=11, bbox=dict(boxstyle="round", fc="#d1ecf1", ec="#0c5460"))

    # Arrows: Input -> Encoder
    ax.annotate('', xy=(encoder_x, encoder_y_start + (num_encoder_layers * (encoder_height + encoder_gap))/2),
                xytext=(encoder_x - 0.2, encoder_y_start + (num_encoder_layers * (encoder_height + encoder_gap))/2),
                arrowprops=dict(facecolor='black', arrowstyle='->', lw=2))

    # Arrows: Encoder -> Decoder (cross attention)
    ax.annotate('', xy=(encoder_x + 1.2, encoder_y_start + (num_encoder_layers * (encoder_height + encoder_gap))/2),
                xytext=(decoder_x, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2),
                arrowprops=dict(facecolor='gray', arrowstyle='->', lw=2, linestyle='dashed'))

    # Arrows: Output Embedding -> Decoder
    ax.annotate('', xy=(decoder_x, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2),
                xytext=(decoder_x - 0.2, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2),
                arrowprops=dict(facecolor='black', arrowstyle='->', lw=2))

    # Arrows: Decoder -> Linear
    ax.annotate('', xy=(decoder_x + 1.2, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2),
                xytext=(decoder_x + 2.0, decoder_y_start + (num_decoder_layers * (decoder_height + decoder_gap))/2),
                arrowprops=dict(facecolor='black', arrowstyle='->', lw=2))

    # Title and legend
    ax.set_title(f"Transformer Architecture\n(Encoder Layers: {num_encoder_layers}, Decoder Layers: {num_decoder_layers}, Heads: {num_heads}, d_model: {d_model})", fontsize=14, pad=20)
    plt.xlim(0, 8)
    plt.ylim(0, 3 + max(num_encoder_layers, num_decoder_layers) * (encoder_height + encoder_gap) / 2)
    plt.tight_layout()
    plt.show()

# Example usage:
visualize_transformer(num_encoder_layers=6, num_decoder_layers=6, num_heads=8, d_model=512)



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ImportError: numpy.core.multiarray failed to import

This image illustrates the **Transformer Architecture**, a foundational model used in many modern NLP systems such as BERT, GPT, and T5. It visualizes how information flows through the **encoder-decoder structure** of a transformer.

Let’s break it down **intuitively and step by step**:

---

## 🧠 High-Level Summary

* The **left side** is the **Encoder**, which reads and understands the input.
* The **right side** is the **Decoder**, which generates the output (like a translated sentence).
* In between, the encoder and decoder **communicate** using attention mechanisms.
* The **transformer uses 8 attention heads** and **6 stacked layers** in both encoder and decoder, with a model dimension (`d_model`) of 512.

---

## 🔴 Step 1: Input Embedding + Positional Encoding

* Each word/token in the input sequence is **converted to a dense vector (embedding)**.
* Since transformers don’t understand order, **positional encoding** is added to inject the notion of word order (e.g., who comes first in the sentence).

📦 Example:

> `"I love transformers"` → becomes a matrix of shape `(seq_len, 512)`.

---

## 🔵 Encoder Stack (Left Side)

There are **6 identical encoder layers**, each with:

1. **Multi-head self-attention**
   → Every word looks at every other word (including itself) to understand context.

   > e.g., “bank” can mean money or river — attention helps disambiguate it by context.

2. **Feedforward network**
   → A small neural network to refine each word’s representation.

3. **Residual connections + LayerNorm**
   → Helps in stabilizing training and preserving input signals.

📌 Output: A context-enriched representation for each word.

---

## 🟡 Output Embedding + Positional Encoding

This is for the **decoder input** (often previous tokens during training or inference).

* The decoder also needs **positional info**.
* It uses **shifted right** sequences during training (i.e., we don't feed the full output at once, only up to the current word).

---

## 🟢 Decoder Stack (Right Side)

Also has **6 layers**, and each contains:

1. **Masked Multi-head self-attention**
   → Each position can only attend to previous tokens (to prevent cheating during generation).

2. **Encoder-Decoder attention**
   → The decoder attends to encoder outputs — this is how the decoder knows what the input meant.

3. **Feedforward network**
   → Like in the encoder, applies transformation to each position.

Each layer builds a **richer representation** of the output sequence being generated.

---

## 🔷 Final Step: Linear + Softmax

* After decoder layers, the final output goes through a **linear layer** followed by **softmax** to predict the next word.
* This output is a probability distribution over the vocabulary.

---

## 🔁 Example: English to French Translation

```text
Input: "I love transformers"
↓
Encoder processes this and creates contextual embeddings
↓
Decoder begins with: "<start>" token
↓
Decoder predicts "J'"
↓
Then uses "J'" + context to predict "aime"
↓
Repeats until "<end>" is generated
```

---

## ⚙️ Config Summary (from diagram):

* `Encoder Layers: 6`
* `Decoder Layers: 6`
* `Heads: 8` (each layer has 8 attention heads)
* `d_model: 512` (embedding size)



In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import Multi30k
from collections import Counter
from tqdm import tqdm
import spacy
import sys
import subprocess

def ensure_spacy_model(model_name):
    try:
        spacy.load(model_name)
    except OSError:
        print(f"Downloading spaCy model '{model_name}'...")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])

# Ensure required spaCy models are installed
ensure_spacy_model("en_core_web_sm")
ensure_spacy_model("de_core_news_sm")

en_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
de_tokenizer = get_tokenizer("spacy", language="de_core_news_sm")

def tokenize_en(text):
    return en_tokenizer(str(text))

def tokenize_de(text):
    return de_tokenizer(str(text))

# --- FIXED DATA LOADING ---
train_iter = Multi30k(split='train', language_pair=('en', 'de'))
train_data_en = []
train_data_de = []
for en, de in train_iter:
    train_data_en.append(en)
    train_data_de.append(de)
# --- END FIX ---

class VOCAB:
    def __init__(self, tokenizer, min_freq=2, data=None, special_tokens=['<pad>', '<sos>', '<eos>', '<unk>']):
        self.tokenizer = tokenizer
        self.min_freq = min_freq
        self.special_tokens = special_tokens
        self.build_vocab(data)

    def build_vocab(self, data):
        counter = Counter()
        for text in tqdm(data, desc="Building vocab"):
            tokens = self.tokenizer(text)
            counter.update(tokens)
        tokens = [token for token, freq in counter.items() if freq >= self.min_freq and token not in self.special_tokens]
        tokens = self.special_tokens + tokens
        self.stoi = {token: index for index, token in enumerate(tokens)}
        self.itos = tokens

    def __len__(self):
        return len(self.stoi)

    def __getitem__(self, token):
        return self.stoi.get(token, self.stoi['<unk>'])

EN_VOCAB = VOCAB(tokenize_en, min_freq=1, data=train_data_en)
DE_VOCAB = VOCAB(tokenize_de, min_freq=1, data=train_data_de)
print("\nVocab Size English", len(EN_VOCAB))
print("\nVocab Size German", len(DE_VOCAB))

Building vocab: 100%|██████████| 29001/29001 [00:00<00:00, 36164.77it/s]
Building vocab: 100%|██████████| 29001/29001 [00:01<00:00, 23170.36it/s]


Vocab Size English 10837

Vocab Size German 19214





## 📥 Data Ingestion & Vocabulary Creation for Machine Translation

### 🧠 What I Did in This Cell

In this cell, I implemented the **data ingestion pipeline and vocabulary builder** for a machine translation task using the **Multi30k dataset** from `torchtext`. Specifically:

1. Downloaded and verified the required **spaCy tokenizers** for English (`en_core_web_sm`) and German (`de_core_news_sm`).
2. Defined **language-specific tokenizers** to split sentences into words/tokens.
3. Loaded the **training split** of the Multi30k dataset, extracting parallel English-German sentence pairs.
4. Built a custom `VOCAB` class that:
   - Tokenizes each sentence,
   - Counts token frequencies,
   - Filters out rare tokens (based on `min_freq`),
   - Stores mappings from token to index (`stoi`) and index to token (`itos`).
5. Constructed vocabularies for both English and German data.

---

### 🎯 Intuition Behind This Cell

Before feeding sentences into a Transformer, we must convert words into **numerical indices** that the model can process. However, raw text is unstructured — every model needs a **vocabulary** to map each unique token to an integer index.

This step is **crucial** in NLP workflows because:
- It determines the **size and content of the embedding space**.
- It defines how **unknown and special tokens** (`<pad>`, `<sos>`, `<eos>`, `<unk>`) are handled.
- A good vocabulary reduces overfitting by filtering out noisy/rare words.

Tokenization also ensures that input sentences are split **accurately and consistently**, especially across languages like English and German.

---

### ⚙️ How It Works Step-by-Step

1. **spaCy Model Verification**:
   - Calls `ensure_spacy_model()` to check if the tokenizer models are available.
   - Downloads them if missing.

2. **Tokenizers**:
   - `tokenize_en()` and `tokenize_de()` wrap the spaCy tokenizers for easy access.

3. **Data Loading**:
   - Uses `torchtext.datasets.Multi30k` to load English-German sentence pairs from the training set.
   - Stores them in separate lists: `train_data_en` and `train_data_de`.

4. **Vocabulary Building**:
   - The `VOCAB` class tokenizes each sentence, counts token frequencies, and keeps only those above `min_freq`.
   - Initializes the mapping of tokens to indices (`stoi`) and vice versa (`itos`), including special tokens.

---

### 🔥 Why This Is Important

- **Token-to-index mapping is foundational** for all NLP models.
- Ensures **consistent preprocessing** for training, validation, and inference.
- **Handles unknowns, padding, and sequence boundaries** in a standardized way using special tokens.
- Enables **efficient batch processing** and embedding lookup via integer indexing.
- Filtering rare tokens reduces vocabulary size, which leads to **faster training and better generalization**.

Without this step, the Transformer would not understand what to embed, attend to, or decode.

---

### 🕰️ When Do We Use This?

This step is used:
- **Before training** a sequence-to-sequence model like a Transformer.
- **During inference** to preprocess input and decode output.
- Whenever working with **custom or external datasets** where tokenization and vocabulary building are not pre-defined.
- In **low-resource** or **domain-specific** translation settings where vocabularies need to be tailored manually.

---

### ✅ Summary

This cell sets up the entire **preprocessing foundation** required for training a machine translation model. From downloading language tokenizers to building numerical vocabularies, this is a critical first step to enable deep learning models to understand and process language effectively. 🌍🔤➡️📊



This code prepares bilingual text data (English ↔ German) for a machine translation model using the **Multi30k dataset**.

Here's what it does:

* ✅ **Loads English–German sentence pairs** using `torchtext`.
* 🧠 **Tokenizes** the sentences using **spaCy**, a language-aware tokenizer that handles grammar, punctuation, and morphology.
* 📦 **Builds vocabularies** for both languages:

  * Assigns each token a unique index.
  * Includes special tokens like `<pad>`, `<sos>`, `<eos>`, and `<unk>`.
  * Filters out rare words (optional via `min_freq`).
* 🔢 The final output is a mapping from words → numbers, which is essential for training neural networks.

This setup forms the **first step in building a translation model**—converting raw text into something a neural model can understand and learn from.


In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, en_data, de_data, src_tokenizer, tgt_tokenizer, src_vocab, tgt_vocab):
        """
        Initialize the dataset with source (English) and target (German) data,
        along with their respective tokenizers and vocabularies.
        """
        self.en_data = en_data  # List of English sentences
        self.de_data = de_data  # List of German sentences
        self.src_tokenizer = src_tokenizer  # Tokenizer for English
        self.tgt_tokenizer = tgt_tokenizer  # Tokenizer for German
        self.src_vocab = src_vocab  # English vocabulary (token -> index)
        self.tgt_vocab = tgt_vocab  # German vocabulary (token -> index)

    def __getitem__(self, index):
        """
        Process a single (English, German) sentence pair:
        - Tokenize
        - Convert tokens to indices
        - Add <sos> and <eos> special tokens
        - Return tensors for both source and target
        """
        src_txt, tgt_txt = self.en_data[index], self.de_data[index]

        # Tokenize and convert to indices using vocab
        src_tokens = [self.src_vocab[token] for token in self.src_tokenizer(src_txt)]
        tgt_tokens = [self.tgt_vocab[token] for token in self.tgt_tokenizer(tgt_txt)]

        # Add <sos> and <eos> tokens around the sequences
        src_tokens = [self.src_vocab['<sos>']] + src_tokens + [self.src_vocab['<eos>']]
        tgt_tokens = [self.tgt_vocab['<sos>']] + tgt_tokens + [self.tgt_vocab['<eos>']]

        # Convert to PyTorch tensors
        src_tensor = torch.LongTensor(src_tokens)
        tgt_tensor = torch.LongTensor(tgt_tokens)

        return src_tensor, tgt_tensor

    def __len__(self):
        """
        Return the total number of sentence pairs in the dataset.
        """
        assert len(self.en_data) == len(self.de_data)  # Ensure aligned data
        return len(self.en_data)

    def collate_fn(self, batch):
        """
        Custom function to pad batches of variable-length sequences:
        - Pads all source and target tensors in the batch to the same length
        - Uses <pad> token index from vocab
        - Returns two padded tensors (src_batch, tgt_batch)
        """
        src_tensors, tgt_tensors = zip(*batch)  # Unzip list of (src, tgt) pairs

        # Pad sequences to match longest in batch (for batching)
        src_tensors = torch.nn.utils.rnn.pad_sequence(
            src_tensors, padding_value=self.src_vocab['<pad>'], batch_first=True
        )
        tgt_tensors = torch.nn.utils.rnn.pad_sequence(
            tgt_tensors, padding_value=self.tgt_vocab['<pad>'], batch_first=True
        )

        return src_tensors, tgt_tensors


## 📦 TranslationDataset: Custom Dataset for Sequence-to-Sequence Learning

### 🧠 What I Did in This Cell

In this cell, I implemented a **custom PyTorch Dataset class** `TranslationDataset` to preprocess and prepare English-German sentence pairs for training a Transformer model. The class:

- Tokenizes each sentence using spaCy tokenizers.
- Converts tokens to vocabulary indices.
- Adds special tokens (`<sos>`, `<eos>`) for marking sentence start and end.
- Pads variable-length sequences to support efficient batching using a custom `collate_fn`.

This forms a fully functional data pipeline for model training and evaluation.

---

### 🎯 Intuition Behind the Dataset Class

Deep learning models like Transformers require:
1. **Tensors as inputs**, not raw text.
2. **Fixed-length batches**, which means dynamic padding is needed.
3. **Start and end markers** in sequence generation tasks (e.g., translation, summarization).

This class wraps those requirements into a clean, modular interface compatible with PyTorch’s `DataLoader`.

---

### ⚙️ How It Works Step-by-Step

1. **Initialization (`__init__`)**:
   - Accepts English and German sentence lists.
   - Takes corresponding tokenizers and vocabularies for both languages.
   - Stores them for use in `__getitem__`.

2. **Single Example Processing (`__getitem__`)**:
   - Selects one English-German sentence pair by index.
   - Tokenizes both using their respective tokenizers.
   - Converts tokens to indices using the `VOCAB` class.
   - Adds `<sos>` (start of sequence) and `<eos>` (end of sequence) tokens to help the model learn generation boundaries.
   - Returns tensors for both source and target sequences.

3. **Dataset Size (`__len__`)**:
   - Returns the number of sentence pairs in the dataset (used by `DataLoader` for batching).

4. **Batch Padding (`collate_fn`)**:
   - Custom function for `DataLoader` to batch variable-length sequences.
   - Uses `torch.nn.utils.rnn.pad_sequence` to pad all sequences in a batch to the maximum length in that batch.
   - Pads with the `<pad>` token index so the model can later ignore those positions.

---

### 🔥 Why This is Important

- Ensures **consistent preprocessing** across every sample during training.
- Enables **efficient batch processing** using PyTorch’s `DataLoader` (essential for GPU training).
- Handles **padding and token alignment** correctly — a common source of bugs in seq2seq models.
- Easily integrates with attention-based models that require sequence lengths to be uniform within a batch.

Without this, it would be much harder to:
- Train on large-scale data efficiently.
- Ensure correct behavior of masking and alignment in Transformer layers.

---

### 🕰️ When to Use This

Use this custom dataset when:
- You are working on **sequence-to-sequence tasks** (like translation, summarization, or dialogue generation).
- Your sequences vary in length and need **dynamic padding**.
- You want to integrate tokenization, index conversion, and special token management in one place.
- You plan to use a PyTorch `DataLoader` for training your model in batches.

---

### ✅ Summary

This `TranslationDataset` is a critical bridge between raw data and model input. It handles everything from tokenization and indexing to padding and batching, enabling a robust and clean training pipeline for NLP tasks using PyTorch. ⚙️📚🚀


In [12]:
# Use the correct Multi30k data folder and file names as per the multi30k_data folder and the 2016 split

import gzip

def read_gzipped_lines(filepath):
    with gzip.open(filepath, "rt", encoding="utf-8") as f:
        return [line.strip() for line in f]

# Paths to gzipped files in the multi30k_data folder
train_en_file = "multi30k/train.en.gz"
train_de_file = "multi30k/train.de.gz"
val_en_file   = "multi30k/val.en.gz"
val_de_file   = "multi30k/val.de.gz"
test_en_file  = "multi30k/test_2016_flickr.en.gz"
test_de_file  = "multi30k/test_2016_flickr.de.gz"

# Read the data from gzipped files
train_data_en = read_gzipped_lines(train_en_file)
train_data_de = read_gzipped_lines(train_de_file)
val_data_en   = read_gzipped_lines(val_en_file)
val_data_de   = read_gzipped_lines(val_de_file)
test_data_en  = read_gzipped_lines(test_en_file)
test_data_de  = read_gzipped_lines(test_de_file)

train_dataset = TranslationDataset(train_data_en, train_data_de, tokenize_en, tokenize_de, EN_VOCAB, DE_VOCAB)
val_dataset = TranslationDataset(val_data_en, val_data_de, tokenize_en, tokenize_de, EN_VOCAB, DE_VOCAB)
test_dataset = TranslationDataset(test_data_en, test_data_de, tokenize_en, tokenize_de, EN_VOCAB, DE_VOCAB)

BATCH_SIZE = 128

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=train_dataset.collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=val_dataset.collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=test_dataset.collate_fn)

## 📂 Loading Preprocessed Multi30k Data & Creating DataLoaders

### 🧠 What I Did in This Cell

In this cell, I:
1. Defined a utility function to read **gzipped files** line by line (as text).
2. Loaded the **English-German sentence pairs** from the `Multi30k` dataset's gzipped files:
   - For **training**, **validation**, and **test** sets (2016 Flickr split).
3. Wrapped these sentence pairs into `TranslationDataset` instances.
4. Created corresponding **PyTorch `DataLoader`s** to:
   - Efficiently batch and shuffle training data
   - Prepare padded batches for the Transformer model

---

### 🎯 Intuition Behind This Pipeline

- The `Multi30k` dataset is stored as compressed `.gz` text files. We must **manually extract** and preprocess this raw text.
- `TranslationDataset` handles **tokenization, numerical encoding, and padding**, ensuring the data is ready for the model.
- `DataLoader` enables:
  - **Batching** for faster training
  - **Shuffling** for training robustness
  - **Custom collation** to dynamically pad sequences to the longest sentence in each batch

This setup ensures a smooth, repeatable pipeline from raw `.gz` files to ready-to-train batches.

---

### ⚙️ How It Works Step-by-Step

1. **Reading Compressed Files**:
   - `read_gzipped_lines(filepath)` uses Python’s `gzip` module to read `.gz` files line by line.
   - Each line is stripped of whitespace to prepare a clean sentence list.

2. **Loading Multi30k 2016 Data**:
   - Trains with `train.en.gz` and `train.de.gz`
   - Validates with `val.en.gz` and `val.de.gz`
   - Tests with `test_2016_flickr.en.gz` and `test_2016_flickr.de.gz`
   - Each file contains **sentence-aligned** English and German data.

3. **Wrapping with `TranslationDataset`**:
   - Handles tokenization, vocabulary lookup, and special token addition (`<sos>`, `<eos>`).
   - Returns padded tensors via `collate_fn`.

4. **Creating PyTorch DataLoaders**:
   - Batches of size `BATCH_SIZE = 128` are created.
   - `shuffle=True` ensures randomness during training.
   - Padding is handled dynamically in `collate_fn` so sequences are model-ready.

---

### 🔥 Why This Is Important

- Most real-world datasets are **compressed** or in non-PyTorch-native formats — this step shows how to bridge that.
- Batching and padding are **essential for GPU efficiency and stability** in training.
- Ensures **consistency and reproducibility** in data processing across training, validation, and testing.
- Helps in **avoiding OOM errors** and ensures that shorter sequences don't affect model convergence.

---

### 🕰️ When to Use This

Use this setup when:
- You are working with **custom or raw parallel corpora** in `.txt.gz` or `.csv.gz` formats.
- You want to build a **training pipeline compatible with Transformers or any seq2seq model**.
- You need **fast iteration, batching, and GPU-optimized t**


In [13]:
train_dataset[0]

(tensor([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  2]),
 tensor([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,  2]))

In [14]:
' '.join([EN_VOCAB.itos[i] for i in train_dataset[0][0]]), ' '.join([DE_VOCAB.itos[i] for i in train_dataset[0][1]])

('<sos> Two young , White males are outside near many bushes . <eos>',
 '<sos> Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche . <eos>')

In [15]:
test_data_en[0], test_data_de[0]

('A man in an orange hat starring at something.',
 'Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.')

In [18]:
' '.join([EN_VOCAB.itos[i] for i in test_dataset[0][0]]), ' '.join([DE_VOCAB.itos[i] for i in test_dataset[0][1]])

('<sos> A man in an orange hat starring at something . <eos>',
 '<sos> Ein Mann mit einem orangefarbenen Hut , der etwas <unk> . <eos>')

# Define Model and associated parameters

In [16]:
#Define Hyperparameters
# Define Hyper Parameters
NUM_EPOCHS      = 20
D_MODEL         = 256
ATTN_HEADS      = 8
NUM_LAYERS      = 3
FEEDFORWARD_DIM = 512
DROPOUT         = 0.1
MAX_SEQ_LEN     = 150
SRC_VOCAB_SIZE  = len(EN_VOCAB)
TGT_VOCAB_SIZE  = len(DE_VOCAB)
LR              = 0

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 🧪 Defining Hyperparameters for the Transformer Model

### 🧠 What I Did in This Cell

In this cell, I defined the key **hyperparameters** required to build, train, and evaluate the Transformer model for machine translation (English to German). These parameters control the model’s size, training duration, regularization, and computation settings.

---

### 🎯 Intuition Behind Each Hyperparameter

- **NUM_EPOCHS = 20**  
  Number of full passes over the training dataset. A higher number allows better convergence but risks overfitting if too large.

- **D_MODEL = 256**  
  The size of each token embedding and the hidden dimension in the Transformer. This controls the model's capacity — a higher value means more expressive power but also more compute.

- **ATTN_HEADS = 8**  
  Number of parallel attention heads in the multi-head self-attention mechanism. Each head learns different relational patterns in the sequence.

- **NUM_LAYERS = 3**  
  Number of encoder and decoder layers (i.e., Transformer blocks). More layers improve modeling depth but increase training time and memory use.

- **FEEDFORWARD_DIM = 512**  
  Size of the intermediate layer in the position-wise feedforward network. Typically set to 2–4× `D_MODEL`.

- **DROPOUT = 0.1**  
  Dropout probability used for regularization to prevent overfitting. Applied after attention and feedforward layers.

- **MAX_SEQ_LEN = 150**  
  Maximum sequence length expected for both source and target. Used to construct the positional encoding matrix.

- **SRC_VOCAB_SIZE = len(EN_VOCAB)**  
  Size of the input vocabulary (English). Needed to initialize the embedding layer for the encoder.

- **TGT_VOCAB_SIZE = len(DE_VOCAB)**  
  Size of the output vocabulary (German). Needed to initialize the embedding and final projection layer for the decoder.

- **LR = 0**  
  Placeholder for learning rate (will typically be defined later, e.g., with warmup scheduling or Adam optimizer setup).

- **DEVICE = torch.device(...)**  
  Automatically sets the computation device to **GPU (if available)** or **CPU**, ensuring training compatibility across environments.

---

### 🔥 Why These Are Important

- These parameters directly affect **model performance**, **training time**, and **resource usage**.
- They allow easy experimentation and tuning without modifying the model architecture.
- Keeping hyperparameters centralized makes the project more **maintainable and reproducible**.

---

### 🕰️ When to Modify These

Adjust these hyperparameters:
- When switching to a **larger or smaller dataset**
- To reduce **memory usage or training time**
- To improve **model accuracy or convergence**
- During **hyperparameter tuning** or **grid/random search** for optimal configuration

---

### ✅ Summary

This cell defines the foundational settings that control the behavior and training of the Transformer model. They balance expressiveness, training stability, and compute efficiency — and can be tuned based on the problem scale and hardware availability. 🧠⚙️🚀


In [17]:
    
class NoamScheduler:
    def __init__(self,optimizer,d_model, warmup_steps = 4000):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.current_step = 0

    def step(self):
        self.current_step += 1
        lr = self.learning_rate()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def learning_rate(self):
        step = self.current_step
        # Add a small epsilon to avoid division by zero if step is 0
        return (self.d_model ** -0.5) * min((step + 1e-9) ** -0.5, step * self.warmup_steps ** -1.5)

In [18]:
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
model = Transformer(SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, D_MODEL, ATTN_HEADS, FEEDFORWARD_DIM, MAX_SEQ_LEN, NUM_LAYERS, DROPOUT).to(DEVICE)
# optimizer = Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, betas=(0.9, 0.98), eps=1e-9, weight_decay=5e-2)
warmup_steps = 2 * len(train_dataloader)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.9)
# scheduler = LambdaLR(optimizer, lr_lambda=lambda step: (D_MODEL ** -0.5) * min((step + 1) ** -0.5, (step + 1) * warmup_steps ** -1.5), verbose=True)
scheduler = NoamScheduler(optimizer, d_model=D_MODEL, warmup_steps=warmup_steps)
criterion = torch.nn.CrossEntropyLoss(ignore_index=DE_VOCAB['<pad>'], label_smoothing=0.1)
scaler = torch.cuda.amp.GradScaler()

## ⚙️ Model Initialization, Optimizer, Scheduler, Loss & Precision Setup

### 🧠 What I Did in This Cell

In this step, I completed the full training configuration for the Transformer-based machine translation model. This includes:
1. Initializing the Transformer model with vocabulary sizes and architecture.
2. Defining the `AdamW` optimizer with proper settings for Transformers.
3. Setting up a custom learning rate scheduler (`NoamScheduler`) as described in the original paper.
4. Defining a smoothed cross-entropy loss function with padding ignored.
5. Enabling automatic mixed-precision (AMP) training for efficient GPU usage.

---

### 🧩 Breakdown & Intuition

#### 1. **Model Initialization**
```python
model = Transformer(...).to(DEVICE)


In [19]:
# Install nltk if not already installed
import sys
import subprocess

try:
    import nltk
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])
    import nltk

import sacrebleu
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

def generate_tgt_mask(tgt, pad_idx):
    seq_len = tgt.size(1)
    no_future_mask = torch.tril(torch.ones((seq_len, seq_len), device=DEVICE)).bool()
    pad_mask = (tgt != pad_idx).unsqueeze(1).unsqueeze(2)
    combined_mask = pad_mask & no_future_mask
    return combined_mask

def generate_src_mask(src, pad_idx):
    mask = (src != pad_idx).unsqueeze(1).unsqueeze(2)
    return mask

def calculate_bleu(tgt_output, output):
    tgt_output = tgt_output.cpu().numpy()
    output = output.cpu().numpy()

    refs = []
    hyps = []

    for tgt, pred in zip(tgt_output, output):
        ref = ' '.join([DE_VOCAB.itos[t] for t in tgt if t not in (DE_VOCAB['<pad>'], DE_VOCAB['<eos>'], DE_VOCAB['<sos>'])])
        hyp = ' '.join([DE_VOCAB.itos[t] for t in pred if t not in (DE_VOCAB['<pad>'], DE_VOCAB['<eos>'], DE_VOCAB['<sos>'])])

        refs.append(ref)
        hyps.append(hyp)

    bleu = sacrebleu.corpus_bleu(hyps, [refs], force=True).score
    return bleu

## 🧪 Evaluation Utilities: BLEU Score, Masks, and Setup

### 🧠 What This Cell Does

This cell prepares the essential components required for **evaluating the performance** of the Transformer model during inference, especially using BLEU score — a popular metric in machine translation.

It includes:
1. Library imports and installation for `nltk` and `sacrebleu`.
2. Functions to generate appropriate **masks** for source and target sequences.
3. A function to **calculate the BLEU score** based on model predictions and true target sentences.

---

### 📦 1. Import & Install Evaluation Libraries

```python
try:
    import nltk
except ImportError:
    subprocess.check_call([...])


In [20]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True,
                     leave=False, position=0, desc='Train')

    for i, (src, tgt) in enumerate(dataloader):
        src, tgt = src.to(device), tgt.to(device)
        # print("Src", src.shape, "Tgt", tgt.shape)
        src_mask = generate_src_mask(src, EN_VOCAB['<pad>'])
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        tgt_mask = generate_tgt_mask(tgt_input, DE_VOCAB['<pad>'])

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():

            output = model(src, tgt_input, src_mask, tgt_mask)

            loss = criterion(output.reshape(-1, output.size(2)), tgt_output.reshape(-1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        batch_bar.set_postfix(
            loss="{:.04f}".format(total_loss / (i + 1)),
            lr="{:.09f}".format(float(optimizer.param_groups[0]['lr'])))
        batch_bar.update()

    return total_loss / len(dataloader)

def validate_epoch(model, dataloader, criterion, DEVICE):
    model.eval()
    epoch_loss = 0
    epoch_bleu_score = 0

    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True,
                     leave=False, position=0, desc='Validate')

    with torch.no_grad():
        for i, (src, tgt) in enumerate(dataloader):
            src, tgt = src.to(DEVICE), tgt.to(DEVICE)

            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            src_mask = generate_src_mask(src, EN_VOCAB['<pad>'])
            tgt_mask = generate_tgt_mask(tgt_input, DE_VOCAB['<pad>'])

            with torch.cuda.amp.autocast():
                output = model(src, tgt_input, src_mask, tgt_mask)
                loss = criterion(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))

            epoch_loss += loss.item()
            epoch_bleu_score += calculate_bleu(tgt_output, output.argmax(-1))

            batch_bar.set_postfix(
            loss="{:.04f}".format(epoch_loss / (i + 1)),
            bleu="{:.04f}".format(epoch_bleu_score / (i + 1)))

            batch_bar.update()

    # Normalize the loss and BLEU score by the number of validation samples
    epoch_loss /= len(dataloader)
    epoch_bleu_score /= len(dataloader)

    return epoch_loss, epoch_bleu_score

## 🔁 Epoch Training & Validation Loops

### 🧠 What This Cell Does

This cell defines two critical functions for the training workflow:
- `train_epoch(...)`: Trains the model for **one full pass** over the training data.
- `validate_epoch(...)`: Evaluates the model on validation data after each epoch.

Together, these functions handle:
- Mask generation
- Forward and backward passes
- Optimizer and scheduler updates
- Loss tracking
- BLEU score evaluation (for translation quality)

---

### 🔂 `train_epoch(...)` — Function Overview

#### ✅ Function Purpose
- Trains the Transformer model on each batch of the training data.
- Computes loss using teacher forcing.
- Applies mixed-precision training with gradient scaling.
- Updates model parameters and learning rate.

#### 🔧 Key Steps

1. **Set model to training mode**
```python
model.train()


In [21]:
def inference(model, src, de_tokenizer):
    model.eval()

    src_mask = generate_src_mask(src, EN_VOCAB['<pad>'])

    # Initialize target input tensors with <sos> tokens
    tgt_input = torch.full((src.size(0), 1), DE_VOCAB['<sos>'], dtype=torch.long, device=DEVICE)

    # Create a flag for each sequence in the batch
    eos_flags = torch.zeros(src.size(0), dtype=torch.bool, device=DEVICE)

    # Perform inference for each target token
    with torch.no_grad():
        for _ in range(70):
            tgt_mask = generate_tgt_mask(tgt_input, DE_VOCAB['<pad>'])
            output = model(src, tgt_input, src_mask, tgt_mask)
            next_tokens = output.argmax(2)[:, -1].unsqueeze(1)
            tgt_input = torch.cat((tgt_input, next_tokens), dim=1)

            # Update the eos_flags for sequences that have generated <eos>
            eos_flags |= (next_tokens.squeeze() == DE_VOCAB['<eos>'])

            # Stop generating tokens if all sequences have generated <eos> or reached maximum length
            if torch.all(eos_flags):
                break

    # Convert target input tensors to translated sentences
    translated_sentences = []
    for i in range(tgt_input.size(0)):
        translated_tokens = []
        for token in tgt_input[i][1:]:
            if token == DE_VOCAB['<eos>']:
                break
            else:
                translated_tokens.append(DE_VOCAB.itos[token.item()])
        translated_sentence = ' '.join(translated_tokens)
        translated_sentences.append(translated_sentence)
    return translated_sentences

## 🔍 Inference: Translating New Sentences with the Transformer Model

### 🧠 What This Function Does

The `inference(...)` function performs **greedy decoding** using a trained Transformer model to generate target (German) sentences from source (English) input sequences. It generates tokens one by one, stopping when the `<eos>` (end-of-sequence) token is produced or a maximum length is reached.

---

### 🧪 Key Steps and Intuition

1. **Set Model to Evaluation Mode**
```python
model.eval()


In [41]:
# Print a summary of the model architecture
print(model)


Transformer(
  (src_embedding): Embedding(10837, 256)
  (tgt_embedding): Embedding(19214, 256)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layers): ModuleList(
    (0-2): 3 x EncoderBlock(
      (self_attn): MultiHeadSelfAttention(
        (wq): Linear(in_features=256, out_features=256, bias=True)
        (wk): Linear(in_features=256, out_features=256, bias=True)
        (wv): Linear(in_features=256, out_features=256, bias=True)
        (wo): Linear(in_features=256, out_features=256, bias=True)
      )
      (norm1): AddNorm(
        (norm): LayerNorm((256,), eps=1e-06, elementwise_affine=True)
      )
      (ffn): PositionwiseFeedForward(
        (linear1): Linear(in_features=256, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=256, bias=True)
      )
      (norm2): AddNorm(
        (norm): LayerNorm((256,), eps=1e-06, elementwise_affine=True)
   

In [42]:
import numpy as np
import torch

# Make sure the following are defined before running:
# NUM_EPOCHS, model, optimizer, criterion, DEVICE, scheduler (optional)
# train_epoch, validate_epoch, train_dataloader, val_dataloader

def main():
    best_val_loss = float('inf')
    train_losses = []
    val_losses = []
    bleu_scores = []

    for epoch in range(1, NUM_EPOCHS + 1):
        print(f"\nEpoch {epoch}/{NUM_EPOCHS}")

        # ----- Training -----
        train_loss = train_epoch(model, train_dataloader, optimizer, criterion, DEVICE)
        train_losses.append(train_loss)

        # ----- Validation -----
        val_loss, bleu_score = validate_epoch(model, val_dataloader, criterion, DEVICE)
        val_losses.append(val_loss)
        bleu_scores.append(bleu_score)

        # ----- Print epoch summary -----
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | BLEU Score: {bleu_score:.4f}")

        # ----- Save the best model -----
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')
            print("✅ Best model saved.")

        # ----- Step the scheduler if available -----
        if 'scheduler' in locals() and scheduler is not None:
            scheduler.step()

    # Save loss/metric history
    np.save("train_losses.npy", np.array(train_losses))
    np.save("val_losses.npy", np.array(val_losses))
    np.save("bleu_scores.npy", np.array(bleu_scores))
    print(f"\nTraining complete. Best validation loss: {best_val_loss:.4f}")

if __name__ == "__main__":
    main()


Epoch 1/20


                                                                                     

Train Loss: 5.8258 | Val Loss: 4.2103 | BLEU Score: 6.6498
✅ Best model saved.

Epoch 2/20


                                                                                     

Train Loss: 3.8228 | Val Loss: 3.6152 | BLEU Score: 10.5960
✅ Best model saved.

Epoch 3/20


                                                                                     

Train Loss: 3.3153 | Val Loss: 3.3305 | BLEU Score: 14.9178
✅ Best model saved.

Epoch 4/20


                                                                                     

Train Loss: 2.9369 | Val Loss: 3.1557 | BLEU Score: 14.2710
✅ Best model saved.

Epoch 5/20


                                                                                     

Train Loss: 2.6769 | Val Loss: 3.0911 | BLEU Score: 16.9041
✅ Best model saved.

Epoch 6/20


                                                                                     

Train Loss: 2.4787 | Val Loss: 3.0350 | BLEU Score: 19.7898
✅ Best model saved.

Epoch 7/20


                                                                                     

Train Loss: 2.3186 | Val Loss: 3.0254 | BLEU Score: 20.5710
✅ Best model saved.

Epoch 8/20


                                                                                     

Train Loss: 2.1906 | Val Loss: 2.9974 | BLEU Score: 27.9643
✅ Best model saved.

Epoch 9/20


                                                                                     

Train Loss: 2.0868 | Val Loss: 3.0169 | BLEU Score: 19.5570

Epoch 10/20


                                                                                     

Train Loss: 2.0041 | Val Loss: 3.0241 | BLEU Score: 19.3396

Epoch 11/20


                                                                                     

Train Loss: 1.9401 | Val Loss: 3.0485 | BLEU Score: 19.6496

Epoch 12/20


                                                                                     

Train Loss: 1.8880 | Val Loss: 3.0643 | BLEU Score: 20.1703

Epoch 13/20


                                                                                     

Train Loss: 1.8431 | Val Loss: 3.0741 | BLEU Score: 20.1040

Epoch 14/20


                                                                                     

Train Loss: 1.8054 | Val Loss: 3.1011 | BLEU Score: 19.0616

Epoch 15/20


                                                                                     

Train Loss: 1.7747 | Val Loss: 3.1071 | BLEU Score: 17.7373

Epoch 16/20


                                                                                     

Train Loss: 1.7470 | Val Loss: 3.1435 | BLEU Score: 18.8883

Epoch 17/20


                                                                                     

Train Loss: 1.7211 | Val Loss: 3.1534 | BLEU Score: 18.3990

Epoch 18/20


                                                                                     

Train Loss: 1.7001 | Val Loss: 3.1793 | BLEU Score: 23.1467

Epoch 19/20


                                                                                     

Train Loss: 1.6813 | Val Loss: 3.1750 | BLEU Score: 28.2455

Epoch 20/20


                                                                                     

Train Loss: 1.6653 | Val Loss: 3.1796 | BLEU Score: 23.5873

Training complete. Best validation loss: 2.9974




## 🚀 Main Training Loop

### 🧠 What This Code Does

This `main()` function runs the **full training pipeline** for the Transformer model across multiple epochs. It orchestrates the training and validation workflow, tracks performance metrics (loss & BLEU score), saves the best-performing model, and records history for later analysis.

---

### 🧪 Key Functionality Explained

#### 🔁 Epoch Loop
```python
for epoch in range(1, NUM_EPOCHS + 1):


In [44]:
import random
def evaluate_test_set_bleu(model, test_dataloader, de_tokenizer):
    translated_sentences = []
    ground_truth_sentences = []

    for batch in tqdm(test_dataloader, desc="Evaluating"):
        src, tgt_output = batch
        src, tgt = src.to(DEVICE), tgt_output.to(DEVICE)
        tgt_sentences = [' '.join([DE_VOCAB.itos[token.item()] for token in sequence if token.item() not in [DE_VOCAB['<pad>'], DE_VOCAB['<sos>'], DE_VOCAB['<eos>']]]) for sequence in tgt_output]

        translations = inference(model, src, de_tokenizer)
        translated_sentences.extend(translations)
        ground_truth_sentences.extend([[tgt] for tgt in tgt_sentences])

    rand_index = random.randint(0, len(test_dataset))
    print("\n\nExample Sentence and its Translation")
    print("Source Sentence in English               :", ' '.join([EN_VOCAB.itos[i] for i in test_dataset[rand_index][0] if EN_VOCAB.itos[i] not in ['<pad>', '<sos>', '<eos>']]))
    print("German have the truth          :", ground_truth_sentences[rand_index][0])
    print("Machine Translated Sentence in German    :", translated_sentences[rand_index])
    bleu_score = sacrebleu.corpus_bleu(translated_sentences, ground_truth_sentences)
    return bleu_score

# Usage example
test_bleu = evaluate_test_set_bleu(model, test_dataloader, de_tokenizer)
print("Test BLEU score:", test_bleu.score)

Evaluating: 100%|██████████| 8/8 [00:50<00:00,  6.32s/it]



Example Sentence and its Translation
Source Sentence in English               : A female performer with a violin plays on a street while a woman with a blue guitar looks on .
German have the truth          : Eine <unk> mit einer Violine spielt auf der Straße während eine Frau mit einer blauen Gitarre zusieht .
Machine Translated Sentence in German    : Eine Künstlerin spielt auf einer Straße mit einer Violine , während eine Frau mit einer blauen Gitarre zuschaut .
Test BLEU score: 54.91004867761124





## 🧪 Test Set Evaluation with BLEU Score

### 🎯 Purpose

This function evaluates the **final performance** of the trained Transformer model on the test dataset using the **BLEU (Bilingual Evaluation Understudy)** score. It provides:
- Quantitative metric: BLEU score
- Qualitative insights: A randomly selected example of a source sentence, its ground truth translation, and the model’s translation

---

### 🧠 What Happens in This Function?

#### 1. **Initialize Storage**
```python
translated_sentences = []
ground_truth_sentences = []
