In [None]:
# Author: Roi Yehoshua
# Date: January 2024
# MIT License

# Based on the PyTorch implementation from https://nlp.seas.harvard.edu/annotated-transformer/

In [2]:
#Checking the required installations
!pip show portalocker
!pip show spacy
!pip show torchtext

Name: portalocker
Version: 2.8.2
Summary: Wraps the portalocker recipe for easy usage
Home-page: 
Author: 
Author-email: Rick van Hattem <wolph@wol.ph>
License: BSD-3-Clause
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: 
Name: spacy
Version: 3.7.3
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: catalogue, cymem, jinja2, langcodes, murmurhash, numpy, packaging, preshed, pydantic, requests, setuptools, smart-open, spacy-legacy, spacy-loggers, srsly, thinc, tqdm, typer, wasabi, weasel
Required-by: de-core-news-sm, en-core-web-sm, fastai
Name: torchtext
Version: 0.17.0
Summary: Text utilities, models, transforms, and datasets for PyTorch.
Home-page: https://github.com/pytorch/text
Author: PyTorch Text Team
Author-email: packages@pytorch.org
License: BSD
Location: /usr/local/lib/python3.

In [2]:
#Upgrading the required packages
!pip install portalocker --quiet
!pip install spacy --upgrade --quiet
!pip install torchtext --upgrade --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

In [1]:
#Importing the required libraries
import torch
import torch.nn as nn
import torch.optim as optim
import math
import spacy
import os

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import torch.nn.functional as F

In [2]:
#Setting the manual seed and the device
torch.manual_seed(42)  # For reproducibility
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Multi-Head Attention

$$
    \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \ldots, \text{head}_h)W^O \\
    \text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) \\  
    \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

In [3]:
class MultiHeadAttention(nn.Module):
    """The multi-head attention module"""
    def __init__(self, d_model, num_heads):
        super().__init__()

        # Ensure the dimension of the model is divisible by the number of heads.
        # This is necessary to equally divide the embedding dimension across heads.
        assert d_model % num_heads == 0, 'd_model must be divisible by num_heads'

        self.d_model = d_model           # Total dimension of the model
        self.num_heads = num_heads       # Number of attention heads
        self.d_k = d_model // num_heads  # Dimnsion of each head. We assume d_v = d_k

        # Linear transformations for queries, keys, and values
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Final linear layer to project the concatenated heads' outputs back to d_model dimensions
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        ### WRITE YOUR CODE HERE

        # Calculate attention scores with scaling
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))

        # Apply mask (if provided)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e10)

        # Apply softmax to attention scores to get probabilities
        attention_probs = F.softmax(scores, dim=-1)

        # Return the weighted sum of values based on attention probabilities
        output = torch.matmul(attention_probs, V)

        return output

    def split_heads(self, x):
        # Reshape the input tensor to [batch_size, num_heads, seq_length, d_k]
        # to prepare for multi-head attention processing
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        # Inverse operation of split_heads: combine the head outputs back into the original tensor shape
        # [batch_size, seq_length, d_model]
        batch_size, num_heads, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        ### WRITE YOUR CODE HERE

        # Linearly project the queries, keys, and values, and then split them into heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # scaled dot-product attention for each head
        attention_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Concatenate the heads' outputs
        output = self.combine_heads(attention_output)
        output = self.W_o(output)

        return output

### Feed-Forward NN

$$
    \text{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2
$$

In [4]:
class PositionwiseFeedForward(nn.Module):
    """The Positionwise Feedforward Network (FFN) module"""
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        ### WRITE YOUR CODE HERE
        # FFN(x) = max(0, xW1 + b1)W2 + b2

        # First linear transformation
        intermediate_output = self.linear1(x)

        # Using the ReLU activation function
        intermediate_output = self.relu(intermediate_output)

        # Applying dropout
        intermediate_output = self.dropout(intermediate_output)

        # second linear transformation
        output = self.linear2(intermediate_output)

        return output

### Positional Encoding

$$
    \text{PE}(pos, 2i) = \sin(pos/10000^{2i/d_{\text{model}}}) \\
    \text{PE}(pos, 2i + 1) = \cos(pos/10000^{2i/d_{\text{model}}})
$$

In [5]:
class PositionalEncoding(nn.Module):
    """
    Implements the positional encoding module using sinusoidal functions of different frequencies
    for each dimension of the encoding.
    """
    def __init__(self, d_model, max_seq_length):
        super().__init__()

        # Create a positional encoding (PE) matrix with dimensions [max_seq_length, d_model].
        # This matrix will contain the positional encodings for all possible positions up to max_seq_length.
        pe = torch.zeros(max_seq_length, d_model)

        # Generate a tensor of positions (0 to max_seq_length - 1) and reshape it to [max_seq_length, 1].
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        # Compute the division term used in the formulas for sin and cos functions.
        # This term is based on the dimension of the model and the position, ensuring that the wavelengths
        # form a geometric progression from 2π to 10000 * 2π. It uses only even indices for the dimensions.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Apply the sin function to even indices in the PE matrix. These values are determined by
        # multiplying the position by the division term, creating a pattern where each position has
        # a unique sinusoidal encoding.
        pe[:, 0::2] = torch.sin(position * div_term)

        # Apply the cos function to odd indices in the PE matrix, complementing the sin-encoded positions.
        pe[:, 1::2] = torch.cos(position * div_term)

        # Register 'pe' as a buffer within the module. Unlike parameters, buffers are not updated during training.
        # This is crucial because positional encodings are fixed and not subject to training updates.
        # The unsqueeze(0) adds a batch dimension for easier broadcasting with input tensors.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        # Add positional encoding to the input tensor x.
        # x is expected to have dimensions [batch_size, seq_length, d_model].
        # The positional encoding 'pe' is sliced to match the seq_length of 'x', and then added to 'x'.
        # This operation leverages broadcasting to apply the same positional encoding across the batch.
        x = x + self.pe[:, :x.size(1)]
        return x

### Encoder Layer

In [6]:
class EncoderLayer(nn.Module):
    """An encoder layer consists of a multi-head self-attention sublayer and a feed forward sublayer,
       with a dropout, residual connection, and layer normalization after each sub-layer.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        ### WRITE YOUR CODE HERE

        # Multi-head self-attention sublayer
        attn_output = self.self_attn(x, x, x, mask)
        x = x + self.dropout(attn_output)
        x = self.layer_norm1(x)  # layer normalization

        # Feed-forward sublayer
        ff_output = self.feed_forward(x)
        x = x + self.dropout(ff_output)
        x = self.layer_norm2(x)  # layer normalization

        return x

### Decoder Layer

In [7]:
class DecoderLayer(nn.Module):
    """A decoder layer consists of a multi-head self-attention, cross-attention and a feed-forward sublayers,
       with a dropout, residual connection, and layer normalization after each sub-layer.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.layer_norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        ### WRITE YOUR CODE HERE

        # Multi-head self-attention sublayer
        self_attn_output = self.self_attn(x, x, x, tgt_mask)
        x = x + self.dropout(self_attn_output)
        x = self.layer_norm1(x)

        # Multi-head cross-attention sublayer
        cross_attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = x + self.dropout(cross_attn_output)
        x = self.layer_norm2(x)

        # Feed-forward sublayer
        ff_output = self.feed_forward(x)
        x = x + self.dropout(ff_output)
        x = self.layer_norm3(x)

        return x


### The Full Model

In [8]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, N, n_heads, d_ff, max_seq_length, dropout, pad_idx):
        super().__init__()

        # Embedding layers for source and target
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)

        # Positional encoding
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Encoder and Decoder stacks
        self.encoder = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(N)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(N)])

        # Output linear layer
        self.out = nn.Linear(d_model, tgt_vocab_size)

        self.dropout = nn.Dropout(dropout)

        # Initialization
        self.init_weights()
        self.pad_idx = pad_idx

    def init_weights(self):
        """Initialize parameters with Glorot / fan_avg"""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def create_source_mask(self, src):
        """Create masks for both padding tokens and future tokens"""
        # Source padding mask
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, src_len]
        # unsqueeze(1) adds a dimension for the heads of the multi-head attention
        # unsqueeze(2) adds a dimension for the attention scores
        # This mask can be broadcasted across the src_len dimension of the attention scores,
        # effectively masking out specific tokens across all heads and all positions in the sequence.
        return src_mask

    def create_target_mask(self, tgt):
        # Target padding mask
        tgt_pad_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(3)  # [batch_size, 1, tgt_len, 1]
        # unsqueeze(1) adds a dimension for the heads of the multi-head attention
        # unsqueeze(3) adds a dimension for the attention scores
        # The final shape allows the mask to be broadcast across the attention scores, ensuring positions only
        # attend to allowed positions as dictated by the no-peak mask (the preceding positions) and the padding mask.

        # Target no-peak mask
        tgt_len = tgt.size(1)
        tgt_nopeak_mask = torch.tril(torch.ones(tgt_len, tgt_len, device=device)).bool()

        # Combine masks
        tgt_mask = tgt_pad_mask & tgt_nopeak_mask  # [batch_size, 1, tgt_len, tgt_len]
        return tgt_mask

    def encode(self, src):
        """Encodes the source sequence using the Transformer encoder stack.
        """
        src_mask = self.create_source_mask(src)
        src = self.dropout(self.positional_encoding(self.src_embedding(src)))

        # Pass through each layer in the encoder
        for layer in self.encoder:
            src = layer(src, src_mask)
        return src, src_mask

    def decode(self, tgt, memory, src_mask):
        """Decodes the target sequence using the Transformer decoder stack, given the memory from the encoder.
        """
        tgt_mask = self.create_target_mask(tgt)
        tgt = self.dropout(self.positional_encoding(self.tgt_embedding(tgt)))

        # Pass through each layer in the decoder
        for layer in self.decoder:
            tgt = layer(tgt, memory, src_mask, tgt_mask)

        # Output layer
        output = self.out(tgt)
        return output

    def forward(self, src, tgt):
        ### WRITE YOUR CODE HERE

        # Encoding
        memory, source_mask = self.encode(src)

        # Decoding
        output = self.decode(tgt, memory, source_mask)

        return output

In [9]:
# Define the hyperparameters of the model
src_vocab_size = 5000  # Size of source vocabulary
tgt_vocab_size = 5000  # Size of target vocabulary
d_model = 512          # Embedding dimension
N = 6                  # Number of encoder and decoder layers
num_heads = 8          # Number of attention heads
d_ff = 2048            # Dimension of feed forward networks
max_seq_length = 100   # Maximum sequence length
dropout = 0.1          # Dropout rate
pad_idx = 0            # Index of the padding token

model = Transformer(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

# Move the model to the appropriate device (GPU or CPU)
model = model.to(device)
model

Transformer(
  (src_embedding): Embedding(5000, 512)
  (tgt_embedding): Embedding(5000, 512)
  (positional_encoding): PositionalEncoding()
  (encoder): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionwiseFeedForward(
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (relu): ReLU()
      )
      (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (layer_norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decod

### Testing on Random Data

In [11]:
# Generate random sample data
torch.manual_seed(42)

src_data = torch.randint(1, src_vocab_size, (64, max_seq_length)).to(device)  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length)).to(device)  # (batch_size, seq_length)

#### Inference

In [12]:
# Generate the next token using the first token in the first target tensor
model.eval()

memory, src_mask = model.encode(src_data[:1, :])
output = model.decode(tgt_data[:1, :1], memory, src_mask)
y = output.view(-1, tgt_vocab_size).argmax(-1)
y

tensor([990], device='cuda:0')

If your code is correct, you should get tensor([990]).

#### Training

In [13]:
# Train the model for 10 epochs
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.98), eps=1e-9)

model.train()

n_epochs = 10

for epoch in range(n_epochs):
    optimizer.zero_grad()

    # Forward pass
    output = model(src_data, tgt_data[:, :-1])

    # tgt_data is of shape [batch_size, tgt_len]
    # output is of shape [batch_size, tgt_len, tgt_vocab_size]
    output = output.contiguous().view(-1, tgt_vocab_size)
    tgt = tgt_data[:, 1:].contiguous().view(-1)
    loss = criterion(output, tgt)

    loss.backward()
    grad_clip = 1.0
    nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
    print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')

  _torch_pytree._register_pytree_node(


Epoch: 1, Loss: 8.604100227355957
Epoch: 2, Loss: 8.504327774047852
Epoch: 3, Loss: 8.372350692749023
Epoch: 4, Loss: 8.297316551208496
Epoch: 5, Loss: 8.23608112335205
Epoch: 6, Loss: 8.194172859191895
Epoch: 7, Loss: 8.163470268249512
Epoch: 8, Loss: 8.141220092773438
Epoch: 9, Loss: 8.127995491027832
Epoch: 10, Loss: 8.11793327331543


You should see the loss decreasing from around 8.6 to 8.1.

### Machine Translation Example

Now we consider a real-world example using the Multi30k German-English Translation task. This task is much smaller than the WMT task considered in the paper, but it illustrates the whole system. <br>
It is recommended to run this example on Google Colab, or on a machine with a strong GPU.

#### Define Tokenizers

In [14]:
# Load spacy models for tokenization
try:
    spacy_de = spacy.load('de_core_news_sm')
except IOError:
    os.system("python -m spacy download de_core_news_sm")
    spacy_de = spacy.load('de_core_news_sm')

try:
    spacy_en = spacy.load('en_core_web_sm')
except IOError:
    os.system("python -m spacy download en_core_web_sm")
    spacy_en = spacy.load('en_core_web_sm')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

def yield_tokens(data_iter, tokenizer, language):
    for data_sample in data_iter:
        yield tokenizer(data_sample[language])

tokenizer_de = get_tokenizer(tokenize_de)
tokenizer_en = get_tokenizer(tokenize_en)

#### Build Vocabularies

In [15]:
train_data, _, _ = Multi30k(split=('train', 'valid', 'test'))
vocab_src = build_vocab_from_iterator(yield_tokens(train_data, tokenizer_de, 0), specials=['<unk>', '<pad>', '<bos>', '<eos>'])
vocab_tgt = build_vocab_from_iterator(yield_tokens(train_data, tokenizer_en, 1), specials=['<unk>', '<pad>', '<bos>', '<eos>'])

vocab_src.set_default_index(vocab_src['<unk>'])
vocab_tgt.set_default_index(vocab_tgt['<unk>'])



#### Create the Transformer

In [16]:
def initialize_transformer_model(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx):
    """
    Initializes the Transformer model based on the provided hyperparameters.

    Parameters:
    - src_vocab_size (int): Size of source vocabulary
    - tgt_vocab_size (int): Size of target vocabulary
    - d_model (int): Embedding dimension
    - N (int): Number of encoder and decoder layers
    - num_heads (int): Number of attention heads
    - d_ff (int): Dimension of feed forward networks
    - max_seq_length (int): Maximum sequence length
    - dropout (float): Dropout rate
    - pad_idx (int): Padding index in the target vocabulary

    Returns:
    - Transformer: Initialized Transformer model
    """
    model = Transformer(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    return model

In [17]:
# Define the hyperparameters of the model
src_vocab_size = len(vocab_src)  # Size of source vocabulary
tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
d_model = 512  # Embedding dimension
N = 6          # Number of encoder and decoder layers
num_heads = 8  # Number of attention heads
d_ff = 2048    # Dimension of feed forward networks
max_seq_length = 5000 # Maximum sequence length
dropout = 0.1  # Dropout rate
learning_rate = 0.0001 #Learning Rate
batch_size = 128
grad_clip = 1

# Assume pad_idx is the padding index in the target vocabulary
pad_idx = vocab_tgt['<pad>']

default_model = initialize_transformer_model(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

optimizer = optim.Adam(default_model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# Initialize the loss function with CrossEntropyLoss, ignoring the padding index
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

#### Data Processing

In [19]:
def data_process(raw_data_iter):
    data = []
    for raw_src, raw_tgt in raw_data_iter:
        src_tensor = torch.tensor([vocab_src[token] for token in tokenizer_de(raw_src)], dtype=torch.long)
        tgt_tensor = torch.tensor([vocab_tgt[token] for token in tokenizer_en(raw_tgt)], dtype=torch.long)
        data.append((src_tensor, tgt_tensor))
    return data

train_data, valid_data, test_data = Multi30k(split=('train', 'valid', 'test'))
train_data = data_process(train_data)
valid_data = data_process(valid_data)
#test_data = data_process(test_data)
# The test set of Multi30k is corrupted
# See https://discuss.pytorch.org/t/unicodedecodeerror-when-running-test-iterator/192818/3

In [20]:
def generate_batch(data_batch):
    """Processes a batch of source-target pairs by adding start-of-sequence (BOS) and end-of-sequence (EOS) tokens
    to each sequence and padding all sequences to the same length.

    Parameters:
    - data_batch (Iterable[Tuple[Tensor, Tensor]]): A batch of source-target pairs, where each element is a tuple
      containing the source sequence tensor and the target sequence tensor.
    """
    src_batch, tgt_batch = [], []
    src_batch, tgt_batch = [], []

    # Iterate over each source-target pair in the provided batch
    for src_item, tgt_item in data_batch:
        # Prepend the start-of-sequence (BOS) token and append the end-of-sequence (EOS) token to the sequences
        src_batch.append(torch.cat([torch.tensor([vocab_src['<bos>']]), src_item,
                                    torch.tensor([vocab_src['<eos>']])], dim=0))
        tgt_batch.append(torch.cat([torch.tensor([vocab_tgt['<bos>']]), tgt_item,
                                    torch.tensor([vocab_tgt['<eos>']])], dim=0))

    # Pad the sequences in the source batch to ensure they all have the same length.
    # 'batch_first=True' indicates that the batch dimension should come first in the resulting tensor.
    src_batch = pad_sequence(src_batch, padding_value=vocab_src['<pad>'], batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=vocab_tgt['<pad>'], batch_first=True)
    return src_batch, tgt_batch


In [21]:
# DataLoader for the training data, using the generate_batch function as the collate_fn.
# This allows custom processing of each batch (adding BOS/EOS tokens and padding) before being fed into the model.
train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

# Similarly, DataLoader for the validation data
valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

In [22]:
def train(model, iterator, optimizer, criterion, grad_clip):
    """
    Trains the model for one epoch over the given dataset.
    This function iterates over the provided data iterator, performing the forward and backward passes for each batch.
    It employs teacher forcing by feeding the shifted target sequence (excluding the last token) as input to the decoder.

    Parameters:
    - model (torch.nn.Module): The model to be trained.
    - iterator (Iterable): An iterable object that returns batches of data.
    - optimizer (torch.optim.Optimizer): The optimizer to use for updating the model parameters.
    - criterion (Callable): The loss function used to compute the difference between the model's predictions and the actual targets.
    - grad_clip (float): The maximum norm of the gradients for gradient clipping.

    Returns:
    - float: The average loss for the epoch, computed as the total loss over all batches divided by the number of batches in the iterator.
    """
    # Set the model to training mode.
    # This enables dropout, layer normalization etc., which behave differently during training.
    model.train()

    epoch_loss = 0

    # Enumerate over the data iterator to get batches
    for i, batch in enumerate(iterator):
        # Unpack the batch to get source (src) and target (tgt) sequences
        src, tgt = batch
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()

        # Forward pass through the model.
        # For seq2seq models, the decoder input (tgt[:, :-1]) excludes the last token, implementing teacher forcing.
        output = model(src, tgt[:, :-1])

        # Reshape the output and target tensors to compute loss.
        # The output tensor is reshaped to a 2D tensor where rows correspond to each token in the batch and columns to vocabulary size.

        # tgt is of shape [batch_size, tgt_len]
        # output is of shape [batch_size, tgt_len, tgt_vocab_size]
        output = output.contiguous().view(-1, tgt_vocab_size)

        # The target tensor is reshaped to a 1D tensor, excluding the first token (BOS) from each sequence.
        tgt = tgt[:, 1:].contiguous().view(-1)

        # Compute loss, perform backpropagation, and update model parameters
        loss = criterion(output, tgt)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        epoch_loss += loss.item()

    # Compute average loss per batch for the current epoch
    return epoch_loss / len(iterator)

In [23]:
def evaluate(model, iterator, criterion):
    """
    Evaluates the model's performance on a given dataset.
    This function is similar to the training loop, but without the backward pass and parameter updates. I
    """
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src, tgt = batch
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            tgt = tgt[:, 1:].contiguous().view(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

#### Training the Model

In [24]:
n_epochs = 20
print("Training the default model\n")
for epoch in range(n_epochs):
    train_loss = train(default_model, train_iterator, optimizer, criterion, grad_clip)
    val_loss = evaluate(default_model, valid_iterator, criterion)

    print(f'\nEpoch: {epoch + 1}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {val_loss:.3f}')

Training the default model


Epoch: 1
	Train Loss: 5.692
	Val Loss: 4.985

Epoch: 2
	Train Loss: 4.880
	Val Loss: 4.793

Epoch: 3
	Train Loss: 4.697
	Val Loss: 4.628

Epoch: 4
	Train Loss: 4.426
	Val Loss: 4.245

Epoch: 5
	Train Loss: 4.102
	Val Loss: 4.012

Epoch: 6
	Train Loss: 3.896
	Val Loss: 3.869

Epoch: 7
	Train Loss: 3.759
	Val Loss: 3.783

Epoch: 8
	Train Loss: 3.637
	Val Loss: 3.676

Epoch: 9
	Train Loss: 3.529
	Val Loss: 3.600

Epoch: 10
	Train Loss: 3.450
	Val Loss: 3.555

Epoch: 11
	Train Loss: 3.379
	Val Loss: 3.505

Epoch: 12
	Train Loss: 3.317
	Val Loss: 3.484

Epoch: 13
	Train Loss: 3.259
	Val Loss: 3.430

Epoch: 14
	Train Loss: 3.198
	Val Loss: 3.390

Epoch: 15
	Train Loss: 3.137
	Val Loss: 3.331

Epoch: 16
	Train Loss: 3.081
	Val Loss: 3.302

Epoch: 17
	Train Loss: 3.025
	Val Loss: 3.268

Epoch: 18
	Train Loss: 2.972
	Val Loss: 3.238

Epoch: 19
	Train Loss: 2.922
	Val Loss: 3.197

Epoch: 20
	Train Loss: 2.876
	Val Loss: 3.175


The train loss should decrease from around 5.7 to 2.8 after 20 epochs.

Saving the model

In [74]:
def save_model(path, model, optimizer, epoch, train_loss, val_loss, src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx):
    # Save the model's state dictionary, optimizer state, and other necessary information
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_loss,
        'val_loss': val_loss,
        'src_vocab_size': src_vocab_size,
        'tgt_vocab_size': tgt_vocab_size,
        'd_model': d_model,
        'N': N,
        'num_heads': num_heads,
        'd_ff': d_ff,
        'max_seq_length': max_seq_length,
        'dropout': dropout,
        'pad_idx': pad_idx
    }, path)

    print(f'Model saved to {path}')

In [25]:
# Define the path where you want to save the model
save_path = '/content/sample_data/model_default.pth'

In [26]:
save_model(save_path, default_model, optimizer, n_epochs, train_loss, val_loss, src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

Model saved to /content/sample_data/model_default.pth


In [27]:
def load_model(path):
    # Load the checkpoint
    loaded_checkpoint = torch.load(path)

    # Create a new instance of the model using the parameters from the checkpoint
    loaded_model = Transformer(
    loaded_checkpoint['src_vocab_size'],
    loaded_checkpoint['tgt_vocab_size'],
    loaded_checkpoint['d_model'],
    loaded_checkpoint['N'],
    loaded_checkpoint['num_heads'],
    loaded_checkpoint['d_ff'],
    loaded_checkpoint['max_seq_length'],
    loaded_checkpoint['dropout'],
    loaded_checkpoint['pad_idx'])

    # Load the state dictionaries into the model and optimizer
    loaded_model.load_state_dict(loaded_checkpoint['model_state_dict'])
    optimizer.load_state_dict(loaded_checkpoint['optimizer_state_dict'])

    #Moving loaded model to device
    loaded_model = loaded_model.to(device)

    return loaded_model


In [35]:
load_path = "/content/sample_data/model_default.pth"

In [36]:
default_model = load_model(load_path)

#### Translating a Sample Sentence

In [25]:
def translate_sentence(model, sentence, vocab_src, vocab_tgt, max_length=50):
    """
    Translates a given source sentence into the target language using a trained Transformer model.
    The function preprocesses the input sentence by tokenizing and converting it to tensor format, then uses the model's
    encode and decode methods to generate the translated sentence. The translation process is performed token by token
    using greedy decoding, selecting the most likely next token at each step until an <eos> token is produced or the
    maximum length is reached.

    Parameters:
    - model (torch.nn.Module): The trained Transformer model.
    - sentence (str): The source sentence to translate.
    - vocab_src (dict): The source vocabulary mapping of tokens to indices. It should include special tokens such as
      '<bos>' (beginning of sentence) and '<eos>' (end of sentence).
    - vocab_tgt (dict): The target vocabulary mapping of indices to tokens. It should provide a method `lookup_token`
      to convert token indices back to the string representation.
    - max_length (int, optional): The maximum allowed length for the generated translation. The decoding process will
      stop when this length is reached if an <eos> token has not yet been generated.

    Returns:
    - str: The translated sentence as a string of text in the target language.
    """
    ### WRITE YOUR CODE HERE

    # Tokenize and convert source sentence to tensor
    src_tensor = torch.tensor([vocab_src[token] for token in tokenizer_de(sentence)], dtype=torch.long)

    # Move tensor to device
    src_tensor = src_tensor.to(device)

    # Encoding the source sentence
    with torch.no_grad():
        encoder_output, src_mask = model.encode(src_tensor.unsqueeze(0))


    tgt_tokens = []

    # Input to begin with BOS token
    input_token = torch.tensor([vocab_tgt['<bos>']], dtype=torch.long).to(device)

    # decoding using greedy approach
    for _ in range(max_length):
        with torch.no_grad():
            output = model.decode(input_token.unsqueeze(0), encoder_output, src_mask)
            next_token = output.argmax(dim=-1)[:, -1].item()

        tgt_tokens.append(next_token)

        # Stop decoding if EOS token is generated
        if next_token == vocab_tgt['<eos>']:
            break

        # Prepare the next input token
        input_token = torch.tensor([next_token], dtype=torch.long).to(device)

    translated_sentence = ' '.join([vocab_tgt.lookup_token(token) for token in tgt_tokens])

    return translated_sentence

In [26]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = translate_sentence(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A little person in field A little person in field A little person in field A little person in field A little person in field A little person in field A little person in field A little person in field A little person in field A little person in field


In [27]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = translate_sentence(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: People in race People in race People in race People in race People in race People in race People in race People in race People in race People in race People in race People in race People in race People in race People in race People in race People in


In [28]:
src_sentence = "Ich liebe es, Zeit in der Natur zu verbringen, umgeben von Bäumen und Blumen."  # German for "I love spending time in nature, surrounded by trees and flowers."
translated_sentence = translate_sentence(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike People bike


In [29]:
src_sentence = "Können Sie ein gutes Restaurant im Stadtzentrum empfehlen."  # German for "Can you recommend a good restaurant in the city center."
translated_sentence = translate_sentence(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The


In [30]:
src_sentence = "Wir planen eine Reise nach Deutschland nächsten Sommer."  # German for "We are planning a trip to Germany next summer."
translated_sentence = translate_sentence(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The


In [31]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = translate_sentence(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The


In [32]:
src_sentence = "Guten Tag."  # German for "Good morning."
translated_sentence = translate_sentence(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: Children <eos>


You should get a translation similar to the reference after 20 epochs of training.

##Translate sentence using Beam Search

In [33]:
def beam_search(model, sentence, vocab_src, vocab_tgt, max_length=50, beam_size=5):
    # Tokenize and convert source sentence to tensor
    src_tensor = torch.tensor([vocab_src[token] for token in tokenizer_de(sentence)], dtype=torch.long)

    # Move tensor to device
    src_tensor = src_tensor.to(device)

    # Encode the source sentence
    with torch.no_grad():
        encoder_output, src_mask = model.encode(src_tensor.unsqueeze(0))

    # Initialise with <bos>
    beam = [(torch.tensor([vocab_tgt['<bos>']], dtype=torch.long).to(device), 0.0)]

    # Perform beam search
    for _ in range(max_length):
        candidates = []

        for seq, score in beam:
            output = model.decode(seq.unsqueeze(0), encoder_output, src_mask)

            # Get the top beam_size candidates for the next token
            top_candidates = torch.topk(output[:, -1, :], k=beam_size, dim=-1)

            for i in range(beam_size):
                next_token = top_candidates.indices[0, i].item()
                next_score = top_candidates.values[0, i].item()

                # Add the candidate sequence and its score to the list
                candidates.append((torch.cat([seq, torch.tensor([next_token], dtype=torch.long).to(device)]), score + next_score))

        # Select the top beam_size candidates based on their accumulated scores
        def get_score(candidate):
          return candidate[1]

        # Select the top beam_size candidates based on their accumulated scores
        beam = sorted(candidates, key=get_score, reverse=True)[:beam_size]

        # Check if any candidate has an EOS token
        for seq, score in beam:
            if seq[-1].item() == vocab_tgt['<eos>']:
                return ' '.join([vocab_tgt.lookup_token(token.item()) for token in seq])

    # If max_length is reached and no EOS token is generated, return the best candidate
    return ' '.join([vocab_tgt.lookup_token(token.item()) for token, _ in beam[0]])


In [34]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = beam_search(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> A person is playing in the water . <eos>


In [35]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = beam_search(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> Several people are sitting in the water . <eos>


In [36]:
src_sentence = "Ich liebe es, Zeit in der Natur zu verbringen, umgeben von Bäumen und Blumen."  # German for "I love spending time in nature, surrounded by trees and flowers."
translated_sentence = beam_search(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> People are walking down the street in a city . <eos>


In [37]:
src_sentence = "Wir planen eine Reise nach Deutschland nächsten Sommer."  # German for "We are planning a trip to Germany next summer."
translated_sentence = beam_search(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> A group of people are walking down the street . <eos>


In [38]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = beam_search(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> A group of people are walking down the street . <eos>


In [39]:
src_sentence = "Guten Tag."  # German for "Good Morning."
translated_sentence = beam_search(default_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> Three people are playing soccer . <eos>


##Experimenting with hyperparameters

##1. Increasing no of epochs first from 20 to 30

In [40]:
# Define the hyperparameters of the model
src_vocab_size = len(vocab_src)  # Size of source vocabulary
tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
d_model = 512  # Embedding dimension
N = 6          # Number of encoder and decoder layers
num_heads = 8  # Number of attention heads
d_ff = 2048    # Dimension of feed forward networks
max_seq_length = 5000 # Maximum sequence length
dropout = 0.1  # Dropout rate
learning_rate = 0.0001 #Learning Rate
batch_size = 128
grad_clip = 1

# Assume pad_idx is the padding index in the target vocabulary
pad_idx = vocab_tgt['<pad>']

epochs_model = initialize_transformer_model(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

optimizer = optim.Adam(epochs_model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# Initialize the loss function with CrossEntropyLoss, ignoring the padding index
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [41]:
# DataLoader for the training data, using the generate_batch function as the collate_fn.
# This allows custom processing of each batch (adding BOS/EOS tokens and padding) before being fed into the model.
train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

# Similarly, DataLoader for the validation data
valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

In [42]:
n_epochs = 30
print("Training the default model for more epochs\n")
for epoch in range(n_epochs):
    train_loss = train(epochs_model, train_iterator, optimizer, criterion, grad_clip)
    val_loss = evaluate(epochs_model, valid_iterator, criterion)

    print(f'\nEpoch: {epoch + 1}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {val_loss:.3f}')

Training the default model for more epochs


Epoch: 1
	Train Loss: 5.715
	Val Loss: 5.022

Epoch: 2
	Train Loss: 4.881
	Val Loss: 4.781

Epoch: 3
	Train Loss: 4.695
	Val Loss: 4.621

Epoch: 4
	Train Loss: 4.437
	Val Loss: 4.261

Epoch: 5
	Train Loss: 4.113
	Val Loss: 4.052

Epoch: 6
	Train Loss: 3.925
	Val Loss: 3.897

Epoch: 7
	Train Loss: 3.790
	Val Loss: 3.805

Epoch: 8
	Train Loss: 3.664
	Val Loss: 3.709

Epoch: 9
	Train Loss: 3.561
	Val Loss: 3.635

Epoch: 10
	Train Loss: 3.474
	Val Loss: 3.581

Epoch: 11
	Train Loss: 3.396
	Val Loss: 3.515

Epoch: 12
	Train Loss: 3.329
	Val Loss: 3.483

Epoch: 13
	Train Loss: 3.264
	Val Loss: 3.426

Epoch: 14
	Train Loss: 3.198
	Val Loss: 3.392

Epoch: 15
	Train Loss: 3.138
	Val Loss: 3.339

Epoch: 16
	Train Loss: 3.083
	Val Loss: 3.304

Epoch: 17
	Train Loss: 3.027
	Val Loss: 3.288

Epoch: 18
	Train Loss: 2.971
	Val Loss: 3.233

Epoch: 19
	Train Loss: 2.920
	Val Loss: 3.197

Epoch: 20
	Train Loss: 2.867
	Val Loss: 3.180

Epoch: 21
	Train Loss: 2

In [43]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = translate_sentence(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A African A African A African A African A African A African A African A African A African A African A African A African A African A African A African A African A African A African A African A African A African A African A African A African A African


In [44]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = beam_search(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> The boy is playing on the beach . <eos>


In [46]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = translate_sentence(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several People Several


In [47]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = beam_search(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> A group of three men are walking . <eos>


In [49]:
src_sentence = "Ich liebe es, Zeit in der Natur zu verbringen, umgeben von Bäumen und Blumen."  # German for "I love spending time in nature, surrounded by trees and flowers."
translated_sentence = translate_sentence(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People


In [48]:
src_sentence = "Ich liebe es, Zeit in der Natur zu verbringen, umgeben von Bäumen und Blumen."  # German for "I love spending time in nature, surrounded by trees and flowers."
translated_sentence = beam_search(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> Construction players in the middle of a football game in a game . <eos>


In [50]:
src_sentence = "Wir planen eine Reise nach Deutschland nächsten Sommer."  # German for "We are planning a trip to Germany next summer."
translated_sentence = translate_sentence(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: Workers a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a


In [52]:
src_sentence = "Wir planen eine Reise nach Deutschland nächsten Sommer."  # German for "We are planning a trip to Germany next summer."
translated_sentence = beam_search(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> Construction workers are working on a road . <eos>


In [53]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = translate_sentence(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: Workers a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a


In [54]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = beam_search(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> Construction workers are working on a bench . <eos>


In [57]:
src_sentence = "Guten Tag."  # German for "Good Morning."
translated_sentence = translate_sentence(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <eos>


In [55]:
src_sentence = "Guten Tag."  # German for "Good Morning."
translated_sentence = beam_search(epochs_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> <eos>


##2. Changing batch size from 128 to 64

In [58]:
# Define the hyperparameters of the model
src_vocab_size = len(vocab_src)  # Size of source vocabulary
tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
d_model = 512  # Embedding dimension
N = 6          # Number of encoder and decoder layers
num_heads = 8  # Number of attention heads
d_ff = 2048    # Dimension of feed forward networks
max_seq_length = 5000 # Maximum sequence length
dropout = 0.1  # Dropout rate
learning_rate = 0.0001 #Learning Rate
batch_size = 64
grad_clip = 1

# Assume pad_idx is the padding index in the target vocabulary
pad_idx = vocab_tgt['<pad>']

batch_64_model = initialize_transformer_model(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

optimizer = optim.Adam(batch_64_model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# Initialize the loss function with CrossEntropyLoss, ignoring the padding index
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [59]:
# DataLoader for the training data, using the generate_batch function as the collate_fn.
# This allows custom processing of each batch (adding BOS/EOS tokens and padding) before being fed into the model.
train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

# Similarly, DataLoader for the validation data
valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

In [60]:
n_epochs = 20
print("Training the model with a batch size of 64")
for epoch in range(n_epochs):
    train_loss = train(batch_64_model, train_iterator, optimizer, criterion, grad_clip)
    val_loss = evaluate(batch_64_model, valid_iterator, criterion)

    print(f'\nEpoch: {epoch + 1}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {val_loss:.3f}')

Training the model with a batch size of 64

Epoch: 1
	Train Loss: 5.367
	Val Loss: 4.980

Epoch: 2
	Train Loss: 4.743
	Val Loss: 4.675

Epoch: 3
	Train Loss: 4.371
	Val Loss: 4.173

Epoch: 4
	Train Loss: 3.986
	Val Loss: 3.949

Epoch: 5
	Train Loss: 3.788
	Val Loss: 3.778

Epoch: 6
	Train Loss: 3.656
	Val Loss: 3.677

Epoch: 7
	Train Loss: 3.556
	Val Loss: 3.597

Epoch: 8
	Train Loss: 3.459
	Val Loss: 3.525

Epoch: 9
	Train Loss: 3.379
	Val Loss: 3.477

Epoch: 10
	Train Loss: 3.315
	Val Loss: 3.432

Epoch: 11
	Train Loss: 3.254
	Val Loss: 3.405

Epoch: 12
	Train Loss: 3.196
	Val Loss: 3.342

Epoch: 13
	Train Loss: 3.135
	Val Loss: 3.313

Epoch: 14
	Train Loss: 3.079
	Val Loss: 3.275

Epoch: 15
	Train Loss: 3.023
	Val Loss: 3.215

Epoch: 16
	Train Loss: 2.970
	Val Loss: 3.178

Epoch: 17
	Train Loss: 2.923
	Val Loss: 3.143

Epoch: 18
	Train Loss: 2.875
	Val Loss: 3.134

Epoch: 19
	Train Loss: 2.827
	Val Loss: 3.100

Epoch: 20
	Train Loss: 2.781
	Val Loss: 3.087


In [61]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = translate_sentence(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy boy


In [62]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = beam_search(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> A young boy playing in the water . <eos>


In [63]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = translate_sentence(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: People and construction People and construction People and construction People and construction People and construction People and construction People and construction People and construction People and construction People and construction People and construction People and construction People and construction People and construction People and construction People and construction People and


In [64]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = beam_search(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> A group of men are standing in front of a building <eos>


In [65]:
src_sentence = "Ich liebe es, Zeit in der Natur zu verbringen, umgeben von Bäumen und Blumen."  # German for "I love spending time in nature, surrounded by trees and flowers."
translated_sentence = translate_sentence(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: The People this People this People this People this People this People this People this People this People this People this People this People this People this People this People this People this People this People this People this People this People this People this People this People this People


In [66]:
src_sentence = "Ich liebe es, Zeit in der Natur zu verbringen, umgeben von Bäumen und Blumen."  # German for "I love spending time in nature, surrounded by trees and flowers."
translated_sentence = beam_search(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> People are walking down a city street in front of a building . <eos>


In [67]:
src_sentence = "Wir planen eine Reise nach Deutschland nächsten Sommer."  # German for "We are planning a trip to Germany next summer."
translated_sentence = translate_sentence(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children The Children


In [68]:
src_sentence = "Wir planen eine Reise nach Deutschland nächsten Sommer."  # German for "We are planning a trip to Germany next summer."
translated_sentence = beam_search(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> Children are walking down the street . <eos>


In [69]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = translate_sentence(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction The Construction


In [70]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = beam_search(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> Children are walking down a street . <eos>


In [71]:
src_sentence = "Guten Tag."  # German for "Good Morning."
translated_sentence = translate_sentence(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three Three


In [72]:
src_sentence = "Guten Tag."  # German for "Good Morning."
translated_sentence = beam_search(batch_64_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> <eos>


In [75]:
save_path = '/content/sample_data/model_batch_size64.pth'
save_model(save_path, batch_64_model, optimizer, n_epochs, train_loss, val_loss, src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

Model saved to /content/sample_data/model_batch_size64.pth


##2.1 Increasing batch size to 256

In [76]:
# Define the hyperparameters of the model
src_vocab_size = len(vocab_src)  # Size of source vocabulary
tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
d_model = 512  # Embedding dimension
N = 6          # Number of encoder and decoder layers
num_heads = 8  # Number of attention heads
d_ff = 2048    # Dimension of feed forward networks
max_seq_length = 5000 # Maximum sequence length
dropout = 0.1  # Dropout rate
learning_rate = 0.0001 #Learning Rate
batch_size = 256
grad_clip = 1

# Assume pad_idx is the padding index in the target vocabulary
pad_idx = vocab_tgt['<pad>']

batch_256_model = initialize_transformer_model(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

optimizer = optim.Adam(batch_256_model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# Initialize the loss function with CrossEntropyLoss, ignoring the padding index
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [77]:
# DataLoader for the training data, using the generate_batch function as the collate_fn.
# This allows custom processing of each batch (adding BOS/EOS tokens and padding) before being fed into the model.
train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

# Similarly, DataLoader for the validation data
valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

In [78]:
n_epochs = 20
print("Training the model for batch size 256")
for epoch in range(n_epochs):
    train_loss = train(batch_256_model, train_iterator, optimizer, criterion, grad_clip)
    val_loss = evaluate(batch_256_model, valid_iterator, criterion)

    print(f'\nEpoch: {epoch + 1}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {val_loss:.3f}')

Training the model for batch size 256

Epoch: 1
	Train Loss: 6.277
	Val Loss: 5.155

Epoch: 2
	Train Loss: 5.064
	Val Loss: 4.966

Epoch: 3
	Train Loss: 4.892
	Val Loss: 4.820

Epoch: 4
	Train Loss: 4.766
	Val Loss: 4.723

Epoch: 5
	Train Loss: 4.638
	Val Loss: 4.569

Epoch: 6
	Train Loss: 4.435
	Val Loss: 4.288

Epoch: 7
	Train Loss: 4.198
	Val Loss: 4.122

Epoch: 8
	Train Loss: 4.040
	Val Loss: 4.034

Epoch: 9
	Train Loss: 3.904
	Val Loss: 3.897

Epoch: 10
	Train Loss: 3.795
	Val Loss: 3.819

Epoch: 11
	Train Loss: 3.694
	Val Loss: 3.753

Epoch: 12
	Train Loss: 3.609
	Val Loss: 3.674

Epoch: 13
	Train Loss: 3.539
	Val Loss: 3.632

Epoch: 14
	Train Loss: 3.470
	Val Loss: 3.580

Epoch: 15
	Train Loss: 3.406
	Val Loss: 3.540

Epoch: 16
	Train Loss: 3.344
	Val Loss: 3.490

Epoch: 17
	Train Loss: 3.286
	Val Loss: 3.471

Epoch: 18
	Train Loss: 3.232
	Val Loss: 3.419

Epoch: 19
	Train Loss: 3.181
	Val Loss: 3.379

Epoch: 20
	Train Loss: 3.133
	Val Loss: 3.355


In [79]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = translate_sentence(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A


In [80]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = beam_search(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> The little girl is playing in the snow . <eos>


In [81]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = translate_sentence(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People


In [82]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = beam_search(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> People are walking on the street . <eos>


In [83]:
src_sentence = "Ich liebe es, Zeit in der Natur zu verbringen, umgeben von Bäumen und Blumen."  # German for "I love spending time in nature, surrounded by trees and flowers."
translated_sentence = translate_sentence(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People


In [84]:
src_sentence = "Ich liebe es, Zeit in der Natur zu verbringen, umgeben von Bäumen und Blumen."  # German for "I love spending time in nature, surrounded by trees and flowers."
translated_sentence = beam_search(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> A large group of people are walking down a city street in the background . <eos>


In [85]:
src_sentence = "Wir planen eine Reise nach Deutschland nächsten Sommer."  # German for "We are planning a trip to Germany next summer."
translated_sentence = translate_sentence(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People


In [86]:
src_sentence = "Wir planen eine Reise nach Deutschland nächsten Sommer."  # German for "We are planning a trip to Germany next summer."
translated_sentence = beam_search(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> A group of people are walking down the street . <eos>


In [87]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = translate_sentence(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People A People


In [88]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = beam_search(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> A group of young men are sitting on a street . <eos>


In [89]:
src_sentence = "Guten Tag."  # German for "Good Morning."
translated_sentence = translate_sentence(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two


In [90]:
src_sentence = "Guten Tag."  # German for "Good Morning."
translated_sentence = beam_search(batch_256_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: <bos> <eos>


In [91]:
save_path = '/content/sample_data/model_batch_size256.pth'
save_model(save_path, batch_64_model, optimizer, n_epochs, train_loss, val_loss, src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

Model saved to /content/sample_data/model_batch_size256.pth


## 3. Changing learning rate to 0.0005

In [112]:
# Define the hyperparameters of the model
src_vocab_size = len(vocab_src)  # Size of source vocabulary
tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
d_model = 512  # Embedding dimension
N = 6          # Number of encoder and decoder layers
num_heads = 8  # Number of attention heads
d_ff = 2048    # Dimension of feed forward networks
max_seq_length = 5000 # Maximum sequence length
dropout = 0.1  # Dropout rate
learning_rate = 0.0005 #Learning Rate
batch_size = 128
grad_clip = 1

# Assume pad_idx is the padding index in the target vocabulary
pad_idx = vocab_tgt['<pad>']

lr_1_model = initialize_transformer_model(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

optimizer = optim.Adam(lr_1_model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# Initialize the loss function with CrossEntropyLoss, ignoring the padding index
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [113]:
# DataLoader for the training data, using the generate_batch function as the collate_fn.
# This allows custom processing of each batch (adding BOS/EOS tokens and padding) before being fed into the model.
train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

# Similarly, DataLoader for the validation data
valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

In [114]:
n_epochs = 20
print("Training the model for learning rate: 0.0005")
for epoch in range(n_epochs):
    train_loss = train(lr_1_model, train_iterator, optimizer, criterion, grad_clip)
    val_loss = evaluate(lr_1_model, valid_iterator, criterion)

    print(f'\nEpoch: {epoch + 1}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {val_loss:.3f}')

Training the model for learning rate: 0.0005

Epoch: 1
	Train Loss: 5.758
	Val Loss: 7.390

Epoch: 2
	Train Loss: 5.283
	Val Loss: 12.580

Epoch: 3
	Train Loss: 5.198
	Val Loss: 12.137

Epoch: 4
	Train Loss: 5.175
	Val Loss: 12.135

Epoch: 5
	Train Loss: 5.143
	Val Loss: 12.101

Epoch: 6
	Train Loss: 5.139
	Val Loss: 12.218

Epoch: 7
	Train Loss: 5.111
	Val Loss: 11.185

Epoch: 8
	Train Loss: 5.101
	Val Loss: 11.893

Epoch: 9
	Train Loss: 5.088
	Val Loss: 11.418

Epoch: 10
	Train Loss: 5.079
	Val Loss: 11.628

Epoch: 11
	Train Loss: 5.072
	Val Loss: 10.750

Epoch: 12
	Train Loss: 5.065
	Val Loss: 10.850

Epoch: 13
	Train Loss: 5.060
	Val Loss: 11.144

Epoch: 14
	Train Loss: 5.057
	Val Loss: 11.898

Epoch: 15
	Train Loss: 5.050
	Val Loss: 12.287

Epoch: 16
	Train Loss: 5.041
	Val Loss: 11.730

Epoch: 17
	Train Loss: 5.027
	Val Loss: 12.418

Epoch: 18
	Train Loss: 4.998
	Val Loss: 12.215

Epoch: 19
	Train Loss: 4.915
	Val Loss: 11.197

Epoch: 20
	Train Loss: 4.855
	Val Loss: 11.701


In [115]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = translate_sentence(lr_1_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A


In [116]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = translate_sentence(lr_1_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A


In [117]:
src_sentence = "Ich liebe es, Zeit in der Natur zu verbringen, umgeben von Bäumen und Blumen."  # German for "I love spending time in nature, surrounded by trees and flowers."
translated_sentence = translate_sentence(lr_1_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A


In [119]:
src_sentence = "Wir planen eine Reise nach Deutschland nächsten Sommer."  # German for "We are planning a trip to Germany next summer."
translated_sentence = translate_sentence(lr_1_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A


In [120]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = translate_sentence(lr_1_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A


In [121]:
src_sentence = "Guten Tag."  # German for "Good Morning."
translated_sentence = translate_sentence(lr_1_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People


## 4. Changing dropout to 0.2

In [122]:
# Define the hyperparameters of the model
src_vocab_size = len(vocab_src)  # Size of source vocabulary
tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
d_model = 512  # Embedding dimension
N = 6          # Number of encoder and decoder layers
num_heads = 8  # Number of attention heads
d_ff = 2048    # Dimension of feed forward networks
max_seq_length = 5000 # Maximum sequence length
dropout = 0.2  # Dropout rate
learning_rate = 0.0001 #Learning Rate
batch_size = 128
grad_clip = 1

# Assume pad_idx is the padding index in the target vocabulary
pad_idx = vocab_tgt['<pad>']

dropout_model = initialize_transformer_model(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

optimizer = optim.Adam(dropout_model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# Initialize the loss function with CrossEntropyLoss, ignoring the padding index
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [123]:
# DataLoader for the training data, using the generate_batch function as the collate_fn.
# This allows custom processing of each batch (adding BOS/EOS tokens and padding) before being fed into the model.
train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

# Similarly, DataLoader for the validation data
valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

In [124]:
n_epochs = 20
print("Training the model for dropout rate: 0.2")
for epoch in range(n_epochs):
    train_loss = train(dropout_model, train_iterator, optimizer, criterion, grad_clip)
    val_loss = evaluate(dropout_model, valid_iterator, criterion)

    print(f'\nEpoch: {epoch + 1}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {val_loss:.3f}')

Training the model for dropout rate: 0.2

Epoch: 1
	Train Loss: 5.773
	Val Loss: 5.097

Epoch: 2
	Train Loss: 4.955
	Val Loss: 4.838

Epoch: 3
	Train Loss: 4.758
	Val Loss: 4.729

Epoch: 4
	Train Loss: 4.585
	Val Loss: 4.480

Epoch: 5
	Train Loss: 4.301
	Val Loss: 4.182

Epoch: 6
	Train Loss: 4.050
	Val Loss: 3.992

Epoch: 7
	Train Loss: 3.898
	Val Loss: 3.890

Epoch: 8
	Train Loss: 3.785
	Val Loss: 3.834

Epoch: 9
	Train Loss: 3.690
	Val Loss: 3.745

Epoch: 10
	Train Loss: 3.599
	Val Loss: 3.688

Epoch: 11
	Train Loss: 3.528
	Val Loss: 3.655

Epoch: 12
	Train Loss: 3.468
	Val Loss: 3.581

Epoch: 13
	Train Loss: 3.413
	Val Loss: 3.544

Epoch: 14
	Train Loss: 3.362
	Val Loss: 3.537

Epoch: 15
	Train Loss: 3.317
	Val Loss: 3.489

Epoch: 16
	Train Loss: 3.272
	Val Loss: 3.462

Epoch: 17
	Train Loss: 3.228
	Val Loss: 3.423

Epoch: 18
	Train Loss: 3.186
	Val Loss: 3.398

Epoch: 19
	Train Loss: 3.145
	Val Loss: 3.360

Epoch: 20
	Train Loss: 3.105
	Val Loss: 3.356


In [125]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = translate_sentence(dropout_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy A boy


In [126]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = translate_sentence(dropout_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People


In [127]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = translate_sentence(dropout_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People


In [128]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = translate_sentence(dropout_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People People


In [129]:
src_sentence = "Guten Tag."  # German for "Good Morning."
translated_sentence = translate_sentence(dropout_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog A dog


## 5. Changing no of heads to 4

In [130]:
# Define the hyperparameters of the model
src_vocab_size = len(vocab_src)  # Size of source vocabulary
tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
d_model = 512  # Embedding dimension
N = 6          # Number of encoder and decoder layers
num_heads = 4  # Number of attention heads
d_ff = 2048    # Dimension of feed forward networks
max_seq_length = 5000 # Maximum sequence length
dropout = 0.1  # Dropout rate
learning_rate = 0.0001 #Learning Rate
batch_size = 128
grad_clip = 1

# Assume pad_idx is the padding index in the target vocabulary
pad_idx = vocab_tgt['<pad>']

head_model = initialize_transformer_model(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

optimizer = optim.Adam(head_model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# Initialize the loss function with CrossEntropyLoss, ignoring the padding index
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [131]:
# DataLoader for the training data, using the generate_batch function as the collate_fn.
# This allows custom processing of each batch (adding BOS/EOS tokens and padding) before being fed into the model.
train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

# Similarly, DataLoader for the validation data
valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

In [132]:
n_epochs = 20
print("Training the model for dropout rate: 0.2")
for epoch in range(n_epochs):
    train_loss = train(head_model, train_iterator, optimizer, criterion, grad_clip)
    val_loss = evaluate(head_model, valid_iterator, criterion)

    print(f'\nEpoch: {epoch + 1}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {val_loss:.3f}')

Training the model for dropout rate: 0.2

Epoch: 1
	Train Loss: 5.713
	Val Loss: 4.991

Epoch: 2
	Train Loss: 4.863
	Val Loss: 4.800

Epoch: 3
	Train Loss: 4.723
	Val Loss: 4.702

Epoch: 4
	Train Loss: 4.532
	Val Loss: 4.388

Epoch: 5
	Train Loss: 4.206
	Val Loss: 4.118

Epoch: 6
	Train Loss: 3.983
	Val Loss: 3.955

Epoch: 7
	Train Loss: 3.832
	Val Loss: 3.885

Epoch: 8
	Train Loss: 3.715
	Val Loss: 3.751

Epoch: 9
	Train Loss: 3.617
	Val Loss: 3.681

Epoch: 10
	Train Loss: 3.524
	Val Loss: 3.605

Epoch: 11
	Train Loss: 3.434
	Val Loss: 3.560

Epoch: 12
	Train Loss: 3.353
	Val Loss: 3.487

Epoch: 13
	Train Loss: 3.274
	Val Loss: 3.432

Epoch: 14
	Train Loss: 3.204
	Val Loss: 3.376

Epoch: 15
	Train Loss: 3.141
	Val Loss: 3.343

Epoch: 16
	Train Loss: 3.080
	Val Loss: 3.304

Epoch: 17
	Train Loss: 3.023
	Val Loss: 3.250

Epoch: 18
	Train Loss: 2.965
	Val Loss: 3.213

Epoch: 19
	Train Loss: 2.909
	Val Loss: 3.176

Epoch: 20
	Train Loss: 2.855
	Val Loss: 3.124


In [133]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = translate_sentence(head_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A child A child A child A child A child A child A child A child A child A child A child A child A child A child A child A child A child A child A child A child A child A child A child A child A child


In [134]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = translate_sentence(head_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: People on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on on


In [135]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = translate_sentence(head_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction Construction


In [137]:
src_sentence = "Guten Tag."  # German for "Good Morning."
translated_sentence = translate_sentence(head_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A <eos>


## 6. Changing number of layers to 8

In [138]:
# Define the hyperparameters of the model
src_vocab_size = len(vocab_src)  # Size of source vocabulary
tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
d_model = 512  # Embedding dimension
N = 8         # Number of encoder and decoder layers
num_heads = 8  # Number of attention heads
d_ff = 2048    # Dimension of feed forward networks
max_seq_length = 5000 # Maximum sequence length
dropout = 0.1  # Dropout rate
learning_rate = 0.0001 #Learning Rate
batch_size = 128
grad_clip = 1

# Assume pad_idx is the padding index in the target vocabulary
pad_idx = vocab_tgt['<pad>']

layer_model = initialize_transformer_model(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

optimizer = optim.Adam(layer_model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# Initialize the loss function with CrossEntropyLoss, ignoring the padding index
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [139]:
# DataLoader for the training data, using the generate_batch function as the collate_fn.
# This allows custom processing of each batch (adding BOS/EOS tokens and padding) before being fed into the model.
train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

# Similarly, DataLoader for the validation data
valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

In [141]:
n_epochs = 20
print("Training the model for 8 layers")
for epoch in range(n_epochs):
    train_loss = train(layer_model, train_iterator, optimizer, criterion, grad_clip)
    val_loss = evaluate(layer_model, valid_iterator, criterion)

    print(f'\nEpoch: {epoch + 1}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {val_loss:.3f}')

Training the model for 8 layers

Epoch: 1
	Train Loss: 5.570
	Val Loss: 5.200

Epoch: 2
	Train Loss: 5.171
	Val Loss: 5.221

Epoch: 3
	Train Loss: 5.107
	Val Loss: 5.177

Epoch: 4
	Train Loss: 5.058
	Val Loss: 5.168

Epoch: 5
	Train Loss: 4.973
	Val Loss: 5.426

Epoch: 6
	Train Loss: 4.930
	Val Loss: 5.424

Epoch: 7
	Train Loss: 4.909
	Val Loss: 5.562

Epoch: 8
	Train Loss: 4.907
	Val Loss: 6.405

Epoch: 9
	Train Loss: 4.854
	Val Loss: 5.708

Epoch: 10
	Train Loss: 4.758
	Val Loss: 5.619

Epoch: 11
	Train Loss: 4.707
	Val Loss: 5.598

Epoch: 12
	Train Loss: 4.671
	Val Loss: 5.821

Epoch: 13
	Train Loss: 4.645
	Val Loss: 5.761

Epoch: 14
	Train Loss: 4.628
	Val Loss: 5.579

Epoch: 15
	Train Loss: 4.624
	Val Loss: 5.505

Epoch: 16
	Train Loss: 4.594
	Val Loss: 5.764

Epoch: 17
	Train Loss: 4.575
	Val Loss: 5.670

Epoch: 18
	Train Loss: 4.580
	Val Loss: 6.745

Epoch: 19
	Train Loss: 4.574
	Val Loss: 5.892

Epoch: 20
	Train Loss: 4.500
	Val Loss: 5.754


In [142]:
src_sentence = "Ein kleiner Junge spielt draußen mit einem Ball."  # German for "A little boy playing outside with a ball."
translated_sentence = translate_sentence(layer_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A


In [143]:
src_sentence = "Die Sonne scheint hell am blauen Himmel."  # German for "The sun is shining brightly in the blue sky."
translated_sentence = translate_sentence(layer_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A


In [144]:
src_sentence = "Gib mir bitte das Salz und den Pfeffer."  # German for "Please pass me the salt and pepper."
translated_sentence = translate_sentence(layer_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several Several


In [145]:
src_sentence = "Guten Tag."  # German for "Good Morning."
translated_sentence = translate_sentence(layer_model, src_sentence, vocab_src, vocab_tgt)
print(f'Translated sentence: {translated_sentence}')

Translated sentence: Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two Two


##Report

**Task** : Using the transformers for German to English translation.

**Dataset**: Multi 30K

**Implementation:** Implemented the transformer architecture with encoder-decoder layers, positional encoding and attention mechanism. The code blocks were completed from the reference annotated paper - https://nlp.seas.harvard.edu/annotated-transformer/
Following steps were taken while implementing transformers:
  
  1) Data Preprocessing - Data was tokenized using the tokenizers from the spacy library.
  
  2) Padding the sentences and appending the <eos>/<bos> tokens to the sentences.
  
  3) Defining the hyperparameters for the model such as batch_size, learning_rate etc.
  
  4) Training the model for ~20 epochs and while monitoring the training and validation loss.
  
  5) Finally using the model to translate some real life german sentences.

  6) For the translation both - the greedy approach and the beam search were implemented.


  **Results**
  - So the transformer model with attention mechanism was trained for 20 epochs using the parameters:
    - src_vocab_size = len(vocab_src)  # Size of source vocabulary
    - tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
    - d_model = 512  # Embedding dimension
    - N = 8         # Number of encoder and decoder layers
    - num_heads = 8  # Number of attention heads
    - d_ff = 2048    # Dimension of feed forward networks
    - max_seq_length = 5000 # Maximum sequence length
    - dropout = 0.1  # Dropout rate
    - learning_rate = 0.0001 #Learning Rate
    - batch_size = 128
    - grad_clip = 1
  
  After training on these default parameters, the model was used to translate german sentences using greedy approach and beam search.
  Following are the results:

  **Greedy Approach**

  G: Ein kleiner Junge spielt draußen mit einem Ball
  
  E: A little person in field A little person in field A little person in field A little person in field A little person in field A little person in field A little person in field A little person in field A little person in field A little person in field.

  **Beam Search**

  G: Ein kleiner Junge spielt draußen mit einem Ball
  
  E: tag(BOS) A person is playing in the water . tag(EOS)

  From the above example, we can clearly see that the Beam search approach is better than the greedy approach.

  **After this initial test, some hyperparameters were changed and then the results were observed.**

  **1) Increasing No of Epochs** - From 20 to 30
   - Training the model for more epochs decreased the training  and validation loss further to 2.4. But this is not guaranteed always as we increase the epochs, there is always a chance that the model may get stuck on a global maxima and then not improve further.

   - There was not much improvement in the actual translated sentences though.

   **2) Increasing/Decreasing the batch size** - 64, 128 and 256.
   - Tweaking the batch size parameter, in both ways gave us good results. Reducing the  batch size especially gave us good results and even a lesser loss value.

   **3) Increasing the learning rate** - 0.0001 to 0.0005.
   - Tweaking this parameter, did not us good results at all. Increasing the learning rate by just a few points led to a very high validation loss and as a result, the translated sentences are also not so good.

   **4) Increasing the dropout rate** - 0.1 to 0.2.
   - This initially decreased the training and validation loss but then the loss values plateaued at 4.0. There was no significant change in the actual translated sentences.

   **5) Decreasing the number of attention heads** - 8 to 4
   - The transformer model performed well even after decreasing the number of attention heads. The loss values for 8 and 4 attention heads were exactly the same and even the translated sentences were similar.

   **6) Increasing the number of layers** - 6 to 8.
    - When the no of layers was increased from 6 to 8, the training loss plateaued at 4.0. The model didn't show any improvement, rather the quality of the translated sentences was not good.


**Challenges**
- Implementing the transformer architecture using the reference. One has to have a really good understanding of transformers to implement and debug the code.

- Tweaking the hyperparameters to the right value because even the slightest change can cause a big impact on the results.