## Practical 9
### Exercises

1. Initialise a small version of your transformer model (do not use more than 4 layers and 64 hidden units unless you have access to sufficient compute).
2. Initialise the dataloader using the dataset class from practical 5.
3. Initialise the loss function (cross-entropy loss) optimiser and learning rate scheduler.
4. Implement the training loop.
5. Train the model for 5 epochs and ensure that loss decreases for both the training and validation sets of the dataset. You can use a small randomly selected subset of the training data to speed up training. 

In [1]:
import torch
import json
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader, Subset
from tqdm import tqdm
# use realtive import
import sys
sys.path.append('..')

from modelling.model import Transformer
from modelling.dataloader import TranslationDataset, MyBPETokenizer
from modelling.optimizer import create_adamw_optimizer
from modelling.scheduler import TransformerScheduler

In [2]:
# load the data
print("Loading data...")
with open("../data/cleaned_wmt17_de_en_split_train.json", encoding='utf-8') as f:
    train_data = json.load(f)
    
with open("../data/cleaned_wmt17_de_en_split_validation.json", "r", encoding="utf-8") as f:
    val_data = json.load(f)

with open("../data/cleaned_wmt17_de_en_texts_for_tokenizer.json", "r", encoding="utf-8") as f:
    tokenizer_texts = json.load(f)

print(f"Train samples: {len(train_data)}")
print(f"Val samples: {len(val_data)}")

Loading data...
Train samples: 4801360
Val samples: 2782


In [3]:
# use small subset for training on CPU for 5 epochs
# increased the number of samples because of overfitting with too small data
num_train_samples = 10000
num_val_samples = 3000

train_data = train_data[:num_train_samples]
val_data = val_data[:num_val_samples]

tokenizer_texts = tokenizer_texts[:20000]

print(f"Using {len(train_data)} training samples and {len(val_data)} validation samples.")

Using 10000 training samples and 2782 validation samples.


In [4]:
# train Tokenizer
tokenizer = MyBPETokenizer(texts=tokenizer_texts, vocab_size=5000, save_dir="../data/small_bpe_tokenizer")

Loading existing tokenizer from ../data/small_bpe_tokenizer


In [5]:
# instanciate dataset and dataloader
train_dataset = TranslationDataset(train_data, tokenizer, max_src_len=32, max_tgt_len=32)
val_dataset = TranslationDataset(val_data, tokenizer, max_src_len=32, max_tgt_len=32)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

print("Datasets and Dataloaders initialized.")
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

Datasets and Dataloaders initialized.
Train dataset size: 10000
Validation dataset size: 2782


In [6]:
# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = Transformer(
    vocab_size=5000,
    d_model=64,
    n_heads=4,
    num_encoder_layers=2,
    num_decoder_layers=2,
    max_len=64,
    dim_feedforward=256,
    dropout=0.3,
).to(device)

Using device: cuda


In [7]:
# count parameters
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {total_params}")

Total trainable parameters: 876936


In [8]:
# setup training
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_id)

# added weight decay for better generalization
optimizer = create_adamw_optimizer(model, learning_rate=1e-4, weight_decay=1e-4)
scheduler = TransformerScheduler(optimizer, d_model=64, warmup_steps=100)

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, scheduler, device, epoch_num):
    model.train()
    total_loss = 0
    batch_count = 0
    
    for batch_idx, batch in enumerate(tqdm(dataloader, desc=f"Training Epoch {epoch_num}")):
        src_ids = batch['src_ids'].to(device)
        tgt_ids = batch['tgt_ids'].to(device)

        tgt_input = tgt_ids[:, :-1] # remove last token for decoder input
        tgt_output = tgt_ids[:, 1:] # remove first token for target output
        
        #if batch_idx == 0 and epoch_num % 1 == 0:
        #    print(f"Batch {batch_idx} shapes")
        #    print(f"src_ids: {src_ids.shape}")
        #    print(f"tgt_ids: {tgt_ids.shape}")
        
        #if batch_idx == 0 and epoch_num % 1 == 0:
        #    print(f"tgt_input: {tgt_input.shape}")
        #    print(f"tgt_output: {tgt_output.shape}")
        
        # create padding masks
        src_mask = (src_ids != tokenizer.pad_id).to(device)  # [batch_size, src_len]
        tgt_mask = (tgt_input != tokenizer.pad_id).to(device)  # [batch_size, tgt_len]
        memory_mask = src_mask  # [batch_size, src_len]
        
        # Forward pass
        optimizer.zero_grad()
        output = model(src_ids, tgt_input,
                       src_mask=src_mask,
                       tgt_mask=tgt_mask,
                       memory_mask=memory_mask)
        
        #if batch_idx == 0 and epoch_num % 1 == 0:
        #    print(f"Model output shape: {output.shape}")

        # Reshape for loss calculation
        # CrossEntropyLoss expects: input=[N, C], target=[N]
        # where N = batch_size * seq_len, C = vocab_size
        batch_size, seq_len, vocab_size = output.shape
        output_reshaped = output.reshape(-1, vocab_size)      # [batch_size * seq_len, vocab_size]
        tgt_output_reshaped = tgt_output.reshape(-1)          # [batch_size * seq_len]
        
        #if batch_idx == 0:
        #    print(f"\nAfter reshaping for loss:")
        #    print(f"output_reshaped shape: {output_reshaped.shape}")           # [batch*seq, vocab]
        #    print(f"tgt_output_reshaped shape: {tgt_output_reshaped.shape}")   # [batch*seq]
        #    print(f"vocab_size: {vocab_size}")
        #print(f"Output reshaped shape: {output_reshaped.shape}, Target reshaped shape: {tgt_output_reshaped.shape}, Vocab size: {vocab_size}")
        loss = criterion(output_reshaped, tgt_output_reshaped)
        
        # backward pass and optimization
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        batch_count += 1
    
    avg_loss = total_loss / batch_count
    return avg_loss

In [10]:
def validate(model, dataloader, criterion, device, epoch_num):
    model.eval()
    total_loss = 0
    batch_count = 0
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(dataloader, desc=f"Validation Epoch {epoch_num}")):
            src_ids = batch["src_ids"].to(device)
            tgt_ids = batch["tgt_ids"].to(device)
            
            # Same teacher forcing setup as training
            tgt_input = tgt_ids[:, :-1]
            tgt_output = tgt_ids[:, 1:]
            
            # create padding masks same as training
            src_mask = (src_ids != tokenizer.pad_id).to(device)
            tgt_mask = (tgt_input != tokenizer.pad_id).to(device)
            memory_mask = src_mask
            
            # Forward pass
            output = model(src_ids, tgt_input, 
                           src_mask=src_mask, 
                           tgt_mask=tgt_mask, 
                           memory_mask=memory_mask)
            
            # Reshape for loss
            vocab_size = output.shape[-1]
            output_reshaped = output.reshape(-1, vocab_size)
            tgt_output_reshaped = tgt_output.reshape(-1)
            
            # Calculate loss
            loss = criterion(output_reshaped, tgt_output_reshaped)
            total_loss += loss.item()
            batch_count += 1
    
    avg_loss = total_loss / batch_count
    return avg_loss

In [26]:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device, num_epochs=5):
    train_losses = []
    val_losses = []
    
    print(f"Starting training for {num_epochs} epochs")
    print(f"Device: {device}")
    
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        
        # Train
        train_loss = train_epoch(model, train_loader, criterion, optimizer, scheduler, device, epoch+1)
        
        # Validate
        val_loss = validate(model, val_loader, criterion, device, epoch+1)
        
        # Store losses
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        
        # Print epoch summary
        print(f"Epoch {epoch+1} Summary:")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss: {val_loss:.4f}")
    
    return train_losses, val_losses

In [12]:
def plot_losses(train_losses, val_losses, num_epochs):
    plt.figure(figsize=(10, 6))
    epochs = range(1, num_epochs + 1)
    
    plt.plot(epochs, train_losses, 'b-o', label='Training Loss', linewidth=2, markersize=8)
    plt.plot(epochs, val_losses, 'r-s', label='Validation Loss', linewidth=2, markersize=8)
    
    plt.xlabel('Epoch', fontsize=12)
    plt.ylabel('Loss', fontsize=12)
    plt.title('Training and Validation Loss Over Epochs', fontsize=14, fontweight='bold')
    plt.legend(fontsize=11)
    plt.grid(True, alpha=0.3)
    plt.xticks(epochs)
    
    # Add value labels on points
    for i, (tl, vl) in enumerate(zip(train_losses, val_losses)):
        plt.text(i+1, tl, f'{tl:.3f}', ha='center', va='bottom', fontsize=9)
        plt.text(i+1, vl, f'{vl:.3f}', ha='center', va='top', fontsize=9)
    
    plt.tight_layout()
    plt.show()
    
    # Print final statistics
    print("Training Complete!")
    print(f"Final Training Loss:   {train_losses[-1]:.4f}")
    print(f"Final Validation Loss: {val_losses[-1]:.4f}")

In [13]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

In [22]:
# run the training
num_epochs = 5
train_losses, val_losses = train_model(
    model,
    train_dataloader,
    val_dataloader,
    criterion,
    optimizer,
    scheduler,
    device,
    num_epochs=num_epochs
)
plot_losses(train_losses, val_losses, num_epochs)

Starting training for 5 epochs
Device: cuda


Epoch 1/5


Training Epoch 1: 100%|██████████| 1250/1250 [04:16<00:00,  4.87it/s] 
Validation Epoch 1: 100%|██████████| 348/348 [00:00<00:00, 444.68it/s]


Epoch 1 Summary:
Train Loss: 3.2506
Val Loss: 4.6668
Epoch 2/5


Training Epoch 2: 100%|██████████| 1250/1250 [00:09<00:00, 132.77it/s]
Validation Epoch 2: 100%|██████████| 348/348 [00:00<00:00, 446.26it/s]


Epoch 2 Summary:
Train Loss: 3.2220
Val Loss: 4.6532
Epoch 3/5


Training Epoch 3: 100%|██████████| 1250/1250 [00:09<00:00, 131.54it/s]
Validation Epoch 3: 100%|██████████| 348/348 [00:00<00:00, 446.89it/s]


Epoch 3 Summary:
Train Loss: 3.2013
Val Loss: 4.6510
Epoch 4/5


Training Epoch 4: 100%|██████████| 1250/1250 [00:09<00:00, 135.13it/s]
Validation Epoch 4: 100%|██████████| 348/348 [00:00<00:00, 443.05it/s]


Epoch 4 Summary:
Train Loss: 3.1790
Val Loss: 4.6607
Epoch 5/5


Training Epoch 5:  69%|██████▊   | 858/1250 [00:06<00:02, 134.86it/s]


KeyboardInterrupt: 

In [15]:
actual_vocab_size = len(tokenizer.tokenizer.get_vocab())
print(f"Actual tokenizer vocab size: {actual_vocab_size}")

Actual tokenizer vocab size: 5000


### Practical 10
1. Implement the autoregressive generation procedure described above using your transformer mode(Using greedy decoding, remember to add a maximum length to the generation procedure to prevent infinite generation.)
2. Generate translations for the test set (or a subset of the test set) of WMT17 German-English.
3. Evaluate the BLEU score of your model on the test set (or a subset of the test set) of WMT17 German-English.
4. Evaluate some of the translations generated by your model. Do they make sense? What are some of the errors made by your model?

In [16]:
from modelling.generation import evaluate_bleu, generate_translation

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
with open("../data/cleaned_wmt17_de_en_split_test.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

In [18]:
num_test_samples = 200
test_data_samples = test_data[:num_test_samples]

print(f"Total test samples available: {len(test_data)}")
print(f"Using {len(test_data_samples)} samples for evaluation")

Total test samples available: 2761
Using 200 samples for evaluation


In [19]:
src_sentence = test_data_samples[0]['src']
    
generated_translation = generate_translation(
    model,
    src_sentence,
    tokenizer,
    max_len=32,
    device=device
)

print(f"Source Sentence: {src_sentence}")
print(f"Generated Translation: {generated_translation}")

Source Sentence: 28-jähriger Koch in San Francisco Mall tot aufgefunden
Generated Translation: ('i wo ul d li ke to sa y t hat the com mission s ho ul d li ke to be t hat the com mission s ho ul d', [1, 37, 264, 822, 32, 146, 445, 717, 144, 53, 48, 221, 613, 4136, 4251, 47, 396, 822, 32, 146, 445, 717, 86, 48, 221, 613, 4136, 4251, 47, 396, 822, 32])


In [20]:
result = evaluate_bleu(
    model,
    test_data_samples,
    tokenizer,
    max_len=32,
    device=device
)

Generating translations for 200 samples...
Progress: 10/200
Progress: 20/200
Progress: 30/200
Progress: 40/200
Progress: 50/200
Progress: 60/200
Progress: 70/200
Progress: 80/200
Progress: 90/200
Progress: 100/200
Progress: 110/200
Progress: 120/200
Progress: 130/200
Progress: 140/200
Progress: 150/200
Progress: 160/200
Progress: 170/200
Progress: 180/200
Progress: 190/200
Progress: 200/200


In [21]:
# print BLEU score and show 5 example translations
print(f"BLEU Score on Test Set: {result['bleu_score']:.2f}")

print("Example Translations:")
for i in range(5):
    print(f"Source: {result['source_texts'][i]}")
    print(f"Target: {result['references'][i]}")
    print(f"Predicted: {result['predictions'][i]}")

BLEU Score on Test Set: 0.00
Example Translations:
Source: 28-jähriger Koch in San Francisco Mall tot aufgefunden
Target: 28-Year-Old Chef Found Dead at San Francisco Mall
Predicted: i wo ul d li ke to sa y t hat the com mission s ho ul d li ke to be t hat the com mission s ho ul d
Source: Ein 28-jähriger Koch, der vor kurzem nach San Francisco gezogen ist, wurde im Treppenhaus eines örtlichen Einkaufzentrums tot aufgefunden.
Target: A 28-year-old chef who had recently moved to San Francisco was found dead in the stairwell of a local mall this week.
Predicted: i wo ul d li ke to be t hat the com mission s ho ul d li ke to be t hat the com mission s ho ul d li
Source: Der Bruder des Opfers sagte aus, dass er sich niemanden vorstellen kann, der ihm schaden wollen würde,  Endlich ging es bei ihm wieder bergauf. 
Target: But the victim s brother says he can t think of anyone who would want to hurt him, saying,  Things were finally going well for him. 
Predicted: i wo ul d li ke to sa y t h

### Evaluation
- The model is extremly small and trained for very few epochs, so the BLEU score is expected to be low.
- The only produces [UNK] tokens as output, indicating that it has not learned to generate meaningful translations.