# Week 3 Day 14: Regularization, Monitoring, and Early Stopping - Part 1

## Overview
In this notebook, we'll implement and explore key techniques for preventing overfitting and ensuring optimal model performance. We will focus on:
- Implementing regularization (Dropout and Weight Decay)
- Creating a model with configurable regularization
- Setting up a synthetic dataset to demonstrate overfitting

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import time
import math
from torch.utils.data import DataLoader, Dataset, random_split
from typing import List, Dict, Tuple, Optional

# Seed NumPy and PyTorch so every run of the notebook draws the same random numbers
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 1. Simple Language Model with Regularization

Let's define a simple language model with configurable dropout for regularization.

In [None]:
class SimpleLanguageModel(nn.Module):
    """A small causal transformer language model with configurable dropout.

    Token ids are embedded, scaled by ``sqrt(d_model)``, combined with
    positional encodings, passed through a stack of
    ``nn.TransformerDecoderLayer`` blocks and projected to vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids (embedding rows / output logits).
        d_model: Width of the embeddings and of every transformer layer.
        nhead: Number of attention heads per layer.
        num_layers: Number of stacked decoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward sublayer.
        dropout: Dropout probability used by the positional encoding and by
            every decoder layer; ``0.0`` disables dropout.
    """

    def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=4, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.output = nn.Linear(d_model, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Initialise embedding and output weights uniformly in [-0.1, 0.1] and zero the output bias."""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.output.bias.data.zero_()
        self.output.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask=None):
        """Compute next-token logits for a batch of token sequences.

        Args:
            src: Integer token ids of shape ``(batch, seq_len)``.
            src_mask: Optional ``(seq_len, seq_len)`` additive attention mask.
                When omitted, a causal mask is built so that each position
                only sees itself and earlier positions.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)``.
        """
        if src_mask is None:
            src_mask = generate_square_subsequent_mask(src.size(1)).to(src.device)

        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        # Decoder layers also cross-attend to `memory`. The sequence itself is
        # used as memory (decoder-only style), so the mask has to be applied to
        # cross-attention as well; otherwise position t can read the embedding
        # of token t+1, which is exactly the label it is asked to predict.
        output = self.transformer_decoder(
            src,
            memory=src,
            tgt_mask=src_mask,
            memory_mask=src_mask,
        )
        return self.output(output)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to embeddings, then apply dropout.

    Even feature columns hold sines and odd feature columns hold cosines of
    the position scaled by geometrically spaced frequencies. The table is
    stored as a non-trainable buffer named ``pe`` of shape
    ``(1, max_len, d_model)``.

    Args:
        d_model: Feature width of the embeddings (even or odd).
        dropout: Dropout probability applied after the encodings are added.
        max_len: Longest sequence length the precomputed table supports.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so drop the surplus frequency before assigning.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its positions, with dropout applied.

        Args:
            x: Embeddings of shape ``(batch, seq_len, d_model)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` the table was built for.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the positional encoding max_len {max_len}"
            )
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)

def generate_square_subsequent_mask(sz):
    """Build an additive causal attention mask of shape ``(sz, sz)``.

    Entries on and below the main diagonal are ``0.0`` (attention allowed);
    entries above it are ``-inf`` (attention to later positions blocked).
    """
    blocked = torch.full((sz, sz), float('-inf'))
    # Keep -inf strictly above the diagonal; triu zeroes everything else.
    return torch.triu(blocked, diagonal=1)

## 2. Synthetic Dataset for Overfitting Demonstration

Let's create a small, repetitive dataset to easily demonstrate overfitting.

In [None]:
class OverfitDataset(Dataset):
    """A small, repetitive dataset designed to cause overfitting.

    Ten fixed token patterns of length ``seq_len + 1`` are generated once,
    cycling through three kinds: a run of consecutive ids, two alternating
    ids, and a repeated block of four ids. Item ``i`` is always pattern
    ``i % 10``, split into an input and a target shifted by one token.

    Args:
        vocab_size: Token ids are drawn from ``[0, vocab_size)``.
        seq_len: Length of each returned input / target sequence.
        size: Number of items the dataset reports via ``len()``.

    Raises:
        ValueError: If ``vocab_size`` is not larger than ``seq_len``; the
            consecutive-id pattern needs ``seq_len + 1`` distinct ids.
    """

    def __init__(self, vocab_size=100, seq_len=32, size=200):
        if vocab_size <= seq_len:
            raise ValueError(
                f"vocab_size ({vocab_size}) must be greater than seq_len ({seq_len})"
            )
        self.vocab_size = vocab_size
        self.seq_len = seq_len
        self.size = size

        # Create a few fixed, repetitive patterns
        self.patterns = []
        for i in range(10):
            pattern_type = i % 3
            if pattern_type == 0:  # Sequential
                start = torch.randint(0, vocab_size - seq_len, (1,)).item()
                self.patterns.append(torch.arange(start, start + seq_len + 1))
            elif pattern_type == 1:  # Alternating
                tok1, tok2 = torch.randint(0, vocab_size, (2,)).tolist()
                pat = torch.tensor([tok1 if j % 2 == 0 else tok2 for j in range(seq_len + 1)])
                self.patterns.append(pat)
            else:  # Repeating blocks
                block = torch.randint(0, vocab_size, (4,))
                pat = block.repeat((seq_len // 4) + 2)[:seq_len + 1]
                self.patterns.append(pat)

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair for item ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        # Without this check plain iteration (`for item in ds`, `list(ds)`)
        # never terminates: Python's sequence protocol keeps calling
        # __getitem__ with 0, 1, 2, ... until it sees an IndexError.
        if not -self.size <= idx < self.size:
            raise IndexError(f"index {idx} is out of range for dataset of size {self.size}")
        if idx < 0:
            idx += self.size

        # Return one of the fixed patterns
        seq = self.patterns[idx % len(self.patterns)]
        return seq[:-1], seq[1:]

# Build the toy corpus and carve it into training and validation subsets.
# NOTE(review): OverfitDataset maps every index onto one of the same 10 fixed
# patterns, so both subsets are drawn from identical sequences.
vocab_size, seq_len = 100, 32
dataset = OverfitDataset(vocab_size=vocab_size, seq_len=seq_len, size=200)
train_size, val_size = 150, 50
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Only the training loader reshuffles; validation batches keep a fixed order.
batch_size = 16
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

## 3. Basic Training and Evaluation Loop

Let's define a training loop. We will add more features to it in Part 2.

In [None]:
def evaluate(model, dataloader, criterion):
    """Return the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled. Logits are
    flattened to ``(N, num_classes)`` using the size of the model's last
    output dimension, and targets to ``(N,)``, before ``criterion`` is applied.

    Args:
        model: Module mapping an input batch to logits.
        dataloader: Sized iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable taking ``(logits, targets)``.

    Returns:
        The average of the per-batch loss values as a float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("evaluate() needs at least one batch; the dataloader is empty")

    # Run on whatever device the model already lives on rather than depending
    # on a module-level `device` global. A parameter-free model leaves the
    # batches where they are.
    first_param = next(model.parameters(), None)

    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for x, y in dataloader:
            if first_param is not None:
                x, y = x.to(first_param.device), y.to(first_param.device)
            output = model(x)
            # Take the class count from the logits themselves, not from a global.
            loss = criterion(output.reshape(-1, output.size(-1)), y.reshape(-1))
            total_loss += loss.item()
    return total_loss / num_batches

def train(model, train_loader, val_loader, epochs, lr, weight_decay=0.0, max_grad_norm=0.5):
    """Train ``model`` with AdamW and cross-entropy, validating after every epoch.

    The model is moved to the module-level ``device``. Each epoch runs one
    pass over ``train_loader`` with gradient-norm clipping, then calls
    ``evaluate`` on ``val_loader`` and prints a one-line progress report.

    Args:
        model: Module mapping a batch of token ids to logits.
        train_loader: Sized iterable of ``(inputs, targets)`` training batches.
        val_loader: Sized iterable of ``(inputs, targets)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.
        weight_decay: AdamW weight-decay coefficient; ``0.0`` disables it.
        max_grad_norm: Gradients are clipped to this total norm before each
            optimizer step.

    Returns:
        Dict with keys ``'train_loss'`` and ``'val_loss'``, each a list holding
        one mean per-batch loss per epoch.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    num_train_batches = len(train_loader)
    if num_train_batches == 0:
        raise ValueError("train() needs at least one training batch; train_loader is empty")

    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    history = {'train_loss': [], 'val_loss': []}

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        start_time = time.time()

        for x, y in train_loader:
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            output = model(x)
            # Take the class count from the logits themselves so the loop does
            # not depend on a module-level `vocab_size`.
            loss = criterion(output.reshape(-1, output.size(-1)), y.reshape(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            epoch_loss += loss.item()

        avg_epoch_loss = epoch_loss / num_train_batches
        val_loss = evaluate(model, val_loader, criterion)

        history['train_loss'].append(avg_epoch_loss)
        history['val_loss'].append(val_loss)

        # The reported time covers both the training pass and the validation pass.
        elapsed = time.time() - start_time
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_epoch_loss:.4f}, Val Loss: {val_loss:.4f}, Time: {elapsed:.2f}s')

    return history

## 4. Demonstrating Overfitting

Let's train a model with no regularization to observe overfitting.

In [None]:
# Baseline: dropout is switched off in the model and weight decay in the optimizer
overfit_model = SimpleLanguageModel(vocab_size, dropout=0.0)

print("Training a model with no regularization to induce overfitting...")
# Fifty passes over the small training split, with nothing holding the weights back
overfit_history = train(
    overfit_model,
    train_dataloader,
    val_dataloader,
    epochs=50,
    lr=1e-3,
    weight_decay=0.0,
)

def plot_loss_curves(history, title):
    """Plot training and validation loss per epoch on one figure and show it.

    Args:
        history: Mapping with ``'train_loss'`` and ``'val_loss'`` sequences,
            one value per epoch (the structure returned by ``train``).
        title: Title displayed above the plot.
    """
    curves = (('train_loss', 'Train Loss'), ('val_loss', 'Validation Loss'))

    plt.figure(figsize=(10, 6))
    for key, label in curves:
        plt.plot(history[key], label=label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

# Visualise the train / validation curves of the unregularized baseline
plot_loss_curves(history=overfit_history, title='Overfitting: No Regularization')

## 5. Applying Regularization

Now, let's train a model with dropout and weight decay to mitigate overfitting.

In [None]:
# Same architecture as the baseline, now with 20% dropout inside the model
regularized_model = SimpleLanguageModel(vocab_size, dropout=0.2)

print("\nTraining a model with Dropout and Weight Decay...")
# Identical schedule to the baseline run; the only optimizer change is weight decay
regularized_history = train(
    regularized_model,
    train_dataloader,
    val_dataloader,
    epochs=50,
    lr=1e-3,
    weight_decay=0.01,
)

plot_loss_curves(history=regularized_history, title='With Regularization (Dropout + Weight Decay)')

## Part 1 Summary

In this part, we have:
1. Created a simple language model with configurable dropout.
2. Designed a synthetic dataset to easily demonstrate overfitting.
3. Trained a model without regularization and observed the validation loss increasing while the training loss kept decreasing, a clear sign of overfitting.
4. Trained another model with dropout and weight decay, observing that the gap between training and validation loss is much smaller, indicating better generalization.

In Part 2, we will explore monitoring with a logging system and implement early stopping to automatically find the best model checkpoint.