# Week 3 Day 14: Regularization, Monitoring, and Early Stopping - Part 2

## Overview
In this notebook, we'll build on Part 1 to implement monitoring and early stopping, two crucial techniques for robust model training. We will focus on:
- Setting up a comprehensive logging system for training metrics.
- Implementing an early stopping mechanism to prevent overfitting.
- Combining regularization, monitoring, and early stopping into a final, robust training loop.

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import time
import os
from torch.utils.data import DataLoader, Dataset, random_split
from torch.utils.tensorboard import SummaryWriter

# Seed both random number generators used in this notebook so that
# repeated runs draw the same random values.
np.random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## 1. Re-defining Model and Data (for standalone execution)

Let's quickly redefine the model and dataset from Part 1 so this notebook can be run independently.

In [None]:
# --- Model Definition ---
class SimpleLanguageModel(nn.Module):
    """Small causal Transformer language model.

    Token ids are embedded, passed through a stack of Transformer encoder
    layers under a causal attention mask, and projected to vocabulary logits.
    No positional encoding is added to the embeddings.

    Args:
        vocab_size: Number of distinct token ids; sizes the embedding table
            and the output projection.
        d_model: Width of the embeddings and of every encoder layer.
        nhead: Number of attention heads in each encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used inside each encoder layer.
    """

    def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=4, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, batch_first=True, dropout=dropout)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Compute per-position vocabulary logits for a batch of token ids.

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len).

        Returns:
            Tensor of logits with shape (batch, seq_len, vocab_size).
        """
        seq_len = x.size(1)
        # Additive causal mask: -inf strictly above the diagonal and 0
        # elsewhere, so each position attends only to itself and earlier
        # positions. It is created directly on the input's device.
        mask = torch.triu(
            torch.full((seq_len, seq_len), float('-inf'), device=x.device),
            diagonal=1,
        )
        hidden = self.transformer(self.embedding(x), mask=mask)
        # Apply the projection layer; returning the layer object itself would
        # hand callers an nn.Linear instead of a tensor of logits.
        return self.output(hidden)

# --- Dataset Definition ---
class OverfitDataset(Dataset):
    """Synthetic next-token dataset that cycles through a few random sequences.

    A small pool of random token sequences is generated once; every index maps
    onto one of them (``idx % num_patterns``), so the same sequences repeat
    many times across the dataset.

    Args:
        vocab_size: Exclusive upper bound for the sampled token ids.
        seq_len: Length of each returned input/target sequence. Every stored
            pattern holds ``seq_len + 1`` tokens so the target can be the
            input shifted by one position.
        size: Number of items reported by ``len()``.
        num_patterns: Number of distinct random sequences in the pool.

    Raises:
        ValueError: If ``num_patterns`` is smaller than 1.
    """

    def __init__(self, vocab_size=100, seq_len=32, size=200, num_patterns=10):
        if num_patterns < 1:
            raise ValueError('num_patterns must be at least 1')
        self.size = size
        self.patterns = [
            torch.randint(0, vocab_size, (seq_len + 1,))
            for _ in range(num_patterns)
        ]

    def __len__(self):
        """Return the configured dataset size."""
        return self.size

    def __getitem__(self, idx):
        """Return ``(input, target)`` where target is input shifted by one token."""
        seq = self.patterns[idx % len(self.patterns)]
        return seq[:-1], seq[1:]

# --- Dataloaders ---
# Vocabulary size and sequence length shared by the dataset and the model.
vocab_size = 100
seq_len = 32

# Build the synthetic dataset and split it into 150 training items and
# 50 validation items.
dataset = OverfitDataset(vocab_size=vocab_size, seq_len=seq_len)
train_dataset, val_dataset = random_split(dataset, lengths=[150, 50])

# Only the training batches are shuffled; validation order stays fixed.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=16)

## 2. Comprehensive Logging with TensorBoard

Let's create a logger to track various metrics during training and visualize them with TensorBoard.

In [None]:
class TrainingLogger:
    """Thin wrapper around a TensorBoard ``SummaryWriter``.

    Each instance writes into its own sub-directory of ``log_dir``, named
    after the current Unix time in whole seconds, and prints that directory
    to the console on creation.
    """

    def __init__(self, log_dir='runs'):
        """Open a writer under ``log_dir/<timestamp>`` and announce the path."""
        run_stamp = int(time.time())
        self.log_dir = f'{log_dir}/{run_stamp}'
        self.writer = SummaryWriter(self.log_dir)
        print(f'TensorBoard log directory: {self.log_dir}')

    def log_scalar(self, tag, value, step):
        """Record a single scalar ``value`` under ``tag`` at ``step``."""
        self.writer.add_scalar(tag, value, step)

    def log_histogram(self, tag, values, step):
        """Record the distribution of ``values`` under ``tag`` at ``step``."""
        self.writer.add_histogram(tag, values, step)

    def log_hyperparameters(self, hparams):
        """Record the hyperparameter dict, with no associated metrics."""
        no_metrics = {}
        self.writer.add_hparams(hparams, no_metrics)

    def close(self):
        """Close the underlying writer."""
        self.writer.close()

## 3. Implementing Early Stopping

Now, we'll create a class to handle early stopping: it checkpoints the model each time the validation loss improves and signals that training should stop once the loss has failed to improve for `patience` consecutive epochs.

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    Call the instance once per epoch with the current validation loss and the
    model. Whenever the loss improves, the model's ``state_dict`` is written to
    ``path``; after ``patience`` consecutive non-improving calls the
    ``early_stop`` attribute is set to True. The caller is responsible for
    checking that flag and leaving its training loop.

    Args:
        patience: Number of consecutive non-improving calls tolerated before
            ``early_stop`` is set.
        verbose: If True, report counter updates and checkpoint saves through
            ``trace_func``.
        delta: Margin added to the best score so far; a call counts as an
            improvement only if the negated loss reaches ``best + delta``.
        path: File the best ``state_dict`` is saved to.
        trace_func: Callable used to emit messages.
    """

    def __init__(self, patience=5, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one validation result, checkpointing on improvement.

        Args:
            val_loss: Validation loss for the epoch that just finished.
            model: Model whose ``state_dict`` is saved when the loss improves.
        """
        # Scores are negated losses so that "higher is better".
        score = -val_loss

        if np.isnan(float(val_loss)):
            # Every comparison with NaN is False, so without this guard a NaN
            # loss would be treated as an improvement, overwrite the best
            # checkpoint, and keep the patience counter from ever advancing.
            improved = False
        elif self.best_score is None:
            improved = True
        else:
            improved = score >= self.best_score + self.delta

        if improved:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        """Save the model's ``state_dict`` to ``path`` and record ``val_loss`` as the new minimum."""
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

## 4. The Complete Training Loop with Monitoring and Early Stopping

Let's combine everything into a final, robust training loop.

In [None]:
def robust_train(model, train_loader, val_loader, epochs, lr, weight_decay, patience,
                 max_grad_norm=0.5):
    """Train ``model`` with AdamW, TensorBoard logging, and early stopping.

    Each epoch runs one pass over ``train_loader`` with gradient clipping,
    evaluates on ``val_loader``, logs losses, learning rate, and
    parameter/gradient histograms, and feeds the validation loss to an
    ``EarlyStopping`` instance. After training, the best checkpoint written by
    the early stopper is loaded back into the model.

    Args:
        model: Model to train; moved to the module-level ``device``. Its
            dropout rate is read from ``model.transformer.layers[0].dropout``
            for hyperparameter logging, so the model must expose that layout.
        train_loader: Iterable of ``(input, target)`` training batches that
            supports ``len()``.
        val_loader: Iterable of ``(input, target)`` validation batches.
        epochs: Maximum number of epochs to run.
        lr: Learning rate for AdamW.
        weight_decay: Weight decay for AdamW.
        patience: Non-improving epochs tolerated before stopping early.
        max_grad_norm: Maximum gradient norm used for clipping.

    Returns:
        The same ``model`` instance, holding the best checkpointed weights if
        at least one checkpoint was saved during this call.
    """
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    # Initialize logger and early stopping
    logger = TrainingLogger()
    early_stopper = EarlyStopping(patience=patience, verbose=True)

    # try/finally so the TensorBoard writer is closed even if training raises.
    try:
        hparams = {
            'lr': lr,
            'weight_decay': weight_decay,
            'patience': patience,
            'dropout': model.transformer.layers[0].dropout.p,
        }
        logger.log_hyperparameters(hparams)

        for epoch in range(epochs):
            model.train()
            epoch_loss = 0
            start_time = time.time()

            for x, y in train_loader:
                x, y = x.to(device), y.to(device)
                optimizer.zero_grad()
                output = model(x)
                # Flatten to (tokens, classes) using the model's own output
                # width rather than a module-level vocabulary size.
                loss = criterion(output.reshape(-1, output.size(-1)), y.reshape(-1))
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                epoch_loss += loss.item()

            avg_train_loss = epoch_loss / len(train_loader)
            val_loss = evaluate(model, val_loader, criterion)

            # Log metrics
            logger.log_scalar('Loss/train', avg_train_loss, epoch)
            logger.log_scalar('Loss/validation', val_loss, epoch)
            logger.log_scalar('Learning Rate', optimizer.param_groups[0]['lr'], epoch)
            for name, param in model.named_parameters():
                if param.requires_grad:
                    logger.log_histogram(f'Parameters/{name}', param.data, epoch)
                    # Gradients here are those of the last batch of the epoch.
                    if param.grad is not None:
                        logger.log_histogram(f'Gradients/{name}', param.grad.data, epoch)

            elapsed = time.time() - start_time
            print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Time: {elapsed:.2f}s')

            # Early stopping
            early_stopper(val_loss, model)
            if early_stopper.early_stop:
                print('Early stopping')
                break
    finally:
        logger.close()

    # Load best model. Skip this when no checkpoint was written in this call
    # (e.g. epochs == 0), so a leftover file from an earlier run is never
    # loaded, and read from the path the early stopper actually saved to.
    if early_stopper.best_score is not None:
        model.load_state_dict(torch.load(early_stopper.path, map_location=device))
    return model

def evaluate(model, dataloader, criterion):
    """Return the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode (and left in it) and gradients are
    disabled while the batches are processed.

    Args:
        model: Model to evaluate. Batches are moved to the device of its first
            parameter; a model without parameters receives them unchanged.
        dataloader: Iterable of ``(input, target)`` batches.
        criterion: Loss callable applied to logits flattened to
            ``(N, num_classes)`` and targets flattened to ``(N,)``.

    Returns:
        The average of the per-batch loss values as a float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    # Follow the model's own placement instead of relying on a module-level
    # device variable.
    first_param = next(model.parameters(), None)
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for x, y in dataloader:
            if first_param is not None:
                x, y = x.to(first_param.device), y.to(first_param.device)
            output = model(x)
            # Class count comes from the logits themselves, not from a global.
            loss = criterion(output.reshape(-1, output.size(-1)), y.reshape(-1))
            total_loss += loss.item()
            # Counting here also supports iterables that do not define len().
            num_batches += 1
    if num_batches == 0:
        raise ValueError('evaluate() received a dataloader with no batches')
    return total_loss / num_batches

## 5. Running the Robust Training Loop

Let's run our final training loop and see early stopping in action.

In [None]:
# Build the regularized model (dropout is applied inside each encoder layer).
final_model = SimpleLanguageModel(vocab_size, dropout=0.1)

print('Training with monitoring and early stopping...')
# The epoch budget is deliberately generous; early stopping is expected to end
# training once the validation loss has not improved for `patience` epochs.
best_model = robust_train(
    final_model,
    train_dataloader,
    val_dataloader,
    epochs=100,
    lr=0.001,
    weight_decay=0.01,
    patience=7,
)

print('\nTo view TensorBoard logs, run the following command in your terminal:')
print('tensorboard --logdir=runs')

## Summary and Key Insights

In this notebook, we have implemented a robust training pipeline by incorporating:

1.  **Regularization**: Using dropout and weight decay to prevent overfitting and improve model generalization.
2.  **Monitoring**: Setting up a comprehensive logger with TensorBoard to track key metrics like loss, learning rate, and parameter distributions. This provides crucial insights into the training dynamics.
3.  **Early Stopping**: Implementing a mechanism to automatically stop training when the model's performance on a validation set has not improved for a set number of epochs (the patience), which saves computational resources and restores the best-performing model state.

By combining these techniques, we can train models more effectively, diagnose issues like overfitting, and ensure that we are selecting the best possible model based on its ability to generalize to unseen data. This forms the foundation of a production-ready training script for any deep learning model.