# Day 9: Transformer Components - Part 1

This notebook explores residual connections and layer normalization - two critical components that make transformer training stable and effective.

## Setup and Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Reproducibility: seed PyTorch and NumPy with the same value so that
# re-running the notebook from the top yields the same random draws.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Plot appearance: seaborn-flavoured matplotlib style plus the "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Residual Connections Implementation

Let's implement residual connections and a small deep network that can switch them on or off; the next section visualizes their effect on gradients:

In [None]:
class SimpleLayer(nn.Module):
    """Square linear projection followed by a tanh non-linearity.

    Args:
        d_model: Size of both the input and the output feature dimension.
    """

    def __init__(self, d_model):
        super().__init__()
        # Same width in and out so that layers can be stacked freely.
        self.linear = nn.Linear(in_features=d_model, out_features=d_model)

    def forward(self, x):
        """Return ``tanh(linear(x))``; the output has the same shape as ``x``."""
        pre_activation = self.linear(x)
        return pre_activation.tanh()

class ResidualLayer(nn.Module):
    """A ``SimpleLayer`` wrapped with an identity skip path: ``y = x + f(x)``.

    Args:
        d_model: Feature width handed to the wrapped ``SimpleLayer``.
    """

    def __init__(self, d_model):
        super().__init__()
        # The wrapped transform is stored under the attribute name `layer`.
        self.layer = SimpleLayer(d_model)

    def forward(self, x):
        """Add the wrapped layer's output back onto its input."""
        transformed = self.layer(x)
        return x + transformed

class DeepNetwork(nn.Module):
    """Stack of identical layers, with or without skip connections.

    Args:
        d_model: Feature width of every layer.
        num_layers: How many layers to stack.
        use_residual: When true the stack is built from ``ResidualLayer``,
            otherwise from plain ``SimpleLayer``.
    """

    def __init__(self, d_model, num_layers, use_residual=True):
        super().__init__()
        self.use_residual = use_residual

        # Choose the building block once instead of duplicating the list code.
        block_cls = ResidualLayer if use_residual else SimpleLayer
        self.layers = nn.ModuleList(
            [block_cls(d_model) for _ in range(num_layers)]
        )

    def forward(self, x):
        """Run ``x`` through every layer in order.

        Returns:
            A pair ``(output, activations)`` where ``activations`` is a list
            holding the input followed by the output of each layer, so it has
            ``len(self.layers) + 1`` entries and its last entry is ``output``.
        """
        trace = [x]
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
            trace.append(hidden)
        return hidden, trace

## 2. Gradient Flow Analysis

Let's analyze how residual connections affect gradient flow:

In [None]:
def analyze_gradient_flow():
    """Plot per-layer weight-gradient norms for a residual and a plain stack.

    Builds two 10-layer ``DeepNetwork`` instances (with and without skip
    connections), runs one forward/backward pass of an MSE loss on the same
    random batch through each, then draws the per-layer gradient norms on a
    log axis next to a bar chart of the two loss values and prints a summary.
    """
    width, depth, n_samples = 64, 10, 32

    # Construction order fixes the order in which random numbers are drawn:
    # residual network first, plain network second, then the data.
    residual_net = DeepNetwork(width, depth, use_residual=True)
    plain_net = DeepNetwork(width, depth, use_residual=False)

    inputs = torch.randn(n_samples, width)
    goal = torch.randn(n_samples, width)

    def weight_grad_norms(net, batch, wanted):
        """Return (per-layer weight-gradient norms, loss value) for one pass."""
        net.zero_grad()
        prediction, _ = net(batch)
        loss = F.mse_loss(prediction, wanted)
        loss.backward()

        # A residual block keeps its transform under `.layer`; a plain block
        # is the transform itself.
        norms = [
            getattr(block, 'layer', block).linear.weight.grad.norm().item()
            for block in net.layers
        ]
        return norms, loss.item()

    residual_norms, residual_loss = weight_grad_norms(residual_net, inputs, goal)
    plain_norms, plain_loss = weight_grad_norms(plain_net, inputs, goal)

    # Left panel: gradient norm per layer. Right panel: the two loss values.
    _, (grad_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

    layer_ids = list(range(1, depth + 1))
    grad_ax.plot(layer_ids, residual_norms, 'o-', label='With Residual', linewidth=2)
    grad_ax.plot(layer_ids, plain_norms, 's-', label='Without Residual', linewidth=2)
    grad_ax.set_xlabel('Layer Number')
    grad_ax.set_ylabel('Gradient Norm')
    grad_ax.set_title('Gradient Flow Comparison')
    grad_ax.legend()
    grad_ax.set_yscale('log')
    grad_ax.grid(True, alpha=0.3)

    loss_ax.bar(['With Residual', 'Without Residual'], [residual_loss, plain_loss])
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training Loss')
    loss_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"Average gradient norm with residual: {np.mean(residual_norms):.6f}")
    print(f"Average gradient norm without residual: {np.mean(plain_norms):.6f}")
    print(f"Loss with residual: {residual_loss:.6f}")
    print(f"Loss without residual: {plain_loss:.6f}")

# Run the comparison: shows the two-panel figure and prints the summary numbers.
analyze_gradient_flow()

## 3. Layer Normalization Implementation

Now let's implement and compare different normalization techniques:

In [None]:
class LayerNorm(nn.Module):
    """Layer normalization over the last (feature) dimension.

    Computes ``gamma * (x - mean) / sqrt(var + eps) + beta`` where ``mean``
    and ``var`` are taken per position across the last dimension. ``var`` is
    the biased (population) variance and ``eps`` sits inside the square root,
    which is the same formula ``torch.nn.LayerNorm`` uses. This also keeps the
    result finite when the feature dimension has a single element, where a
    Bessel-corrected standard deviation would be NaN.

    Args:
        d_model: Size of the last dimension of the inputs.
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        # Learnable per-feature scale and shift, initialised to the identity.
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(-1, keepdim=True)
        # Population variance (no Bessel correction), as in standard LayerNorm.
        var = x.var(-1, keepdim=True, unbiased=False)

        normalized = (x - mean) / torch.sqrt(var + self.eps)

        return self.gamma * normalized + self.beta

def compare_normalizations():
    """Visualize layer norm vs. batch norm on features with mismatched scales.

    Creates a random ``(batch, seq, d_model)`` tensor whose first half of the
    features is scaled up by 10 and whose second half is scaled down by 0.1,
    normalizes it with the custom ``LayerNorm`` and with ``nn.BatchNorm1d``,
    and then shows heatmaps and histograms of the first batch element for the
    original and both normalized versions. Overall mean/std of each tensor
    are printed at the end.
    """
    batch_size = 8
    seq_len = 16
    d_model = 64
    half = d_model // 2

    # Create input with varying scales
    x = torch.randn(batch_size, seq_len, d_model)
    x[:, :, :half] *= 10   # Make first half of features larger
    x[:, :, half:] *= 0.1  # Make second half smaller

    layer_norm = LayerNorm(d_model)
    batch_norm = nn.BatchNorm1d(d_model)

    # Layer normalization works on the last dimension directly.
    x_layer_norm = layer_norm(x)

    # BatchNorm1d(d_model) expects 2-D input laid out as (N, C) with
    # C == d_model, so fold batch and sequence into N. (Transposing to
    # (d_model, N) would make BatchNorm1d see N as the channel count and
    # raise a size-mismatch error.)
    x_flat = x.reshape(-1, d_model)
    x_batch_norm = batch_norm(x_flat).view(batch_size, seq_len, d_model)

    # One column per variant: heatmap on top, histogram below.
    panels = [
        ('Original Input', 'Original Distribution', 'Original', None, x),
        ('Layer Normalized', 'Layer Norm Distribution', 'Layer Norm', 'orange', x_layer_norm),
        ('Batch Normalized', 'Batch Norm Distribution', 'Batch Norm', 'green', x_batch_norm),
    ]

    fig, axes = plt.subplots(2, 3, figsize=(15, 8))

    for col, (map_title, hist_title, label, color, tensor) in enumerate(panels):
        # Only the first batch element is shown.
        sample = tensor[0].detach().numpy()

        heat_ax = axes[0, col]
        image = heat_ax.imshow(sample, cmap='RdBu', aspect='auto')
        heat_ax.set_title(map_title)
        heat_ax.set_xlabel('Feature Dimension')
        heat_ax.set_ylabel('Sequence Position')
        plt.colorbar(image, ax=heat_ax)

        hist_ax = axes[1, col]
        # color=None falls back to the default colour cycle.
        hist_ax.hist(sample.flatten(), bins=50, alpha=0.7, label=label, color=color)
        hist_ax.set_title(hist_title)
        hist_ax.set_xlabel('Value')
        hist_ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

    # Print statistics
    print("Statistics Comparison:")
    print(f"Original - Mean: {x.mean():.4f}, Std: {x.std():.4f}")
    print(f"Layer Norm - Mean: {x_layer_norm.mean():.4f}, Std: {x_layer_norm.std():.4f}")
    print(f"Batch Norm - Mean: {x_batch_norm.mean():.4f}, Std: {x_batch_norm.std():.4f}")

# Run the normalization comparison: renders the 2x3 figure and prints statistics.
compare_normalizations()

## 4. Training Stability Analysis

Let's analyze how these components affect training stability:

In [None]:
class TransformerComponent(nn.Module):
    """Feed-forward block with switchable pre-norm and skip connection.

    The block expands to ``4 * d_model`` features, applies GELU, and projects
    back to ``d_model``.

    Args:
        d_model: Input and output feature width.
        use_residual: Add the block's input to its output when true.
        use_layer_norm: Normalize the input with ``LayerNorm`` before the
            feed-forward part when true; the ``norm`` attribute exists only
            in that case.
    """

    def __init__(self, d_model, use_residual=True, use_layer_norm=True):
        super().__init__()
        self.use_residual = use_residual
        self.use_layer_norm = use_layer_norm

        hidden_width = d_model * 4
        self.linear1 = nn.Linear(d_model, hidden_width)
        self.linear2 = nn.Linear(hidden_width, d_model)

        if use_layer_norm:
            self.norm = LayerNorm(d_model)

    def forward(self, x):
        """Apply optional pre-norm, the feed-forward net, and the optional skip."""
        # Pre-norm: only the branch is normalized, the skip path stays raw.
        branch = self.norm(x) if self.use_layer_norm else x

        branch = self.linear2(F.gelu(self.linear1(branch)))

        if not self.use_residual:
            return branch
        return branch + x

def training_stability_experiment():
    """Train four block configurations on random data and compare them.

    For every combination of residual connection on/off and layer norm
    on/off, a stack of four ``TransformerComponent`` blocks is optimized with
    Adam for 100 steps against an MSE loss on freshly sampled random inputs
    and targets. Per-step loss and global gradient norm are recorded, plotted
    on log axes, and summarized in a printed table.
    """
    d_model, seq_len, batch_size = 128, 32, 16
    num_steps = 100
    depth = 4

    # Different configurations
    configs = [
        {'use_residual': True, 'use_layer_norm': True, 'name': 'Residual + LayerNorm'},
        {'use_residual': True, 'use_layer_norm': False, 'name': 'Residual Only'},
        {'use_residual': False, 'use_layer_norm': True, 'name': 'LayerNorm Only'},
        {'use_residual': False, 'use_layer_norm': False, 'name': 'Neither'}
    ]

    def global_grad_norm(parameters):
        """L2 norm over all existing gradients, accumulated tensor by tensor."""
        squared = 0
        for param in parameters:
            if param.grad is None:
                continue
            squared += param.grad.data.norm(2).item() ** 2
        return squared ** (1. / 2)

    results = {}

    for config in configs:
        blocks = [
            TransformerComponent(d_model, config['use_residual'], config['use_layer_norm'])
            for _ in range(depth)
        ]
        model = nn.Sequential(*blocks)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        history = {'losses': [], 'grad_norms': []}

        for _ in range(num_steps):
            # Fresh random batch every step: inputs first, then targets.
            x = torch.randn(batch_size, seq_len, d_model)
            target = torch.randn(batch_size, seq_len, d_model)

            optimizer.zero_grad()
            loss = F.mse_loss(model(x), target)
            loss.backward()

            # Measure gradients before the optimizer consumes them.
            step_norm = global_grad_norm(model.parameters())

            optimizer.step()

            history['losses'].append(loss.item())
            history['grad_norms'].append(step_norm)

        results[config['name']] = history

    # Plot results: loss curves on the left, gradient norms on the right.
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    panels = [
        (axes[0], 'losses', 'Loss', 'Training Loss'),
        (axes[1], 'grad_norms', 'Gradient Norm', 'Gradient Norms'),
    ]
    for ax, series_key, y_label, title in panels:
        for name, data in results.items():
            ax.plot(data[series_key], label=name, linewidth=2)
        ax.set_xlabel('Training Step')
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.legend()
        ax.grid(True, alpha=0.3)
        ax.set_yscale('log')

    plt.tight_layout()
    plt.show()

    # Print final statistics: last loss and mean gradient norm of the last 10 steps.
    print("Final Training Statistics:")
    for name, data in results.items():
        final_loss = data['losses'][-1]
        avg_grad_norm = np.mean(data['grad_norms'][-10:])
        print(f"{name:20s}: Loss = {final_loss:.6f}, Avg Grad Norm = {avg_grad_norm:.6f}")

# Run the experiment: trains all four configurations, then plots and prints results.
training_stability_experiment()