# Day 4: Positional Encodings - Part 3

In this notebook, we'll explore practical applications and exercises related to positional encodings.

## Setup and Imports

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
from math import pi, sin, cos

# Configure the global plot appearance used by every figure drawn below.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release (older versions may only know it as 'seaborn') — confirm
# against the matplotlib version this notebook targets.
plt.style.use('seaborn-v0_8')
# Make seaborn's "husl" palette the default color cycle.
sns.set_palette("husl")

## 5. Positional Encoding in Practice

Let's implement a simple transformer block with positional encoding to see how it works in practice.

In [None]:
# First, let's define our positional encoding classes
class SinusoidalPositionalEncoding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional encoding.

    A table of shape ``(1, max_seq_len, d_model)`` is precomputed once. For
    position ``pos`` and frequency index ``k`` the table holds
    ``sin(pos * w_k)`` at feature ``2k`` and ``cos(pos * w_k)`` at feature
    ``2k + 1``, where ``w_k = exp(-2k * ln(10000) / d_model)``.

    Args:
        d_model: Size of the feature dimension. Both even and odd values are
            supported; with an odd value the last feature is a sine column
            without a matching cosine column.
        max_seq_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_len=5000):
        super().__init__()
        self.d_model = d_model

        # One row per position, one column per feature.
        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)

        # Geometrically decreasing frequencies, one per even feature index.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-np.log(10000.0) / d_model))
        angles = position * div_term  # (max_seq_len, ceil(d_model / 2))

        # Sine goes to the even feature indices.
        pe[:, 0::2] = torch.sin(angles)
        # Cosine goes to the odd feature indices. When d_model is odd there is
        # one odd column fewer than even columns, so drop the surplus angle;
        # for even d_model the slice keeps every column.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # A buffer follows the module across devices and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the positional table to ``x``.

        Args:
            x: Tensor whose dimension 1 is the sequence length and whose last
                dimension is ``d_model`` (i.e. ``(batch, seq_len, d_model)``).
                ``seq_len`` should not exceed ``max_seq_len``, since only
                that many rows are precomputed.

        Returns:
            ``x`` plus the first ``seq_len`` rows of the table, broadcast
            over the batch dimension.
        """
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len]

class LearnedPositionalEmbedding(nn.Module):
    """Positional information stored in a trainable lookup table.

    Args:
        max_seq_len: Number of rows (positions) in the embedding table.
        d_model: Size of each position vector.
    """

    def __init__(self, max_seq_len, d_model):
        super().__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len

        # Trainable table, re-initialised with small Gaussian noise.
        table = nn.Embedding(max_seq_len, d_model)
        nn.init.normal_(table.weight, std=0.02)
        self.position_embeddings = table

    def forward(self, x, position_ids=None):
        """Return ``x`` plus the embedding of each position.

        Args:
            x: Input whose dimension 0 is the batch size and dimension 1 the
                sequence length.
            position_ids: Optional integer tensor of position indices. When
                omitted, positions ``0 .. seq_len - 1`` are used for every
                batch element.
        """
        if position_ids is not None:
            return x + self.position_embeddings(position_ids)

        batch_size, seq_len = x.size(0), x.size(1)
        default_ids = torch.arange(seq_len, dtype=torch.long, device=x.device)
        # Same index row for every element of the batch (a view, no copy).
        default_ids = default_ids[None, :].expand(batch_size, -1)
        return x + self.position_embeddings(default_ids)

In [None]:
class TransformerWithPositionalEncoding(nn.Module):
    """Minimal transformer layer: embed tokens, add positions, self-attend.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Feature size used throughout the block.
        nhead: Number of attention heads.
        max_seq_len: Passed to the chosen positional encoder.
        pos_encoding_type: ``'sinusoidal'`` or ``'learned'``.

    Raises:
        ValueError: If ``pos_encoding_type`` is neither of the two options.
    """

    def __init__(self, vocab_size, d_model, nhead, max_seq_len, pos_encoding_type='sinusoidal'):
        super().__init__()
        self.d_model = d_model
        self.token_embedding = nn.Embedding(vocab_size, d_model)

        # Reject unsupported encoder names up front, then pick one of the two.
        if pos_encoding_type not in ('sinusoidal', 'learned'):
            raise ValueError(f"Unknown positional encoding type: {pos_encoding_type}")
        if pos_encoding_type == 'sinusoidal':
            self.pos_encoding = SinusoidalPositionalEncoding(d_model, max_seq_len)
        else:
            self.pos_encoding = LearnedPositionalEmbedding(max_seq_len, d_model)

        # A single batch-first self-attention layer followed by LayerNorm.
        self.attention = nn.MultiheadAttention(d_model, nhead, batch_first=True)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, token_ids):
        """Run the block and return ``(output, attention_weights)``.

        Token embeddings are scaled by ``sqrt(d_model)`` before the
        positional encoder is applied; the attention output is added back to
        its input and layer-normalised.
        """
        scale = np.sqrt(self.d_model)
        x = self.pos_encoding(self.token_embedding(token_ids) * scale)

        # Queries, keys and values are all the same tensor (self-attention).
        attended, weights = self.attention(x, x, x)

        return self.norm(x + attended), weights

### Demonstrating the Effect of Positional Encoding

Let's demonstrate how positional encoding affects attention patterns in a transformer model:

In [None]:
def demonstrate_position_effect():
    """Show how adding a positional encoding changes self-attention weights.

    Two models with identical weights are run on the same token sequence: one
    keeps its sinusoidal positional encoding, the other has it replaced by an
    identity module. Their attention maps for the first sequence in the batch
    are printed and plotted side by side.

    Returns:
        Tuple ``(output_with_pos, output_no_pos, attn_with_pos, attn_no_pos)``
        exactly as returned by the two forward passes.
    """

    vocab_size = 100
    d_model = 64
    nhead = 4
    max_seq_len = 10

    class NoPositionalEncoding(nn.Module):
        """Drop-in replacement that leaves the embeddings untouched."""

        def forward(self, x):
            return x

    def attention_map(attn_weights):
        """Return the 2-D (query, key) map of the first batch element."""
        first = attn_weights[0].detach()
        # nn.MultiheadAttention averages over heads by default and returns
        # (batch, query, key); if per-head weights (batch, head, query, key)
        # are supplied instead, show the first head.
        if first.dim() == 3:
            first = first[0]
        return first.numpy()

    # Model that keeps its positional encoding
    model_with_pos = TransformerWithPositionalEncoding(
        vocab_size, d_model, nhead, max_seq_len, 'sinusoidal'
    )

    # Comparison model: same architecture and the SAME weights, so the only
    # difference between the two forward passes is the positional encoding.
    model_no_pos = TransformerWithPositionalEncoding(
        vocab_size, d_model, nhead, max_seq_len, 'sinusoidal'
    )
    model_no_pos.load_state_dict(model_with_pos.state_dict())
    model_no_pos.pos_encoding = NoPositionalEncoding()

    # Test with identical tokens at different positions
    token_ids = torch.tensor([[1, 1, 1, 2, 2, 2]])  # Same tokens repeated

    # Forward passes with and without positional encoding
    output_with_pos, attn_with_pos = model_with_pos(token_ids)
    output_no_pos, attn_no_pos = model_no_pos(token_ids)

    map_with_pos = attention_map(attn_with_pos)
    map_no_pos = attention_map(attn_no_pos)

    print("Effect of Positional Encoding on Attention:")
    print("=" * 50)

    # Compare attention patterns (full query x key matrix, first sequence)
    print("Attention weights WITH positional encoding:")
    print(map_with_pos.round(3))

    print("\nAttention weights WITHOUT positional encoding:")
    print(map_no_pos.round(3))

    # Visualize attention patterns
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # With positional encoding
    im1 = ax1.imshow(map_with_pos, cmap='Blues')
    ax1.set_title('Attention WITH Positional Encoding')
    ax1.set_xlabel('Key Position')
    ax1.set_ylabel('Query Position')
    plt.colorbar(im1, ax=ax1)

    # Without positional encoding
    im2 = ax2.imshow(map_no_pos, cmap='Blues')
    ax2.set_title('Attention WITHOUT Positional Encoding')
    ax2.set_xlabel('Key Position')
    ax2.set_ylabel('Query Position')
    plt.colorbar(im2, ax=ax2)

    plt.tight_layout()
    plt.show()

    # Analyze the attention patterns
    print("\nObservations:")
    print("1. Without positional encoding, identical tokens attend to each other equally")
    print("2. With positional encoding, we see a more structured attention pattern")
    print("3. Tokens tend to attend more to themselves and nearby positions")
    print("4. The model can now distinguish between different positions of the same token")

    return output_with_pos, output_no_pos, attn_with_pos, attn_no_pos

# Run the demonstration when the cell executes. The result is the tuple
# (output_with_pos, output_no_pos, attn_with_pos, attn_no_pos).
position_effect_results = demonstrate_position_effect()

## 6. Practical Exercises

### Exercise 1: Position Encoding Comparison

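Let's compare different positional encoding methods by measuring how similar the encoded vectors at different positions are (pairwise cosine similarity). The lower the similarity between two positions, the easier they are to tell apart: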

In [None]:
def _cosine_similarity_matrix(vectors):
    """Return the pairwise cosine-similarity matrix of the rows of ``vectors``."""
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    unit_rows = vectors / norms
    return unit_rows @ unit_rows.T


def position_encoding_comparison_experiment():
    """Compare positional encoders by the similarity of encoded positions.

    The same random token embeddings are passed through a sinusoidal and a
    learned positional encoder. For each encoder the cosine similarity
    between every pair of encoded positions is computed, summarised (mean and
    standard deviation over all distinct pairs), printed, and plotted as a
    heat map.

    Returns:
        Tuple ``(results, all_similarities)``. ``results`` maps encoder name
        to a dict with keys ``'avg_similarity'``, ``'std_similarity'`` and
        ``'encoded_shape'``; ``all_similarities`` maps encoder name to the
        ``(seq_len, seq_len)`` similarity matrix.
    """

    d_model = 128
    max_seq_len = 50

    # Initialize encoders
    encoders = {
        'Sinusoidal': SinusoidalPositionalEncoding(d_model, max_seq_len),
        'Learned': LearnedPositionalEmbedding(max_seq_len, d_model)
    }

    # Test sequence
    seq_len = 20
    batch_size = 1

    # Random token embeddings, shared by both encoders
    token_embeddings = torch.randn(batch_size, seq_len, d_model)

    # Index pairs (i, j) with i < j: each distinct pair of positions once.
    distinct_pairs = np.triu_indices(seq_len, k=1)

    results = {}
    all_similarities = {}

    for name, encoder in encoders.items():
        # Both encoders share the same call signature for this use.
        encoded = encoder(token_embeddings)

        # Full similarity matrix, computed once and reused for the summary
        # statistics and for the heat map.
        vectors = encoded[0].detach().numpy().astype(np.float64)
        similarities = _cosine_similarity_matrix(vectors)
        position_similarities = similarities[distinct_pairs]

        results[name] = {
            'avg_similarity': np.mean(position_similarities),
            'std_similarity': np.std(position_similarities),
            'encoded_shape': encoded.shape
        }
        all_similarities[name] = similarities

    print("Positional Encoding Comparison:")
    print("=" * 40)
    for name, stats in results.items():
        print(f"{name}:")
        print(f"  Average position similarity: {stats['avg_similarity']:.4f}")
        print(f"  Similarity std deviation: {stats['std_similarity']:.4f}")
        print(f"  Output shape: {stats['encoded_shape']}")
        print()

    # Plot similarity matrices, one panel per encoder
    fig, axes = plt.subplots(1, len(all_similarities), figsize=(15, 6))

    for ax, (name, similarities) in zip(axes, all_similarities.items()):
        im = ax.imshow(similarities, cmap='viridis', vmin=-1, vmax=1)
        ax.set_title(f'{name} Position Similarity Matrix')
        ax.set_xlabel('Position')
        ax.set_ylabel('Position')
        plt.colorbar(im, ax=ax)

    plt.tight_layout()
    plt.show()

    return results, all_similarities

# Run the comparison experiment when the cell executes. It returns the
# per-encoder summary statistics and the per-encoder similarity matrices.
comparison_experiment_results, similarity_matrices = position_encoding_comparison_experiment()

### Exercise 2: Sequence Length Extrapolation

Let's test how well different positional encodings handle sequences longer than the length they were set up (or trained) for:

In [None]:
def sequence_length_extrapolation_test():
    """Check which positional encoders accept sequences longer than planned.

    A sinusoidal encoder with a generously sized precomputed table and a
    learned encoder with a table of only ``train_seq_len`` rows are each
    applied to random inputs of several lengths. Success or failure is
    printed per length, followed by a plot of the sinusoidal encoding at a
    few positions. Nothing is returned.
    """

    d_model = 64
    train_seq_len = 20
    test_seq_lens = [30, 40, 60, 100]
    # Size of the precomputed sinusoidal table; used for construction and for
    # the range checks before plotting.
    sinusoidal_max_len = 200

    sinusoidal = SinusoidalPositionalEncoding(d_model, max_seq_len=sinusoidal_max_len)
    # The learned table only has rows for the "training" length.
    learned = LearnedPositionalEmbedding(train_seq_len, d_model)

    print("Sequence Length Extrapolation Test:")
    print("=" * 40)

    for test_len in test_seq_lens:
        print(f"\nTesting sequence length: {test_len}")

        # Create test embeddings
        test_embeddings = torch.randn(1, test_len, d_model)

        # Sinusoidal: works for any length covered by its precomputed table.
        try:
            sinusoidal(test_embeddings)
        except RuntimeError as e:
            sin_success = False
            print(f"  Sinusoidal failed: {e}")
        else:
            sin_success = True

        # Learned: actually attempt the lookup; positions beyond the table
        # have no embedding row, so the call raises.
        try:
            learned(test_embeddings)
        except (IndexError, RuntimeError):
            learned_success = False
            print(f"  Learned encoding cannot handle length {test_len} (trained on {train_seq_len})")
        else:
            learned_success = True

        print(f"  Sinusoidal: {'✓' if sin_success else '✗'}")
        print(f"  Learned: {'✓' if learned_success else '✗'}")

    print("\nConclusion: Sinusoidal encodings can extrapolate to longer sequences,")
    print("while learned encodings are limited by their training length.")

    # Visualize sinusoidal encodings at different positions, provided every
    # tested length fits in the precomputed table.
    if max(test_seq_lens) <= sinusoidal_max_len:
        positions = [0, 10, 20, 50, 100]
        plt.figure(figsize=(12, 6))

        for pos in positions:
            if pos < sinusoidal_max_len:  # Skip positions outside the table
                encoding = sinusoidal.pe[0, pos, :32].detach().numpy()  # First 32 dimensions
                plt.plot(encoding, label=f'Position {pos}', alpha=0.7)

        plt.title('Sinusoidal Encodings at Different Positions')
        plt.xlabel('Dimension')
        plt.ylabel('Value')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

# Run the extrapolation test when the cell executes. It prints its findings
# and shows a plot; the function has no return value to capture.
sequence_length_extrapolation_test()

## Key Takeaways

1. **Position Problem**: Self-attention processes all positions in parallel and has no built-in notion of token order, so transformers need explicit positional information
2. **Sinusoidal Encoding**: Mathematical, deterministic approach that extrapolates well to unseen lengths
3. **Learned Encoding**: Flexible approach that adapts to data but is limited by training sequence length
4. **RoPE**: Modern approach with better relative position modeling through rotational transformations
5. **Relative Encoding**: Focuses on relative rather than absolute positions for more efficient modeling

## What's Next

In Day 5, we'll build a complete end-to-end pipeline combining everything: text → tokenization → embeddings → positional encoding, creating a full preprocessing pipeline for transformer models.