# Day 8: Transformer Architectures - Implementation and Analysis

This notebook contains practical implementations and exercises for the three main transformer architectures:
- Encoder-Only (BERT-style)
- Decoder-Only (GPT-style) 
- Encoder-Decoder (T5-style)

We'll also explore masked attention mechanisms and compare architectural properties.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Tuple
import math

# Pin the NumPy and PyTorch RNGs to one fixed seed so that anything drawn
# from them afterwards repeats from run to run.
np.random.seed(42)
torch.manual_seed(42)

# Confirmation that the import/setup cell ran to completion.
print("Libraries imported successfully!")

## 1. Encoder-Only Architecture (BERT-style)

The encoder-only architecture uses bidirectional attention to build rich contextual representations.

In [None]:
class EncoderOnlyTransformer(nn.Module):
    """Encoder-only (BERT-style) transformer over token ids.

    Pipeline: token embedding scaled by ``sqrt(d_model)`` -> positional
    encoding -> stack of ``nn.TransformerEncoderLayer`` -> final LayerNorm.
    No attention mask other than the optional key-padding mask is applied,
    so every position may attend to every other position.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Embedding / hidden width.
        n_heads: Attention heads per encoder layer.
        n_layers: Number of stacked encoder layers.
        d_ff: Width of each layer's feed-forward sub-block.
        max_seq_len: Passed to ``PositionalEncoding`` as its table length.
    """

    def __init__(self, vocab_size: int, d_model: int = 512, n_heads: int = 8, 
                 n_layers: int = 6, d_ff: int = 2048, max_seq_len: int = 512):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_len)

        # The prototype layer is deep-copied n_layers times by the stack.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=d_ff,
                dropout=0.1,
                batch_first=True,
            ),
            num_layers=n_layers,
        )
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None):
        """Encode a batch of token ids.

        Args:
            x: Integer token ids; the layers are built with
                ``batch_first=True``, so presumably ``(batch, seq_len)``.
            mask: Optional mask forwarded unchanged as
                ``src_key_padding_mask`` to the encoder stack.

        Returns:
            The layer-normalised encoder output, one ``d_model`` vector per
            input position.
        """
        scale = math.sqrt(self.d_model)
        hidden = self.pos_encoding(self.embedding(x) * scale)
        encoded = self.transformer(hidden, src_key_padding_mask=mask)
        return self.layer_norm(encoded)

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a batch-first input.

    A ``(1, max_seq_len, d_model)`` table is precomputed once and stored as
    the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(pos * w_k)`` and odd columns hold ``cos(pos * w_k)`` with
    ``w_k = 10000 ** (-2k / d_model)``.

    Args:
        d_model: Feature width of the inputs the encoding is added to.
            Both even and odd widths are supported.
        max_seq_len: Number of positions in the precomputed table.
    """

    def __init__(self, d_model: int, max_seq_len: int = 512):
        super().__init__()
        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0, max_seq_len).unsqueeze(1).float()

        # One frequency per (sin, cos) column pair: ceil(d_model / 2) values.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine column,
        # so trim the angles to the number of odd-indexed slots; for even
        # d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Buffer: follows .to()/.cuda() and is saved in state_dict, but is
        # not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as ``(batch, seq_len, d_model)``; ``seq_len`` should
        not exceed the ``max_seq_len`` the table was built with.
        """
        return x + self.pe[:, :x.size(1)]

In [None]:
# Smoke test: push one random batch through a small encoder-only model.
# The three sizes are module-level names that the later test cells reuse.
vocab_size, seq_len, batch_size = 1000, 10, 2

encoder_model = EncoderOnlyTransformer(
    vocab_size, d_model=256, n_heads=8, n_layers=4
)
input_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))

# Only the output shape is inspected, so autograd tracking is switched off.
with torch.no_grad():
    output = encoder_model(input_ids)

print(f"Encoder-only output shape: {output.shape}")
print("Each token has bidirectional context from all other tokens")

## 2. Decoder-Only Architecture (GPT-style)

The decoder-only architecture uses causal (masked) attention for autoregressive generation.

In [None]:
class DecoderOnlyTransformer(nn.Module):
    """GPT-style decoder-only transformer built on ``nn.TransformerDecoder``.

    ``nn.TransformerDecoder`` always expects a ``memory`` input for its
    cross-attention sub-layer. Here the embedded input sequence itself is
    passed as memory, and the same causal mask is applied to both
    self-attention and cross-attention so no position can read a later one.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        d_model: Embedding / hidden width.
        n_heads: Attention heads per decoder layer.
        n_layers: Number of stacked decoder layers.
        d_ff: Width of each layer's feed-forward sub-block.
        max_seq_len: Passed to ``PositionalEncoding`` as its table length.
    """

    def __init__(self, vocab_size: int, d_model: int = 512, n_heads: int = 8, 
                 n_layers: int = 6, d_ff: int = 2048, max_seq_len: int = 512):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_len)

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=0.1,
            batch_first=True
        )
        self.transformer = nn.TransformerDecoder(decoder_layer, n_layers)
        self.output_projection = nn.Linear(d_model, vocab_size)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None):
        """Compute next-token logits for every position.

        Args:
            x: Integer token ids, indexed as ``(batch, seq_len)``.
            mask: Optional boolean key-padding mask, ``True`` marking
                positions to ignore (assumed ``(batch, seq_len)``, the same
                convention as the encoder's ``src_key_padding_mask``).
                It is applied to both self- and cross-attention keys.
                NOTE: combined with the causal mask, a padded *first*
                position leaves that row with no valid key, so use
                right-padding.

        Returns:
            Logits with a trailing dimension of ``vocab_size``.
        """
        seq_len = x.size(1)

        # True above the diagonal = "may not attend". Built on the input's
        # device so the model also works after .to(device)/.cuda().
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, device=x.device), diagonal=1
        ).bool()

        # Embedding + positional encoding
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)

        # x doubles as tgt and memory (decoder-only); both attention
        # sub-layers get the causal mask and the optional padding mask.
        x = self.transformer(
            x, x,
            tgt_mask=causal_mask,
            memory_mask=causal_mask,
            tgt_key_padding_mask=mask,
            memory_key_padding_mask=mask,
        )

        # Project hidden states to vocabulary logits
        return self.output_projection(x)

In [None]:
# Smoke test: push one random batch through a small decoder-only model.
decoder_model = DecoderOnlyTransformer(
    vocab_size, d_model=256, n_heads=8, n_layers=4
)
input_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))

# Only the logits' shape is inspected, so autograd tracking is switched off.
with torch.no_grad():
    logits = decoder_model(input_ids)

print(f"Decoder-only output shape: {logits.shape}")
print("Each token can only attend to previous tokens (causal masking)")

## 3. Encoder-Decoder Architecture (T5-style)

The encoder-decoder architecture combines bidirectional encoding with causal decoding.

In [None]:
class EncoderDecoderTransformer(nn.Module):
    """T5-style encoder-decoder transformer.

    The source sequence is encoded without a causal mask; the target
    sequence is decoded with causal self-attention plus cross-attention
    over the encoder output. Source and target use separate embedding
    tables but share one ``PositionalEncoding`` module.

    Args:
        vocab_size: Size of both embedding tables and of the output logits.
        d_model: Embedding / hidden width.
        n_heads: Attention heads per layer.
        n_layers: Number of layers in the encoder and in the decoder.
        d_ff: Width of each layer's feed-forward sub-block.
        max_seq_len: Passed to ``PositionalEncoding`` as its table length.
    """

    def __init__(self, vocab_size: int, d_model: int = 512, n_heads: int = 8, 
                 n_layers: int = 6, d_ff: int = 2048, max_seq_len: int = 512):
        super().__init__()
        self.d_model = d_model
        self.src_embedding = nn.Embedding(vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_len)

        # Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=0.1,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, n_layers)

        # Decoder
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=0.1,
            batch_first=True
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, n_layers)

        self.output_projection = nn.Linear(d_model, vocab_size)

    def forward(self, src: torch.Tensor, tgt: torch.Tensor, 
                src_mask: Optional[torch.Tensor] = None,
                tgt_mask: Optional[torch.Tensor] = None):
        """Compute target-side logits given source and target token ids.

        Args:
            src: Source token ids, indexed as ``(batch, src_len)``.
            tgt: Target token ids, indexed as ``(batch, tgt_len)``.
            src_mask: Optional key-padding mask for the source (``True`` =
                ignore, assumed ``(batch, src_len)``). It is used both in
                the encoder and for the decoder's cross-attention, so padded
                source positions never influence the output.
            tgt_mask: Optional attention mask for decoder self-attention.
                When omitted, a causal mask is built for ``tgt_len``.

        Returns:
            Logits with a trailing dimension of ``vocab_size``.
        """
        # Encode source sequence (bidirectional)
        src_emb = self.src_embedding(src) * math.sqrt(self.d_model)
        src_emb = self.pos_encoding(src_emb)
        memory = self.encoder(src_emb, src_key_padding_mask=src_mask)

        # Default target mask: True above the diagonal = "may not attend".
        # Built on tgt's device so the model also works on GPU.
        if tgt_mask is None:
            tgt_len = tgt.size(1)
            tgt_mask = torch.triu(
                torch.ones(tgt_len, tgt_len, device=tgt.device), diagonal=1
            ).bool()

        tgt_emb = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
        tgt_emb = self.pos_encoding(tgt_emb)

        # Cross-attention must skip the same padded source positions the
        # encoder skipped; their memory vectors carry no real content.
        output = self.decoder(
            tgt_emb, memory,
            tgt_mask=tgt_mask,
            memory_key_padding_mask=src_mask,
        )
        return self.output_projection(output)

In [None]:
# Smoke test: push random source/target batches through a small
# encoder-decoder model.
enc_dec_model = EncoderDecoderTransformer(
    vocab_size, d_model=256, n_heads=8, n_layers=4
)
src_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))
tgt_ids = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))

# Only the logits' shape is inspected, so autograd tracking is switched off.
with torch.no_grad():
    logits = enc_dec_model(src_ids, tgt_ids)

print(f"Encoder-decoder output shape: {logits.shape}")
print("Decoder attends to encoder outputs via cross-attention")

## 4. Masked Attention Visualization

Let's visualize the different attention patterns used by each architecture.

In [None]:
def visualize_attention_patterns():
    """Draw three side-by-side heatmaps of illustrative attention masks.

    Left: an all-ones 4x4 matrix (bidirectional). Middle: a lower-triangular
    4x4 matrix (causal). Right: an all-ones 3x4 matrix standing for three
    decoder positions attending to four encoder positions. A cell value of 1
    means "may attend". The figure is shown with ``plt.show()``; nothing is
    returned.
    """
    _, panels = plt.subplots(1, 3, figsize=(15, 5))

    # One row per subplot, left to right:
    # (pattern, colormap, title, x-axis label, y-axis label)
    panel_specs = [
        (torch.ones(4, 4), 'Blues',
         'BERT: Bidirectional\n(All-to-All Attention)',
         'Key Positions', 'Query Positions'),
        (torch.tril(torch.ones(4, 4)), 'Greens',
         'GPT: Causal\n(Lower Triangular)',
         'Key Positions', 'Query Positions'),
        # 3 decoder tokens, 4 encoder tokens
        (torch.ones(3, 4), 'Oranges',
         'T5: Cross-Attention\n(Decoder → Encoder)',
         'Encoder Positions', 'Decoder Positions'),
    ]

    for panel, (pattern, colormap, title, x_label, y_label) in zip(panels, panel_specs):
        sns.heatmap(pattern.numpy(), annot=True, fmt='.0f',
                    cmap=colormap, ax=panel, cbar=False)
        panel.set_title(title)
        panel.set_xlabel(x_label)
        panel.set_ylabel(y_label)

    plt.tight_layout()
    plt.show()

visualize_attention_patterns()

## 5. Architecture Comparison

Let's compare the three architectures across different dimensions.

In [None]:
def compare_architectures():
    """Print a text summary of the three transformer families.

    For each architecture the attention type, typical use cases, example
    models, strengths and limitations are written to stdout. Nothing is
    returned.
    """
    architectures = {
        'Encoder-Only (BERT)': {
            'attention_type': 'Bidirectional',
            'use_cases': ['Classification', 'NER', 'Question Answering'],
            'examples': ['BERT', 'RoBERTa', 'DeBERTa'],
            'strengths': ['Rich bidirectional context', 'Good for understanding tasks'],
            'limitations': ['Cannot generate text', 'No causal modeling'],
        },
        'Decoder-Only (GPT)': {
            'attention_type': 'Causal/Autoregressive',
            'use_cases': ['Text Generation', 'Language Modeling', 'Few-shot Learning'],
            'examples': ['GPT-2/3/4', 'LLaMA', 'PaLM'],
            'strengths': ['Excellent generation', 'Scalable', 'Versatile'],
            'limitations': ['No bidirectional context', 'Less efficient for understanding'],
        },
        'Encoder-Decoder (T5)': {
            'attention_type': 'Bidirectional + Cross-attention',
            'use_cases': ['Translation', 'Summarization', 'Seq2Seq'],
            'examples': ['T5', 'BART', 'mT5'],
            'strengths': ['Best for seq2seq', 'Flexible input/output lengths'],
            'limitations': ['More complex', 'Higher memory usage'],
        },
    }

    # (printed label, dict key) for the list-valued fields, in print order.
    list_fields = (
        ('Use cases', 'use_cases'),
        ('Examples', 'examples'),
        ('Strengths', 'strengths'),
        ('Limitations', 'limitations'),
    )

    print("Transformer Architecture Comparison")
    print("=" * 50)

    for arch_name, details in architectures.items():
        print(f"\n{arch_name}:")
        print(f"  Attention: {details['attention_type']}")
        for label, key in list_fields:
            print(f"  {label}: {', '.join(details[key])}")

compare_architectures()

## 6. Practical Implementation Considerations

In [None]:
def practical_implementation_example():
    """Print model-size, memory and training-objective notes to stdout.

    Three sections are printed: a table of reference model configurations,
    an estimate of the memory taken by one layer's attention-score tensor,
    and the pre-training objective associated with each model family.
    Nothing is returned.
    """
    print("Practical Implementation Considerations")
    print("=" * 45)

    # Model size comparison
    configs = {
        'BERT-Base': {'layers': 12, 'd_model': 768, 'heads': 12, 'params': '110M'},
        'GPT-2': {'layers': 12, 'd_model': 768, 'heads': 12, 'params': '117M'},
        'T5-Base': {'layers': 12, 'd_model': 768, 'heads': 12, 'params': '220M'}
    }

    print("Model Size Comparison:")
    print("Model      | Layers | d_model | Heads | Parameters")
    print("-" * 50)

    for model, config in configs.items():
        print(f"{model:10s} | {config['layers']:6d} | {config['d_model']:7d} | "
              f"{config['heads']:5d} | {config['params']:>10s}")

    print("\nMemory Usage Considerations:")
    seq_len = 512
    batch_size = 8
    n_heads = 12          # matches the 'heads' entry of every config above
    bytes_per_value = 4   # fp32

    # Attention scores have one (seq_len x seq_len) matrix per head and per
    # batch element, so the head count must be part of the estimate.
    attention_memory = (
        batch_size * n_heads * seq_len * seq_len * bytes_per_value / (1024**2)
    )  # MB, for a single layer

    print(f"Attention matrices: {attention_memory:.1f} MB "
          f"per layer (fp32, {n_heads} heads)")
    print("Scales quadratically with sequence length")
    print("Encoder-decoder uses ~2x memory (self + cross attention)")

    print("\nTraining Considerations:")
    print("- BERT: Masked Language Modeling + Next Sentence Prediction")
    print("- GPT: Causal Language Modeling")
    print("- T5: Span Corruption (text-to-text)")

practical_implementation_example()

## 7. Architecture Decision Guide

In [None]:
def architecture_decision_guide():
    """Print a task-to-architecture lookup and a list of decision factors.

    Each task is printed left-aligned in a 25-character column followed by
    the recommended architecture family. Nothing is returned.
    """
    # Tasks grouped under the architecture recommended for them; groups and
    # tasks are listed in the order they are printed.
    tasks_by_architecture = {
        'Encoder-Only (BERT)': (
            'Text Classification',
            'Sentiment Analysis',
            'Named Entity Recognition',
            'Question Answering',
        ),
        'Decoder-Only (GPT)': (
            'Text Generation',
            'Language Modeling',
            'Code Generation',
            'Chat/Dialogue',
        ),
        'Encoder-Decoder (T5)': (
            'Machine Translation',
            'Summarization',
            'Text-to-Text Tasks',
            'Paraphrasing',
        ),
    }
    decision_factors = (
        "Generation vs Understanding",
        "Bidirectional vs Causal context",
        "Input-output relationship",
        "Computational efficiency",
        "Available training data",
    )

    print("Architecture Decision Guide")
    print("=" * 35)

    print("Task-Specific Recommendations:")
    print("-" * 30)
    for recommendation, tasks in tasks_by_architecture.items():
        for task in tasks:
            print(f"{task.ljust(25)}: {recommendation}")

    print("\nKey Decision Factors:")
    for number, factor in enumerate(decision_factors, start=1):
        print(f"{number}. {factor}")

architecture_decision_guide()

## 8. Exercises

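The cell below gives reference solutions for two exercises. Read through them, then try extending or re-implementing them to deepen your understanding: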

In [None]:
# Exercise 1: a simple masked-language-model head for the BERT-style encoder
class MLMHead(nn.Module):
    """Map encoder hidden states to vocabulary logits.

    The head applies a ``d_model -> d_model`` linear layer, GELU, LayerNorm
    and finally a ``d_model -> vocab_size`` linear layer.

    Args:
        d_model: Width of the incoming hidden states.
        vocab_size: Number of output logits per position.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.dense = nn.Linear(d_model, d_model)
        self.layer_norm = nn.LayerNorm(d_model)
        self.decoder = nn.Linear(d_model, vocab_size)

    def forward(self, hidden_states):
        """Return logits whose last dimension is ``vocab_size``."""
        transformed = self.layer_norm(F.gelu(self.dense(hidden_states)))
        return self.decoder(transformed)

# Exercise 2: Create a simple text generation function for GPT
def generate_text(model, input_ids, max_length=20, temperature=1.0, do_sample=False):
    """Autoregressively extend ``input_ids`` with a decoder-only model.

    By default decoding is greedy: the highest-scoring token is appended at
    every step. Dividing logits by a positive temperature does not change
    which token is largest, so ``temperature`` only affects the result when
    ``do_sample=True``, in which case the next token is drawn from
    ``softmax(logits / temperature)``.

    Args:
        model: Callable module returning logits indexed as
            ``(batch, seq_len, vocab)``. It is switched to eval mode and
            left in eval mode.
        input_ids: Prompt token ids, indexed as ``(batch, prompt_len)``.
            The tensor is not modified.
        max_length: Total length (prompt + generated) to stop at. If the
            prompt is already this long or longer, a copy is returned.
        temperature: Positive logit divisor; see above.
        do_sample: Sample from the distribution instead of taking argmax.

    Returns:
        A new tensor holding the prompt followed by the generated tokens.

    Raises:
        ValueError: If ``temperature`` is not strictly positive. Zero would
            divide the logits into inf/NaN; a negative value would invert
            the ranking.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    model.eval()
    generated = input_ids.clone()
    steps = max_length - input_ids.size(1)

    # One no_grad scope for the whole loop rather than re-entering it on
    # every step.
    with torch.no_grad():
        for _ in range(steps):
            # Only the last position's logits predict the next token.
            next_token_logits = model(generated)[:, -1, :] / temperature
            if do_sample:
                probs = torch.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
            generated = torch.cat([generated, next_token], dim=1)

    return generated

# Closing message for the exercises cell.
print(
    "Exercise implementations completed!\n"
    "Try modifying the architectures and experimenting with different configurations."
)