# Day 5: End-to-End Pipeline - Part 2

In this notebook, we'll continue exploring our transformer preprocessing pipeline with performance benchmarking and integration with the docs/llm.md flow.

## Setup and Imports

In [None]:
import torch
import torch.nn as nn
import numpy as np
import time
from typing import List, Dict, Optional, Union
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for plots: apply matplotlib's 'seaborn-v0_8' style sheet, then
# make "husl" the default seaborn colour palette for every figure below.
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a recent
# matplotlib release — confirm against the environment's pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Reimplementing Required Classes

Let's first reimplement our positional encoding and pipeline classes from part 1:

In [None]:
class SinusoidalPositionalEncoding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional encoding.

    A table of shape ``(1, max_seq_len, d_model)`` is precomputed once and
    stored as the buffer ``pe``. Even feature columns hold
    ``sin(pos * freq)`` and odd feature columns hold ``cos(pos * freq)``.

    Args:
        d_model: Size of the feature dimension. Both even and odd values
            are supported.
        max_seq_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_len=5000):
        super().__init__()
        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-np.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so only the first d_model // 2 frequencies get a cosine column.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # A buffer follows the module across devices and into state_dict
        # without being treated as a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add positional encoding to input embeddings.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.

        Returns:
            ``x`` plus the first ``x.size(1)`` rows of the encoding table,
            broadcast over dimension 0.
        """
        return x + self.pe[:, :x.size(1)]

class LearnedPositionalEmbedding(nn.Module):
    """Trainable positional embedding: one learned vector per position."""

    def __init__(self, max_seq_len, d_model):
        super().__init__()
        # One row per position, initialised from N(0, 0.02^2).
        table = nn.Embedding(max_seq_len, d_model)
        nn.init.normal_(table.weight, std=0.02)
        self.position_embeddings = table

    def forward(self, x):
        """Return ``x`` with the learned position vectors added.

        Dimension 0 of ``x`` is used as the batch size and dimension 1 as
        the sequence length.
        """
        batch_size, seq_len = x.size(0), x.size(1)
        # Positions 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, dtype=torch.long, device=x.device)
        batched_positions = positions.unsqueeze(0).expand(batch_size, -1)
        return x + self.position_embeddings(batched_positions)

class TransformerPreprocessingPipeline:
    """Complete preprocessing pipeline for transformer models.

    Turns raw text into model-ready tensors: character-level tokenization,
    special tokens, truncation, ID lookup, attention mask, padding, token
    embeddings (scaled by ``sqrt(embedding_dim)``) and positional encoding.

    Args:
        vocab_size: Number of rows in the embedding table; the built-in
            vocabulary is truncated to at most this many entries.
        embedding_dim: Size of each embedding vector.
        max_seq_len: Default target length and number of positions the
            positional encoding supports.
        pad_token: Padding token string.
        cls_token: Token prepended when special tokens are added.
        sep_token: Token appended when special tokens are added.
        unk_token: Token substituted for out-of-vocabulary characters.
        pos_encoding_type: ``"sinusoidal"`` or ``"learned"``.

    Raises:
        ValueError: If ``pos_encoding_type`` is not a known type.
    """

    def __init__(self,
                 vocab_size: int = 1000,
                 embedding_dim: int = 128,
                 max_seq_len: int = 512,
                 pad_token: str = "[PAD]",
                 cls_token: str = "[CLS]",
                 sep_token: str = "[SEP]",
                 unk_token: str = "[UNK]",
                 pos_encoding_type: str = "sinusoidal"):

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.max_seq_len = max_seq_len

        # Special tokens
        self.pad_token = pad_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        self.unk_token = unk_token

        # Initialize components (the embedding layer needs pad_token_id,
        # so the tokenizer has to be built first).
        self._init_tokenizer()
        self._init_embeddings()
        self._init_positional_encoding(pos_encoding_type)

    def _init_tokenizer(self):
        """Build the token<->ID maps and cache the special-token IDs."""
        special_tokens = [self.pad_token, self.unk_token, self.cls_token, self.sep_token]
        common_words = ["the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with"]
        chars = list("abcdefghijklmnopqrstuvwxyz0123456789.,!?;:()[]{}\"'-_/\\@#$%^&*+=<>|`~")

        # "a" is both a common word and a character. dict.fromkeys drops the
        # repeat while keeping first-seen order, so IDs stay contiguous and
        # vocab_reverse is a complete inverse of vocab.
        vocab_list = list(dict.fromkeys(special_tokens + common_words + chars))
        vocab_list = vocab_list[:self.vocab_size]  # Limit to vocab_size

        self.vocab = {token: idx for idx, token in enumerate(vocab_list)}
        self.vocab_reverse = {idx: token for token, idx in self.vocab.items()}

        # Special token IDs
        self.pad_token_id = self.vocab[self.pad_token]
        self.cls_token_id = self.vocab[self.cls_token]
        self.sep_token_id = self.vocab[self.sep_token]
        self.unk_token_id = self.vocab[self.unk_token]

    def _init_embeddings(self):
        """Create the token embedding table with a zeroed padding row."""
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim,
                                      padding_idx=self.pad_token_id)
        nn.init.normal_(self.embedding.weight, mean=0, std=0.1)

        # normal_ above also overwrote the padding row, so zero it again.
        self.embedding.weight.data[self.pad_token_id].fill_(0)

    def _init_positional_encoding(self, pos_type):
        """Instantiate the positional-encoding module.

        Raises:
            ValueError: If ``pos_type`` is neither ``"sinusoidal"`` nor
                ``"learned"``.
        """
        if pos_type == "sinusoidal":
            self.pos_encoding = SinusoidalPositionalEncoding(
                self.embedding_dim, self.max_seq_len)
        elif pos_type == "learned":
            self.pos_encoding = LearnedPositionalEmbedding(
                self.max_seq_len, self.embedding_dim)
        else:
            raise ValueError(f"Unknown positional encoding type: {pos_type}")

    def tokenize(self, text: str) -> List[str]:
        """Simple tokenization (character-level for demo).

        The text is lower-cased and stripped; every character that is not a
        vocabulary entry is replaced by the unknown token.
        """
        text = text.lower().strip()
        return [char if char in self.vocab else self.unk_token for char in text]

    def encode_text(self, text: str,
                    add_special_tokens: bool = True,
                    max_length: Optional[int] = None,
                    padding: bool = True,
                    truncation: bool = True) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """Encode text into token IDs and an attention mask.

        Args:
            text: Raw input string.
            add_special_tokens: Wrap the tokens in the CLS/SEP tokens.
            max_length: Target length; defaults to ``self.max_seq_len``.
            padding: Right-pad with the pad token up to ``max_length``.
            truncation: Cut sequences longer than ``max_length``.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (1-D long
            tensors; mask is 1 for real tokens and 0 for padding) and
            ``'tokens'`` (the token strings before padding).
        """
        if max_length is None:
            max_length = self.max_seq_len

        # Step 1: Tokenization
        tokens = self.tokenize(text)

        # Step 2: Add special tokens
        if add_special_tokens:
            tokens = [self.cls_token] + tokens + [self.sep_token]

        # Step 3: Truncation
        if truncation and len(tokens) > max_length:
            if add_special_tokens:
                # Keep the sequence terminated by SEP after cutting.
                tokens = tokens[:max_length-1] + [self.sep_token]
            else:
                tokens = tokens[:max_length]

        # Step 4: Convert to IDs
        input_ids = [self.vocab.get(token, self.unk_token_id) for token in tokens]

        # Step 5: Create attention mask
        attention_mask = [1] * len(input_ids)

        # Step 6: Padding
        if padding and len(input_ids) < max_length:
            padding_length = max_length - len(input_ids)
            input_ids.extend([self.pad_token_id] * padding_length)
            attention_mask.extend([0] * padding_length)

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'tokens': tokens
        }

    def get_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Look up the token embeddings for ``input_ids``."""
        return self.embedding(input_ids)

    def add_positional_encoding(self, embeddings: torch.Tensor) -> torch.Tensor:
        """Apply the configured positional-encoding module to ``embeddings``."""
        return self.pos_encoding(embeddings)

    def process_text(self, text: str, **kwargs) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """Run the complete text -> positional embeddings pipeline.

        Args:
            text: Raw input string.
            **kwargs: Forwarded unchanged to :meth:`encode_text`.

        Returns:
            Dict with ``'input_ids'``, ``'attention_mask'`` and ``'tokens'``
            from :meth:`encode_text`, plus ``'embeddings'``: the scaled
            token embeddings with positional encoding added, with the
            temporary batch dimension removed.
        """
        # Step 1: Encode text to tokens and IDs
        encoded = self.encode_text(text, **kwargs)

        # Step 2: Get embeddings (a batch dimension of 1 is added for the
        # positional-encoding modules and removed again below).
        embeddings = self.get_embeddings(encoded['input_ids'].unsqueeze(0))

        # Step 3: Scale embeddings (as in original Transformer). Scaling
        # happens before the positional encoding is added so that only the
        # token embeddings are multiplied by sqrt(embedding_dim).
        embeddings = embeddings * np.sqrt(self.embedding_dim)

        # Step 4: Add positional encoding
        embeddings_with_pos = self.add_positional_encoding(embeddings)

        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'embeddings': embeddings_with_pos.squeeze(0),
            'tokens': encoded['tokens']
        }

## 2. Performance Benchmarking

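Let's benchmark the performance of our pipeline with different text lengths and different numbers of texts. Note that the pipeline processes one text at a time, so the "batch size" below is simply how many texts are processed back-to-back in one timed run: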

In [None]:
def benchmark_pipeline():
    """Benchmark pipeline performance.

    Times ``process_text`` (with ``max_length=32``) on short, medium and
    long inputs, repeating each input 1, 8 and 32 times. The pipeline has no
    batched entry point, so a "batch" here is a sequential loop over the
    repeated text. A summary is printed and the per-text latency is plotted
    against batch size.

    Returns:
        A list of dicts, one per (text type, batch size) pair, with keys
        ``'text_type'``, ``'text_length'``, ``'batch_size'``,
        ``'time_per_text_ms'`` and ``'texts_per_second'``.
    """

    pipeline = TransformerPreprocessingPipeline(
        vocab_size=1000,
        embedding_dim=128,
        max_seq_len=64
    )

    # Test texts of different lengths
    test_cases = [
        ("Short", "hello"),
        ("Medium", "this is a medium length sentence for testing"),
        ("Long", "this is a much longer sentence that should test the performance of our pipeline with more tokens and processing overhead" * 2),
    ]

    batch_sizes = [1, 8, 32]

    print("Pipeline Performance Benchmark")
    print("=" * 50)

    # Untimed warm-up call so one-off costs of the first invocation are not
    # attributed to the first measurement.
    pipeline.process_text(test_cases[0][1], max_length=32)

    results = []

    for name, text in test_cases:
        print(f"\n{name} text ({len(text)} chars):")

        for batch_size in batch_sizes:
            # Prepare batch
            texts = [text] * batch_size

            # perf_counter is the monotonic, high-resolution clock intended
            # for measuring short durations; time.time() can be too coarse.
            start_time = time.perf_counter()

            for text_item in texts:
                pipeline.process_text(text_item, max_length=32)

            total_time = time.perf_counter() - start_time

            time_per_text = total_time / batch_size
            # Guard against a zero reading on very coarse clocks.
            if total_time > 0:
                texts_per_second = batch_size / total_time
            else:
                texts_per_second = float('inf')

            print(f"  Batch size {batch_size:2d}: {time_per_text*1000:6.2f}ms/text | {texts_per_second:6.1f} texts/sec")

            results.append({
                'text_type': name,
                'text_length': len(text),
                'batch_size': batch_size,
                'time_per_text_ms': time_per_text * 1000,
                'texts_per_second': texts_per_second
            })

    # Visualize results
    plt.figure(figsize=(12, 6))

    # One line per text type, in the order the test cases were defined.
    for name, _ in test_cases:
        data = [r for r in results if r['text_type'] == name]
        sizes = [r['batch_size'] for r in data]
        times = [r['time_per_text_ms'] for r in data]
        plt.plot(sizes, times, 'o-', label=name)

    plt.title('Processing Time per Text vs Batch Size')
    plt.xlabel('Batch Size')
    plt.ylabel('Time per Text (ms)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    return results

# Run benchmark: prints the timing table, shows the plot, and keeps the
# returned list of per-(text type, batch size) result dicts for inspection.
benchmark_results = benchmark_pipeline()

## 3. Integration with docs/llm.md Flow

Let's demonstrate how our pipeline matches the flow described in docs/llm.md:

In [None]:
def demonstrate_llm_md_integration():
    """Walk through the docs/llm.md preprocessing flow step by step.

    Builds a small pipeline, prints the intermediate shapes for the
    embedding and positional-encoding stages, runs the full ``process_text``
    path, and draws a simple left-to-right diagram of the stages.

    Returns:
        The dict produced by ``pipeline.process_text`` for the demo sentence.
    """
    print("Integration with docs/llm.md Flow")
    print("=" * 50)

    pipeline = TransformerPreprocessingPipeline(
        vocab_size=500, embedding_dim=128, max_seq_len=32
    )
    text = "Natural language processing with transformers"

    print(f"Input Text: '{text}'")
    print("\nFollowing docs/llm.md flow:")

    # Stage 1: token embeddings (a batch dimension of 1 is added first).
    print("\n1. Input Embeddings:")
    encoded = pipeline.encode_text(text, max_length=16)
    batched_ids = encoded['input_ids'].unsqueeze(0)
    token_embeddings = pipeline.get_embeddings(batched_ids)
    first_token_sample = token_embeddings[0, 0, :5].detach().numpy()
    print(f"   Token embeddings shape: {token_embeddings.shape}")
    print(f"   Sample embedding (first token): {first_token_sample}")

    # Stage 2: positional information on top of the token embeddings.
    print("\n2. Positional Encoding:")
    positioned = pipeline.add_positional_encoding(token_embeddings)
    print(f"   With positional encoding shape: {positioned.shape}")
    print("   Position encoding added: ✓")

    # Stage 3: checklist of what is now available to the model.
    print("\n3. Ready for Transformer Processing:")
    checklist = (
        "   ✓ Input Embeddings: Created dense vector representations",
        "   ✓ Positional Encoding: Added sequence order information",
        "   ✓ Attention Mask: Created for padding tokens",
        "   → Ready for Multi-Head Attention (Week 2)",
    )
    for line in checklist:
        print(line)

    # The same flow again, this time through the single entry point.
    result = pipeline.process_text(text, max_length=16)

    print("\nFinal Output Summary:")
    print(f"   Input IDs: {result['input_ids'].shape}")
    print(f"   Embeddings: {result['embeddings'].shape}")
    print(f"   Attention Mask: {result['attention_mask'].shape}")
    print("   Ready for: Encoder/Decoder Transformer Blocks")

    # Horizontal flow diagram: one marker per stage, label just above it.
    plt.figure(figsize=(12, 8))

    steps = ['Raw Text', 'Tokenization', 'Token IDs', 'Embeddings', 'Positional\nEncoding', 'Model Input']
    n_steps = len(steps)
    x = np.arange(n_steps)
    y = np.zeros_like(x)

    plt.plot(x, y, 'o-', markersize=15, linewidth=2)

    for idx, label in enumerate(steps):
        plt.text(idx, 0.1, label, ha='center', fontsize=12, fontweight='bold')

    plt.title('Transformer Preprocessing Pipeline Flow')
    plt.xlim(-0.5, n_steps - 0.5)
    plt.ylim(-0.5, 0.5)
    plt.axis('off')
    plt.show()

    return result

# Demonstrate integration: prints the step-by-step walkthrough, shows the
# flow diagram, and keeps the dict returned by process_text for the demo text.
integration_result = demonstrate_llm_md_integration()

## 4. Summary and Next Steps

In this notebook, we've benchmarked our transformer preprocessing pipeline and demonstrated how it integrates with the flow described in docs/llm.md. The pipeline efficiently converts raw text into transformer-ready inputs through a series of steps:

1. Text normalization and tokenization
2. Adding special tokens (and truncating to the maximum length)
3. Converting tokens to IDs
4. Creating attention masks and padding
5. Creating embeddings
6. Adding positional encoding

This completes our Week 1 journey through the fundamentals of transformer preprocessing. In Week 2, we'll build on this foundation to implement the transformer architecture itself, including self-attention mechanisms and encoder/decoder blocks.