# Day 5: End-to-End Pipeline - Part 1

In this notebook, we'll build a complete preprocessing pipeline that takes raw text and produces transformer-ready inputs. We'll integrate all the components we've learned about in Week 1: tokenization, embeddings, and positional encoding.

## Setup and Imports

In [None]:
import torch
import torch.nn as nn
import numpy as np
from typing import List, Dict, Optional, Union
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot styling applied to every figure drawn later in this notebook.
# 'seaborn-v0_8' is the name newer matplotlib releases use for their bundled
# seaborn-like style sheet; "husl" makes seaborn's HUSL color palette the default.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Positional Encoding Components

First, let's define our positional encoding classes that we'll use in our pipeline:

In [None]:
class SinusoidalPositionalEncoding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional encoding.

    Precomputes a table of shape ``(1, max_seq_len, d_model)`` in which even
    feature columns hold ``sin(pos * w_i)`` and odd feature columns hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / d_model)``. The table is
    registered as the buffer ``pe`` so it follows the module across devices
    without being treated as a parameter.

    Args:
        d_model: Size of the feature dimension. Odd values are supported.
        max_seq_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_len=5000):
        super().__init__()
        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-np.log(10000.0) / d_model))
        angles = position * div_term  # (max_seq_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so
        # trim the angle table to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add positional encoding to input embeddings.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.

        Returns:
            ``x`` plus the first ``x.size(1)`` rows of the encoding table.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            # Without this check the slice below silently comes back short and
            # the addition either fails with an opaque broadcast error or, when
            # the table has a single row, broadcasts position 0 everywhere.
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_len {max_len}")
        return x + self.pe[:, :seq_len]

class LearnedPositionalEmbedding(nn.Module):
    """Trainable positional embedding that is summed onto its input.

    Holds one learnable ``d_model``-sized vector per position index in
    ``[0, max_seq_len)``, initialised from a normal distribution with
    standard deviation 0.02.
    """

    def __init__(self, max_seq_len, d_model):
        super().__init__()
        self.position_embeddings = nn.Embedding(max_seq_len, d_model)
        nn.init.normal_(self.position_embeddings.weight, std=0.02)

    def forward(self, x):
        """Return ``x`` with the embedding of each position index added.

        Dimension 0 of ``x`` is used as the batch size and dimension 1 as the
        sequence length; position ids ``0..seq_len-1`` are created on the same
        device as ``x`` and repeated for every batch row.
        """
        batch_size = x.size(0)
        n_positions = x.size(1)
        index_row = torch.arange(n_positions, dtype=torch.long, device=x.device)
        index_grid = index_row.unsqueeze(0).expand(batch_size, -1)
        positional = self.position_embeddings(index_grid)
        return x + positional

## 2. Complete Preprocessing Pipeline

Now, let's implement our complete transformer preprocessing pipeline that integrates tokenization, embeddings, and positional encoding:

In [None]:
class TransformerPreprocessingPipeline:
    """Complete preprocessing pipeline for transformer models.

    Chains four steps: character-level tokenization, conversion to token IDs
    (with optional special tokens, truncation and padding), embedding lookup,
    and positional encoding.

    Args:
        vocab_size: Number of rows in the embedding table; the built-in
            vocabulary is truncated to at most this many entries.
        embedding_dim: Size of each embedding vector.
        max_seq_len: Longest sequence the positional encoding supports; also
            the default ``max_length`` for encoding.
        pad_token: Text of the padding token.
        cls_token: Text of the token prepended to each sequence.
        sep_token: Text of the token appended to each sequence.
        unk_token: Text of the token substituted for unknown characters.
        pos_encoding_type: ``"sinusoidal"`` or ``"learned"``.

    Raises:
        ValueError: If ``pos_encoding_type`` is not one of the two known names.
    """

    def __init__(self,
                 vocab_size: int = 1000,
                 embedding_dim: int = 128,
                 max_seq_len: int = 512,
                 pad_token: str = "[PAD]",
                 cls_token: str = "[CLS]",
                 sep_token: str = "[SEP]",
                 unk_token: str = "[UNK]",
                 pos_encoding_type: str = "sinusoidal"):

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.max_seq_len = max_seq_len

        # Special tokens
        self.pad_token = pad_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        self.unk_token = unk_token

        # Initialize components (order matters: the embedding layer needs the
        # pad token ID produced by the tokenizer setup).
        self._init_tokenizer()
        self._init_embeddings()
        self._init_positional_encoding(pos_encoding_type)

    def _init_tokenizer(self):
        """Build the token<->ID tables and cache the special-token IDs."""
        special_tokens = [self.pad_token, self.unk_token, self.cls_token, self.sep_token]
        common_words = ["the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with"]
        chars = list("abcdefghijklmnopqrstuvwxyz0123456789.,!?;:()[]{}\"'-_/\\@#$%^&*+=<>|`~")

        vocab_list = special_tokens + common_words + chars
        vocab_list = vocab_list[:self.vocab_size]  # Limit to vocab_size

        # NOTE: "a" is listed both as a common word and as a character. The
        # later occurrence wins in the dict below, so the earlier index has no
        # entry in vocab_reverse. IDs are kept as-is for compatibility.
        # NOTE: the space character is not in the vocabulary, so tokenize()
        # maps spaces to the unknown token.
        self.vocab = {token: idx for idx, token in enumerate(vocab_list)}
        self.vocab_reverse = {idx: token for token, idx in self.vocab.items()}

        # Special token IDs
        self.pad_token_id = self.vocab[self.pad_token]
        self.cls_token_id = self.vocab[self.cls_token]
        self.sep_token_id = self.vocab[self.sep_token]
        self.unk_token_id = self.vocab[self.unk_token]

    def _init_embeddings(self):
        """Create the token embedding table with a zeroed padding row."""
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim,
                                      padding_idx=self.pad_token_id)
        nn.init.normal_(self.embedding.weight, mean=0, std=0.1)

        # normal_() above overwrote the padding row, so zero it again.
        self.embedding.weight.data[self.pad_token_id].fill_(0)

    def _init_positional_encoding(self, pos_type):
        """Instantiate the positional encoding module named by ``pos_type``.

        Raises:
            ValueError: If ``pos_type`` is neither "sinusoidal" nor "learned".
        """
        if pos_type == "sinusoidal":
            self.pos_encoding = SinusoidalPositionalEncoding(
                self.embedding_dim, self.max_seq_len)
        elif pos_type == "learned":
            self.pos_encoding = LearnedPositionalEmbedding(
                self.max_seq_len, self.embedding_dim)
        else:
            raise ValueError(f"Unknown positional encoding type: {pos_type}")

    def tokenize(self, text: str) -> List[str]:
        """Split lower-cased, stripped text into one token per character.

        Characters missing from the vocabulary become the unknown token.
        """
        text = text.lower().strip()
        return [char if char in self.vocab else self.unk_token for char in text]

    def encode_text(self, text: str,
                    add_special_tokens: bool = True,
                    max_length: Optional[int] = None,
                    padding: bool = True,
                    truncation: bool = True) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """Turn raw text into token IDs and an attention mask.

        Args:
            text: Input string.
            add_special_tokens: Wrap the tokens in the CLS and SEP tokens.
            max_length: Target length; defaults to ``self.max_seq_len``.
            padding: Right-pad with the pad token up to ``max_length``.
            truncation: Cut sequences longer than ``max_length``. When special
                tokens are enabled the final kept token is the SEP token.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (1-D long
            tensors; mask is 1 for real tokens and 0 for padding) and
            ``'tokens'`` (the unpadded token strings).

        Raises:
            ValueError: If ``max_length`` is negative.
        """
        if max_length is None:
            max_length = self.max_seq_len
        if max_length < 0:
            raise ValueError(f"max_length must be non-negative, got {max_length}")

        # Step 1: Tokenization
        tokens = self.tokenize(text)

        # Step 2: Add special tokens
        if add_special_tokens:
            tokens = [self.cls_token] + tokens + [self.sep_token]

        # Step 3: Truncation
        if truncation and len(tokens) > max_length:
            if add_special_tokens and max_length > 0:
                # Keep the sequence terminated by SEP after cutting.
                tokens = tokens[:max_length - 1] + [self.sep_token]
            else:
                # Also covers max_length == 0, where the slice [:max_length-1]
                # would be [:-1] and return an over-long sequence.
                tokens = tokens[:max_length]

        # Step 4: Convert to IDs
        input_ids = [self.vocab.get(token, self.unk_token_id) for token in tokens]

        # Step 5: Create attention mask
        attention_mask = [1] * len(input_ids)

        # Step 6: Padding
        if padding and len(input_ids) < max_length:
            padding_length = max_length - len(input_ids)
            input_ids.extend([self.pad_token_id] * padding_length)
            attention_mask.extend([0] * padding_length)

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'tokens': tokens
        }

    def get_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Look up the embedding vector for every ID in ``input_ids``."""
        return self.embedding(input_ids)

    def add_positional_encoding(self, embeddings: torch.Tensor) -> torch.Tensor:
        """Apply the configured positional encoding module to ``embeddings``."""
        return self.pos_encoding(embeddings)

    def process_text(self, text: str, **kwargs) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """Run the full pipeline: encode, embed, scale, add positions.

        Args:
            text: Input string.
            **kwargs: Forwarded unchanged to :meth:`encode_text`.

        Returns:
            Dict with ``'input_ids'``, ``'attention_mask'`` and ``'tokens'``
            from :meth:`encode_text`, plus ``'embeddings'`` of shape
            ``(sequence_length, embedding_dim)``.

        Raises:
            ValueError: If the encoded sequence is longer than
                ``self.max_seq_len`` and therefore has no positional encoding.
        """
        # Step 1: Encode text to tokens and IDs
        encoded = self.encode_text(text, **kwargs)
        input_ids = encoded['input_ids']

        seq_len = input_ids.size(0)
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Encoded length {seq_len} exceeds max_seq_len {self.max_seq_len}; "
                "lower max_length or enable truncation")

        # Step 2: Get embeddings (with a temporary batch dimension)
        embeddings = self.get_embeddings(input_ids.unsqueeze(0))

        # Step 3: Scale token embeddings by sqrt(d_model). As in the original
        # Transformer, the scaling applies to the token embeddings only and
        # happens BEFORE positions are added, so the positional signal keeps
        # its own magnitude.
        embeddings = embeddings * (self.embedding_dim ** 0.5)

        # Step 4: Add positional encoding
        embeddings_with_pos = self.add_positional_encoding(embeddings)

        return {
            'input_ids': input_ids,
            'attention_mask': encoded['attention_mask'],
            'embeddings': embeddings_with_pos.squeeze(0),
            'tokens': encoded['tokens']
        }

## 3. Testing the Pipeline

Let's test our pipeline with a comprehensive set of examples:

In [None]:
def test_pipeline_comprehensive():
    """Exercise the pipeline on varied inputs and report each outcome.

    Builds a small sinusoidal pipeline, pushes every sample through
    ``process_text`` with ``max_length=16`` and padding enabled, prints the
    resulting shapes, and checks them. A failure in one sample is printed and
    does not stop the remaining samples.

    Returns:
        The pipeline instance that was constructed for the run.
    """
    print("Comprehensive Pipeline Testing")
    print("=" * 50)

    # Small configuration keeps the demo quick.
    pipeline = TransformerPreprocessingPipeline(
        vocab_size=200,
        embedding_dim=64,
        max_seq_len=32,
        pos_encoding_type="sinusoidal",
    )

    test_texts = [
        "hello world",
        "this is a longer test sentence",
        "short",
        "",  # Empty string
        "a" * 100,  # Very long text
        "hello, world! 123",  # With punctuation and numbers
    ]

    for case_number, sample in enumerate(test_texts, start=1):
        text_suffix = '...' if len(sample) > 30 else ''
        print(f"\nTest {case_number}: '{sample[:30]}{text_suffix}'")

        try:
            result = pipeline.process_text(sample, max_length=16, padding=True)

            ids = result['input_ids']
            mask = result['attention_mask']
            vectors = result['embeddings']
            token_list = result['tokens']
            token_suffix = '...' if len(token_list) > 10 else ''

            print(f"  Input IDs shape: {ids.shape}")
            print(f"  Attention mask shape: {mask.shape}")
            print(f"  Embeddings shape: {vectors.shape}")
            print(f"  Tokens: {token_list[:10]}{token_suffix}")
            print(f"  Non-padding tokens: {mask.sum().item()}")

            # Every output must be padded or truncated to exactly 16 positions.
            assert ids.shape[0] == 16, "Input IDs length mismatch"
            assert mask.shape[0] == 16, "Attention mask length mismatch"
            assert vectors.shape == (16, 64), "Embeddings shape mismatch"

            print("  ✓ All validations passed")

        except Exception as e:
            # Deliberately broad: report the failure and move on to the next sample.
            print(f"  ✗ Error: {e}")

    return pipeline

# Run comprehensive tests and keep the returned pipeline so the
# visualization step further down can reuse the same instance.
pipeline = test_pipeline_comprehensive()

## 4. Visualizing Pipeline Components

Let's visualize the different components of our pipeline to better understand how they work together:

In [None]:
def visualize_pipeline_components(pipeline, text="hello world", max_length=16):
    """Plot and print each stage of the pipeline for one input text.

    Draws a 2x2 figure (token IDs, attention mask, first 8 dimensions of the
    final embeddings, first 8 dimensions of the positional table) and then
    prints token, ID and embedding norm for up to the first 10 non-padding
    positions.

    Args:
        pipeline: Object exposing ``process_text``, ``pos_encoding`` and
            ``vocab_reverse``.
        text: Text to run through the pipeline.
        max_length: Length the text is padded/truncated to before plotting.

    Raises:
        TypeError: If the positional module exposes neither a ``pe`` buffer
            nor a ``position_embeddings`` table.
    """
    print(f"Pipeline Visualization for: '{text}'")
    print("=" * 50)

    # Locate the positional table up front so an unsupported module fails
    # before any figure is created. Sinusoidal modules store a (1, L, D)
    # buffer named `pe`; learned ones store an embedding table instead.
    pos_module = pipeline.pos_encoding
    if hasattr(pos_module, 'pe'):
        pos_table = pos_module.pe[0]
    elif hasattr(pos_module, 'position_embeddings'):
        pos_table = pos_module.position_embeddings.weight
    else:
        raise TypeError(
            f"Cannot visualize positional encoding of type {type(pos_module).__name__}")

    # Process text
    result = pipeline.process_text(text, max_length=max_length, padding=True)
    input_ids = result['input_ids']
    attention_mask = result['attention_mask']
    embeddings = result['embeddings']
    seq_len = input_ids.shape[0]

    # Create visualization
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Token IDs
    axes[0, 0].bar(range(seq_len), input_ids.numpy())
    axes[0, 0].set_title('Token IDs')
    axes[0, 0].set_xlabel('Position')
    axes[0, 0].set_ylabel('Token ID')

    # 2. Attention Mask
    axes[0, 1].bar(range(len(attention_mask)), attention_mask.numpy())
    axes[0, 1].set_title('Attention Mask')
    axes[0, 1].set_xlabel('Position')
    axes[0, 1].set_ylabel('Mask Value')

    # 3. Embeddings (first 8 dimensions); these are the pipeline's final
    # embeddings, i.e. they already include the positional encoding.
    embeddings_subset = embeddings[:, :8].detach().numpy()
    im1 = axes[1, 0].imshow(embeddings_subset.T, cmap='RdBu', aspect='auto')
    axes[1, 0].set_title('Embeddings (first 8 dims)')
    axes[1, 0].set_xlabel('Position')
    axes[1, 0].set_ylabel('Embedding Dimension')
    plt.colorbar(im1, ax=axes[1, 0])

    # 4. Positional Encoding Pattern, limited to the positions actually used
    pos_encoding = pos_table[:seq_len, :8].detach().numpy()
    im2 = axes[1, 1].imshow(pos_encoding.T, cmap='RdBu', aspect='auto')
    axes[1, 1].set_title('Positional Encoding (first 8 dims)')
    axes[1, 1].set_xlabel('Position')
    axes[1, 1].set_ylabel('Encoding Dimension')
    plt.colorbar(im2, ax=axes[1, 1])

    plt.tight_layout()
    plt.show()

    # Print detailed breakdown
    print("\nDetailed Breakdown:")
    print("-" * 30)

    # Padding is appended on the right, so the mask sum is the count of
    # leading real tokens.
    valid_positions = attention_mask.sum().item()
    for i in range(min(valid_positions, 10)):  # Show first 10 valid positions
        token_id = input_ids[i].item()
        # Some IDs have no reverse mapping; fall back to a placeholder label.
        token = pipeline.vocab_reverse.get(token_id, f"ID_{token_id}")
        embedding_norm = torch.norm(embeddings[i]).item()

        print(f"Position {i}: '{token}' (ID: {token_id}) | Embedding norm: {embedding_norm:.3f}")

# Visualize pipeline components for a sample sentence, reusing the
# `pipeline` instance returned by test_pipeline_comprehensive() above.
visualize_pipeline_components(pipeline, "hello world!")