# Day 5: End-to-End Pipeline - Part 3

In this notebook, we'll explore advanced features of our transformer preprocessing pipeline, including batch processing and pipeline statistics.

## Setup and Imports

In [None]:
import torch
import torch.nn as nn
import numpy as np
from typing import List, Dict, Optional, Union
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for plots: apply matplotlib's 'seaborn-v0_8' style sheet, then
# make seaborn's "husl" palette the default color palette.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a fairly
# recent matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Advanced Pipeline Implementation

Before adding the advanced features (batch processing and statistics tracking), we first define the positional encoding classes and the base pipeline that they build on:

In [None]:
# First, let's define our positional encoding classes
class SinusoidalPositionalEncoding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional encoding.

    A table of shape ``(1, max_seq_len, d_model)`` is built once at
    construction time: even feature columns hold ``sin(pos * freq_i)`` and odd
    feature columns hold ``cos(pos * freq_i)`` with
    ``freq_i = exp(-2i * ln(10000) / d_model)``. The table is registered as the
    buffer ``pe``, so it is part of ``state_dict()`` but is not a parameter.

    Args:
        d_model: Size of the feature (last) dimension. Both even and odd
            values are supported.
        max_seq_len: Number of positions in the precomputed table.
    """

    def __init__(self, d_model, max_seq_len=5000):
        super().__init__()
        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-np.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so drop the last frequency; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add positional encoding to input embeddings.

        Args:
            x: Tensor whose dimension 1 is the sequence length and whose last
                dimension is ``d_model`` (e.g. ``(batch, seq_len, d_model)``).

        Returns:
            ``x`` plus the first ``seq_len`` rows of the encoding table.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_len {max_len}")
        return x + self.pe[:, :seq_len]

class LearnedPositionalEmbedding(nn.Module):
    """Trainable positional embedding: one learned vector per position index.

    Args:
        max_seq_len: Number of positions in the embedding table.
        d_model: Size of each position vector.
    """

    def __init__(self, max_seq_len, d_model):
        super().__init__()
        table = nn.Embedding(max_seq_len, d_model)
        # Re-initialise with a small standard deviation.
        nn.init.normal_(table.weight, std=0.02)
        self.position_embeddings = table

    def forward(self, x):
        """Return ``x`` with the position vectors for indices 0..seq_len-1 added.

        Dimension 0 of ``x`` is used as the batch size and dimension 1 as the
        sequence length.
        """
        batch_size, seq_len = x.size(0), x.size(1)
        positions = torch.arange(seq_len, dtype=torch.long, device=x.device)
        # Same position indices for every row of the batch.
        batch_positions = positions.unsqueeze(0).expand(batch_size, -1)
        position_vectors = self.position_embeddings(batch_positions)
        return x + position_vectors

# Base pipeline class (simplified version)
class TransformerPreprocessingPipeline:
    """Base preprocessing pipeline for transformer models.

    Turns raw text into character-level tokens, token IDs with an attention
    mask, and finally scaled token embeddings with positional information
    added.

    Notes:
        * Tokenization is character-level, so the multi-character entries of
          the built-in vocabulary (e.g. "the") are never produced by
          ``tokenize``.
        * The built-in vocabulary contains no whitespace characters, so
          spaces are mapped to the unknown token.

    Args:
        vocab_size: Size of the embedding table and upper bound on the number
            of vocabulary entries.
        embedding_dim: Size of each embedding vector.
        max_seq_len: Default sequence length and number of positions made
            available to the positional encoding.
        pad_token: Padding token string.
        cls_token: Token prepended when special tokens are added.
        sep_token: Token appended when special tokens are added.
        unk_token: Token substituted for out-of-vocabulary characters.
        pos_encoding_type: ``"sinusoidal"`` or ``"learned"``.

    Raises:
        ValueError: If ``vocab_size`` is too small to hold the special tokens
            or ``pos_encoding_type`` is unknown.
    """

    def __init__(self,
                 vocab_size: int = 1000,
                 embedding_dim: int = 128,
                 max_seq_len: int = 512,
                 pad_token: str = "[PAD]",
                 cls_token: str = "[CLS]",
                 sep_token: str = "[SEP]",
                 unk_token: str = "[UNK]",
                 pos_encoding_type: str = "sinusoidal"):

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.max_seq_len = max_seq_len

        # Special tokens
        self.pad_token = pad_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        self.unk_token = unk_token

        # Initialize components (the tokenizer must come first: the embedding
        # layer needs pad_token_id).
        self._init_tokenizer()
        self._init_embeddings()
        self._init_positional_encoding(pos_encoding_type)

    def _init_tokenizer(self):
        """Build the vocabulary, its reverse mapping and the special token IDs."""
        special_tokens = [self.pad_token, self.unk_token, self.cls_token, self.sep_token]
        common_words = ["the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with"]
        chars = list("abcdefghijklmnopqrstuvwxyz0123456789.,!?;:()[]{}\"'-_/\\@#$%^&*+=<>|`~")

        # "a" is both a common word and a character. De-duplicate (keeping the
        # first occurrence) so that IDs are contiguous and every ID maps back
        # to exactly one token.
        vocab_list = list(dict.fromkeys(special_tokens + common_words + chars))
        vocab_list = vocab_list[:self.vocab_size]  # Limit to vocab_size

        self.vocab = {token: idx for idx, token in enumerate(vocab_list)}
        self.vocab_reverse = {idx: token for token, idx in self.vocab.items()}

        missing = [token for token in special_tokens if token not in self.vocab]
        if missing:
            raise ValueError(
                f"vocab_size={self.vocab_size} is too small to hold the "
                f"special tokens: {missing}")

        # Special token IDs
        self.pad_token_id = self.vocab[self.pad_token]
        self.cls_token_id = self.vocab[self.cls_token]
        self.sep_token_id = self.vocab[self.sep_token]
        self.unk_token_id = self.vocab[self.unk_token]

    def _init_embeddings(self):
        """Create the token embedding table with a zeroed padding row."""
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim,
                                      padding_idx=self.pad_token_id)
        nn.init.normal_(self.embedding.weight, mean=0, std=0.1)

        # normal_ above overwrote the padding row, so zero it out again.
        self.embedding.weight.data[self.pad_token_id].fill_(0)

    def _init_positional_encoding(self, pos_type):
        """Create the positional encoding module selected by ``pos_type``.

        Raises:
            ValueError: If ``pos_type`` is neither "sinusoidal" nor "learned".
        """
        if pos_type == "sinusoidal":
            self.pos_encoding = SinusoidalPositionalEncoding(
                self.embedding_dim, self.max_seq_len)
        elif pos_type == "learned":
            self.pos_encoding = LearnedPositionalEmbedding(
                self.max_seq_len, self.embedding_dim)
        else:
            raise ValueError(f"Unknown positional encoding type: {pos_type}")

    def tokenize(self, text: str) -> List[str]:
        """Lower-case and strip ``text``, then split it into characters.

        Characters that are not in the vocabulary are replaced by the
        unknown token.
        """
        text = text.lower().strip()
        return [char if char in self.vocab else self.unk_token for char in text]

    def encode_text(self, text: str,
                    add_special_tokens: bool = True,
                    max_length: Optional[int] = None,
                    padding: bool = True,
                    truncation: bool = True) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """Encode ``text`` into token IDs and an attention mask.

        Args:
            text: Raw input text.
            add_special_tokens: Wrap the tokens in CLS ... SEP.
            max_length: Target length; defaults to ``max_seq_len``.
            padding: Right-pad with the padding token up to ``max_length``.
            truncation: Cut sequences longer than ``max_length``. When special
                tokens are used the last kept token is replaced by SEP.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (1-D long tensors;
            the mask is 1 for real tokens and 0 for padding) and ``tokens``
            (the token strings before padding).
        """
        if max_length is None:
            max_length = self.max_seq_len

        # Step 1: Tokenization
        tokens = self.tokenize(text)

        # Step 2: Add special tokens
        if add_special_tokens:
            tokens = [self.cls_token] + tokens + [self.sep_token]

        # Step 3: Truncation (keep the sequence terminated by SEP)
        if truncation and len(tokens) > max_length:
            if add_special_tokens:
                tokens = tokens[:max_length - 1] + [self.sep_token]
            else:
                tokens = tokens[:max_length]

        # Step 4: Convert to IDs
        input_ids = [self.vocab.get(token, self.unk_token_id) for token in tokens]

        # Step 5: Create attention mask
        attention_mask = [1] * len(input_ids)

        # Step 6: Padding
        padding_length = max_length - len(input_ids)
        if padding and padding_length > 0:
            input_ids.extend([self.pad_token_id] * padding_length)
            attention_mask.extend([0] * padding_length)

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'tokens': tokens
        }

    def get_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Look up the token embeddings for ``input_ids``."""
        return self.embedding(input_ids)

    def add_positional_encoding(self, embeddings: torch.Tensor) -> torch.Tensor:
        """Apply the configured positional encoding module to ``embeddings``."""
        return self.pos_encoding(embeddings)

    def process_text(self, text: str, **kwargs) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """Run the complete pipeline on a single text.

        Args:
            text: Raw input text.
            **kwargs: Forwarded to ``encode_text``.

        Returns:
            The ``encode_text`` result plus ``embeddings``: a
            ``(seq_len, embedding_dim)`` tensor of scaled token embeddings
            with positional encoding added.
        """
        # Step 1: Encode text to tokens and IDs
        encoded = self.encode_text(text, **kwargs)

        # Step 2: Get embeddings (with a temporary batch dimension)
        embeddings = self.get_embeddings(encoded['input_ids'].unsqueeze(0))

        # Step 3: Scale the token embeddings by sqrt(d_model). As in the
        # original Transformer this happens BEFORE the positional encoding is
        # added, so the positional signal itself is not scaled.
        embeddings = embeddings * np.sqrt(self.embedding_dim)

        # Step 4: Add positional encoding
        embeddings_with_pos = self.add_positional_encoding(embeddings)

        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'embeddings': embeddings_with_pos.squeeze(0),
            'tokens': encoded['tokens']
        }

## 2. Advanced Pipeline with Batch Processing and Statistics

Now, let's extend the base pipeline with advanced features:

In [None]:
class AdvancedTransformerPipeline(TransformerPreprocessingPipeline):
    """Base pipeline extended with batch processing, statistics and saving.

    ``statistics`` accumulates over every ``process_batch`` call:

    * ``processed_texts``: number of texts processed so far
    * ``total_tokens``: number of non-padding tokens seen
    * ``avg_sequence_length``: ``total_tokens / processed_texts``
    * ``unk_token_count``: number of unknown-token IDs seen
    * ``token_frequency``: token string -> occurrence count (padding excluded)
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.statistics = {
            'processed_texts': 0,
            'total_tokens': 0,
            'avg_sequence_length': 0,
            'unk_token_count': 0,
            'token_frequency': {}
        }

    def process_batch(self, texts: List[str], **kwargs) -> Dict[str, Union[torch.Tensor, int]]:
        """Process several texts and stack the results into batch tensors.

        Each text goes through ``process_text`` with the same ``kwargs``. The
        per-text tensors are combined with ``torch.stack``, so all texts must
        encode to the same length (keep ``padding`` and ``truncation``
        enabled).

        Args:
            texts: Texts to process. An empty list yields an empty batch and
                leaves the statistics untouched.
            **kwargs: Forwarded to ``process_text`` / ``encode_text``.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` of shape
            ``(batch, seq_len)``, ``embeddings`` of shape
            ``(batch, seq_len, embedding_dim)`` and ``batch_size``.
        """
        if not texts:
            # torch.stack cannot handle an empty list; return an empty batch
            # with the sequence length the texts would have been padded to.
            max_len = kwargs.get('max_length')
            if max_len is None:
                max_len = self.max_seq_len
            return {
                'input_ids': torch.empty((0, max_len), dtype=torch.long),
                'attention_mask': torch.empty((0, max_len), dtype=torch.long),
                'embeddings': torch.empty((0, max_len, self.embedding_dim)),
                'batch_size': 0
            }

        batch_results = [self.process_text(text, **kwargs) for text in texts]

        # Stack results
        batch_input_ids = torch.stack([r['input_ids'] for r in batch_results])
        batch_attention_mask = torch.stack([r['attention_mask'] for r in batch_results])
        batch_embeddings = torch.stack([r['embeddings'] for r in batch_results])

        # Update statistics
        self._update_statistics(batch_results)

        return {
            'input_ids': batch_input_ids,
            'attention_mask': batch_attention_mask,
            'embeddings': batch_embeddings,
            'batch_size': len(texts)
        }

    def _update_statistics(self, results):
        """Fold a list of per-text results into ``self.statistics``.

        Each result must provide 1-D ``input_ids`` and ``attention_mask``
        tensors. An empty list is a no-op.
        """
        if not results:
            # Nothing to add; also avoids dividing by zero further down.
            return

        self.statistics['processed_texts'] += len(results)
        token_frequency = self.statistics['token_frequency']

        total_non_padding = 0
        for result in results:
            # Count non-padding tokens
            total_non_padding += result['attention_mask'].sum().item()

            # Convert once to plain ints instead of calling .item() per element.
            token_ids = result['input_ids'].tolist()

            # Count unknown tokens
            self.statistics['unk_token_count'] += token_ids.count(self.unk_token_id)

            # Update token frequency (padding is ignored)
            for token_id in token_ids:
                if token_id == self.pad_token_id:
                    continue
                token = self.vocab_reverse.get(token_id, f"ID_{token_id}")
                token_frequency[token] = token_frequency.get(token, 0) + 1

        # Update total tokens and average sequence length
        self.statistics['total_tokens'] += total_non_padding
        self.statistics['avg_sequence_length'] = (
            self.statistics['total_tokens'] / self.statistics['processed_texts']
        )

    def get_statistics(self):
        """Return a snapshot of the statistics.

        The snapshot owns its own ``token_frequency`` dict, so callers can
        modify it without affecting the pipeline. When at least one token has
        been counted, ``top_tokens`` holds the 10 most frequent tokens.
        """
        stats = self.statistics.copy()
        # The shallow copy above would still share the nested dict.
        stats['token_frequency'] = dict(self.statistics['token_frequency'])

        # Get top 10 most frequent tokens
        if stats['token_frequency']:
            sorted_tokens = sorted(
                stats['token_frequency'].items(),
                key=lambda item: item[1],
                reverse=True
            )[:10]
            stats['top_tokens'] = dict(sorted_tokens)

        return stats

    def save_pipeline(self, path: str):
        """Save configuration, vocabulary, statistics and weights to ``path``.

        Everything is written into a single file with ``torch.save``. Besides
        the sizes and vocabulary, the special tokens and (when it can be
        determined) the positional encoding type are stored so that an
        equivalent pipeline can be rebuilt later.
        """
        config = {
            'vocab_size': self.vocab_size,
            'embedding_dim': self.embedding_dim,
            'max_seq_len': self.max_seq_len,
            'vocab': self.vocab,
            'statistics': self.statistics,
            'pad_token': self.pad_token,
            'cls_token': self.cls_token,
            'sep_token': self.sep_token,
            'unk_token': self.unk_token
        }

        # Record which positional encoding is in use; without it a "learned"
        # pipeline would be rebuilt as "sinusoidal" and its weights would not
        # fit.
        if isinstance(self.pos_encoding, SinusoidalPositionalEncoding):
            config['pos_encoding_type'] = 'sinusoidal'
        elif isinstance(self.pos_encoding, LearnedPositionalEmbedding):
            config['pos_encoding_type'] = 'learned'

        torch.save({
            'config': config,
            'embedding_weights': self.embedding.state_dict(),
            'pos_encoding_weights': self.pos_encoding.state_dict() if hasattr(self.pos_encoding, 'state_dict') else None
        }, path)

        print(f"Pipeline saved to {path}")

## 3. Testing Advanced Pipeline Features

Let's test the advanced features of our pipeline:

In [None]:
def test_advanced_pipeline():
    """Demonstrate batch processing and statistics tracking.

    Processes two batches of sample sentences, prints the statistics after
    each batch, plots the most frequent tokens and returns the pipeline
    together with the final statistics snapshot.
    """

    def report_statistics(heading, snapshot):
        # Scalar statistics first, then the most frequent tokens.
        print(heading)
        for name, value in snapshot.items():
            if name not in ('token_frequency', 'top_tokens'):
                print(f"  {name}: {value}")
        if 'top_tokens' in snapshot:
            print("\nTop tokens:")
            for token, count in snapshot['top_tokens'].items():
                print(f"  '{token}': {count}")

    def plot_top_tokens(top_tokens):
        # Bar chart of token -> frequency.
        plt.figure(figsize=(12, 6))
        plt.bar(list(top_tokens.keys()), list(top_tokens.values()))
        plt.title('Top Token Frequencies')
        plt.xlabel('Token')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

    advanced_pipeline = AdvancedTransformerPipeline(
        vocab_size=300,
        embedding_dim=64,
        max_seq_len=24
    )

    first_batch = [
        "hello world",
        "this is a test",
        "batch processing works",
        "transformers are powerful"
    ]
    second_batch = [
        "language models are fascinating",
        "deep learning revolutionized nlp",
        "transformers use self-attention",
        "positional encoding is crucial",
        "end-to-end pipeline works well",
        "we completed week one successfully"
    ]

    print("Advanced Pipeline Testing")
    print("=" * 40)

    # First batch: show the stacked tensor shapes.
    batch_result = advanced_pipeline.process_batch(first_batch, max_length=12, padding=True)

    print("Batch processing results:")
    print(f"  Batch size: {batch_result['batch_size']}")
    print(f"  Input IDs shape: {batch_result['input_ids'].shape}")
    print(f"  Embeddings shape: {batch_result['embeddings'].shape}")

    report_statistics("\nPipeline Statistics:", advanced_pipeline.get_statistics())

    # Second batch: the statistics keep accumulating.
    print("\nProcessing more texts...")
    advanced_pipeline.process_batch(second_batch, max_length=16, padding=True)

    updated_stats = advanced_pipeline.get_statistics()
    report_statistics("\nUpdated Pipeline Statistics:", updated_stats)

    # Visualize token frequency
    if 'top_tokens' in updated_stats:
        plot_top_tokens(updated_stats['top_tokens'])

    return advanced_pipeline, updated_stats

# Test advanced pipeline. The call returns the pipeline it built together
# with the final statistics snapshot; the pipeline is reused by the saving
# cell further down.
advanced_pipeline, stats = test_advanced_pipeline()

## 4. Saving and Loading the Pipeline

Let's save the pipeline and define a helper that can load it back:

In [None]:
# Save the pipeline: configuration, statistics and weights are written with
# torch.save into a single file at the given (relative) path.
advanced_pipeline.save_pipeline('transformer_pipeline.pt')

# Load the pipeline (in a real scenario)
def load_pipeline(path):
    """Rebuild an ``AdvancedTransformerPipeline`` from a ``save_pipeline`` file.

    Args:
        path: File previously written by ``save_pipeline``.

    Returns:
        A new pipeline with the saved vocabulary, weights and statistics.

    Files that lack the optional config entries (special tokens,
    ``pos_encoding_type``) are still supported: the special tokens fall back
    to the constructor defaults and the positional encoding type is inferred
    from the saved weights.
    """
    # NOTE(security): torch.load unpickles the file, so only load files from a
    # trusted source. map_location keeps loading independent of the device the
    # pipeline was saved from.
    saved_data = torch.load(path, map_location='cpu')
    config = saved_data['config']
    pos_weights = saved_data.get('pos_encoding_weights')

    # The positional encoding type decides which weights fit. Prefer the
    # recorded type, otherwise recognise a learned embedding by its weight key.
    pos_type = config.get('pos_encoding_type')
    if pos_type is None:
        is_learned = (pos_weights is not None
                      and 'position_embeddings.weight' in pos_weights)
        pos_type = 'learned' if is_learned else 'sinusoidal'

    init_kwargs = {
        'vocab_size': config['vocab_size'],
        'embedding_dim': config['embedding_dim'],
        'max_seq_len': config['max_seq_len'],
        'pos_encoding_type': pos_type
    }
    special_token_names = ('pad_token', 'cls_token', 'sep_token', 'unk_token')
    for name in special_token_names:
        if name in config:
            init_kwargs[name] = config[name]

    # Create a new pipeline with the saved configuration
    pipeline = AdvancedTransformerPipeline(**init_kwargs)

    # Restore vocabulary
    pipeline.vocab = config['vocab']
    pipeline.vocab_reverse = {idx: token for token, idx in pipeline.vocab.items()}

    # Keep the special token IDs in sync with the restored vocabulary.
    for name in special_token_names:
        token = getattr(pipeline, name)
        if token in pipeline.vocab:
            setattr(pipeline, f"{name}_id", pipeline.vocab[token])

    # Restore weights
    pipeline.embedding.load_state_dict(saved_data['embedding_weights'])
    if pos_weights is not None and hasattr(pipeline.pos_encoding, 'load_state_dict'):
        pipeline.pos_encoding.load_state_dict(pos_weights)

    # Restore statistics
    pipeline.statistics = config['statistics']

    return pipeline

# Note: In this notebook, we won't actually load the pipeline to avoid
# file system issues, but in a real scenario, you would use:
# loaded_pipeline = load_pipeline('transformer_pipeline.pt')

## 5. Summary and Next Steps

In this notebook, we've extended our transformer preprocessing pipeline with advanced features:

1. **Batch Processing**: Efficiently handling multiple texts at once
2. **Statistics Tracking**: Monitoring token frequencies, sequence lengths, and unknown token usage
3. **Serialization**: Saving the pipeline configuration, vocabulary, statistics, and weights to a file, and loading them back

These features make our pipeline more practical for real-world applications. The complete pipeline now provides a robust foundation for transformer models, integrating all the components we've learned about in Week 1.

In Week 2, we'll build on this foundation to implement the transformer architecture itself, including self-attention mechanisms and encoder/decoder blocks.