# Day 2: Advanced Tokenization - Part 3: Experiments and Optimization

This notebook contains the final part of our tokenization exploration, focusing on vocabulary size experiments, domain-specific tokenizers, and performance benchmarking.

## Setup and Imports

In [None]:
import tiktoken
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.normalizers import Sequence, NFD, Lowercase, StripAccents
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import time
from collections import Counter

# Plot configuration, applied once for every figure in this notebook:
# select matplotlib's 'seaborn-v0_8' style sheet, then set seaborn's "husl"
# palette as the active color palette.
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a fairly recent
# matplotlib release — confirm the minimum supported version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 4. Vocabulary Size Experiments

Let's explore how vocabulary size affects tokenization efficiency.

In [None]:
def create_bpe_tokenizer_simple(texts, vocab_size=1000):
    """Train a whitespace-pre-tokenized BPE tokenizer on ``texts``.

    Args:
        texts: Iterable of training strings, handed to ``train_from_iterator``.
        vocab_size: Target vocabulary size passed to the BPE trainer.

    Returns:
        The trained ``Tokenizer``.
    """
    reserved_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    bpe_trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=reserved_tokens)

    bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    bpe_tokenizer.pre_tokenizer = Whitespace()
    bpe_tokenizer.train_from_iterator(texts, bpe_trainer)
    return bpe_tokenizer

def _plot_vocab_metric(df, metric, title, ylabel):
    """Plot ``metric`` against requested vocabulary size, one line per test sentence."""
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=df, x='Vocabulary Size', y=metric, hue='Sentence', marker='o')
    plt.title(title)
    plt.xlabel('Vocabulary Size')
    plt.ylabel(ylabel)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


def vocabulary_size_experiment(vocab_sizes=(50, 100, 200, 500, 1000), test_sentences=None):
    """Measure how the BPE vocabulary size affects tokenized sequence length.

    For every requested vocabulary size a fresh BPE tokenizer is trained on a
    small built-in corpus and used to encode each test sentence. The results
    are plotted (sequence length and compression ratio vs. vocabulary size)
    and printed in detail.

    Args:
        vocab_sizes: Iterable of target vocabulary sizes to train with.
            Defaults to ``(50, 100, 200, 500, 1000)``.
        test_sentences: Iterable of sentences to encode with each tokenizer.
            ``None`` (the default) uses three built-in sentences.

    Returns:
        A ``pandas.DataFrame`` with one row per (vocabulary size, sentence)
        pair and the columns ``Vocabulary Size``, ``Actual Vocab Size``,
        ``Sentence``, ``Sequence Length``, ``Tokens`` and ``Compression Ratio``.
    """
    # Sample training data
    training_data = [
        "Natural language processing is fascinating.",
        "Machine learning models require careful preprocessing.",
        "Tokenization splits text into meaningful units.",
        "Subword tokenization handles out-of-vocabulary words.",
        "BERT uses WordPiece tokenization for efficiency.",
        "GPT models use BPE tokenization for text generation.",
        "Transformer architectures have revolutionized NLP.",
        "Attention mechanisms allow models to focus on relevant parts of input.",
        "Fine-tuning pre-trained models improves performance on specific tasks.",
        "Transfer learning leverages knowledge from one domain to another."
    ] * 20  # Repeat for more training data

    # Materialize both iterables: they are traversed more than once below.
    vocab_sizes = list(vocab_sizes)
    if test_sentences is None:
        test_sentences = [
            "Preprocessing tokenization algorithms efficiently.",
            "Natural language understanding requires context.",
            "Transformers process sequences in parallel."
        ]
    else:
        test_sentences = list(test_sentences)

    results = []
    for vocab_size in vocab_sizes:
        tokenizer = create_bpe_tokenizer_simple(training_data, vocab_size)
        # The trained vocabulary can differ from the requested size, so record both.
        actual_vocab_size = tokenizer.get_vocab_size()

        for sentence in test_sentences:
            encoding = tokenizer.encode(sentence)
            n_tokens = len(encoding.ids)

            results.append({
                'Vocabulary Size': vocab_size,
                'Actual Vocab Size': actual_vocab_size,
                'Sentence': sentence,
                'Sequence Length': n_tokens,
                'Tokens': encoding.tokens,
                # A sentence that encodes to zero tokens (e.g. "") has no
                # meaningful ratio; report NaN instead of dividing by zero.
                'Compression Ratio': len(sentence) / n_tokens if n_tokens else float('nan')
            })

    # Create DataFrame
    df = pd.DataFrame(results)
    if df.empty:
        # Nothing was measured (no vocabulary sizes or no sentences): skip plots.
        return df

    # Visualize results
    _plot_vocab_metric(
        df, 'Sequence Length',
        'Vocabulary Size vs. Sequence Length',
        'Sequence Length (tokens)',
    )
    _plot_vocab_metric(
        df, 'Compression Ratio',
        'Vocabulary Size vs. Compression Ratio',
        'Compression Ratio (chars/token)',
    )

    # Show detailed results
    print("Vocabulary Size vs Sequence Length:")
    for vocab_size in vocab_sizes:
        subset = df[df['Vocabulary Size'] == vocab_size]
        print(f"\nVocab Size: {vocab_size}")
        for _, row in subset.iterrows():
            print(f"  Sentence: '{row['Sentence']}'")
            print(f"  Sequence Length: {row['Sequence Length']}")
            print(f"  Tokens: {row['Tokens']}")
            print(f"  Compression Ratio: {row['Compression Ratio']:.2f}")
            print()

    return df

# Run the experiment (shows both plots and prints the per-sentence details)
# and keep the returned DataFrame, which holds one row per requested
# vocabulary size and test sentence.
vocab_experiment_df = vocabulary_size_experiment()

## 5. Domain-Specific Tokenizers

Let's create tokenizers optimized for specific domains.

In [None]:
def create_domain_tokenizer(domain_texts, domain_name, vocab_size=5000):
    """Build and train a BPE tokenizer configured for one text domain.

    ``domain_name`` selects the normalizer and the extra special tokens:

    * ``"code"``: NFD normalization only (case is preserved), plus
      ``[FUNC]``, ``[VAR]`` and ``[COMMENT]``.
    * ``"biomedical"``: NFD followed by lowercasing, plus ``[GENE]``,
      ``[PROTEIN]`` and ``[DRUG]``.
    * anything else: NFD followed by lowercasing, base special tokens only.

    Args:
        domain_texts: Iterable of training strings for the domain.
        domain_name: Domain label, interpreted as described above.
        vocab_size: Target vocabulary size for the BPE trainer.

    Returns:
        The trained ``Tokenizer``.
    """
    # Extra special tokens per domain; unlisted domains get none.
    extra_special_tokens = {
        "code": ["[FUNC]", "[VAR]", "[COMMENT]"],
        "biomedical": ["[GENE]", "[PROTEIN]", "[DRUG]"],
    }

    domain_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    # Only code keeps its original casing; every other domain (biomedical
    # included) is lowercased after NFD normalization.
    if domain_name == "code":
        domain_tokenizer.normalizer = NFD()
    else:
        domain_tokenizer.normalizer = Sequence([NFD(), Lowercase()])
    domain_tokenizer.pre_tokenizer = Whitespace()

    all_special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    all_special_tokens += extra_special_tokens.get(domain_name, [])

    # min_frequency=2: a pair must occur at least twice to be merged.
    domain_trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=all_special_tokens,
        min_frequency=2,
    )
    domain_tokenizer.train_from_iterator(domain_texts, domain_trainer)
    return domain_tokenizer

# Example domain-specific texts: one tiny training corpus per domain.
code_texts = [
    "def tokenize_text(text): return text.split()",
    "import numpy as np",
    "class BertTokenizer: def __init__(self): pass",
    "for i in range(len(tokens)): print(tokens[i])",
    "if __name__ == '__main__': main()",
    "def process_batch(batch_size=32, max_length=512):",
    "model.train(); optimizer.zero_grad()",
    "loss = criterion(outputs, targets)",
    "with torch.no_grad(): model.eval()",
    "return {k: v.item() for k, v in results.items()}"
]

biomedical_texts = [
    "The patient was diagnosed with COVID-19.",
    "BRCA1 and BRCA2 are human genes that produce tumor suppressor proteins.",
    "The drug dosage was 2.5mg administered twice daily.",
    "Protein kinases are enzymes that modify other proteins.",
    "DNA sequencing revealed a mutation in the TP53 gene.",
    "The patient's hemoglobin A1c level was 7.2%.",
    "Monoclonal antibodies target specific antigens.",
    "The study examined the efficacy of mRNA vaccines.",
    "Cytokine storm is a severe immune reaction.",
    "PCR tests detect viral genetic material in samples."
]

# General-purpose corpus, defined explicitly so this notebook runs on a fresh
# kernel. It previously aliased a `training_texts` variable that is not
# defined anywhere in this notebook, which raised NameError on
# Restart & Run All.
general_texts = [
    "Natural language processing is fascinating.",
    "Machine learning models require careful preprocessing.",
    "Tokenization splits text into meaningful units.",
    "Subword tokenization handles out-of-vocabulary words.",
    "BERT uses WordPiece tokenization for efficiency.",
    "GPT models use BPE tokenization for text generation.",
    "Transformer architectures have revolutionized NLP.",
    "Attention mechanisms allow models to focus on relevant parts of input.",
    "Fine-tuning pre-trained models improves performance on specific tasks.",
    "Transfer learning leverages knowledge from one domain to another."
]

# Create domain-specific tokenizers (same small vocabulary budget for each,
# so the comparison below is like-for-like).
code_tokenizer = create_domain_tokenizer(code_texts, "code", vocab_size=100)
biomedical_tokenizer = create_domain_tokenizer(biomedical_texts, "biomedical", vocab_size=100)
general_tokenizer = create_domain_tokenizer(general_texts, "general", vocab_size=100)

# One held-out sentence per domain, keyed by domain name.
test_texts = dict(
    code="def process_tokens(tokens): return [t.lower() for t in tokens]",
    biomedical="The study found that SARS-CoV-2 binds to ACE2 receptors with high affinity.",
    general="Natural language processing models can understand and generate human language.",
)

# Display name -> trained tokenizer; every tokenizer encodes every test text.
tokenizers = {
    "Code Tokenizer": code_tokenizer,
    "Biomedical Tokenizer": biomedical_tokenizer,
    "General Tokenizer": general_tokenizer
}

# Collect one record per (domain text, tokenizer) pair while printing the
# token-level detail for each combination.
results = []
for domain, text in test_texts.items():
    print(f"\n{domain.upper()} TEXT: '{text}'")

    for name, tokenizer in tokenizers.items():
        encoding = tokenizer.encode(text)
        token_count = len(encoding.ids)

        print(f"\n{name}:")
        print(f"  Token count: {token_count}")
        print(f"  Tokens: {encoding.tokens}")

        results.append({
            'Domain': domain,
            'Tokenizer': name,
            'Text': text,
            'Token Count': token_count,
            'Tokens': encoding.tokens
        })

# Grouped bar chart: token count per text domain, one bar per tokenizer.
domain_df = pd.DataFrame(results)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=domain_df, x='Domain', y='Token Count', hue='Tokenizer', ax=ax)
ax.set_title('Token Count by Domain and Tokenizer')
ax.set_xlabel('Text Domain')
ax.set_ylabel('Number of Tokens')
ax.tick_params(axis='x', labelrotation=0)
ax.legend(title='Tokenizer')
fig.tight_layout()
plt.show()

## 6. Performance Benchmarking

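Let's benchmark the encoding speed of different tokenizers: tiktoken's GPT-2 and `cl100k_base` (GPT-4) encodings, and the two small Hugging Face BPE tokenizers (code and biomedical) trained in Section 5.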

In [None]:
def benchmark_tokenizers(texts, iterations=50):
    """Time ``encode`` over ``texts`` for several tokenizers.

    Benchmarks tiktoken's ``gpt2`` and ``cl100k_base`` encodings and the
    notebook-level ``code_tokenizer`` and ``biomedical_tokenizer`` (both
    Hugging Face BPE tokenizers). Each tokenizer gets one warm-up pass over
    ``texts`` and is then timed over ``iterations`` full passes.

    Args:
        texts: Iterable of strings to encode; must not be empty.
        iterations: Number of timed passes over ``texts``; must be >= 1.

    Returns:
        A ``pandas.DataFrame`` with one row per tokenizer and the columns
        ``Tokenizer``, ``Total Time (s)``, ``Avg Time per Text (ms)`` and
        ``Texts per Second``.

    Raises:
        ValueError: If ``texts`` is empty or ``iterations`` is less than 1
            (there would be nothing to average over).
    """
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one string to benchmark")
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    # Load the encodings directly. A try/except NameError probe for an existing
    # notebook-level `gpt2_enc` cannot work here: assigning the name inside the
    # function makes it local, so the probe always fails and reloads anyway.
    gpt2_enc = tiktoken.get_encoding("gpt2")
    gpt4_enc = tiktoken.get_encoding("cl100k_base")

    # Both Hugging Face entries are the BPE domain tokenizers trained earlier;
    # label them as such rather than as WordPiece.
    tokenizers_to_test = {
        'tiktoken GPT-2': gpt2_enc,
        'tiktoken GPT-4': gpt4_enc,
        'HF BPE (code)': code_tokenizer,
        'HF BPE (biomedical)': biomedical_tokenizer
    }

    total_calls = iterations * len(texts)
    results = []

    for name, tokenizer in tokenizers_to_test.items():
        # Warm-up pass so one-time setup cost is excluded. tiktoken and
        # Hugging Face tokenizers both expose `.encode(text)`.
        for text in texts:
            tokenizer.encode(text)

        # perf_counter is the high-resolution clock intended for measuring
        # short intervals; wall-clock time.time() can be coarse or jump.
        start_time = time.perf_counter()
        for _ in range(iterations):
            for text in texts:
                tokenizer.encode(text)
        total_time = time.perf_counter() - start_time

        results.append({
            'Tokenizer': name,
            'Total Time (s)': total_time,
            'Avg Time per Text (ms)': (total_time / total_calls) * 1000,
            # Guard against a zero reading from a coarse clock.
            'Texts per Second': total_calls / total_time if total_time > 0 else float('inf')
        })

    return pd.DataFrame(results)

# Benchmark with sample texts of different lengths
benchmark_texts = [
    "Short text.",
    "This is a medium-length sentence for testing tokenization speed.",
    "This is a much longer text that contains multiple sentences and should provide a good test of tokenization performance across different algorithms and implementations. We want to see how each tokenizer handles longer inputs with various patterns and structures.",
    "def benchmark_function(input_data, iterations=100): return [process(data) for _ in range(iterations) for data in input_data]"
]

perf_results = benchmark_tokenizers(benchmark_texts)

print("Tokenizer Performance Benchmark:")
print("-" * 50)
print(perf_results)

# Visualize performance
plt.figure(figsize=(12, 6))

# Plot texts per second (higher is better)
ax = sns.barplot(data=perf_results, x='Tokenizer', y='Texts per Second')
plt.title('Tokenizer Performance: Texts Processed per Second')
plt.xlabel('Tokenizer')
plt.ylabel('Texts per Second (higher is better)')
plt.xticks(rotation=45)

# Add value labels on bars. bar_label places each label a fixed number of
# points above its bar, so placement stays correct whatever the magnitude of
# the values (a fixed data-unit offset does not scale with bar height).
for container in ax.containers:
    ax.bar_label(container, fmt='%.0f', padding=3)

plt.tight_layout()
plt.show()

# Plot average time per text (lower is better)
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=perf_results, x='Tokenizer', y='Avg Time per Text (ms)')
plt.title('Tokenizer Performance: Average Time per Text')
plt.xlabel('Tokenizer')
plt.ylabel('Milliseconds per Text (lower is better)')
plt.xticks(rotation=45)

# Add value labels on bars. Three decimals are used because per-text times
# are typically a small fraction of a millisecond and would otherwise
# collapse to "0.00ms".
for container in ax.containers:
    ax.bar_label(container, fmt='%.3fms', padding=3)

plt.tight_layout()
plt.show()

## 7. Key Takeaways

1. **Library Choice**: `tiktoken` is optimized for speed and works well with OpenAI models, while Hugging Face `tokenizers` offers more flexibility and customization options.

2. **Vocabulary Size**: There's a clear trade-off between vocabulary size and sequence length. Larger vocabularies result in shorter sequences but require more memory for embeddings.

3. **Domain Adaptation**: Custom tokenizers trained on domain-specific data perform better on texts from that domain, with more efficient tokenization and better handling of domain terminology.

4. **Performance**: tiktoken generally outperforms other libraries in terms of raw speed, but Hugging Face tokenizers offer more features and customization options.

5. **BPE Merges**: Understanding how BPE merges work helps debug tokenization issues and optimize vocabulary for specific use cases.

## 8. Next Steps

In Day 3, we'll explore how tokens become dense vector representations through embeddings, and how these embeddings capture semantic meaning.