# Day 2: Advanced Tokenization with Hugging Face Tokenizers - Part 2

This notebook continues our exploration of tokenization libraries, focusing on the Hugging Face `tokenizers` library.

## Setup and Imports

In [None]:
import tiktoken
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.normalizers import Sequence, NFD, Lowercase, StripAccents
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import time
from collections import Counter

# Set style for plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Pretrained tiktoken encodings used as baselines by later cells
# (compare_tokenization_approaches and the inspect_bpe_merges demo).
# They must be created here so the notebook survives Restart & Run All;
# nothing else in this notebook defines them.
gpt2_enc = tiktoken.get_encoding("gpt2")
gpt4_enc = tiktoken.get_encoding("cl100k_base")  # encoding used by GPT-4 models

## 2. Hugging Face Tokenizers

Let's implement and train our own tokenizers using the Hugging Face `tokenizers` library.

### 2.1 BPE Tokenizer Implementation

In [None]:
def create_bpe_tokenizer(texts, vocab_size=1000):
    """Build a BPE tokenizer, train it on ``texts`` and return it.

    The tokenizer splits input with the ``Whitespace`` pre-tokenizer and,
    once trained, wraps every encoding in ``[CLS] ... [SEP]`` through a
    ``TemplateProcessing`` post-processor.

    Args:
        texts: Training corpus handed to ``train_from_iterator``
            (this notebook passes a list of strings).
        vocab_size: Target vocabulary size given to ``BpeTrainer``.

    Returns:
        The trained ``Tokenizer`` instance.
    """
    reserved_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    bpe = Tokenizer(BPE(unk_token="[UNK]"))
    bpe.pre_tokenizer = Whitespace()

    # Learn the merges; the reserved tokens are registered by the trainer.
    bpe.train_from_iterator(
        texts,
        BpeTrainer(vocab_size=vocab_size, special_tokens=reserved_tokens),
    )

    # The template needs the ids the trained vocabulary assigned to the
    # boundary markers, so this can only be set up after training.
    boundary_markers = [
        (marker, bpe.token_to_id(marker)) for marker in ("[CLS]", "[SEP]")
    ]
    bpe.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=boundary_markers,
    )

    return bpe

# Tiny demonstration corpus: ten sentences, repeated tenfold so the
# trainer sees each pattern often enough to learn merges.
training_texts = 10 * [
    "Hello world, this is a test.",
    "Tokenization is important for NLP.",
    "BPE creates subword units efficiently.",
    "Machine learning models need tokenized input.",
    "Natural language processing requires preprocessing.",
    "Transformers use attention mechanisms to process text.",
    "Word embeddings capture semantic relationships between words.",
    "Contextual embeddings depend on surrounding words.",
    "BERT is a bidirectional encoder representation from transformers.",
    "GPT models are autoregressive language models."
]

# Train a small BPE tokenizer on the corpus
bpe_tokenizer = create_bpe_tokenizer(training_texts, vocab_size=200)

# Encode a sentence that partly overlaps the corpus and show the result
test_text = "Hello world! This is a test of our BPE tokenizer."
encoding = bpe_tokenizer.encode(test_text)

print(
    f"Input: {test_text}",
    f"Tokens: {encoding.tokens}",
    f"IDs: {encoding.ids}",
    f"Vocabulary size: {bpe_tokenizer.get_vocab_size()}",
    sep="\n",
)

### 2.2 WordPiece Tokenizer Implementation

In [None]:
def create_wordpiece_tokenizer(texts, vocab_size=1000):
    """Train a WordPiece tokenizer (BERT-style) on ``texts`` and return it.

    Input is normalized with NFD, lowercasing and accent stripping, then
    split with the ``Whitespace`` pre-tokenizer before reaching the
    WordPiece model. No post-processor is attached here.

    Args:
        texts: Training corpus handed to ``train_from_iterator``
            (this notebook passes a list of strings).
        vocab_size: Target vocabulary size given to ``WordPieceTrainer``.

    Returns:
        The trained ``Tokenizer`` instance.
    """
    wordpiece = Tokenizer(WordPiece(unk_token="[UNK]"))

    # Text pipeline applied before the model: normalize, then split
    wordpiece.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    wordpiece.pre_tokenizer = Whitespace()

    wordpiece.train_from_iterator(
        texts,
        WordPieceTrainer(
            vocab_size=vocab_size,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        ),
    )

    return wordpiece

# Train a small WordPiece tokenizer on the same corpus as the BPE one
wordpiece_tokenizer = create_wordpiece_tokenizer(training_texts, vocab_size=200)

# Encode the same test sentence so the two outputs can be compared
encoding = wordpiece_tokenizer.encode(test_text)

print(
    f"Input: {test_text}",
    f"Tokens: {encoding.tokens}",
    f"IDs: {encoding.ids}",
    f"Vocabulary size: {wordpiece_tokenizer.get_vocab_size()}",
    sep="\n",
)

### 2.3 Comparing BPE and WordPiece Tokenization

In [None]:
def compare_tokenization_approaches(text):
    """Compare several tokenization schemes on the same text.

    Tokenizes ``text`` at character level, at whitespace-word level, with
    the notebook-level ``gpt2_enc`` / ``gpt4_enc`` encoders and with the
    custom ``bpe_tokenizer`` / ``wordpiece_tokenizer`` (all four are
    globals defined outside this function). It then plots token counts and
    characters-per-token side by side, prints the tokens of each method,
    and returns the summary table.

    Args:
        text: The string to tokenize.

    Returns:
        pandas.DataFrame with columns 'Tokenization', 'Token Count' and
        'Compression Ratio' (characters per token), one row per method.
        A method that yields zero tokens (e.g. for empty text) gets a
        ratio of 0.0 instead of raising ZeroDivisionError.
    """
    # Character-level tokenization
    char_tokens = list(text)

    # Word-level tokenization
    word_tokens = text.split()

    # tiktoken (GPT-2): encode returns ids; decode one id at a time for display
    gpt2_tokens = gpt2_enc.encode(text)
    gpt2_token_strings = [gpt2_enc.decode([t]) for t in gpt2_tokens]

    # tiktoken (GPT-4)
    gpt4_tokens = gpt4_enc.encode(text)
    gpt4_token_strings = [gpt4_enc.decode([t]) for t in gpt4_tokens]

    # Custom tokenizers: skip post-processor specials such as [CLS]/[SEP].
    # Otherwise the BPE count is inflated by two tokens that are not part
    # of the text, which skews the comparison against the other methods.
    bpe_tokens = bpe_tokenizer.encode(text, add_special_tokens=False).tokens
    wp_tokens = wordpiece_tokenizer.encode(text, add_special_tokens=False).tokens

    # Create comparison DataFrame
    token_counts = [
        len(char_tokens), len(word_tokens), len(gpt2_tokens),
        len(gpt4_tokens), len(bpe_tokens), len(wp_tokens),
    ]
    n_chars = len(text)
    results = {
        'Tokenization': ['Character', 'Word', 'GPT-2', 'GPT-4', 'Custom BPE', 'Custom WordPiece'],
        'Token Count': token_counts,
        # Guard the division: empty or whitespace-only text yields zero tokens
        'Compression Ratio': [n_chars / count if count else 0.0 for count in token_counts],
    }

    df = pd.DataFrame(results)

    # Visualize results
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Token count plot
    sns.barplot(data=df, x='Tokenization', y='Token Count', ax=ax1)
    ax1.set_title('Token Count by Tokenization Method')
    ax1.set_ylabel('Number of Tokens')
    ax1.tick_params(axis='x', rotation=45)

    # Compression ratio plot
    sns.barplot(data=df, x='Tokenization', y='Compression Ratio', ax=ax2)
    ax2.set_title('Compression Ratio by Tokenization Method')
    ax2.set_ylabel('Characters per Token (higher = more efficient)')
    ax2.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    # Print token examples
    print("Token Examples:")
    print(f"Character: {char_tokens[:10]}...")
    print(f"Word: {word_tokens}")
    print(f"GPT-2: {gpt2_token_strings}")
    print(f"GPT-4: {gpt4_token_strings}")
    print(f"Custom BPE: {bpe_tokens}")
    print(f"Custom WordPiece: {wp_tokens}")

    return df

# Run the comparison on a two-sentence sample and keep the summary table
comparison_text = "The quick brown fox jumps over the lazy dog. Tokenization splits text into meaningful units."
comparison_df = compare_tokenization_approaches(text=comparison_text)

## 3. Inspecting BPE Merge Operations

Let's examine the subword tokens that BPE merges produce for texts of increasing complexity.

In [None]:
def inspect_bpe_merges(tokenizer, text):
    """Print and return the tokens a tokenizer produces for ``text``.

    Supports both tokenizer flavours used in this notebook, distinguished
    by what ``tokenizer.encode(text)`` returns:

    * an object exposing ``ids`` and ``tokens`` (Hugging Face ``Encoding``):
      those attributes are used directly;
    * anything else is treated as a sequence of token ids (tiktoken style),
      and each id is decoded on its own via ``tokenizer.decode([id])``.

    Args:
        tokenizer: Object with an ``encode(text)`` method, plus
            ``decode(ids)`` when ``encode`` returns plain ids.
        text: The string to tokenize.

    Returns:
        Tuple ``(token_ids, token_strings)``.
    """
    encoded = tokenizer.encode(text)

    # Dispatch on the result, not on the tokenizer: both libraries expose
    # `encode` and `decode`, so hasattr checks on the tokenizer cannot
    # tell them apart.
    if hasattr(encoded, 'ids'):
        # HF tokenizers
        tokens = encoded.ids
        token_strings = encoded.tokens
    else:
        # tiktoken
        tokens = encoded
        token_strings = [tokenizer.decode([t]) for t in tokens]

    print(f"Text: '{text}'")
    print(f"Tokens: {token_strings}")
    print(f"Token IDs: {tokens}")

    # Multi-character tokens are the product of at least one merge;
    # bracketed entries such as [CLS] are special tokens, not merges.
    merged_tokens = [t for t in token_strings if len(t) > 1 and not t.startswith('[')]

    print(f"Merged tokens (subwords): {merged_tokens}")

    return tokens, token_strings

# Sample inputs ordered from a single short word up to a full sentence
texts_to_analyze = [
    "hello",
    "hello world",
    "internationalization",
    "preprocessing tokenization",
    "The quick brown fox jumps over the lazy dog."
]

# Inspect each sample with the GPT-2 encoder, separated by a ruler line
for text in texts_to_analyze:
    print("\n" + "="*50)
    inspect_bpe_merges(tokenizer=gpt2_enc, text=text)