# Day 2: Advanced Tokenization with tiktoken and tokenizers - Practical Exercises

This notebook contains hands-on exercises and implementations for Day 2 of the LLM learning journey.

## Learning Objectives
- Implement tokenization using tiktoken and Hugging Face tokenizers
- Compare different tokenization libraries and their performance
- Analyze vocabulary size vs sequence length trade-offs
- Create custom domain-specific tokenizers
- Benchmark tokenizer performance

## Setup and Installation

In [None]:
# Install required packages
!pip install tiktoken tokenizers transformers matplotlib seaborn pandas numpy

In [None]:
import tiktoken
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.normalizers import Sequence, NFD, Lowercase, StripAccents
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import time
from collections import Counter

# Set style for plots.
# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*";
# older releases only know the un-prefixed "seaborn" name. Pick whichever the
# installed matplotlib provides so this cell does not raise OSError.
_PLOT_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_PLOT_STYLE)
sns.set_palette("husl")

## 1. tiktoken: OpenAI's Fast Tokenizer

Let's start by exploring tiktoken and its different encodings.

In [None]:
# Load the three tiktoken encodings that are compared below
gpt2_enc = tiktoken.get_encoding("gpt2")
gpt4_enc = tiktoken.get_encoding("cl100k_base")  # GPT-4
codex_enc = tiktoken.get_encoding("p50k_base")   # Code models

# Sample sentence mixing prose, an identifier, an accent, an emoji and digits
text = "Hello, world! This is advanced tokenization with tiktoken. Let's see how it handles different types of text: code_variable, émojis 🚀, and numbers 12345."

print(f"Original text: {text}")
print(f"Text length: {len(text)} characters\n")

# Display label -> encoding object, reported in this order
encodings = {'GPT-2': gpt2_enc, 'GPT-4': gpt4_enc, 'Codex': codex_enc}

for name, enc in encodings.items():
    tokens = enc.encode(text)
    decoded = enc.decode(tokens)

    # Only the leading tokens are previewed, both as ids and as decoded pieces
    preview_ids = tokens[:10]
    preview_pieces = [enc.decode([token_id]) for token_id in preview_ids]

    # Assemble the whole report first, then emit it in one go; the trailing
    # empty entry produces the blank separator line between encodings.
    report = [
        f"{name}:",
        f"  Tokens: {len(tokens)}",
        f"  Vocab size: {enc.n_vocab:,}",
        f"  First 10 tokens: {preview_ids}",
        f"  Decoded tokens: {preview_pieces}",
        f"  Perfect reconstruction: {decoded == text}",
        "",
    ]
    print("\n".join(report))

### 1.1 Visualizing Token Distribution

Let's visualize how different encodings tokenize the same text.

In [None]:
def _token_length_frame(text, encodings):
    """Build one row per (encoding, token) holding the decoded token and its length."""
    records = []
    for name, enc in encodings.items():
        for index, token_id in enumerate(enc.encode(text)):
            # Decode each token on its own so its individual length can be measured.
            piece = enc.decode([token_id])
            records.append({
                'Encoding': name,
                'Token Index': index,
                'Token': piece,
                'Length': len(piece)
            })

    # Passing `columns` keeps the schema stable even when no token was produced.
    return pd.DataFrame(records, columns=['Encoding', 'Token Index', 'Token', 'Length'])


def visualize_tokenization(text, encodings):
    """Visualize how different encodings tokenize the same text.

    Draws two figures: a bar chart of token length per token position, and
    overlaid histograms of token lengths, one per encoding.

    Args:
        text: The string to tokenize.
        encodings: Mapping of display name -> encoding object exposing
            ``encode(str) -> list[int]`` and ``decode(list[int]) -> str``.

    Returns:
        A DataFrame with columns ``Encoding``, ``Token Index``, ``Token`` and
        ``Length`` (one row per token per encoding). If no tokens are produced
        (empty text or no encodings) the frame is empty and nothing is plotted.

    Note:
        Lengths are measured on individually decoded tokens. A token that holds
        only part of a multi-byte character cannot be decoded on its own; with
        tiktoken's default decoding such a fragment shows up as a replacement
        character and is counted with that length.
    """
    df = _token_length_frame(text, encodings)

    if df.empty:
        # Nothing to draw; plotting an empty frame would raise.
        return df

    # Plot token lengths
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=df, x='Token Index', y='Length', hue='Encoding', ax=ax)
    ax.set_title('Token Length by Position and Encoding')
    ax.set_xlabel('Token Position')
    ax.set_ylabel('Token Length (characters)')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()

    # Plot token distribution.
    # Derive the bin edges from the data: one unit-wide bin per length up to the
    # longest token, so long tokens are not silently dropped from the histogram.
    bin_edges = np.arange(1, df['Length'].max() + 2)

    fig, ax = plt.subplots(figsize=(10, 6))
    for name in df['Encoding'].unique():
        subset = df[df['Encoding'] == name]
        sns.histplot(subset['Length'], label=name, alpha=0.6, bins=bin_edges, ax=ax)

    ax.set_title('Distribution of Token Lengths')
    ax.set_xlabel('Token Length (characters)')
    ax.set_ylabel('Count')
    ax.legend()
    fig.tight_layout()
    plt.show()

    return df

# Run the visualization on the sample sentence and keep the per-token table
token_df = visualize_tokenization(text=text, encodings=encodings)

### 1.2 Testing with Different Text Types

Let's see how tiktoken handles different types of text.

In [None]:
# Test different text types
text_types = {
    'English': "The quick brown fox jumps over the lazy dog.",
    'Code': "def tokenize(text): return [token for token in text.split()]",
    'Emojis': "I love 😊 using 🚀 emojis 🎉 in my text! 👍",
    'Numbers': "The price is $1,234.56 for 42 items.",
    'Unicode': "こんにちは世界! Привет, мир! مرحبا بالعالم!",
    'Mixed': "User@example.com sent a message: 'Hello!' with #hashtag and http://example.com"
}

# Use GPT-4 encoding for comparison
encoder = gpt4_enc


def readable_token(token_bytes):
    """Return a token's bytes as text, or the raw bytes if they are not valid UTF-8.

    A multi-byte character (emoji, CJK, ...) can be split across several
    tokens. Such a fragment cannot be decoded on its own, so it is returned
    unchanged as ``bytes`` and prints as e.g. ``b'\\xf0\\x9f'`` instead of an
    uninformative replacement character.
    """
    try:
        return token_bytes.decode("utf-8")
    except UnicodeDecodeError:
        return token_bytes


for name, sample in text_types.items():
    tokens = encoder.encode(sample)
    # Work from the per-token bytes so partial characters stay visible.
    token_strings = [readable_token(raw) for raw in encoder.decode_tokens_bytes(tokens)]

    print(f"{name} text:")
    print(f"  Original: {sample}")
    print(f"  Tokens: {len(tokens)}")
    print(f"  Token IDs: {tokens}")
    print(f"  Token strings: {token_strings}")
    print()