# Week 3 Day 12: Tokenization at Scale & Sequence Preparation - Part 1

## Overview
In this notebook, we'll explore tokenization at scale and efficient sequence preparation for language model training. We'll focus on:
- Implementing efficient tokenization with HuggingFace Tokenizers
- Comparing different tokenization algorithms
- Analyzing tokenization efficiency and quality

In [None]:
# Import necessary libraries
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import re
import requests
import time
import random
from typing import List, Dict, Tuple, Optional
from collections import Counter

# For tokenization
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer
from tokenizers.pre_tokenizers import Whitespace, ByteLevel

# Seed every random number generator the notebook uses (Python, NumPy, PyTorch)
# with the same value so repeated runs produce the same results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when PyTorch reports one, otherwise fall back to the CPU
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')
print(f"Using device: {device}")

## 1. Downloading Sample Data

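Let's download three public-domain books from Project Gutenberg (one each from fiction, science, and philosophy) to use as our tokenizer training corpus.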

In [None]:
def download_text(url: str, timeout: float = 30.0) -> str:
    """Download a text document from a URL and return it decoded as UTF-8.

    Args:
        url: Address of the text file to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, so an error page is never mistaken for corpus text.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning the error page body.
    response.raise_for_status()
    response.encoding = 'utf-8'  # Ensure proper encoding
    return response.text

def clean_gutenberg_text(text: str) -> str:
    """Strip the Project Gutenberg header/footer and normalize whitespace.

    The body is taken to start on the line after the earliest known
    "START OF ... PROJECT GUTENBERG" marker and to end at the earliest known
    end marker that follows it. When no start marker is present the text is
    used from its beginning; when no end marker is present it is used through
    its end.

    Args:
        text: Raw text of a Project Gutenberg book.

    Returns:
        The body with surrounding whitespace stripped, ``\\r\\n`` converted to
        ``\\n`` and runs of three or more newlines collapsed to two.
    """
    start_markers = (
        "*** START OF THIS PROJECT GUTENBERG",
        "***START OF THE PROJECT GUTENBERG",
        "*** START OF THE PROJECT GUTENBERG",
    )
    end_markers = (
        "*** END OF THIS PROJECT GUTENBERG",
        "***END OF THE PROJECT GUTENBERG",
        "*** END OF THE PROJECT GUTENBERG",
        "End of the Project Gutenberg",
    )

    # Locate the earliest header marker, then skip the rest of that line.
    header_hits = [pos for pos in (text.find(m) for m in start_markers) if pos != -1]
    if header_hits:
        line_end = text.find("\n", min(header_hits))
        # A header line with no newline after it means there is no body at
        # all; do not fall back to offset 0, which would keep the header.
        start_pos = line_end + 1 if line_end != -1 else len(text)
    else:
        start_pos = 0

    # Search for the footer only after the header, so an end-marker phrase
    # that happens to occur earlier cannot produce an empty slice.
    footer_hits = [
        pos for pos in (text.find(m, start_pos) for m in end_markers) if pos != -1
    ]
    end_pos = min(footer_hits, default=len(text))

    # Extract the content between header and footer
    content = text[start_pos:end_pos].strip()

    # Additional cleaning
    content = content.replace("\r\n", "\n")  # Normalize line endings
    content = re.sub(r"\n{3,}", "\n\n", content)  # Remove excessive newlines

    return content

# Download some texts from different domains
# Maps a short domain label to the URL of a Project Gutenberg plain-text file.
text_urls = {
    "fiction": "https://www.gutenberg.org/files/1342/1342-0.txt",  # Pride and Prejudice
    "science": "https://www.gutenberg.org/files/2009/2009-0.txt",  # Origin of Species
    "philosophy": "https://www.gutenberg.org/files/4280/4280-0.txt"  # Critique of Pure Reason
}

# Download and clean the texts
# A failed download is reported and skipped, so `texts` may end up with fewer
# entries than `text_urls` (or none at all).
texts = {}
for name, url in text_urls.items():
    try:
        print(f"Downloading {name}...")
        text = download_text(url)
        texts[name] = clean_gutenberg_text(text)
        print(f"Downloaded {name}: {len(texts[name])} characters")
    except Exception as e:
        print(f"Error downloading {name}: {e}")

# Create a combined corpus
# Books are joined with a blank line between them, in dictionary order.
all_text = "\n\n".join(texts.values())
print(f"Total corpus size: {len(all_text)} characters")

# Save texts to files for tokenizer training
# The tokenizer trainers below read their corpus from file paths, so each
# domain and the combined corpus are written to the local "data" directory.
import os
os.makedirs("data", exist_ok=True)

# One file per successfully downloaded domain
for name, text in texts.items():
    with open(f"data/{name}.txt", "w", encoding="utf-8") as f:
        f.write(text)

# Combined corpus in a single file
with open("data/all_texts.txt", "w", encoding="utf-8") as f:
    f.write(all_text)

## 2. Implementing Different Tokenization Algorithms

Let's implement and compare different tokenization algorithms using the HuggingFace Tokenizers library.

In [None]:
def train_bpe_tokenizer(files, vocab_size=10000):
    """Train a byte-level BPE tokenizer.

    Args:
        files: List of paths to the text files used as training corpus.
        vocab_size: Target vocabulary size, including the special tokens and
            the byte-level base alphabet.

    Returns:
        The trained ``Tokenizer`` with a byte-level pre-tokenizer and the
        matching byte-level decoder attached.
    """
    # Initialize a tokenizer with BPE model
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    # Byte-level pre-tokenization and its matching decoder
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    # Seed the vocabulary with the full byte-level alphabet. Otherwise only
    # the bytes that occur in the training files get a token, and text with
    # unseen bytes (e.g. emoji) falls back to [UNK] instead of being split
    # into byte-level pieces.
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        min_frequency=2,
        initial_alphabet=ByteLevel.alphabet(),
    )

    # Train tokenizer
    tokenizer.train(files, trainer)

    return tokenizer

def train_wordpiece_tokenizer(files, vocab_size=10000):
    """Build and train a WordPiece tokenizer on the given corpus files.

    Args:
        files: List of paths to the training text files.
        vocab_size: Target vocabulary size passed to the trainer.

    Returns:
        The trained ``Tokenizer`` (WordPiece model, ``Whitespace``
        pre-tokenizer, ``[UNK]`` as the unknown token).
    """
    reserved_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    # Model and pre-tokenizer
    wp_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    wp_tokenizer.pre_tokenizer = Whitespace()

    # Trainer configuration; pieces seen fewer than twice are not kept
    wp_trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=reserved_tokens,
        min_frequency=2,
    )

    wp_tokenizer.train(files, wp_trainer)
    return wp_tokenizer

def train_unigram_tokenizer(files, vocab_size=10000):
    """Build and train a Unigram tokenizer on the given corpus files.

    Args:
        files: List of paths to the training text files.
        vocab_size: Target vocabulary size passed to the trainer.

    Returns:
        The trained ``Tokenizer`` (Unigram model, ``Whitespace``
        pre-tokenizer, ``[UNK]`` registered as the unknown token).
    """
    reserved_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    # Model and pre-tokenizer
    uni_tokenizer = Tokenizer(Unigram())
    uni_tokenizer.pre_tokenizer = Whitespace()

    # For Unigram the unknown token is configured on the trainer
    uni_trainer = UnigramTrainer(
        vocab_size=vocab_size,
        special_tokens=reserved_tokens,
        unk_token="[UNK]",
    )

    uni_tokenizer.train(files, uni_trainer)
    return uni_tokenizer

In [None]:
# Train tokenizers
# All three tokenizers are trained on the combined corpus with the same
# vocabulary size so the later comparisons are like-for-like.
files = ["data/all_texts.txt"]
vocab_size = 8000

# Durations are measured with time.perf_counter(), a monotonic high-resolution
# clock; time.time() follows the wall clock and can jump if the system time is
# adjusted while training runs.
print("Training BPE tokenizer...")
start_time = time.perf_counter()
bpe_tokenizer = train_bpe_tokenizer(files, vocab_size)
bpe_time = time.perf_counter() - start_time
print(f"BPE tokenizer trained in {bpe_time:.2f} seconds")

print("\nTraining WordPiece tokenizer...")
start_time = time.perf_counter()
wordpiece_tokenizer = train_wordpiece_tokenizer(files, vocab_size)
wordpiece_time = time.perf_counter() - start_time
print(f"WordPiece tokenizer trained in {wordpiece_time:.2f} seconds")

print("\nTraining Unigram tokenizer...")
start_time = time.perf_counter()
unigram_tokenizer = train_unigram_tokenizer(files, vocab_size)
unigram_time = time.perf_counter() - start_time
print(f"Unigram tokenizer trained in {unigram_time:.2f} seconds")

## 3. Comparing Tokenization Algorithms

Let's compare the different tokenization algorithms in terms of encoding speed and the sequence lengths they produce for each domain.

In [None]:
def compare_tokenization_speed(tokenizers, text, num_runs=5):
    """Measure the average time each tokenizer needs to encode ``text``.

    Args:
        tokenizers: Mapping of display name to an object with an
            ``encode(text)`` method.
        text: The string every tokenizer encodes.
        num_runs: Number of timed encode calls to average over (>= 1).

    Returns:
        Dict mapping tokenizer name to the mean seconds per encode call.

    Raises:
        ValueError: If ``num_runs`` is less than 1, since no meaningful
            average can be computed.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    results = {}

    for name, tokenizer in tokenizers.items():
        # Warm-up calls are excluded from the measurement
        for _ in range(3):
            tokenizer.encode(text)

        # perf_counter() is monotonic and high-resolution, unlike the wall
        # clock returned by time.time().
        start_time = time.perf_counter()
        for _ in range(num_runs):
            tokenizer.encode(text)
        results[name] = (time.perf_counter() - start_time) / num_runs

    return results

def compare_sequence_lengths(tokenizers, texts):
    """Count the tokens each tokenizer produces for each text.

    Args:
        tokenizers: Mapping of tokenizer name to an object whose
            ``encode(text)`` result exposes an ``ids`` sequence.
        texts: Mapping of domain name to the text to encode.

    Returns:
        Nested dict ``{tokenizer_name: {domain: number_of_token_ids}}``.
    """
    return {
        tok_name: {
            domain: len(tok.encode(body).ids)
            for domain, body in texts.items()
        }
        for tok_name, tok in tokenizers.items()
    }

def analyze_token_distribution(tokenizer, text, top_n=20):
    """Return the ``top_n`` most frequent tokens of ``text`` with their counts.

    Args:
        tokenizer: Object with ``encode(text)`` (result exposes ``ids``) and
            ``decode(list_of_ids)``.
        text: The text to encode and analyze.
        top_n: How many of the most frequent token IDs to report.

    Returns:
        List of ``(decoded_token, count)`` tuples, most frequent first. Each
        token ID is decoded on its own via ``tokenizer.decode([token_id])``.
    """
    id_frequencies = Counter(tokenizer.encode(text).ids)
    return [
        (tokenizer.decode([tid]), freq)
        for tid, freq in id_frequencies.most_common(top_n)
    ]

# Compare tokenization speed
# Display name -> trained tokenizer; the names are reused as plot labels.
tokenizers = {
    "BPE": bpe_tokenizer,
    "WordPiece": wordpiece_tokenizer,
    "Unigram": unigram_tokenizer
}

# Use a sample of text for speed comparison
sample_text = all_text[:100000]  # First 100K characters

print("Comparing tokenization speed...")
speed_results = compare_tokenization_speed(tokenizers, sample_text)
for name, time_taken in speed_results.items():
    print(f"{name}: {time_taken:.6f} seconds")

# Compare sequence lengths
print("\nComparing sequence lengths...")
length_results = compare_sequence_lengths(tokenizers, texts)
for tokenizer_name, domain_lengths in length_results.items():
    print(f"\n{tokenizer_name}:")
    for domain, length in domain_lengths.items():
        print(f"  {domain}: {length} tokens")

# Plot tokenization speed comparison
plt.figure(figsize=(10, 6))
plt.bar(speed_results.keys(), speed_results.values())
plt.title('Tokenization Speed Comparison')
plt.xlabel('Tokenizer')
plt.ylabel('Time (seconds)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Plot sequence length comparison (grouped bars: one group per domain,
# one bar per tokenizer)
plt.figure(figsize=(12, 6))
domains = list(texts.keys())
x = np.arange(len(domains))
width = 0.25

# Shift each tokenizer's bars right by one bar width per position
for multiplier, (tokenizer_name, domain_lengths) in enumerate(length_results.items()):
    offset = width * multiplier
    plt.bar(x + offset, [domain_lengths[domain] for domain in domains], width, label=tokenizer_name)

plt.title('Sequence Length Comparison')
plt.xlabel('Domain')
plt.ylabel('Number of Tokens')
# Center each tick under its bar group for any number of tokenizers
# (for three tokenizers this is x + width).
plt.xticks(x + width * (len(length_results) - 1) / 2, domains)
plt.legend(loc='best')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

## 4. Analyzing Token Distribution

Let's analyze the token distribution for each tokenizer.

In [None]:
# Analyze token distributions
# Print the most common tokens of the full corpus for every tokenizer.
for name, tokenizer in tokenizers.items():
    print(f"\n{name} - Most common tokens:")
    token_dist = analyze_token_distribution(tokenizer, all_text)
    for token, count in token_dist:
        # Replace newlines and tabs for display
        display_token = token.replace('\n', '\\n').replace('\t', '\\t')
        print(f"'{display_token}': {count}")

# Plot token frequency distribution (rank vs. frequency, one panel per tokenizer)
plt.figure(figsize=(15, 6))

for i, (name, tokenizer) in enumerate(tokenizers.items()):
    # Encode full text
    encoding = tokenizer.encode(all_text)
    token_counts = Counter(encoding.ids)

    # Get frequencies of top 100 tokens
    top_tokens = token_counts.most_common(100)
    freqs = [count for _, count in top_tokens]

    # Plot on log scale; the grid has one column per tokenizer, so adding or
    # removing an entry in `tokenizers` does not break the layout.
    plt.subplot(1, len(tokenizers), i + 1)
    plt.loglog(range(1, len(freqs) + 1), freqs, marker='o', markersize=3)
    plt.title(f'{name} Token Distribution')
    plt.xlabel('Token Rank')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Tokenization of Out-of-Vocabulary Words

Let's examine how each tokenizer handles out-of-vocabulary (OOV) words.

In [None]:
def compare_oov_handling(tokenizers, words):
    """Show how each tokenizer splits the given words into tokens.

    Args:
        tokenizers: Mapping of tokenizer name to an object whose
            ``encode(word)`` result exposes ``ids`` and ``tokens``.
        words: Iterable of words to encode one at a time.

    Returns:
        Nested dict ``{tokenizer_name: {word: (ids, tokens)}}`` where
        ``tokens`` is the list of token strings, one per entry in ``ids``.
    """
    results = {}

    for name, tokenizer in tokenizers.items():
        word_tokens = {}
        for word in words:
            encoding = tokenizer.encode(word)
            # Read the pieces straight from the encoding. Decoding the IDs
            # and splitting on whitespace does not recover them: a decoder
            # that re-joins pieces returns the original word as one "token",
            # and decoding can drop special tokens such as the unknown token.
            word_tokens[word] = (encoding.ids, list(encoding.tokens))
        results[name] = word_tokens

    return results

# Words that are unlikely to appear in the training books, chosen to cover
# different kinds of unseen input
oov_words = [
    "transformer",  # Technical term
    "COVID19",  # Recent term
    "blockchain",  # Technical term
    "supercalifragilisticexpialidocious",  # Very long word
    "😊",  # Emoji
    "https://example.com",  # URL
    "#hashtag",  # Social media tag
]

oov_results = compare_oov_handling(tokenizers, oov_words)

# Report, word by word, how many tokens each tokenizer needed and what they were
for word in oov_words:
    print(f"\nWord: '{word}'")
    for name in tokenizers:
        ids, tokens = oov_results[name][word]
        print(f"  {name}: {len(ids)} tokens - {tokens}")

## 6. Saving and Loading Tokenizers

Let's save our trained tokenizers for later use.

In [None]:
# Make sure the output directory exists before writing into it
os.makedirs("tokenizers", exist_ok=True)

# Persist every tokenizer as a JSON file named after its lower-cased label
for name, tokenizer in tokenizers.items():
    tokenizer.save(f"tokenizers/{name.lower()}_tokenizer.json")
    print(f"Saved {name} tokenizer")

# Round-trip check: read the BPE tokenizer back from disk
loaded_tokenizer = Tokenizer.from_file("tokenizers/bpe_tokenizer.json")
print("\nLoaded BPE tokenizer successfully")

# Encode the same sentence with the in-memory and the reloaded tokenizer;
# identical IDs mean the saved file reproduces the trained tokenizer.
test_text = "This is a test of the loaded tokenizer."
original_encoding, loaded_encoding = (
    tok.encode(test_text) for tok in (bpe_tokenizer, loaded_tokenizer)
)

print(f"Original tokens: {original_encoding.tokens}")
print(f"Loaded tokens: {loaded_encoding.tokens}")
print(f"Tokens match: {original_encoding.ids == loaded_encoding.ids}")

## 7. Summary and Key Insights

In this notebook, we've explored tokenization at scale by:

1. **Implementing different tokenization algorithms**:
   - BPE (Byte-Pair Encoding)
   - WordPiece
   - Unigram

2. **Comparing tokenization efficiency**:
   - Processing speed
   - Resulting sequence lengths

3. **Analyzing token distributions**:
   - Most common tokens
   - Frequency distributions

4. **Examining OOV handling**:
   - How different tokenizers handle unseen words
   - Subword decomposition strategies

5. **Saving and loading tokenizers**:
   - Persistence for consistent tokenization

In Part 2, we'll explore sequence packing and masking techniques for efficient training of causal language models.