# Week 1 Day 1: Text Normalization and Tokenization - Practical Implementation

This notebook contains all the practical code examples for Day 1, implementing text normalization and tokenization techniques covered in the theoretical guide.

## Setup and Imports

In [None]:
# Install required packages (uncomment on first run; %pip installs into the active kernel's environment)
# %pip install tiktoken transformers unidecode matplotlib seaborn numpy pandas

In [None]:
import unicodedata
import tiktoken
from transformers import AutoTokenizer
from unidecode import unidecode
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter
import pandas as pd

## 1. Text Normalization Examples

In [None]:
# Unicode normalization demo: derive both normal forms up front, then report them.
text = "Café – AÑO 2025 — coöperate"
text_nfc = unicodedata.normalize("NFC", text)    # canonical composition
text_nfkc = unicodedata.normalize("NFKC", text)  # compatibility composition

# Human-readable view of each variant
print(f"Original: {text}")
print(f"NFC: {text_nfc}")
print(f"NFKC: {text_nfkc}")

# UTF-8 byte view, where any difference between the forms becomes visible
print(f"\nOriginal bytes: {text.encode('utf-8')}")
print(f"NFC bytes: {text_nfc.encode('utf-8')}")
print(f"NFKC bytes: {text_nfkc.encode('utf-8')}")

In [None]:
# Complete normalization pipeline
def normalize_text(text, method='conservative'):
    """Return `text` normalized according to the selected strategy.

    Args:
        text: Input string.
        method: 'conservative' applies NFC normalization and lowercasing;
            'aggressive' applies NFKC normalization, lowercasing, and then
            `unidecode` to strip diacritics. Any other value skips both.

    Returns:
        The processed string with every run of whitespace collapsed to a
        single space and leading/trailing whitespace removed.
    """
    is_aggressive = method == 'aggressive'

    if is_aggressive or method == 'conservative':
        # The two strategies differ only in the Unicode form and the final
        # transliteration step; lowercasing is shared.
        unicode_form = "NFKC" if is_aggressive else "NFC"
        text = unicodedata.normalize(unicode_form, text).lower()
        if is_aggressive:
            text = unidecode(text)

    # Whitespace cleanup applies regardless of the strategy.
    return ' '.join(text.split())

# Run one messy sample (accents, dashes, mixed whitespace) through both strategies
test_text = "Café — AÑO — coöperate   with\tmultiple\nwhitespace"
print(f"Original: '{test_text}'")

conservative_result = normalize_text(test_text, 'conservative')
print(f"Conservative: '{conservative_result}'")

aggressive_result = normalize_text(test_text, 'aggressive')
print(f"Aggressive: '{aggressive_result}'")

## 2. Byte-Pair Encoding (BPE) with tiktoken

In [None]:
# GPT-2 BPE via tiktoken: encode a sample, round-trip it, then inspect the pieces
enc = tiktoken.get_encoding("gpt2")
text = "Café – internationalization in 2025!"

# Text -> token IDs -> text
ids = enc.encode(text)
print(f"Text: {text}")
print(f"Token IDs: {ids}")
print(f"Decoded: {enc.decode(ids)}")

# Decode every ID on its own to see the individual subword pieces
tokens = [enc.decode([token_id]) for token_id in ids]
print(f"Individual tokens: {tokens}")

In [None]:
# How does GPT-2 BPE split long, morphologically rich words?
test_words = [
    "internationalization",
    "unhappiness",
    "preprocessing",
    "tokenization",
    "unchartedness",
]

print("BPE Tokenization Analysis:")
print("-" * 50)
for word in test_words:
    ids = enc.encode(word)
    # One decode call per ID yields the surface form of each subword
    tokens = [enc.decode([token_id]) for token_id in ids]
    print(f"{word:20} → {tokens} ({len(tokens)} tokens)")

## 3. WordPiece with BERT Tokenizer

In [None]:
# BERT WordPiece tokenizer
bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
text = "Internationalization is complicated!"

# Tokenize text into WordPiece strings (no special tokens added here)
tokens = bert_tok.tokenize(text)
print(f"Text: {text}")
print(f"Tokens: {tokens}")

# Full encoding with special tokens.
# NumPy arrays are requested instead of PyTorch tensors: this notebook neither
# installs nor imports torch, and return_tensors="pt" fails when PyTorch is
# missing. NumPy is already a dependency of this notebook.
encoded = bert_tok(text, return_tensors="np")
print(f"\nInput IDs: {encoded['input_ids']}")
print(f"Attention Mask: {encoded['attention_mask']}")

In [None]:
# Side-by-side view: the same words through BERT WordPiece and GPT-2 BPE
comparison_words = [
    "internationalization",
    "unhappiness",
    "preprocessing",
    "tokenization",
]

print("Tokenization Comparison: WordPiece vs BPE")
print("=" * 60)

for word in comparison_words:
    # BERT (WordPiece) returns subword strings directly
    wp_tokens = bert_tok.tokenize(word)

    # GPT-2 (BPE) returns IDs, so decode each one back to its surface form
    bpe_ids = enc.encode(word)
    bpe_tokens = [enc.decode([token_id]) for token_id in bpe_ids]

    print(f"\n{word}:")
    print(f"  WordPiece: {wp_tokens} ({len(wp_tokens)} tokens)")
    print(f"  BPE:       {bpe_tokens} ({len(bpe_tokens)} tokens)")

## 4. Tokenization Analysis and Visualization

In [None]:
# Compare total and unique token counts across four tokenization strategies
sample_text = """
The quick brown fox jumps over the lazy dog. 
Internationalization and localization are important for global applications.
Machine learning models require careful preprocessing of textual data.
Tokenization strategies significantly impact model performance and efficiency.
"""

# One token list per strategy
word_tokens = sample_text.split()
char_tokens = list(sample_text.replace(' ', '▁'))  # ▁ makes spaces visible
bpe_tokens = [enc.decode([token_id]) for token_id in enc.encode(sample_text)]
wp_tokens = bert_tok.tokenize(sample_text)

# Derive both count columns from a single label -> tokens mapping
tokens_by_method = {
    'Word-level': word_tokens,
    'Character-level': char_tokens,
    'BPE': bpe_tokens,
    'WordPiece': wp_tokens,
}
comparison_data = {
    'Method': list(tokens_by_method),
    'Token Count': [len(toks) for toks in tokens_by_method.values()],
    'Unique Tokens': [len(set(toks)) for toks in tokens_by_method.values()],
}

df = pd.DataFrame(comparison_data)
print("Tokenization Strategy Comparison:")
print(df.to_string(index=False))

In [None]:
# Two bar charts side by side: total tokens and unique tokens per method
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

bar_colors = ['skyblue', 'lightcoral', 'lightgreen', 'gold']
panels = [
    (ax1, 'Token Count', 'Total Token Count by Method', 'Number of Tokens'),
    (ax2, 'Unique Tokens', 'Unique Token Count by Method', 'Number of Unique Tokens'),
]

# Both panels share the same layout; only the column and labels differ
for ax, column, title, ylabel in panels:
    ax.bar(df['Method'], df[column], color=bar_colors)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Practical Exercises

In [None]:
# Exercise 1: Analyze OOV rates
def calculate_oov_rate(text, tokenizer_type='bpe'):
    """Return the fraction of out-of-vocabulary tokens for a tokenizer type.

    Args:
        text: Input string.
        tokenizer_type: 'bpe' always yields 0.0; 'wordpiece' reports the share
            of '[UNK]' entries among the tokens produced by `bert_tok`; 'word'
            reports the share of lowercased whitespace-split words that are
            missing from a small fixed word list. Any other value yields 0.0.

    Returns:
        A float in [0, 1]; 0.0 when there are no tokens to count.
    """
    words = text.lower().split()

    if tokenizer_type == 'wordpiece':
        # WordPiece marks anything it cannot represent with [UNK]
        pieces = bert_tok.tokenize(text)
        if not pieces:
            return 0.0
        return pieces.count('[UNK]') / len(pieces)

    if tokenizer_type == 'word':
        # Tiny closed vocabulary standing in for a word-level tokenizer
        known_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        if not words:
            return 0.0
        unknown = [w for w in words if w not in known_words]
        return len(unknown) / len(words)

    # 'bpe' (byte-level, so nothing is out of vocabulary) and unrecognised types
    return 0.0

# Three registers of English, from everyday to deliberately obscure
texts = {
    'Common': "The quick brown fox jumps over the lazy dog.",
    'Technical': "Internationalization requires sophisticated preprocessing algorithms.",
    'Rare words': "The sesquipedalian lexicographer's perspicacious observations.",
}
tokenizer_kinds = ('word', 'wordpiece', 'bpe')

print("OOV Rate Analysis:")
print("-" * 40)
for text_type, text in texts.items():
    print(f"\n{text_type} text: '{text}'")
    # Report every tokenizer type for the current sample
    for tok_type in tokenizer_kinds:
        oov_rate = calculate_oov_rate(text, tok_type)
        print(f"  {tok_type:10}: {oov_rate:.2%} OOV rate")

In [None]:
# Exercise 2: Token length analysis
def analyze_token_lengths(text, method='bpe'):
    """Tokenize `text` and measure the character length of every token.

    Args:
        text: Input string.
        method: 'bpe' (via `enc`), 'wordpiece' (via `bert_tok`), 'word'
            (whitespace split); any other value falls back to one token per
            character.

    Returns:
        A `(lengths, tokens)` tuple where `lengths[i] == len(tokens[i])`.
    """
    if method == 'word':
        tokens = text.split()
    elif method == 'wordpiece':
        tokens = bert_tok.tokenize(text)
    elif method == 'bpe':
        tokens = [enc.decode([token_id]) for token_id in enc.encode(text)]
    else:
        # Character-level fallback
        tokens = list(text)

    return list(map(len, tokens)), tokens

# One histogram of token lengths per tokenization method, on a 2x2 grid
test_text = "Internationalization and preprocessing are fundamental components of natural language processing pipelines."

methods = ['word', 'char', 'bpe', 'wordpiece']
colors = ['skyblue', 'lightcoral', 'lightgreen', 'gold']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# axes.flat walks the grid row by row, matching the order of `methods`
for ax, method, color in zip(axes.flat, methods, colors):
    if method == 'char':
        # Every non-space character counts as a token of length 1
        lengths = [1] * len(test_text.replace(' ', ''))
    else:
        lengths, _ = analyze_token_lengths(test_text, method)

    # Integer-aligned bins: one bin per possible token length
    ax.hist(lengths, bins=range(1, max(lengths)+2), alpha=0.7, color=color, edgecolor='black')
    ax.set_title(f'{method.title()} Token Lengths')
    ax.set_xlabel('Token Length (characters)')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Custom Tokenization Implementation

In [None]:
# Simple BPE implementation for educational purposes
class SimpleBPE:
    """Minimal byte-pair-encoding trainer and encoder for teaching purposes.

    Every word is split into single characters followed by the end-of-word
    marker '</w>'. Training repeatedly fuses the most frequent adjacent symbol
    pair; encoding replays the learned merges in order.

    Attributes:
        vocab_size: Target size used to derive how many merges are learned.
        vocab: Set of symbols present in the final segmentation of the
            training corpus (empty until `train` is called).
        merges: Learned symbol pairs, in the order they were learned.
    """

    END_OF_WORD = '</w>'

    def __init__(self, vocab_size=1000):
        self.vocab_size = vocab_size
        self.vocab = set()
        self.merges = []

    @staticmethod
    def _merge_pair(symbols, pair):
        """Return `symbols` as a list with each left-to-right occurrence of `pair` fused."""
        fused = pair[0] + pair[1]
        merged = []
        i = 0
        n = len(symbols)
        while i < n:
            if i + 1 < n and symbols[i] == pair[0] and symbols[i + 1] == pair[1]:
                merged.append(fused)
                i += 2  # both halves of the pair are consumed
            else:
                merged.append(symbols[i])
                i += 1
        return merged

    def get_pairs(self, word):
        """Get all adjacent pairs in a word.

        Args:
            word: A sequence of symbols (list, tuple, or string).

        Returns:
            The set of distinct adjacent `(left, right)` pairs; an empty set
            when `word` has fewer than two symbols.
        """
        return set(zip(word, word[1:]))

    def train(self, corpus):
        """Train BPE on a corpus.

        Learns up to `vocab_size - (number of distinct characters in corpus)`
        merges, stopping early once no adjacent pair remains. Any merges from
        a previous call are discarded, so retraining starts from scratch.

        Args:
            corpus: Iterable of words (strings).
        """
        corpus = list(corpus)
        self.merges = []

        # Words are kept as tuples of symbols rather than space-joined strings
        # so that a merge can only ever fuse two whole symbols. Substring
        # replacement on a joined string also matches inside longer symbols,
        # e.g. "ab c" contains "b c".
        word_freqs = Counter(tuple(word) + (self.END_OF_WORD,) for word in corpus)

        num_merges = self.vocab_size - len(set(''.join(corpus)))
        for _ in range(max(num_merges, 0)):
            # Count every occurrence of each adjacent pair, weighted by word
            # frequency. Iterating sequences (not sets) keeps insertion order
            # deterministic across runs.
            pair_counts = Counter()
            for symbols, freq in word_freqs.items():
                for pair in zip(symbols, symbols[1:]):
                    pair_counts[pair] += freq

            if not pair_counts:
                break

            # max() returns the first maximal item, so ties resolve to the pair
            # seen first in corpus order.
            best_pair = max(pair_counts, key=pair_counts.get)
            self.merges.append(best_pair)

            # Apply the merge to every word
            merged_freqs = Counter()
            for symbols, freq in word_freqs.items():
                merged_freqs[tuple(self._merge_pair(symbols, best_pair))] += freq
            word_freqs = merged_freqs

        # Build final vocabulary from the symbols that survive in the corpus
        self.vocab = {symbol for symbols in word_freqs for symbol in symbols}

    def encode(self, word):
        """Encode a word using learned BPE.

        Args:
            word: The string to segment.

        Returns:
            A list of symbols obtained by splitting `word` into characters,
            appending '</w>', and applying the learned merges in order.
        """
        symbols = list(word) + [self.END_OF_WORD]
        for pair in self.merges:
            symbols = self._merge_pair(symbols, pair)
        return symbols

# Train the toy BPE on a five-word corpus and inspect what it learned
corpus = ['low', 'lower', 'lowest', 'newer', 'wider']
bpe = SimpleBPE(vocab_size=20)
bpe.train(corpus)

print("Simple BPE Training Results:")
print(f"Learned merges: {bpe.merges[:5]}...")  # only the first five merges
print(f"Vocabulary size: {len(bpe.vocab)}")

# Segment three training words and one unseen word ('new')
test_words = ['low', 'lower', 'lowest', 'new']
for word in test_words:
    encoded = bpe.encode(word)
    print(f"{word:8} → {encoded}")

## Summary and Key Takeaways

This notebook demonstrated:

1. **Text Normalization**: Unicode normalization, case handling, and whitespace processing
2. **BPE Tokenization**: Using tiktoken for GPT-style byte-level BPE
3. **WordPiece Tokenization**: Using transformers library for BERT-style tokenization
4. **Comparative Analysis**: Total token counts, unique-token counts, and OOV rates
5. **Visualization**: Understanding tokenization patterns through plots
6. **Custom Implementation**: Simple BPE algorithm for educational purposes

**Next Steps**: Practice with different tokenizers, experiment with vocabulary sizes, and explore domain-specific tokenization challenges.