# Tokenizer Efficiency Analysis

**TokAlign ACL Publication-Grade Evaluation Framework - Notebook 1/3**

This notebook performs tokenizer-level analysis comparing baseline and adapted tokenizers.
No model inference is required - pure tokenizer metrics only.

## Metrics (per CVA study + STRR paper)

| Metric | Formula | Target |
|--------|---------|--------|
| **Fertility** | tokens / words | Spanish approaching English ~1.4 |
| **Compression Ratio** | bytes / tokens | Higher = better |
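| **PCW** (Proportion of Continued Words) | words split into 2+ subwords / words (reported as a fraction in [0, 1]) | Lower = better |
| **UNK Rate** | unknown tokens / tokens (reported as a fraction in [0, 1]) | Should approach 0 |
| **STRR** (Single Token Retention Rate) | words preserved as single tokens / words (reported as a fraction in [0, 1]) | Higher = better |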

## References
- Trans-Tokenization (Remy et al., 2024)
- CVA Study (Yamaguchi et al., EMNLP 2024) - 271.5% inference speedup
- STRR Metric (Nayeem et al., 2025) - Single Token Retention Rate


In [None]:
# Cell 1: Configuration
# =====================
# All tunable settings for the notebook live in this cell.

# --- Tokenizers under comparison ---
BASELINE_MODEL = "EleutherAI/pythia-1b"
# Placeholder value: replace with the path of your adapted model.
ADAPTED_MODEL = "ADAPTED_MODEL_PATH"

# --- Sample counts for the two Spanish corpora ---
WIKIPEDIA_SAMPLES = 10000
OSCAR_SAMPLES = 10000

# --- Output locations (all CSVs are written under OUTPUT_DIR) ---
OUTPUT_DIR = "results"
BASELINE_OUTPUT = OUTPUT_DIR + "/tokenizer_analysis_baseline.csv"
ADAPTED_OUTPUT = OUTPUT_DIR + "/tokenizer_analysis_adapted.csv"
COMPARISON_OUTPUT = OUTPUT_DIR + "/tokenizer_comparison.csv"

# --- Statistical analysis settings ---
BOOTSTRAP_ITERATIONS = 1000   # resamples per bootstrap confidence interval
CONFIDENCE_LEVEL = 0.95       # two-sided confidence level for the intervals
RANDOM_SEED = 42              # seed for NumPy and dataset shuffling


In [None]:
# Cell 2: Install Dependencies
!pip install transformers datasets pandas numpy scipy tqdm -q


In [None]:
# Cell 3: Imports
import os
import numpy as np
import pandas as pd
from scipy import stats
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm.auto import tqdm
from typing import Dict, List, Tuple
from dataclasses import dataclass
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility.
# This seeds NumPy's global RNG, which bootstrap_ci draws from via
# np.random.choice, so confidence intervals repeat across runs.
np.random.seed(RANDOM_SEED)

# Create output directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Imports completed successfully.")


In [None]:
# Cell 4: Tokenizer Metrics Functions
# ====================================

@dataclass
class TokenizerMetrics:
    """Raw tokenization counts for one text sample plus derived ratio metrics.

    Every derived metric divides by ``max(denominator, 1)``, so a sample with
    zero words or zero tokens never raises ``ZeroDivisionError``.
    """
    text: str
    num_words: int
    num_tokens: int
    num_bytes: int
    num_unk_tokens: int
    num_continued_words: int  # words that needed 2+ subword tokens
    num_single_token_words: int  # words encoded as exactly one token

    @staticmethod
    def _ratio(numerator: int, denominator: int) -> float:
        """Divide, treating a denominator below 1 as 1."""
        return numerator / max(denominator, 1)

    @property
    def fertility(self) -> float:
        """Average number of tokens per word (lower means more efficient)."""
        return self._ratio(self.num_tokens, self.num_words)

    @property
    def compression_ratio(self) -> float:
        """Average number of UTF-8 bytes per token (higher means more compression)."""
        return self._ratio(self.num_bytes, self.num_tokens)

    @property
    def pcw(self) -> float:
        """Proportion of Continued Words: share of words split into 2+ subwords."""
        return self._ratio(self.num_continued_words, self.num_words)

    @property
    def unk_rate(self) -> float:
        """Share of tokens that are the unknown token."""
        return self._ratio(self.num_unk_tokens, self.num_tokens)

    @property
    def strr(self) -> float:
        """Single Token Retention Rate: share of words kept as one token."""
        return self._ratio(self.num_single_token_words, self.num_words)


def analyze_text(text: str, tokenizer) -> TokenizerMetrics:
    """
    Compute all tokenizer metrics for a single text.

    Metrics per CVA study (Yamaguchi et al., EMNLP 2024) and STRR paper (Nayeem et al., 2025).

    Args:
        text: Raw text sample. Words are its whitespace-separated pieces.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            and an ``unk_token_id`` attribute (which may be ``None``).

    Returns:
        A ``TokenizerMetrics`` holding the raw counts for this sample; the
        ratio metrics are derived from those counts by the dataclass.
    """
    # Basic counts
    words = text.split()
    num_bytes = len(text.encode('utf-8'))

    # Tokenize the full text once for the token-level counts.
    token_ids = tokenizer.encode(text, add_special_tokens=False)

    # Count UNK tokens; a tokenizer without an UNK token contributes zero.
    unk_token_id = tokenizer.unk_token_id
    if unk_token_id is None:
        num_unk_tokens = 0
    else:
        num_unk_tokens = sum(1 for tid in token_ids if tid == unk_token_id)

    # Word-level analysis. Natural text repeats words heavily, so the piece
    # count of each distinct word is cached instead of re-encoding every
    # occurrence; the resulting counts are the same as encoding each one.
    # NOTE(review): each word is encoded in isolation, without the whitespace
    # that preceded it in context. For tokenizers that fold a leading space
    # into the token this may differ from the in-context segmentation -
    # confirm this is the intended definition of PCW/STRR.
    pieces_per_word = {}
    num_continued_words = 0
    num_single_token_words = 0

    for word in words:
        n_pieces = pieces_per_word.get(word)
        if n_pieces is None:
            n_pieces = len(tokenizer.encode(word, add_special_tokens=False))
            pieces_per_word[word] = n_pieces
        if n_pieces == 1:
            num_single_token_words += 1
        elif n_pieces > 1:
            num_continued_words += 1
        # A word that encodes to zero tokens is counted in neither bucket.

    return TokenizerMetrics(
        text=text,
        num_words=len(words),
        num_tokens=len(token_ids),
        num_bytes=num_bytes,
        num_unk_tokens=num_unk_tokens,
        num_continued_words=num_continued_words,
        num_single_token_words=num_single_token_words
    )


def analyze_corpus(texts: List[str], tokenizer, desc: str = "Analyzing") -> pd.DataFrame:
    """Analyze a corpus of texts and return a per-sample metrics DataFrame.

    Args:
        texts: Text samples; empty or whitespace-only entries are skipped.
        tokenizer: Tokenizer passed through to ``analyze_text``.
        desc: Label shown on the progress bar.

    Returns:
        One row per analyzed sample with the raw counts and the derived
        metrics. The column set is fixed, so an empty or all-blank corpus
        still yields a frame with every expected column (zero rows) rather
        than a column-less frame.
    """
    # Column order of the output; every name except 'text' is read straight
    # off the TokenizerMetrics instance (fields and properties alike).
    columns = [
        'text',
        'num_words',
        'num_tokens',
        'num_bytes',
        'num_unk_tokens',
        'num_continued_words',
        'num_single_token_words',
        'fertility',
        'compression_ratio',
        'pcw',
        'unk_rate',
        'strr',
    ]
    rows = []

    for text in tqdm(texts, desc=desc):
        if not text or not text.strip():
            continue
        metrics = analyze_text(text, tokenizer)
        row = {name: getattr(metrics, name) for name in columns}
        row['text'] = metrics.text[:200]  # Truncate for storage
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)


print("Tokenizer metrics functions defined.")


In [None]:
# Cell 5: Statistical Analysis Functions
# ======================================

def bootstrap_ci(data: np.ndarray, n_bootstrap: int = 1000, confidence: float = 0.95) -> Tuple[float, float, float]:
    """
    Estimate a percentile-bootstrap confidence interval for the mean.

    Draws ``n_bootstrap`` resamples (with replacement, same size as ``data``)
    from NumPy's global RNG and takes the matching percentiles of the
    resampled means.

    Returns: (mean of data, lower bound, upper bound)
    """
    sample_size = len(data)

    # One resampled mean per bootstrap iteration.
    resampled_means = [
        np.mean(np.random.choice(data, size=sample_size, replace=True))
        for _ in range(n_bootstrap)
    ]

    # Two-sided interval: cut alpha/2 off each tail.
    alpha = 1 - confidence
    lower, upper = np.percentile(
        resampled_means,
        [100 * alpha / 2, 100 * (1 - alpha / 2)],
    )

    return np.mean(data), lower, upper


def cohens_d(group1: np.ndarray, group2: np.ndarray) -> float:
    """
    Compute Cohen's d effect size using the pooled standard deviation.

    Interpretation:
    - |d| < 0.2: negligible
    - 0.2 <= |d| < 0.5: small
    - 0.5 <= |d| < 0.8: medium
    - |d| >= 0.8: large

    Args:
        group1: Observations of the first group.
        group2: Observations of the second group.

    Returns:
        ``(mean(group1) - mean(group2)) / pooled_std`` as a float, or ``0.0``
        when the effect size is undefined: either group has fewer than two
        observations, or the pooled standard deviation is zero or NaN.
    """
    n1, n2 = len(group1), len(group2)

    # The sample variance (ddof=1) is undefined below two observations;
    # return the neutral value directly instead of relying on NaN arithmetic.
    if n1 < 2 or n2 < 2:
        return 0.0

    var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)

    # Pooled standard deviation
    pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))

    # `not (x > 0)` also catches NaN, which would otherwise propagate.
    if not pooled_std > 0:
        return 0.0

    return float((np.mean(group1) - np.mean(group2)) / pooled_std)


def compare_tokenizers(baseline_df: pd.DataFrame, adapted_df: pd.DataFrame, 
                       metric: str) -> Dict:
    """
    Statistical comparison of a metric between two tokenizers.

    Args:
        baseline_df: Per-sample metrics of the baseline tokenizer.
        adapted_df: Per-sample metrics of the adapted tokenizer.
        metric: Name of the column to compare in both frames.

    Returns dict with:
    - metric
    - baseline_mean, baseline_ci_lower, baseline_ci_upper (bootstrap CI)
    - adapted_mean, adapted_ci_lower, adapted_ci_upper (bootstrap CI)
    - t_statistic, p_value (paired t-test)
    - cohens_d
    - percent_change (adapted relative to baseline; 0 if the baseline mean is 0)
    - significant (p_value below 1 - CONFIDENCE_LEVEL)
    """
    baseline_vals = baseline_df[metric].values
    adapted_vals = adapted_df[metric].values

    # The paired test needs equal-length inputs, so keep the common prefix.
    # Pairing is by row position: both frames must list the samples in the
    # same order for the comparison to be meaningful.
    min_len = min(len(baseline_vals), len(adapted_vals))
    baseline_vals = baseline_vals[:min_len]
    adapted_vals = adapted_vals[:min_len]

    # Bootstrap CIs
    b_mean, b_lower, b_upper = bootstrap_ci(baseline_vals, BOOTSTRAP_ITERATIONS, CONFIDENCE_LEVEL)
    a_mean, a_lower, a_upper = bootstrap_ci(adapted_vals, BOOTSTRAP_ITERATIONS, CONFIDENCE_LEVEL)

    # Paired t-test
    t_stat, p_value = stats.ttest_rel(baseline_vals, adapted_vals)

    # Effect size
    d = cohens_d(baseline_vals, adapted_vals)

    # Percent change
    pct_change = ((a_mean - b_mean) / b_mean * 100) if b_mean != 0 else 0

    # Use the same confidence level for the significance flag as for the
    # intervals, so changing CONFIDENCE_LEVEL keeps the two consistent.
    alpha = 1 - CONFIDENCE_LEVEL

    return {
        'metric': metric,
        'baseline_mean': b_mean,
        'baseline_ci_lower': b_lower,
        'baseline_ci_upper': b_upper,
        'adapted_mean': a_mean,
        'adapted_ci_lower': a_lower,
        'adapted_ci_upper': a_upper,
        't_statistic': t_stat,
        'p_value': p_value,
        'cohens_d': d,
        'percent_change': pct_change,
        # A NaN p-value compares False, i.e. "not significant".
        'significant': bool(p_value < alpha)
    }


def interpret_effect_size(d: float) -> str:
    """Map Cohen's d to a label: negligible, small, medium or large."""
    magnitude = abs(d)
    # Exclusive upper bounds in ascending order; anything at or above the
    # last bound falls through to "large".
    bands = ((0.2, "negligible"), (0.5, "small"), (0.8, "medium"))
    for upper_bound, label in bands:
        if magnitude < upper_bound:
            return label
    return "large"


print("Statistical analysis functions defined.")


In [None]:
# Cell 6: Load Tokenizers
# =======================

print("Loading tokenizers...")

# Load baseline tokenizer (required: a failure here should stop the notebook)
baseline_tokenizer = AutoTokenizer.from_pretrained(BASELINE_MODEL)
print(f"Baseline tokenizer: {BASELINE_MODEL}")
print(f"  Vocab size: {baseline_tokenizer.vocab_size}")

# Load adapted tokenizer (optional: fall back to baseline-only analysis).
# Only the load itself sits in the try body, and the reason for a failure
# is reported, so a wrong path, a network error and a corrupt tokenizer
# file can be told apart.
try:
    adapted_tokenizer = AutoTokenizer.from_pretrained(ADAPTED_MODEL)
except Exception as e:
    print(f"\nAdapted model not found: {ADAPTED_MODEL}")
    print(f"  Reason: {type(e).__name__}: {e}")
    print("  Running baseline-only analysis.")
    adapted_tokenizer = None
    HAS_ADAPTED = False
else:
    print(f"\nAdapted tokenizer: {ADAPTED_MODEL}")
    print(f"  Vocab size: {adapted_tokenizer.vocab_size}")
    HAS_ADAPTED = True


In [None]:
# Cell 7: Load Datasets
# =====================

print("Loading datasets...")

# Load Spanish Wikipedia
print("\n1. Loading Spanish Wikipedia...")
wiki_dataset = load_dataset(
    "wikimedia/wikipedia", 
    "20231101.es",
    split="train",
    trust_remote_code=True
)
# Clamp the request to the dataset size so select() cannot index past the end.
wiki_texts = [
    x['text']
    for x in wiki_dataset.shuffle(seed=RANDOM_SEED).select(
        range(min(WIKIPEDIA_SAMPLES, len(wiki_dataset)))
    )
]
print(f"   Loaded {len(wiki_texts)} Wikipedia samples")

# Load Spanish OSCAR
print("\n2. Loading Spanish OSCAR...")
oscar_dataset = load_dataset(
    "oscar-corpus/OSCAR-2301",
    "es",
    split="train",
    trust_remote_code=True,
    streaming=True  # Use streaming for large dataset
)

# Collect OSCAR samples via streaming. Stop once OSCAR_SAMPLES texts have
# been *kept*: counting streamed records instead would leave the sample
# short by however many records the length filter rejects.
oscar_texts = []
for sample in oscar_dataset:
    if len(oscar_texts) >= OSCAR_SAMPLES:
        break
    text = sample.get('text', '')
    if text and len(text) > 50:  # Filter short texts
        oscar_texts.append(text)

print(f"   Loaded {len(oscar_texts)} OSCAR samples")

# Combine datasets
all_spanish_texts = wiki_texts + oscar_texts
print(f"\nTotal Spanish corpus: {len(all_spanish_texts)} samples")

# Also load English for comparison (fertility baseline)
print("\n3. Loading English Wikipedia for comparison...")
ENGLISH_WIKIPEDIA_SAMPLES = 5000  # size of the English reference sample
wiki_en = load_dataset(
    "wikimedia/wikipedia",
    "20231101.en",
    split="train",
    trust_remote_code=True
)
english_texts = [
    x['text']
    for x in wiki_en.shuffle(seed=RANDOM_SEED).select(
        range(min(ENGLISH_WIKIPEDIA_SAMPLES, len(wiki_en)))
    )
]
print(f"   Loaded {len(english_texts)} English Wikipedia samples")


In [None]:
# Cell 8: Analyze Baseline Tokenizer
# ===================================

# Section banner for the baseline run.
print("=" * 70)
print("ANALYZING BASELINE TOKENIZER")
print("=" * 70)

# Spanish analysis: per-sample metrics over the combined Spanish corpus
# (all_spanish_texts = Wikipedia texts + OSCAR texts).
print("\n--- Spanish Corpus ---")
baseline_spanish_df = analyze_corpus(all_spanish_texts, baseline_tokenizer, "Baseline (Spanish)")

# English analysis (for fertility comparison)
print("\n--- English Corpus (Reference) ---")
baseline_english_df = analyze_corpus(english_texts, baseline_tokenizer, "Baseline (English)")

# Save baseline results. Only the Spanish frame is written to disk; the
# English frame is kept in memory for the later cells.
baseline_spanish_df.to_csv(BASELINE_OUTPUT, index=False)
print(f"\nBaseline Spanish results saved to: {BASELINE_OUTPUT}")


In [None]:
# Cell 9: Analyze Adapted Tokenizer (if available)
# ================================================

# Run the same analysis with the adapted tokenizer, but only if it loaded.
if HAS_ADAPTED:
    print("=" * 70)
    print("ANALYZING ADAPTED TOKENIZER")
    print("=" * 70)
    
    # Spanish analysis: same texts, in the same order, as the baseline run,
    # so rows of the two frames line up for the paired comparison.
    print("\n--- Spanish Corpus ---")
    adapted_spanish_df = analyze_corpus(all_spanish_texts, adapted_tokenizer, "Adapted (Spanish)")
    
    # English analysis
    print("\n--- English Corpus ---")
    adapted_english_df = analyze_corpus(english_texts, adapted_tokenizer, "Adapted (English)")
    
    # Save adapted results (Spanish frame only)
    adapted_spanish_df.to_csv(ADAPTED_OUTPUT, index=False)
    print(f"\nAdapted Spanish results saved to: {ADAPTED_OUTPUT}")
else:
    # Define both names anyway so later cells can reference them.
    adapted_spanish_df = None
    adapted_english_df = None
    print("Skipping adapted tokenizer analysis (model not available).")


In [None]:
# Cell 10: Summary Statistics
# ============================

def print_summary_stats(df: pd.DataFrame, name: str):
    """Print summary statistics for a metrics DataFrame.

    For each of the five ratio metrics this prints mean, standard deviation,
    median and the 5th/95th percentiles, followed by a bootstrap confidence
    interval for the mean at ``CONFIDENCE_LEVEL``.

    Args:
        df: Per-sample metrics frame containing the metric columns.
        name: Title printed above the table.
    """
    print(f"\n{'='*60}")
    print(f"{name}")
    print(f"{'='*60}")
    
    metrics = ['fertility', 'compression_ratio', 'pcw', 'unk_rate', 'strr']
    
    print(f"\n{'Metric':<20} {'Mean':>10} {'Std':>10} {'Median':>10} {'P5':>10} {'P95':>10}")
    print("-" * 70)

    # Derive the interval label from the configured level so the printed
    # label cannot drift from the interval that is actually computed.
    ci_label = f"  {CONFIDENCE_LEVEL * 100:g}% CI"
    
    for metric in metrics:
        vals = df[metric].values
        # bootstrap_ci already returns the sample mean, so reuse it.
        mean_val, ci_lower, ci_upper = bootstrap_ci(vals, BOOTSTRAP_ITERATIONS, CONFIDENCE_LEVEL)
        print(f"{metric:<20} {mean_val:>10.4f} {np.std(vals):>10.4f} "
              f"{np.median(vals):>10.4f} {np.percentile(vals, 5):>10.4f} {np.percentile(vals, 95):>10.4f}")
        print(f"{ci_label:<20} [{ci_lower:>10.4f}, {ci_upper:>10.4f}]")


# Baseline summaries are always printed; the adapted ones are appended only
# when the adapted tokenizer was loaded.
_summary_jobs = [
    (baseline_spanish_df, "BASELINE TOKENIZER - SPANISH"),
    (baseline_english_df, "BASELINE TOKENIZER - ENGLISH (Reference)"),
]
if HAS_ADAPTED:
    _summary_jobs += [
        (adapted_spanish_df, "ADAPTED TOKENIZER - SPANISH"),
        (adapted_english_df, "ADAPTED TOKENIZER - ENGLISH"),
    ]

for _frame, _title in _summary_jobs:
    print_summary_stats(_frame, _title)


In [None]:
# Cell 11: Statistical Comparison (Baseline vs Adapted)
# =====================================================

# Without an adapted tokenizer there is nothing to compare against.
if not HAS_ADAPTED:
    print("Comparison skipped (adapted model not available).")
else:
    print("=" * 70)
    print("STATISTICAL COMPARISON: BASELINE vs ADAPTED (SPANISH)")
    print("=" * 70)

    metrics_to_compare = ['fertility', 'compression_ratio', 'pcw', 'unk_rate', 'strr']
    comparison_results = []

    for metric in metrics_to_compare:
        result = compare_tokenizers(baseline_spanish_df, adapted_spanish_df, metric)
        comparison_results.append(result)

        effect_interpretation = interpret_effect_size(result['cohens_d'])

        # Star notation for the p-value, strictest threshold first.
        if result['p_value'] < 0.001:
            sig_marker = "***"
        elif result['p_value'] < 0.01:
            sig_marker = "**"
        elif result['p_value'] < 0.05:
            sig_marker = "*"
        else:
            sig_marker = ""

        print(f"\n--- {metric.upper()} ---")
        print(f"  Baseline: {result['baseline_mean']:.4f} [{result['baseline_ci_lower']:.4f}, {result['baseline_ci_upper']:.4f}]")
        print(f"  Adapted:  {result['adapted_mean']:.4f} [{result['adapted_ci_lower']:.4f}, {result['adapted_ci_upper']:.4f}]")
        print(f"  Change:   {result['percent_change']:+.2f}%")
        print(f"  t-stat:   {result['t_statistic']:.3f}, p-value: {result['p_value']:.2e} {sig_marker}")
        print(f"  Cohen's d: {result['cohens_d']:.3f} ({effect_interpretation})")

    # Persist one row per metric.
    comparison_df = pd.DataFrame(comparison_results)
    comparison_df.to_csv(COMPARISON_OUTPUT, index=False)
    print(f"\nComparison results saved to: {COMPARISON_OUTPUT}")


In [None]:
# Cell 12: Fertility Gap Analysis
# ================================

print("=" * 70)
print("FERTILITY GAP ANALYSIS")
print("=" * 70)
print("\nGoal: Spanish fertility approaching English (~1.4 tokens/word)")

# Calculate fertility gap for baseline.
# Each value is the mean of the per-sample fertility column; the gap is
# Spanish minus English, and the percentage is relative to English.
baseline_es_fertility = baseline_spanish_df['fertility'].mean()
baseline_en_fertility = baseline_english_df['fertility'].mean()
baseline_gap = baseline_es_fertility - baseline_en_fertility
baseline_gap_pct = (baseline_gap / baseline_en_fertility) * 100

print(f"\n--- BASELINE ---")
print(f"  English fertility:  {baseline_en_fertility:.4f} tokens/word")
print(f"  Spanish fertility:  {baseline_es_fertility:.4f} tokens/word")
print(f"  Fertility gap:      {baseline_gap:+.4f} ({baseline_gap_pct:+.1f}% overhead)")

if HAS_ADAPTED:
    # Same gap computation for the adapted tokenizer.
    adapted_es_fertility = adapted_spanish_df['fertility'].mean()
    adapted_en_fertility = adapted_english_df['fertility'].mean()
    adapted_gap = adapted_es_fertility - adapted_en_fertility
    adapted_gap_pct = (adapted_gap / adapted_en_fertility) * 100
    
    print(f"\n--- ADAPTED ---")
    print(f"  English fertility:  {adapted_en_fertility:.4f} tokens/word")
    print(f"  Spanish fertility:  {adapted_es_fertility:.4f} tokens/word")
    print(f"  Fertility gap:      {adapted_gap:+.4f} ({adapted_gap_pct:+.1f}% overhead)")
    
    # Improvement calculation: how much of the baseline gap the adapted
    # tokenizer closes (reported as 0 when the baseline gap is exactly zero).
    gap_reduction = baseline_gap - adapted_gap
    gap_reduction_pct = (gap_reduction / baseline_gap) * 100 if baseline_gap != 0 else 0
    
    print(f"\n--- IMPROVEMENT ---")
    print(f"  Fertility gap reduction: {gap_reduction:.4f} ({gap_reduction_pct:.1f}% reduction)")
    # The speedup estimate is the ratio of Spanish fertilities (baseline /
    # adapted) minus one, i.e. it is derived from token counts only.
    print(f"  Estimated inference speedup: {(baseline_es_fertility / adapted_es_fertility - 1) * 100:.1f}%")


In [None]:
# Cell 13: Generate LaTeX Tables
# ===============================

def generate_latex_table():
    """Print a LaTeX table of the mean Spanish-corpus metrics.

    Reads the notebook globals ``baseline_spanish_df``, ``HAS_ADAPTED`` and
    (when available) ``adapted_spanish_df``. Emits a baseline row and, if the
    adapted tokenizer was analyzed, an adapted row plus a relative-change row.
    """
    metric_columns = ['fertility', 'compression_ratio', 'pcw', 'unk_rate', 'strr']

    def column_means(frame):
        # Mean of every metric column, in table order.
        return [frame[column].mean() for column in metric_columns]

    def format_row(label, means):
        fert, comp, pcw, unk, strr = means
        return f"{label} & {fert:.3f} & {comp:.3f} & {pcw:.3f} & {unk:.4f} & {strr:.3f} \\\\"

    def relative_change(new, old):
        # Percent change relative to the baseline; 0 when the baseline is 0.
        return ((new - old) / old) * 100 if old != 0 else 0

    preamble = (
        "\n" + "=" * 70,
        "LATEX TABLE OUTPUT",
        "=" * 70,
        # Table 1: Summary metrics
        "\n% Table 1: Tokenizer Efficiency Metrics",
        "\\begin{table}[h]",
        "\\centering",
        "\\caption{Tokenizer Efficiency Analysis on Spanish Corpus}",
        "\\label{tab:tokenizer-efficiency}",
        "\\begin{tabular}{lccccc}",
        "\\toprule",
        "Model & Fertility$\\downarrow$ & Compression$\\uparrow$ & PCW$\\downarrow$ & UNK$\\downarrow$ & STRR$\\uparrow$ \\\\",
        "\\midrule",
    )
    for line in preamble:
        print(line)

    # Baseline row
    baseline_means = column_means(baseline_spanish_df)
    print(format_row("Baseline", baseline_means))

    if HAS_ADAPTED:
        adapted_means = column_means(adapted_spanish_df)
        print(format_row("Adapted", adapted_means))

        # Delta row: no relative change is reported for UNK ("--").
        print("\\midrule")
        b_fert, b_comp, b_pcw, _, b_strr = baseline_means
        a_fert, a_comp, a_pcw, _, a_strr = adapted_means
        delta_fert = relative_change(a_fert, b_fert)
        delta_comp = relative_change(a_comp, b_comp)
        delta_pcw = relative_change(a_pcw, b_pcw)
        delta_strr = relative_change(a_strr, b_strr)
        print(f"$\\Delta$ & {delta_fert:+.1f}\\% & {delta_comp:+.1f}\\% & {delta_pcw:+.1f}\\% & -- & {delta_strr:+.1f}\\% \\\\")

    for line in ("\\bottomrule", "\\end{tabular}", "\\end{table}"):
        print(line)


# Emit the LaTeX table for the paper, then mark the end of the notebook run.
generate_latex_table()

print("\n" + "=" * 70)
print("TOKENIZER EFFICIENCY ANALYSIS COMPLETE")
print("=" * 70)
