# Tokenizer Efficiency Analysis

**TokAlign ACL Publication-Grade Evaluation Framework - Notebook 1/3**

This notebook performs tokenizer-level analysis comparing baseline and adapted tokenizers.
No model inference is required - pure tokenizer metrics only.

## Metrics (per CVA study + STRR paper)

| Metric | Formula | Target |
|--------|---------|--------|
| **Fertility** | tokens / words | Spanish approaching English ~1.4 |
| **Compression Ratio** | bytes / tokens | Higher = better |
| **PCW** | Proportion of Continued Words: % words split into 2+ subwords | Lower = better |
| **UNK Rate** | % unknown (UNK) tokens among all tokens | Should approach 0% |
| **STRR** | Single Token Retention Rate: % words preserved as single tokens | Higher = better |

## References
- Trans-Tokenization (Remy et al., 2024)
- CVA Study (Yamaguchi et al., EMNLP 2024) - 271.5% inference speedup
- STRR Metric (Nayeem et al., 2025) - Single Token Retention Rate


In [1]:
# Cell 1: Configuration
# =====================
# Every tunable value for this notebook is defined here; the cells below
# read these names directly.

# --- Tokenizers under comparison ---
# Identifier of the reference tokenizer and the path of the adapted
# checkpoint; both are handed to AutoTokenizer.from_pretrained().
BASELINE_MODEL = "EleutherAI/pythia-1b"
ADAPTED_MODEL = "/home/ubuntu/aryan-true-tokalign/true-tokalign/log/1b/0_qwen2-7b_S2/checkpoint-2500"

# --- Corpus sizes (documents) ---
# The two quotas are summed when sizing the Spanish corpus.
WIKIPEDIA_SAMPLES = OSCAR_SAMPLES = 10_000

# --- Result files (CSV, all under OUTPUT_DIR) ---
OUTPUT_DIR = "results"
BASELINE_OUTPUT = f"{OUTPUT_DIR}/tokenizer_analysis_baseline.csv"
ADAPTED_OUTPUT = f"{OUTPUT_DIR}/tokenizer_analysis_adapted.csv"
COMPARISON_OUTPUT = f"{OUTPUT_DIR}/tokenizer_comparison.csv"

# --- Statistics ---
RANDOM_SEED = 42             # seeds NumPy's global RNG
BOOTSTRAP_ITERATIONS = 1_000  # resamples per bootstrap confidence interval
CONFIDENCE_LEVEL = 0.95      # coverage of the bootstrap confidence intervals


In [2]:
# Cell 2: Install Dependencies
!pip install transformers datasets pandas numpy scipy tqdm -q


In [3]:
# Cell 3: Imports
import os
import numpy as np
import pandas as pd
from scipy import stats
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm.auto import tqdm
from typing import Dict, List, Tuple
from dataclasses import dataclass
import warnings
# NOTE(review): no category is given, so this silences every warning raised
# through the `warnings` module for the rest of the session - including
# numerical RuntimeWarnings from NumPy/SciPy. Consider narrowing it.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# (seeds NumPy's global RNG, which bootstrap_ci draws from via np.random.choice)
np.random.seed(RANDOM_SEED)

# Create output directory
# (exist_ok=True makes re-running this cell harmless)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Imports completed successfully.")


  from .autonotebook import tqdm as notebook_tqdm


  import pynvml  # type: ignore[import]


Imports completed successfully.


In [4]:
# Cell 4: Tokenizer Metrics Functions (TRUE PARALLEL with datasets.map)
# =====================================================================
# Using HuggingFace datasets.map() with num_proc for true multiprocessing
# Each worker loads its own tokenizer to avoid pickling issues

from datasets import Dataset
import multiprocessing as mp

# Worker count handed to datasets.map(): half of the CPUs reported by the OS
# (roughly the physical-core count on machines with 2-way SMT). Clamped to at
# least 1 so a single-CPU host does not end up requesting zero workers.
NUM_PROC = max(1, mp.cpu_count() // 2)

# Per-process tokenizer cache, filled lazily by _get_tokenizer(): the loaded
# tokenizer object and the path it was loaded from.
_WORKER_TOKENIZER = None
_WORKER_TOKENIZER_PATH = None

def _get_tokenizer(tokenizer_path: str):
    """
    Return the tokenizer for `tokenizer_path`, loading it at most once per process.

    The loaded tokenizer and its path are kept in module-level globals. A call
    with the same path returns the cached object; a call with a different path
    replaces the cache.
    """
    global _WORKER_TOKENIZER, _WORKER_TOKENIZER_PATH

    already_loaded = (
        _WORKER_TOKENIZER is not None
        and _WORKER_TOKENIZER_PATH == tokenizer_path
    )
    if already_loaded:
        return _WORKER_TOKENIZER

    # Imported here so the name is resolved inside whichever process runs this.
    from transformers import AutoTokenizer

    _WORKER_TOKENIZER = AutoTokenizer.from_pretrained(tokenizer_path)
    _WORKER_TOKENIZER_PATH = tokenizer_path
    return _WORKER_TOKENIZER


def compute_metrics_for_map(examples, tokenizer_path: str):
    """
    Compute per-text tokenizer metrics for a batch - designed for datasets.map().

    The tokenizer comes from _get_tokenizer(), so each worker process loads it
    once and reuses it for every batch it handles.

    Args:
        examples: Batch mapping whose 'text' entry is a list of strings.
        tokenizer_path: Model id or local path forwarded to _get_tokenizer().

    Returns:
        Dict of equal-length lists with one entry per input text.
        Raw counts: 'num_words' (whitespace-separated), 'num_tokens',
        'num_bytes' (UTF-8), 'num_unk_tokens', 'num_continued_words'
        (words encoded as 2+ tokens), 'num_single_token_words'.
        Ratios (fractions, not percentages; a zero denominator is replaced
        by 1): 'fertility' = tokens / words, 'compression_ratio' =
        bytes / tokens, 'pcw' = continued words / words, 'unk_rate' =
        UNK tokens / tokens, 'strr' = single-token words / words.
    """
    tokenizer = _get_tokenizer(tokenizer_path)

    texts = examples['text']

    # One batched call tokenizes every text in the batch.
    all_encodings = tokenizer(texts, add_special_tokens=False)['input_ids']

    # Compare with None instead of relying on truthiness: 0 is a valid token
    # id, and a tokenizer whose UNK token has id 0 would otherwise always
    # report zero unknown tokens.
    unk_id = tokenizer.unk_token_id
    has_unk_token = unk_id is not None

    # Result accumulators, one list per output column.
    num_words_list = []
    num_tokens_list = []
    num_bytes_list = []
    num_unk_list = []
    num_continued_list = []
    num_single_list = []
    fertility_list = []
    compression_list = []
    pcw_list = []
    unk_rate_list = []
    strr_list = []

    for text, token_ids in zip(texts, all_encodings):
        words = text.split() if text else []
        num_words = len(words)
        num_bytes = len(text.encode('utf-8')) if text else 0
        num_tokens = len(token_ids)
        num_unk = sum(1 for t in token_ids if t == unk_id) if has_unk_token else 0

        # Word-level analysis: encode all words of this text in one call.
        # Each word is encoded on its own, so its segmentation may differ from
        # how the same word is split inside running text.
        if words:
            word_encodings = tokenizer(words, add_special_tokens=False)['input_ids']
            pieces_per_word = [len(enc) for enc in word_encodings]
            num_single = sum(1 for n in pieces_per_word if n == 1)
            num_continued = sum(1 for n in pieces_per_word if n > 1)
        else:
            num_single = 0
            num_continued = 0

        # max(..., 1) keeps the ratios defined when a text has no words/tokens.
        word_denominator = max(num_words, 1)
        token_denominator = max(num_tokens, 1)

        num_words_list.append(num_words)
        num_tokens_list.append(num_tokens)
        num_bytes_list.append(num_bytes)
        num_unk_list.append(num_unk)
        num_continued_list.append(num_continued)
        num_single_list.append(num_single)
        fertility_list.append(num_tokens / word_denominator)
        compression_list.append(num_bytes / token_denominator)
        pcw_list.append(num_continued / word_denominator)
        unk_rate_list.append(num_unk / token_denominator)
        strr_list.append(num_single / word_denominator)

    return {
        'num_words': num_words_list,
        'num_tokens': num_tokens_list,
        'num_bytes': num_bytes_list,
        'num_unk_tokens': num_unk_list,
        'num_continued_words': num_continued_list,
        'num_single_token_words': num_single_list,
        'fertility': fertility_list,
        'compression_ratio': compression_list,
        'pcw': pcw_list,
        'unk_rate': unk_rate_list,
        'strr': strr_list
    }


def analyze_corpus_parallel(texts: List[str], tokenizer_path: str, desc: str = "Analyzing") -> pd.DataFrame:
    """
    Compute per-text tokenizer metrics for a corpus with datasets.map().

    Empty and whitespace-only texts are dropped first. The remaining texts are
    wrapped in a HuggingFace Dataset and processed by compute_metrics_for_map
    in batches of 500 across worker processes.

    Args:
        texts: Raw documents to analyze.
        tokenizer_path: Model id or local path of the tokenizer; passed to the
            workers as a string so each one loads its own tokenizer.
        desc: Label used in the status line and the progress bar.

    Returns:
        The mapped dataset converted with Dataset.to_pandas(), one row per
        retained text.
    """
    # Filter empty texts before reporting, so the logged count is the number
    # of texts that are actually processed.
    texts = [t for t in texts if t and t.strip()]

    # Guard against a non-positive NUM_PROC (e.g. cpu_count() // 2 on a
    # single-CPU host).
    num_workers = max(1, NUM_PROC)

    print(f"  {desc}: Processing {len(texts)} texts with {num_workers} parallel workers...")

    # Create HuggingFace Dataset
    ds = Dataset.from_dict({'text': texts})

    # Use datasets.map() with fn_kwargs to pass tokenizer_path (avoids lambda pickling issues)
    ds = ds.map(
        compute_metrics_for_map,
        batched=True,
        batch_size=500,
        num_proc=num_workers,
        fn_kwargs={'tokenizer_path': tokenizer_path},
        desc=desc
    )

    # Convert to DataFrame
    return ds.to_pandas()


print(f"Tokenizer metrics functions defined (TRUE PARALLEL mode, {NUM_PROC} workers).")


Tokenizer metrics functions defined (TRUE PARALLEL mode, 26 workers).


In [5]:
# Cell 5: Statistical Analysis Functions
# ======================================

def bootstrap_ci(data: np.ndarray, n_bootstrap: int = 1000, confidence: float = 0.95) -> Tuple[float, float, float]:
    """
    Percentile-bootstrap confidence interval for the mean of `data`.

    Draws `n_bootstrap` resamples of len(data) values with replacement from
    NumPy's global RNG, takes the mean of each, and reads the interval bounds
    off the percentiles of those means.

    Returns: (mean, lower_ci, upper_ci), where `mean` is the mean of the
    original data rather than of the resampled means.
    """
    sample_size = len(data)

    resampled_means = [
        np.mean(np.random.choice(data, size=sample_size, replace=True))
        for _ in range(n_bootstrap)
    ]

    # Split the excluded probability mass evenly between the two tails.
    alpha = 1 - confidence
    lower, upper = np.percentile(
        resampled_means,
        [100 * alpha / 2, 100 * (1 - alpha / 2)],
    )

    return np.mean(data), lower, upper


def cohens_d(group1: np.ndarray, group2: np.ndarray) -> float:
    """
    Cohen's d: difference of the group means divided by the pooled standard
    deviation (sample variances, ddof=1).

    Returns 0 when the pooled standard deviation is not positive.

    Rule-of-thumb reading of |d|:
    - below 0.2: negligible
    - 0.2 up to 0.5: small
    - 0.5 up to 0.8: medium
    - 0.8 and above: large
    """
    size_a, size_b = len(group1), len(group2)
    var_a = np.var(group1, ddof=1)
    var_b = np.var(group2, ddof=1)

    # Variances weighted by their degrees of freedom.
    weighted_var_sum = (size_a - 1) * var_a + (size_b - 1) * var_b
    pooled_std = np.sqrt(weighted_var_sum / (size_a + size_b - 2))

    if pooled_std > 0:
        return (np.mean(group1) - np.mean(group2)) / pooled_std
    return 0


def compare_tokenizers(baseline_df: pd.DataFrame, adapted_df: pd.DataFrame, 
                       metric: str) -> Dict:
    """
    Statistical comparison of a metric between two tokenizers.

    Rows are treated as pairs: row i of `baseline_df` and row i of
    `adapted_df` are assumed to describe the same text.

    Args:
        baseline_df: Per-text metrics for the baseline tokenizer.
        adapted_df: Per-text metrics for the adapted tokenizer.
        metric: Name of the column to compare.

    Returns dict with:
    - metric
    - baseline_mean, baseline_ci_lower, baseline_ci_upper (bootstrap CI)
    - adapted_mean, adapted_ci_lower, adapted_ci_upper (bootstrap CI)
    - t_statistic, p_value (paired t-test, baseline vs adapted)
    - cohens_d (baseline minus adapted, pooled standard deviation)
    - percent_change (adapted relative to baseline; 0 when the baseline mean is 0)
    - significant (p_value below 1 - CONFIDENCE_LEVEL)
    """
    baseline_vals = baseline_df[metric].values
    adapted_vals = adapted_df[metric].values

    # Ensure same length for paired test. Truncation only keeps the pairs
    # aligned if both frames list the same texts in the same order, so report
    # a mismatch instead of hiding it. print() is used because warnings are
    # globally suppressed in this notebook.
    min_len = min(len(baseline_vals), len(adapted_vals))
    if len(baseline_vals) != len(adapted_vals):
        print(
            f"  Warning: '{metric}' has {len(baseline_vals)} baseline rows but "
            f"{len(adapted_vals)} adapted rows; comparing the first {min_len} of each."
        )
    baseline_vals = baseline_vals[:min_len]
    adapted_vals = adapted_vals[:min_len]

    # Bootstrap CIs
    b_mean, b_lower, b_upper = bootstrap_ci(baseline_vals, BOOTSTRAP_ITERATIONS, CONFIDENCE_LEVEL)
    a_mean, a_lower, a_upper = bootstrap_ci(adapted_vals, BOOTSTRAP_ITERATIONS, CONFIDENCE_LEVEL)

    # Paired t-test
    t_stat, p_value = stats.ttest_rel(baseline_vals, adapted_vals)

    # Effect size
    d = cohens_d(baseline_vals, adapted_vals)

    # Percent change
    pct_change = ((a_mean - b_mean) / b_mean * 100) if b_mean != 0 else 0

    # Derive the significance threshold from the configured confidence level
    # so the CIs and the significance flag use the same alpha.
    alpha = 1 - CONFIDENCE_LEVEL

    return {
        'metric': metric,
        'baseline_mean': b_mean,
        'baseline_ci_lower': b_lower,
        'baseline_ci_upper': b_upper,
        'adapted_mean': a_mean,
        'adapted_ci_lower': a_lower,
        'adapted_ci_upper': a_upper,
        't_statistic': t_stat,
        'p_value': p_value,
        'cohens_d': d,
        'percent_change': pct_change,
        'significant': p_value < alpha
    }


def interpret_effect_size(d: float) -> str:
    """Map Cohen's d to a verbal label based on its absolute value."""
    magnitude = abs(d)

    # (exclusive upper bound, label), checked in ascending order.
    bands = (
        (0.2, "negligible"),
        (0.5, "small"),
        (0.8, "medium"),
    )
    for upper_bound, label in bands:
        if magnitude < upper_bound:
            return label
    return "large"


print("Statistical analysis functions defined.")


Statistical analysis functions defined.


In [6]:
# Cell 6: Load Tokenizers
# =======================
# Loads both tokenizers in the notebook process and sets HAS_ADAPTED, which
# later cells use to decide whether the adapted analysis can run.

print("Loading tokenizers...")

# Load baseline tokenizer (required: a failure here stops the notebook)
baseline_tokenizer = AutoTokenizer.from_pretrained(BASELINE_MODEL)
print(f"Baseline tokenizer: {BASELINE_MODEL}")
print(f"  Vocab size: {baseline_tokenizer.vocab_size}")

# Load adapted tokenizer (optional: fall back to baseline-only analysis).
# Only the load itself sits inside the try block, and the actual error is
# reported, because a failure is not necessarily a missing path.
try:
    adapted_tokenizer = AutoTokenizer.from_pretrained(ADAPTED_MODEL)
except Exception as e:
    print(f"\nAdapted tokenizer could not be loaded: {ADAPTED_MODEL}")
    print(f"  Reason: {type(e).__name__}: {e}")
    print("  Running baseline-only analysis.")
    adapted_tokenizer = None
    HAS_ADAPTED = False
else:
    print(f"\nAdapted tokenizer: {ADAPTED_MODEL}")
    print(f"  Vocab size: {adapted_tokenizer.vocab_size}")
    HAS_ADAPTED = True


Loading tokenizers...


Baseline tokenizer: EleutherAI/pythia-1b
  Vocab size: 50254



Adapted tokenizer: /home/ubuntu/aryan-true-tokalign/true-tokalign/log/1b/0_qwen2-7b_S2/checkpoint-2500
  Vocab size: 151643


In [7]:
# Cell 7: Load Datasets (PARALLELIZED)
# =====================================

import multiprocessing
# Half of the CPUs reported by the OS, but never fewer than one worker
# (cpu_count() // 2 is 0 on a single-CPU host).
NUM_PROC = max(1, multiprocessing.cpu_count() // 2)
print(f"Loading datasets with {NUM_PROC} parallel workers...")

# Load Spanish Wikipedia in parallel (non-streaming for speed)
print("\n1. Loading Spanish Wikipedia...")
# Combine both quotas: the whole Spanish corpus is drawn from Wikipedia, and
# OSCAR_SAMPLES only enlarges the number of articles requested.
target_samples = WIKIPEDIA_SAMPLES + OSCAR_SAMPLES

wiki_es_dataset = load_dataset(
    "wikimedia/wikipedia",
    "20231101.es",
    split=f"train[:{target_samples * 2}]",  # Load extra to account for filtering
    num_proc=NUM_PROC
)

# Filter in parallel
def filter_short_texts(example, min_chars: int = 50):
    """
    Return True when the example's 'text' is longer than `min_chars` characters.

    A missing, None or empty 'text' counts as too short. The result is always
    a real bool, never the text itself or None.

    Args:
        example: Mapping for one dataset row; only its 'text' entry is read.
        min_chars: Length the text must exceed to be kept (default 50).
    """
    text = example.get('text') or ''
    return len(text) > min_chars

# Drop short articles, then keep at most `target_samples` texts.
# NOTE(review): if fewer than `target_samples` articles survive the filter,
# the corpus is silently smaller; the print below shows the actual count.
wiki_es_filtered = wiki_es_dataset.filter(filter_short_texts, num_proc=NUM_PROC)
all_spanish_texts = wiki_es_filtered['text'][:target_samples]
print(f"   Loaded {len(all_spanish_texts)} Spanish Wikipedia samples")

# Load English Wikipedia in parallel
# NOTE(review): the 15000 requested and 5000 kept below are hard-coded rather
# than taken from the configuration cell.
print("\n2. Loading English Wikipedia for comparison...")
wiki_en_dataset = load_dataset(
    "wikimedia/wikipedia",
    "20231101.en",
    split="train[:15000]",  # Load extra to account for filtering
    num_proc=NUM_PROC
)

# Same short-text filter as for Spanish, then keep at most 5000 texts.
wiki_en_filtered = wiki_en_dataset.filter(filter_short_texts, num_proc=NUM_PROC)
english_texts = wiki_en_filtered['text'][:5000]
print(f"   Loaded {len(english_texts)} English Wikipedia samples")

print(f"\nTotal Spanish corpus: {len(all_spanish_texts)} samples")


Loading datasets with 26 parallel workers...

1. Loading Spanish Wikipedia...


   Loaded 20000 Spanish Wikipedia samples

2. Loading English Wikipedia for comparison...


   Loaded 5000 English Wikipedia samples

Total Spanish corpus: 20000 samples


In [8]:
# Cell 8: Analyze Baseline Tokenizer (PARALLEL)
# ==============================================
# Runs the baseline tokenizer over both corpora. The tokenizer is passed as
# BASELINE_MODEL (a string), not as the already-loaded tokenizer object.

print("=" * 70)
print("ANALYZING BASELINE TOKENIZER")
print("=" * 70)

# Spanish analysis - use parallel processing
print("\n--- Spanish Corpus ---")
baseline_spanish_df = analyze_corpus_parallel(all_spanish_texts, BASELINE_MODEL, "Baseline (Spanish)")

# English analysis (for fertility comparison)
print("\n--- English Corpus (Reference) ---")
baseline_english_df = analyze_corpus_parallel(english_texts, BASELINE_MODEL, "Baseline (English)")

# Save baseline results
# Only the Spanish frame is written to disk here; the English frame stays in
# memory as baseline_english_df.
baseline_spanish_df.to_csv(BASELINE_OUTPUT, index=False)
print(f"\nBaseline Spanish results saved to: {BASELINE_OUTPUT}")


ANALYZING BASELINE TOKENIZER

--- Spanish Corpus ---
  Baseline (Spanish): Processing 20000 texts with 26 parallel workers...



Baseline (Spanish) (num_proc=26):   0%|                                                                                                      | 0/20000 [00:00<?, ? examples/s]


Baseline (Spanish) (num_proc=26):   2%|██▎                                                                                         | 500/20000 [00:21<14:13, 22.85 examples/s]


Baseline (Spanish) (num_proc=26):   4%|███▌                                                                                        | 769/20000 [00:22<08:01, 39.93 examples/s]


Baseline (Spanish) (num_proc=26):   6%|█████▊                                                                                     | 1269/20000 [00:25<04:49, 64.61 examples/s]


Baseline (Spanish) (num_proc=26):   9%|████████                                                                                   | 1769/20000 [00:31<04:15, 71.43 examples/s]


Baseline (Spanish) (num_proc=26):  11%|██████████▏                                                                               | 2269/20000 [00:32<02:38, 111.81 examples/s]


Baseline (Spanish) (num_proc=26):  13%|███████████▍                                                                              | 2538/20000 [00:33<02:22, 122.27 examples/s]


Baseline (Spanish) (num_proc=26):  15%|█████████████▋                                                                            | 3038/20000 [00:33<01:29, 190.38 examples/s]


Baseline (Spanish) (num_proc=26):  18%|███████████████▉                                                                          | 3538/20000 [00:39<02:04, 132.33 examples/s]


Baseline (Spanish) (num_proc=26):  20%|██████████████████▏                                                                       | 4038/20000 [00:40<01:25, 187.67 examples/s]


Baseline (Spanish) (num_proc=26):  25%|██████████████████████▋                                                                   | 5038/20000 [00:41<00:52, 287.17 examples/s]


Baseline (Spanish) (num_proc=26):  28%|████████████████████████▉                                                                 | 5538/20000 [00:44<00:55, 262.60 examples/s]


Baseline (Spanish) (num_proc=26):  33%|█████████████████████████████▍                                                            | 6538/20000 [00:44<00:31, 429.45 examples/s]


Baseline (Spanish) (num_proc=26):  34%|██████████████████████████████▋                                                           | 6807/20000 [00:44<00:29, 445.77 examples/s]


Baseline (Spanish) (num_proc=26):  35%|███████████████████████████████▊                                                          | 7076/20000 [00:45<00:26, 493.20 examples/s]


Baseline (Spanish) (num_proc=26):  37%|█████████████████████████████████                                                         | 7345/20000 [00:45<00:28, 442.47 examples/s]


Baseline (Spanish) (num_proc=26):  39%|███████████████████████████████████▎                                                      | 7845/20000 [00:46<00:21, 552.71 examples/s]


Baseline (Spanish) (num_proc=26):  43%|██████████████████████████████████████▊                                                   | 8615/20000 [00:47<00:16, 679.67 examples/s]


Baseline (Spanish) (num_proc=26):  46%|█████████████████████████████████████████                                                 | 9115/20000 [00:47<00:12, 873.35 examples/s]


Baseline (Spanish) (num_proc=26):  48%|███████████████████████████████████████████▎                                              | 9615/20000 [00:48<00:13, 778.57 examples/s]


Baseline (Spanish) (num_proc=26):  51%|█████████████████████████████████████████████                                            | 10115/20000 [00:49<00:17, 567.38 examples/s]


Baseline (Spanish) (num_proc=26):  54%|████████████████████████████████████████████████▍                                        | 10885/20000 [00:49<00:11, 826.66 examples/s]


Baseline (Spanish) (num_proc=26):  56%|█████████████████████████████████████████████████▋                                       | 11154/20000 [00:50<00:10, 808.67 examples/s]


Baseline (Spanish) (num_proc=26):  57%|██████████████████████████████████████████████████▊                                      | 11423/20000 [00:53<00:25, 338.50 examples/s]


Baseline (Spanish) (num_proc=26):  58%|████████████████████████████████████████████████████                                     | 11692/20000 [00:53<00:22, 369.00 examples/s]


Baseline (Spanish) (num_proc=26):  63%|████████████████████████████████████████████████████████▍                                | 12692/20000 [00:54<00:11, 611.11 examples/s]


Baseline (Spanish) (num_proc=26):  66%|██████████████████████████████████████████████████████████▋                              | 13192/20000 [00:57<00:18, 365.16 examples/s]


Baseline (Spanish) (num_proc=26):  67%|███████████████████████████████████████████████████████████▉                             | 13461/20000 [00:57<00:18, 361.87 examples/s]


Baseline (Spanish) (num_proc=26):  70%|██████████████████████████████████████████████████████████████▏                          | 13961/20000 [00:58<00:12, 487.32 examples/s]


Baseline (Spanish) (num_proc=26):  71%|███████████████████████████████████████████████████████████████▎                         | 14230/20000 [00:58<00:10, 574.52 examples/s]


Baseline (Spanish) (num_proc=26):  72%|████████████████████████████████████████████████████████████████▌                        | 14499/20000 [00:58<00:08, 680.33 examples/s]


Baseline (Spanish) (num_proc=26):  74%|█████████████████████████████████████████████████████████████████▋                       | 14768/20000 [00:58<00:06, 798.65 examples/s]


Baseline (Spanish) (num_proc=26):  75%|██████████████████████████████████████████████████████████████████▉                      | 15037/20000 [00:59<00:09, 512.11 examples/s]


Baseline (Spanish) (num_proc=26):  78%|█████████████████████████████████████████████████████████████████████▏                   | 15537/20000 [01:00<00:09, 480.17 examples/s]


Baseline (Spanish) (num_proc=26):  79%|██████████████████████████████████████████████████████████████████████▎                  | 15806/20000 [01:02<00:11, 357.58 examples/s]


Baseline (Spanish) (num_proc=26):  80%|███████████████████████████████████████████████████████████████████████▌                 | 16076/20000 [01:02<00:08, 454.85 examples/s]


Baseline (Spanish) (num_proc=26):  83%|█████████████████████████████████████████████████████████████████████████▊               | 16576/20000 [01:02<00:06, 547.37 examples/s]


Baseline (Spanish) (num_proc=26):  84%|██████████████████████████████████████████████████████████████████████████▉              | 16846/20000 [01:04<00:09, 337.52 examples/s]


Baseline (Spanish) (num_proc=26):  87%|█████████████████████████████████████████████████████████████████████████████▏           | 17346/20000 [01:05<00:07, 363.70 examples/s]


Baseline (Spanish) (num_proc=26):  88%|██████████████████████████████████████████████████████████████████████████████▍          | 17615/20000 [01:06<00:06, 350.46 examples/s]


Baseline (Spanish) (num_proc=26):  89%|███████████████████████████████████████████████████████████████████████████████▌         | 17884/20000 [01:07<00:06, 334.48 examples/s]


Baseline (Spanish) (num_proc=26):  91%|████████████████████████████████████████████████████████████████████████████████▊        | 18154/20000 [01:08<00:05, 331.07 examples/s]


Baseline (Spanish) (num_proc=26):  92%|█████████████████████████████████████████████████████████████████████████████████▉       | 18423/20000 [01:08<00:03, 413.98 examples/s]


Baseline (Spanish) (num_proc=26):  95%|████████████████████████████████████████████████████████████████████████████████████▏    | 18923/20000 [01:12<00:05, 208.92 examples/s]


Baseline (Spanish) (num_proc=26):  96%|█████████████████████████████████████████████████████████████████████████████████████▍   | 19193/20000 [01:13<00:03, 214.47 examples/s]


Baseline (Spanish) (num_proc=26):  97%|██████████████████████████████████████████████████████████████████████████████████████▌  | 19462/20000 [01:14<00:01, 275.16 examples/s]


Baseline (Spanish) (num_proc=26):  99%|███████████████████████████████████████████████████████████████████████████████████████▊ | 19731/20000 [01:20<00:02, 113.48 examples/s]


Baseline (Spanish) (num_proc=26): 100%|█████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [01:22<00:00, 112.52 examples/s]


Baseline (Spanish) (num_proc=26): 100%|█████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [01:23<00:00, 240.57 examples/s]





--- English Corpus (Reference) ---
  Baseline (English): Processing 5000 texts with 26 parallel workers...



Baseline (English) (num_proc=26):   0%|                                                                                                       | 0/5000 [00:00<?, ? examples/s]


Baseline (English) (num_proc=26):   4%|███▌                                                                                         | 192/5000 [00:10<04:16, 18.76 examples/s]


Baseline (English) (num_proc=26):  12%|██████████▋                                                                                  | 576/5000 [00:10<01:03, 69.15 examples/s]


Baseline (English) (num_proc=26):  15%|██████████████▎                                                                              | 768/5000 [00:11<00:47, 89.77 examples/s]


Baseline (English) (num_proc=26):  19%|█████████████████▋                                                                          | 960/5000 [00:12<00:34, 118.32 examples/s]


Baseline (English) (num_proc=26):  23%|████████████████████▉                                                                      | 1152/5000 [00:12<00:26, 147.09 examples/s]


Baseline (English) (num_proc=26):  27%|████████████████████████▍                                                                  | 1345/5000 [00:13<00:22, 165.74 examples/s]


Baseline (English) (num_proc=26):  31%|███████████████████████████▉                                                               | 1537/5000 [00:13<00:15, 230.66 examples/s]


Baseline (English) (num_proc=26):  35%|███████████████████████████████▍                                                           | 1729/5000 [00:13<00:11, 286.11 examples/s]


Baseline (English) (num_proc=26):  38%|██████████████████████████████████▉                                                        | 1922/5000 [00:14<00:08, 347.66 examples/s]


Baseline (English) (num_proc=26):  42%|██████████████████████████████████████▍                                                    | 2114/5000 [00:14<00:08, 324.65 examples/s]


Baseline (English) (num_proc=26):  46%|█████████████████████████████████████████▉                                                 | 2306/5000 [00:15<00:07, 354.87 examples/s]


Baseline (English) (num_proc=26):  50%|█████████████████████████████████████████████▍                                             | 2499/5000 [00:15<00:06, 401.56 examples/s]


Baseline (English) (num_proc=26):  54%|████████████████████████████████████████████████▉                                          | 2691/5000 [00:15<00:04, 475.85 examples/s]


Baseline (English) (num_proc=26):  62%|███████████████████████████████████████████████████████▉                                   | 3076/5000 [00:16<00:03, 505.80 examples/s]


Baseline (English) (num_proc=26):  65%|███████████████████████████████████████████████████████████▍                               | 3269/5000 [00:17<00:05, 319.17 examples/s]


Baseline (English) (num_proc=26):  69%|███████████████████████████████████████████████████████████████                            | 3462/5000 [00:18<00:05, 270.71 examples/s]


Baseline (English) (num_proc=26):  73%|██████████████████████████████████████████████████████████████████▌                        | 3655/5000 [00:19<00:04, 309.92 examples/s]


Baseline (English) (num_proc=26):  77%|██████████████████████████████████████████████████████████████████████                     | 3847/5000 [00:20<00:04, 285.99 examples/s]


Baseline (English) (num_proc=26):  81%|█████████████████████████████████████████████████████████████████████████▌                 | 4039/5000 [00:21<00:04, 204.97 examples/s]


Baseline (English) (num_proc=26):  85%|█████████████████████████████████████████████████████████████████████████████              | 4232/5000 [00:21<00:02, 274.33 examples/s]


Baseline (English) (num_proc=26):  92%|████████████████████████████████████████████████████████████████████████████████████       | 4616/5000 [00:21<00:00, 469.92 examples/s]


Baseline (English) (num_proc=26):  96%|███████████████████████████████████████████████████████████████████████████████████████▌   | 4808/5000 [00:22<00:00, 516.34 examples/s]


Baseline (English) (num_proc=26): 100%|███████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:22<00:00, 623.41 examples/s]


Baseline (English) (num_proc=26): 100%|███████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:22<00:00, 219.48 examples/s]





Baseline Spanish results saved to: results/tokenizer_analysis_baseline.csv


In [9]:
# Cell 9: Analyze Adapted Tokenizer (PARALLEL)
# =============================================
# Same analysis as for the baseline, run only when an adapted tokenizer is
# available (HAS_ADAPTED). Both result names are always defined afterwards.

if not HAS_ADAPTED:
    # No adapted tokenizer: define the result names as None.
    print("Skipping adapted tokenizer analysis (model not available).")
    adapted_spanish_df = None
    adapted_english_df = None
else:
    print("=" * 70)
    print("ANALYZING ADAPTED TOKENIZER")
    print("=" * 70)

    # Spanish corpus, processed in parallel
    print("\n--- Spanish Corpus ---")
    adapted_spanish_df = analyze_corpus_parallel(
        all_spanish_texts, ADAPTED_MODEL, "Adapted (Spanish)"
    )

    # English corpus
    print("\n--- English Corpus ---")
    adapted_english_df = analyze_corpus_parallel(
        english_texts, ADAPTED_MODEL, "Adapted (English)"
    )

    # Only the Spanish frame is written to disk
    adapted_spanish_df.to_csv(ADAPTED_OUTPUT, index=False)
    print(f"\nAdapted Spanish results saved to: {ADAPTED_OUTPUT}")


ANALYZING ADAPTED TOKENIZER

--- Spanish Corpus ---
  Adapted (Spanish): Processing 20000 texts with 26 parallel workers...



Adapted (Spanish) (num_proc=26):   0%|                                                                                                       | 0/20000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (42059 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (35791 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (40102 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (50885 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (37656 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (33291 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (51298 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (46362 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (38010 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (41823 > 32768). Running this sequence through the model will result in indexing errors



Adapted (Spanish) (num_proc=26):   2%|██▎                                                                                          | 500/20000 [00:22<14:46, 22.00 examples/s]


Adapted (Spanish) (num_proc=26):   4%|███▌                                                                                         | 769/20000 [00:23<08:29, 37.74 examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (61242 > 32768). Running this sequence through the model will result in indexing errors



Adapted (Spanish) (num_proc=26):   6%|█████▊                                                                                      | 1269/20000 [00:26<05:05, 61.40 examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (35987 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (69423 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (32774 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (39471 > 32768). Running this sequence through the model will result in indexing errors



Adapted (Spanish) (num_proc=26):   9%|████████▏                                                                                   | 1769/20000 [00:33<04:26, 68.40 examples/s]


Adapted (Spanish) (num_proc=26):  11%|██████████▎                                                                                | 2269/20000 [00:33<02:45, 107.04 examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (37670 > 32768). Running this sequence through the model will result in indexing errors



Adapted (Spanish) (num_proc=26):  14%|████████████▌                                                                              | 2769/20000 [00:35<02:06, 136.33 examples/s]


Adapted (Spanish) (num_proc=26):  15%|█████████████▊                                                                             | 3038/20000 [00:35<01:39, 170.82 examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (40715 > 32768). Running this sequence through the model will result in indexing errors



Adapted (Spanish) (num_proc=26):  18%|████████████████                                                                           | 3538/20000 [00:41<02:09, 127.53 examples/s]


Adapted (Spanish) (num_proc=26):  20%|██████████████████▎                                                                        | 4038/20000 [00:41<01:30, 176.33 examples/s]


Adapted (Spanish) (num_proc=26):  23%|████████████████████▋                                                                      | 4538/20000 [00:42<01:01, 250.64 examples/s]


Adapted (Spanish) (num_proc=26):  25%|██████████████████████▉                                                                    | 5038/20000 [00:43<00:53, 277.44 examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (38600 > 32768). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (35989 > 32768). Running this sequence through the model will result in indexing errors



Adapted (Spanish) (num_proc=26):  28%|█████████████████████████▏                                                                 | 5538/20000 [00:46<00:58, 246.78 examples/s]


Adapted (Spanish) (num_proc=26):  30%|███████████████████████████▍                                                               | 6038/20000 [00:46<00:41, 337.89 examples/s]


Adapted (Spanish) (num_proc=26):  33%|█████████████████████████████▋                                                             | 6538/20000 [00:46<00:28, 472.61 examples/s]


Adapted (Spanish) (num_proc=26):  34%|██████████████████████████████▉                                                            | 6807/20000 [00:46<00:23, 554.70 examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (36398 > 32768). Running this sequence through the model will result in indexing errors



Adapted (Spanish) (num_proc=26):  35%|████████████████████████████████▏                                                          | 7076/20000 [00:47<00:29, 438.94 examples/s]


Adapted (Spanish) (num_proc=26):  37%|█████████████████████████████████▍                                                         | 7346/20000 [00:48<00:32, 385.06 examples/s]


Adapted (Spanish) (num_proc=26):  38%|██████████████████████████████████▋                                                        | 7615/20000 [00:48<00:25, 478.59 examples/s]


Adapted (Spanish) (num_proc=26):  41%|████████████████████████████████████▉                                                      | 8115/20000 [00:48<00:16, 715.85 examples/s]


Adapted (Spanish) (num_proc=26):  43%|███████████████████████████████████████▏                                                   | 8615/20000 [00:49<00:13, 843.40 examples/s]


Adapted (Spanish) (num_proc=26):  46%|█████████████████████████████████████████                                                 | 9115/20000 [00:49<00:09, 1165.70 examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (35850 > 32768). Running this sequence through the model will result in indexing errors



Adapted (Spanish) (num_proc=26):  48%|███████████████████████████████████████████▋                                               | 9615/20000 [00:50<00:11, 916.77 examples/s]


Adapted (Spanish) (num_proc=26):  49%|████████████████████████████████████████████▉                                              | 9885/20000 [00:52<00:21, 460.77 examples/s]


Adapted (Spanish) (num_proc=26):  52%|██████████████████████████████████████████████▋                                           | 10385/20000 [00:52<00:15, 604.17 examples/s]


Adapted (Spanish) (num_proc=26):  54%|████████████████████████████████████████████████▉                                         | 10885/20000 [00:52<00:10, 835.95 examples/s]


Adapted (Spanish) (num_proc=26):  57%|███████████████████████████████████████████████████▍                                      | 11423/20000 [00:55<00:23, 367.91 examples/s]


Adapted (Spanish) (num_proc=26):  58%|████████████████████████████████████████████████████▌                                     | 11692/20000 [00:55<00:19, 422.68 examples/s]


Adapted (Spanish) (num_proc=26):  61%|██████████████████████████████████████████████████████▊                                   | 12192/20000 [00:56<00:15, 516.02 examples/s]


Adapted (Spanish) (num_proc=26):  63%|█████████████████████████████████████████████████████████                                 | 12692/20000 [00:57<00:14, 496.50 examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (52379 > 32768). Running this sequence through the model will result in indexing errors



Adapted (Spanish) (num_proc=26):  66%|███████████████████████████████████████████████████████████▎                              | 13192/20000 [00:59<00:19, 346.83 examples/s]


Adapted (Spanish) (num_proc=26):  67%|████████████████████████████████████████████████████████████▌                             | 13461/20000 [01:00<00:18, 345.59 examples/s]


Adapted (Spanish) (num_proc=26):  69%|█████████████████████████████████████████████████████████████▊                            | 13730/20000 [01:00<00:14, 427.08 examples/s]


Adapted (Spanish) (num_proc=26):  72%|█████████████████████████████████████████████████████████████████▏                        | 14499/20000 [01:01<00:08, 613.20 examples/s]


Adapted (Spanish) (num_proc=26):  75%|███████████████████████████████████████████████████████████████████▋                      | 15037/20000 [01:03<00:10, 462.32 examples/s]


Adapted (Spanish) (num_proc=26):  78%|█████████████████████████████████████████████████████████████████████▉                    | 15537/20000 [01:03<00:07, 595.86 examples/s]


Adapted (Spanish) (num_proc=26):  79%|███████████████████████████████████████████████████████████████████████▏                  | 15806/20000 [01:05<00:11, 374.54 examples/s]


Adapted (Spanish) (num_proc=26):  83%|██████████████████████████████████████████████████████████████████████████▌               | 16576/20000 [01:05<00:05, 590.80 examples/s]


Adapted (Spanish) (num_proc=26):  84%|███████████████████████████████████████████████████████████████████████████▊              | 16846/20000 [01:07<00:08, 357.95 examples/s]


Adapted (Spanish) (num_proc=26):  87%|██████████████████████████████████████████████████████████████████████████████            | 17346/20000 [01:08<00:07, 367.83 examples/s]


Adapted (Spanish) (num_proc=26):  88%|███████████████████████████████████████████████████████████████████████████████▎          | 17615/20000 [01:10<00:07, 313.30 examples/s]


Adapted (Spanish) (num_proc=26):  89%|████████████████████████████████████████████████████████████████████████████████▍         | 17884/20000 [01:10<00:05, 387.68 examples/s]


Adapted (Spanish) (num_proc=26):  91%|█████████████████████████████████████████████████████████████████████████████████▋        | 18153/20000 [01:11<00:05, 333.11 examples/s]


Adapted (Spanish) (num_proc=26):  92%|██████████████████████████████████████████████████████████████████████████████████▉       | 18423/20000 [01:12<00:04, 343.98 examples/s]


Adapted (Spanish) (num_proc=26):  95%|█████████████████████████████████████████████████████████████████████████████████████▏    | 18923/20000 [01:16<00:05, 200.67 examples/s]


Adapted (Spanish) (num_proc=26):  96%|██████████████████████████████████████████████████████████████████████████████████████▎   | 19192/20000 [01:17<00:03, 223.11 examples/s]


Adapted (Spanish) (num_proc=26):  99%|████████████████████████████████████████████████████████████████████████████████████████▊ | 19731/20000 [01:23<00:02, 128.92 examples/s]


Adapted (Spanish) (num_proc=26): 100%|██████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [01:26<00:00, 122.50 examples/s]


Adapted (Spanish) (num_proc=26): 100%|██████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [01:27<00:00, 229.74 examples/s]





--- English Corpus ---
  Adapted (English): Processing 5000 texts with 26 parallel workers...



Adapted (English) (num_proc=26):   0%|                                                                                                        | 0/5000 [00:00<?, ? examples/s]


Adapted (English) (num_proc=26):   4%|███▌                                                                                          | 192/5000 [00:12<05:23, 14.86 examples/s]


Adapted (English) (num_proc=26):   8%|███████▏                                                                                      | 384/5000 [00:13<02:09, 35.58 examples/s]


Adapted (English) (num_proc=26):  12%|██████████▊                                                                                   | 576/5000 [00:13<01:09, 63.82 examples/s]


Adapted (English) (num_proc=26):  15%|██████████████▍                                                                               | 768/5000 [00:14<00:49, 85.14 examples/s]


Adapted (English) (num_proc=26):  19%|█████████████████▊                                                                           | 961/5000 [00:14<00:33, 119.92 examples/s]


Adapted (English) (num_proc=26):  23%|█████████████████████▏                                                                      | 1153/5000 [00:15<00:23, 166.80 examples/s]


Adapted (English) (num_proc=26):  27%|████████████████████████▊                                                                   | 1346/5000 [00:15<00:18, 202.50 examples/s]


Adapted (English) (num_proc=26):  31%|████████████████████████████▎                                                               | 1538/5000 [00:15<00:13, 259.42 examples/s]


Adapted (English) (num_proc=26):  35%|███████████████████████████████▊                                                            | 1730/5000 [00:17<00:15, 205.60 examples/s]


Adapted (English) (num_proc=26):  46%|██████████████████████████████████████████▍                                                 | 2308/5000 [00:17<00:07, 352.27 examples/s]


Adapted (English) (num_proc=26):  50%|██████████████████████████████████████████████                                              | 2500/5000 [00:18<00:07, 354.20 examples/s]


Adapted (English) (num_proc=26):  54%|█████████████████████████████████████████████████▌                                          | 2693/5000 [00:18<00:06, 369.37 examples/s]


Adapted (English) (num_proc=26):  62%|████████████████████████████████████████████████████████▌                                   | 3077/5000 [00:19<00:04, 405.51 examples/s]


Adapted (English) (num_proc=26):  69%|███████████████████████████████████████████████████████████████▋                            | 3462/5000 [00:20<00:02, 515.66 examples/s]


Adapted (English) (num_proc=26):  73%|███████████████████████████████████████████████████████████████████▎                        | 3655/5000 [00:21<00:03, 349.90 examples/s]


Adapted (English) (num_proc=26):  77%|██████████████████████████████████████████████████████████████████████▊                     | 3847/5000 [00:22<00:03, 311.32 examples/s]


Adapted (English) (num_proc=26):  81%|██████████████████████████████████████████████████████████████████████████▎                 | 4040/5000 [00:22<00:02, 329.05 examples/s]


Adapted (English) (num_proc=26):  85%|█████████████████████████████████████████████████████████████████████████████▊              | 4232/5000 [00:24<00:03, 211.97 examples/s]


Adapted (English) (num_proc=26):  92%|████████████████████████████████████████████████████████████████████████████████████▉       | 4616/5000 [00:24<00:01, 337.85 examples/s]


Adapted (English) (num_proc=26): 100%|████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:25<00:00, 196.83 examples/s]





Adapted Spanish results saved to: results/tokenizer_analysis_adapted.csv


In [10]:
# Cell 10: Summary Statistics
# ============================

def print_summary_stats(df: pd.DataFrame, name: str):
    """Print a summary table of tokenizer metrics for one corpus.

    For each of the five metric columns this prints the mean, standard
    deviation (``np.std`` default, i.e. population std with ddof=0), median
    and the 5th/95th percentiles, followed by a bootstrap confidence
    interval from ``bootstrap_ci`` using the notebook-level
    ``BOOTSTRAP_ITERATIONS`` and ``CONFIDENCE_LEVEL`` settings.

    Args:
        df: DataFrame containing the columns 'fertility',
            'compression_ratio', 'pcw', 'unk_rate' and 'strr'.
        name: Title printed inside the banner above the table.

    Returns:
        None. Output goes to stdout only.
    """
    print(f"\n{'='*60}")
    print(f"{name}")
    print(f"{'='*60}")

    metrics = ['fertility', 'compression_ratio', 'pcw', 'unk_rate', 'strr']

    # Derive the CI label from the configured level so the printed text can
    # never disagree with the interval that was actually computed
    # (0.95 -> "  95% CI", 0.99 -> "  99% CI").
    ci_label = f"  {CONFIDENCE_LEVEL * 100:g}% CI"

    print(f"\n{'Metric':<20} {'Mean':>10} {'Std':>10} {'Median':>10} {'P5':>10} {'P95':>10}")
    print("-" * 70)

    for metric in metrics:
        vals = df[metric].values
        # Only the interval bounds are needed here; the displayed mean is
        # taken directly from np.mean below.
        _, ci_lower, ci_upper = bootstrap_ci(vals, BOOTSTRAP_ITERATIONS, CONFIDENCE_LEVEL)
        print(f"{metric:<20} {np.mean(vals):>10.4f} {np.std(vals):>10.4f} "
              f"{np.median(vals):>10.4f} {np.percentile(vals, 5):>10.4f} {np.percentile(vals, 95):>10.4f}")
        print(f"{ci_label:<20} [{ci_lower:>10.4f}, {ci_upper:>10.4f}]")


# Baseline tokenizer: Spanish is the target corpus, English the reference.
print_summary_stats(df=baseline_spanish_df, name="BASELINE TOKENIZER - SPANISH")
print_summary_stats(df=baseline_english_df, name="BASELINE TOKENIZER - ENGLISH (Reference)")

# The adapted frames are only touched when the adapted model was available.
if HAS_ADAPTED:
    print_summary_stats(df=adapted_spanish_df, name="ADAPTED TOKENIZER - SPANISH")
    print_summary_stats(df=adapted_english_df, name="ADAPTED TOKENIZER - ENGLISH")



BASELINE TOKENIZER - SPANISH

Metric                     Mean        Std     Median         P5        P95
----------------------------------------------------------------------


fertility                2.0591     0.2316     1.9925     1.8155     2.4877
  95% CI             [    2.0558,     2.0622]


compression_ratio        3.1687     0.2203     3.1986     2.7828     3.4907
  95% CI             [    3.1659,     3.1718]


pcw                      0.5883     0.0548     0.5741     0.5273     0.7031
  95% CI             [    0.5875,     0.5890]


unk_rate                 0.0000     0.0000     0.0000     0.0000     0.0000
  95% CI             [    0.0000,     0.0000]


strr                     0.4117     0.0548     0.4259     0.2969     0.4727
  95% CI             [    0.4110,     0.4125]

BASELINE TOKENIZER - ENGLISH (Reference)

Metric                     Mean        Std     Median         P5        P95
----------------------------------------------------------------------
fertility                1.5276     0.2401     1.4625     1.2994     1.9520
  95% CI             [    1.5210,     1.5340]
compression_ratio        4.2671     0.4574     4.3118     3.4530     4.9427
  95% CI             [    4.2541,     4.2792]


pcw                      0.3983     0.0722     0.3864     0.3072     0.5327
  95% CI             [    0.3963,     0.4002]
unk_rate                 0.0000     0.0000     0.0000     0.0000     0.0000
  95% CI             [    0.0000,     0.0000]
strr                     0.6017     0.0722     0.6136     0.4673     0.6928
  95% CI             [    0.5998,     0.6036]

ADAPTED TOKENIZER - SPANISH

Metric                     Mean        Std     Median         P5        P95
----------------------------------------------------------------------


fertility                1.8739     0.2627     1.8022     1.5949     2.4336
  95% CI             [    1.8705,     1.8774]


compression_ratio        3.5014     0.3490     3.5323     2.8710     4.0531
  95% CI             [    3.4966,     3.5060]


pcw                      0.5722     0.0598     0.5554     0.5036     0.6990
  95% CI             [    0.5714,     0.5731]


unk_rate                 0.0000     0.0000     0.0000     0.0000     0.0000
  95% CI             [    0.0000,     0.0000]


strr                     0.4278     0.0598     0.4446     0.3010     0.4964
  95% CI             [    0.4270,     0.4285]

ADAPTED TOKENIZER - ENGLISH

Metric                     Mean        Std     Median         P5        P95
----------------------------------------------------------------------
fertility                1.5612     0.2633     1.4894     1.2945     2.1154
  95% CI             [    1.5539,     1.5688]
compression_ratio        4.1903     0.5108     4.2307     3.2357     4.9741
  95% CI             [    4.1759,     4.2055]


pcw                      0.3885     0.0722     0.3781     0.2928     0.5355
  95% CI             [    0.3865,     0.3904]
unk_rate                 0.0000     0.0000     0.0000     0.0000     0.0000
  95% CI             [    0.0000,     0.0000]
strr                     0.6115     0.0722     0.6219     0.4645     0.7072
  95% CI             [    0.6094,     0.6135]


In [11]:
# Cell 11: Statistical Comparison (Baseline vs Adapted)
# =====================================================

if HAS_ADAPTED:
    print("=" * 70)
    print("STATISTICAL COMPARISON: BASELINE vs ADAPTED (SPANISH)")
    print("=" * 70)

    metrics_to_compare = ['fertility', 'compression_ratio', 'pcw', 'unk_rate', 'strr']
    comparison_results = []

    # (exclusive upper bound on p, marker), strictest first; first match wins.
    significance_levels = ((0.001, "***"), (0.01, "**"), (0.05, "*"))

    for metric in metrics_to_compare:
        # One result dict per metric; the keys read below are the ones this
        # cell relies on.
        result = compare_tokenizers(baseline_spanish_df, adapted_spanish_df, metric)
        comparison_results.append(result)

        effect_interpretation = interpret_effect_size(result['cohens_d'])

        p_value = result['p_value']
        if pd.isna(p_value):
            # A NaN p-value (e.g. unk_rate, which is 0 for both tokenizers in
            # the recorded run) compares False against every threshold and
            # would otherwise look identical to "not significant". Flag it.
            sig_marker = "(n/a)"
        else:
            sig_marker = next(
                (marker for threshold, marker in significance_levels if p_value < threshold),
                "",
            )

        print(f"\n--- {metric.upper()} ---")
        print(f"  Baseline: {result['baseline_mean']:.4f} [{result['baseline_ci_lower']:.4f}, {result['baseline_ci_upper']:.4f}]")
        print(f"  Adapted:  {result['adapted_mean']:.4f} [{result['adapted_ci_lower']:.4f}, {result['adapted_ci_upper']:.4f}]")
        print(f"  Change:   {result['percent_change']:+.2f}%")
        print(f"  t-stat:   {result['t_statistic']:.3f}, p-value: {p_value:.2e} {sig_marker}")
        print(f"  Cohen's d: {result['cohens_d']:.3f} ({effect_interpretation})")

    # Save comparison results (one CSV row per metric).
    comparison_df = pd.DataFrame(comparison_results)
    # Make sure the target directory exists so the save cannot fail when this
    # cell is run without the earlier cells that write into it.
    os.makedirs(os.path.dirname(COMPARISON_OUTPUT) or ".", exist_ok=True)
    comparison_df.to_csv(COMPARISON_OUTPUT, index=False)
    print(f"\nComparison results saved to: {COMPARISON_OUTPUT}")
else:
    print("Comparison skipped (adapted model not available).")


STATISTICAL COMPARISON: BASELINE vs ADAPTED (SPANISH)



--- FERTILITY ---
  Baseline: 2.0591 [2.0559, 2.0620]
  Adapted:  1.8739 [1.8705, 1.8776]
  Change:   -8.99%
  t-stat:   219.567, p-value: 0.00e+00 ***
  Cohen's d: 0.748 (medium)



--- COMPRESSION_RATIO ---
  Baseline: 3.1687 [3.1656, 3.1719]
  Adapted:  3.5014 [3.4970, 3.5065]
  Change:   +10.50%
  t-stat:   -231.707, p-value: 0.00e+00 ***
  Cohen's d: -1.140 (large)



--- PCW ---
  Baseline: 0.5883 [0.5875, 0.5890]
  Adapted:  0.5722 [0.5714, 0.5730]
  Change:   -2.73%
  t-stat:   112.742, p-value: 0.00e+00 ***
  Cohen's d: 0.279 (small)



--- UNK_RATE ---
  Baseline: 0.0000 [0.0000, 0.0000]
  Adapted:  0.0000 [0.0000, 0.0000]
  Change:   +0.00%
  t-stat:   nan, p-value: nan 
  Cohen's d: 0.000 (negligible)



--- STRR ---
  Baseline: 0.4117 [0.4109, 0.4125]
  Adapted:  0.4278 [0.4269, 0.4286]
  Change:   +3.89%
  t-stat:   -112.742, p-value: 0.00e+00 ***
  Cohen's d: -0.279 (small)

Comparison results saved to: results/tokenizer_comparison.csv


In [12]:
# Cell 12: Fertility Gap Analysis
# ================================

print("=" * 70, "FERTILITY GAP ANALYSIS", "=" * 70, sep="\n")
print("\nGoal: Spanish fertility approaching English (~1.4 tokens/word)")

# Baseline tokenizer: mean tokens/word per language, then Spanish's excess
# over English in absolute terms and as a percentage of the English mean.
baseline_es_fertility = baseline_spanish_df['fertility'].mean()
baseline_en_fertility = baseline_english_df['fertility'].mean()
baseline_gap = baseline_es_fertility - baseline_en_fertility
baseline_gap_pct = (baseline_gap / baseline_en_fertility) * 100

print(
    "\n--- BASELINE ---",
    f"  English fertility:  {baseline_en_fertility:.4f} tokens/word",
    f"  Spanish fertility:  {baseline_es_fertility:.4f} tokens/word",
    f"  Fertility gap:      {baseline_gap:+.4f} ({baseline_gap_pct:+.1f}% overhead)",
    sep="\n",
)

if HAS_ADAPTED:
    # Same quantities for the adapted tokenizer.
    adapted_es_fertility = adapted_spanish_df['fertility'].mean()
    adapted_en_fertility = adapted_english_df['fertility'].mean()
    adapted_gap = adapted_es_fertility - adapted_en_fertility
    adapted_gap_pct = (adapted_gap / adapted_en_fertility) * 100

    print(
        "\n--- ADAPTED ---",
        f"  English fertility:  {adapted_en_fertility:.4f} tokens/word",
        f"  Spanish fertility:  {adapted_es_fertility:.4f} tokens/word",
        f"  Fertility gap:      {adapted_gap:+.4f} ({adapted_gap_pct:+.1f}% overhead)",
        sep="\n",
    )

    # How much of the baseline Spanish-English gap the adaptation closed.
    gap_reduction = baseline_gap - adapted_gap
    if baseline_gap != 0:
        gap_reduction_pct = (gap_reduction / baseline_gap) * 100
    else:
        gap_reduction_pct = 0

    print(
        "\n--- IMPROVEMENT ---",
        f"  Fertility gap reduction: {gap_reduction:.4f} ({gap_reduction_pct:.1f}% reduction)",
        sep="\n",
    )
    # Speedup estimate is the ratio of Spanish token counts per word,
    # baseline over adapted, expressed as a percentage gain.
    print(f"  Estimated inference speedup: {(baseline_es_fertility / adapted_es_fertility - 1) * 100:.1f}%")


FERTILITY GAP ANALYSIS

Goal: Spanish fertility approaching English (~1.4 tokens/word)

--- BASELINE ---
  English fertility:  1.5276 tokens/word
  Spanish fertility:  2.0591 tokens/word
  Fertility gap:      +0.5315 (+34.8% overhead)

--- ADAPTED ---
  English fertility:  1.5612 tokens/word
  Spanish fertility:  1.8739 tokens/word
  Fertility gap:      +0.3127 (+20.0% overhead)

--- IMPROVEMENT ---
  Fertility gap reduction: 0.2188 (41.2% reduction)
  Estimated inference speedup: 9.9%


In [13]:
# Cell 13: Generate LaTeX Tables
# ===============================

def generate_latex_table():
    """Print a LaTeX table of mean tokenizer metrics on the Spanish corpus.

    Reads the notebook-level ``baseline_spanish_df`` and, when
    ``HAS_ADAPTED`` is truthy, ``adapted_spanish_df``. A "Baseline" row is
    always printed; with an adapted tokenizer an "Adapted" row and a
    relative-change row (percent of the baseline mean, UNK shown as ``--``)
    follow. The table uses ``\\toprule``/``\\midrule``/``\\bottomrule`` rules.
    Everything is written to stdout; nothing is returned.
    """
    columns = ('fertility', 'compression_ratio', 'pcw', 'unk_rate', 'strr')

    def column_means(frame):
        # Mean of every reported metric, in table-column order.
        return [frame[col].mean() for col in columns]

    def value_row(label, means):
        # UNK gets four decimals, every other metric three.
        fert, comp, pcw, unk, strr = means
        return f"{label} & {fert:.3f} & {comp:.3f} & {pcw:.3f} & {unk:.4f} & {strr:.3f} \\\\"

    def pct_change(new, old):
        # Relative change in percent; a zero baseline yields 0, not an error.
        return ((new - old) / old) * 100 if old != 0 else 0

    rule = "=" * 70
    print("\n" + rule, "LATEX TABLE OUTPUT", rule, sep="\n")

    # Table 1: preamble and header row
    print(
        "\n% Table 1: Tokenizer Efficiency Metrics",
        r"\begin{table}[h]",
        r"\centering",
        r"\caption{Tokenizer Efficiency Analysis on Spanish Corpus}",
        r"\label{tab:tokenizer-efficiency}",
        r"\begin{tabular}{lccccc}",
        r"\toprule",
        r"Model & Fertility$\downarrow$ & Compression$\uparrow$ & PCW$\downarrow$ & UNK$\downarrow$ & STRR$\uparrow$ \\",
        r"\midrule",
        sep="\n",
    )

    baseline_means = column_means(baseline_spanish_df)
    print(value_row("Baseline", baseline_means))

    if HAS_ADAPTED:
        adapted_means = column_means(adapted_spanish_df)
        print(value_row("Adapted", adapted_means))

        # Relative-change row; no delta is computed for the UNK column.
        print(r"\midrule")
        delta_cells = [
            "--" if col == 'unk_rate' else f"{pct_change(new, old):+.1f}\\%"
            for col, new, old in zip(columns, adapted_means, baseline_means)
        ]
        print("$\\Delta$ & " + " & ".join(delta_cells) + " \\\\")

    print(r"\bottomrule", r"\end{tabular}", r"\end{table}", sep="\n")


# Emit the LaTeX table, then the end-of-notebook banner.
generate_latex_table()

print("\n" + "=" * 70, "TOKENIZER EFFICIENCY ANALYSIS COMPLETE", "=" * 70, sep="\n")



LATEX TABLE OUTPUT

% Table 1: Tokenizer Efficiency Metrics
\begin{table}[h]
\centering
\caption{Tokenizer Efficiency Analysis on Spanish Corpus}
\label{tab:tokenizer-efficiency}
\begin{tabular}{lccccc}
\toprule
Model & Fertility$\downarrow$ & Compression$\uparrow$ & PCW$\downarrow$ & UNK$\downarrow$ & STRR$\uparrow$ \\
\midrule
Baseline & 2.059 & 3.169 & 0.588 & 0.0000 & 0.412 \\
Adapted & 1.874 & 3.501 & 0.572 & 0.0000 & 0.428 \\
\midrule
$\Delta$ & -9.0\% & +10.5\% & -2.7\% & -- & +3.9\% \\
\bottomrule
\end{tabular}
\end{table}

TOKENIZER EFFICIENCY ANALYSIS COMPLETE
