# Full Model Evaluation - Baseline (Pythia-1B)

**TokAlign ACL Publication-Grade Evaluation Framework - Notebook 2/3**

Comprehensive evaluation of `EleutherAI/pythia-1b` covering:
- **Section A**: Perplexity (Spanish + English catastrophic forgetting check)
- **Section B**: Downstream NLU Tasks (Belebele, SIB-200, XCOPA, XNLI)
- **Section C**: Machine Translation (FLORES-200, BLEU/chrF++/COMET)
- **Section D**: Generation Quality (Distinct-N, repetition, Self-BLEU)
- **Section E**: Computational Efficiency (throughput, latency, VRAM)

## References
- Belebele (Bandarkar et al., ACL 2024) - 221 citations
- SIB-200 (Adelani et al., EACL 2023) - 118 citations
- CVA Study (Yamaguchi et al., EMNLP 2024) - 271.5% speedup
- Branch-and-Merge (Alexandrov et al., EMNLP 2024) - Catastrophic forgetting mitigation


In [None]:
# Cell 1: Configuration
# =====================
# Every tunable of the evaluation run is defined here; the remaining cells
# only read these names.

# === MODEL CONFIGURATION ===
MODEL_PATH = "EleutherAI/pythia-1b"
MODEL_NAME = "baseline"  # Tag embedded in every output file name

# === EVALUATION SETTINGS ===
# Sample budgets for the perplexity, generation and MT sections.
PERPLEXITY_SAMPLES_ES, PERPLEXITY_SAMPLES_EN = 2000, 1000
GENERATION_PROMPTS = 500
MT_SAMPLES = 1012  # FLORES-200 devtest size
# Efficiency benchmark: corpus size, number of repeated runs, warm-up size.
EFFICIENCY_SAMPLES, EFFICIENCY_RUNS, WARMUP_SAMPLES = 1000, 3, 100

# === BATCH SIZES ===
BATCH_SIZE_PPL, BATCH_SIZE_GEN = 16, 8
MAX_NEW_TOKENS = 128

# === OUTPUT ===
OUTPUT_DIR = "results"


def _result_path(kind: str, extension: str) -> str:
    """Return ``<OUTPUT_DIR>/<kind>_<MODEL_NAME>.<extension>``."""
    return f"{OUTPUT_DIR}/{kind}_{MODEL_NAME}.{extension}"


PPL_OUTPUT = _result_path("perplexity", "csv")
NLU_OUTPUT = _result_path("nlu_results", "json")
MT_OUTPUT = _result_path("mt_results", "csv")
GEN_OUTPUT = _result_path("generation", "csv")
EFFICIENCY_OUTPUT = _result_path("efficiency", "csv")

# === THRESHOLDS (per plan) ===
# Relative degradation on English that is still considered acceptable.
ENGLISH_PPL_DEGRADATION_THRESHOLD = 0.05  # <5% increase acceptable
ENGLISH_ACCURACY_DEGRADATION_THRESHOLD = 0.02  # <2% drop acceptable

RANDOM_SEED = 42


In [None]:
# Cell 2: Install Dependencies
!pip install transformers datasets accelerate pandas numpy scipy tqdm pynvml -q
!pip install sacrebleu unbabel-comet lm-eval -q
!pip install flash-attn --no-build-isolation -q


In [None]:
# Cell 3: Imports and GPU Validation
import os
import json
import time
import torch
import numpy as np
import pandas as pd
import pynvml
from scipy import stats
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from typing import Dict, List, Tuple, Optional
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# GPU Validation
# The whole notebook moves tensors to "cuda", so stop immediately without one.
assert torch.cuda.is_available(), "CUDA is not available - GPU required"
compute_capability = torch.cuda.get_device_capability()
for _label, _value in (
    ("GPU", torch.cuda.get_device_name(0)),
    ("Compute Capability", f"{compute_capability[0]}.{compute_capability[1]}"),
    ("CUDA Version", torch.version.cuda),
    ("PyTorch Version", torch.__version__),
):
    print(f"{_label}: {_value}")

# Initialize VRAM monitoring (NVML handle for device index 0, used by get_vram_mb)
pynvml.nvmlInit()
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)

# Set random seed for NumPy and PyTorch
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)

# Create output directory (no error if it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"\nOutput directory: {OUTPUT_DIR}")


In [None]:
# Cell 4: Load Model
# ==================
import importlib.util

print(f"Loading model: {MODEL_PATH}")

# Flash Attention 2 is only usable when BOTH conditions hold:
#   * the GPU reports compute capability >= 8, and
#   * the `flash_attn` package is actually importable.
# Cell 2 installs flash-attn with `-q`, so a failed build is easy to miss;
# checking for the package here avoids a hard failure inside from_pretrained.
gpu_supports_flash = compute_capability[0] >= 8
flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
use_flash_attn = gpu_supports_flash and flash_attn_installed
if gpu_supports_flash and not flash_attn_installed:
    print("flash-attn is not installed; falling back to eager attention")
attn_impl = "flash_attention_2" if use_flash_attn else "eager"
print(f"Attention implementation: {attn_impl}")

# Load model with optimized settings
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    attn_implementation=attn_impl,
)
model.eval()  # inference only: disables dropout

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# The tokenizer is given EOS as its padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
# Left padding keeps every prompt flush against the generated continuation.
# NOTE: this setting is global, so the perplexity cells also receive
# left-padded batches.
tokenizer.padding_side = "left"  # Required for batched generation

print(f"\nModel loaded successfully on {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")
print(f"Vocab size: {tokenizer.vocab_size}")


## Section A: Perplexity Evaluation

Evaluates model perplexity on:
1. **Spanish (Target Language)**: OSCAR-2301 Spanish (documents streamed from the `train` split; Cell 6 does not load a separate validation split)
2. **English (Catastrophic Forgetting)**: WikiText-2 validation

Threshold: <5% PPL increase on English acceptable per Branch-and-Merge methodology.


In [None]:
# Cell 5: Perplexity Functions
# =============================

@torch.no_grad()
def calculate_batch_perplexity(texts: List[str], model, tokenizer, max_length: int = 512) -> List[Dict]:
    """
    Compute per-sequence perplexity for a batch of texts.

    Args:
        texts: Non-empty list of strings to score.
        model: Causal LM called as ``model(input_ids=..., attention_mask=...)``
            and returning an object with ``.logits``. Its ``.device``
            attribute, when present, decides where the batch is placed;
            otherwise ``"cuda"`` is used.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors. Either padding side is supported.
        max_length: Truncation length in tokens.

    Returns:
        One dict per input text, in input order, with keys ``text`` (first
        200 characters), ``perplexity``, ``num_tokens`` (number of scored
        next-token predictions, reported as at least 1) and ``cross_entropy``
        (mean loss in nats over the scored tokens).
    """
    device = getattr(model, "device", "cuda")

    # Tokenize batch with padding
    encodings = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(device)

    input_ids = encodings.input_ids
    attention_mask = encodings.attention_mask

    # Forward pass
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Position t predicts token t+1. Upcast to float32 so the per-token losses
    # and their sum are not rounded to the model's half-precision dtype.
    shift_logits = logits[:, :-1, :].float().contiguous()
    shift_labels = input_ids[:, 1:].contiguous()

    # A prediction is scored only when BOTH the context position and the
    # target position are real tokens. Checking the target alone is wrong for
    # left-padded batches: the first real token would be "predicted" from a
    # padding position and its loss would be counted.
    scored = (attention_mask[:, 1:] * attention_mask[:, :-1]).bool()

    # Per-token cross entropy
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
    losses = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    losses = losses.view(shift_labels.size())

    # Zero out unscored positions. masked_fill (rather than multiplying by the
    # mask) keeps a NaN/inf at a padded position from poisoning the sum.
    losses = losses.masked_fill(~scored, 0.0)

    # clamp avoids 0/0 for sequences with fewer than two real tokens; such a
    # sequence gets mean loss 0 and perplexity 1.
    seq_lengths = scored.sum(dim=1).clamp(min=1)
    mean_losses = losses.sum(dim=1) / seq_lengths

    # Convert to perplexity
    perplexities = torch.exp(mean_losses)

    # Move each result tensor to the host once instead of once per row.
    ppl_values = perplexities.cpu().tolist()
    loss_values = mean_losses.cpu().tolist()
    token_counts = seq_lengths.cpu().tolist()

    return [
        {
            'text': text[:200],
            'perplexity': ppl,
            'num_tokens': int(n_tokens),
            'cross_entropy': loss,
        }
        for text, ppl, n_tokens, loss in zip(texts, ppl_values, token_counts, loss_values)
    ]


def evaluate_perplexity(texts: List[str], model, tokenizer, batch_size: int, desc: str) -> pd.DataFrame:
    """
    Score a corpus batch by batch and collect the per-text results.

    The corpus is cut into consecutive slices of ``batch_size`` texts; empty
    or whitespace-only texts are dropped from each slice before scoring, and a
    slice with nothing left is skipped. ``desc`` labels the progress bar.
    Returns one DataFrame row per scored text.
    """
    rows = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts, desc=desc):
        chunk = [t for t in texts[start:start + batch_size] if t and t.strip()]
        if not chunk:
            continue
        rows.extend(calculate_batch_perplexity(chunk, model, tokenizer))

    return pd.DataFrame(rows)


print("Perplexity functions defined.")


In [None]:
# Cell 6: Load Perplexity Datasets
# =================================

print("Loading perplexity evaluation datasets...")

# Spanish: OSCAR-2301
print("\n1. Loading Spanish OSCAR...")
oscar_es = load_dataset(
    "oscar-corpus/OSCAR-2301",
    "es",
    split="train",
    streaming=True,  # iterate lazily instead of materialising the split
    trust_remote_code=True
)

# Collect PERPLEXITY_SAMPLES_ES *accepted* documents. The budget is checked
# against the number kept, not the number examined, so filtered-out short
# documents no longer shrink the sample. This matches the English path below,
# which also filters first and then takes the first N.
spanish_texts = []
for sample in oscar_es:
    if len(spanish_texts) >= PERPLEXITY_SAMPLES_ES:
        break
    text = sample.get('text', '')
    if text and len(text) > 100:  # Filter short texts
        spanish_texts.append(text[:1000])  # Truncate very long texts (by characters)

print(f"   Loaded {len(spanish_texts)} Spanish samples")

# English: WikiText-2
print("\n2. Loading English WikiText-2...")
wikitext = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")
# Keep rows longer than 100 characters, then take the first N of those.
english_texts = [t for t in wikitext['text'] if t and len(t) > 100][:PERPLEXITY_SAMPLES_EN]
print(f"   Loaded {len(english_texts)} English samples")


In [None]:
# Cell 7: Run Perplexity Evaluation
# ==================================
# Scores both corpora, tags every row with its language code and writes the
# combined table to PPL_OUTPUT.

print("\n".join(["=" * 70, "PERPLEXITY EVALUATION", "=" * 70]))

# Target language
print("\n--- Spanish (Target Language) ---")
spanish_ppl_df = evaluate_perplexity(
    texts=spanish_texts,
    model=model,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE_PPL,
    desc="Spanish PPL",
)

# Source language, used as the catastrophic-forgetting reference
print("\n--- English (Catastrophic Forgetting Check) ---")
english_ppl_df = evaluate_perplexity(
    texts=english_texts,
    model=model,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE_PPL,
    desc="English PPL",
)

# Label each frame in place, then stack them into a single table
for _frame, _code in ((spanish_ppl_df, 'es'), (english_ppl_df, 'en')):
    _frame['language'] = _code
ppl_df = pd.concat([spanish_ppl_df, english_ppl_df], ignore_index=True)
ppl_df.to_csv(PPL_OUTPUT, index=False)
print(f"\nPerplexity results saved to: {PPL_OUTPUT}")


In [None]:
# Cell 8: Perplexity Summary Statistics
# ======================================

def ppl_summary(df: pd.DataFrame, lang: str) -> Dict:
    """
    Summarise the per-sample perplexities of one language.

    Selects the rows of ``df`` whose ``language`` column equals ``lang`` and
    returns their mean, median, population standard deviation, the
    5/25/50/75/95th percentiles, min, max and row count.
    """
    values = df.loc[df['language'] == lang, 'perplexity'].to_numpy()

    summary = {
        'language': lang,
        'mean': np.mean(values),
        'median': np.median(values),
        'std': np.std(values),
    }
    for q in (5, 25, 50, 75, 95):
        summary[f'p{q}'] = np.percentile(values, q)
    summary['min'] = np.min(values)
    summary['max'] = np.max(values)
    summary['n_samples'] = len(values)
    return summary


def _print_ppl_block(title: str, summary: Dict) -> None:
    """Print the mean/median/std/percentile lines for one language summary."""
    print(f"\n--- {title} ---")
    print(f"  Mean PPL:   {summary['mean']:.2f}")
    print(f"  Median PPL: {summary['median']:.2f}")
    print(f"  Std PPL:    {summary['std']:.2f}")
    percentile_text = ", ".join(f"P{q}={summary[f'p{q}']:.2f}" for q in (5, 25, 50, 75, 95))
    print(f"  Percentiles: {percentile_text}")


print("\n" + "=" * 70)
print("PERPLEXITY SUMMARY")
print("=" * 70)

# Spanish summary
es_summary = ppl_summary(ppl_df, 'es')
_print_ppl_block("SPANISH", es_summary)

# English summary
en_summary = ppl_summary(ppl_df, 'en')
_print_ppl_block("ENGLISH", en_summary)

# Kept as a named value: the English mean is the reference number for the
# catastrophic-forgetting comparison.
BASELINE_ENGLISH_PPL = en_summary['mean']
print(f"\n[BASELINE] English Mean PPL: {BASELINE_ENGLISH_PPL:.2f} (reference for catastrophic forgetting)")


## Section B: Downstream NLU Tasks

Using lm-evaluation-harness for standardized benchmarks:
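- **Belebele** (spa_Latn + eng_Latn): Reading comprehension, 221 citations
- **SIB-200** (spa_Latn): Topic classification, 118 citations (currently commented out in the Cell 9 task list)  
- **XCOPA** (et/translation_es): Commonsense reasoning (currently commented out in the Cell 9 task list)
- **XNLI** (es): Natural language inference (not yet present in the Cell 9 task list)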

English regression tests: arc_easy, hellaswag, lambada_openai


In [None]:
# Cell 9: NLU Evaluation via lm-evaluation-harness
# ================================================

# Note: lm-eval tasks are run via command line for best compatibility
# This cell defines the task lists and prints the equivalent CLI command;
# Cell 10 runs the same tasks through the Python API.

print("=" * 70)
print("NLU EVALUATION (lm-evaluation-harness)")
print("=" * 70)

# Define tasks to evaluate.
# Belebele task names keep the capitalised script suffix of the language code
# (`spa_Latn`, `eng_Latn`); the all-lowercase spelling is not a registered
# task and would be reported as [SKIP] by Cell 10.
SPANISH_TASKS = [
    "belebele_spa_Latn",  # Reading comprehension
    # "sib200_spa_latn",    # Topic classification (if available)
    # "xcopa_translation-es",  # Commonsense (if available)
]

ENGLISH_TASKS = [
    "belebele_eng_Latn",  # Reading comprehension (English baseline)
    "arc_easy",           # Regression test
    "hellaswag",          # Regression test
    "lambada_openai",     # Regression test
]

ALL_TASKS = SPANISH_TASKS + ENGLISH_TASKS

print("\nTasks to evaluate:")
for task in ALL_TASKS:
    print(f"  - {task}")

# Generate lm-eval command (the output path is NLU_OUTPUT without ".json")
lm_eval_cmd = f"""
# Run this command in terminal:
lm_eval --model hf \\
    --model_args pretrained={MODEL_PATH},dtype=bfloat16 \\
    --tasks {','.join(ALL_TASKS)} \\
    --batch_size auto \\
    --output_path {NLU_OUTPUT.replace('.json', '')}
"""

print("\n" + "=" * 70)
print("LM-EVAL COMMAND")
print("=" * 70)
print(lm_eval_cmd)


In [None]:
# Cell 10: Run lm-eval (alternative: programmatic API)
# ====================================================
# Each task is evaluated exactly once, inside its own try/except, and the
# per-task outputs are merged into a single `nlu_results` dict. (Probing every
# task with a full run and then re-running all of them together would double
# the evaluation time.) A task that fails is skipped without discarding the
# results of the others.

try:
    from lm_eval import evaluator
    from lm_eval.models.huggingface import HFLM

    print("Running lm-evaluation-harness programmatically...")

    # Create model wrapper using our already-loaded model
    lm = HFLM(
        pretrained=model,
        tokenizer=tokenizer,
        batch_size="auto"
    )

    # Note: Some tasks may not be available in all lm-eval versions
    nlu_results = {}
    available_tasks = []
    for task in ALL_TASKS:
        try:
            task_results = evaluator.simple_evaluate(
                model=lm,
                tasks=[task],
                batch_size="auto",
                log_samples=False
            )
        except Exception as e:
            print(f"  [SKIP] {task}: {str(e)[:50]}...")
            continue

        if task_results is None:
            print(f"  [SKIP] {task}: no results returned...")
            continue

        # Dict-valued sections are keyed by task name (e.g. 'results'), so
        # they are merged; scalar metadata keeps the latest value.
        for key, value in task_results.items():
            if isinstance(value, dict) and isinstance(nlu_results.get(key, {}), dict):
                nlu_results.setdefault(key, {}).update(value)
            else:
                nlu_results[key] = value

        available_tasks.append(task)
        print(f"  [OK] {task}")

    if available_tasks:
        # Save results (default=str stringifies values json cannot encode)
        with open(NLU_OUTPUT, 'w', encoding='utf-8') as f:
            json.dump(nlu_results, f, indent=2, default=str)
        print(f"\nNLU results saved to: {NLU_OUTPUT}")
    else:
        print("\nNo tasks available. Run lm-eval command manually.")
        nlu_results = None

except ImportError:
    print("lm-eval not installed. Run the command above manually.")
    nlu_results = None
except Exception as e:
    # Best-effort cell: report the problem and let the notebook continue.
    print(f"Error running lm-eval: {e}")
    print("Run the command above manually.")
    nlu_results = None


In [None]:
# Cell 11: Parse and Display NLU Results
# ======================================

def parse_nlu_results(results_path: str) -> pd.DataFrame:
    """
    Load an lm-eval results JSON file as a table with one row per task.

    Reads the top-level ``results`` mapping; every row holds the task name
    under ``task`` plus all of that task's numeric (int/float) metrics.
    Any failure while reading or parsing is printed and an empty DataFrame is
    returned instead of raising.
    """
    try:
        with open(results_path, 'r') as handle:
            payload = json.load(handle)

        records = [
            {
                'task': task_name,
                **{name: score for name, score in scores.items()
                   if isinstance(score, (int, float))},
            }
            for task_name, scores in payload.get('results', {}).items()
        ]
        return pd.DataFrame(records)
    except Exception as e:
        print(f"Could not parse results: {e}")
        return pd.DataFrame()


# Print one accuracy line per task from the in-memory lm-eval results and
# collect them into `nlu_df`.
if nlu_results:
    print("\n" + "=" * 70)
    print("NLU RESULTS SUMMARY")
    print("=" * 70)

    results_df = []
    for task, metrics in nlu_results.get('results', {}).items():
        # Try the "<metric>,none" keys first, then plain 'acc', falling back
        # to the placeholder 'N/A'.
        acc = metrics.get('acc,none', metrics.get('acc_norm,none', metrics.get('acc', 'N/A')))
        acc_stderr = metrics.get('acc_stderr,none', metrics.get('acc_norm_stderr,none', 'N/A'))
        print(f"\n{task}:")
        if isinstance(acc, float):
            # The stderr may be missing or a placeholder string even when the
            # accuracy is numeric, so it is formatted separately; applying
            # ':.4f' to a string would raise ValueError.
            stderr_text = f"{acc_stderr:.4f}" if isinstance(acc_stderr, float) else str(acc_stderr)
            print(f"  Accuracy: {acc:.4f} (± {stderr_text})")
        else:
            print(f"  Accuracy: {acc}")
        results_df.append({
            'task': task,
            'accuracy': acc,
            'stderr': acc_stderr
        })

    nlu_df = pd.DataFrame(results_df)
    print("\n" + nlu_df.to_string(index=False))
else:
    print("NLU results not available. Run lm-eval manually and reload results.")


## Section C: Machine Translation Evaluation

FLORES-200 style evaluation:
- Direction: Spanish → English
- Metrics: BLEU, chrF++, COMET, TER (per WMT standards)


In [None]:
# Cell 12: MT Evaluation Functions
# =================================

@torch.no_grad()
def generate_translations(spanish_texts: List[str], model, tokenizer, 
                          max_new_tokens: int = 128, batch_size: int = 8) -> List[str]:
    """
    Translate Spanish sentences to English with greedy decoding.

    Every source sentence is wrapped in a ``Spanish: ...\\nEnglish:`` prompt
    and the batch is decoded without sampling. Only the text generated after
    the prompt is kept, cut at its first newline and stripped. Returns one
    translation per input, in input order.

    The continuation is isolated by slicing at the padded prompt width, which
    relies on the tokenizer padding on the left (as configured in Cell 4).
    """
    translated: List[str] = []
    batch_starts = range(0, len(spanish_texts), batch_size)

    for start in tqdm(batch_starts, desc="Translating"):
        sources = spanish_texts[start:start + batch_size]
        prompts = [f"Spanish: {text}\nEnglish:" for text in sources]

        encoded = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to("cuda")

        output_ids = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        # Everything past the prompt width is newly generated text
        prompt_width = encoded.input_ids.shape[1]
        decoded = tokenizer.batch_decode(
            output_ids[:, prompt_width:],
            skip_special_tokens=True
        )

        # Keep the first line only; split() returns the whole string when
        # there is no newline
        translated.extend(piece.split("\n")[0].strip() for piece in decoded)

    return translated


print("MT evaluation functions defined.")


In [None]:
# Cell 13: Load MT Dataset and Generate Translations
# ===================================================

print("=" * 70)
print("MACHINE TRANSLATION EVALUATION")
print("=" * 70)

# Try to load FLORES-200
print("\nLoading translation dataset...")
try:
    # trust_remote_code matches the OSCAR loads in this notebook. Without it
    # the load can fail and the except branch below would quietly replace the
    # benchmark with a different corpus.
    flores = load_dataset(
        "facebook/flores",
        "spa_Latn-eng_Latn",
        split="devtest",
        trust_remote_code=True,
    )
    flores = flores.select(range(min(MT_SAMPLES, len(flores))))
    spanish_sources = [x['sentence_spa_Latn'] for x in flores]
    english_references = [x['sentence_eng_Latn'] for x in flores]
    MT_DATASET = "FLORES-200"
    print(f"Loaded FLORES-200: {len(spanish_sources)} samples")
except Exception as e:
    # Best-effort fallback so the rest of the notebook can still run.
    print(f"FLORES-200 not available: {e}")
    print("Falling back to globalvoices dataset...")
    gv = load_dataset("alvations/globalvoices-en-es", split="train")
    gv = gv.shuffle(seed=RANDOM_SEED).select(range(min(MT_SAMPLES, len(gv))))
    spanish_sources = [x['es'] for x in gv]
    english_references = [x['en'] for x in gv]
    MT_DATASET = "GlobalVoices"
    print(f"Loaded GlobalVoices: {len(spanish_sources)} samples")
    print("WARNING: MT scores from this fallback are not FLORES-200 scores.")

# Generate translations
print(f"\nGenerating translations ({MT_DATASET})...")
hypotheses = generate_translations(spanish_sources, model, tokenizer, MAX_NEW_TOKENS, BATCH_SIZE_GEN)
print(f"Generated {len(hypotheses)} translations")


In [None]:
# Cell 14: Compute MT Metrics
# ============================

# The metric classes are used instead of the corpus_* convenience functions
# because the signature needed for reproducibility is provided by the metric
# object (`get_signature()`), not by the score object that scoring returns.
from sacrebleu.metrics import BLEU, CHRF, TER

print("\nComputing MT metrics...")

# BLEU (with signature for reproducibility); the signature is read after
# scoring so it reflects the references that were actually used.
bleu_metric = BLEU()
bleu_result = bleu_metric.corpus_score(hypotheses, [english_references])
bleu_score = bleu_result.score
bleu_signature = bleu_metric.get_signature()

# chrF++ (word_order=2 for chrF++)
chrf_result = CHRF(word_order=2).corpus_score(hypotheses, [english_references])
chrf_score = chrf_result.score

# TER
ter_result = TER().corpus_score(hypotheses, [english_references])
ter_score = ter_result.score

print(f"\n--- MT Metrics ---")
print(f"BLEU:   {bleu_score:.2f}")
print(f"chrF++: {chrf_score:.2f}")
print(f"TER:    {ter_score:.2f}")
print(f"\nBLEU signature: {bleu_signature}")

# Try COMET if available. This is deliberately best-effort: any failure
# (package missing, model download, GPU memory) leaves comet_score as None.
try:
    from comet import download_model, load_from_checkpoint

    print("\nComputing COMET score...")
    comet_model_path = download_model("Unbabel/wmt22-comet-da")
    comet_model = load_from_checkpoint(comet_model_path)

    # COMET scores each (source, hypothesis, reference) triple
    comet_data = [
        {"src": src, "mt": hyp, "ref": ref}
        for src, hyp, ref in zip(spanish_sources, hypotheses, english_references)
    ]
    comet_output = comet_model.predict(comet_data, batch_size=8, gpus=1)
    comet_score = comet_output.system_score
    print(f"COMET:  {comet_score:.4f}")
except Exception as e:
    print(f"COMET not available: {e}")
    comet_score = None


In [None]:
# Cell 15: Save MT Results
# =========================

# Per-sentence table: source, reference and system output side by side
mt_results = {
    'source': spanish_sources,
    'reference': english_references,
    'hypothesis': hypotheses,
}
mt_df = pd.DataFrame(mt_results)
mt_df.to_csv(MT_OUTPUT, index=False)

# Summary metrics
mt_summary = {
    'bleu': bleu_score,
    'chrf++': chrf_score,
    'ter': ter_score,
    'comet': comet_score,
    'bleu_signature': str(bleu_signature),
    'n_samples': len(hypotheses)
}

# Persist the corpus-level scores next to the per-sentence CSV; otherwise
# they would exist only in the printed cell output.
MT_SUMMARY_OUTPUT = MT_OUTPUT.replace('.csv', '_summary.json')
with open(MT_SUMMARY_OUTPUT, 'w', encoding='utf-8') as f:
    json.dump(mt_summary, f, indent=2, default=str)

print(f"\nMT results saved to: {MT_OUTPUT}")
print(f"MT summary saved to: {MT_SUMMARY_OUTPUT}")
print("\n" + "=" * 70)
print("MT SUMMARY")
print("=" * 70)
print(f"BLEU:   {bleu_score:.2f}")
print(f"chrF++: {chrf_score:.2f}")
print(f"TER:    {ter_score:.2f}")
# Compare against None explicitly: a score of exactly 0.0 is falsy but is
# still a result that must be shown.
if comet_score is not None:
    print(f"COMET:  {comet_score:.4f}")


## Section D: Generation Quality Analysis

Metrics per HelloBench and Contrastive Decoding papers:
- **Distinct-1/2/3**: Unique n-gram ratios (diversity)
- **Repetition Rate**: % repeated 4-grams (degeneration)
- **Self-BLEU**: Corpus-level diversity (lower = more diverse)
- **Length**: Average output length in whitespace-separated words (reported as `avg_length`; no ratio against an expected length is computed)


In [None]:
# Cell 16: Generation Quality Functions
# ======================================

def get_ngrams(text: str, n: int) -> List[Tuple]:
    """
    Return every run of ``n`` consecutive whitespace-separated tokens in
    ``text`` as a tuple, in order of appearance. Texts with fewer than ``n``
    tokens yield an empty list.
    """
    words = text.split()
    last_start = len(words) - n
    if last_start < 0:
        return []
    return [tuple(words[start:start + n]) for start in range(last_start + 1)]


def distinct_n(texts: List[str], n: int) -> float:
    """
    Distinct-N over the whole collection: number of unique n-grams divided by
    the total number of n-grams, pooled across all texts.
    Higher = more diverse generation. Returns 0.0 when there are no n-grams.
    """
    pooled = [gram for text in texts for gram in get_ngrams(text, n)]
    if not pooled:
        return 0.0
    return len(set(pooled)) / len(pooled)


def repetition_rate(texts: List[str], n: int = 4) -> float:
    """
    Repetition rate: share of distinct n-grams that occur more than once.

    N-grams are pooled across all texts, so an n-gram counts as repeated
    whether it recurs inside one text or in two different texts. The
    denominator is the number of distinct n-grams, not the total count.
    Lower = less repetitive (better). Returns 0.0 when there are no n-grams.
    """
    counts = Counter(gram for text in texts for gram in get_ngrams(text, n))
    if not counts:
        return 0.0

    repeated_types = sum(1 for freq in counts.values() if freq > 1)
    return repeated_types / len(counts)


def self_bleu(texts: List[str], sample_size: int = 100) -> float:
    """
    Self-BLEU: mean sentence-level BLEU of each text against other texts.
    Lower = more diverse corpus.

    When more than ``sample_size`` texts are given, a random subset of that
    size is drawn (without replacement, from NumPy's global RNG). Each
    non-empty text is scored against at most the first five of the remaining
    texts, not against all of them. Returns 0.0 for fewer than two texts or
    when nothing could be scored.
    """
    from sacrebleu import sentence_bleu

    if len(texts) < 2:
        return 0.0

    # Sample for efficiency
    if len(texts) > sample_size:
        picked = np.random.choice(len(texts), sample_size, replace=False)
        texts = [texts[k] for k in picked]

    scores = []
    for idx, hypothesis in enumerate(texts):
        others = [candidate for k, candidate in enumerate(texts) if k != idx]
        if not (others and hypothesis):
            continue
        # Use up to 5 refs
        scores.append(sentence_bleu(hypothesis, others[:5]).score)

    if not scores:
        return 0.0
    return np.mean(scores)


def analyze_generation_quality(texts: List[str]) -> Dict:
    """
    Run every generation-quality metric on the non-empty texts.

    Empty and whitespace-only texts are removed first. The result holds
    Distinct-1/2/3, the 4-gram repetition rate, Self-BLEU, the mean length in
    whitespace-separated words and the number of texts that were kept.
    """
    kept = [t for t in texts if t and t.strip()]

    report = {f'distinct_{order}': distinct_n(kept, order) for order in (1, 2, 3)}
    report['repetition_rate_4gram'] = repetition_rate(kept, 4)
    report['self_bleu'] = self_bleu(kept)
    report['avg_length'] = np.mean([len(t.split()) for t in kept])
    report['n_samples'] = len(kept)
    return report


print("Generation quality functions defined.")


In [None]:
# Cell 17: Load Generation Prompts and Generate
# ==============================================

print("=" * 70)
print("GENERATION QUALITY ANALYSIS")
print("=" * 70)

# Load Spanish prompts from OSCAR
print("\nLoading Spanish prompts...")
oscar_prompts = load_dataset(
    "oscar-corpus/OSCAR-2301",
    "es",
    split="train",
    streaming=True,
    trust_remote_code=True
)

# Keep documents of 50-200 characters that end in sentence punctuation and
# stop as soon as enough prompts have been collected
prompts = []
for sample in oscar_prompts:
    text = sample.get('text', '')
    if not (50 <= len(text) <= 200 and text.endswith(('.', '!', '?'))):
        continue
    prompts.append(text)
    if len(prompts) >= GENERATION_PROMPTS:
        break

print(f"Collected {len(prompts)} Spanish prompts")


def _continue_prompts(prompt_list, desc, **decoding_kwargs):
    """
    Generate up to 64 new tokens for every prompt, BATCH_SIZE_GEN at a time.

    ``decoding_kwargs`` is forwarded to ``model.generate`` and selects the
    decoding strategy. Returns only the decoded continuations.
    """
    continuations = []
    for start in tqdm(range(0, len(prompt_list), BATCH_SIZE_GEN), desc=desc):
        encoded = tokenizer(
            prompt_list[start:start + BATCH_SIZE_GEN],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256,
        ).to("cuda")

        with torch.no_grad():
            output_ids = model.generate(
                input_ids=encoded.input_ids,
                attention_mask=encoded.attention_mask,
                max_new_tokens=64,
                pad_token_id=tokenizer.eos_token_id,
                **decoding_kwargs,
            )

        # Drop the (padded) prompt columns so only new tokens are decoded
        prompt_width = encoded.input_ids.shape[1]
        continuations.extend(
            tokenizer.batch_decode(output_ids[:, prompt_width:], skip_special_tokens=True)
        )
    return continuations


# Generate with greedy decoding
print("\nGenerating with greedy decoding...")
greedy_outputs = _continue_prompts(prompts, "Greedy generation", do_sample=False)

# Generate with nucleus sampling
print("\nGenerating with nucleus sampling (p=0.9)...")
nucleus_outputs = _continue_prompts(
    prompts, "Nucleus generation", do_sample=True, top_p=0.9, temperature=1.0
)

print(f"\nGenerated {len(greedy_outputs)} greedy and {len(nucleus_outputs)} nucleus samples")


In [None]:
# Cell 18: Compute Generation Quality Metrics
# ============================================

print("\nComputing generation quality metrics...")

greedy_metrics = analyze_generation_quality(greedy_outputs)
nucleus_metrics = analyze_generation_quality(nucleus_outputs)

print("\n" + "=" * 70)
print("GENERATION QUALITY SUMMARY")
print("=" * 70)

print(f"\n{'Metric':<25} {'Greedy':>12} {'Nucleus (p=0.9)':>15}")
print("-" * 55)

# (row label, metric key, number format) for each line of the comparison table
_table_rows = (
    ('Distinct-1', 'distinct_1', '.4f'),
    ('Distinct-2', 'distinct_2', '.4f'),
    ('Distinct-3', 'distinct_3', '.4f'),
    ('Repetition Rate (4-gram)', 'repetition_rate_4gram', '.4f'),
    ('Self-BLEU', 'self_bleu', '.2f'),
    ('Avg Length (words)', 'avg_length', '.1f'),
)
for _label, _key, _fmt in _table_rows:
    print(f"{_label:<25} {greedy_metrics[_key]:>12{_fmt}} {nucleus_metrics[_key]:>15{_fmt}}")

# Save results: one row per prompt with both continuations
gen_results = pd.DataFrame({
    'prompt': prompts,
    'greedy_output': greedy_outputs,
    'nucleus_output': nucleus_outputs
})
gen_results.to_csv(GEN_OUTPUT, index=False)
print(f"\nGeneration results saved to: {GEN_OUTPUT}")


## Section E: Computational Efficiency

Protocol per CVA study:
- Warm-up: 100 samples discarded
- Measurement: 1000 samples, 3 runs
- Metrics: Tokens/sec, TTFT (Time to First Token), Peak VRAM


In [None]:
# Cell 19: Efficiency Benchmark Functions
# =======================================

def get_vram_mb() -> float:
    """Return the memory NVML reports as in use on ``gpu_handle``, in MiB."""
    bytes_per_mib = 1024 ** 2
    return pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).used / bytes_per_mib


def _timed_generate(model, encodings, max_new_tokens: int, pad_token_id) -> Tuple:
    """Run one greedy ``generate`` call; return ``(output_ids, seconds)``."""
    # Synchronize on both sides so the wall-clock interval covers exactly the
    # GPU work launched by this call.
    torch.cuda.synchronize()
    started = time.perf_counter()
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=encodings.input_ids,
            attention_mask=encodings.attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=pad_token_id,
        )
    torch.cuda.synchronize()
    return output_ids, time.perf_counter() - started


def _count_generated_tokens(new_tokens, eos_token_id) -> int:
    """Count new tokens up to and including each row's first EOS."""
    if eos_token_id is None:
        return new_tokens.numel()
    is_eos = new_tokens == eos_token_id
    # Positions strictly after a row's first EOS are padding, not output.
    after_first_eos = (is_eos.cumsum(dim=1) - is_eos.long()) > 0
    return int((~after_first_eos).sum().item())


def benchmark_throughput(texts: List[str], model, tokenizer, 
                         batch_size: int, max_new_tokens: int,
                         warmup: int = 100) -> Dict:
    """
    Benchmark greedy generation throughput on ``texts``.

    Args:
        texts: Prompts to generate from (truncated to 256 tokens).
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used for batching; its EOS id is used as the
            padding id during generation.
        batch_size: Number of prompts per ``generate`` call.
        max_new_tokens: Generation budget per prompt.
        warmup: Number of leading prompts generated once, untimed, before
            measurement. The timed pass then covers all of ``texts``.

    Returns:
        Dict with:
        - tokens_per_second: generated tokens / timed generation seconds.
          Tokens after a sequence's first EOS are padding and are not counted.
        - samples_per_second: prompts / timed generation seconds.
        - avg_ttft_ms: mean time to first token per batch, measured with a
          separate one-token ``generate`` call that is excluded from the
          throughput timing.
        - total_time_s, total_tokens: the raw totals behind the rates.
        - peak_vram_mb: peak memory allocated by PyTorch during the timed
          pass, in MiB.
        Rates are 0.0 when ``texts`` is empty.
    """
    pad_id = tokenizer.eos_token_id

    def _encode(batch):
        return tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True, max_length=256
        ).to("cuda")

    # Warm-up
    print(f"  Warming up ({warmup} samples)...")
    for i in range(0, min(warmup, len(texts)), batch_size):
        _timed_generate(model, _encode(texts[i:i+batch_size]), max_new_tokens, pad_id)
    torch.cuda.synchronize()

    # Reset peak memory so the warm-up does not contribute to the reading
    torch.cuda.reset_peak_memory_stats()

    # Benchmark
    total_tokens = 0
    total_time = 0.0
    ttft_times = []

    print(f"  Benchmarking ({len(texts)} samples)...")
    for i in tqdm(range(0, len(texts), batch_size), desc="  Throughput"):
        encodings = _encode(texts[i:i+batch_size])
        input_len = encodings.input_ids.shape[1]

        generated, batch_time = _timed_generate(model, encodings, max_new_tokens, pad_id)
        total_time += batch_time
        total_tokens += _count_generated_tokens(generated[:, input_len:], pad_id)

        # Time to first token: prompt processing plus a single decoding step.
        # Run after the throughput measurement so it cannot disturb it.
        _, first_token_time = _timed_generate(model, encodings, 1, pad_id)
        ttft_times.append(first_token_time)

    peak_vram = torch.cuda.max_memory_allocated() / (1024 ** 2)

    has_timing = total_time > 0
    return {
        'tokens_per_second': total_tokens / total_time if has_timing else 0.0,
        'samples_per_second': len(texts) / total_time if has_timing else 0.0,
        'avg_ttft_ms': float(np.mean(ttft_times)) * 1000 if ttft_times else 0.0,
        'total_time_s': total_time,
        'total_tokens': total_tokens,
        'peak_vram_mb': peak_vram
    }


print("Efficiency benchmark functions defined.")


In [None]:
# Cell 20: Run Efficiency Benchmark
# ==================================

print("=" * 70)
print("COMPUTATIONAL EFFICIENCY BENCHMARK")
print("=" * 70)

# Prepare benchmark texts (use Spanish texts for target language efficiency)
efficiency_texts = spanish_texts[:EFFICIENCY_SAMPLES]
print(f"\nBenchmark corpus: {len(efficiency_texts)} Spanish samples")

# Repeat the full benchmark so run-to-run variation can be reported
all_runs = []
for run_number in range(1, EFFICIENCY_RUNS + 1):
    print(f"\n--- Run {run_number}/{EFFICIENCY_RUNS} ---")
    run_results = benchmark_throughput(
        efficiency_texts, model, tokenizer,
        batch_size=BATCH_SIZE_GEN,
        max_new_tokens=64,
        warmup=WARMUP_SAMPLES
    )
    all_runs.append(run_results)
    print(f"  Tokens/sec: {run_results['tokens_per_second']:.1f}")
    print(f"  Peak VRAM: {run_results['peak_vram_mb']:.0f} MB")


def _across_runs(metric: str) -> List[float]:
    """Collect one metric from every completed run."""
    return [r[metric] for r in all_runs]


# Aggregate results: means over runs, population std for throughput, and the
# highest peak VRAM seen in any run
efficiency_summary = {
    'tokens_per_second_mean': np.mean(_across_runs('tokens_per_second')),
    'tokens_per_second_std': np.std(_across_runs('tokens_per_second')),
    'samples_per_second_mean': np.mean(_across_runs('samples_per_second')),
    'avg_ttft_ms': np.mean(_across_runs('avg_ttft_ms')),
    'peak_vram_mb': max(_across_runs('peak_vram_mb')),
    'n_samples': len(efficiency_texts),
    'n_runs': EFFICIENCY_RUNS,
    'batch_size': BATCH_SIZE_GEN
}

# Save results as a single-row CSV
efficiency_df = pd.DataFrame([efficiency_summary])
efficiency_df.to_csv(EFFICIENCY_OUTPUT, index=False)

print("\n" + "=" * 70)
print("EFFICIENCY SUMMARY")
print("=" * 70)
print("\n".join([
    f"Tokens/second:     {efficiency_summary['tokens_per_second_mean']:.1f} ± {efficiency_summary['tokens_per_second_std']:.1f}",
    f"Samples/second:    {efficiency_summary['samples_per_second_mean']:.2f}",
    f"Avg TTFT:          {efficiency_summary['avg_ttft_ms']:.2f} ms",
    f"Peak VRAM:         {efficiency_summary['peak_vram_mb']:.0f} MB",
]))
print(f"\nResults saved to: {EFFICIENCY_OUTPUT}")


In [None]:
# Cell 21: Final Summary and Cleanup
# ===================================
# Recaps where each result file was written and the headline numbers, then
# releases the NVML session opened in Cell 3.

print("\n" + "=" * 70)
print("EVALUATION COMPLETE")
print("=" * 70)

print(f"\nModel: {MODEL_PATH}")
print(f"Model Name: {MODEL_NAME}")

print("\n--- Output Files ---")
print("\n".join([
    f"  Perplexity:   {PPL_OUTPUT}",
    f"  NLU Results:  {NLU_OUTPUT}",
    f"  MT Results:   {MT_OUTPUT}",
    f"  Generation:   {GEN_OUTPUT}",
    f"  Efficiency:   {EFFICIENCY_OUTPUT}",
]))

print("\n--- Key Metrics Summary ---")
print("\n".join([
    f"  Spanish PPL (mean):      {es_summary['mean']:.2f}",
    f"  English PPL (mean):      {en_summary['mean']:.2f}",
    f"  MT BLEU:                 {bleu_score:.2f}",
    f"  MT chrF++:               {chrf_score:.2f}",
    f"  Distinct-2 (greedy):     {greedy_metrics['distinct_2']:.4f}",
    f"  Throughput (tok/s):      {efficiency_summary['tokens_per_second_mean']:.1f}",
    f"  Peak VRAM:               {efficiency_summary['peak_vram_mb']:.0f} MB",
]))

# Cleanup
pynvml.nvmlShutdown()
print("\n[BASELINE EVALUATION COMPLETE]")
