In [5]:
# Cell 1: Package Installation
!pip install transformers datasets accelerate pandas nvidia-ml-py flash-attn --no-build-isolation -q


In [6]:
# Cell 2: Imports and GPU Validation
import torch
import pandas as pd
import pynvml
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import time
from tqdm.auto import tqdm

# Hard check for H100/A100 GPU (compute capability >= 8.0 required for Flash Attention 2)
assert torch.cuda.is_available(), "CUDA is not available - H100 GPU required"
compute_capability = torch.cuda.get_device_capability()
assert compute_capability[0] >= 8, f"H100/A100 GPU required for Flash Attention 2 (got compute capability {compute_capability})"

# Initialize pynvml for VRAM monitoring
pynvml.nvmlInit()
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)

print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Compute Capability: {compute_capability[0]}.{compute_capability[1]}")
print(f"CUDA Version: {torch.version.cuda}")
print(f"PyTorch Version: {torch.__version__}")


GPU: NVIDIA H100 80GB HBM3
Compute Capability: 9.0
CUDA Version: 12.8
PyTorch Version: 2.7.0


In [7]:
# Cell 3: Model Loading (H100 Native)
MODEL_NAME = "EleutherAI/pythia-1b"

print(f"Loading {MODEL_NAME} with Flash Attention 2...")

# Load model with H100-optimized settings
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
    device_map="cuda",
    attn_implementation="flash_attention_2",
)
model.eval()

# Load tokenizer with proper padding configuration for batched generation
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Required for batched generation

print(f"Model loaded successfully on {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")
print(f"Tokenizer pad_token: '{tokenizer.pad_token}' (id: {tokenizer.pad_token_id})")
print(f"Tokenizer padding_side: {tokenizer.padding_side}")


Loading EleutherAI/pythia-1b with Flash Attention 2...
Model loaded successfully on cuda:0
Model dtype: torch.bfloat16
Tokenizer pad_token: '<|endoftext|>' (id: 0)
Tokenizer padding_side: left


In [8]:
# Cell 4: Data Pipeline with Quality Filtering

# === CONFIGURATION ===
BATCH_SIZE = 32  # Reduced for longer sequences
NUM_SAMPLES = 5000
MAX_NEW_TOKENS = 128  # Increased from 64 to avoid truncation

# Quality filter thresholds
MIN_CHARS = 30  # Minimum characters per text
MAX_CHARS = 500  # Maximum characters (avoid very long sequences)
MIN_WORDS = 5  # Minimum words per text

print("Loading globalvoices-en-es dataset...")
dataset = load_dataset("alvations/globalvoices-en-es", split="train")

# Inspection step: verify dataset structure
print(f"\nRaw dataset size: {len(dataset)}")
print(f"Dataset column names: {dataset.column_names}")
print(f"First sample: {dataset[0]}")

# Rename columns from en/es to spanish/english for consistency
def rename_columns(example):
    return {
        "spanish": example["es"],
        "english": example["en"]
    }

dataset = dataset.map(rename_columns, remove_columns=dataset.column_names)

# === QUALITY FILTERING ===
print(f"\n--- Applying Quality Filters ---")
print(f"Min chars: {MIN_CHARS}, Max chars: {MAX_CHARS}, Min words: {MIN_WORDS}")

def is_quality_sample(example):
    """Filter out degenerate samples."""
    es = example["spanish"]
    en = example["english"]
    
    # Length checks
    if len(es) < MIN_CHARS or len(en) < MIN_CHARS:
        return False
    if len(es) > MAX_CHARS or len(en) > MAX_CHARS:
        return False
    
    # Word count checks
    if len(es.split()) < MIN_WORDS or len(en.split()) < MIN_WORDS:
        return False
    
    # Content quality checks
    # Reject if mostly punctuation/special chars
    alpha_ratio_es = sum(c.isalpha() for c in es) / max(len(es), 1)
    alpha_ratio_en = sum(c.isalpha() for c in en) / max(len(en), 1)
    if alpha_ratio_es < 0.5 or alpha_ratio_en < 0.5:
        return False
    
    # Reject titles/headers (often end with · Global Voices)
    if es.strip().endswith("·") or en.strip().endswith("·"):
        return False
    
    return True

pre_filter_size = len(dataset)
dataset = dataset.filter(is_quality_sample)
post_filter_size = len(dataset)
filtered_out = pre_filter_size - post_filter_size

print(f"Samples before filtering: {pre_filter_size}")
print(f"Samples after filtering: {post_filter_size}")
print(f"Filtered out: {filtered_out} ({100*filtered_out/pre_filter_size:.1f}%)")

# Shuffle and select samples
dataset = dataset.shuffle(seed=42)
dataset = dataset.select(range(min(NUM_SAMPLES, len(dataset))))

print(f"\nFinal dataset size: {len(dataset)}")
print(f"Sample after filtering: {dataset[0]}")

# Show dataset statistics
spanish_lens = [len(x["spanish"]) for x in dataset]
english_lens = [len(x["english"]) for x in dataset]
print(f"\n--- Dataset Statistics ---")
print(f"Spanish chars: min={min(spanish_lens)}, max={max(spanish_lens)}, mean={sum(spanish_lens)/len(spanish_lens):.0f}")
print(f"English chars: min={min(english_lens)}, max={max(english_lens)}, mean={sum(english_lens)/len(english_lens):.0f}")

# Create a custom Dataset class for DataLoader
class TranslationDataset(Dataset):
    def __init__(self, hf_dataset):
        self.data = hf_dataset
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        return item["spanish"], item["english"]

# Create DataLoader
torch_dataset = TranslationDataset(dataset)
dataloader = DataLoader(
    torch_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False,
    num_workers=0,  # Avoid multiprocessing issues in notebook
    pin_memory=True
)

print(f"\nDataLoader created: {len(dataloader)} batches of size {BATCH_SIZE}")
print(f"Max new tokens for generation: {MAX_NEW_TOKENS}")


Loading globalvoices-en-es dataset...

Raw dataset size: 355136
Dataset column names: ['en', 'es']
First sample: {'en': 'Argentina: Stencil Graffiti · Global Voices ', 'es': 'Argentina: Graffitis '}

--- Applying Quality Filters ---
Min chars: 30, Max chars: 500, Min words: 5


Filter: 100%|██████████| 355136/355136 [00:05<00:00, 61675.58 examples/s]


Samples before filtering: 355136
Samples after filtering: 321373
Filtered out: 33763 (9.5%)

Final dataset size: 5000
Sample after filtering: {'spanish': '“Estos estados del CCG no son para nada competentes para lidiar con pedidos populares de libertad, sin dejar de mencionar gobierno democrático, porque ellos mismos son en su mayoría regímenes despóticos”, observó el Consejo de Coordinación de Yemen de la Revolución Juvenil de Cambio (CCYRC, por sus siglas en inglés). ', 'english': '“These GCC states are not at all competent to deal with popular requests for liberty and freedom, not to mention democratic government, because they themselves are mostly despotic regimes,” observed Yemen’s Coordinating Council of the Youth Revolution of Change (CCYRC). '}

--- Dataset Statistics ---
Spanish chars: min=30, max=492, mean=132
English chars: min=30, max=465, mean=122

DataLoader created: 157 batches of size 32
Max new tokens for generation: 128


In [9]:
# Cell 5: Benchmark Functions (Improved)

@torch.no_grad()
def calculate_batch_perplexity(texts: list[str], model, tokenizer) -> list[float]:
    """
    Vectorized perplexity calculation using cross-entropy loss.
    Uses teacher-forcing: compute loss on input tokens and convert to PPL via exp(loss).
    """
    # Tokenize batch with padding
    encodings = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to("cuda")
    
    input_ids = encodings.input_ids
    attention_mask = encodings.attention_mask
    
    # Forward pass
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    
    # Shift logits and labels for causal LM loss calculation
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = input_ids[:, 1:].contiguous()
    shift_mask = attention_mask[:, 1:].contiguous()
    
    # Calculate per-token cross entropy loss
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
    losses = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    losses = losses.view(shift_labels.size())
    
    # Mask out padding tokens and calculate mean loss per sequence
    masked_losses = losses * shift_mask
    seq_lengths = shift_mask.sum(dim=1).clamp(min=1)  # Avoid division by zero
    mean_losses = masked_losses.sum(dim=1) / seq_lengths
    
    # Convert to perplexity
    perplexities = torch.exp(mean_losses)
    
    return perplexities.cpu().tolist()


@torch.no_grad()
def generate_translations(spanish_texts: list[str], model, tokenizer, max_new_tokens: int = 128) -> tuple[list[str], list[bool]]:
    """
    Batched generation with greedy decoding and early stopping.
    Format: "Spanish: {es}\nEnglish:"
    
    Returns:
        - generated_texts: List of generated translations
        - hit_max_tokens: List of bools indicating if generation hit the token limit
    """
    # Format prompts
    prompts = [f"Spanish: {text}\nEnglish:" for text in spanish_texts]
    
    # Tokenize with left padding (already configured in tokenizer)
    encodings = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to("cuda")
    
    # Generate with greedy decoding + stopping criteria
    generated_ids = model.generate(
        input_ids=encodings.input_ids,
        attention_mask=encodings.attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # Greedy decoding
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        # Stop on newline to prevent runaway generation
        stop_strings=["\n\n", "Spanish:", "\nSpanish"],
        tokenizer=tokenizer,
    )
    
    # Calculate how many new tokens were generated per sequence
    input_length = encodings.input_ids.shape[1]
    new_token_counts = (generated_ids.shape[1] - input_length)
    hit_max_tokens = [new_token_counts >= max_new_tokens] * len(spanish_texts)
    
    # Decode only the new tokens (exclude input prompt)
    generated_texts = tokenizer.batch_decode(
        generated_ids[:, input_length:],
        skip_special_tokens=True
    )
    
    # Clean up outputs - take only the first line/sentence
    cleaned_texts = []
    for text in generated_texts:
        # Stop at newline or "Spanish:" echo
        for stop in ["\n", "Spanish:"]:
            if stop in text:
                text = text.split(stop)[0]
        cleaned_texts.append(text.strip())
    
    return cleaned_texts, hit_max_tokens


def get_peak_vram_mb(handle) -> float:
    """Get peak VRAM usage in MB using pynvml."""
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return info.used / (1024 ** 2)


def reset_peak_memory_stats():
    """Reset PyTorch's peak memory tracking."""
    torch.cuda.reset_peak_memory_stats()


def get_pytorch_peak_memory_mb() -> float:
    """Get PyTorch's tracked peak memory in MB."""
    return torch.cuda.max_memory_allocated() / (1024 ** 2)


print("Benchmark functions defined successfully.")
print(f"Generation will use max_new_tokens={MAX_NEW_TOKENS} with early stopping")


Benchmark functions defined successfully.
Generation will use max_new_tokens=128 with early stopping


In [10]:
# Cell 6: Main Benchmark Loop (Improved)

# Results storage
all_spanish = []
all_english = []
all_outputs = []
all_ppls = []
all_hit_max = []
peak_vram_readings = []

print("=" * 70)
print("STARTING BENCHMARK")
print("=" * 70)
print(f"\nConfiguration:")
print(f"  Total samples: {len(torch_dataset)}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Total batches: {len(dataloader)}")
print(f"  Max new tokens: {MAX_NEW_TOKENS}")
print("-" * 70)

# Warm-up run to stabilize GPU
print("\nWarming up GPU...")
warmup_texts = ["Hola, ¿cómo estás? Esta es una prueba de traducción."] * 4
_ = calculate_batch_perplexity(warmup_texts, model, tokenizer)
_, _ = generate_translations(warmup_texts, model, tokenizer, max_new_tokens=32)
torch.cuda.synchronize()
print("Warm-up complete.\n")

# Main benchmark
start_time = time.perf_counter()

for batch_idx, (spanish_batch, english_batch) in enumerate(tqdm(dataloader, desc="Processing batches")):
    # Convert tuple of strings to list
    spanish_list = list(spanish_batch)
    english_list = list(english_batch)
    
    # Reset memory tracking for this batch
    reset_peak_memory_stats()
    
    # Metric 1: Calculate perplexity on Spanish input (batched)
    batch_ppls = calculate_batch_perplexity(spanish_list, model, tokenizer)
    
    # Metric 2: Generate translations (batched) with max token tracking
    batch_outputs, batch_hit_max = generate_translations(
        spanish_list, model, tokenizer, max_new_tokens=MAX_NEW_TOKENS
    )
    
    # Synchronize to ensure all GPU work is complete before measuring VRAM
    torch.cuda.synchronize()
    
    # Log peak VRAM usage for this batch
    batch_peak_vram = get_peak_vram_mb(gpu_handle)
    peak_vram_readings.append(batch_peak_vram)
    
    # Accumulate results
    all_spanish.extend(spanish_list)
    all_english.extend(english_list)
    all_outputs.extend(batch_outputs)
    all_ppls.extend(batch_ppls)
    all_hit_max.extend(batch_hit_max)
    
    # Progress logging every 20 batches
    if (batch_idx + 1) % 20 == 0:
        elapsed = time.perf_counter() - start_time
        samples_done = (batch_idx + 1) * BATCH_SIZE
        throughput = samples_done / elapsed
        print(f"  Batch {batch_idx + 1}/{len(dataloader)} | "
              f"Throughput: {throughput:.1f} samples/s | "
              f"Peak VRAM: {batch_peak_vram:.0f} MB")

end_time = time.perf_counter()
total_time = end_time - start_time

print("-" * 70)
print(f"Benchmark complete!")
print(f"Total samples processed: {len(all_spanish)}")
print(f"Total time: {total_time:.2f} seconds")
print(f"Overall throughput: {len(all_spanish) / total_time:.2f} samples/second")
print(f"Max peak VRAM observed: {max(peak_vram_readings):.0f} MB")
print(f"Average peak VRAM: {sum(peak_vram_readings) / len(peak_vram_readings):.0f} MB")


STARTING BENCHMARK

Configuration:
  Total samples: 5000
  Batch size: 32
  Total batches: 157
  Max new tokens: 128
----------------------------------------------------------------------

Warming up GPU...
Warm-up complete.



Processing batches:  13%|█▎        | 20/157 [00:53<06:07,  2.68s/it]

  Batch 20/157 | Throughput: 11.9 samples/s | Peak VRAM: 5518 MB


Processing batches:  25%|██▌       | 40/157 [01:47<05:13,  2.68s/it]

  Batch 40/157 | Throughput: 11.9 samples/s | Peak VRAM: 5518 MB


Processing batches:  38%|███▊      | 60/157 [02:41<04:20,  2.69s/it]

  Batch 60/157 | Throughput: 11.9 samples/s | Peak VRAM: 5518 MB


Processing batches:  51%|█████     | 80/157 [03:35<03:26,  2.69s/it]

  Batch 80/157 | Throughput: 11.9 samples/s | Peak VRAM: 5518 MB


Processing batches:  64%|██████▎   | 100/157 [04:29<02:34,  2.71s/it]

  Batch 100/157 | Throughput: 11.9 samples/s | Peak VRAM: 5518 MB


Processing batches:  76%|███████▋  | 120/157 [05:23<01:38,  2.67s/it]

  Batch 120/157 | Throughput: 11.9 samples/s | Peak VRAM: 5518 MB


Processing batches:  89%|████████▉ | 140/157 [06:17<00:46,  2.72s/it]

  Batch 140/157 | Throughput: 11.9 samples/s | Peak VRAM: 5518 MB


Processing batches: 100%|██████████| 157/157 [07:02<00:00,  2.69s/it]

----------------------------------------------------------------------
Benchmark complete!
Total samples processed: 5000
Total time: 422.78 seconds
Overall throughput: 11.83 samples/second
Max peak VRAM observed: 5518 MB
Average peak VRAM: 5490 MB





In [11]:
# Cell 7: Output and Reporting with Token Analysis

import numpy as np

# Create results DataFrame
results_df = pd.DataFrame({
    "spanish_src": all_spanish,
    "english_ref": all_english,
    "model_output": all_outputs,
    "input_ppl": all_ppls,
    "hit_max_tokens": all_hit_max
})

# --- Token Count Analysis ---
def count_tokens(text):
    """Count tokens using the model's tokenizer."""
    if not text:
        return 0
    return len(tokenizer.encode(text, add_special_tokens=False))

def count_chars(text):
    """Count characters (excluding whitespace for fertility calc)."""
    return len(text.replace(" ", ""))

# Calculate token counts and character counts
results_df["spanish_tokens"] = results_df["spanish_src"].apply(count_tokens)
results_df["english_ref_tokens"] = results_df["english_ref"].apply(count_tokens)
results_df["output_tokens"] = results_df["model_output"].apply(count_tokens)

results_df["spanish_chars"] = results_df["spanish_src"].apply(count_chars)
results_df["english_ref_chars"] = results_df["english_ref"].apply(count_chars)
results_df["output_chars"] = results_df["model_output"].apply(count_chars)

# Calculate Fertility (Tokens per Character)
results_df["spanish_fertility"] = results_df["spanish_tokens"] / results_df["spanish_chars"].replace(0, 1)
results_df["english_ref_fertility"] = results_df["english_ref_tokens"] / results_df["english_ref_chars"].replace(0, 1)
results_df["output_fertility"] = results_df["output_tokens"] / results_df["output_chars"].replace(0, 1)

# Detect failure cases
results_df["is_empty"] = results_df["model_output"].str.strip() == ""
results_df["is_repetitive"] = results_df["model_output"].apply(
    lambda x: len(set(x.split())) < len(x.split()) * 0.4 if len(x.split()) > 5 else False
)
# Check if output contains Spanish words from input (translation failure)
results_df["has_spanish_leak"] = results_df.apply(
    lambda row: any(
        word.lower() in row["model_output"].lower() 
        for word in row["spanish_src"].split()[:3] 
        if len(word) > 4 and word.lower() not in row["english_ref"].lower()
    ) if len(row["model_output"]) > 0 else False,
    axis=1
)
# Check output length ratio (too short or too long compared to reference)
results_df["length_ratio"] = results_df["output_tokens"] / results_df["english_ref_tokens"].replace(0, 1)
results_df["is_too_short"] = results_df["length_ratio"] < 0.3
results_df["is_too_long"] = results_df["length_ratio"] > 3.0

# Save to CSV
output_file = "pythia_1b_h100_baseline.csv"
results_df.to_csv(output_file, index=False)
print(f"Results saved to: {output_file}")

# Display summary statistics
print("\n" + "=" * 70)
print("BENCHMARK SUMMARY")
print("=" * 70)
print(f"\nModel: {MODEL_NAME}")
print(f"Dataset: alvations/globalvoices-en-es (filtered)")
print(f"Total Samples: {len(results_df)}")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Max New Tokens: {MAX_NEW_TOKENS}")

print(f"\n--- Performance Metrics ---")
print(f"Total Runtime: {total_time:.2f} seconds")
print(f"Throughput: {len(results_df) / total_time:.2f} samples/second")
print(f"Average Time per Sample: {total_time / len(results_df) * 1000:.2f} ms")

print(f"\n--- Token Count Statistics ---")
print(f"{'Metric':<25} {'Spanish Src':>12} {'English Ref':>12} {'Model Output':>12}")
print("-" * 65)
print(f"{'Mean tokens':<25} {results_df['spanish_tokens'].mean():>12.1f} {results_df['english_ref_tokens'].mean():>12.1f} {results_df['output_tokens'].mean():>12.1f}")
print(f"{'Median tokens':<25} {results_df['spanish_tokens'].median():>12.1f} {results_df['english_ref_tokens'].median():>12.1f} {results_df['output_tokens'].median():>12.1f}")
print(f"{'Std tokens':<25} {results_df['spanish_tokens'].std():>12.1f} {results_df['english_ref_tokens'].std():>12.1f} {results_df['output_tokens'].std():>12.1f}")
print(f"{'Min tokens':<25} {results_df['spanish_tokens'].min():>12} {results_df['english_ref_tokens'].min():>12} {results_df['output_tokens'].min():>12}")
print(f"{'Max tokens':<25} {results_df['spanish_tokens'].max():>12} {results_df['english_ref_tokens'].max():>12} {results_df['output_tokens'].max():>12}")

print(f"\n--- Fertility (Tokens/Char) Statistics ---")
print(f"{'Metric':<25} {'Spanish Src':>12} {'English Ref':>12} {'Model Output':>12}")
print("-" * 65)
print(f"{'Mean fertility':<25} {results_df['spanish_fertility'].mean():>12.4f} {results_df['english_ref_fertility'].mean():>12.4f} {results_df['output_fertility'].mean():>12.4f}")
print(f"{'Median fertility':<25} {results_df['spanish_fertility'].median():>12.4f} {results_df['english_ref_fertility'].median():>12.4f} {results_df['output_fertility'].median():>12.4f}")

print(f"\n--- Perplexity Statistics ---")
print(f"Mean PPL: {results_df['input_ppl'].mean():.2f}")
print(f"Median PPL: {results_df['input_ppl'].median():.2f}")
print(f"Std PPL: {results_df['input_ppl'].std():.2f}")
print(f"Min PPL: {results_df['input_ppl'].min():.2f}")
print(f"Max PPL: {results_df['input_ppl'].max():.2f}")

print(f"\n--- Percentiles ---")
percentiles = [5, 10, 25, 50, 75, 90, 95, 99]
print(f"{'Percentile':<12} {'PPL':>12} {'Spanish Tok':>12} {'Output Tok':>12}")
print("-" * 50)
for p in percentiles:
    ppl_p = np.percentile(results_df['input_ppl'], p)
    src_tok_p = np.percentile(results_df['spanish_tokens'], p)
    out_tok_p = np.percentile(results_df['output_tokens'], p)
    print(f"{'P' + str(p):<12} {ppl_p:>12.2f} {src_tok_p:>12.0f} {out_tok_p:>12.0f}")

print(f"\n--- VRAM Usage ---")
print(f"Max Peak VRAM: {max(peak_vram_readings):.0f} MB")
print(f"Avg Peak VRAM: {sum(peak_vram_readings) / len(peak_vram_readings):.0f} MB")

print(f"\n--- Failure Analysis ---")
empty_count = results_df["is_empty"].sum()
repetitive_count = results_df["is_repetitive"].sum()
spanish_leak_count = results_df["has_spanish_leak"].sum()
too_short_count = results_df["is_too_short"].sum()
too_long_count = results_df["is_too_long"].sum()
hit_max_count = results_df["hit_max_tokens"].sum()

print(f"Empty outputs: {empty_count} ({100*empty_count/len(results_df):.1f}%)")
print(f"Repetitive outputs: {repetitive_count} ({100*repetitive_count/len(results_df):.1f}%)")
print(f"Spanish word leak: {spanish_leak_count} ({100*spanish_leak_count/len(results_df):.1f}%)")
print(f"Too short (<30% ref): {too_short_count} ({100*too_short_count/len(results_df):.1f}%)")
print(f"Too long (>300% ref): {too_long_count} ({100*too_long_count/len(results_df):.1f}%)")
print(f"Hit max tokens ({MAX_NEW_TOKENS}): {hit_max_count} ({100*hit_max_count/len(results_df):.1f}%)")

print(f"\n--- Length Ratio Statistics (output/reference) ---")
print(f"Mean: {results_df['length_ratio'].mean():.2f}")
print(f"Median: {results_df['length_ratio'].median():.2f}")
print(f"Std: {results_df['length_ratio'].std():.2f}")

# --- Sampled Outputs: Good, Bad, and Ugly ---
print("\n" + "=" * 70)
print("SAMPLE OUTPUTS")
print("=" * 70)

def print_sample(row, label):
    ratio_str = f", Ratio: {row['length_ratio']:.2f}" if 'length_ratio' in row else ""
    print(f"\n[{label}] (PPL: {row['input_ppl']:.2f}, Out Tokens: {row['output_tokens']}, Ref Tokens: {row['english_ref_tokens']}{ratio_str})")
    print(f"  Spanish:     {row['spanish_src'][:120]}{'...' if len(row['spanish_src']) > 120 else ''}")
    print(f"  English ref: {row['english_ref'][:120]}{'...' if len(row['english_ref']) > 120 else ''}")
    print(f"  Model out:   {row['model_output'][:200]}{'...' if len(row['model_output']) > 200 else ''}")

# Best cases (lowest PPL, good length ratio)
print("\n--- BEST CASES (Lowest PPL, good length ratio 0.5-2.0) ---")
good_ratio = (results_df["length_ratio"] >= 0.5) & (results_df["length_ratio"] <= 2.0)
best_df = results_df[(~results_df["is_empty"]) & good_ratio].nsmallest(5, "input_ppl")
for idx, row in best_df.iterrows():
    print_sample(row, f"Best #{list(best_df.index).index(idx)+1}")

# Worst cases (highest PPL)
print("\n--- WORST CASES (Highest PPL) ---")
worst_df = results_df.nlargest(5, "input_ppl")
for idx, row in worst_df.iterrows():
    print_sample(row, f"Worst #{list(worst_df.index).index(idx)+1}")

# Empty outputs
if empty_count > 0:
    print("\n--- EMPTY OUTPUT FAILURES ---")
    empty_df = results_df[results_df["is_empty"]].head(3)
    for idx, row in empty_df.iterrows():
        print_sample(row, "Empty")

# Repetitive outputs
if repetitive_count > 0:
    print("\n--- REPETITIVE OUTPUT FAILURES ---")
    rep_df = results_df[results_df["is_repetitive"]].head(3)
    for idx, row in rep_df.iterrows():
        print_sample(row, "Repetitive")

# Spanish word leak
if spanish_leak_count > 0:
    print("\n--- SPANISH WORD LEAK (untranslated Spanish in output) ---")
    leak_df = results_df[results_df["has_spanish_leak"]].head(3)
    for idx, row in leak_df.iterrows():
        print_sample(row, "SpanishLeak")

# Too short outputs
if too_short_count > 0:
    print("\n--- TOO SHORT OUTPUTS (<30% of reference length) ---")
    short_df = results_df[results_df["is_too_short"]].head(3)
    for idx, row in short_df.iterrows():
        print_sample(row, "TooShort")

# Too long outputs  
if too_long_count > 0:
    print("\n--- TOO LONG OUTPUTS (>300% of reference length) ---")
    long_df = results_df[results_df["is_too_long"]].head(3)
    for idx, row in long_df.iterrows():
        print_sample(row, "TooLong")

# Random sample for diversity
print("\n--- RANDOM SAMPLES ---")
random_df = results_df.sample(n=min(5, len(results_df)), random_state=42)
for idx, row in random_df.iterrows():
    print_sample(row, f"Random")

# Cleanup pynvml
pynvml.nvmlShutdown()
print("\n" + "=" * 70)
print("Benchmark complete. Results saved to", output_file)


Results saved to: pythia_1b_h100_baseline.csv

BENCHMARK SUMMARY

Model: EleutherAI/pythia-1b
Dataset: alvations/globalvoices-en-es (filtered)
Total Samples: 5000
Batch Size: 32
Max New Tokens: 128

--- Performance Metrics ---
Total Runtime: 422.78 seconds
Throughput: 11.83 samples/second
Average Time per Sample: 84.56 ms

--- Token Count Statistics ---
Metric                     Spanish Src  English Ref Model Output
-----------------------------------------------------------------
Mean tokens                       40.4         27.5         18.6
Median tokens                     36.0         24.0         15.0
Std tokens                        22.0         14.9         19.4
Min tokens                           8            6            0
Max tokens                         148          239          129

--- Fertility (Tokens/Char) Statistics ---
Metric                     Spanish Src  English Ref Model Output
-----------------------------------------------------------------
Mean fertilit