In [None]:
# Cell 1: Package Installation
!pip install transformers datasets accelerate pandas nvidia-ml-py flash-attn --no-build-isolation -q


In [None]:
# Cell 2: Imports and GPU Validation
import torch
import pandas as pd
import pynvml
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import time
from tqdm.auto import tqdm

# Fail fast unless CUDA is available and the current device reports a compute
# capability of at least 8.x (the threshold this notebook uses for Flash Attention 2).
assert torch.cuda.is_available(), "CUDA is not available - H100 GPU required"
compute_capability = torch.cuda.get_device_capability()
cc_major, cc_minor = compute_capability
assert cc_major >= 8, f"H100/A100 GPU required for Flash Attention 2 (got compute capability {compute_capability})"

# Start NVML and keep a handle to device 0 for the VRAM readings taken later.
# NOTE(review): NVML index 0 is assumed to be the same physical GPU as CUDA
# device 0 - confirm when CUDA_VISIBLE_DEVICES is set.
pynvml.nvmlInit()
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)

# Environment summary, one item per line.
print(
    f"GPU: {torch.cuda.get_device_name(0)}",
    f"Compute Capability: {cc_major}.{cc_minor}",
    f"CUDA Version: {torch.version.cuda}",
    f"PyTorch Version: {torch.__version__}",
    sep="\n",
)


In [None]:
# Cell 3: Model Loading (H100 Native)
MODEL_NAME = "EleutherAI/pythia-1b"

print(f"Loading {MODEL_NAME} with Flash Attention 2...")

# bfloat16 weights placed on the GPU, attention routed through Flash Attention 2;
# the model is switched to inference mode straight away.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    attn_implementation="flash_attention_2",
    device_map="cuda",
    torch_dtype=torch.bfloat16,
).eval()

# The tokenizer reuses EOS as its padding token and pads on the left,
# which is what batched generation in this notebook relies on.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Report where the weights ended up and how the tokenizer is configured.
_first_param = next(model.parameters())
print(f"Model loaded successfully on {_first_param.device}")
print(f"Model dtype: {_first_param.dtype}")
print(f"Tokenizer pad_token: '{tokenizer.pad_token}' (id: {tokenizer.pad_token_id})")
print(f"Tokenizer padding_side: {tokenizer.padding_side}")


In [None]:
# Cell 4: Data Pipeline (Robust)
# Benchmark sizing knobs; both are read again by later cells.
NUM_SAMPLES = 5000
BATCH_SIZE = 64

print("Loading global_voices dataset (es-en split)...")
dataset = load_dataset("Divyanshu/global_voices", "es-en", split="train")

# Show the raw schema and one raw row so the extraction step below can be checked by eye.
print(
    f"\nDataset column names: {dataset.column_names}",
    f"First sample: {dataset[0]}",
    sep="\n",
)

# Extract Spanish (es) and English (en) from the translation column dictionary
def extract_translations(example):
    """Flatten one row's nested ``translation`` dict into two top-level fields.

    Args:
        example: mapping with a ``"translation"`` entry that itself holds
            ``"es"`` and ``"en"`` values.

    Returns:
        dict with ``"spanish"`` (the ``"es"`` value) and ``"english"``
        (the ``"en"`` value).
    """
    pair = example["translation"]
    flattened = {"spanish": pair["es"], "english": pair["en"]}
    return flattened

# Keep only the first NUM_SAMPLES rows, then flatten them. Selecting *before*
# mapping yields the same rows but avoids running extract_translations over the
# part of the split that would be thrown away immediately afterwards.
dataset = dataset.select(range(min(NUM_SAMPLES, len(dataset))))
dataset = dataset.map(extract_translations, remove_columns=dataset.column_names)

print(f"\nProcessed dataset size: {len(dataset)}")
print(f"Sample after extraction: {dataset[0]}")

# Create a custom Dataset class for DataLoader
class TranslationDataset(Dataset):
    """Map-style dataset that yields ``(spanish, english)`` string pairs.

    Wraps any indexable collection whose items expose ``"spanish"`` and
    ``"english"`` keys; the wrapped object is kept as-is in ``self.data``.
    """

    def __init__(self, hf_dataset):
        # Held by reference; nothing is copied or materialised here.
        self.data = hf_dataset

    def __len__(self):
        """Number of rows in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the row at ``idx`` as a ``(spanish, english)`` tuple."""
        row = self.data[idx]
        return tuple(row[field] for field in ("spanish", "english"))

# Create DataLoader
torch_dataset = TranslationDataset(dataset)
dataloader = DataLoader(
    torch_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # Keep dataset order so outputs line up with the source rows
    num_workers=0,  # Avoid multiprocessing issues in notebook
    # pin_memory is left at its default (False): every batch is a pair of string
    # sequences, there are no tensors to pin, so enabling it would only add a
    # background pinning thread.
)

print(f"\nDataLoader created: {len(dataloader)} batches of size {BATCH_SIZE}")


In [None]:
# Cell 5: Benchmark Functions

@torch.no_grad()
def calculate_batch_perplexity(texts: list[str], model, tokenizer, max_length: int = 512) -> list[float]:
    """
    Vectorized per-text perplexity via teacher forcing.

    Each text is tokenized (padded to the longest in the batch, truncated to
    ``max_length`` tokens), run through ``model`` once, and scored with the
    mean next-token cross-entropy; perplexity is ``exp(mean loss)``.

    A position contributes to the mean only when BOTH the predicted token and
    the token it is predicted from are real (attention mask == 1). This makes
    the result independent of the tokenizer's padding side: with left padding,
    the first real token would otherwise be scored against the output of a
    padding position, which is not a meaningful context.

    Args:
        texts: strings to score.
        model: causal LM; called as ``model(input_ids=..., attention_mask=...)``
            and expected to return an object with ``.logits``.
        tokenizer: callable returning an encoding with ``input_ids``,
            ``attention_mask`` and ``.to(device)``.
        max_length: truncation length in tokens (default 512, as before).

    Returns:
        One perplexity per input text, in input order. A text with fewer than
        two tokens has nothing to score and yields 1.0 (``exp(0)``). An empty
        ``texts`` list yields ``[]``.
    """
    if not texts:
        return []

    # Follow the model's own device when it exposes one; fall back to "cuda".
    device = getattr(model, "device", None)
    if device is None:
        device = "cuda"

    # Tokenize batch with padding
    encodings = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(device)

    input_ids = encodings.input_ids
    attention_mask = encodings.attention_mask

    # Forward pass
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Position t predicts token t+1. Upcast to float32 so the softmax and the
    # per-sequence sums are not accumulated in bfloat16.
    shift_logits = logits[:, :-1, :].float()
    shift_labels = input_ids[:, 1:]

    # Valid only if the target token AND its context token are both real tokens.
    valid = attention_mask[:, 1:].bool() & attention_mask[:, :-1].bool()

    # Per-token cross entropy, reshaped back to (batch, seq - 1)
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
    token_losses = loss_fct(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
    ).view(shift_labels.size())

    # Mean loss over the valid positions of each sequence
    valid_f = valid.to(token_losses.dtype)
    token_counts = valid_f.sum(dim=1).clamp(min=1)  # Avoid division by zero
    mean_losses = (token_losses * valid_f).sum(dim=1) / token_counts

    # Convert to perplexity
    return torch.exp(mean_losses).cpu().tolist()


@torch.no_grad()
def generate_translations(spanish_texts: list[str], model, tokenizer, max_new_tokens: int = 64) -> list[str]:
    """
    Batched greedy generation of English continuations for Spanish inputs.

    Every input is wrapped as ``"Spanish: {es}\\nEnglish:"``, the batch is
    tokenized with LEFT padding, and ``model.generate`` produces up to
    ``max_new_tokens`` tokens per prompt. Only the newly generated tokens are
    decoded (the prompt columns are sliced off), with special tokens skipped.

    Left padding is enforced here for the duration of the tokenizer call and
    the tokenizer's previous ``padding_side`` is restored afterwards, so the
    function no longer depends on how the caller configured the tokenizer.

    Args:
        spanish_texts: source sentences.
        model: causal LM exposing ``generate``.
        tokenizer: tokenizer exposing ``padding_side``, ``eos_token_id``,
            ``__call__`` and ``batch_decode``.
        max_new_tokens: generation budget per prompt.

    Returns:
        One decoded continuation per input, in input order; ``[]`` for an
        empty input list.
    """
    if not spanish_texts:
        return []

    # Format prompts
    prompts = [f"Spanish: {text}\nEnglish:" for text in spanish_texts]

    # Follow the model's own device when it exposes one; fall back to "cuda".
    device = getattr(model, "device", None)
    if device is None:
        device = "cuda"

    # Slicing off the prompt below assumes all prompts end at the same column,
    # which only holds with left padding.
    # NOTE(review): prompts longer than 512 tokens are truncated; with the
    # tokenizer's default truncation side this may drop the trailing
    # "\nEnglish:" cue - confirm whether such inputs occur.
    previous_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        encodings = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
    finally:
        tokenizer.padding_side = previous_side

    # Generate with greedy decoding
    generated_ids = model.generate(
        input_ids=encodings.input_ids,
        attention_mask=encodings.attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # Greedy decoding
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the new tokens (exclude input prompt)
    prompt_width = encodings.input_ids.shape[1]
    return tokenizer.batch_decode(
        generated_ids[:, prompt_width:],
        skip_special_tokens=True,
    )


def get_peak_vram_mb(handle) -> float:
    """Return the ``used`` memory NVML reports for ``handle``, in MB (1024**2 bytes).

    Despite the name, this is a reading taken at call time via
    ``pynvml.nvmlDeviceGetMemoryInfo``; no high-water mark is tracked here.
    """
    bytes_per_mb = 1024 ** 2
    used_bytes = pynvml.nvmlDeviceGetMemoryInfo(handle).used
    return used_bytes / bytes_per_mb


def reset_peak_memory_stats() -> None:
    """Clear PyTorch's CUDA peak-memory counters (thin wrapper over ``torch.cuda``)."""
    torch.cuda.reset_peak_memory_stats()
    return None


def get_pytorch_peak_memory_mb() -> float:
    """Return ``torch.cuda.max_memory_allocated()`` converted to MB (1024**2 bytes)."""
    bytes_per_mb = 1024 ** 2
    peak_bytes = torch.cuda.max_memory_allocated()
    return peak_bytes / bytes_per_mb


# Visible confirmation that every definition in this cell executed without error.
print("Benchmark functions defined successfully.")


In [None]:
# Cell 6: Main Benchmark Loop ("Unbottlenecked" - inference is batched; the only Python loop is over DataLoader batches)

# Results storage
all_spanish = []
all_english = []
all_outputs = []
all_ppls = []
peak_vram_readings = []   # NVML "used" VRAM of the device, sampled after each batch (MB)
torch_peak_readings = []  # PyTorch max_memory_allocated within each batch (MB)

print("Starting benchmark...")
print(f"Total samples: {len(torch_dataset)}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Total batches: {len(dataloader)}")
print("-" * 60)

# Warm-up run to stabilize GPU; excluded from the timed section below
print("Warming up GPU...")
warmup_texts = ["Hola, ¿cómo estás?"] * 4
_ = calculate_batch_perplexity(warmup_texts, model, tokenizer)
_ = generate_translations(warmup_texts, model, tokenizer, max_new_tokens=16)
torch.cuda.synchronize()
print("Warm-up complete.\n")

# Main benchmark
start_time = time.perf_counter()

for batch_idx, (spanish_batch, english_batch) in enumerate(tqdm(dataloader, desc="Processing batches")):
    # The DataLoader hands back sequences of strings; normalise them to lists
    spanish_list = list(spanish_batch)
    english_list = list(english_batch)

    # Reset PyTorch's peak counters so the reading below covers this batch only
    reset_peak_memory_stats()

    # Metric 1: Calculate perplexity on Spanish input (batched)
    batch_ppls = calculate_batch_perplexity(spanish_list, model, tokenizer)

    # Metric 2: Generate translations (batched)
    batch_outputs = generate_translations(spanish_list, model, tokenizer, max_new_tokens=64)

    # Synchronize to ensure all GPU work is complete before measuring memory
    torch.cuda.synchronize()

    # Two memory views per batch:
    #  - NVML: what the device reports as used right now (sampled after the batch)
    #  - PyTorch: the high-water mark since the reset above, which is the value
    #    the reset_peak_memory_stats() call was preparing but was never read
    batch_peak_vram = get_peak_vram_mb(gpu_handle)
    peak_vram_readings.append(batch_peak_vram)
    batch_torch_peak = get_pytorch_peak_memory_mb()
    torch_peak_readings.append(batch_torch_peak)

    # Accumulate results
    all_spanish.extend(spanish_list)
    all_english.extend(english_list)
    all_outputs.extend(batch_outputs)
    all_ppls.extend(batch_ppls)

    # Progress logging every 10 batches
    if (batch_idx + 1) % 10 == 0:
        elapsed = time.perf_counter() - start_time
        # Count what was actually processed; the last batch may be shorter than BATCH_SIZE
        samples_done = len(all_spanish)
        throughput = samples_done / elapsed
        print(f"  Batch {batch_idx + 1}/{len(dataloader)} | "
              f"Throughput: {throughput:.1f} samples/s | "
              f"Peak VRAM: {batch_peak_vram:.0f} MB | "
              f"PyTorch peak: {batch_torch_peak:.0f} MB")

end_time = time.perf_counter()
total_time = end_time - start_time

print("-" * 60)
print(f"Benchmark complete!")
print(f"Total samples processed: {len(all_spanish)}")
print(f"Total time: {total_time:.2f} seconds")
print(f"Overall throughput: {len(all_spanish) / total_time:.2f} samples/second")
if peak_vram_readings:  # nothing to summarise if the DataLoader was empty
    print(f"Max peak VRAM observed: {max(peak_vram_readings):.0f} MB")
    print(f"Average peak VRAM: {sum(peak_vram_readings) / len(peak_vram_readings):.0f} MB")
    print(f"Max PyTorch peak allocated: {max(torch_peak_readings):.0f} MB")


In [None]:
# Cell 7: Output and Reporting

def _preview(text, limit=100):
    """Return `text` cut to `limit` characters, adding "..." only if something was cut."""
    text = str(text)
    return text if len(text) <= limit else text[:limit] + "..."


# Create results DataFrame (one row per processed sample, in processing order)
results_df = pd.DataFrame({
    "spanish_src": all_spanish,
    "english_ref": all_english,
    "model_output": all_outputs,
    "input_ppl": all_ppls
})

# Save to CSV
output_file = "pythia_1b_h100_baseline.csv"
results_df.to_csv(output_file, index=False)
print(f"Results saved to: {output_file}")

# Display summary statistics
print("\n" + "=" * 60)
print("BENCHMARK SUMMARY")
print("=" * 60)
print(f"\nModel: {MODEL_NAME}")
print(f"Dataset: global_voices (es-en)")
print(f"Total Samples: {len(results_df)}")
print(f"Batch Size: {BATCH_SIZE}")

print(f"\n--- Performance Metrics ---")
print(f"Total Runtime: {total_time:.2f} seconds")
print(f"Throughput: {len(results_df) / total_time:.2f} samples/second")
print(f"Average Time per Sample: {total_time / len(results_df) * 1000:.2f} ms")

print(f"\n--- Perplexity Statistics ---")
print(f"Mean PPL: {results_df['input_ppl'].mean():.2f}")
print(f"Median PPL: {results_df['input_ppl'].median():.2f}")
print(f"Std PPL: {results_df['input_ppl'].std():.2f}")
print(f"Min PPL: {results_df['input_ppl'].min():.2f}")
print(f"Max PPL: {results_df['input_ppl'].max():.2f}")

print(f"\n--- VRAM Usage ---")
print(f"Max Peak VRAM: {max(peak_vram_readings):.0f} MB")
print(f"Avg Peak VRAM: {sum(peak_vram_readings) / len(peak_vram_readings):.0f} MB")

print("\n--- Sample Outputs ---")
for i in range(min(3, len(results_df))):
    row = results_df.iloc[i]
    print(f"\nSample {i+1}:")
    print(f"  Spanish: {_preview(row['spanish_src'])}")
    print(f"  English (ref): {_preview(row['english_ref'])}")
    print(f"  Model output: {_preview(row['model_output'])}")
    print(f"  Input PPL: {row['input_ppl']:.2f}")

# Cleanup pynvml. Guarded so that re-running this cell does not raise once NVML
# has already been shut down. After this point the NVML handle is no longer
# usable: re-run Cell 2 (which calls nvmlInit) before re-running the benchmark.
try:
    pynvml.nvmlShutdown()
except pynvml.NVMLError:
    pass  # already shut down by an earlier run of this cell
print("\n" + "=" * 60)
print("Benchmark complete. Results saved to", output_file)
