# Compare Base vs Trained Models

This notebook loads helper functions from `scripts/compare_models.py` and lets you
interactively configure models, prompts, and evaluation settings.


In [None]:
# Imports and helpers
import json
from types import SimpleNamespace
from pathlib import Path

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from scripts.compare_models import (
    CompareConfig,
    load_model_and_tokenizer,
    generate_text,
    compute_logprobs,
)

# Configure here
# Edit the values below, then re-run this cell. `cfg` is handed to the helper
# functions imported from scripts.compare_models (generate_text,
# compute_logprobs) and its fields are read directly further down.
cfg = CompareConfig(
    base_model="/path/to/original",  # base model; passed to load_model_and_tokenizer / vllm_generate
    trained_model="/path/to/trained",  # trained model; passed to the same loaders
    prompts_path=None,  # or a path to .jsonl/.json with {"prompt": ...}
    max_new_tokens=128,
    temperature=0.0,  # presumably greedy decoding at 0.0 — confirm against generate_text
    top_p=1.0,
    top_k=0,  # a non-positive value disables top-k in the vLLM path
    device="auto",  # forwarded to load_model_and_tokenizer
    dtype="float16",  # forwarded to load_model_and_tokenizer
    batch_size=8,  # not read directly in the visible cells; presumably used by the helpers via cfg
    eos_token=None,  # when set, used as a stop string in the vLLM path
)

# True: generate with vllm_generate; False: generate with generate_text (HF).
# Log-probs are computed with the HF-loaded models in both cases.
use_vllm = True
# Passed to vllm_generate as tensor_parallel_size.
vllm_tp_size = 1

# Load prompts
# With no prompts_path configured, fall back to a small built-in set.
if cfg.prompts_path is None:
    prompts = [
        "Write a short poem about the sea.",
        "What is the capital of France?",
        "Explain the concept of reinforcement learning in one paragraph.",
        "Translate to Spanish: 'Good morning, how are you?'",
    ]
else:
    p = Path(cfg.prompts_path)
    # Use a context manager so the file handle is closed promptly, and read as
    # UTF-8 (the JSON interchange encoding) instead of the platform default.
    with p.open(encoding="utf-8") as f:
        if p.suffix == ".jsonl":
            # One JSON object per line; skip blank lines (e.g. a trailing
            # newline) that would otherwise make json.loads raise.
            prompts = [json.loads(line)["prompt"] for line in f if line.strip()]
        else:  # .json
            # A JSON array whose items are either {"prompt": ...} objects or
            # bare values that are used as the prompt as-is.
            items = json.load(f)
            prompts = [item["prompt"] if isinstance(item, dict) else item for item in items]

# Generate responses with either vLLM or the HF helpers, and load the HF models
# that the log-prob computation below needs in both modes.
if use_vllm:
    # vllm_generate is defined in the "vLLM generation utilities" cell further
    # down the notebook; fail with an actionable message if it has not been run.
    if "vllm_generate" not in globals():
        raise NameError(
            "vllm_generate is not defined: run the 'vLLM generation utilities' cell "
            "before this one, or set use_vllm = False."
        )
    # vLLM-based generation (done before the HF models are loaded)
    base_outputs = vllm_generate(
        cfg.base_model, prompts, cfg.max_new_tokens, cfg.temperature, cfg.top_p, cfg.top_k, cfg.eos_token, vllm_tp_size
    )
    trained_outputs = vllm_generate(
        cfg.trained_model, prompts, cfg.max_new_tokens, cfg.temperature, cfg.top_p, cfg.top_k, cfg.eos_token, vllm_tp_size
    )

# HF models/tokenizers are needed in both modes: for generation on the HF path,
# and for the log-prob forward passes regardless of how the text was generated.
base_model, base_tok = load_model_and_tokenizer(cfg.base_model, cfg.device, cfg.dtype)
trained_model, trained_tok = load_model_and_tokenizer(cfg.trained_model, cfg.device, cfg.dtype)

if not use_vllm:
    # HF generation path
    base_outputs = generate_text(base_model, base_tok, prompts, cfg)
    trained_outputs = generate_text(trained_model, trained_tok, prompts, cfg)

# Score each model's own generations with that same model.
base_logps = compute_logprobs(base_model, base_tok, prompts, base_outputs, cfg)
trained_logps = compute_logprobs(trained_model, trained_tok, prompts, trained_outputs, cfg)


def _mean_logp(logps):
    """Return the sum of ``logps`` divided by its number of non-zero entries.

    The divisor is clamped to at least 1 so an all-zero input yields 0.0
    instead of a division by zero.
    """
    # NOTE(review): presumably exact zeros mark padded/masked positions —
    # confirm against compute_logprobs.
    counted = (logps != 0).sum().clamp(min=1).item()
    return float(logps.sum().item() / counted)


# One summary record per prompt.
rows = [
    {
        "prompt": text,
        "base_text": base_outputs[idx],
        "trained_text": trained_outputs[idx],
        "base_mean_logp": _mean_logp(base_logps[idx]),
        "trained_mean_logp": _mean_logp(trained_logps[idx]),
    }
    for idx, text in enumerate(prompts)
]

rows[:2]  # preview


In [None]:
# vLLM generation utilities
# Import vLLM up front so a missing installation is reported clearly. Only
# ImportError is translated: any other failure during import (e.g. a driver or
# shared-library problem) propagates unchanged instead of being mislabelled as
# "not installed".
try:
    from vllm import LLM, SamplingParams
except ImportError as e:
    raise RuntimeError("vLLM is not installed. Please install vLLM to use this cell.") from e


def vllm_generate(
    model_path: str,
    prompts: list[str],
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    eos_token: str | None = None,
    tensor_parallel_size: int = 1,
) -> list[str]:
    """Generate one completion per prompt with vLLM.

    Args:
        model_path: Model name or path handed to ``vllm.LLM``.
        prompts: Prompts to complete.
        max_new_tokens: Passed to vLLM as ``max_tokens``.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k cutoff; any value <= 0 disables top-k filtering.
        eos_token: Optional stop string; when falsy, no stop string is set.
        tensor_parallel_size: Forwarded to ``vllm.LLM``.

    Returns:
        The text of the first candidate of each output returned by
        ``LLM.generate``.
    """
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        # SamplingParams validates top_k with integer comparisons, so None
        # raises TypeError; -1 is vLLM's value for "top-k disabled".
        top_k=top_k if top_k > 0 else -1,
        max_tokens=max_new_tokens,
        stop=[eos_token] if eos_token else None,
    )
    # NOTE(review): a new engine is built on every call and is not explicitly
    # released; back-to-back calls for different models may hold GPU memory
    # from the previous engine — confirm on the target hardware.
    llm = LLM(model=model_path, tensor_parallel_size=tensor_parallel_size)
    outputs = llm.generate(prompts, sampling_params)
    return [o.outputs[0].text for o in outputs]

