# Week 2 — Notebook 3: Model Evaluation & VJ Scoring

Measures the impact of SFT + DPO across four dimensions:  
`correctness` · `groundedness` · `problem_solution` · `style`

Covers:
1. Generating responses from base vs. SFT vs. DPO checkpoints
2. VJ (Virtual Judge) scoring via GPT-4o-mini (the default `JUDGE_MODEL`; GPT-4o serves as the stronger reference judge in the meta-eval)
3. **Meta-eval** — Judge-on-Judge agreement check (a proxy for a 500-sample human review; runs on 100 samples by default to control cost)
4. Before/after comparison: alignment tax detection
5. Score distribution & correlation analysis

---
> **GPU requirement:** 1 GPU for inference; scoring calls OpenAI API.  
> **Cost estimate:** ~\$3–8 for 500 scored samples at GPT-4o-mini rates.

## 0. Imports & Config

In [None]:
# !pip install openai transformers peft bitsandbytes accelerate datasets pandas seaborn matplotlib scipy python-dotenv tqdm

In [None]:
import os
import json
import asyncio
import time
from pathlib import Path
from typing import Optional
from dotenv import load_dotenv

load_dotenv("../../week1/.env")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from openai import OpenAI
from tqdm.auto import tqdm

# Directory layout, relative to the notebook's working directory.
DATA_DIR = Path("../data")
MODELS_DIR = Path("../models")
EVAL_DIR = DATA_DIR.joinpath("eval_results")
EVAL_DIR.mkdir(parents=True, exist_ok=True)  # safe to re-run: no error if it already exists

# Default judge used by the scoring helpers; upgrade to gpt-4o for higher precision meta-eval.
JUDGE_MODEL = "gpt-4o-mini"
# The API key is read from the environment (populated by load_dotenv above).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

print("Imports OK")

## 1. VJ Scoring Functions

In [None]:
# Judge prompt template. It has two str.format placeholders, {prompt} and
# {response}, which score_response fills in before sending the text as a
# single user message. The template asks the judge to reply with JSON only,
# containing one float in [0.0, 1.0] for each of the four scoring dimensions.
SCORE_PROMPT = """\
You are an expert evaluator for a customer support AI system.
Score the RESPONSE to the USER QUERY on these four dimensions.
Return ONLY valid JSON with keys: correctness, groundedness, problem_solution, style.
Each value must be a float between 0.0 and 1.0.

Definitions:
- correctness: factual accuracy, no hallucinations
- groundedness: claims are supported by plausible context, no fabrications
- problem_solution: response actually solves or addresses the user's problem
- style: tone is professional, empathetic, concise

USER QUERY: {prompt}
RESPONSE: {response}

JSON:"""


# Keys the judge is asked to return (mirrors the list in SCORE_PROMPT).
_SCORE_KEYS = ("correctness", "groundedness", "problem_solution", "style")


def _parse_scores(raw: Optional[str]) -> dict:
    """Turn the judge's raw reply into ``{dimension: float}``.

    Accepts replies where the JSON object is wrapped in a Markdown code fence
    or surrounded by extra text by parsing only the outermost ``{...}`` span.

    Raises:
        ValueError: if no JSON object is found, the JSON is malformed, a
            required key is missing, or a value is not a number in [0.0, 1.0].
    """
    text = raw or ""  # message.content can be None
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"no JSON object in judge reply: {raw!r}")
    data = json.loads(text[start:end + 1])  # JSONDecodeError is a ValueError

    scores = {}
    for key in _SCORE_KEYS:
        if key not in data:
            raise ValueError(f"judge reply is missing key {key!r}: {data!r}")
        try:
            value = float(data[key])
        except (TypeError, ValueError):
            raise ValueError(f"non-numeric score for {key!r}: {data[key]!r}") from None
        # The chained comparison is also False for NaN, so NaN is rejected here.
        if not 0.0 <= value <= 1.0:
            raise ValueError(f"score for {key!r} out of range [0, 1]: {value!r}")
        scores[key] = value
    return scores


def score_response(
    prompt: str,
    response: str,
    model: str = JUDGE_MODEL,
    retries: int = 3,
) -> Optional[dict]:
    """Score a single response across the 4 VJ dimensions.

    The prompt and the response are each truncated to their first 600
    characters before being inserted into SCORE_PROMPT. The judge is called
    with temperature 0.0 and at most 100 completion tokens.

    Args:
        prompt: The user query that was answered.
        response: The model response to evaluate.
        model: Name of the judge model passed to the chat completions API.
        retries: Maximum number of attempts. Between failed attempts the
            function sleeps 2**attempt seconds (1, 2, 4, ...).

    Returns:
        A dict with exactly the keys correctness, groundedness,
        problem_solution and style, each a float in [0.0, 1.0]; or None when
        every attempt failed (API error or unusable judge output) or when
        ``retries`` is not positive.
    """
    for attempt in range(retries):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=[{
                    "role": "user",
                    "content": SCORE_PROMPT.format(prompt=prompt[:600], response=response[:600]),
                }],
                temperature=0.0,
                max_tokens=100,
            )
            return _parse_scores(completion.choices[0].message.content)
        except Exception as e:  # API boundary: any failure means retry, then give up
            if attempt == retries - 1:
                print(f"Scoring failed: {e}")
                return None
            time.sleep(2 ** attempt)
    return None


# Smoke test: score one hand-written query/reply pair with the default judge
# and show the returned value (None if scoring failed).
_sample_query = "My order hasn't arrived after 2 weeks."
_sample_reply = "I apologize for the delay. Please contact support@shop.com with your order number."
test = score_response(_sample_query, _sample_reply)
print("Test score:", test)

## 2. Generate Responses from Each Checkpoint

Compare: **base** · **SFT** · **DPO**  
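Every checkpoint is evaluated on the same 200 prompts, sampled with a fixed seed from `dpo_weighted.jsonl`. For this to be a genuinely held-out test set, those rows must have been excluded from SFT/DPO training — verify this, otherwise the reported scores will be optimistic.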

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Run-name prefix used to build the adapter directory names, and the Hugging
# Face id of the base model every checkpoint is loaded on top of.
RUN_NAME = "qwen2.5-7b-customer-support-dpo"
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"

# Quantization settings: 4-bit loading, "nf4" quant type, bfloat16 compute dtype.
_QUANT_KWARGS = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**_QUANT_KWARGS)

def load_checkpoint(adapter_path: Optional[str] = None):
    """Load the quantized base model, optionally with a LoRA adapter on top.

    Args:
        adapter_path: Directory of a PEFT adapter to apply to BASE_MODEL.
            ``None`` (or an empty string) loads the plain base model.

    Returns:
        ``(model, tokenizer)``. The model is in eval mode and the tokenizer's
        pad token is set to its EOS token.

    Warns:
        RuntimeWarning: if ``adapter_path`` is given but does not exist. The
            base model is still returned (unchanged fallback behaviour), but
            the caller is told that no adapter was applied, so results labelled
            with that checkpoint would really be base-model results.
    """
    import warnings

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, quantization_config=bnb_config,
        device_map="auto", trust_remote_code=True,
    )
    if adapter_path:
        if Path(adapter_path).exists():
            model = PeftModel.from_pretrained(model, adapter_path)
        else:
            # Previously this case was silent, which made a missing adapter
            # indistinguishable from a real fine-tuned checkpoint.
            warnings.warn(
                f"Adapter path {adapter_path!r} does not exist; returning the "
                f"base model WITHOUT an adapter.",
                RuntimeWarning,
                stacklevel=2,
            )
    model.eval()
    return model, tokenizer


def generate(model, tokenizer, prompt: str, max_new_tokens: int = 200) -> str:
    """Generate a reply to *prompt* and return only the newly produced text.

    The prompt is wrapped in a "### User: / ### Assistant:" frame, tokenized,
    moved to the model's device, and passed to ``model.generate`` with
    sampling disabled. The prompt tokens are sliced off the output before
    decoding, and special tokens are skipped.
    """
    framed = "### User:\n{}\n\n### Assistant:\n".format(prompt)
    encoded = tokenizer(framed, return_tensors="pt").to(model.device)
    prompt_len = encoded["input_ids"].shape[1]

    decode_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,
        "temperature": 1.0,
        "pad_token_id": tokenizer.eos_token_id,
    }
    with torch.no_grad():
        sequences = model.generate(**encoded, **decode_kwargs)

    # Keep only the tokens produced after the prompt.
    completion_ids = sequences[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


print("Functions defined. Load checkpoints in next cell when ready.")

In [None]:
import gc

# Load test set. The sample size is capped at the number of available rows,
# because DataFrame.sample raises when asked for more rows than exist.
TEST_SET_SIZE = 200
df_pool = pd.read_json(DATA_DIR / "dpo_weighted.jsonl", lines=True)
df_test = df_pool.sample(min(TEST_SET_SIZE, len(df_pool)), random_state=99)
prompts = df_test["prompt"].tolist()
if len(prompts) < TEST_SET_SIZE:
    print(f"Only {len(prompts)} rows available; evaluating on all of them.")

# Checkpoint label -> adapter directory (None = plain base model).
CHECKPOINTS = {
    "base": None,
    "sft":  str(MODELS_DIR / f"{RUN_NAME}-sft-final"),
    "dpo":  str(MODELS_DIR / f"{RUN_NAME}-dpo-final"),
}

all_responses = {}

for ckpt_name, adapter_path in CHECKPOINTS.items():
    print(f"\n=== Generating: {ckpt_name} ===")
    model, tokenizer = load_checkpoint(adapter_path)
    try:
        responses = [generate(model, tokenizer, p) for p in tqdm(prompts)]
        all_responses[ckpt_name] = responses
    finally:
        # Free VRAM between checkpoints, even if generation failed part-way.
        # Drop every reference and run the garbage collector first, so that
        # empty_cache() can actually return the model's memory.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()

print("Generation complete.")

## 3. Score All Responses with VJ

In [None]:
DIMS = ["correctness", "groundedness", "problem_solution", "style"]
results = []
failed_counts = {}  # checkpoint -> number of responses the judge could not score


def _to_score(value):
    """Coerce one judge value to float; anything unusable becomes NaN."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float("nan")


for ckpt_name, responses in all_responses.items():
    print(f"Scoring: {ckpt_name}...")
    failed_counts[ckpt_name] = 0
    for prompt, response in tqdm(zip(prompts, responses), total=len(prompts)):
        scores = score_response(prompt, response)
        if not scores or not isinstance(scores, dict):
            # Count failures instead of dropping them silently: a checkpoint
            # with many unscored rows has a mean computed on a biased subset.
            failed_counts[ckpt_name] += 1
            continue
        results.append({
            "checkpoint": ckpt_name,
            "prompt": prompt,
            # Keep the scored text next to its scores so the saved file is
            # auditable and later cells need no lookup by prompt.
            "response": response,
            # Only the known dimensions are kept; a missing one becomes NaN.
            **{dim: _to_score(scores.get(dim)) for dim in DIMS},
        })

# Explicit columns keep the frame well-formed even when nothing was scored.
df_scores = pd.DataFrame(results, columns=["checkpoint", "prompt", "response", *DIMS])
df_scores.to_json(EVAL_DIR / "vj_scores.jsonl", orient="records", lines=True)
print(f"Scored {len(df_scores)} rows.")
for ckpt_name, n_failed in failed_counts.items():
    if n_failed:
        print(f"  {ckpt_name}: {n_failed} responses could not be scored and were skipped.")
df_scores.groupby("checkpoint")[DIMS].mean().round(3)

## 4. Alignment Tax Detection

In [None]:
# Per-checkpoint mean of every dimension. groupby sorts its keys, which yields
# base, dpo, sft; reorder to the training sequence so the bars match the
# "Base → SFT → DPO" title. Unknown checkpoints, if any, are kept at the end.
CKPT_ORDER = ["base", "sft", "dpo"]
mean_scores = df_scores.groupby("checkpoint")[DIMS].mean()
ordered = [c for c in CKPT_ORDER if c in mean_scores.index]
ordered += [c for c in mean_scores.index if c not in ordered]
mean_scores = mean_scores.loc[ordered]

fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, dim in zip(axes, DIMS):
    vals = mean_scores[dim]
    # Highlight the best checkpoint for this dimension.
    colors = ["#4C72B0" if v == vals.max() else "#DD8452" for v in vals]
    ax.bar(vals.index, vals.values, color=colors)
    ax.set_title(dim, fontsize=13)
    ax.set_ylim(0, 1)
    if "base" in vals.index:
        # Reference line at the base model's score.
        ax.axhline(vals["base"], linestyle="--", color="gray", linewidth=1, label="base")
        ax.legend(loc="lower right", fontsize=8)
    for i, v in enumerate(vals.values):
        ax.text(i, v + 0.01, f"{v:.3f}", ha="center", fontsize=10)

plt.suptitle("VJ Scores: Base → SFT → DPO", fontsize=15, y=1.02)
plt.tight_layout()
plt.savefig(EVAL_DIR / "alignment_tax.png", dpi=150, bbox_inches="tight")
plt.show()

# Flag alignment tax: a drop of more than 0.02 from base to dpo.
if {"base", "dpo"} <= set(mean_scores.index):
    for dim in ["correctness", "problem_solution"]:
        delta = mean_scores.loc["dpo", dim] - mean_scores.loc["base", dim]
        if delta < -0.02:
            print(f"⚠  Alignment tax on '{dim}': {delta:+.3f}")
        else:
            print(f"✓  No significant tax on '{dim}': {delta:+.3f}")
else:
    print("Cannot check alignment tax: scores for both 'base' and 'dpo' are required.")

## 5. Meta-Eval — Judge-on-Judge Agreement

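Estimates VJ precision via judge agreement: GPT-4o-mini scores are compared with GPT-4o scores on a random sample of the DPO responses (`META_EVAL_N`, 100 by default). The production target is 500 samples, which requires a test set of at least 500 prompts — the current one has 200.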

In [None]:
META_EVAL_N = 100   # raise to 500 for production; 100 here to control cost

# Sample from the scored DPO rows. The size is capped at what is available,
# because DataFrame.sample raises when asked for more rows than exist.
df_dpo = df_scores[df_scores["checkpoint"] == "dpo"]
n_meta = min(META_EVAL_N, len(df_dpo))
if n_meta < META_EVAL_N:
    print(f"Only {len(df_dpo)} scored DPO rows available; meta-eval uses {n_meta}.")
df_meta = df_dpo.sample(n_meta, random_state=7)

# prompt -> DPO response, built once. setdefault keeps the first occurrence
# of a repeated prompt, the same row list.index() would have picked.
dpo_response_by_prompt = {}
for p, r in zip(prompts, all_responses["dpo"]):
    dpo_response_by_prompt.setdefault(p, r)

strong_judge_scores = []

print(f"Running meta-eval on {n_meta} samples with gpt-4o...")
for row_id, row in tqdm(df_meta.iterrows(), total=n_meta):
    response = dpo_response_by_prompt.get(row["prompt"])
    if response is None:
        # No response on record for this prompt: skip it rather than ask the
        # strong judge to grade an empty string.
        continue
    s = score_response(row["prompt"], response, model="gpt-4o")
    if s:
        strong_judge_scores.append({
            "row_id": row_id,
            "prompt": row["prompt"],
            **{f"strong_{k}": v for k, v in s.items()},
        })

# Join on the row label of df_scores instead of the prompt text, so repeated
# prompts cannot multiply rows. Explicit columns guarantee every strong_<dim>
# column exists even if the strong judge never returned that key.
strong_columns = ["row_id", "prompt", *[f"strong_{dim}" for dim in DIMS]]
df_strong = pd.DataFrame(strong_judge_scores, columns=strong_columns)
df_meta_eval = df_meta.join(
    df_strong.drop(columns="prompt").set_index("row_id"), how="inner"
)

print("\n=== Judge Agreement (Pearson r): VJ-mini vs GPT-4o ===")
for dim in DIMS:
    strong_dim = f"strong_{dim}"
    pair = df_meta_eval[[dim, strong_dim]].apply(pd.to_numeric, errors="coerce").dropna()
    # Pearson r is undefined for fewer than two points or a constant series.
    if len(pair) < 2 or pair[dim].nunique() < 2 or pair[strong_dim].nunique() < 2:
        print(f"  {dim:<22} n/a (need at least 2 paired, non-constant scores; got {len(pair)})")
        continue
    r, p = stats.pearsonr(pair[dim], pair[strong_dim])
    quality = "good" if r > 0.7 else "moderate" if r > 0.5 else "⚠ LOW"
    print(f"  {dim:<22} r={r:.3f}  p={p:.3f}  [{quality}]")

## 6. Score Correlation Heatmap

In [None]:
# One heatmap per checkpoint showing how the four score dimensions correlate
# with each other; all panels share the same -1..1 colour scale.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
heatmap_opts = dict(annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1, cbar=False)

for panel, name in zip(axes, ["base", "sft", "dpo"]):
    is_ckpt = df_scores["checkpoint"] == name
    corr_matrix = df_scores.loc[is_ckpt, DIMS].corr()
    sns.heatmap(corr_matrix, ax=panel, **heatmap_opts)
    panel.set_title(f"{name} — score correlations")

plt.tight_layout()
heatmap_path = EVAL_DIR / "score_correlations.png"
plt.savefig(heatmap_path, dpi=150, bbox_inches="tight")
plt.show()

print("Negative style↔correctness correlation in DPO? That's the alignment tax in numbers.")

---
## Summary

| Checkpoint | correctness | problem_solution | style | alignment tax? |
|------------|-------------|------------------|-------|----------------|
| base       | —           | —                | —     | —              |
| sft        | ↑           | ↑                | ~     | None           |
| dpo        | ±           | ±                | ↑     | Check output   |

**Next:** `04_alignment_pain_points.ipynb` — deeper analysis + remedies.