# Frontier model evaluation (optional)
Run the phishing classification evaluation against hosted frontier models and save results to JSON files.


## Prerequisites
Set an API key for each provider as an environment variable before starting the notebook.

```bash
export OPENAI_API_KEY=...
export ANTHROPIC_API_KEY=...
export GEMINI_API_KEY=...  # or GOOGLE_API_KEY
```

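Update the default model IDs in the evaluation cell below (or set the `OPENAI_MODEL`, `ANTHROPIC_MODEL`, and `GEMINI_MODEL` environment variables) to match the exact frontier model names you want to test.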


In [ ]:
# Install the provider SDKs imported by the evaluation cell (openai, anthropic,
# google.generativeai). Skip this cell if they are already installed.
!pip install -q openai anthropic google-generativeai


## Run evaluation
This saves one JSON results file per provider under `../outputs/` (relative to this notebook), similar to the local model evaluations.


In [ ]:
import json
import os
from datetime import datetime, timezone
from pathlib import Path

from openai import OpenAI
from anthropic import Anthropic
import google.generativeai as genai

# --- Evaluation inputs -------------------------------------------------------
# JSONL test split to read, and the cap on rows sent to each provider.
DATA_PATH = Path("../data/processed_small/test.jsonl")
MAX_SAMPLES = 200

# --- Model IDs ---------------------------------------------------------------
# Each default can be overridden by an environment variable of the same name.
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-5.2")
ANTHROPIC_MODEL = os.environ.get("ANTHROPIC_MODEL", "claude-3")
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-3")

# --- Credentials -------------------------------------------------------------
# Gemini accepts either GEMINI_API_KEY or, as a fallback, GOOGLE_API_KEY.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")

# Stop at the first missing key, checked in provider order, before any client
# object is created.
for _key_value, _missing_message in (
    (OPENAI_API_KEY, "Missing OPENAI_API_KEY"),
    (ANTHROPIC_API_KEY, "Missing ANTHROPIC_API_KEY"),
    (GEMINI_API_KEY, "Missing GEMINI_API_KEY or GOOGLE_API_KEY"),
):
    if not _key_value:
        raise SystemExit(_missing_message)
del _key_value, _missing_message  # loop temporaries only

# --- Provider clients --------------------------------------------------------
openai_client = OpenAI(api_key=OPENAI_API_KEY)
anthropic_client = Anthropic(api_key=ANTHROPIC_API_KEY)
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel(GEMINI_MODEL)

def build_prompt(subject: str, body: str) -> str:
    """Build the instruction-style classification prompt for one email.

    Args:
        subject: Email subject line; ``None`` is treated as an empty string.
        body: Email body text; ``None`` is treated as an empty string.

    Returns:
        The prompt text. It ends with the ``### Response:`` header so that the
        model's completion is expected to be the label itself.
    """
    # A JSON ``null`` field arrives here as None (``dict.get`` only applies its
    # default when the key is absent), so coerce before calling ``strip``.
    subject_text = (subject or "").strip()
    body_text = (body or "").strip()
    return (
        "### Instruction:\n"
        "Classify the email as phishing or benign. Reply with only the label.\n"
        "### Email:\n"
        f"Subject: {subject_text}\n"
        f"Body: {body_text}\n"
        "### Response:\n"
    )

def normalize_label(text: str) -> str:
    """Map a raw model reply onto ``"phishing"``, ``"benign"`` or ``"unknown"``.

    Matching is case-insensitive. A reply containing ``phish`` is phishing;
    otherwise a reply containing ``benign``, or the standalone word ``ham``,
    is benign. Anything else (including an empty or ``None`` reply) is
    ``"unknown"``.

    Args:
        text: Raw reply text from a model.

    Returns:
        One of ``"phishing"``, ``"benign"`` or ``"unknown"``.
    """
    lowered = (text or "").lower()
    if "phish" in lowered:
        return "phishing"
    if "benign" in lowered:
        return "benign"
    # "ham" is accepted only as a whole word: a plain substring test would
    # also score replies such as "shame" or "hamper" as benign.
    words = "".join(ch if ch.isalnum() else " " for ch in lowered).split()
    if "ham" in words:
        return "benign"
    return "unknown"

def call_openai(prompt: str) -> str:
    """Send ``prompt`` to the configured OpenAI chat model and return its reply.

    The request is a single user message sent to ``OPENAI_MODEL`` with
    temperature 0.0 and a 6-token output cap.

    Args:
        prompt: Full prompt text to classify.

    Returns:
        The first choice's message text with surrounding whitespace removed,
        or an empty string when the message carries no text content.
    """
    response = openai_client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        max_tokens=6,
    )
    # ``message.content`` is nullable; fall back to "" the same way
    # ``call_gemini`` does so a text-less reply does not raise AttributeError.
    return (response.choices[0].message.content or "").strip()

def call_anthropic(prompt: str) -> str:
    """Send ``prompt`` to the configured Anthropic model and return its reply.

    The request is a single user message sent to ``ANTHROPIC_MODEL`` with
    temperature 0.0 and a 6-token output cap.

    Args:
        prompt: Full prompt text to classify.

    Returns:
        The text of the first text content block with surrounding whitespace
        removed, or an empty string when the reply has no text block.
    """
    response = anthropic_client.messages.create(
        model=ANTHROPIC_MODEL,
        max_tokens=6,
        temperature=0.0,
        messages=[{"role": "user", "content": prompt}],
    )
    # ``content`` is a list of blocks: it can be empty, and a block is not
    # necessarily text. Pick the first text block instead of indexing [0].
    first_text = next(
        (
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        ),
        "",
    )
    return (first_text or "").strip()

def call_gemini(prompt: str) -> str:
    """Send ``prompt`` to the configured Gemini model and return its reply.

    Generation uses temperature 0.0 and a 6-token output cap. The reply text
    is returned stripped; a missing or empty text yields an empty string.
    """
    config = genai.types.GenerationConfig(temperature=0.0, max_output_tokens=6)
    reply = gemini_model.generate_content(prompt, generation_config=config)
    reply_text = reply.text
    if not reply_text:
        return ""
    return reply_text.strip()

def load_rows(path: Path, max_samples: int) -> list:
    """Load up to ``max_samples`` records from a JSON Lines file.

    Args:
        path: UTF-8 encoded file with one JSON document per line.
        max_samples: Maximum number of records to return.

    Returns:
        The parsed records in file order. Blank lines are skipped and do not
        count towards ``max_samples``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows = []
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            if len(rows) >= max_samples:
                break
            # Blank lines (for example a trailing newline at end of file) are
            # not JSON documents; skip them instead of failing in json.loads.
            if not line.strip():
                continue
            rows.append(json.loads(line))
    return rows

def run_eval(name: str, model_name: str, call_fn, output_file: Path) -> dict:
    """Evaluate one provider on the test split and write a JSON summary.

    Up to ``MAX_SAMPLES`` rows are loaded from ``DATA_PATH``. Each row is
    turned into a prompt, sent through ``call_fn``, and the normalized reply
    is compared with the row's ``"label"`` field. Rows whose request raises
    are counted under ``errors`` and left out of ``total`` and ``accuracy``.

    Args:
        name: Provider label, stored under ``"provider"`` in the summary.
        model_name: Model identifier, stored under ``"model_name"``.
        call_fn: Callable that takes the prompt string and returns the raw
            reply text.
        output_file: Where the JSON summary is written; missing parent
            directories are created.

    Returns:
        The summary dict that was written to ``output_file``.
    """
    max_reported_errors = 3  # keep the output readable if every call fails
    rows = load_rows(DATA_PATH, MAX_SAMPLES)
    correct = 0
    total = 0
    errors = 0
    for row in rows:
        # ``or ""`` also covers fields that are present but JSON null, which
        # ``row.get(key, "")`` would pass through as None.
        prompt = build_prompt(row.get("subject") or "", row.get("body") or "")
        try:
            raw = call_fn(prompt)
        except Exception as exc:
            # Best-effort: one failed request must not abort the run, but the
            # cause (bad model ID, bad key, rate limit) has to be visible.
            errors += 1
            if errors <= max_reported_errors:
                print(f"[{name}] request failed: {type(exc).__name__}: {exc}")
            continue
        pred = normalize_label(raw)
        if pred == row.get("label"):
            correct += 1
        total += 1
    if errors:
        print(f"[{name}] {errors} of {len(rows)} requests failed and were excluded")
    accuracy = (correct / total) if total else 0.0
    payload = {
        "provider": name,
        "model_name": model_name,
        "correct": correct,
        "total": total,
        "accuracy": accuracy,
        "errors": errors,
        "test_file": str(DATA_PATH),
        "max_samples": MAX_SAMPLES,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    print(f"{name} ({model_name}) accuracy: {accuracy:.2%} on {total} samples")
    return payload

# Run the providers one after another (OpenAI, Gemini, Anthropic) and keep each
# summary under a short key; every run also writes its own JSON file.
results = {}
for result_key, provider, model_id, provider_call, summary_path in (
    ("gpt_5_2", "openai", OPENAI_MODEL, call_openai, Path("../outputs/eval_gpt_5_2.json")),
    ("gemini_3", "gemini", GEMINI_MODEL, call_gemini, Path("../outputs/eval_gemini_3.json")),
    ("claude", "anthropic", ANTHROPIC_MODEL, call_anthropic, Path("../outputs/eval_claude.json")),
):
    results[result_key] = run_eval(provider, model_id, provider_call, summary_path)

# Last expression of the cell: displays the collected summaries.
results
