In [55]:
import os
import csv
from statistics import mean
from openai import OpenAI
import anthropic
import google.generativeai as genai
from datasets import load_dataset
from huggingface_hub import InferenceClient
import random
# ====================================
# API Keys
# ====================================
# SECURITY: credentials must never live in source code. Any key that has ever
# been committed to this file must be treated as compromised: revoke it with
# the provider and issue a new one. Supply keys through the environment
# (shell export, a git-ignored .env file, or a secrets manager) before running.
_REQUIRED_API_KEYS = (
    "HF_TOKEN",
    "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY",
    "GOOGLE_API_KEY",
    "XAI_API_KEY",
)
_missing_api_keys = [name for name in _REQUIRED_API_KEYS if not os.environ.get(name)]
if _missing_api_keys:
    # Fail before any dataset download or paid API call is attempted.
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_api_keys)
    )
# ====================================
# Initialize Clients
# ====================================
openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
anthropic_client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Llama requests go through the Hugging Face inference client ("novita" provider).
hf_client = InferenceClient(provider="novita", api_key=os.environ["HF_TOKEN"])

# ====================================
# Load Datasets
# ====================================
print("📚 Loading datasets...")

# Only the "test" split of each benchmark is kept.
mmlu_abstract = load_dataset("brucewlee1/mmlu-abstract-algebra")["test"]
mmlu_college = load_dataset("brucewlee1/mmlu-college-mathematics")["test"]
gsm8k = load_dataset("openai/gsm8k", "main")["test"]
hendrycks_algebra = load_dataset("EleutherAI/hendrycks_math", "algebra")["test"]
hendrycks_counting = load_dataset("EleutherAI/hendrycks_math", "counting_and_probability")["test"]

def sample_balanced(dataset, n_samples=100, seed=None):
    """Sample evenly across difficulty levels.

    Args:
        dataset: Iterable of mapping-like examples, each carrying a "level"
            key. Plain lists of dicts work; column access is not required.
        n_samples: Upper bound on the number of examples returned.
        seed: Optional seed for a private random generator, making the draw
            reproducible. When None, the module-level ``random`` state is used.

    Returns:
        A list of at most ``n_samples`` examples. Each level contributes up to
        ``max(1, n_samples // number_of_levels)`` examples drawn without
        replacement. The list is empty when the dataset is empty or
        ``n_samples`` is not positive.
    """
    if n_samples <= 0:
        return []

    # Bucket examples in one pass instead of rescanning the dataset per level.
    by_level = {}
    for example in dataset:
        by_level.setdefault(example["level"], []).append(example)
    if not by_level:
        # Avoids dividing by zero levels below.
        return []

    rng = random if seed is None else random.Random(seed)
    per_level = max(1, n_samples // len(by_level))
    samples = []
    # A fixed level order keeps the final truncation deterministic; key=str
    # tolerates level labels of mixed types.
    for lvl in sorted(by_level, key=str):
        subset = by_level[lvl]
        samples.extend(rng.sample(subset, min(per_level, len(subset))))
    return samples[:n_samples]

# Draw the level-balanced Hendrycks subsets (algebra first, then counting).
hendrycks_algebra_sampled, hendrycks_counting_sampled = (
    sample_balanced(split, 100)
    for split in (hendrycks_algebra, hendrycks_counting)
)

# Benchmark label -> (examples, number of questions to evaluate).
datasets_cfg = {
    label: (examples, question_count)
    for label, examples, question_count in (
        ("MMLU-abstract", mmlu_abstract, 99),
        ("MMLU-college", mmlu_college, 99),
        ("GSM-8k", gsm8k, 102),
        ("Hendrycks-Algebra", hendrycks_algebra_sampled, 100),
        ("Hendrycks-Counting", hendrycks_counting_sampled, 100),
    )
}


# Labels of the models under evaluation; each is routed to a backend by the
# substring it contains.
models = [
    "gpt-4o-mini",
    "grok-3-mini-beta",
    "claude-3-5-haiku-20241022",
    "gemini-2.0-flash",
    "llama-3.3-70b-it",
]

# Model that grades every answer.
judge_model = "gpt-4.1-mini"

# ====================================
# Model Generators
# ====================================

def generate_4o_answer(question):
    """Ask gpt-4o-mini for a terse final answer to ``question``.

    A system turn requests only the final result, the question is sent as the
    user turn, and the completion is capped at 1000 tokens.

    Returns:
        The first choice's message text with surrounding whitespace removed.
    """
    conversation = [
        {"role": "system", "content": "Answer concisely with only the final result."},
        {"role": "user", "content": question},
    ]
    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        max_tokens=1000,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# Lazily created xAI client, shared by every Grok request.
_grok_client = None


def _get_grok_client():
    """Return the shared xAI client, creating it on first use."""
    global _grok_client
    if _grok_client is None:
        # xAI is reached through the OpenAI SDK pointed at the xAI base URL.
        _grok_client = OpenAI(
            base_url="https://api.x.ai/v1",
            api_key=os.environ["XAI_API_KEY"],
        )
    return _grok_client


def generate_grok_answer(question):
    """Ask grok-3-mini-beta for a terse final answer to ``question``.

    The client is reused across calls rather than rebuilt per question, and
    the request carries the same "final result only" system instruction as
    the other chat-completions backends so all models are prompted alike.

    Returns:
        The first choice's message text with surrounding whitespace removed.
    """
    resp = _get_grok_client().chat.completions.create(
        model="grok-3-mini-beta",
        messages=[
            {"role": "system", "content": "Answer concisely with only the final result."},
            {"role": "user", "content": question},
        ],
        max_tokens=1000,
    )
    return resp.choices[0].message.content.strip()

def generate_claude_answer(question):
    """Ask claude-3-5-haiku-20241022 for a terse final answer to ``question``.

    The brevity instruction is appended to the question inside a single user
    turn, and the reply is capped at 1000 tokens.

    Returns:
        The text of the first content block with surrounding whitespace removed.
    """
    user_turn = {
        "role": "user",
        "content": f"{question}\nAnswer concisely with only the final answer.",
    }
    reply = anthropic_client.messages.create(
        model="claude-3-5-haiku-20241022",
        max_tokens=1000,
        messages=[user_turn],
    )
    leading_block = reply.content[0]
    return leading_block.text.strip()

def generate_gemini_answer(question):
    """Ask gemini-2.0-flash for a terse final answer to ``question``.

    The brevity instruction is appended to the question, and generation is
    capped at 1000 output tokens.

    Returns:
        The response text with surrounding whitespace removed.
    """
    gemini = genai.GenerativeModel("gemini-2.0-flash")
    prompt = f"{question}\nAnswer concisely with only the final answer."
    output_cap = genai.types.GenerationConfig(max_output_tokens=1000)
    reply = gemini.generate_content(prompt, generation_config=output_cap)
    return reply.text.strip()

def generate_llama_answer(question):
    """Ask meta-llama/Llama-3.3-70B-Instruct for a terse final answer.

    The request goes through the Hugging Face inference client with a system
    turn requesting only the final result, capped at 1000 tokens.

    Returns:
        The first choice's message text with surrounding whitespace removed.
    """
    conversation = [
        {"role": "system", "content": "Answer concisely with only the final result."},
        {"role": "user", "content": question},
    ]
    completion = hf_client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct",
        messages=conversation,
        max_tokens=1000,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# unified routing
def generate_model_answer(model_name, question):
    """Route ``question`` to the backend whose keyword occurs in ``model_name``.

    Keywords are tried in the order gpt, grok, claude, gemini, llama, and the
    first case-sensitive substring match wins.

    Returns:
        The backend's answer, "UNKNOWN MODEL" when no keyword matches, or
        "ERROR" when anything raises (a warning is printed in that case).
    """
    # Lambdas postpone resolving each backend until it is actually selected.
    routes = (
        ("gpt", lambda q: generate_4o_answer(q)),
        ("grok", lambda q: generate_grok_answer(q)),
        ("claude", lambda q: generate_claude_answer(q)),
        ("gemini", lambda q: generate_gemini_answer(q)),
        ("llama", lambda q: generate_llama_answer(q)),
    )
    try:
        for keyword, backend in routes:
            if keyword in model_name:
                return backend(question)
        return "UNKNOWN MODEL"
    except Exception as e:
        print(f"[Warning] {model_name} failed: {e}")
        return "ERROR"

# ====================================
# Judge (GPT-4.1-mini)
# ====================================

def judge_answer(question, correct_answer, model_answer):
    """Use the judge model to decide correctness (1/0).

    Args:
        question: The question text shown to the evaluated model.
        correct_answer: The reference answer from the dataset.
        model_answer: The evaluated model's answer.

    Returns:
        1 when the judge replies "1" (surrounding whitespace, quotes,
        backticks, asterisks and periods are ignored), otherwise 0. Also 0
        when the answer is a generation-failure sentinel or the judge call
        raises (a warning is printed in that case).
    """
    # "ERROR" / "UNKNOWN MODEL" are what generate_model_answer returns when no
    # real answer was produced; they can never be correct, so skip the API call.
    if model_answer in ("ERROR", "UNKNOWN MODEL"):
        return 0

    prompt = f"""
You are an accurate grader.

Question: {question}
Ground Truth Answer: {correct_answer}
Model Answer: {model_answer}

Decide if the model answer means the same as the correct one.
If correct, return ONLY "1".
If incorrect, return ONLY "0".
No explanations.
"""
    try:
        resp = openai_client.chat.completions.create(
            model=judge_model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=5,
            # Greedy decoding so regrading the same answer is as stable as possible.
            temperature=0,
        )
        # content may be None; the prompt itself quotes the digits, so the
        # judge sometimes echoes them as "1" or 1. rather than a bare digit.
        verdict = (resp.choices[0].message.content or "").strip().strip("\"'`*. ")
        return 1 if verdict == "1" else 0
    except Exception as e:
        print(f"[Warning] Judge failed: {e}")
        return 0

# ====================================
# Load Question & Ground Truth
# ====================================
def load_question(dataset_name, ds, idx):
    """Fetch one example and return ``(question, correct_answer, level)``.

    ``idx`` wraps around the length of ``ds``. The fields read depend on the
    dataset label: labels containing "GSM" use "question"/"answer", labels
    containing "MMLU" use "centerpiece" and the first entry of
    "correct_options_literal", and anything else uses "problem"/"solution".
    ``level`` falls back to "N/A" when the example has no "level" key.
    """
    row = ds[idx % len(ds)]
    difficulty = row.get("level", "N/A")

    if "GSM" in dataset_name:
        return row["question"], row["answer"], difficulty
    if "MMLU" in dataset_name:
        return row["centerpiece"], row["correct_options_literal"][0], difficulty
    return row["problem"], row["solution"], difficulty
# ====================================
# Main Evaluation Loop
# ====================================
# One row per (question, model) pair, in evaluation order.
results = []

for dataset_name, (dataset, n) in datasets_cfg.items():
    for i in range(n):
        question, correct_answer, level = load_question(dataset_name, dataset, i)
        # Question ids are 1-based within each dataset.
        qid = f"{dataset_name}_{i+1}"

        # Every model answers the same question before moving to the next one.
        for model_name in models:
            print(f"🧩 Evaluating {model_name} on {dataset_name} Q{i+1}")
            model_answer = generate_model_answer(model_name, question)
            score = judge_answer(question, correct_answer, model_answer)

            results.append({
                "question_id": qid,
                "question": question,
                "correct_answer": correct_answer,
                "model_name": model_name,
                "model_answer": model_answer,
                "judge_result": score,
                "level": level,
            })

# ====================================
# Save CSV
# ====================================
# newline="" lets the csv module control line endings itself.
with open("evaluation_results.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(
        f,
        # Column order of the CSV; every key of a result row appears here.
        fieldnames=[
            "question_id",
            "question",
            "model_name",
            "correct_answer",
            "model_answer",
            "judge_result",
            "level",
        ],
    )
    writer.writeheader()
    writer.writerows(results)

print("✅ Evaluation complete. Results saved to evaluation_results.csv")

# ====================================
# Accuracy Summary
# ====================================
print("\n📊 Accuracy Summary:")
# model name -> list of 0/1 judge results across all datasets.
model_groups = {}
for r in results:
    model_groups.setdefault(r["model_name"], []).append(r["judge_result"])
for m, vals in model_groups.items():
    # The mean of 0/1 scores is the fraction judged correct.
    acc = mean(vals)
    print(f"{m:25s}  accuracy = {acc:.2%}")


📚 Loading datasets...
🧩 Evaluating gpt-4o-mini on MMLU-abstract Q1
🧩 Evaluating grok-3-mini-beta on MMLU-abstract Q1
🧩 Evaluating claude-3-5-haiku-20241022 on MMLU-abstract Q1
🧩 Evaluating gemini-2.0-flash on MMLU-abstract Q1
🧩 Evaluating llama-3.3-70b-it on MMLU-abstract Q1
🧩 Evaluating gpt-4o-mini on MMLU-abstract Q2
🧩 Evaluating grok-3-mini-beta on MMLU-abstract Q2
🧩 Evaluating claude-3-5-haiku-20241022 on MMLU-abstract Q2
🧩 Evaluating gemini-2.0-flash on MMLU-abstract Q2
🧩 Evaluating llama-3.3-70b-it on MMLU-abstract Q2
🧩 Evaluating gpt-4o-mini on MMLU-abstract Q3
🧩 Evaluating grok-3-mini-beta on MMLU-abstract Q3
🧩 Evaluating claude-3-5-haiku-20241022 on MMLU-abstract Q3
🧩 Evaluating gemini-2.0-flash on MMLU-abstract Q3
🧩 Evaluating llama-3.3-70b-it on MMLU-abstract Q3
🧩 Evaluating gpt-4o-mini on MMLU-abstract Q4
🧩 Evaluating grok-3-mini-beta on MMLU-abstract Q4
🧩 Evaluating claude-3-5-haiku-20241022 on MMLU-abstract Q4
🧩 