In [None]:
%load_ext autoreload
%autoreload 2

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
import os
from tqdm import tqdm
import glob
import csv
from typing import Any

from cs336_alignment.zeroshot import parse_mmlu_response

# Load Qwen model and tokenizer
model_path = "/home/alvin/Homework/s2025-assignment3-alignment/notebooks/qwen2.5-3B-instruct-finetuned/final_model"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16, device_map="auto")

# MMLU test folder
mmlu_test_dir = "../data/mmlu/test"
csv_files = glob.glob(os.path.join(mmlu_test_dir, "*.csv"))

# Format input prompts
def format_prompt(example):
    question = example["question"]
    for i, choice in enumerate(example['choices']):
        question += f"({chr(65+i)}) {choice}  "
    prompt = (
            "Below is an instruction that describes a task. Write a response that appropriately completes the request."
            f"\n\n### Instruction:\n{question}\n\n### Response:\n"
        )
    prompt += "Answer:"
    return prompt

# Generate predictions
def generate_answer(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=10)
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded

# Evaluate each subject
os.makedirs("mmlu_evaluation_results_finetuned", exist_ok=True)

for file_path in csv_files[:10]:
    category = os.path.splitext(os.path.basename(file_path))[0]
    examples = []

    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            examples.append({
                "question": row[0],
                "choices": row[1:5],
                "answer": row[5].strip().upper(),
                "subject": category
            })

    results = []
    correct = 0

    for example in tqdm(examples, desc=f"Evaluating {category}"):
        prompt = format_prompt(example)
        # break
        raw_output = generate_answer(prompt)
        pred = parse_mmlu_response(example, raw_output)
        gt = example['answer']
        results.append({
            "question": example['question'],
            "choices": example['choices'],
            "ground_truth": gt,
            "prediction": pred,
            "raw_output": raw_output
        })
        if pred == gt:
            correct += 1

    accuracy = correct / len(results)
    print(f"Zero-shot accuracy on {category}: {accuracy:.2%}")
    with open(f"mmlu_evaluation_results_finetuned/qwen_mmlu_{category}.json", "w") as f:
        json.dump({"accuracy": accuracy, "results": results}, f, indent=2)

  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 3/3 [00:38<00:00, 12.86s/it]
Evaluating high_school_government_and_politics_test: 100%|██████████| 193/193 [01:21<00:00,  2.36it/s]


Zero-shot accuracy on high_school_government_and_politics_test: 40.41%


Evaluating security_studies_test: 100%|██████████| 245/245 [01:38<00:00,  2.48it/s]


Zero-shot accuracy on security_studies_test: 22.45%


Evaluating global_facts_test: 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


Zero-shot accuracy on global_facts_test: 22.00%


Evaluating sociology_test: 100%|██████████| 201/201 [01:19<00:00,  2.52it/s]


Zero-shot accuracy on sociology_test: 36.32%


Evaluating high_school_european_history_test: 100%|██████████| 165/165 [01:06<00:00,  2.46it/s]


Zero-shot accuracy on high_school_european_history_test: 36.97%


Evaluating college_biology_test: 100%|██████████| 144/144 [00:58<00:00,  2.48it/s]


Zero-shot accuracy on college_biology_test: 36.81%


Evaluating high_school_psychology_test: 100%|██████████| 545/545 [03:37<00:00,  2.51it/s]


Zero-shot accuracy on high_school_psychology_test: 35.41%


Evaluating astronomy_test: 100%|██████████| 152/152 [01:00<00:00,  2.52it/s]


Zero-shot accuracy on astronomy_test: 25.66%


Evaluating electrical_engineering_test: 100%|██████████| 145/145 [00:57<00:00,  2.50it/s]


Zero-shot accuracy on electrical_engineering_test: 37.93%


Evaluating logical_fallacies_test: 100%|██████████| 163/163 [01:06<00:00,  2.45it/s]

Zero-shot accuracy on logical_fallacies_test: 33.13%





In [3]:
import glob
import csv
import json
import os

def summarize_evaluation_results(results_dir="mmlu_evaluation_results_finetuned"):
    total_correct = 0
    total_questions = 0
    total_none_outputs = 0

    result_files = glob.glob(os.path.join(results_dir, "qwen_mmlu_*.json"))
    
    for file_path in result_files:
        with open(file_path, "r") as f:
            data = json.load(f)
            results = data["results"]
            for result in results:
                pred = result["prediction"]
                gt = result["ground_truth"]
                if pred == "None" or pred == "none" or pred == None: 
                    total_none_outputs += 1
                if pred == gt:
                    total_correct += 1
                total_questions += 1

    overall_accuracy = total_correct / total_questions if total_questions > 0 else 0.0
    print(f"Overall accuracy across all subjects: {overall_accuracy:.2%}")
    print(f"Total predictions: {total_questions}")
    print(f"Total 'None' predictions: {total_none_outputs}")
summarize_evaluation_results()

Overall accuracy across all subjects: 33.27%
Total predictions: 2053
Total 'None' predictions: 0
