In [2]:
%load_ext autoreload
%autoreload 2

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
import os
from tqdm import tqdm
import glob
import csv
from typing import Any

from cs336_alignment.zeroshot import parse_mmlu_response

# Load Qwen model and tokenizer
model_path = "../models/Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16, device_map="auto")

# MMLU test folder
mmlu_test_dir = "../data/mmlu/test"
csv_files = glob.glob(os.path.join(mmlu_test_dir, "*.csv"))

# Format input prompts
def format_prompt(example):
    prompt = f"Question: {example['question']}\n"
    for i, choice in enumerate(example['choices']):
        prompt += f"({chr(65+i)}) {choice}  "
    prompt += "\nAnswer:"
    return prompt

# Generate predictions
def generate_answer(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=10)
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded

# Evaluate each subject
os.makedirs("evaluation_results", exist_ok=True)

for file_path in csv_files:
    category = os.path.splitext(os.path.basename(file_path))[0]
    examples = []

    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            examples.append({
                "question": row[0],
                "choices": row[1:5],
                "answer": row[5].strip().upper(),
                "subject": category
            })

    results = []
    correct = 0

    for example in tqdm(examples, desc=f"Evaluating {category}"):
        prompt = format_prompt(example)
        # break
        raw_output = generate_answer(prompt)
        pred = parse_mmlu_response(example, raw_output)
        gt = example['answer']
        results.append({
            "question": example['question'],
            "choices": example['choices'],
            "ground_truth": gt,
            "prediction": pred,
            "raw_output": raw_output
        })
        if pred == gt:
            correct += 1

    accuracy = correct / len(results)
    print(f"Zero-shot accuracy on {category}: {accuracy:.2%}")
    with open(f"evaluation_results/qwen_mmlu_{category}.json", "w") as f:
        json.dump({"accuracy": accuracy, "results": results}, f, indent=2)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Evaluating high_school_government_and_politics_test:   0%|          | 0/193 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_government_and_politics_test:   1%|          | 1/193 [00:00<00:22,  8.68it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_government_and_politics_test:   1%|          | 2/193 [00:00<00:21,  8.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_government_and_politics_test:   2%|▏         | 4/193 [00:00<00:27,  6.83it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_government_and_politics_test:   3%|▎         | 5/193 [00:00<00:25,  7.37it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_government_and_politics_test:   3%|▎         | 

Zero-shot accuracy on high_school_government_and_politics_test: 53.37%


Evaluating security_studies_test:   0%|          | 0/245 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating security_studies_test:   0%|          | 1/245 [00:00<01:07,  3.61it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating security_studies_test:   1%|          | 2/245 [00:00<01:06,  3.65it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating security_studies_test:   1%|          | 3/245 [00:00<00:49,  4.93it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating security_studies_test:   2%|▏         | 4/245 [00:00<00:55,  4.35it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating security_studies_test:   2%|▏         | 5/245 [00:01<00:45,  5.31it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating security_studies_test:   2%|▏         | 6/245 [00:01<00:39,  6.11it/s]Set

Zero-shot accuracy on security_studies_test: 45.71%


Evaluating global_facts_test:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating global_facts_test:   1%|          | 1/100 [00:00<00:26,  3.74it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating global_facts_test:   2%|▏         | 2/100 [00:00<00:17,  5.69it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating global_facts_test:   3%|▎         | 3/100 [00:00<00:21,  4.57it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating global_facts_test:   4%|▍         | 4/100 [00:00<00:16,  5.70it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating global_facts_test:   5%|▌         | 5/100 [00:01<00:19,  4.77it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating global_facts_test:   6%|▌         | 6/100 [00:01<00:16,  5.68it/s]Setting `pad_token_id` to `eos_

Zero-shot accuracy on global_facts_test: 21.00%


Evaluating sociology_test:   0%|          | 0/201 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating sociology_test:   0%|          | 1/201 [00:00<00:22,  9.02it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating sociology_test:   1%|          | 2/201 [00:00<00:21,  9.09it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating sociology_test:   1%|▏         | 3/201 [00:00<00:21,  9.11it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating sociology_test:   2%|▏         | 4/201 [00:00<00:33,  5.81it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating sociology_test:   2%|▏         | 5/201 [00:00<00:40,  4.85it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating sociology_test:   3%|▎         | 6/201 [00:01<00:44,  4.42it/s]Setting `pad_token_id` to `eos_token_id`:151643 for 

Zero-shot accuracy on sociology_test: 61.69%


Evaluating high_school_european_history_test:   0%|          | 0/165 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_european_history_test:   1%|          | 1/165 [00:00<00:18,  8.70it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_european_history_test:   1%|          | 2/165 [00:00<00:18,  8.84it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_european_history_test:   2%|▏         | 3/165 [00:00<00:18,  8.93it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_european_history_test:   2%|▏         | 4/165 [00:00<00:18,  8.81it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_european_history_test:   3%|▎         | 5/165 [00:00<00:17,  8.91it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating h

Zero-shot accuracy on high_school_european_history_test: 48.48%


Evaluating college_biology_test:   0%|          | 0/144 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_biology_test:   1%|▏         | 2/144 [00:00<00:11, 12.03it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_biology_test:   3%|▎         | 4/144 [00:00<00:20,  6.87it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_biology_test:   3%|▎         | 5/144 [00:00<00:18,  7.37it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_biology_test:   4%|▍         | 6/144 [00:00<00:24,  5.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_biology_test:   5%|▍         | 7/144 [00:01<00:21,  6.43it/s]Setting `pad_toke

Zero-shot accuracy on college_biology_test: 37.50%


Evaluating high_school_psychology_test:   0%|          | 0/545 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_psychology_test:   0%|          | 1/545 [00:00<02:25,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_psychology_test:   0%|          | 2/545 [00:00<01:35,  5.70it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_psychology_test:   1%|          | 3/545 [00:00<01:58,  4.57it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_psychology_test:   1%|          | 4/545 [00:00<02:10,  4.16it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_psychology_test:   1%|          | 5/545 [00:01<01:44,  5.16it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_psychology_test:   1%|   

Zero-shot accuracy on high_school_psychology_test: 60.00%


Evaluating astronomy_test:   0%|          | 0/152 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating astronomy_test:   1%|          | 1/152 [00:00<00:16,  9.05it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating astronomy_test:   1%|▏         | 2/152 [00:00<00:30,  4.88it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating astronomy_test:   2%|▏         | 3/152 [00:00<00:24,  6.15it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating astronomy_test:   3%|▎         | 4/152 [00:00<00:21,  7.04it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating astronomy_test:   3%|▎         | 5/152 [00:00<00:27,  5.31it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating astronomy_test:   4%|▍         | 6/152 [00:01<00:31,  4.61it/s]Setting `pad_token_id` to `eos_token_id`:151643 for 

Zero-shot accuracy on astronomy_test: 49.34%


Evaluating electrical_engineering_test:   0%|          | 0/145 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating electrical_engineering_test:   1%|          | 1/145 [00:00<00:38,  3.75it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating electrical_engineering_test:   1%|▏         | 2/145 [00:00<00:38,  3.74it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating electrical_engineering_test:   2%|▏         | 3/145 [00:00<00:38,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating electrical_engineering_test:   3%|▎         | 4/145 [00:01<00:37,  3.74it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating electrical_engineering_test:   3%|▎         | 5/145 [00:01<00:29,  4.75it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating electrical_engineering_test:   4%|▍  

Zero-shot accuracy on electrical_engineering_test: 51.72%


Evaluating logical_fallacies_test:   0%|          | 0/163 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating logical_fallacies_test:   1%|          | 1/163 [00:00<00:45,  3.56it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating logical_fallacies_test:   1%|          | 2/163 [00:00<00:44,  3.64it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating logical_fallacies_test:   2%|▏         | 3/163 [00:00<00:31,  5.03it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating logical_fallacies_test:   2%|▏         | 4/163 [00:00<00:36,  4.34it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating logical_fallacies_test:   3%|▎         | 5/163 [00:01<00:29,  5.31it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating logical_fallacies_test:   4%|▎         | 6/163 [00:01<00:25,  6.17i

Zero-shot accuracy on logical_fallacies_test: 49.08%


Evaluating nutrition_test:   0%|          | 0/306 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating nutrition_test:   0%|          | 1/306 [00:00<00:33,  9.11it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating nutrition_test:   1%|          | 2/306 [00:00<00:33,  9.15it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating nutrition_test:   1%|          | 3/306 [00:00<00:33,  8.98it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating nutrition_test:   1%|▏         | 4/306 [00:00<00:52,  5.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating nutrition_test:   2%|▏         | 5/306 [00:00<01:02,  4.78it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating nutrition_test:   2%|▏         | 6/306 [00:01<01:08,  4.35it/s]Setting `pad_token_id` to `eos_token_id`:151643 for 

Zero-shot accuracy on nutrition_test: 48.69%


Evaluating high_school_biology_test:   0%|          | 0/310 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_biology_test:   0%|          | 1/310 [00:00<00:34,  8.95it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_biology_test:   1%|          | 2/310 [00:00<00:34,  8.98it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_biology_test:   1%|          | 3/310 [00:00<00:34,  8.98it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_biology_test:   1%|▏         | 4/310 [00:00<00:34,  8.84it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_biology_test:   2%|▏         | 5/310 [00:00<00:34,  8.88it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_biology_test:   2%|▏         | 6/310 [00:00

Zero-shot accuracy on high_school_biology_test: 50.65%


Evaluating high_school_macroeconomics_test:   0%|          | 0/390 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_macroeconomics_test:   0%|          | 1/390 [00:00<00:43,  8.99it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_macroeconomics_test:   1%|          | 2/390 [00:00<00:42,  9.04it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_macroeconomics_test:   1%|          | 3/390 [00:00<01:11,  5.44it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_macroeconomics_test:   1%|          | 4/390 [00:00<00:59,  6.43it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_macroeconomics_test:   1%|▏         | 5/390 [00:00<01:12,  5.31it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_m

Zero-shot accuracy on high_school_macroeconomics_test: 42.56%


Evaluating virology_test:   0%|          | 0/166 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating virology_test:   1%|          | 1/166 [00:00<00:43,  3.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating virology_test:   1%|          | 2/166 [00:00<00:44,  3.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating virology_test:   2%|▏         | 3/166 [00:00<00:43,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating virology_test:   2%|▏         | 4/166 [00:01<00:43,  3.74it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating virology_test:   3%|▎         | 5/166 [00:01<00:34,  4.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating virology_test:   4%|▎         | 6/166 [00:01<00:28,  5.60it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-en

Zero-shot accuracy on virology_test: 39.16%


Evaluating machine_learning_test:   0%|          | 0/112 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating machine_learning_test:   1%|          | 1/112 [00:00<00:29,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating machine_learning_test:   2%|▏         | 2/112 [00:00<00:19,  5.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating machine_learning_test:   3%|▎         | 3/112 [00:00<00:23,  4.61it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating machine_learning_test:   4%|▎         | 4/112 [00:00<00:25,  4.23it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating machine_learning_test:   4%|▍         | 5/112 [00:01<00:26,  4.04it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating machine_learning_test:   5%|▌         | 6/112 [00:01<00:27,  3.91it/s]Set

Zero-shot accuracy on machine_learning_test: 33.04%


Evaluating jurisprudence_test:   0%|          | 0/108 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating jurisprudence_test:   1%|          | 1/108 [00:00<00:13,  7.88it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating jurisprudence_test:   2%|▏         | 2/108 [00:00<00:12,  8.37it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating jurisprudence_test:   3%|▎         | 3/108 [00:00<00:19,  5.26it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating jurisprudence_test:   4%|▎         | 4/108 [00:00<00:16,  6.28it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating jurisprudence_test:   5%|▍         | 5/108 [00:00<00:20,  4.96it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating jurisprudence_test:   6%|▌         | 6/108 [00:01<00:22,  4.44it/s]Setting `pad_token_id` t

Zero-shot accuracy on jurisprudence_test: 51.85%


Evaluating professional_psychology_test:   0%|          | 0/612 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_psychology_test:   0%|          | 1/612 [00:00<01:08,  8.93it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_psychology_test:   0%|          | 2/612 [00:00<02:04,  4.91it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_psychology_test:   0%|          | 3/612 [00:00<02:21,  4.30it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_psychology_test:   1%|          | 4/612 [00:00<02:30,  4.05it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_psychology_test:   1%|          | 5/612 [00:01<02:00,  5.06it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_psychology_test:  

Zero-shot accuracy on professional_psychology_test: 40.69%


Evaluating abstract_algebra_test:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating abstract_algebra_test:   1%|          | 1/100 [00:00<00:26,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating abstract_algebra_test:   2%|▏         | 2/100 [00:00<00:26,  3.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating abstract_algebra_test:   3%|▎         | 3/100 [00:00<00:25,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating abstract_algebra_test:   4%|▍         | 4/100 [00:01<00:25,  3.75it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating abstract_algebra_test:   5%|▌         | 5/100 [00:01<00:25,  3.70it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating abstract_algebra_test:   6%|▌         | 6/100 [00:01<00:25,  3.70it/s]Set

Zero-shot accuracy on abstract_algebra_test: 33.00%


Evaluating econometrics_test:   0%|          | 0/114 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating econometrics_test:   1%|          | 1/114 [00:00<00:30,  3.70it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating econometrics_test:   2%|▏         | 2/114 [00:00<00:29,  3.75it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating econometrics_test:   3%|▎         | 3/114 [00:00<00:29,  3.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating econometrics_test:   4%|▎         | 4/114 [00:01<00:29,  3.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating econometrics_test:   4%|▍         | 5/114 [00:01<00:29,  3.70it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating econometrics_test:   5%|▌         | 6/114 [00:01<00:29,  3.71it/s]Setting `pad_token_id` to `eos_

Zero-shot accuracy on econometrics_test: 28.07%


Evaluating high_school_mathematics_test:   0%|          | 0/270 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_mathematics_test:   0%|          | 1/270 [00:00<01:12,  3.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_mathematics_test:   1%|          | 2/270 [00:00<01:11,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_mathematics_test:   1%|          | 3/270 [00:00<01:11,  3.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_mathematics_test:   1%|▏         | 4/270 [00:01<01:11,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_mathematics_test:   2%|▏         | 5/270 [00:01<01:11,  3.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_mathematics_test:  

Zero-shot accuracy on high_school_mathematics_test: 23.70%


Evaluating high_school_computer_science_test:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_computer_science_test:   1%|          | 1/100 [00:00<00:26,  3.78it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_computer_science_test:   2%|▏         | 2/100 [00:00<00:25,  3.78it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_computer_science_test:   3%|▎         | 3/100 [00:00<00:25,  3.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_computer_science_test:   4%|▍         | 4/100 [00:00<00:19,  4.88it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_computer_science_test:   5%|▌         | 5/100 [00:01<00:16,  5.86it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating h

Zero-shot accuracy on high_school_computer_science_test: 50.00%


Evaluating philosophy_test:   0%|          | 0/311 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating philosophy_test:   0%|          | 1/311 [00:00<01:22,  3.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating philosophy_test:   1%|          | 2/311 [00:00<00:54,  5.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating philosophy_test:   1%|          | 3/311 [00:00<01:06,  4.61it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating philosophy_test:   1%|▏         | 4/311 [00:00<01:14,  4.13it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating philosophy_test:   2%|▏         | 5/311 [00:01<00:59,  5.12it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating philosophy_test:   2%|▏         | 6/311 [00:01<00:50,  5.99it/s]Setting `pad_token_id` to `eos_token_id`:1516

Zero-shot accuracy on philosophy_test: 41.80%


Evaluating college_chemistry_test:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_chemistry_test:   1%|          | 1/100 [00:00<00:26,  3.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_chemistry_test:   3%|▎         | 3/100 [00:00<00:18,  5.26it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_chemistry_test:   4%|▍         | 4/100 [00:00<00:20,  4.61it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_chemistry_test:   5%|▌         | 5/100 [00:01<00:22,  4.22it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_chemistry_test:   6%|▌         | 6/100 [00:01<00:23,  4.04it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evalu

Zero-shot accuracy on college_chemistry_test: 31.00%


Evaluating human_sexuality_test:   0%|          | 0/131 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_sexuality_test:   1%|          | 1/131 [00:00<00:34,  3.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_sexuality_test:   2%|▏         | 2/131 [00:00<00:22,  5.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_sexuality_test:   2%|▏         | 3/131 [00:00<00:18,  6.81it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_sexuality_test:   3%|▎         | 4/131 [00:00<00:24,  5.14it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_sexuality_test:   4%|▍         | 5/131 [00:00<00:20,  6.06it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_sexuality_test:   5%|▍         | 6/131 [00:01<00:25,  4.99it/s]Setting `p

Zero-shot accuracy on human_sexuality_test: 48.09%


Evaluating high_school_chemistry_test:   0%|          | 0/203 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_chemistry_test:   0%|          | 1/203 [00:00<00:22,  9.03it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_chemistry_test:   1%|          | 2/203 [00:00<00:41,  4.88it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_chemistry_test:   1%|▏         | 3/203 [00:00<00:32,  6.22it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_chemistry_test:   2%|▏         | 4/203 [00:00<00:40,  4.93it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_chemistry_test:   2%|▏         | 5/203 [00:01<00:44,  4.41it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_chemistry_test:   3%|▎         

Zero-shot accuracy on high_school_chemistry_test: 37.93%


Evaluating human_aging_test:   0%|          | 0/223 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_aging_test:   0%|          | 1/223 [00:00<00:59,  3.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_aging_test:   1%|          | 2/223 [00:00<01:00,  3.64it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_aging_test:   1%|▏         | 3/223 [00:00<01:00,  3.66it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_aging_test:   2%|▏         | 4/223 [00:01<00:59,  3.69it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_aging_test:   2%|▏         | 5/223 [00:01<00:59,  3.68it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating human_aging_test:   3%|▎         | 6/223 [00:01<00:58,  3.68it/s]Setting `pad_token_id` to `eos_token_i

Zero-shot accuracy on human_aging_test: 41.70%


Evaluating anatomy_test:   0%|          | 0/135 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating anatomy_test:   1%|          | 1/135 [00:00<00:35,  3.75it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating anatomy_test:   1%|▏         | 2/135 [00:00<00:23,  5.75it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating anatomy_test:   2%|▏         | 3/135 [00:00<00:28,  4.58it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating anatomy_test:   3%|▎         | 4/135 [00:00<00:31,  4.20it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating anatomy_test:   4%|▎         | 5/135 [00:01<00:25,  5.20it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating anatomy_test:   4%|▍         | 6/135 [00:01<00:21,  6.07it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end gener

Zero-shot accuracy on anatomy_test: 46.67%


Evaluating management_test:   0%|          | 0/103 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating management_test:   1%|          | 1/103 [00:00<00:11,  8.92it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating management_test:   2%|▏         | 2/103 [00:00<00:20,  4.92it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating management_test:   3%|▎         | 3/103 [00:00<00:23,  4.28it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating management_test:   5%|▍         | 5/103 [00:00<00:14,  6.70it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating management_test:   6%|▌         | 6/103 [00:00<00:13,  7.22it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating management_test:   7%|▋         | 7/

Zero-shot accuracy on management_test: 62.14%


Evaluating college_medicine_test:   0%|          | 0/173 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_medicine_test:   1%|          | 1/173 [00:00<00:45,  3.77it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_medicine_test:   1%|          | 2/173 [00:00<00:29,  5.74it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_medicine_test:   2%|▏         | 3/173 [00:00<00:37,  4.58it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_medicine_test:   2%|▏         | 4/173 [00:00<00:40,  4.22it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_medicine_test:   3%|▎         | 5/173 [00:01<00:41,  4.02it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_medicine_test:   3%|▎         | 6/173 [00:01<00:42,  3.92it/s]Set

Zero-shot accuracy on college_medicine_test: 50.29%


Evaluating computer_security_test:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating computer_security_test:   1%|          | 1/100 [00:00<00:26,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating computer_security_test:   2%|▏         | 2/100 [00:00<00:26,  3.75it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating computer_security_test:   3%|▎         | 3/100 [00:00<00:19,  5.10it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating computer_security_test:   4%|▍         | 4/100 [00:00<00:15,  6.14it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating computer_security_test:   5%|▌         | 5/100 [00:01<00:19,  4.99it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating computer_security_test:   6%|▌         | 6/100 [00:01<00:21,  4.45i

Zero-shot accuracy on computer_security_test: 54.00%


Evaluating marketing_test:   0%|          | 0/234 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating marketing_test:   0%|          | 1/234 [00:00<00:25,  9.05it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating marketing_test:   1%|          | 2/234 [00:00<00:43,  5.38it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating marketing_test:   1%|▏         | 3/234 [00:00<00:51,  4.47it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating marketing_test:   2%|▏         | 4/234 [00:00<00:55,  4.16it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating marketing_test:   2%|▏         | 5/234 [00:01<00:57,  3.98it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating marketing_test:   3%|▎         | 6/234 [00:01<00:46,  4.93it/s]Setting `pad_token_id` to `eos_token_id`:151643 for 

Zero-shot accuracy on marketing_test: 68.80%


Evaluating conceptual_physics_test:   0%|          | 0/235 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating conceptual_physics_test:   0%|          | 1/235 [00:00<01:02,  3.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating conceptual_physics_test:   1%|          | 2/235 [00:00<01:02,  3.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating conceptual_physics_test:   1%|▏         | 3/235 [00:00<01:02,  3.69it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating conceptual_physics_test:   2%|▏         | 4/235 [00:01<01:02,  3.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating conceptual_physics_test:   2%|▏         | 5/235 [00:01<01:02,  3.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating conceptual_physics_test:   3%|▎         | 6/235 [00:01<01:01,

Zero-shot accuracy on conceptual_physics_test: 39.15%


Evaluating medical_genetics_test:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating medical_genetics_test:   1%|          | 1/100 [00:00<00:10,  9.06it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating medical_genetics_test:   2%|▏         | 2/100 [00:00<00:19,  4.97it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating medical_genetics_test:   3%|▎         | 3/100 [00:00<00:15,  6.27it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating medical_genetics_test:   4%|▍         | 4/100 [00:00<00:19,  4.96it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating medical_genetics_test:   5%|▌         | 5/100 [00:01<00:21,  4.44it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating medical_genetics_test:   6%|▌         | 6/100 [00:01<00:17,  5.39it/s]Set

Zero-shot accuracy on medical_genetics_test: 47.00%


Evaluating public_relations_test:   0%|          | 0/110 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating public_relations_test:   1%|          | 1/110 [00:00<00:12,  9.08it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating public_relations_test:   2%|▏         | 2/110 [00:00<00:22,  4.89it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating public_relations_test:   3%|▎         | 3/110 [00:00<00:17,  6.22it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating public_relations_test:   4%|▎         | 4/110 [00:00<00:15,  7.00it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating public_relations_test:   5%|▍         | 5/110 [00:00<00:19,  5.32it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating public_relations_test:   5%|▌         | 6/110 [00:01<00:22,  4.64it/s]Set

Zero-shot accuracy on public_relations_test: 48.18%


Evaluating world_religions_test:   0%|          | 0/171 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating world_religions_test:   1%|          | 1/171 [00:00<00:18,  9.04it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating world_religions_test:   1%|          | 2/171 [00:00<00:18,  8.96it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating world_religions_test:   2%|▏         | 3/171 [00:00<00:31,  5.42it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating world_religions_test:   2%|▏         | 4/171 [00:00<00:25,  6.44it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating world_religions_test:   3%|▎         | 5/171 [00:00<00:32,  5.10it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating world_religions_test:   4%|▎         | 6/171 [00:00<00:27,  6.02it/s]Setting `p

Zero-shot accuracy on world_religions_test: 53.22%


Evaluating high_school_us_history_test:   0%|          | 0/204 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_us_history_test:   0%|          | 1/204 [00:00<00:55,  3.66it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_us_history_test:   1%|          | 2/204 [00:00<00:36,  5.61it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_us_history_test:   1%|▏         | 3/204 [00:00<00:43,  4.58it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_us_history_test:   2%|▏         | 4/204 [00:00<00:35,  5.69it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_us_history_test:   2%|▏         | 5/204 [00:00<00:30,  6.53it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_us_history_test:   3%|▎  

Zero-shot accuracy on high_school_us_history_test: 48.53%


Evaluating international_law_test:   0%|          | 0/121 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating international_law_test:   1%|          | 1/121 [00:00<00:32,  3.70it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating international_law_test:   2%|▏         | 2/121 [00:00<00:31,  3.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating international_law_test:   2%|▏         | 3/121 [00:00<00:31,  3.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating international_law_test:   3%|▎         | 4/121 [00:00<00:24,  4.84it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating international_law_test:   4%|▍         | 5/121 [00:01<00:26,  4.37it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating international_law_test:   5%|▍         | 6/121 [00:01<00:21,  5.30i

Zero-shot accuracy on international_law_test: 58.68%


Evaluating professional_law_test:   0%|          | 0/1534 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_law_test:   0%|          | 1/1534 [00:00<02:54,  8.79it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_law_test:   0%|          | 2/1534 [00:00<02:51,  8.94it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_law_test:   0%|          | 3/1534 [00:00<02:51,  8.92it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_law_test:   0%|          | 4/1534 [00:00<04:25,  5.77it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_law_test:   0%|          | 5/1534 [00:00<03:49,  6.66it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_law_test:   0%|          | 6/1534 [00:00<04:54,  5.19i

Zero-shot accuracy on professional_law_test: 32.79%


Evaluating high_school_physics_test:   0%|          | 0/151 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_physics_test:   1%|          | 1/151 [00:00<00:42,  3.57it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_physics_test:   1%|▏         | 2/151 [00:00<00:26,  5.56it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_physics_test:   2%|▏         | 3/151 [00:00<00:32,  4.55it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_physics_test:   3%|▎         | 4/151 [00:00<00:25,  5.68it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_physics_test:   3%|▎         | 5/151 [00:01<00:30,  4.80it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_physics_test:   4%|▍         | 6/151 [00:01

Zero-shot accuracy on high_school_physics_test: 29.80%


Evaluating moral_disputes_test:   0%|          | 0/346 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_disputes_test:   0%|          | 1/346 [00:00<00:38,  9.04it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_disputes_test:   1%|          | 2/346 [00:00<00:37,  9.09it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_disputes_test:   1%|          | 3/346 [00:00<00:37,  9.11it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_disputes_test:   1%|          | 4/346 [00:00<00:37,  9.01it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_disputes_test:   1%|▏         | 5/346 [00:00<00:37,  9.06it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_disputes_test:   2%|▏         | 6/346 [00:00<00:37,  9.06it/s]Setting `pad_toke

Zero-shot accuracy on moral_disputes_test: 44.80%


Evaluating high_school_world_history_test:   0%|          | 0/237 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_world_history_test:   0%|          | 1/237 [00:00<00:26,  8.86it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_world_history_test:   1%|          | 2/237 [00:00<00:26,  8.77it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_world_history_test:   1%|▏         | 3/237 [00:00<00:43,  5.36it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_world_history_test:   2%|▏         | 4/237 [00:00<00:36,  6.37it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_world_history_test:   2%|▏         | 5/237 [00:00<00:32,  7.07it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_world_h

Zero-shot accuracy on high_school_world_history_test: 51.05%


Evaluating professional_medicine_test:   0%|          | 0/272 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_medicine_test:   0%|          | 1/272 [00:00<01:13,  3.69it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_medicine_test:   1%|          | 2/272 [00:00<00:47,  5.66it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_medicine_test:   1%|          | 3/272 [00:00<00:58,  4.56it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_medicine_test:   1%|▏         | 4/272 [00:00<00:47,  5.63it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_medicine_test:   2%|▏         | 5/272 [00:00<00:40,  6.52it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_medicine_test:   2%|▏         

Zero-shot accuracy on professional_medicine_test: 43.01%


Evaluating miscellaneous_test:   0%|          | 0/783 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating miscellaneous_test:   0%|          | 1/783 [00:00<01:32,  8.48it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating miscellaneous_test:   0%|          | 2/783 [00:00<02:42,  4.81it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating miscellaneous_test:   0%|          | 3/783 [00:00<03:03,  4.26it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating miscellaneous_test:   1%|          | 4/783 [00:00<03:12,  4.04it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating miscellaneous_test:   1%|          | 5/783 [00:01<03:18,  3.92it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating miscellaneous_test:   1%|          | 6/783 [00:01<03:22,  3.84it/s]Setting `pad_token_id` t

Zero-shot accuracy on miscellaneous_test: 50.70%


Evaluating high_school_microeconomics_test:   0%|          | 0/238 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_microeconomics_test:   0%|          | 1/238 [00:00<01:03,  3.75it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_microeconomics_test:   1%|          | 2/238 [00:00<01:02,  3.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_microeconomics_test:   1%|▏         | 3/238 [00:00<01:02,  3.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_microeconomics_test:   2%|▏         | 4/238 [00:01<01:02,  3.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_microeconomics_test:   2%|▏         | 5/238 [00:01<01:02,  3.74it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_m

Zero-shot accuracy on high_school_microeconomics_test: 45.38%


Evaluating business_ethics_test:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating business_ethics_test:   1%|          | 1/100 [00:00<00:26,  3.69it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating business_ethics_test:   2%|▏         | 2/100 [00:00<00:26,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating business_ethics_test:   3%|▎         | 3/100 [00:00<00:25,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating business_ethics_test:   4%|▍         | 4/100 [00:01<00:25,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating business_ethics_test:   5%|▌         | 5/100 [00:01<00:20,  4.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating business_ethics_test:   6%|▌         | 6/100 [00:01<00:21,  4.31it/s]Setting `p

Zero-shot accuracy on business_ethics_test: 47.00%


Evaluating clinical_knowledge_test:   0%|          | 0/265 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating clinical_knowledge_test:   0%|          | 1/265 [00:00<00:30,  8.62it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating clinical_knowledge_test:   1%|          | 2/265 [00:00<00:53,  4.92it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating clinical_knowledge_test:   1%|          | 3/265 [00:00<00:42,  6.21it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating clinical_knowledge_test:   2%|▏         | 4/265 [00:00<00:36,  7.12it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating clinical_knowledge_test:   2%|▏         | 5/265 [00:00<00:48,  5.40it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating clinical_knowledge_test:   2%|▏         | 6/265 [00:00<00:41,

Zero-shot accuracy on clinical_knowledge_test: 53.58%


Evaluating formal_logic_test:   0%|          | 0/126 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating formal_logic_test:   1%|          | 1/126 [00:00<00:14,  8.89it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating formal_logic_test:   2%|▏         | 2/126 [00:00<00:25,  4.84it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating formal_logic_test:   2%|▏         | 3/126 [00:00<00:19,  6.17it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating formal_logic_test:   3%|▎         | 4/126 [00:00<00:24,  4.89it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating formal_logic_test:   4%|▍         | 5/126 [00:00<00:20,  5.85it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating formal_logic_test:   5%|▍         | 6/126 [00:01<00:24,  4.88it/s]Setting `pad_token_id` to `eos_

Zero-shot accuracy on formal_logic_test: 34.13%


Evaluating college_physics_test:   0%|          | 0/102 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_physics_test:   1%|          | 1/102 [00:00<00:27,  3.68it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_physics_test:   2%|▏         | 2/102 [00:00<00:26,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_physics_test:   3%|▎         | 3/102 [00:00<00:26,  3.74it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_physics_test:   4%|▍         | 4/102 [00:00<00:20,  4.84it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_physics_test:   5%|▍         | 5/102 [00:01<00:22,  4.38it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_physics_test:   6%|▌         | 6/102 [00:01<00:23,  4.09it/s]Setting `p

Zero-shot accuracy on college_physics_test: 30.39%


Evaluating high_school_statistics_test:   0%|          | 0/216 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_statistics_test:   0%|          | 1/216 [00:00<00:58,  3.69it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_statistics_test:   1%|          | 2/216 [00:00<00:57,  3.70it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_statistics_test:   1%|▏         | 3/216 [00:00<00:57,  3.68it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_statistics_test:   2%|▏         | 4/216 [00:01<00:57,  3.69it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_statistics_test:   2%|▏         | 5/216 [00:01<00:57,  3.69it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_statistics_test:   3%|▎  

Zero-shot accuracy on high_school_statistics_test: 38.43%


Evaluating professional_accounting_test:   0%|          | 0/282 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_accounting_test:   0%|          | 1/282 [00:00<01:14,  3.77it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_accounting_test:   1%|          | 2/282 [00:00<01:15,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_accounting_test:   1%|          | 3/282 [00:00<00:54,  5.07it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_accounting_test:   1%|▏         | 4/282 [00:00<01:02,  4.44it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_accounting_test:   2%|▏         | 5/282 [00:01<00:51,  5.42it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating professional_accounting_test:  

Zero-shot accuracy on professional_accounting_test: 33.69%


Evaluating elementary_mathematics_test:   0%|          | 0/378 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating elementary_mathematics_test:   0%|          | 1/378 [00:00<01:40,  3.74it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating elementary_mathematics_test:   1%|          | 2/378 [00:00<01:40,  3.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating elementary_mathematics_test:   1%|          | 3/378 [00:00<01:41,  3.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating elementary_mathematics_test:   1%|          | 4/378 [00:01<01:40,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating elementary_mathematics_test:   1%|▏         | 5/378 [00:01<01:40,  3.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating elementary_mathematics_test:   2%|▏  

Zero-shot accuracy on elementary_mathematics_test: 29.63%


Evaluating college_mathematics_test:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_mathematics_test:   1%|          | 1/100 [00:00<00:26,  3.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_mathematics_test:   2%|▏         | 2/100 [00:00<00:26,  3.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_mathematics_test:   3%|▎         | 3/100 [00:00<00:25,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_mathematics_test:   4%|▍         | 4/100 [00:01<00:25,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_mathematics_test:   5%|▌         | 5/100 [00:01<00:25,  3.72it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_mathematics_test:   6%|▌         | 6/100 [00:01

Zero-shot accuracy on college_mathematics_test: 34.00%


Evaluating moral_scenarios_test:   0%|          | 0/895 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_scenarios_test:   0%|          | 1/895 [00:00<04:00,  3.71it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_scenarios_test:   0%|          | 2/895 [00:00<03:57,  3.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_scenarios_test:   0%|          | 3/895 [00:00<03:57,  3.75it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_scenarios_test:   0%|          | 4/895 [00:01<03:58,  3.74it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_scenarios_test:   1%|          | 5/895 [00:01<03:58,  3.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating moral_scenarios_test:   1%|          | 6/895 [00:01<03:58,  3.73it/s]Setting `p

Zero-shot accuracy on moral_scenarios_test: 25.70%


Evaluating college_computer_science_test:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_computer_science_test:   1%|          | 1/100 [00:00<00:10,  9.13it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_computer_science_test:   2%|▏         | 2/100 [00:00<00:19,  4.94it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_computer_science_test:   3%|▎         | 3/100 [00:00<00:22,  4.34it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_computer_science_test:   4%|▍         | 4/100 [00:00<00:23,  4.07it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_computer_science_test:   5%|▌         | 5/100 [00:01<00:24,  3.95it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating college_computer_science_

Zero-shot accuracy on college_computer_science_test: 40.00%


Evaluating high_school_geography_test:   0%|          | 0/198 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_geography_test:   1%|          | 1/198 [00:00<00:21,  9.09it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_geography_test:   1%|          | 2/198 [00:00<00:21,  9.03it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_geography_test:   2%|▏         | 3/198 [00:00<00:35,  5.47it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_geography_test:   2%|▏         | 4/198 [00:00<00:29,  6.48it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_geography_test:   3%|▎         | 5/198 [00:00<00:37,  5.08it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating high_school_geography_test:   3%|▎         

Zero-shot accuracy on high_school_geography_test: 55.05%


Evaluating us_foreign_policy_test:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating us_foreign_policy_test:   1%|          | 1/100 [00:00<00:26,  3.74it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating us_foreign_policy_test:   2%|▏         | 2/100 [00:00<00:17,  5.76it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating us_foreign_policy_test:   3%|▎         | 3/100 [00:00<00:13,  6.96it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating us_foreign_policy_test:   4%|▍         | 4/100 [00:00<00:12,  7.73it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating us_foreign_policy_test:   5%|▌         | 5/100 [00:00<00:11,  8.15it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evalu

Zero-shot accuracy on us_foreign_policy_test: 68.00%


Evaluating prehistory_test:   0%|          | 0/324 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating prehistory_test:   0%|          | 1/324 [00:00<00:35,  9.10it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating prehistory_test:   1%|          | 2/324 [00:00<00:35,  9.00it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating prehistory_test:   1%|          | 3/324 [00:00<00:58,  5.52it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating prehistory_test:   1%|          | 4/324 [00:00<00:48,  6.57it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating prehistory_test:   2%|▏         | 5/324 [00:00<01:01,  5.16it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Evaluating prehistory_test:   2%|▏         | 6/324 [00:00<00:52,  6.10it/s]Setting `pad_token_id` to `eos_token_id`:1516

Zero-shot accuracy on prehistory_test: 48.77%





In [5]:
import glob
import csv
import json
import os

def summarize_evaluation_results(results_dir="evaluation_results"):
    total_correct = 0
    total_questions = 0
    total_none_outputs = 0

    result_files = glob.glob(os.path.join(results_dir, "qwen_mmlu_*.json"))
    
    for file_path in result_files:
        with open(file_path, "r") as f:
            data = json.load(f)
            results = data["results"]
            for result in results:
                pred = result["prediction"]
                gt = result["ground_truth"]
                if pred == "None" or pred == "none" or pred == None: 
                    total_none_outputs += 1
                if pred == gt:
                    total_correct += 1
                total_questions += 1

    overall_accuracy = total_correct / total_questions if total_questions > 0 else 0.0
    print(f"Overall accuracy across all subjects: {overall_accuracy:.2%}")
    print(f"Total predictions: {total_questions}")
    print(f"Total 'None' predictions: {total_none_outputs}")
summarize_evaluation_results()

Overall accuracy across all subjects: 42.89%
Total predictions: 14042
Total 'None' predictions: 0
