In [6]:
import re
import json
from pathlib import Path
import pandas as pd
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

device_map = {"": 0} if torch.cuda.is_available() else {"": "cpu"}
# Define custom load function
def load_custom_model(model_dir):
    model = AutoModelForCausalLM.from_pretrained(model_dir, device_map=device_map)
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    return model, tokenizer

# Define chat function
def chat(model, tokenizer, user_prompt, system_prompt, max_new_tokens=1000):
    messages = []
    
    messages.append({'role': 'user', 'content': user_prompt})
    messages.append({'role': 'system', 'content': system_prompt})
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", truncation=True, max_length=model.config.max_position_embeddings, enable_thinking=False).to(model.device)

    with torch.no_grad():
        outputs = model.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False, top_k=50, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)

    model_response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    

    return model_response

def evaluate_answer(model_output: str, correct_answer: str) -> bool:
    match = re.search(r"\\boxed\{(.+?)\}", model_output)
    if not match:
        return False  # No valid boxed answer found

    extracted = match.group(1).upper()
    is_correct = extracted == correct_answer.upper()

    return is_correct

In [7]:
datasets_info = {
    "test-ai2_arc.parquet": {
        "system_prompt": (
            "You are taking a multiple-choice test.\n"
            "Each question will have exactly 4 options: A, B, C or D.\n"
            "Read the question and choose the correct answer.\n"
            "Output the letter of the correct answer inside \\boxed{}, like this: \\boxed{C}"
        ),
        "context": False,
    },
    
    "test-boolq.parquet": {
        "system_prompt": (
            "You are answering a True/False question.\n"
            "The question will be accompanied by a short passage of context.\n"
            "Your answer must be either False or True.\n"
            "Output your answer inside \\boxed{}, like this: \\boxed{True}"
        ),
        "context": True,
    },
    
    "test-squad_v2.parquet": {
        "system_prompt": (
            "You are answering a question based on a passage.\n"
            "Read the context carefully and provide the exact answer span from the passage.\n"
            "Do not add extra words or explanations.\n"
            "Output your answer inside \\boxed{}, like this: \\boxed{Einstein}"
        ),
        "context": True,
    }
}

In [None]:
models = ["Qwen/Qwen3-1.7B", "Qwen/Qwen3-0.6B"]
datasets = list(Path("../Datasets").glob('*.parquet'))

results_file = Path("benchmark_results.json")
if results_file.exists():
    with open(results_file, "r", encoding="utf-8") as f:
        results = json.load(f)
else:
    results = {}

for model_name in models:
    if model_name in results:
        continue

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map=device_map
    )
    model_results = {}

    print(f"**********************")
    print(f"Model: {model_name}")

    for dataset in datasets:
        dataset_name = dataset.name

        df = pd.read_parquet(dataset)
        total = len(df)
        correct_count = 0

        for idx, row in df.iterrows():
            user_prompt = row["question"]

            if datasets_info[dataset_name]["context"]:
                user_prompt += row["context"]

            response = chat(model, tokenizer, user_prompt, datasets_info[dataset_name]["system_prompt"])

            is_correct = evaluate_answer(response, str(row["answer"]))
            if is_correct:
                correct_count += 1

        # --- Final stats ---
        accuracy = correct_count / total * 100
        model_results[dataset_name] = round(accuracy, 2)
        print(f"**********************")
        print(f"Dataset: {dataset_name}")
        print(f"✅ Total Questions: {total}")
        print(f"✅ Correct Answers: {correct_count}")
        print(f"📊 Accuracy: {accuracy:.2f}%")

    results[model_name] = model_results

# Guardar resultados acumulados
with open(results_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

print("\n💾 Resultados actualizados en benchmark_results.json")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


**********************
Model: Qwen/Qwen3-0.6B


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


**********************
Dataset: test-boolq.parquet
✅ Total Questions: 5
✅ Correct Answers: 1
📊 Accuracy: 20.00%


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


**********************
Dataset: test-squad_v2.parquet
✅ Total Questions: 5
✅ Correct Answers: 3
📊 Accuracy: 60.00%


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


**********************
Dataset: test-ai2_arc.parquet
✅ Total Questions: 5
✅ Correct Answers: 4
📊 Accuracy: 80.00%

💾 Resultados actualizados en benchmark_results.json
