In [None]:
import pandas as pd
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

import re
from pathlib import Path

device_map = {"": 0} if torch.cuda.is_available() else {"": "cpu"}
# Define custom load function
def load_custom_model(model_dir):
    model = AutoModelForCausalLM.from_pretrained(model_dir, device_map=device_map)
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    return model, tokenizer

# Define chat function
def chat(model, tokenizer, user_prompt, system_prompt, max_new_tokens=1000):
    messages = []
    
    messages.append({'role': 'user', 'content': user_prompt})
    messages.append({'role': 'system', 'content': system_prompt})
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", truncation=True, max_length=model.config.max_position_embeddings, enable_thinking=False).to(model.device)

    with torch.no_grad():
        outputs = model.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False, top_k=50, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)

    model_response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    

    return model_response

def evaluate_answer(model_output: str, correct_answer: str) -> bool:
    match = re.search(r"\\boxed\{(.+?)\}", model_output)
    if not match:
        return False  # No valid boxed answer found

    extracted = match.group(1).upper()
    is_correct = extracted == correct_answer.upper()

    return is_correct

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
datasets_info = {
    "test-ai2_arc.parquet": {
        "system_prompt": (
            "You are taking a multiple-choice test.\n"
            "Each question will have exactly 4 options: A, B, C or D.\n"
            "Read the question and choose the correct answer.\n"
            "Output the letter of the correct answer inside \\boxed{}, like this: \\boxed{C}"
        ),
        "context": False,
    },
    
    "test-boolq.parquet": {
        "system_prompt": (
            "You are answering a True/False question.\n"
            "The question will be accompanied by a short passage of context.\n"
            "Your answer must be either False or True.\n"
            "Output your answer inside \\boxed{}, like this: \\boxed{True}"
        ),
        "context": True,
    },
    
    "test-squad_v2.parquet": {
        "system_prompt": (
            "You are answering a question based on a passage.\n"
            "Read the context carefully and provide the exact answer span from the passage.\n"
            "Do not add extra words or explanations.\n"
            "Output your answer inside \\boxed{}, like this: \\boxed{Einstein}"
        ),
        "context": True,
    }
}

In [None]:
models = ["Qwen/Qwen3-1.7B"]
datasets = list(Path("../Datasets").glob('*.parquet'))

for model_name in models:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map=device_map
    )

    print(f"**********************")
    print(f"Model: {model_name}")

    for dataset in datasets:
        dataset_name = dataset.name

        df = pd.read_parquet(dataset).head(3)
        total = len(df)
        correct_count = 0

        for idx, row in df.iterrows():
            user_prompt = row["question"]

            if datasets_info[dataset_name]["context"]:
                user_prompt += row["context"]

            response = chat(model, tokenizer, user_prompt, datasets_info[dataset_name]["system_prompt"])

            is_correct = evaluate_answer(response, str(row["answer"]))
            if is_correct:
                correct_count += 1

        # --- Final stats ---
        accuracy = correct_count / total * 100
        print(f"**********************")
        print(f"Dataset: {dataset_name}")
        print(f"✅ Total Questions: {total}")
        print(f"✅ Correct Answers: {correct_count}")
        print(f"📊 Accuracy: {accuracy:.2f}%")

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 31.79it/s]
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


**********************
Model: Qwen/Qwen3-1.7B


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


False
True


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


True
True


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


True
True
**********************
Dataset: test-boolq.parquet
✅ Total Questions: 3
✅ Correct Answers: 0
📊 Accuracy: 0.00%


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


France
France


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


10th and 11th centuries
Normans in Normandy


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Denmark, Iceland and Norway
Norse
**********************
Dataset: test-squad_v2.parquet
✅ Total Questions: 3
✅ Correct Answers: 0
📊 Accuracy: 0.00%


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


C
The question states that an astronomer observes a **planet rotating faster** after a **meteorite impact**. We are to determine the most likely effect of this increase in rotation.

Let's analyze the options:

- **A. Planetary density will decrease.**  
  - Density is mass divided by volume. An increase in rotation does **not** directly affect density. Density depends on mass and volume, not rotation. So this is **not likely**.

- **B. Planetary years will become longer.**  
  - A planetary year is the time it takes to orbit the sun. Rotation and orbit are **distinct**. An increase in rotation does **not** affect the orbital period. So this is **not likely**.

- **C. Planetary days will become shorter.**  
  - A planetary day is the time it takes for a planet to rotate once on its axis. If the planet is rotating faster, its **day length** (rotational period) will **decrease**. This is **directly** related to the observed increase in rotation. So this is **likely**.

- **D. Planetary g

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


B
The question asks what will most likely result from testing different building designs for their ability to withstand earthquakes.

- **A. Buildings will be built faster** – This is unlikely, as testing does not necessarily speed up construction.
- **B. Buildings will be made safer** – This is likely, as testing different designs to withstand earthquakes would aim to improve safety.
- **C. Building designs will look nicer** – This is not directly related to earthquake testing and is not a likely outcome.
- **D. Building materials will be cheaper** – This is not directly related to the testing process and is not a likely outcome.

The most logical and direct result of testing different building designs for earthquake resistance is that **buildings will be made safer**.

$$
\boxed{B}
$$
C
The question asks which step signals the beginning of photosynthesis, and the end result is the production of sugar and oxygen.

Photosynthesis begins with **light energy being absorbed** by chlorophy