In [1]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from ast import literal_eval
from transformers import AutoTokenizer, AutoModelForCausalLM

system_message = "You are an AI assistant that answers multiple-choice questions by analyzing a given paragraph. Return only the answer number. Avoid any additional explanations."

PROMPT_NO_QUESTION_PLUS = """Solve the following multiple-choice Korean question. Provide the answer as a single number (1, 2, 3, 4, or 5). Example: Answer: 1

Paragraph:
{paragraph}

Question:
{question}

Choices:
{choices}

Instructions:
1. Carefully read the paragraph to extract relevant information.
2. Identify key points in the question and align them with the paragraph content.
3. Analyze each choice in relation to the paragraph and eliminate incorrect options step by step.
4. Briefly justify why each eliminated option is incorrect.
5. Clearly state the correct answer, providing concise reasoning.

Finally, provide the answer in the format 'Answer: [number]'. The answer must be one of 1, 2, 3, 4, or 5.

Answer:
"""

PROMPT_QUESTION_PLUS = """Solve the following multiple-choice Korean question. Provide the answer as a single number (1, 2, 3, 4, or 5). Example: Answer: 1

Paragraph:
{paragraph}

Question:
{question}

Additional Information:
{question_plus}

Choices:
{choices}

Instructions:
1. Start by understanding the paragraph's context and extract key details.
2. Integrate the additional information provided to refine your understanding.
3. Carefully analyze the question and determine its focus.
4. Evaluate each choice in relation to the paragraph and additional information.
5. Eliminate incorrect answers step by step, explaining why each one does not fit.
6. Select the most appropriate choice and justify your decision concisely.

Finally, provide the answer in the format 'Answer: [number]'. The answer must be one of 1, 2, 3, 4, or 5.

Answer:
"""

# Load Qwen-2.5-32B-instruct Model and Tokenizer
model_name_or_path = "Qwen/Qwen2.5-32B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path, trust_remote_code=True,
    load_in_4bit=True
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path, trust_remote_code=True,
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def load_and_process_data(file_path):
    dataset = pd.read_csv(file_path)
    
    records = []
    for _, row in dataset.iterrows():
        problems = literal_eval(row['problems'])
        record = {
            'id': row['id'],
            'paragraph': row['paragraph'],
            'question': problems['question'],
            'choices': problems['choices'],
            'answer': problems.get('answer', None),
            "question_plus": problems.get('question_plus', None),
            'explanation': row.get('explanation', None)
        }
        records.append(record)

    df = pd.DataFrame(records)
    df['question_plus'] = df['question_plus'].fillna('')
    return Dataset.from_pandas(df)

def format_inference_dataset(test_df):
    test_dataset = []
    for i, row in test_df.iterrows():
        choices_string = "\n".join([f"{idx + 1} - {choice}" for idx, choice in enumerate(row["choices"])])
        len_choices = len(row["choices"])
        
        if row["question_plus"]:
            user_message = PROMPT_QUESTION_PLUS.format(
                paragraph=row["paragraph"],
                question=row["question"],
                question_plus=row["question_plus"],
                choices=choices_string,
            )
        else:
            user_message = PROMPT_NO_QUESTION_PLUS.format(
                paragraph=row["paragraph"],
                question=row["question"],
                choices=choices_string,
            )

        test_dataset.append(
            {
                "id": row["id"],
                "messages": [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message},
                ],
                "label": row["answer"],
                "len_choices": len_choices,
            }
        )

    return test_dataset

model.eval()
model.to("cuda")

test_df = load_and_process_data('test.csv').to_pandas()
test_dataset = format_inference_dataset(test_df)

infer_results = []

with torch.inference_mode():
    for data in tqdm(test_dataset, total=len(test_dataset)):
        input_tensor = tokenizer.apply_chat_template(
            data["messages"],
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(
            input_tensor,
            max_new_tokens=450,
            pad_token_id=tokenizer.eos_token_id,   
            eos_token_id=tokenizer.eos_token_id,
            do_sample=False,
            temperature=None, 
            top_p=None,
            top_k=None,
        )

        response = tokenizer.decode(outputs[0][input_tensor.size(1):], skip_special_tokens=True)

        infer_results.append({
            "id": data["id"],
            "answer": response.strip()
        })

pd.DataFrame(infer_results).to_csv("output_zeroshot_cot_v2.csv", index=False)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

You shouldn't move a model that is dispatched using accelerate hooks.
  0%|          | 0/869 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 869/869 [1:15:56<00:00,  5.24s/it]


In [3]:
import pandas as pd

# Load the uploaded CSV file
file_path = 'output_zeroshot_cot_v2.csv'
df = pd.read_csv(file_path)

# Function to clean and convert 'answer' values
def process_answer(value):
    try:
        # Extract numeric value after "Answer:" if present, else use the value directly
        if isinstance(value, str) and "Answer:" in value:
            numeric_value = int(value.split("Answer:")[-1].strip())
        else:
            numeric_value = int(value)
        
        # Ensure the numeric value is within 1-5
        if 1 <= numeric_value <= 5:
            return numeric_value
        else:
            return 1  # Default to 1 if out of range
    except:
        return 1  # Default to 1 if conversion fails

# Apply the processing function to the 'answer' column
df['answer'] = df['answer'].apply(process_answer)

# Save the processed DataFrame
output_path = 'processed_output_zeroshot_cot_v2.csv'
df.to_csv(output_path, index=False)


# File paths
test_path = '../data/test_answer_gpt4o.csv'
predictions_path = 'processed_output_zeroshot_cot_v2.csv'

# Load data
test = pd.read_csv(test_path)
predictions = pd.read_csv(predictions_path)

# Calculate overall accuracy
accuracy = (test['answer'] == predictions['answer']).mean()
print(f"Overall Accuracy: {accuracy:.4f}")

# Accuracy for 434 rows sampled with random seed 42
sampled_indices = test.sample(n=434, random_state=42).index
sampled_accuracy = (test.loc[sampled_indices, 'answer'] == predictions.loc[sampled_indices, 'answer']).mean()
print(f"Accuracy for 434 random rows (seed=42): {sampled_accuracy:.4f}")

Overall Accuracy: 0.7883
Accuracy for 434 random rows (seed=42): 0.7765
