In [None]:
import os
from dotenv import load_dotenv
from datasets import load_dataset
import string
from openai import AzureOpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import UserMessage
import pandas as pd

# Load variables from the local "env.env" file; the client below expects it to
# provide GPT_4_1_ENDPOINT and GPT_4_1_KEY.
load_dotenv("env.env")

# Shared Azure OpenAI client used by the completion helpers in this notebook.
# A missing variable is passed through as None, exactly as os.getenv would do.
llm41 = AzureOpenAI(
    api_version="2024-12-01-preview",
    api_key=os.environ.get("GPT_4_1_KEY"),
    azure_endpoint=os.environ.get("GPT_4_1_ENDPOINT"),
)

def _chat_completion(model, prompt, temperature):
    """Send ``prompt`` as a single user message to ``model`` and return the reply text.

    Shared implementation behind the per-model helpers so the request shape
    lives in exactly one place.

    Args:
        model: Value passed as ``model=`` to the chat-completions call
            (presumably the Azure deployment name -- confirm against the resource).
        prompt: Text sent as the content of the only (user) message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The content of the first choice with surrounding whitespace stripped.

    Raises:
        Whatever the client call raises is propagated unchanged; callers in
        this notebook treat any exception as "skip this sample".
    """
    response = llm41.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=False,
    )
    return response.choices[0].message.content.strip()


def get_41_mini_completion(prompt, temperature=0.1):
    """Return the stripped reply of the "gpt-4.1-mini" model to ``prompt``.

    Args:
        prompt: Text sent as a single user message.
        temperature: Sampling temperature (default 0.1).

    Returns:
        The reply text of the first choice, whitespace-stripped.
    """
    return _chat_completion("gpt-4.1-mini", prompt, temperature)


def get_41_completion(prompt, temperature=0.1):
    """Return the stripped reply of the "gpt-4.1" model to ``prompt``.

    Args:
        prompt: Text sent as a single user message.
        temperature: Sampling temperature (default 0.1).

    Returns:
        The reply text of the first choice, whitespace-stripped.
    """
    return _chat_completion("gpt-4.1", prompt, temperature)

# Validation split of the "boolq" dataset. evaluate_model iterates over this
# module-level object and reads the "passage", "question" and "answer" fields
# of each example.
dataset = load_dataset("boolq", split="validation")

def parse_answer(answer_text):
    """Extract a yes/no verdict from a model reply.

    Two passes are made over the reply's lines, last line first:

    1. A line that is exactly "yes" or "no" once surrounding whitespace and
       ASCII punctuation are removed (case-insensitive) decides the verdict.
    2. Only if no such line exists, a line whose first word is "yes" or "no"
       decides it, so replies such as "Yes, because ..." are still understood.
       This is a heuristic: a line like "No information given" parses as no.

    Running the exact-match pass to completion first keeps the result
    unchanged for every reply the exact-match rule already handled.

    Args:
        answer_text: Raw reply text; ``None`` or an empty string is accepted.

    Returns:
        ``True`` for yes, ``False`` for no, ``None`` when no verdict is found.
    """
    if not answer_text:
        return None

    verdicts = {"yes": True, "no": False}
    lines = answer_text.strip().splitlines()

    # Pass 1: the whole line is the verdict ("Yes", "no.", "**Yes**", ...).
    for line in reversed(lines):
        cleaned = line.strip().strip(string.punctuation).lower()
        if cleaned in verdicts:
            return verdicts[cleaned]

    # Pass 2: the verdict leads a longer sentence ("Yes, the passage says ...").
    for line in reversed(lines):
        words = line.split()
        if not words:
            continue
        leading = words[0].strip(string.punctuation).lower()
        if leading in verdicts:
            return verdicts[leading]

    return None

def evaluate_model(get_completion_func, model_name, col_name, data=None):
    """Score a yes/no completion function against passage/question examples.

    For every example a prompt is built from its passage and question, the
    reply is parsed with ``parse_answer`` and compared with the reference
    answer. Examples whose completion call raises are reported on stdout and
    left out of both the result rows and the accuracy denominator. A
    prediction that does not equal the reference answer (including whatever
    ``parse_answer`` yields for an unparseable reply) counts as incorrect.

    Args:
        get_completion_func: Callable taking the prompt string and returning
            the model's reply text.
        model_name: Label used in the progress and accuracy messages.
        col_name: Key under which each prediction is stored in a result row.
        data: Optional iterable of examples, each providing "passage",
            "question" and "answer". When omitted, the module-level
            ``dataset`` is used, so existing three-argument calls behave as
            before; passing a subset allows quick trial runs.

    Returns:
        ``(accuracy, results)`` where ``accuracy`` is correct / evaluated
        (``0`` when nothing was evaluated) and ``results`` is a list of dicts
        with keys "q_number" (1-based position in the input), "passage",
        "question", "actual_answer" and ``col_name``.
    """
    examples = dataset if data is None else data
    results = []
    correct = 0
    print(f"Evaluating {model_name}...")
    for q_number, example in enumerate(examples, start=1):
        passage = example["passage"]
        question = example["question"]
        true_answer = example["answer"]
        prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer with yes or no."

        try:
            answer_text = get_completion_func(prompt)
        except Exception as e:
            # Deliberate best effort: one failed request (e.g. a rejected
            # prompt) must not abort the whole evaluation run.
            print(f"Skipping sample {q_number} due to error: {e}")
            continue

        pred = parse_answer(answer_text)
        correct += int(pred == true_answer)

        results.append({
            "q_number": q_number,
            "passage": passage,
            "question": question,
            "actual_answer": true_answer,
            col_name: pred
        })

    # Every evaluated (non-skipped) example produced exactly one row.
    total = len(results)
    accuracy = correct / total if total > 0 else 0
    print(f"Accuracy for {model_name}: {accuracy:.2f}\n")
    return accuracy, results

# Run the full evaluation once per model; each call returns the accuracy and
# the per-sample rows for that model.
acc_41_mini, rows_41_mini = evaluate_model(
    get_41_mini_completion, "OpenAI GPT-4.1-mini", "41_mini_pred"
)
acc_41, rows_41 = evaluate_model(get_41_completion, "OpenAI GPT-4.1", "41_pred")

# One frame per model, then a side-by-side table of predictions.
df_41mini = pd.DataFrame(rows_41_mini)
df_41 = pd.DataFrame(rows_41)

# Inner join: a sample skipped for either model does not appear in `df`, while
# each accuracy printed below is computed over that model's own evaluated
# samples.
df = pd.merge(
    df_41mini,
    df_41,
    how="inner",
    on=["q_number", "passage", "question", "actual_answer"],
)

print("Final Accuracies:")
print(f"GPT-41-mini: {acc_41_mini:.2f}")
print(f"GPT-41: {acc_41:.2f}")


Evaluating OpenAI GPT-4.1-mini...
Skipping sample 130 due to error: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'medium'}}}}}
Skipping sample 266 due to error: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retr

In [4]:
# Preview the first five rows of the merged per-model predictions.
df.head(5)

Unnamed: 0,q_number,passage,question,actual_answer,41_mini_pred,41_pred
0,1,All biomass goes through at least some of thes...,does ethanol take more energy make that produces,False,True,False
1,2,Property tax or 'house tax' is a local tax on ...,is house tax and property tax are same,True,True,True
2,3,Phantom pain sensations are described as perce...,is pain experienced in a missing body part or ...,True,True,True
3,4,Harry Potter and the Escape from Gringotts is ...,is harry potter and the escape from gringotts ...,True,True,True
4,5,Hydroxyzine preparations require a doctor's pr...,is there a difference between hydroxyzine hcl ...,True,True,True


In [5]:
# (rows, columns) of the merged frame; with the default inner join only samples
# present in both models' results are counted.
df.shape

(3262, 6)

In [6]:
# Persist the merged predictions. The path is held in one variable so the
# confirmation message always names the file that was actually written
# (previously the message said "results.csv" while the file was "results3.csv").
results_path = "results3.csv"
df.to_csv(results_path, index=False)
print(f"Saved results to {results_path}")


Saved results to results.csv
