In [None]:
import os
import json
from dotenv import load_dotenv
from datasets import load_dataset
import string
from openai import AzureOpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import UserMessage
import pandas as pd

# Load endpoint/key settings from the local "env.env" file into the environment.
load_dotenv("env.env")


def _inference_client(endpoint_var, key_var):
    """Build a ChatCompletionsClient from the two named environment variables."""
    return ChatCompletionsClient(
        endpoint=os.getenv(endpoint_var),
        credential=AzureKeyCredential(os.getenv(key_var)),
    )


# Azure OpenAI client used for the model="gpt4o" and model="gpt-4o-mini" requests.
llm4o = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-05-01-preview",
)

# Azure OpenAI client used for the model="gpt-4.1" and model="gpt-4.1-mini" requests.
llm41 = AzureOpenAI(
    azure_endpoint=os.getenv("GPT_4_1_ENDPOINT"),
    api_key=os.getenv("GPT_4_1_KEY"),
    api_version="2024-12-01-preview",
)

# One azure.ai.inference client per remaining model, each with its own endpoint/key pair.
client_deepseek = _inference_client("DEEPSEEKR1_ENDPOINT", "DEEPSEEKR1_KEY")
client_phi = _inference_client("PHI_AZURE_ENDPOINT", "PHI_AZURE_KEY")
client_mistral = _inference_client("MISTRAL_ENDPOINT", "MISTRAL_KEY")
client_llama = _inference_client("LLAMA_3_8B_ENDPOINT", "LLAMA_3_8B_KEY")

def get_4o_completion(prompt, temperature=0.1):
    """Send ``prompt`` as a single user message to model "gpt4o" via ``llm4o``.

    Args:
        prompt: Text used as the content of the only message in the request.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed.
    """
    completion = llm4o.chat.completions.create(
        model="gpt4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def get_41_completion(prompt, temperature=0.1):
    """Send ``prompt`` as a single user message to model "gpt-4.1" via ``llm41``.

    Args:
        prompt: Text used as the content of the only message in the request.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed.
    """
    completion = llm41.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def get_deepseek_completion(prompt, temperature=0.1):
    """Send ``prompt`` as a single user message to model "DeepSeek-R1" via ``client_deepseek``.

    Args:
        prompt: Text wrapped in a ``UserMessage`` as the only message in the request.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed.
    """
    reply = client_deepseek.complete(
        messages=[UserMessage(content=prompt)],
        model="DeepSeek-R1",
        temperature=temperature,
    )
    return reply.choices[0].message.content.strip()

def get_4o_mini_completion(prompt, temperature=0.1):
    """Send ``prompt`` as a single user message to model "gpt-4o-mini" via ``llm4o``.

    Args:
        prompt: Text used as the content of the only message in the request.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed.
    """
    completion = llm4o.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def get_41_mini_completion(prompt, temperature=0.1):
    """Send ``prompt`` as a single user message to model "gpt-4.1-mini" via ``llm41``.

    Args:
        prompt: Text used as the content of the only message in the request.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed.
    """
    completion = llm41.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def get_phi_completion(prompt, temperature=0.1):
    """Send ``prompt`` as a single user message to model "phi" via ``client_phi``.

    Args:
        prompt: Text wrapped in a ``UserMessage`` as the only message in the request.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed.
    """
    reply = client_phi.complete(
        messages=[UserMessage(content=prompt)],
        model="phi",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

def get_mistral_completion(prompt, temperature=0.1):
    """Send ``prompt`` as a single user message to model "mistral-nemo" via ``client_mistral``.

    Args:
        prompt: Text wrapped in a ``UserMessage`` as the only message in the request.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed.
    """
    reply = client_mistral.complete(
        messages=[UserMessage(content=prompt)],
        model="mistral-nemo",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

def get_llama_completion(prompt, temperature=0.1):
    """Send ``prompt`` as a single user message to model "llama-8b" via ``client_llama``.

    Args:
        prompt: Text wrapped in a ``UserMessage`` as the only message in the request.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed.
    """
    reply = client_llama.complete(
        messages=[UserMessage(content=prompt)],
        model="llama-8b",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

# Load the "train" split of the "boolq" dataset. evaluate_model iterates over it and
# reads each example's "passage", "question" and "answer" fields.
dataset = load_dataset("boolq", split="train")

def parse_answer(answer_text):
    """Extract a yes/no verdict from a model reply.

    Two passes are made, each scanning lines from last to first:

    1. A line that is exactly "yes" or "no" once surrounding whitespace and
       punctuation are removed (case-insensitive).
    2. Fallback for wordier replies such as "Yes, the passage says so." or
       "**Answer:** No": a line whose first or last word is "yes"/"no". Text up to
       and including the last "</think>" marker is ignored in this pass so that
       emitted reasoning is not mistaken for the final answer.

    Args:
        answer_text: Raw reply text; ``None`` or an empty string is accepted.

    Returns:
        ``True`` for yes, ``False`` for no, ``None`` when no verdict is found.
    """
    if not answer_text:
        return None

    verdicts = {"yes": True, "no": False}

    # Pass 1: a bare "yes"/"no" line, searched from the end of the reply.
    for line in reversed(answer_text.strip().splitlines()):
        cleaned = line.strip().strip(string.punctuation).lower()
        if cleaned in verdicts:
            return verdicts[cleaned]

    # Pass 2: first/last word of a line, looking only at text after any reasoning block.
    # rpartition returns the whole string as the tail when the marker is absent.
    visible = answer_text.rpartition("</think>")[2]
    for line in reversed(visible.strip().splitlines()):
        words = [word.strip(string.punctuation).lower() for word in line.split()]
        words = [word for word in words if word]
        if not words:
            continue
        for candidate in (words[0], words[-1]):
            if candidate in verdicts:
                return verdicts[candidate]

    return None

def evaluate_model(get_completion_func, model_name, col_name, flagged_indices, data=None):
    """Score one model on yes/no questions and collect its per-sample predictions.

    Args:
        get_completion_func: Callable that takes the prompt string and returns the
            model's raw text reply.
        model_name: Human-readable name, used only in the progress message.
        col_name: Key under which each prediction is stored in the result rows.
        flagged_indices: Mutable set of sample indices to skip. The index of any
            sample whose request raises is added to it in place, so a set shared
            across successive calls makes later calls skip those samples too.
        data: Optional iterable of examples, each providing "passage", "question"
            and "answer". Defaults to the module-level ``dataset``; pass a subset
            for a quick trial run.

    Returns:
        Tuple ``(accuracy, correct, results)``: ``accuracy`` is ``correct`` divided
        by the number of samples answered (0 if none), ``correct`` is the count of
        matching predictions, and ``results`` is a list of row dicts with keys
        "q_number", "passage", "question", "actual_answer" and ``col_name``.
    """
    examples = dataset if data is None else data
    results = []
    correct = 0
    total = 0
    print(f"Evaluating {model_name}...")

    for i, example in enumerate(examples):
        if i in flagged_indices:
            continue

        passage = example["passage"]
        question = example["question"]
        true_answer = example["answer"]
        prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer with yes or no."

        try:
            answer_text = get_completion_func(prompt)
        except Exception as e:
            # Any failure flags the sample and excludes it from this and later runs.
            # NOTE(review): this also catches errors unrelated to content policy
            # (e.g. timeouts), which then drop the sample for every later model.
            print(f"Skipping sample {i} due to error: {e}")
            flagged_indices.add(i)
            continue

        # parse_answer returns None for an unparseable reply; None never equals the
        # reference answer, so such replies are scored as incorrect.
        pred = parse_answer(answer_text)
        is_correct = (pred == true_answer)
        correct += int(is_correct)
        total += 1

        results.append({
            "q_number": i + 1,  # 1-based position within the iterated data
            "passage": passage,
            "question": question,
            "actual_answer": true_answer,
            col_name: pred
        })

    accuracy = correct / total if total > 0 else 0
    print("Evaluated")
    return accuracy, correct, results

# Indices of samples whose request raised an error (in the recorded run these were
# Azure OpenAI content-policy rejections). The set is shared and mutated by every
# evaluate_model call, so a sample flagged by one model is skipped by all later ones.
# NOTE(review): models evaluated earlier were still scored on samples that a later
# model flagged, so the acc_* values are not all computed over the same samples;
# only the merged DataFrame below is restricted to the samples every model answered.
flagged_indices = set()

acc_gpt4o, correct_gpt4o, rows_gpt4o = evaluate_model(get_4o_completion, "OpenAI GPT-4o", "4o_pred", flagged_indices)
acc_gpt41, correct_gpt41, rows_gpt41 = evaluate_model(get_41_completion, "OpenAI GPT-4.1", "41_pred", flagged_indices)
acc_deepseekr1, correct_deepseekr1, rows_deepseekr1 = evaluate_model(get_deepseek_completion, "DeepSeek R1", "r1_pred", flagged_indices)
acc_gpt4omini, correct_gpt4omini, rows_gpt4omini = evaluate_model(get_4o_mini_completion, "OpenAI GPT-4o-mini", "4omini_pred", flagged_indices)
acc_gpt41mini, correct_gpt41mini, rows_gpt41mini = evaluate_model(get_41_mini_completion, "OpenAI GPT-4.1-mini", "41_mini_pred", flagged_indices)
acc_phi, correct_phi, rows_phi = evaluate_model(get_phi_completion, "Phi", "phi_pred", flagged_indices)
acc_mistralnemo, correct_mistralnemo, rows_mistralnemo = evaluate_model(get_mistral_completion, "Mistral NeMo", "mistral_pred", flagged_indices)
acc_llama8b, correct_llama8b, rows_llama8b = evaluate_model(get_llama_completion, "LLaMA 8B", "llama_pred", flagged_indices)

df_gpt4o = pd.DataFrame(rows_gpt4o)
df_gpt41 = pd.DataFrame(rows_gpt41)
df_deepseekr1 = pd.DataFrame(rows_deepseekr1)
df_gpt4omini = pd.DataFrame(rows_gpt4omini)
df_gpt41mini = pd.DataFrame(rows_gpt41mini)
df_phi = pd.DataFrame(rows_phi)
df_mistralnemo = pd.DataFrame(rows_mistralnemo)
df_llama8b = pd.DataFrame(rows_llama8b)

# Join the per-model frames on the shared sample columns. merge() defaults to an
# inner join, so only samples present in every model's results are kept.
MERGE_KEYS = ["q_number", "passage", "question", "actual_answer"]
df = df_gpt4o
for model_df in (df_gpt41, df_deepseekr1, df_gpt4omini, df_gpt41mini,
                 df_phi, df_mistralnemo, df_llama8b):
    df = df.merge(model_df, on=MERGE_KEYS)

RESULTS_CSV = "results_combined.csv"
df.to_csv(RESULTS_CSV, index=False)
# Report the file that was actually written.
print(f"Saved results to {RESULTS_CSV}")

print("Final Accuracies:")
print(f"OpenAI GPT-4o: {acc_gpt4o:.2f}")
print(f"GPT-41: {acc_gpt41:.2f}")
print(f"DeepSeek R1: {acc_deepseekr1:.2f}")
print(f"GPT-4o-mini: {acc_gpt4omini:.2f}")
print(f"GPT-41-mini: {acc_gpt41mini:.2f}")
print(f"Phi: {acc_phi:.2f}")
print(f"Mistral NeMo: {acc_mistralnemo:.2f}")
print(f"LLaMA 8B: {acc_llama8b:.2f}")

final_accuracies = {
    'final accuracies': {
        "GPT-4o": {"accuracy": acc_gpt4o, "correct_count": correct_gpt4o},
        "GPT-41": {"accuracy": acc_gpt41, "correct_count": correct_gpt41},
        "DeepSeek R1": {"accuracy": acc_deepseekr1, "correct_count": correct_deepseekr1},
        "GPT-4o-mini": {"accuracy": acc_gpt4omini, "correct_count": correct_gpt4omini},
        "GPT-41-mini": {"accuracy": acc_gpt41mini, "correct_count": correct_gpt41mini},
        "Phi": {"accuracy": acc_phi, "correct_count": correct_phi},
        "Mistral NeMo": {"accuracy": acc_mistralnemo, "correct_count": correct_mistralnemo},
        "LLaMA 8B": {"accuracy": acc_llama8b, "correct_count": correct_llama8b}
    },
    # Sorted so the saved file is stable across runs (set iteration order is arbitrary).
    'flagged_indices': sorted(flagged_indices)
}

ACCURACIES_JSON = "final_output.json"
with open(ACCURACIES_JSON, "w") as f:
    json.dump(final_accuracies, f, indent=2)
# Report the file that was actually written.
print(f"Saved accuracies to {ACCURACIES_JSON}")


Evaluating OpenAI GPT-4o...
Skipping sample 451 due to error: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'severity': 'safe'}}}}}
Skipping sample 927 due to error: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To 

In [10]:
# Show the sample indices that were skipped because a request raised during evaluation.
print(flagged_indices)

{1414, 6028, 927, 2978, 6306, 7588, 7464, 937, 6442, 8104, 7848, 6701, 2352, 4017, 6713, 1465, 451, 7620, 5708, 4301, 7377, 5203, 8148, 8660, 1879, 3163, 989, 3551, 8032, 6370, 8297, 5616, 4594}
