In [None]:
import os
from dotenv import load_dotenv
from datasets import load_dataset
import string
from openai import AzureOpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import UserMessage
import pandas as pd

# Pull endpoint/key settings from the local "env.env" file into the process
# environment before any client below reads them.
load_dotenv("env.env")

# Azure OpenAI client used by get_4o_mini_completion. A missing variable
# yields None here, exactly as os.getenv would.
llm4o_mini = AzureOpenAI(
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version="2024-05-01-preview",
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
)

def get_4o_mini_completion(prompt, temperature=0.1):
    """Ask the "gpt-4o-mini" deployment a single-turn question.

    Args:
        prompt: Text sent as the only user message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The content of the first returned choice with surrounding
        whitespace removed.
    """
    chat = [{"role": "user", "content": prompt}]
    completion = llm4o_mini.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat,
        temperature=temperature,
        stream=False,
    )
    reply_text = completion.choices[0].message.content
    return reply_text.strip()

# One ChatCompletionsClient per hosted model. Each reads its own
# endpoint/key pair from the environment; a missing variable yields None,
# exactly as os.getenv would.

# Client used by get_phi_completion.
client_phi = ChatCompletionsClient(
    endpoint=os.environ.get("PHI_AZURE_ENDPOINT"),
    credential=AzureKeyCredential(os.environ.get("PHI_AZURE_KEY")),
)

# Client used by get_mistral_completion.
client_mistral = ChatCompletionsClient(
    endpoint=os.environ.get("MISTRAL_ENDPOINT"),
    credential=AzureKeyCredential(os.environ.get("MISTRAL_KEY")),
)

# Client used by get_llama_completion.
client_llama = ChatCompletionsClient(
    endpoint=os.environ.get("LLAMA_3_8B_ENDPOINT"),
    credential=AzureKeyCredential(os.environ.get("LLAMA_3_8B_KEY")),
)

def get_phi_completion(prompt, temperature=0.1):
    """Send ``prompt`` as a single user message through ``client_phi``.

    Args:
        prompt: Text of the user message.
        temperature: Sampling temperature forwarded to the endpoint.

    Returns:
        The first choice's message content, stripped of surrounding
        whitespace.
    """
    conversation = [UserMessage(content=prompt)]
    reply = client_phi.complete(
        messages=conversation,
        model="phi",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

def get_mistral_completion(prompt, temperature=0.1):
    """Send ``prompt`` as a single user message through ``client_mistral``.

    Args:
        prompt: Text of the user message.
        temperature: Sampling temperature forwarded to the endpoint.

    Returns:
        The first choice's message content, stripped of surrounding
        whitespace.
    """
    conversation = [UserMessage(content=prompt)]
    reply = client_mistral.complete(
        messages=conversation,
        model="mistral-nemo",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

def get_llama_completion(prompt, temperature=0.1):
    """Send ``prompt`` as a single user message through ``client_llama``.

    Args:
        prompt: Text of the user message.
        temperature: Sampling temperature forwarded to the endpoint.

    Returns:
        The first choice's message content, stripped of surrounding
        whitespace.
    """
    conversation = [UserMessage(content=prompt)]
    reply = client_llama.complete(
        messages=conversation,
        model="llama-8b",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

# Load the validation split of the "boolq" dataset once at module level.
# evaluate_model iterates over it and reads each example's "passage",
# "question" and "answer" fields.
dataset = load_dataset("boolq", split="validation")

def parse_answer(answer_text):
    """Extract a yes/no verdict from a model reply.

    The reply is scanned line by line, last line first, in two passes:

    1. A line consisting solely of "yes" or "no" (ignoring case, surrounding
       whitespace and surrounding punctuation) decides the verdict.
    2. If no such line exists, a line that *starts* with a clearly delimited
       verdict is accepted, e.g. "Yes, the passage says so." or
       "Answer: no". The verdict word must be the whole remaining line or be
       directly followed by punctuation, so that sentences such as
       "No information is given." are not mistaken for an answer.

    Args:
        answer_text: Raw text returned by the model. ``None`` or an empty
            string is treated as "no answer".

    Returns:
        ``True`` for "yes", ``False`` for "no", or ``None`` when no verdict
        could be found.
    """
    if not answer_text:
        return None

    verdicts = {"yes": True, "no": False}
    lines = [line.strip() for line in answer_text.strip().splitlines()]

    # Pass 1: a line that is nothing but the verdict. Running this pass over
    # every line first keeps the strict rule authoritative.
    for line in reversed(lines):
        cleaned = line.strip(string.punctuation).lower()
        if cleaned in verdicts:
            return verdicts[cleaned]

    # Pass 2: a line that begins with the verdict, optionally after an
    # "Answer:" label.
    for line in reversed(lines):
        tokens = line.lower().split()
        if tokens and tokens[0].strip(string.punctuation) == "answer":
            tokens = tokens[1:]
        if not tokens:
            continue
        head = tokens[0]
        # Require either a lone word or trailing punctuation ("yes,", "no.")
        # so ordinary sentences starting with "No ..." are rejected.
        delimited = len(tokens) == 1 or head[-1] in string.punctuation
        verdict = head.strip(string.punctuation)
        if delimited and verdict in verdicts:
            return verdicts[verdict]

    return None

def evaluate_model(get_completion_func, model_name, col_name):
    """Score one model on every example of the module-level ``dataset``.

    For each example a passage/question prompt is built and passed to
    ``get_completion_func``; the reply is turned into a prediction with
    ``parse_answer`` and compared with the example's "answer" field.

    Samples whose completion call raises are reported and skipped: they are
    neither recorded nor counted in the accuracy denominator. A reply that
    ``parse_answer`` cannot interpret gives a prediction of ``None``, which
    is recorded and compared like any other prediction.

    Args:
        get_completion_func: Callable taking the prompt string and returning
            the model's reply text.
        model_name: Label used in progress and summary messages.
        col_name: Key under which the prediction is stored in each row.

    Returns:
        A ``(accuracy, rows)`` tuple: the share of scored samples answered
        correctly (``0`` when nothing was scored) and one dict per scored
        sample.
    """
    print(f"Evaluating {model_name}...")

    rows = []
    n_correct = 0
    n_scored = 0

    for number, sample in enumerate(dataset, start=1):
        passage = sample["passage"]
        question = sample["question"]
        true_answer = sample["answer"]
        prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer with yes or no."

        try:
            reply = get_completion_func(prompt)
        except Exception as e:
            # Best effort: one failing request must not abort the whole run.
            print(f"Skipping sample {number} due to error: {e}")
            continue

        pred = parse_answer(reply)
        is_correct = (pred == true_answer)
        if is_correct:
            n_correct += 1
        n_scored += 1

        row = {
            "q_number": number,
            "passage": passage,
            "question": question,
            "actual_answer": true_answer,
        }
        row[col_name] = pred
        rows.append(row)

        print(f"Sample {number}:\n  True Answer: {true_answer}\n  Model Answer: {pred}\n  Correct: {is_correct}\n")

    if n_scored > 0:
        accuracy = n_correct / n_scored
    else:
        accuracy = 0
    print(f"Accuracy for {model_name}: {accuracy:.2f}\n")
    return accuracy, rows

# Evaluate every model on the same dataset; each call returns the model's
# accuracy and one row per scored sample.
acc_4omini, rows_4omini = evaluate_model(get_4o_mini_completion, "OpenAI GPT-4o-mini", "4omini_pred")
acc_phi, rows_phi = evaluate_model(get_phi_completion, "Phi", "phi_pred")
acc_mistral, rows_mistral = evaluate_model(get_mistral_completion, "Mistral NeMo", "mistral_pred")
acc_llama, rows_llama = evaluate_model(get_llama_completion, "LLaMA 8B", "llama_pred")

df_4omini = pd.DataFrame(rows_4omini)
df_phi = pd.DataFrame(rows_phi)
df_mistral = pd.DataFrame(rows_mistral)
df_llama = pd.DataFrame(rows_llama)

# Combine the per-model predictions into one table keyed by the sample.
# NOTE: merge defaults to an inner join, so a sample that was skipped for any
# model is absent from the combined table.
merge_keys = ["q_number", "passage", "question", "actual_answer"]
df = df_4omini.merge(df_phi, on=merge_keys)
df = df.merge(df_mistral, on=merge_keys)
df = df.merge(df_llama, on=merge_keys)

print("Final Accuracies:")
print(f"GPT-4o-mini: {acc_4omini:.2f}")
print(f"Phi: {acc_phi:.2f}")
print(f"Mistral NeMo: {acc_mistral:.2f}")
print(f"LLaMA 8B: {acc_llama:.2f}")

# Use one variable for both the write and the message so the reported file
# name always matches the file actually written.
output_path = "results2.csv"
df.to_csv(output_path, index=False)
print(f"Saved results to {output_path}")


  from .autonotebook import tqdm as notebook_tqdm


Evaluating OpenAI GPT-4o-mini...
Sample 1:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 2:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 3:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 4:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 5:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 6:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 7:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 8:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 9:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 10:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 11:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 12:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 13:
  True Answer: True
  Model Answer: False
  Correct: False

Sample 14:
  True Answer: True
  Model Answer: True
  Correct: True

Sa