In [1]:
import os
import json
from dotenv import load_dotenv
from datasets import load_dataset
import string
from openai import AzureOpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import UserMessage
import pandas as pd

# Read key/value pairs from the local "env.env" file into the process
# environment so the credentials below can be looked up by name.
load_dotenv("env.env")

# Shared Azure OpenAI client used by get_4o_completion.
# A missing environment variable is passed through as None.
llm4o = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version="2024-05-01-preview",
)

def get_4o_completion(prompt, temperature=0.1):
    """Send one user message to the "gpt4o" deployment and return its reply.

    Args:
        prompt: Text sent as the only (user) message of the conversation.
        temperature: Sampling temperature forwarded to the API (default 0.1).

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace removed.
    """
    completion = llm4o.chat.completions.create(
        model="gpt4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# BoolQ training split; evaluate_model iterates over this module-level name.
dataset = load_dataset(path="boolq", split="train")

def parse_answer(answer_text):
    """Extract a yes/no verdict from a model reply.

    The reply is scanned from its last line to its first, in two passes:

    1. A line that consists of nothing but "yes" or "no" (ignoring case,
       surrounding whitespace and surrounding punctuation) decides the
       verdict.
    2. If no such line exists, a line whose first or last word is "yes" or
       "no" decides it, so replies such as "Yes, the passage says so." or
       "Answer: No" are still recognised instead of being dropped.

    Args:
        answer_text: Raw reply text; may be empty or None.

    Returns:
        True for "yes", False for "no", or None when no verdict is found
        (including empty or None input).
    """
    # Guard against a missing reply so the evaluation loop is not aborted
    # by an AttributeError on None.
    if not answer_text:
        return None

    verdicts = {"yes": True, "no": False}
    lines = answer_text.strip().splitlines()

    # Pass 1: whole line is the verdict.
    for line in reversed(lines):
        cleaned = line.strip().strip(string.punctuation).lower()
        if cleaned in verdicts:
            return verdicts[cleaned]

    # Pass 2: verdict is the leading or trailing word of a longer line.
    for line in reversed(lines):
        words = [word.strip(string.punctuation).lower() for word in line.split()]
        words = [word for word in words if word]
        if not words:
            continue
        # The leading word wins over the trailing one when both qualify.
        for candidate in (words[0], words[-1]):
            if candidate in verdicts:
                return verdicts[candidate]

    return None

def evaluate_model(get_completion_func, model_name, col_name, flagged_indices):
    """Score a completion function on every sample of the module-level `dataset`.

    Each sample is turned into a passage/question prompt, sent through
    `get_completion_func`, and the reply is parsed with `parse_answer`.
    A reply that cannot be parsed yields a prediction of None, which never
    equals the boolean label and is therefore counted as incorrect.

    Args:
        get_completion_func: Callable taking the prompt string and returning
            the reply text.
        model_name: Label used only in the progress message.
        col_name: Key under which the prediction is stored in each result row.
        flagged_indices: Set of 0-based sample indices to skip. It is mutated
            in place: the index of any sample whose completion call raises is
            added to it.

    Returns:
        A tuple `(accuracy, correct, results)`: the fraction of scored samples
        answered correctly (0 when nothing was scored), the number of correct
        predictions, and a list with one dict per scored sample.
    """
    print(f"Evaluating {model_name}...")

    rows = []
    n_correct = 0
    n_scored = 0

    for idx, sample in enumerate(dataset):
        # Samples already flagged by the caller are not sent at all.
        if idx in flagged_indices:
            continue

        passage, question, gold = sample["passage"], sample["question"], sample["answer"]
        prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer with yes or no."

        try:
            reply = get_completion_func(prompt)
        except Exception as e:
            # Best effort: record the failing index and move on.
            print(f"Skipping sample {idx} due to error: {e}")
            flagged_indices.add(idx)
            continue

        prediction = parse_answer(reply)
        if prediction == gold:
            n_correct += 1
        n_scored += 1

        rows.append({
            "q_number": idx + 1,  # 1-based position in the dataset
            "passage": passage,
            "question": question,
            "actual_answer": gold,
            col_name: prediction,
        })

    accuracy = n_correct / n_scored if n_scored > 0 else 0
    print("Evaluated")
    return accuracy, n_correct, rows

# Indices of samples whose completion call failed; filled in by evaluate_model.
flagged_indices = set()

acc_gpt4o, correct_gpt4o, rows_gpt4o = evaluate_model(
    get_4o_completion, "OpenAI GPT-4o", "4o_pred", flagged_indices
)

# Per-sample predictions for later inspection.
df = pd.DataFrame(rows_gpt4o)
df.to_csv("gpt4o(2)_train_results.csv", index=False)
print("Saved gpt4o(2)_train_results.csv")
# Only a single model is evaluated here, so the label names that model
# rather than an ensemble.
print(f"GPT-4o Accuracy: {acc_gpt4o:.2f}")

final_output = {
    "model_accuracy": acc_gpt4o,
    "correct_count": correct_gpt4o,
    # Denominator of model_accuracy: samples that were actually scored.
    "evaluated_count": len(rows_gpt4o),
    # Sorted so the saved file is deterministic and easy to diff.
    "flagged_indices": sorted(flagged_indices),
}

with open("gpt4o(2)_train_output.json", "w") as f:
    json.dump(final_output, f, indent=2)
print("Saved gpt4o(2)_train_output.json")

  from .autonotebook import tqdm as notebook_tqdm


Evaluating OpenAI GPT-4o...
Skipping sample 451 due to error: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'severity': 'safe'}}}}}
Skipping sample 927 due to error: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To 

In [2]:
import os
import json
from dotenv import load_dotenv
from datasets import load_dataset
import string
from openai import AzureOpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import UserMessage
import pandas as pd

# Read key/value pairs from the local "env.env" file into the process
# environment so the credentials below can be looked up by name.
load_dotenv("env.env")

# Shared Azure OpenAI client used by get_4o_completion.
# A missing environment variable is passed through as None.
llm4o = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version="2024-05-01-preview",
)

def get_4o_completion(prompt, temperature=0.1):
    """Send one user message to the "gpt4o" deployment and return its reply.

    Args:
        prompt: Text sent as the only (user) message of the conversation.
        temperature: Sampling temperature forwarded to the API (default 0.1).

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace removed.
    """
    completion = llm4o.chat.completions.create(
        model="gpt4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# BoolQ validation split; evaluate_model iterates over this module-level name.
dataset = load_dataset(path="boolq", split="validation")

def parse_answer(answer_text):
    """Extract a yes/no verdict from a model reply.

    The reply is scanned from its last line to its first, in two passes:

    1. A line that consists of nothing but "yes" or "no" (ignoring case,
       surrounding whitespace and surrounding punctuation) decides the
       verdict.
    2. If no such line exists, a line whose first or last word is "yes" or
       "no" decides it, so replies such as "Yes, the passage says so." or
       "Answer: No" are still recognised instead of being dropped.

    Args:
        answer_text: Raw reply text; may be empty or None.

    Returns:
        True for "yes", False for "no", or None when no verdict is found
        (including empty or None input).
    """
    # Guard against a missing reply so the evaluation loop is not aborted
    # by an AttributeError on None.
    if not answer_text:
        return None

    verdicts = {"yes": True, "no": False}
    lines = answer_text.strip().splitlines()

    # Pass 1: whole line is the verdict.
    for line in reversed(lines):
        cleaned = line.strip().strip(string.punctuation).lower()
        if cleaned in verdicts:
            return verdicts[cleaned]

    # Pass 2: verdict is the leading or trailing word of a longer line.
    for line in reversed(lines):
        words = [word.strip(string.punctuation).lower() for word in line.split()]
        words = [word for word in words if word]
        if not words:
            continue
        # The leading word wins over the trailing one when both qualify.
        for candidate in (words[0], words[-1]):
            if candidate in verdicts:
                return verdicts[candidate]

    return None

def evaluate_model(get_completion_func, model_name, col_name, flagged_indices):
    """Score a completion function on every sample of the module-level `dataset`.

    Each sample is turned into a passage/question prompt, sent through
    `get_completion_func`, and the reply is parsed with `parse_answer`.
    A reply that cannot be parsed yields a prediction of None, which never
    equals the boolean label and is therefore counted as incorrect.

    Args:
        get_completion_func: Callable taking the prompt string and returning
            the reply text.
        model_name: Label used only in the progress message.
        col_name: Key under which the prediction is stored in each result row.
        flagged_indices: Set of 0-based sample indices to skip. It is mutated
            in place: the index of any sample whose completion call raises is
            added to it.

    Returns:
        A tuple `(accuracy, correct, results)`: the fraction of scored samples
        answered correctly (0 when nothing was scored), the number of correct
        predictions, and a list with one dict per scored sample.
    """
    print(f"Evaluating {model_name}...")

    rows = []
    n_correct = 0
    n_scored = 0

    for idx, sample in enumerate(dataset):
        # Samples already flagged by the caller are not sent at all.
        if idx in flagged_indices:
            continue

        passage, question, gold = sample["passage"], sample["question"], sample["answer"]
        prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer with yes or no."

        try:
            reply = get_completion_func(prompt)
        except Exception as e:
            # Best effort: record the failing index and move on.
            print(f"Skipping sample {idx} due to error: {e}")
            flagged_indices.add(idx)
            continue

        prediction = parse_answer(reply)
        if prediction == gold:
            n_correct += 1
        n_scored += 1

        rows.append({
            "q_number": idx + 1,  # 1-based position in the dataset
            "passage": passage,
            "question": question,
            "actual_answer": gold,
            col_name: prediction,
        })

    accuracy = n_correct / n_scored if n_scored > 0 else 0
    print("Evaluated")
    return accuracy, n_correct, rows

# Indices of samples whose completion call failed; filled in by evaluate_model.
flagged_indices = set()

acc_gpt4o, correct_gpt4o, rows_gpt4o = evaluate_model(
    get_4o_completion, "OpenAI GPT-4o", "4o_pred", flagged_indices
)

# Per-sample predictions for later inspection.
df = pd.DataFrame(rows_gpt4o)
df.to_csv("gpt4o(2)_valid_results.csv", index=False)
print("Saved gpt4o(2)_valid_results.csv")
# Only a single model is evaluated here, so the label names that model
# rather than an ensemble.
print(f"GPT-4o Accuracy: {acc_gpt4o:.2f}")

final_output = {
    "model_accuracy": acc_gpt4o,
    "correct_count": correct_gpt4o,
    # Denominator of model_accuracy: samples that were actually scored.
    "evaluated_count": len(rows_gpt4o),
    # Sorted so the saved file is deterministic and easy to diff.
    "flagged_indices": sorted(flagged_indices),
}

with open("gpt4o(2)_valid_output.json", "w") as f:
    json.dump(final_output, f, indent=2)
print("Saved gpt4o(2)_valid_output.json")

Evaluating OpenAI GPT-4o...
Skipping sample 129 due to error: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'medium'}}}}}
Skipping sample 265 due to error: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To 