In [1]:
import os
import json
from dotenv import load_dotenv
import string
import pandas as pd
from openai import AzureOpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import UserMessage

# Load endpoint/key settings from the local "env.env" file into the process environment.
load_dotenv("env.env")

# One client per hosted model. Every endpoint and key is read with os.getenv, which
# returns None when the variable is unset; nothing here validates the values.
client_phi = ChatCompletionsClient(
    endpoint=os.getenv("PHI_AZURE_ENDPOINT"),
    credential=AzureKeyCredential(os.getenv("PHI_AZURE_KEY"))
)
client_mistral = ChatCompletionsClient(
    endpoint=os.getenv("MISTRAL_ENDPOINT"),
    credential=AzureKeyCredential(os.getenv("MISTRAL_KEY"))
)
client_llama = ChatCompletionsClient(
    endpoint=os.getenv("LLAMA_3_8B_ENDPOINT"),
    credential=AzureKeyCredential(os.getenv("LLAMA_3_8B_KEY"))
)
# Azure OpenAI client used by get_4o_mini_completion.
llm4o = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-05-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)
# Azure OpenAI client used by get_41_mini_completion (separate endpoint, key and API version).
llm41 = AzureOpenAI(
    azure_endpoint=os.getenv("GPT_4_1_ENDPOINT"),
    api_key=os.getenv("GPT_4_1_KEY"),
    api_version="2024-12-01-preview"
)

def get_phi_completion(prompt, temperature=0.1):
    """Send `prompt` as a single user message to `client_phi` (model "phi").

    Returns the text of the first choice with surrounding whitespace removed.
    """
    reply = client_phi.complete(
        messages=[UserMessage(content=prompt)],
        model="phi",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

def get_mistral_completion(prompt, temperature=0.1):
    """Send `prompt` as a single user message to `client_mistral` (model "mistral-nemo").

    Returns the text of the first choice with surrounding whitespace removed.
    """
    reply = client_mistral.complete(
        messages=[UserMessage(content=prompt)],
        model="mistral-nemo",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

def get_llama_completion(prompt, temperature=0.1):
    """Send `prompt` as a single user message to `client_llama` (model "llama-8b").

    Returns the text of the first choice with surrounding whitespace removed.
    """
    reply = client_llama.complete(
        messages=[UserMessage(content=prompt)],
        model="llama-8b",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

def get_4o_mini_completion(prompt, temperature=0.1):
    """Request a non-streaming chat completion from `llm4o` using model "gpt-4o-mini".

    The prompt is sent as one user-role message; the first choice's text is
    returned with surrounding whitespace removed.
    """
    completion = llm4o.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=False,
    )
    reply_text = completion.choices[0].message.content
    return reply_text.strip()

def get_41_mini_completion(prompt, temperature=0.1):
    """Request a non-streaming chat completion from `llm41` using model "gpt-4.1-mini".

    The prompt is sent as one user-role message; the first choice's text is
    returned with surrounding whitespace removed.
    """
    completion = llm41.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=False,
    )
    reply_text = completion.choices[0].message.content
    return reply_text.strip()

def parse_answer(answer_text):
    """Extract a yes/no verdict from a model reply.

    Lines are scanned from last to first. A line counts only if, after
    trimming whitespace and leading/trailing punctuation and lowercasing,
    it is exactly "yes" or "no".

    Returns True for yes, False for no, and None when no line qualifies.
    """
    for raw_line in answer_text.strip().splitlines()[::-1]:
        token = raw_line.strip().strip(string.punctuation).lower()
        if token == 'yes':
            return True
        if token == 'no':
            return False
    return None

def is_3_2_split(row):
    """Return True when the five model predictions split 3-2 or 2-3.

    `row` is any mapping-like object (dict, pandas Series) indexed by the five
    `*_pred` column names. Only genuine boolean values are counted as votes;
    missing or non-boolean entries (None, NaN, strings, ints) are ignored, so
    a row with fewer than five boolean votes can never be a 3-2 split.

    Returns:
        bool: True if exactly three votes agree and two disagree.
    """
    pred_columns = ('phi_pred', 'mistral_pred', 'llama_pred', 'gpt4o_mini_pred', 'gpt41_mini_pred')
    # Identity checks (`p is True`) miss numpy.bool_ values, which is what a
    # bool-dtype pandas Series hands back. pd.api.types.is_bool accepts both
    # Python bool and numpy.bool_ while still rejecting ints, NaN and None.
    votes = [bool(row[col]) for col in pred_columns if pd.api.types.is_bool(row[col])]
    yes_count = sum(votes)
    no_count = len(votes) - yes_count
    return (yes_count, no_count) in ((3, 2), (2, 3))

def get_ensemble_prediction(passage, question, true_answer):
    """Ask all five models the same yes/no question and majority-vote the answers.

    Args:
        passage: Passage text inserted into the prompt.
        question: Question text inserted into the prompt.
        true_answer: Ground-truth label the vote is compared against.

    Returns:
        A tuple `(vote, is_correct, model_preds)`:
        - `vote` is True/False, or None when no model produced a parseable answer.
        - `is_correct` is `vote == true_answer` (False when `vote` is None).
        - `model_preds` maps `"<name>_raw"` to the raw reply and `"<name>_pred"`
          to the parsed boolean for each model; both are None if the call failed.
    """
    prompt = f"Passage: {passage}\nQuestion: {question}\nBased on the provided passage, answer the provided question carefully with just yes or no."
    model_funcs = [
        ("phi", get_phi_completion),
        ("mistral", get_mistral_completion),
        ("llama", get_llama_completion),
        ("gpt4o_mini", get_4o_mini_completion),
        ("gpt41_mini", get_41_mini_completion)
    ]
    preds = []
    model_preds = {}
    for name, func in model_funcs:
        try:
            answer = func(prompt)
            parsed = parse_answer(answer)
        except Exception as exc:
            # Best-effort: a failing model is dropped from the vote, but the
            # failure is reported so a systematic outage does not go unnoticed.
            print(f"Warning: {name} failed and is excluded from the vote: {exc!r}")
            answer, parsed = None, None
        model_preds[f"{name}_raw"] = answer
        model_preds[f"{name}_pred"] = parsed
        if parsed is not None:
            preds.append(parsed)
    if not preds:
        return None, False, model_preds
    # Strict majority of the parsed votes; an even tie therefore resolves to False.
    vote = sum(preds) > len(preds) // 2
    return vote, vote == true_answer, model_preds

def run_ensemble_on_3_2(csv_in, csv_out, json_out):
    """Re-run the five-model ensemble on rows whose stored predictions split 3-2.

    Reads `csv_in`, keeps only rows for which `is_3_2_split` is true, queries
    the ensemble for each remaining row, and writes:
    - `csv_out`: one row per evaluated question with the ensemble vote and the
      five per-model predictions;
    - `json_out`: accuracy, number of correct votes, and the DataFrame index
      labels of rows for which no model produced a parseable answer.

    Rows without any parseable answer are flagged and excluded from accuracy.
    """
    output_columns = [
        "q_number", "passage", "question", "actual_answer", "ensemble_pred",
        "phi_pred", "mistral_pred", "llama_pred", "gpt4o_mini_pred", "gpt41_mini_pred",
    ]
    df = pd.read_csv(csv_in)
    df = df[df.apply(is_3_2_split, axis=1)]
    flagged_indices = set()
    results = []
    correct = 0
    total = 0
    print(f"Evaluating Ensemble on {csv_in} 3-2/2-3 splits...")
    for i, row in df.iterrows():
        passage = row["passage"]
        question = row["question"]
        true_answer = row["actual_answer"]
        pred, is_correct, model_preds = get_ensemble_prediction(passage, question, true_answer)
        if pred is None:
            flagged_indices.add(i)
            continue
        entry = {
            "q_number": row["q_number"],
            "passage": passage,
            "question": question,
            "actual_answer": true_answer,
            "ensemble_pred": pred,
            "phi_pred": model_preds["phi_pred"],
            "mistral_pred": model_preds["mistral_pred"],
            "llama_pred": model_preds["llama_pred"],
            "gpt4o_mini_pred": model_preds["gpt4o_mini_pred"],
            "gpt41_mini_pred": model_preds["gpt41_mini_pred"]
        }
        results.append(entry)
        correct += int(is_correct)
        total += 1
    acc_ensemble = correct / total if total > 0 else 0
    print("Evaluated Ensemble")
    # Passing the columns explicitly keeps the header row even when `results`
    # is empty; a header-less CSV cannot be parsed by pd.read_csv later on.
    df_ensemble = pd.DataFrame(results, columns=output_columns)
    df_ensemble.to_csv(csv_out, index=False)
    print(f"Saved {csv_out}")
    print(f"Ensemble Accuracy: {acc_ensemble:.2f}")
    final_output = {
        "ensemble_accuracy": acc_ensemble,
        "correct_count": correct,
        # Sorted so the JSON is stable across runs instead of following set order.
        "flagged_indices": sorted(flagged_indices)
    }
    with open(json_out, "w") as f:
        json.dump(final_output, f, indent=2)
    print(f"Saved {json_out}")

# Re-run the ensemble on the train and validation files; each call writes a CSV of
# per-question predictions and a JSON summary.
run_ensemble_on_3_2("ensemble3_train.csv", "ensemble3_1_train.csv", "ensemble3_1_train.json")
run_ensemble_on_3_2("ensemble3_valid.csv", "ensemble3_1_valid.csv", "ensemble3_1_valid.json")

Evaluating Ensemble on ensemble3_train.csv 3-2/2-3 splits...
Evaluated Ensemble
Saved ensemble3_1_train.csv
Ensemble Accuracy: 0.66
Saved ensemble3_1_train.json
Evaluating Ensemble on ensemble3_valid.csv 3-2/2-3 splits...
Evaluated Ensemble
Saved ensemble3_1_valid.csv
Ensemble Accuracy: 0.69
Saved ensemble3_1_valid.json


In [2]:
import os
import json
from dotenv import load_dotenv
import string
import pandas as pd
from openai import AzureOpenAI

# Load endpoint/key settings from the local "env.env" file into the process environment.
load_dotenv("env.env")

# Azure OpenAI client used by get_4o_completion. os.getenv returns None for unset
# variables; nothing here validates the values.
llm4o = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-05-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

def get_4o_completion(prompt, temperature=0.1):
    """Request a non-streaming chat completion from `llm4o` using model "gpt4o".

    The prompt is sent as one user-role message; the first choice's text is
    returned with surrounding whitespace removed.
    """
    completion = llm4o.chat.completions.create(
        model="gpt4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=False,
    )
    reply_text = completion.choices[0].message.content
    return reply_text.strip()

def parse_answer(answer_text):
    """Map a model reply to True (yes), False (no) or None (no verdict found).

    The reply is examined line by line starting from the last line. A line
    qualifies only if it equals "yes" or "no" once whitespace and
    leading/trailing punctuation are trimmed and the text is lowercased.
    """
    verdicts = {'yes': True, 'no': False}
    candidates = answer_text.strip().splitlines()
    for idx in range(len(candidates) - 1, -1, -1):
        word = candidates[idx].strip().strip(string.punctuation).lower()
        if word in verdicts:
            return verdicts[word]
    return None

def is_3_2_split(row):
    """Return True when the five model predictions split 3-2 or 2-3.

    `row` is any mapping-like object (dict, pandas Series) indexed by the five
    `*_pred` column names. Only genuine boolean values are counted as votes;
    missing or non-boolean entries (None, NaN, strings, ints) are ignored, so
    a row with fewer than five boolean votes can never be a 3-2 split.

    Returns:
        bool: True if exactly three votes agree and two disagree.
    """
    pred_columns = ('phi_pred', 'mistral_pred', 'llama_pred', 'gpt4o_mini_pred', 'gpt41_mini_pred')
    # Identity checks (`p is True`) miss numpy.bool_ values, which is what a
    # bool-dtype pandas Series hands back. pd.api.types.is_bool accepts both
    # Python bool and numpy.bool_ while still rejecting ints, NaN and None.
    votes = [bool(row[col]) for col in pred_columns if pd.api.types.is_bool(row[col])]
    yes_count = sum(votes)
    no_count = len(votes) - yes_count
    return (yes_count, no_count) in ((3, 2), (2, 3))

def evaluate_model_on_ensemble3_1(csv_in, csv_out, json_out):
    """Evaluate the single GPT-4o model on rows whose stored predictions split 3-2.

    Reads `csv_in`, keeps only rows for which `is_3_2_split` is true, asks the
    model each question, and writes:
    - `csv_out`: one row per answered question with the parsed prediction in
      the "4o_pred" column;
    - `json_out`: accuracy and the number of correct predictions.

    Rows whose API call raises are skipped (reported on stdout) and do not
    count toward the total. A reply that cannot be parsed yields a prediction
    of None, which still counts toward the total and never equals a boolean
    label, so it is scored as incorrect.
    """
    output_columns = ["q_number", "passage", "question", "actual_answer", "4o_pred"]
    df = pd.read_csv(csv_in)
    # Only keep 3-2/2-3 splits
    df = df[df.apply(is_3_2_split, axis=1)]
    results = []
    correct = 0
    total = 0
    print(f"Evaluating GPT-4o on {csv_in} (only 3-2/2-3 splits)...")
    for i, row in df.iterrows():
        passage = row["passage"]
        question = row["question"]
        true_answer = row["actual_answer"]
        prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer with yes or no."
        try:
            answer_text = get_4o_completion(prompt)
        except Exception as e:
            print(f"Skipping sample {i} due to error: {e}")
            continue
        pred = parse_answer(answer_text)
        is_correct = (pred == true_answer)
        correct += int(is_correct)
        total += 1
        results.append({
            "q_number": row["q_number"],
            "passage": passage,
            "question": question,
            "actual_answer": true_answer,
            "4o_pred": pred
        })
    accuracy = correct / total if total > 0 else 0
    print("Evaluated")
    # Passing the columns explicitly keeps the header row even when `results`
    # is empty; a header-less CSV cannot be parsed by pd.read_csv later on.
    df_out = pd.DataFrame(results, columns=output_columns)
    df_out.to_csv(csv_out, index=False)
    print(f"Saved {csv_out}")
    print(f"GPT-4o Accuracy: {accuracy:.2f}")
    final_output = {
        "model_accuracy": accuracy,
        "correct_count": correct
    }
    with open(json_out, "w") as f:
        json.dump(final_output, f, indent=2)
    print(f"Saved {json_out}")

# Evaluate GPT-4o on the train and validation ensemble outputs; each call writes a
# CSV of per-question predictions and a JSON summary.
evaluate_model_on_ensemble3_1("ensemble3_1_train.csv", "ensemble3_gpt_train.csv", "ensemble3_gpt_train.json")
evaluate_model_on_ensemble3_1("ensemble3_1_valid.csv", "ensemble3_gpt_valid.csv", "ensemble3_gpt_valid.json")

Evaluating GPT-4o on ensemble3_1_train.csv (only 3-2/2-3 splits)...
Evaluated
Saved ensemble3_gpt_train.csv
GPT-4o Accuracy: 0.70
Saved ensemble3_gpt_train.json
Evaluating GPT-4o on ensemble3_1_valid.csv (only 3-2/2-3 splits)...
Evaluated
Saved ensemble3_gpt_valid.csv
GPT-4o Accuracy: 0.73
Saved ensemble3_gpt_valid.json


In [3]:
import pandas as pd
import json

def is_3_2_split(row):
    """Return True when the five model predictions split 3-2 or 2-3.

    `row` is any mapping-like object (dict, pandas Series) indexed by the five
    `*_pred` column names. Only genuine boolean values are counted as votes;
    missing or non-boolean entries (None, NaN, strings, ints) are ignored, so
    a row with fewer than five boolean votes can never be a 3-2 split.

    Returns:
        bool: True if exactly three votes agree and two disagree.
    """
    pred_columns = ('phi_pred', 'mistral_pred', 'llama_pred', 'gpt4o_mini_pred', 'gpt41_mini_pred')
    # Identity checks (`p is True`) miss numpy.bool_ values, which is what a
    # bool-dtype pandas Series hands back. pd.api.types.is_bool accepts both
    # Python bool and numpy.bool_ while still rejecting ints, NaN and None.
    votes = [bool(row[col]) for col in pred_columns if pd.api.types.is_bool(row[col])]
    yes_count = sum(votes)
    no_count = len(votes) - yes_count
    return (yes_count, no_count) in ((3, 2), (2, 3))

def compare_ensembles(csv_a, csv_b, out_json):
    """Compare two ensemble prediction files on their shared questions.

    `csv_a` is first reduced to rows for which `is_3_2_split` is true. Both
    files are then keyed by "q_number" and only question numbers present in
    both are scored. Each file's "ensemble_pred" is checked against the
    "actual_answer" taken from `csv_a`.

    Writes a JSON object to `out_json` with the number of shared questions,
    how many each ensemble got right, and how many each got right while the
    other did not.
    """
    frame_a = pd.read_csv(csv_a)
    frame_b = pd.read_csv(csv_b)
    split_mask = frame_a.apply(is_3_2_split, axis=1)
    frame_a = frame_a[split_mask].set_index("q_number")
    frame_b = frame_b.set_index("q_number")
    shared = frame_a.index.intersection(frame_b.index)

    tally = {"a": 0, "b": 0, "a_only": 0, "b_only": 0}
    for q_id in shared:
        truth = frame_a.loc[q_id, "actual_answer"]
        a_right = (frame_a.loc[q_id, "ensemble_pred"] == truth)
        b_right = (frame_b.loc[q_id, "ensemble_pred"] == truth)
        if a_right:
            tally["a"] += 1
        if b_right:
            tally["b"] += 1
        if a_right and not b_right:
            tally["a_only"] += 1
        if b_right and not a_right:
            tally["b_only"] += 1

    summary = {
        "total_questions": len(shared),
        "correct_ensemble3": tally["a"],
        "correct_ensemble3_1": tally["b"],
        "correct_ensemble3_not_ensemble3_1": tally["a_only"],
        "correct_ensemble3_1_not_ensemble3": tally["b_only"]
    }
    with open(out_json, "w") as handle:
        json.dump(summary, handle, indent=2)
    print(f"Saved {out_json}")

# Compare the original ensemble file with the re-run ensemble file for the train and
# validation sets; each call writes one JSON summary.
compare_ensembles("ensemble3_train.csv", "ensemble3_1_train.csv", "ensemble3_vs_3_1_train.json")
compare_ensembles("ensemble3_valid.csv", "ensemble3_1_valid.csv", "ensemble3_vs_3_1_valid.json")

Saved ensemble3_vs_3_1_train.json
Saved ensemble3_vs_3_1_valid.json


In [4]:
import pandas as pd
import json

def is_3_2_split(row):
    """Return True when the five model predictions split 3-2 or 2-3.

    `row` is any mapping-like object (dict, pandas Series) indexed by the five
    `*_pred` column names. Only genuine boolean values are counted as votes;
    missing or non-boolean entries (None, NaN, strings, ints) are ignored, so
    a row with fewer than five boolean votes can never be a 3-2 split.

    Returns:
        bool: True if exactly three votes agree and two disagree.
    """
    pred_columns = ('phi_pred', 'mistral_pred', 'llama_pred', 'gpt4o_mini_pred', 'gpt41_mini_pred')
    # Identity checks (`p is True`) miss numpy.bool_ values, which is what a
    # bool-dtype pandas Series hands back. pd.api.types.is_bool accepts both
    # Python bool and numpy.bool_ while still rejecting ints, NaN and None.
    votes = [bool(row[col]) for col in pred_columns if pd.api.types.is_bool(row[col])]
    yes_count = sum(votes)
    no_count = len(votes) - yes_count
    return (yes_count, no_count) in ((3, 2), (2, 3))

def compare_ensemble_and_gpt(csv_ens, csv_gpt, out_json):
    """Compare ensemble predictions with GPT-4o predictions on shared questions.

    `csv_ens` is first reduced to rows for which `is_3_2_split` is true. Both
    files are then keyed by "q_number" and only question numbers present in
    both are scored. The ensemble's "ensemble_pred" and the GPT file's
    "4o_pred" are each checked against "actual_answer" from `csv_ens`.

    Writes a JSON object to `out_json` with the number of shared questions,
    how many each side got right, and how many each got right while the other
    did not.
    """
    ens_frame = pd.read_csv(csv_ens)
    gpt_frame = pd.read_csv(csv_gpt)
    split_mask = ens_frame.apply(is_3_2_split, axis=1)
    ens_frame = ens_frame[split_mask].set_index("q_number")
    gpt_frame = gpt_frame.set_index("q_number")
    shared = ens_frame.index.intersection(gpt_frame.index)

    tally = {"ens": 0, "gpt": 0, "ens_only": 0, "gpt_only": 0}
    for q_id in shared:
        truth = ens_frame.loc[q_id, "actual_answer"]
        ens_right = (ens_frame.loc[q_id, "ensemble_pred"] == truth)
        gpt_right = (gpt_frame.loc[q_id, "4o_pred"] == truth)
        if ens_right:
            tally["ens"] += 1
        if gpt_right:
            tally["gpt"] += 1
        if ens_right and not gpt_right:
            tally["ens_only"] += 1
        if gpt_right and not ens_right:
            tally["gpt_only"] += 1

    summary = {
        "total_questions": len(shared),
        "correct_ensemble": tally["ens"],
        "correct_gpt": tally["gpt"],
        "correct_ensemble_not_gpt": tally["ens_only"],
        "correct_gpt_not_ensemble": tally["gpt_only"]
    }
    with open(out_json, "w") as handle:
        json.dump(summary, handle, indent=2)
    print(f"Saved {out_json}")

# Compare the re-run ensemble with GPT-4o for the train and validation sets; each call
# writes one JSON summary.
compare_ensemble_and_gpt("ensemble3_1_train.csv", "ensemble3_gpt_train.csv", "ensemble3_1_vs_gpt_train.json")
compare_ensemble_and_gpt("ensemble3_1_valid.csv", "ensemble3_gpt_valid.csv", "ensemble3_1_vs_gpt_valid.json")

Saved ensemble3_1_vs_gpt_train.json
Saved ensemble3_1_vs_gpt_valid.json
