In [None]:
import os
import json
from dotenv import load_dotenv
from datasets import load_dataset, concatenate_datasets
import string
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import UserMessage
import pandas as pd

def _require_env(name):
    """Return the value of environment variable `name`, failing fast if unusable.

    `os.getenv` yields None for an unset variable; passing that straight into
    the Azure client constructors produces an error that does not say which
    setting is missing. Raising here names the offending variable instead.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is missing or empty; "
            "define it in env.env or in the process environment."
        )
    return value


# Read endpoint/key settings from the local env.env file before they are looked up.
load_dotenv("env.env")

# One chat-completions client per hosted model; each has its own endpoint and key.
client_phi = ChatCompletionsClient(
    endpoint=_require_env("PHI_AZURE_ENDPOINT"),
    credential=AzureKeyCredential(_require_env("PHI_AZURE_KEY"))
)

client_mistral = ChatCompletionsClient(
    endpoint=_require_env("MISTRAL_ENDPOINT"),
    credential=AzureKeyCredential(_require_env("MISTRAL_KEY"))
)

client_llama = ChatCompletionsClient(
    endpoint=_require_env("LLAMA_3_8B_ENDPOINT"),
    credential=AzureKeyCredential(_require_env("LLAMA_3_8B_KEY"))
)

def get_phi_completion(prompt, temperature=0.1):
    """Ask the Phi deployment to answer `prompt` and return its reply text.

    The prompt is sent as a single user message through `client_phi`,
    requesting model "phi" at the given temperature. Only the first choice
    is used, and surrounding whitespace is stripped from its content.
    """
    conversation = [UserMessage(content=prompt)]
    result = client_phi.complete(
        messages=conversation,
        model="phi",
        temperature=temperature,
    )
    first_choice = result.choices[0]
    return first_choice.message.content.strip()

def get_mistral_completion(prompt, temperature=0.1):
    """Ask the Mistral deployment to answer `prompt` and return its reply text.

    The prompt is sent as a single user message through `client_mistral`,
    requesting model "mistral-nemo" at the given temperature. Only the first
    choice is used, and surrounding whitespace is stripped from its content.
    """
    conversation = [UserMessage(content=prompt)]
    result = client_mistral.complete(
        messages=conversation,
        model="mistral-nemo",
        temperature=temperature,
    )
    first_choice = result.choices[0]
    return first_choice.message.content.strip()

def get_llama_completion(prompt, temperature=0.1):
    """Ask the Llama deployment to answer `prompt` and return its reply text.

    The prompt is sent as a single user message through `client_llama`,
    requesting model "llama-8b" at the given temperature. Only the first
    choice is used, and surrounding whitespace is stripped from its content.
    """
    conversation = [UserMessage(content=prompt)]
    result = client_llama.complete(
        messages=conversation,
        model="llama-8b",
        temperature=temperature,
    )
    first_choice = result.choices[0]
    return first_choice.message.content.strip()

# Alternative data selection, currently disabled: load both BoolQ splits and
# concatenate them so a single run covers train + validation together.
# ds = load_dataset("boolq", split={"train": "train", "validation": "validation"})
# combined_dataset = concatenate_datasets([ds["train"], ds["validation"]])
# dataset = combined_dataset

# Active selection: this cell evaluates only the "train" split of BoolQ.
dataset = load_dataset("boolq", split="train")

def parse_answer(answer_text):
    """Extract a yes/no verdict from a model reply.

    Lines are scanned from last to first, so a final verdict line wins over
    any earlier text. A line counts as a verdict only if, once surrounding
    whitespace and punctuation are removed, it is exactly "yes" or "no"
    (case-insensitive).

    Args:
        answer_text: Raw reply text; None or an empty string is tolerated.

    Returns:
        True for "yes", False for "no", or None when no line is a bare verdict.
    """
    if not answer_text:
        return None

    verdicts = {"yes": True, "no": False}
    # Whitespace and punctuation are trimmed together in the second strip so
    # mixed wrappers such as "Yes ." or "( no )" reduce to the bare word;
    # trimming punctuation alone would leave the inner spaces behind.
    trim_chars = string.punctuation + string.whitespace
    for line in reversed(answer_text.strip().splitlines()):
        cleaned = line.strip().strip(trim_chars).lower()
        if cleaned in verdicts:
            return verdicts[cleaned]
    return None

def get_ensemble_prediction(passage, question, true_answer):
    """Query all three models on one example and combine their yes/no votes.

    Each model receives the same prompt. A model contributes a vote only if
    its call succeeds and `parse_answer` finds a verdict in its reply.

    Args:
        passage: Context text inserted into the prompt.
        question: Question text inserted into the prompt.
        true_answer: Gold label the ensemble vote is compared against.

    Returns:
        A tuple `(vote, is_correct, model_preds)`:
        - `vote` is True/False, or None when no model produced a usable verdict.
        - `is_correct` is `vote == true_answer` (False when `vote` is None).
        - `model_preds` maps "<name>_raw" to the reply text and "<name>_pred"
          to the parsed verdict (both None if the call failed). For a failed
          call, "<name>_error" additionally holds `repr()` of the exception.
    """
    import warnings

    prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer with yes or no."
    model_funcs = [
        ("phi", get_phi_completion),
        ("mistral", get_mistral_completion),
        ("llama", get_llama_completion)
    ]
    preds = []
    model_preds = {}
    for name, func in model_funcs:
        try:
            answer = func(prompt)
            parsed = parse_answer(answer)
        except Exception as e:
            # Best-effort: one failing model must not abort the ensemble, but
            # the failure is recorded and surfaced instead of vanishing.
            model_preds[f"{name}_raw"] = None
            model_preds[f"{name}_pred"] = None
            model_preds[f"{name}_error"] = repr(e)
            warnings.warn(f"{name} completion failed: {e!r}", RuntimeWarning)
            continue
        model_preds[f"{name}_raw"] = answer
        model_preds[f"{name}_pred"] = parsed
        if parsed is not None:
            preds.append(parsed)
    if not preds:
        return None, False, model_preds
    # Strict majority of the usable votes; a 1-1 split between two usable
    # votes therefore resolves to False.
    vote = sum(preds) > len(preds) // 2
    return vote, vote == true_answer, model_preds

# Output locations are named once so the "Saved ..." messages cannot drift
# away from the files that are actually written.
RESULTS_CSV_PATH = "ensemble1_train.csv"
SUMMARY_JSON_PATH = "ensemble1_train.json"

flagged_indices = set()  # dataset positions where no model gave a usable yes/no
results = []
correct = 0
total = 0

print("Evaluating Ensemble...")

for i, example in enumerate(dataset):
    # No-op while flagged_indices starts empty; it lets the set be pre-seeded
    # with positions to skip.
    if i in flagged_indices:
        continue
    passage = example["passage"]
    question = example["question"]
    true_answer = example["answer"]
    pred, is_correct, model_preds = get_ensemble_prediction(passage, question, true_answer)
    if pred is None:
        # Flagged examples are excluded from both the results and the accuracy.
        flagged_indices.add(i)
        continue
    entry = {
        "q_number": i + 1,  # 1-based position in the dataset
        "passage": passage,
        "question": question,
        "actual_answer": true_answer,
        "ensemble_pred": pred,
        "phi_pred": model_preds["phi_pred"],
        "mistral_pred": model_preds["mistral_pred"],
        "llama_pred": model_preds["llama_pred"]
    }
    results.append(entry)
    correct += int(is_correct)
    total += 1

# Accuracy is computed over evaluated (non-flagged) examples only.
acc_ensemble = correct / total if total > 0 else 0
print("Evaluated Ensemble")

df_ensemble = pd.DataFrame(results)
df_ensemble.to_csv(RESULTS_CSV_PATH, index=False)
print(f"Saved {RESULTS_CSV_PATH}")
print(f"Ensemble Accuracy: {acc_ensemble:.2f}")

final_output = {
    "ensemble_accuracy": acc_ensemble,
    "correct_count": correct,
    # Denominator of the accuracy above, so it can be recomputed from the file.
    "evaluated_count": total,
    # Sorted so the file is stable across runs instead of following set order.
    "flagged_indices": sorted(flagged_indices)
}

with open(SUMMARY_JSON_PATH, "w") as f:
    json.dump(final_output, f, indent=2)
print(f"Saved {SUMMARY_JSON_PATH}")

  from .autonotebook import tqdm as notebook_tqdm


Evaluating Ensemble...
Evaluated Ensemble
Saved ensemble_train_results.csv
Ensemble Accuracy: 0.87
Saved ensemble_train_output.json
Evaluated Ensemble
Saved ensemble_train_results.csv
Ensemble Accuracy: 0.87
Saved ensemble_train_output.json


In [1]:
import os
import json
from dotenv import load_dotenv
from datasets import load_dataset, concatenate_datasets
import string
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import UserMessage
import pandas as pd

def _require_env(name):
    """Return the value of environment variable `name`, failing fast if unusable.

    `os.getenv` yields None for an unset variable; passing that straight into
    the Azure client constructors produces an error that does not say which
    setting is missing. Raising here names the offending variable instead.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is missing or empty; "
            "define it in env.env or in the process environment."
        )
    return value


# Read endpoint/key settings from the local env.env file before they are looked up.
load_dotenv("env.env")

# One chat-completions client per hosted model; each has its own endpoint and key.
client_phi = ChatCompletionsClient(
    endpoint=_require_env("PHI_AZURE_ENDPOINT"),
    credential=AzureKeyCredential(_require_env("PHI_AZURE_KEY"))
)

client_mistral = ChatCompletionsClient(
    endpoint=_require_env("MISTRAL_ENDPOINT"),
    credential=AzureKeyCredential(_require_env("MISTRAL_KEY"))
)

client_llama = ChatCompletionsClient(
    endpoint=_require_env("LLAMA_3_8B_ENDPOINT"),
    credential=AzureKeyCredential(_require_env("LLAMA_3_8B_KEY"))
)

def get_phi_completion(prompt, temperature=0.1):
    """Return the Phi deployment's stripped reply to a single-message prompt.

    Wraps `prompt` in one user message, calls `client_phi` for model "phi"
    with the supplied temperature, and returns the content of the first
    choice with leading/trailing whitespace removed.
    """
    user_turn = [UserMessage(content=prompt)]
    completion = client_phi.complete(
        messages=user_turn,
        model="phi",
        temperature=temperature,
    )
    reply_text = completion.choices[0].message.content
    return reply_text.strip()

def get_mistral_completion(prompt, temperature=0.1):
    """Return the Mistral deployment's stripped reply to a single-message prompt.

    Wraps `prompt` in one user message, calls `client_mistral` for model
    "mistral-nemo" with the supplied temperature, and returns the content of
    the first choice with leading/trailing whitespace removed.
    """
    user_turn = [UserMessage(content=prompt)]
    completion = client_mistral.complete(
        messages=user_turn,
        model="mistral-nemo",
        temperature=temperature,
    )
    reply_text = completion.choices[0].message.content
    return reply_text.strip()

def get_llama_completion(prompt, temperature=0.1):
    """Return the Llama deployment's stripped reply to a single-message prompt.

    Wraps `prompt` in one user message, calls `client_llama` for model
    "llama-8b" with the supplied temperature, and returns the content of the
    first choice with leading/trailing whitespace removed.
    """
    user_turn = [UserMessage(content=prompt)]
    completion = client_llama.complete(
        messages=user_turn,
        model="llama-8b",
        temperature=temperature,
    )
    reply_text = completion.choices[0].message.content
    return reply_text.strip()

# Alternative data selection, currently disabled: load both BoolQ splits and
# concatenate them so a single run covers train + validation together.
# ds = load_dataset("boolq", split={"train": "train", "validation": "validation"})
# combined_dataset = concatenate_datasets([ds["train"], ds["validation"]])
# dataset = combined_dataset

# Active selection: this cell evaluates only the "validation" split of BoolQ.
dataset = load_dataset("boolq", split="validation")

def parse_answer(answer_text):
    """Extract a yes/no verdict from a model reply.

    Lines are scanned from last to first, so a final verdict line wins over
    any earlier text. A line counts as a verdict only if, once surrounding
    whitespace and punctuation are removed, it is exactly "yes" or "no"
    (case-insensitive).

    Args:
        answer_text: Raw reply text; None or an empty string is tolerated.

    Returns:
        True for "yes", False for "no", or None when no line is a bare verdict.
    """
    if not answer_text:
        return None

    verdicts = {"yes": True, "no": False}
    # Whitespace and punctuation are trimmed together in the second strip so
    # mixed wrappers such as "Yes ." or "( no )" reduce to the bare word;
    # trimming punctuation alone would leave the inner spaces behind.
    trim_chars = string.punctuation + string.whitespace
    for line in reversed(answer_text.strip().splitlines()):
        cleaned = line.strip().strip(trim_chars).lower()
        if cleaned in verdicts:
            return verdicts[cleaned]
    return None

def get_ensemble_prediction(passage, question, true_answer):
    """Query all three models on one example and combine their yes/no votes.

    Each model receives the same prompt. A model contributes a vote only if
    its call succeeds and `parse_answer` finds a verdict in its reply.

    Args:
        passage: Context text inserted into the prompt.
        question: Question text inserted into the prompt.
        true_answer: Gold label the ensemble vote is compared against.

    Returns:
        A tuple `(vote, is_correct, model_preds)`:
        - `vote` is True/False, or None when no model produced a usable verdict.
        - `is_correct` is `vote == true_answer` (False when `vote` is None).
        - `model_preds` maps "<name>_raw" to the reply text and "<name>_pred"
          to the parsed verdict (both None if the call failed). For a failed
          call, "<name>_error" additionally holds `repr()` of the exception.
    """
    import warnings

    prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer with yes or no."
    model_funcs = [
        ("phi", get_phi_completion),
        ("mistral", get_mistral_completion),
        ("llama", get_llama_completion)
    ]
    preds = []
    model_preds = {}
    for name, func in model_funcs:
        try:
            answer = func(prompt)
            parsed = parse_answer(answer)
        except Exception as e:
            # Best-effort: one failing model must not abort the ensemble, but
            # the failure is recorded and surfaced instead of vanishing.
            model_preds[f"{name}_raw"] = None
            model_preds[f"{name}_pred"] = None
            model_preds[f"{name}_error"] = repr(e)
            warnings.warn(f"{name} completion failed: {e!r}", RuntimeWarning)
            continue
        model_preds[f"{name}_raw"] = answer
        model_preds[f"{name}_pred"] = parsed
        if parsed is not None:
            preds.append(parsed)
    if not preds:
        return None, False, model_preds
    # Strict majority of the usable votes; a 1-1 split between two usable
    # votes therefore resolves to False.
    vote = sum(preds) > len(preds) // 2
    return vote, vote == true_answer, model_preds

# Output locations are named once so the "Saved ..." messages cannot drift
# away from the files that are actually written.
RESULTS_CSV_PATH = "ensemble1_valid.csv"
SUMMARY_JSON_PATH = "ensemble1_valid.json"

flagged_indices = set()  # dataset positions where no model gave a usable yes/no
results = []
correct = 0
total = 0

print("Evaluating Ensemble...")

for i, example in enumerate(dataset):
    # No-op while flagged_indices starts empty; it lets the set be pre-seeded
    # with positions to skip.
    if i in flagged_indices:
        continue
    passage = example["passage"]
    question = example["question"]
    true_answer = example["answer"]
    pred, is_correct, model_preds = get_ensemble_prediction(passage, question, true_answer)
    if pred is None:
        # Flagged examples are excluded from both the results and the accuracy.
        flagged_indices.add(i)
        continue
    entry = {
        "q_number": i + 1,  # 1-based position in the dataset
        "passage": passage,
        "question": question,
        "actual_answer": true_answer,
        "ensemble_pred": pred,
        "phi_pred": model_preds["phi_pred"],
        "mistral_pred": model_preds["mistral_pred"],
        "llama_pred": model_preds["llama_pred"]
    }
    results.append(entry)
    correct += int(is_correct)
    total += 1

# Accuracy is computed over evaluated (non-flagged) examples only.
acc_ensemble = correct / total if total > 0 else 0
print("Evaluated Ensemble")

df_ensemble = pd.DataFrame(results)
df_ensemble.to_csv(RESULTS_CSV_PATH, index=False)
print(f"Saved {RESULTS_CSV_PATH}")
print(f"Ensemble Accuracy: {acc_ensemble:.2f}")

final_output = {
    "ensemble_accuracy": acc_ensemble,
    "correct_count": correct,
    # Denominator of the accuracy above, so it can be recomputed from the file.
    "evaluated_count": total,
    # Sorted so the file is stable across runs instead of following set order.
    "flagged_indices": sorted(flagged_indices)
}

with open(SUMMARY_JSON_PATH, "w") as f:
    json.dump(final_output, f, indent=2)
print(f"Saved {SUMMARY_JSON_PATH}")

  from .autonotebook import tqdm as notebook_tqdm


Evaluating Ensemble...
Evaluated Ensemble
Saved ensemble1_valid.csv
Ensemble Accuracy: 0.86
Saved ensemble1_valid.json
Evaluated Ensemble
Saved ensemble1_valid.csv
Ensemble Accuracy: 0.86
Saved ensemble1_valid.json
