In [None]:
import os
from dotenv import load_dotenv
from datasets import load_dataset
import string
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import UserMessage
import pandas as pd
import json

# Pull endpoint/key settings from the local env file into the process environment.
load_dotenv("env.env")


def _build_client(endpoint_var, key_var):
    """Create a chat-completions client from two environment variable names."""
    return ChatCompletionsClient(
        endpoint=os.getenv(endpoint_var),
        credential=AzureKeyCredential(os.getenv(key_var)),
    )


# One client per deployed model; each reads its own endpoint/key pair.
client_jamba = _build_client("JAMBA_LARGE_ENDPOINT", "JAMBA_LARGE_KEY")
client_llama70 = _build_client("LLAMA_70B_ENDPOINT", "LLAMA_70B_KEY")
client_llama405 = _build_client("LLAMA_405B_ENDPOINT", "LLAMA_405B_KEY")

def get_jamba_completion(prompt, temperature=0.1):
    """Send `prompt` as a single user message via `client_jamba` and return the reply text.

    The request names the model "jamba-large" and forwards `temperature`.
    The content of the first returned choice is stripped of surrounding
    whitespace before being returned.
    """
    reply = client_jamba.complete(
        messages=[UserMessage(content=prompt)],
        model="jamba-large",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

def get_llama70_completion(prompt, temperature=0.1):
    """Send `prompt` as a single user message via `client_llama70` and return the reply text.

    The request names the model "llama-70b" and forwards `temperature`.
    The content of the first returned choice is stripped of surrounding
    whitespace before being returned.
    """
    reply = client_llama70.complete(
        messages=[UserMessage(content=prompt)],
        model="llama-70b",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

def get_llama405_completion(prompt, temperature=0.1):
    """Send `prompt` as a single user message via `client_llama405` and return the reply text.

    The request names the model "llama-405b" and forwards `temperature`.
    The content of the first returned choice is stripped of surrounding
    whitespace before being returned.
    """
    reply = client_llama405.complete(
        messages=[UserMessage(content=prompt)],
        model="llama-405b",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

# Load the "train" split of the "boolq" dataset. This module-level `dataset`
# is what evaluate_model iterates over; each example is read there through
# its "passage", "question" and "answer" fields.
dataset = load_dataset("boolq", split="train")

def parse_answer(answer_text):
    """Extract a yes/no verdict from a model reply.

    Parsing happens in two passes:

    1. Scan the lines from last to first for a line that is exactly
       "yes" or "no" once surrounding whitespace and punctuation are
       removed (case-insensitive), e.g. "Yes.", "**No**".
    2. If no such line exists, scan the lines from first to last and look
       at the leading word of each line (after an optional "Answer" label),
       so replies such as "Yes, the passage says so." or "Answer: no" are
       still recognised instead of being treated as unparseable.

    Args:
        answer_text: Raw reply text. May be empty or None.

    Returns:
        True for "yes", False for "no", or None when no verdict is found.
    """
    if not answer_text:
        # None or empty reply: nothing to parse.
        return None

    verdicts = {"yes": True, "no": False}
    trim_chars = string.punctuation + string.whitespace
    lines = answer_text.strip().splitlines()

    # Pass 1: a line consisting solely of the verdict (last such line wins).
    for line in reversed(lines):
        cleaned = line.strip(trim_chars).lower()
        if cleaned in verdicts:
            return verdicts[cleaned]

    # Pass 2: the verdict is the first word of a longer line.
    for line in lines:
        tokens = [word.strip(string.punctuation).lower() for word in line.split()]
        tokens = [token for token in tokens if token]
        if tokens and tokens[0] == "answer":
            # Skip a leading "Answer:" style label.
            tokens = tokens[1:]
        if tokens and tokens[0] in verdicts:
            return verdicts[tokens[0]]

    return None

def evaluate_model(get_completion_func, model_name, col_name, data=None, verbose=True):
    """Run one model over a set of yes/no examples and score its answers.

    For every example a prompt is built from its "passage" and "question",
    sent through `get_completion_func`, and the reply is parsed with
    `parse_answer`. A reply that cannot be parsed yields a prediction of
    None, which never equals the boolean label and therefore counts as wrong.
    Examples whose completion call raises are reported, skipped, and excluded
    from both the accuracy denominator and the returned rows.

    Args:
        get_completion_func: Callable taking the prompt string and returning
            the model's reply text.
        model_name: Human-readable name used in progress messages.
        col_name: Key under which the parsed prediction is stored in each
            result row.
        data: Optional iterable of examples with "passage", "question" and
            "answer" entries. Defaults to the module-level `dataset`.
        verbose: When True (default), print a per-sample report.

    Returns:
        A tuple `(accuracy, correct, results)` where `accuracy` is
        correct / evaluated (0.0 when nothing was evaluated), `correct` is
        the number of correct predictions, and `results` is a list of row
        dicts with keys "q_number", "passage", "question", "actual_answer"
        and `col_name`.
    """
    if data is None:
        # Preserve the original behaviour: fall back to the global split.
        data = dataset

    results = []
    correct = 0
    total = 0
    skipped = 0
    print(f"Evaluating {model_name}...")
    for i, example in enumerate(data):
        passage = example["passage"]
        question = example["question"]
        true_answer = example["answer"]
        prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer with yes or no."

        try:
            answer_text = get_completion_func(prompt)
        except Exception as e:
            # Best-effort run: one failed request must not abort the whole pass.
            skipped += 1
            print(f"Skipping sample {i+1} due to error: {e}")
            continue

        pred = parse_answer(answer_text)
        is_correct = (pred == true_answer)
        correct += int(is_correct)
        total += 1

        results.append({
            "q_number": i + 1,
            "passage": passage,
            "question": question,
            "actual_answer": true_answer,
            col_name: pred
        })

        if verbose:
            print(f"Sample {i+1}:\n  True Answer: {true_answer}\n  Model Answer: {pred}\n  Correct: {is_correct}\n")

    accuracy = correct / total if total > 0 else 0.0
    if skipped:
        # Make it visible that the accuracy denominator excludes these samples.
        print(f"Skipped {skipped} sample(s) for {model_name} due to errors.")
    print(f"Accuracy for {model_name}: {accuracy:.2f}\n")
    return accuracy, correct, results

# Evaluate each model on the loaded split. Every call returns
# (accuracy, correct_count, per-sample rows); samples whose request raised
# are absent from that model's rows.
acc_jamba, correct_jamba, rows_jamba = evaluate_model(get_jamba_completion, "Jamba Large", "jamba_pred")
acc_llama70, correct_llama70, rows_llama70 = evaluate_model(get_llama70_completion, "LLaMA 70B", "llama70_pred")
acc_llama405, correct_llama405, rows_llama405 = evaluate_model(get_llama405_completion, "LLaMA 405B", "llama405_pred")

df_jamba = pd.DataFrame(rows_jamba)
df_llama70 = pd.DataFrame(rows_llama70)
df_llama405 = pd.DataFrame(rows_llama405)

# Put the three prediction columns side by side. An outer join keeps a
# question even when one model's request failed (that model's prediction is
# left missing) instead of silently dropping the question for every model.
merge_keys = ["q_number", "passage", "question", "actual_answer"]
df = df_jamba.merge(df_llama70, on=merge_keys, how="outer")
df = df.merge(df_llama405, on=merge_keys, how="outer")
df = df.sort_values("q_number").reset_index(drop=True)

df.to_csv("modellarge_train.csv", index=False)

# "evaluated_count" is the denominator behind each accuracy; it can be lower
# than "total_questions" when some of a model's requests were skipped.
final_output = {
    "models": {
        "Jamba Large": {
            "accuracy": acc_jamba,
            "correct_count": correct_jamba,
            "evaluated_count": len(rows_jamba),
        },
        "LLaMA 70B": {
            "accuracy": acc_llama70,
            "correct_count": correct_llama70,
            "evaluated_count": len(rows_llama70),
        },
        "LLaMA 405B": {
            "accuracy": acc_llama405,
            "correct_count": correct_llama405,
            "evaluated_count": len(rows_llama405),
        },
    },
    "total_questions": len(df)
}
with open("modellarge_train.json", "w", encoding="utf-8") as f:
    json.dump(final_output, f, indent=2)
print("Saved results to modellarge_train.csv")
print("Saved modellarge_train.json")

Evaluating Jamba Large...
Sample 1:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 2:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 3:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 4:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 5:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 6:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 7:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 8:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 9:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 10:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 11:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 12:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 13:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 14:
  True Answer: True
  Model Answer: True
  Correct: True

Sam

In [None]:
import os
from dotenv import load_dotenv
from datasets import load_dataset
import string
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import UserMessage
import pandas as pd
import json

# Pull endpoint/key settings from the local env file into the process environment.
load_dotenv("env.env")


def _build_client(endpoint_var, key_var):
    """Create a chat-completions client from two environment variable names."""
    return ChatCompletionsClient(
        endpoint=os.getenv(endpoint_var),
        credential=AzureKeyCredential(os.getenv(key_var)),
    )


# One client per deployed model; each reads its own endpoint/key pair.
client_jamba = _build_client("JAMBA_LARGE_ENDPOINT", "JAMBA_LARGE_KEY")
client_llama70 = _build_client("LLAMA_70B_ENDPOINT", "LLAMA_70B_KEY")
client_llama405 = _build_client("LLAMA_405B_ENDPOINT", "LLAMA_405B_KEY")

def get_jamba_completion(prompt, temperature=0.1):
    """Send `prompt` as a single user message via `client_jamba` and return the reply text.

    The request names the model "jamba-large" and forwards `temperature`.
    The content of the first returned choice is stripped of surrounding
    whitespace before being returned.
    """
    reply = client_jamba.complete(
        messages=[UserMessage(content=prompt)],
        model="jamba-large",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

def get_llama70_completion(prompt, temperature=0.1):
    """Send `prompt` as a single user message via `client_llama70` and return the reply text.

    The request names the model "llama-70b" and forwards `temperature`.
    The content of the first returned choice is stripped of surrounding
    whitespace before being returned.
    """
    reply = client_llama70.complete(
        messages=[UserMessage(content=prompt)],
        model="llama-70b",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

def get_llama405_completion(prompt, temperature=0.1):
    """Send `prompt` as a single user message via `client_llama405` and return the reply text.

    The request names the model "llama-405b" and forwards `temperature`.
    The content of the first returned choice is stripped of surrounding
    whitespace before being returned.
    """
    reply = client_llama405.complete(
        messages=[UserMessage(content=prompt)],
        model="llama-405b",
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

# Load the "validation" split of the "boolq" dataset. This module-level
# `dataset` is what evaluate_model iterates over; each example is read there
# through its "passage", "question" and "answer" fields.
dataset = load_dataset("boolq", split="validation")

def parse_answer(answer_text):
    """Extract a yes/no verdict from a model reply.

    Parsing happens in two passes:

    1. Scan the lines from last to first for a line that is exactly
       "yes" or "no" once surrounding whitespace and punctuation are
       removed (case-insensitive), e.g. "Yes.", "**No**".
    2. If no such line exists, scan the lines from first to last and look
       at the leading word of each line (after an optional "Answer" label),
       so replies such as "Yes, the passage says so." or "Answer: no" are
       still recognised instead of being treated as unparseable.

    Args:
        answer_text: Raw reply text. May be empty or None.

    Returns:
        True for "yes", False for "no", or None when no verdict is found.
    """
    if not answer_text:
        # None or empty reply: nothing to parse.
        return None

    verdicts = {"yes": True, "no": False}
    trim_chars = string.punctuation + string.whitespace
    lines = answer_text.strip().splitlines()

    # Pass 1: a line consisting solely of the verdict (last such line wins).
    for line in reversed(lines):
        cleaned = line.strip(trim_chars).lower()
        if cleaned in verdicts:
            return verdicts[cleaned]

    # Pass 2: the verdict is the first word of a longer line.
    for line in lines:
        tokens = [word.strip(string.punctuation).lower() for word in line.split()]
        tokens = [token for token in tokens if token]
        if tokens and tokens[0] == "answer":
            # Skip a leading "Answer:" style label.
            tokens = tokens[1:]
        if tokens and tokens[0] in verdicts:
            return verdicts[tokens[0]]

    return None

def evaluate_model(get_completion_func, model_name, col_name, data=None, verbose=True):
    """Run one model over a set of yes/no examples and score its answers.

    For every example a prompt is built from its "passage" and "question",
    sent through `get_completion_func`, and the reply is parsed with
    `parse_answer`. A reply that cannot be parsed yields a prediction of
    None, which never equals the boolean label and therefore counts as wrong.
    Examples whose completion call raises are reported, skipped, and excluded
    from both the accuracy denominator and the returned rows.

    Args:
        get_completion_func: Callable taking the prompt string and returning
            the model's reply text.
        model_name: Human-readable name used in progress messages.
        col_name: Key under which the parsed prediction is stored in each
            result row.
        data: Optional iterable of examples with "passage", "question" and
            "answer" entries. Defaults to the module-level `dataset`.
        verbose: When True (default), print a per-sample report.

    Returns:
        A tuple `(accuracy, correct, results)` where `accuracy` is
        correct / evaluated (0.0 when nothing was evaluated), `correct` is
        the number of correct predictions, and `results` is a list of row
        dicts with keys "q_number", "passage", "question", "actual_answer"
        and `col_name`.
    """
    if data is None:
        # Preserve the original behaviour: fall back to the global split.
        data = dataset

    results = []
    correct = 0
    total = 0
    skipped = 0
    print(f"Evaluating {model_name}...")
    for i, example in enumerate(data):
        passage = example["passage"]
        question = example["question"]
        true_answer = example["answer"]
        prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer with yes or no."

        try:
            answer_text = get_completion_func(prompt)
        except Exception as e:
            # Best-effort run: one failed request must not abort the whole pass.
            skipped += 1
            print(f"Skipping sample {i+1} due to error: {e}")
            continue

        pred = parse_answer(answer_text)
        is_correct = (pred == true_answer)
        correct += int(is_correct)
        total += 1

        results.append({
            "q_number": i + 1,
            "passage": passage,
            "question": question,
            "actual_answer": true_answer,
            col_name: pred
        })

        if verbose:
            print(f"Sample {i+1}:\n  True Answer: {true_answer}\n  Model Answer: {pred}\n  Correct: {is_correct}\n")

    accuracy = correct / total if total > 0 else 0.0
    if skipped:
        # Make it visible that the accuracy denominator excludes these samples.
        print(f"Skipped {skipped} sample(s) for {model_name} due to errors.")
    print(f"Accuracy for {model_name}: {accuracy:.2f}\n")
    return accuracy, correct, results

# Evaluate each model on the loaded split. Every call returns
# (accuracy, correct_count, per-sample rows); samples whose request raised
# are absent from that model's rows.
acc_jamba, correct_jamba, rows_jamba = evaluate_model(get_jamba_completion, "Jamba Large", "jamba_pred")
acc_llama70, correct_llama70, rows_llama70 = evaluate_model(get_llama70_completion, "LLaMA 70B", "llama70_pred")
acc_llama405, correct_llama405, rows_llama405 = evaluate_model(get_llama405_completion, "LLaMA 405B", "llama405_pred")

df_jamba = pd.DataFrame(rows_jamba)
df_llama70 = pd.DataFrame(rows_llama70)
df_llama405 = pd.DataFrame(rows_llama405)

# Put the three prediction columns side by side. An outer join keeps a
# question even when one model's request failed (that model's prediction is
# left missing) instead of silently dropping the question for every model.
merge_keys = ["q_number", "passage", "question", "actual_answer"]
df = df_jamba.merge(df_llama70, on=merge_keys, how="outer")
df = df.merge(df_llama405, on=merge_keys, how="outer")
df = df.sort_values("q_number").reset_index(drop=True)

df.to_csv("modellarge_valid.csv", index=False)

# "evaluated_count" is the denominator behind each accuracy; it can be lower
# than "total_questions" when some of a model's requests were skipped.
final_output = {
    "models": {
        "Jamba Large": {
            "accuracy": acc_jamba,
            "correct_count": correct_jamba,
            "evaluated_count": len(rows_jamba),
        },
        "LLaMA 70B": {
            "accuracy": acc_llama70,
            "correct_count": correct_llama70,
            "evaluated_count": len(rows_llama70),
        },
        "LLaMA 405B": {
            "accuracy": acc_llama405,
            "correct_count": correct_llama405,
            "evaluated_count": len(rows_llama405),
        },
    },
    "total_questions": len(df)
}
with open("modellarge_valid.json", "w", encoding="utf-8") as f:
    json.dump(final_output, f, indent=2)
print("Saved results to modellarge_valid.csv")
print("Saved modellarge_valid.json")

Evaluating Jamba Large...
Sample 1:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 2:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 3:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 4:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 5:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 6:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 7:
  True Answer: True
  Model Answer: False
  Correct: False

Sample 8:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 9:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 10:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 11:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 12:
  True Answer: False
  Model Answer: False
  Correct: True

Sample 13:
  True Answer: True
  Model Answer: False
  Correct: False

Sample 14:
  True Answer: True
  Model Answer: True
  Correct: True

Sample 