In [None]:
from openai import OpenAI
import os
import pandas as pd
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive so the model-output CSVs under MyDrive are reachable.
# force_remount=True re-mounts even when a previous cell already mounted it.
drive.mount('/content/drive', force_remount=True)

# Folder holding one CSV per strong-vs-weak model pairing.
dataset_folder = "/content/drive/MyDrive/mmml_project/outputs"

# Pairing name -> CSV path; each file is named after its pairing.
csv_files = {
    pairing: os.path.join(dataset_folder, f"{pairing}.csv")
    for pairing in ("gpt_vs_smolvlm", "llava_vs_blip")
}

# Prefer the key from the environment so the secret never has to be pasted
# into the notebook; the literal placeholder is kept as a fallback so the
# cell behaves exactly as before when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "OPENAI_KEY"))

# Prompt template sent to the judge model for every graded response.
# It is filled with str.format() using the placeholders {question}, {answer},
# {full_answer} and {response}, and instructs the judge to reply with a single
# digit: 1 for a correct response, 0 for an incorrect or misleading one.
judge_prompt = """You are an AI evaluator tasked with grading model responses. Your job is to assess if the given model response correctly answers the question, using both the answer and full answer for reference.

Question: {question}

Answer: {answer}

Full Answer: {full_answer}

Model Response: {response}

Instructions:
- If the model response correctly aligns with the answer/full answer, return **1**.
- If the model response is incorrect or misleading, return **0**.
- **Only output a single digit (0 or 1), no explanations.**
"""

def _parse_judge_score(reply):
    """Map the judge's raw reply to 0 or 1, or -1 when it is not a clean verdict.

    The prompt shows the digits in Markdown bold (``**1**``), so the judge may
    echo that decoration; surrounding whitespace, asterisks, backticks, quotes
    and periods are stripped before the comparison.
    """
    if reply is None:
        return -1
    verdict = reply.strip(" \t\r\n*`'\".")
    return int(verdict) if verdict in ("0", "1") else -1


def _judge_response(prompt, row_id, label, judge_model):
    """Send one filled-in prompt to the judge and return its score (0, 1 or -1).

    Any failure (API error or unparseable reply) is reported on stdout and
    recorded as -1 so that a single bad row does not abort the whole run.
    """
    try:
        completion = client.chat.completions.create(
            model=judge_model,
            messages=[{"role": "user", "content": prompt}],
        )
        reply = completion.choices[0].message.content
    except Exception as e:  # best-effort: keep grading the remaining rows
        print(f"Error processing {label} model row: {row_id} - {e}")
        return -1

    score = _parse_judge_score(reply)
    if score == -1:
        print(
            f"Error processing {label} model row: {row_id} - "
            f"unparseable judge reply: {reply!r}"
        )
    return score


def evaluate_responses(csv_path, strong_model, weak_model, *, judge_model="gpt-4o"):
    """Grade the strong and weak model responses in a CSV with an LLM judge.

    For every row the module-level ``judge_prompt`` is filled from the
    ``question``, ``answer`` and ``full_answer`` columns together with, in
    turn, ``strong_model_response`` and ``weak_model_response``, and sent to
    the judge through the module-level ``client``. The results are stored in
    two new columns, ``{strong_model}_score`` and ``{weak_model}_score``, and
    the frame is saved beside the input with ``_test_scored`` inserted before
    the file extension.

    Args:
        csv_path: Path of the CSV file to grade.
        strong_model: Name used for the strong model's score column.
        weak_model: Name used for the weak model's score column.
        judge_model: Chat model used as the judge.

    Returns:
        None. The scored CSV is written to disk and its path is printed.

    Raises:
        KeyError: If one of the columns used to fill the prompt is missing.
    """
    df = pd.read_csv(csv_path)

    strong_scores = []
    weak_scores = []
    # (label used in messages, column holding the response, destination list)
    targets = (
        ("strong", "strong_model_response", strong_scores),
        ("weak", "weak_model_response", weak_scores),
    )

    progress = tqdm(
        df.iterrows(),
        total=len(df),
        desc=f"Evaluating {strong_model} & {weak_model} responses",
    )
    for idx, row in progress:
        # Fall back to the row index so a missing imageId column cannot raise
        # while an error is being reported.
        row_id = row.get("imageId", idx)
        for label, column, scores in targets:
            prompt = judge_prompt.format(
                question=row["question"],
                answer=row["answer"],
                full_answer=row["full_answer"],
                response=row[column],
            )
            scores.append(_judge_response(prompt, row_id, label, judge_model))

    df[f"{strong_model}_score"] = strong_scores
    df[f"{weak_model}_score"] = weak_scores

    # Only touch the final extension: str.replace would rewrite every ".csv"
    # in the path and, when there is none, would point back at the input file.
    root, ext = os.path.splitext(csv_path)
    output_csv = f"{root}_test_scored{ext}"
    df.to_csv(output_csv, index=False)
    print(f"✅ Test Scored CSV saved: {output_csv}")

# Grade the GPT vs. SmolVLM pairing; the scored CSV is saved beside the input file.
evaluate_responses(csv_files["gpt_vs_smolvlm"], "gpt", "smolvlm")
# Same grading for the LLaVA vs. BLIP pairing; disabled for this run.
#evaluate_responses(csv_files["llava_vs_blip"], "llava", "blip")


Mounted at /content/drive


Evaluating gpt & smolvlm responses: 100%|██████████| 7371/7371 [1:46:14<00:00,  1.16it/s]


✅ Test Scored CSV saved: /content/drive/MyDrive/mmml_project/outputs/gpt_vs_smolvlm_test_scored.csv
