In [None]:
import json
import pandas as pd
import os
from google.colab import drive

# Mount Google Drive so the project's output folder is reachable from Colab.
# force_remount=True re-establishes the mount even if one already exists.
drive.mount('/content/drive', force_remount=True)

# Folder that holds the per-model JSON outputs and receives every CSV
# written by this notebook.
dataset_folder = "/content/drive/MyDrive/mmml_project/outputs"

# Map each model key to the JSON file with that model's responses.
# Insertion order (gpt, smolvlm, blip, llava) is the order in which the
# files are processed downstream.
file_paths = {
    model_key: os.path.join(dataset_folder, json_name)
    for model_key, json_name in (
        ("gpt", "mini_gqa_with_gpt.json"),
        ("smolvlm", "mini_gqa_with_smolvlm.json"),
        ("blip", "mini_gqa_with_blip.json"),
        ("llava", "mini_gqa_with_llava.json"),
    )
}

# Convert every model's JSON output into a flat CSV named "<model>.csv" in
# the same dataset folder, reporting each file as it is written.
for model, path in file_paths.items():
    # Decode explicitly as UTF-8 instead of relying on the runtime's locale
    # default: JSON is UTF-8 by specification, and model responses may
    # contain non-ASCII characters that a non-UTF-8 locale would corrupt
    # or reject.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # NOTE(review): assumes the JSON payload is record-shaped (e.g. a list
    # of dicts) so that it maps cleanly onto DataFrame rows -- confirm.
    df = pd.DataFrame(data)
    csv_path = os.path.join(dataset_folder, f"{model}.csv")
    # index=False keeps the synthetic row index out of the CSV.
    df.to_csv(csv_path, index=False)
    print(f"{model.upper()} CSV saved: {csv_path}")

GPT CSV saved: /content/drive/MyDrive/mmml_project/outputs/gpt.csv
SMOLVLM CSV saved: /content/drive/MyDrive/mmml_project/outputs/smolvlm.csv
BLIP CSV saved: /content/drive/MyDrive/mmml_project/outputs/blip.csv
LLAVA CSV saved: /content/drive/MyDrive/mmml_project/outputs/llava.csv


In [None]:
# Paths of the per-model CSV exports inside the shared dataset folder.
gpt_csv, smolvlm_csv, blip_csv, llava_csv = (
    os.path.join(dataset_folder, csv_name)
    for csv_name in ("gpt.csv", "smolvlm.csv", "blip.csv", "llava.csv")
)

# Read each export and give its model-specific response column a role-based
# name, so that each strong/weak pair can later be joined into one frame
# with a uniform schema:
#   GPT   -> strong,  SmolVLM -> weak
#   LLaVA -> strong,  BLIP    -> weak
# rename() returns a new frame, so each name is bound directly to the
# already-renamed result. Keys missing from a frame are ignored by rename()
# by default, so a misspelled source column would only surface downstream.
df_gpt = pd.read_csv(gpt_csv).rename(
    columns={"gpt_response": "strong_model_response"}
)
df_smolvlm = pd.read_csv(smolvlm_csv).rename(
    columns={"smolvlm_response": "weak_model_response"}
)
df_blip = pd.read_csv(blip_csv).rename(
    columns={"blip_response": "weak_model_response"}
)
df_llava = pd.read_csv(llava_csv).rename(
    columns={"llava_response": "strong_model_response"}
)

def _pair_model_responses(strong_df: pd.DataFrame, weak_df: pd.DataFrame) -> pd.DataFrame:
    """Pair strong- and weak-model responses for the same (imageId, question).

    Parameters
    ----------
    strong_df : DataFrame with ``imageId``, ``image_file``, ``question``,
        ``answer``, ``fullAnswer`` and ``strong_model_response`` columns.
    weak_df : DataFrame with ``imageId``, ``question`` and
        ``weak_model_response`` columns (any other columns are ignored).

    Returns
    -------
    DataFrame with columns ``imageId``, ``image_file``, ``question``,
    ``answer``, ``full_answer``, ``strong_model_response``,
    ``weak_model_response``. Only keys present in both inputs are kept
    (inner join); question metadata is taken from ``strong_df``.

    Raises
    ------
    KeyError
        If a required column is missing from either input.
    """
    join_keys = ["imageId", "question"]

    # Bring over only the weak model's response. Metadata columns shared by
    # both frames would otherwise collide in the merge and come back with
    # pandas' implicit "_x"/"_y" suffixes; dropping the weak-side duplicates
    # up front keeps the strong-side names intact and removes the dependency
    # on those suffixes.
    weak_responses = weak_df[join_keys + ["weak_model_response"]]

    # NOTE(review): if (imageId, question) is not unique in either frame the
    # inner join multiplies rows -- consider validate="one_to_one" once the
    # inputs are confirmed to be duplicate-free.
    paired = pd.merge(strong_df, weak_responses, on=join_keys, how="inner")

    output_columns = [
        "imageId",
        "image_file",
        "question",
        "answer",
        "fullAnswer",
        "strong_model_response",
        "weak_model_response",
    ]
    # rename() returns a new frame; snake_case the one camelCase column.
    return paired[output_columns].rename(columns={"fullAnswer": "full_answer"})


# Build one comparison table per strong/weak model pairing.
gpt_vs_smolvlm = _pair_model_responses(df_gpt, df_smolvlm)
llava_vs_blip = _pair_model_responses(df_llava, df_blip)

# Destination files for the two strong-vs-weak comparison tables.
gpt_vs_smolvlm_csv = os.path.join(dataset_folder, "gpt_vs_smolvlm.csv")
llava_vs_blip_csv = os.path.join(dataset_folder, "llava_vs_blip.csv")

# Write both tables first (without the row index), then report both
# locations.
for paired_frame, destination in (
    (gpt_vs_smolvlm, gpt_vs_smolvlm_csv),
    (llava_vs_blip, llava_vs_blip_csv),
):
    paired_frame.to_csv(destination, index=False)

print(f"Final GPT vs SmolVLM CSV: {gpt_vs_smolvlm_csv}")
print(f"Final LLaVA vs BLIP CSV: {llava_vs_blip_csv}")

Final GPT vs SmolVLM CSV: /content/drive/MyDrive/mmml_project/outputs/gpt_vs_smolvlm.csv
Final LLaVA vs BLIP CSV: /content/drive/MyDrive/mmml_project/outputs/llava_vs_blip.csv
