# Export

In [53]:
import pandas as pd
import json

# Each target word together with the batch-output JSONL file that holds the
# model responses for it. The two public lists below are derived from these
# pairs, so WORDs[i] always corresponds to GPT4Outputs[i].
_WORD_OUTPUT_PAIRS = [
    (
        "Acknowledgement",
        "./jsonl/Acknowledgement-2-batch_1cxWUaz0uTyHNceX7GqfMK4E_output.jsonl",
    ),
    (
        "Subdermatoglyphic",
        "./jsonl/Subdermatoglyphic-2-batch_3xYMijVWunrFefv5fq03JCqO_output.jsonl",
    ),
    (
        "tHyUiKaRbNqWeOpXcZvM",
        "./jsonl/tHyUiKaRbNqWeOpXcZvM-2-batch_00ESlnxS8xOOn5XUVXwSnAb2_output.jsonl",
    ),
]
WORDs = [word for word, _output_path in _WORD_OUTPUT_PAIRS]
GPT4Outputs = [output_path for _word, output_path in _WORD_OUTPUT_PAIRS]

# Per-word merged frames (concatenated once the loop is done) and, for each
# word, the expected image file names that ended up without a usable prediction.
all_data_frames = []
missing_files = {}

# Image whose raw model answer is echoed while parsing (debugging aid).
DEBUG_IMAGE_NAME = "text_image_0ca628c7-fc94-4e89-8fcc-ac6505ef4a90.png"


def _normalize_prediction(raw_answer):
    """Reduce a raw model answer to a lower-case token for comparison.

    When the answer contains a double quote, the text after the first quote
    (up to the next quote, if any) is used, e.g.
    'The character ... is "t".' becomes 't'. Otherwise the whole answer is
    used. The result is lower-cased, has every "." removed and is stripped of
    surrounding whitespace.
    """
    quoted = raw_answer.split('"')[1] if '"' in raw_answer else raw_answer
    return quoted.lower().replace(".", "").strip()


# The two lists are paired by position, so a length mismatch is a setup error.
if len(WORDs) != len(GPT4Outputs):
    raise ValueError(
        f"WORDs has {len(WORDs)} entries but GPT4Outputs has {len(GPT4Outputs)}"
    )

for WORD, file_path in zip(WORDs, GPT4Outputs):
    gt_data = pd.read_json(f"./images/{WORD}/configurations.json")
    gt_data["fnames"] = gt_data["image_path"].apply(lambda x: x.split("/")[-1])
    expected_files = set(gt_data["fnames"])

    # Raw model answers keyed by custom_id with everything up to and including
    # "uid__" removed; these keys are used as the image_path join key below.
    custom_id_to_content_original = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank / trailing lines in the JSONL file
            json_obj = json.loads(line)
            custom_id = json_obj.get("custom_id")
            message_content = json_obj["response"]["body"]["choices"][0]["message"][
                "content"
            ]
            if not custom_id:
                # Without an id the answer cannot be matched to an image; this
                # also keeps the substring check below from running on None.
                continue
            custom_id_to_content_original[custom_id.split("uid__")[-1]] = (
                message_content
            )
            if DEBUG_IMAGE_NAME in custom_id:
                print(f"baditem: {custom_id}, {message_content}")

    # Keep only answers that normalize to exactly one character. Anything else
    # is dropped here, so its image is reported in missing_files below.
    custom_id_to_content = {
        key: prediction
        for key, prediction in (
            (key, _normalize_prediction(value))
            for key, value in custom_id_to_content_original.items()
        )
        if len(prediction) == 1
    }

    # Convert to DataFrame and merge with the ground-truth configuration rows.
    custom_id_df = pd.DataFrame(
        list(custom_id_to_content.items()), columns=["image_path", "predicted"]
    )
    combined_data = pd.merge(gt_data, custom_id_df, on="image_path", how="inner")

    # Ground truth: the character of `word` at `circle_index`, lower-cased.
    # Built as a plain list so it also works when the merge produced no rows.
    combined_data["gt"] = [
        word[circle_index].lower()
        for word, circle_index in zip(
            combined_data["word"], combined_data["circle_index"]
        )
    ]
    combined_data["word_label"] = WORD  # Add a column to identify the word

    # Unprocessed model answer for each matched image.
    combined_data["model-output-raw"] = combined_data["image_path"].map(
        custom_id_to_content_original
    )

    # "fname" duplicates "fnames"; it is kept so the exported columns stay the
    # same as before.
    combined_data["fname"] = combined_data["image_path"].apply(
        lambda x: x.split("/")[-1]
    )

    # Append only after every column has been added.
    all_data_frames.append(combined_data)

    # Expected images that have no row in the merged frame.
    missing_files[WORD] = expected_files - set(combined_data["fnames"])
    print(f"Missing files: {missing_files}")

# Concatenate all DataFrames into one
final_data_frame = pd.concat(all_data_frames, ignore_index=True)

Missing files: {'Acknowledgement': set()}
baditem: uid__./images/Subdermatoglyphic/text_image_0ca628c7-fc94-4e89-8fcc-ac6505ef4a90.png, The character being highlighted with a red oval is "d".
Missing files: {'Acknowledgement': set(), 'Subdermatoglyphic': set()}
Missing files: {'Acknowledgement': set(), 'Subdermatoglyphic': set(), 'tHyUiKaRbNqWeOpXcZvM': set()}


In [54]:
# Inspect the parsed single-character predictions. The dict is rebuilt on every
# loop iteration, so at this point it only holds the last word in WORDs.
custom_id_to_content

{'./images/tHyUiKaRbNqWeOpXcZvM/text_image_a3822207-85a2-4ca5-b26c-2eb4cd873892.png': 't',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_9168df2e-58d3-443a-8272-494a05c6361d.png': 't',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_eb1fb3ed-4c16-4e31-9e2e-85a972da2ca4.png': 't',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_56d82473-4467-4074-8f84-871207e1c297.png': 't',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_a2300618-bb67-4987-bd9f-d33b18f39c0b.png': 't',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_56d84b7c-c39b-4ec1-afb9-8fae3cbac8c5.png': 't',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_e3d64779-e4dd-472e-95d6-78496a265ecb.png': 't',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_92c1dce7-0313-46a9-994e-0ba4a341e5f8.png': 't',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_9c11d1a2-d6d3-45ec-ae1c-c9c882a480ad.png': 'i',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_2ef66f6a-380c-4dd7-ab3e-28dbbc1b08c6.png': 't',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_f5de57dc-ee13-4278-b1dc-3ace9b19028a.png': 'i',

In [55]:
# Per word: expected image file names that have no row in the merged results.
missing_files

{'Acknowledgement': set(),
 'Subdermatoglyphic': set(),
 'tHyUiKaRbNqWeOpXcZvM': set()}

In [56]:
# Inspect the raw (unparsed) model answers. Like custom_id_to_content, this dict
# is rebuilt per loop iteration and only holds the last word in WORDs here.
custom_id_to_content_original

{'./images/tHyUiKaRbNqWeOpXcZvM/text_image_a3822207-85a2-4ca5-b26c-2eb4cd873892.png': 'The character being highlighted with a red oval is "t".',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_9168df2e-58d3-443a-8272-494a05c6361d.png': 'The character being highlighted with a red oval is "t".',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_eb1fb3ed-4c16-4e31-9e2e-85a972da2ca4.png': 'The character highlighted with a red oval is "t".',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_56d82473-4467-4074-8f84-871207e1c297.png': 'The character being highlighted with a red oval is "t".',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_a2300618-bb67-4987-bd9f-d33b18f39c0b.png': 'The character being highlighted with a red oval is "t".',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_56d84b7c-c39b-4ec1-afb9-8fae3cbac8c5.png': 'The character being highlighted with a red oval is "t".',
 './images/tHyUiKaRbNqWeOpXcZvM/text_image_e3d64779-e4dd-472e-95d6-78496a265ecb.png': 'The character highlighted with a red oval is the 

In [57]:
# Tag every row with the model and prompt identifiers; a scalar assignment
# fills the whole column with that value.
final_data_frame["Model"] = "GPT-4o"
final_data_frame["prompt"] = "Prompt 2"

In [58]:
# Write the combined frame to disk as a pickle.
export_path = "./data/gpt-4o-2.pkl"
final_data_frame.to_pickle(export_path)

In [59]:
# (rows, columns) of the combined frame that was exported above.
final_data_frame.shape

(1248, 21)

In [60]:
# WORD = WORDs[0]
# gt_data = pd.read_json(f"./images/{WORD}/configurations.json")
# gt_data["fnames"] = gt_data["image_path"].apply(lambda x: x.split("/")[-1])
# set(list(gt_data["fnames"]))

In [61]:
# Re-display the per-word sets of expected image files without a merged row.
missing_files

{'Acknowledgement': set(),
 'Subdermatoglyphic': set(),
 'tHyUiKaRbNqWeOpXcZvM': set()}