# Export

In [37]:
import json
from pathlib import Path

import pandas as pd

# Each evaluated word, keyed to the JSONL output file that holds the model
# responses for it. Insertion order is the processing order.
_OUTPUT_PATH_BY_WORD = {
    "Acknowledgement": "./jsonl/Acknowledgement-batch_XbhrSOcSB5EuAalvaf6pPnwp_output.jsonl",
    "Subdermatoglyphic": "./jsonl/Subdermatoglyphic-batch_KsEDCtqcuiLUd0fctyVDLeuJ_output.jsonl",
    "tHyUiKaRbNqWeOpXcZvM": "./jsonl/tHyUiKaRbNqWeOpXcZvM-batch_zSapvpcg51piGdd3wDSRqoGH_output.jsonl",
}

# Parallel lists derived from the mapping: WORDs[i] is read from GPT4Outputs[i].
WORDs = list(_OUTPUT_PATH_BY_WORD)
GPT4Outputs = list(_OUTPUT_PATH_BY_WORD.values())

# Accumulators filled by the per-word loop:
#   all_data_frames - one merged DataFrame per word
#   missing_files   - word -> set of expected file names without a usable prediction
all_data_frames = list()
missing_files = dict()

def _read_batch_contents(jsonl_path):
    """Read a JSONL output file and return ``{custom_id: message content}``.

    Each non-blank line is parsed as one JSON object. The text is taken from
    ``response.body.choices[0].message.content``.

    Lines are skipped (with a printed warning) when:
      * ``custom_id`` is missing or empty, or
      * no string is found at the content path (e.g. the path is absent or
        the content is null).

    Skipped records simply do not appear in the result, so the caller's
    "Missing files" report will list the corresponding images.
    """
    contents = {}
    # Explicit encoding so the result does not depend on the platform default.
    with open(jsonl_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Tolerate blank / trailing lines instead of failing in json.loads.
                continue
            json_obj = json.loads(line)
            custom_id = json_obj.get("custom_id")
            if not custom_id:
                print(f"WARNING: Custom id is None for {line}")
                continue
            try:
                message_content = json_obj["response"]["body"]["choices"][0][
                    "message"
                ]["content"]
            except (KeyError, IndexError, TypeError):
                message_content = None
            if not isinstance(message_content, str):
                # A non-string here would break the string normalisation below.
                print(f"WARNING: No text content for {custom_id}")
                continue
            contents[custom_id] = message_content
    return contents


def _normalize_answer(raw_content):
    """Reduce a raw model reply to a bare, lower-cased answer string.

    If the reply contains a double quote, the text after the first quote (up
    to the next one) is taken; otherwise the whole reply is used. The result
    is lower-cased, has every "." removed, and is stripped of surrounding
    whitespace.
    """
    answer = raw_content.split('"')[1] if '"' in raw_content else raw_content
    return answer.lower().replace(".", "").strip()


# WORDs and GPT4Outputs are paired positionally; fail loudly on a mismatch
# rather than silently dropping or mis-pairing entries.
if len(WORDs) != len(GPT4Outputs):
    raise ValueError(
        f"WORDs has {len(WORDs)} entries but GPT4Outputs has {len(GPT4Outputs)}"
    )

# Loop through each word and its corresponding file
for WORD, file_path in zip(WORDs, GPT4Outputs):
    gt_data = pd.read_json(f"./images/{WORD}/configurations.json")

    # File name (last path component) of every ground-truth image.
    gt_data["fnames"] = gt_data["image_path"].apply(lambda x: x.split("/")[-1])
    expected_files = set(gt_data["fnames"])

    # Read the JSONL file and extract data
    raw_by_custom_id = _read_batch_contents(file_path)
    print(f"size of custom_id_to_content: {len(raw_by_custom_id)}")

    # Keep only the part of the custom id after "uid__"; the merge below
    # matches that remainder against gt_data["image_path"].
    custom_id_to_content_original = {
        key.split("uid__")[-1]: value for key, value in raw_by_custom_id.items()
    }

    # Process the extracted data
    custom_id_to_content = {
        key: _normalize_answer(value)
        for key, value in custom_id_to_content_original.items()
    }
    print(f"size of custom_id_to_content: {len(custom_id_to_content)}")

    # Only single-character answers are kept as predictions; anything else is
    # dropped here and therefore shows up in the missing-files report.
    custom_id_to_content = {
        key: value for key, value in custom_id_to_content.items() if len(value) == 1
    }

    print(f"size of custom_id_to_content: {len(custom_id_to_content)}")
    print(
        f"size of custom_id_to_content_original: {len(custom_id_to_content_original)}"
    )

    # Convert to DataFrame and merge
    custom_id_df = pd.DataFrame(
        list(custom_id_to_content.items()), columns=["image_path", "predicted"]
    )
    combined_data = pd.merge(gt_data, custom_id_df, on="image_path", how="inner")

    # Ground-truth letter: the character of `word` at `circle_index`, lower-cased.
    # Built with a plain comprehension so an empty merge result yields an empty
    # column instead of failing (row-wise apply on an empty frame does not
    # return a Series).
    combined_data["gt"] = [
        word[circle_index].lower()
        for word, circle_index in zip(
            combined_data["word"], combined_data["circle_index"]
        )
    ]
    combined_data["word_label"] = WORD  # Add a column to identify the word

    # add custom_id_to_content_original as model-output-raw
    combined_data["model-output-raw"] = combined_data["image_path"].map(
        custom_id_to_content_original
    )

    # `fname` duplicates `fnames` (both are the last component of image_path).
    # It is not used below but is kept so the exported frame's columns are
    # unchanged for downstream consumers.
    combined_data["fname"] = combined_data["fnames"]

    # Append to the list
    all_data_frames.append(combined_data)

    # check for missing files: expected images that ended up without a prediction
    missing_files[WORD] = expected_files - set(combined_data["fnames"])
    print(f"Missing files: {missing_files}")

# Concatenate all DataFrames into one
final_data_frame = pd.concat(all_data_frames, ignore_index=True)

size of custom_id_to_content: 360
size of custom_id_to_content: 360
size of custom_id_to_content: 360
size of custom_id_to_content_original: 360
Missing files: {'Acknowledgement': set()}
size of custom_id_to_content: 408
size of custom_id_to_content: 408
size of custom_id_to_content: 408
size of custom_id_to_content_original: 408
Missing files: {'Acknowledgement': set(), 'Subdermatoglyphic': set()}
size of custom_id_to_content: 480
size of custom_id_to_content: 480
size of custom_id_to_content: 480
size of custom_id_to_content_original: 480
Missing files: {'Acknowledgement': set(), 'Subdermatoglyphic': set(), 'tHyUiKaRbNqWeOpXcZvM': set()}


In [38]:
# Tag every row with the model name (a scalar is broadcast to all rows).
final_data_frame["Model"] = "GPT-4o"

In [39]:
# Persist the combined results. The output directory is created first so the
# export also works when ./data does not exist yet.
export_path = Path("./data/gpt-4o.pkl")
export_path.parent.mkdir(parents=True, exist_ok=True)
final_data_frame.to_pickle(export_path)

In [40]:
# Number of rows per (Model, word) pair — a sanity check that every word
# contributed the expected number of merged predictions.
final_data_frame.groupby(["Model", "word"]).size()

Model   word                
GPT-4o  Acknowledgement         360
        Subdermatoglyphic       408
        tHyUiKaRbNqWeOpXcZvM    480
dtype: int64

In [41]:
# Total number of rows in the combined frame.
final_data_frame.shape[0]

1248