# Export

In [6]:
# Ordered extraction rules for `extract_marked_text`. The FIRST rule that fires
# wins, so the order below is significant and must not be rearranged casually.
#   ("regex", pattern)           -> return capture group 1 of the first match
#   ("contains", needle, value)  -> return `value` when `needle` is a substring
# The patterns are applied to lower-cased text, so the `A-Z` half of each
# character class never matches; it is kept so the patterns stay unchanged.
_MARKER_RULES = (
    # Explicit "letter X is being circled" phrasing takes priority over quotes.
    ("regex", r"letter ([a-zA-Z]) is being circled"),
    # A single letter wrapped in double quotes, then in single quotes.
    ("regex", r'"([a-zA-Z])"'),
    ("regex", r"'([a-zA-Z])'"),
    # NOTE: no word boundary before the letter, so the last letter of any word
    # preceding " is being circled" also matches (e.g. "word is being circled").
    ("regex", r"([a-zA-Z]) is being circled"),
    ("regex", r"\bthe letter ([a-zA-Z])\b"),
    ("regex", r"in the image is ([a-zA-Z])\."),
    ("regex", r"in the image is ([a-zA-Z]),"),
    # Non-letter symbols.
    ("contains", "copyright symbol", "©"),
    ("contains", "@", "@"),
    ("contains", "there is no red oval", "none"),
    ("regex", r"the uppercase letter ([a-zA-Z])"),
    ("regex", r"the lowercase letter ([a-zA-Z])"),
    # Only the first digit following "the number " is captured.
    ("regex", r"the number ([0-9])"),
    # Phrases meaning that nothing is marked.
    ("contains", "there is no character", "none"),
    ("contains", "does not contain", "none"),
    ("contains", "there are no red ovals", "none"),
    # Progressively looser "... is X" phrasings.
    ("regex", r"image is ([a-zA-Z])\."),
    ("regex", r"letters is ([a-zA-Z])\."),
    ("regex", r"the image is ([a-zA-Z]),"),
    ("regex", r"is ([a-zA-Z])\."),
    # Last resort: any single letter surrounded by whitespace.
    ("regex", r"\s([a-zA-Z])\s"),
)


def extract_marked_text(text):
    """Extract the marked character from a free-text model response.

    The response is lower-cased and then checked against ``_MARKER_RULES`` in
    order; the first rule that applies decides the result.

    Args:
        text: The raw response string.

    Returns:
        One of:
        * a single lower-case letter or a single digit captured by a regex rule,
        * ``"©"`` or ``"@"`` for the corresponding symbol rules,
        * ``"none"`` when the text says nothing is marked,
        * ``"marker_not_found"`` followed by the lower-cased text when no rule
          applies.
    """
    # Local import keeps the function usable before any cell imports `re`.
    import re

    text = text.lower()

    for rule in _MARKER_RULES:
        if rule[0] == "regex":
            # Search once and reuse the match object.
            match = re.search(rule[1], text)
            if match:
                return match.group(1)
        elif rule[1] in text:
            return rule[2]

    return "marker_not_found" + text

In [7]:
import pandas as pd
import json
import os
import re

# Words to evaluate; each one is expected to have ./images/<word>/configurations.json.
WORDs = [
    "Acknowledgement",
    "Subdermatoglyphic",
    "tHyUiKaRbNqWeOpXcZvM",
]

# Suffix appended to an image path (with ".png" removed) to locate the saved
# model response for that image.
MODEL_OUTPUT_SUFFIX = "-claude-3-sonnet-20240229-output.md"


def _read_text_or_none(path):
    """Return the full contents of ``path``, or None when the file does not exist."""
    if not os.path.exists(path):
        return None
    # The context manager closes the handle as soon as the text has been read,
    # instead of leaving one open handle per image for the garbage collector.
    with open(path, "r") as handle:
        return handle.read()


# One DataFrame per word, concatenated after the loop.
all_data_frames = []

for WORD in WORDs:
    gt_data = pd.read_json(f"./images/{WORD}/configurations.json")

    # Derive the model output file path from the image path and load its text
    # (None when the model output file is missing).
    gt_data["model-output-file"] = gt_data["image_path"].apply(
        lambda x: x.replace(".png", "") + MODEL_OUTPUT_SUFFIX
    )
    gt_data["model-output-raw"] = gt_data["model-output-file"].apply(
        _read_text_or_none
    )

    # Drop rows with no model output. The explicit copy makes the column
    # assignments below act on an independent frame, which avoids
    # SettingWithCopyWarning on pandas versions that flag the dropna result.
    gt_data = gt_data.dropna(subset=["model-output-raw"]).copy()

    gt_data["predicted"] = gt_data["model-output-raw"].apply(extract_marked_text)
    print(gt_data["predicted"].value_counts())

    # Ground truth is the lower-cased character of `word` at `circle_index`.
    # Built with a comprehension rather than a row-wise DataFrame.apply, which
    # returns a DataFrame (not a Series) when no rows survive the dropna above.
    gt_data["gt"] = [
        word[index].lower()
        for word, index in zip(gt_data["word"], gt_data["circle_index"])
    ]
    gt_data["is_prediction_correct"] = gt_data["gt"] == gt_data["predicted"]
    gt_data["word_label"] = WORD  # identifies which word each row belongs to

    all_data_frames.append(gt_data)

# Concatenate all per-word DataFrames into one
final_data_frame = pd.concat(all_data_frames, ignore_index=True)

predicted
e                                                                                                 93
o                                                                                                 37
c                                                                                                 27
n                                                                                                 27
g                                                                                                 27
w                                                                                                 26
d                                                                                                 25
a                                                                                                 22
k                                                                                                 22
t                                                                                

In [8]:
# Tag every row with the model name; a scalar is broadcast to all rows.
final_data_frame["Model"] = "Sonnet"

In [9]:
# to_pickle does not create missing parent directories, so make sure ./data
# exists before writing the results.
os.makedirs("./data", exist_ok=True)
final_data_frame.to_pickle("./data/Sonnet.pkl")

In [10]:
# Accuracy per word: the mean of the boolean correctness flag within each word_label.
correctness_by_word = final_data_frame.groupby("word_label")["is_prediction_correct"]
accuracy_by_word = correctness_by_word.mean()
accuracy_by_word

word_label
Acknowledgement         0.827778
Subdermatoglyphic       0.715686
tHyUiKaRbNqWeOpXcZvM    0.660417
Name: is_prediction_correct, dtype: float64