# Export

In [1]:
def extract_marked_text(text):
    """Pull the model's answer out of a raw response string.

    The lookup order is:

    1. The text following the first ``**`` (up to the next ``**`` if there
       is one), lower-cased and with double quotes removed.
    2. The text following the first ``"`` (up to the next ``"`` if there is
       one), lower-cased.
    3. A known special symbol (currently only ``©``) appearing anywhere.
    4. The literal phrase ``no red oval`` -> ``"no_red_oval"``.

    Returns ``"marker_not_found"`` when none of the above match.
    """
    # Bold markdown markers take precedence over everything else.
    _, bold_marker, after_bold = text.partition("**")
    if bold_marker:
        answer = after_bold.partition("**")[0]
        return answer.lower().replace('"', "")

    # Fall back to a double-quoted answer.
    _, quote_marker, after_quote = text.partition('"')
    if quote_marker:
        return after_quote.partition('"')[0].lower()

    # Unmarked special symbols are returned verbatim.
    for symbol in ("©",):
        if symbol in text:
            return symbol

    # The model explicitly reported that nothing was circled.
    if "no red oval" in text:
        return "no_red_oval"

    return "marker_not_found"

In [2]:
import pandas as pd
import os

# Words that were evaluated; the loop below reads
# ./images/<word>/configurations.json for each of them.
WORDs = [
    "Acknowledgement",
    "Subdermatoglyphic",
    "tHyUiKaRbNqWeOpXcZvM",
]


def _read_model_output(path):
    """Return the text stored at ``path``, or None when the file is missing."""
    if not os.path.exists(path):
        return None
    # The context manager closes the handle right away instead of leaving
    # one open file per row for the garbage collector to clean up.
    with open(path, "r") as handle:
        return handle.read()


# One result frame per word, concatenated after the loop.
all_data_frames = []

for WORD in WORDs:
    gt_data = pd.read_json(f"./images/{WORD}/configurations.json")

    # Derive the model-output path from the image path and load its content
    # (None where the model produced no output file).
    gt_data["model-output-file"] = gt_data["image_path"].apply(
        lambda x: "./" + x.replace(".png", "") + "-gemini-output.md"
    )
    gt_data["model-output-raw"] = gt_data["model-output-file"].apply(
        _read_model_output
    )

    # Drop rows without model output. The explicit copy makes the column
    # assignment below operate on an independent frame rather than on a
    # filtered view (avoids SettingWithCopyWarning).
    gt_data = gt_data.dropna(subset=["model-output-raw"]).copy()

    gt_data["predicted"] = gt_data["model-output-raw"].apply(extract_marked_text)
    print(gt_data["predicted"].value_counts())

    # Ground truth is the lower-cased character of `word` at `circle_index`.
    # Built with zip instead of a row-wise apply so it also works when the
    # frame is empty.
    cleaned_data = gt_data.copy()
    cleaned_data["gt"] = [
        word[index].lower()
        for word, index in zip(cleaned_data["word"], cleaned_data["circle_index"])
    ]
    cleaned_data["is_prediction_correct"] = (
        cleaned_data["gt"] == cleaned_data["predicted"]
    )
    cleaned_data["word_label"] = WORD  # identifies which word the row belongs to

    all_data_frames.append(cleaned_data)

# Concatenate all per-word frames into one with a fresh 0..n-1 index.
final_data_frame = pd.concat(all_data_frames, ignore_index=True)

predicted
e    73
n    48
w    29
a    24
c    24
k    24
o    24
d    24
g    24
m    24
t    24
l    18
Name: count, dtype: int64
predicted
o    48
y    30
e    27
h    25
s    24
c    24
b    24
d    24
m    24
a    24
u    24
g    24
p    24
i    23
r    18
l    16
t     4
n     1
Name: count, dtype: int64
predicted
a    34
o    31
v    28
t    24
n    24
p    24
e    24
w    24
h    24
m    24
b    24
r    24
i    24
u    24
y    24
c    22
q    22
z    20
x    19
k    11
@     3
g     2
Name: count, dtype: int64


In [4]:
# Tag every row with the model that produced the outputs; assigning a scalar
# broadcasts the value to all rows.
final_data_frame["Model"] = "Gemini-1.5-Pro"

In [5]:
# Persist the combined results. The target directory is created first so the
# export also succeeds on a fresh checkout where ./data does not exist yet.
os.makedirs("./data", exist_ok=True)
final_data_frame.to_pickle("./data/gemini-1.5-pro.pkl")

In [6]:
# Show the combined frame as the cell's output for a visual sanity check.
final_data_frame

Unnamed: 0,word,font_path,circle_index,thickness,scale_factor,padding,x_offset,y_offset,canvas_width,canvas_height,final_width,final_height,image_path,model-output-file,model-output-raw,predicted,gt,is_prediction_correct,word_label,Model
0,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,25,0,0,10,2,512,512,./images/Acknowledgement/text_image_2445b2a9-9...,././images/Acknowledgement/text_image_2445b2a9...,The letter **A** is being circled. \n,a,a,True,Acknowledgement,Gemini-1.5-Pro
1,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,50,0,0,10,2,512,512,./images/Acknowledgement/text_image_3bac479b-3...,././images/Acknowledgement/text_image_3bac479b...,The letter **A** is being circled. \n,a,a,True,Acknowledgement,Gemini-1.5-Pro
2,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,100,0,0,10,2,512,512,./images/Acknowledgement/text_image_7ddbe27c-5...,././images/Acknowledgement/text_image_7ddbe27c...,The letter **A** is being circled. \n,a,a,True,Acknowledgement,Gemini-1.5-Pro
3,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,200,0,0,10,2,512,512,./images/Acknowledgement/text_image_f22668e3-c...,././images/Acknowledgement/text_image_f22668e3...,The letter **A** is being circled. \n,a,a,True,Acknowledgement,Gemini-1.5-Pro
4,Acknowledgement,fonts/Helvetica.ttf,0,5,1.4,25,0,0,10,2,512,512,./images/Acknowledgement/text_image_4181a30f-3...,././images/Acknowledgement/text_image_4181a30f...,The letter **A** is being circled. \n,a,a,True,Acknowledgement,Gemini-1.5-Pro
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,5,1.4,200,0,0,10,2,512,512,./images/tHyUiKaRbNqWeOpXcZvM/text_image_42f35...,././images/tHyUiKaRbNqWeOpXcZvM/text_image_42f...,The letter **M** is circled. \n,m,m,True,tHyUiKaRbNqWeOpXcZvM,Gemini-1.5-Pro
1244,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,6,1.4,25,0,0,10,2,512,512,./images/tHyUiKaRbNqWeOpXcZvM/text_image_1418c...,././images/tHyUiKaRbNqWeOpXcZvM/text_image_141...,The letter **M** is being circled. \n,m,m,True,tHyUiKaRbNqWeOpXcZvM,Gemini-1.5-Pro
1245,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,6,1.4,50,0,0,10,2,512,512,./images/tHyUiKaRbNqWeOpXcZvM/text_image_8e326...,././images/tHyUiKaRbNqWeOpXcZvM/text_image_8e3...,The letter **M** is being circled. \n,m,m,True,tHyUiKaRbNqWeOpXcZvM,Gemini-1.5-Pro
1246,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,6,1.4,100,0,0,10,2,512,512,./images/tHyUiKaRbNqWeOpXcZvM/text_image_94466...,././images/tHyUiKaRbNqWeOpXcZvM/text_image_944...,The letter **M** is being circled. \n,m,m,True,tHyUiKaRbNqWeOpXcZvM,Gemini-1.5-Pro
