# Export

In [3]:
def extract_marked_text(text):
    """Pull the model's answer out of a raw response string.

    The lookup proceeds in order and stops at the first match:

    1. The text between the first pair of ``**`` markers, lower-cased, with
       any double quotes removed.
    2. The text between the first pair of double quotes, lower-cased.
    3. The first character from ``special_chars`` that occurs in ``text``
       (returned unchanged).
    4. The sentinel ``"no_red_oval"`` if the phrase "no red oval" occurs in
       ``text`` (case-insensitive).
    5. The sentinel ``"marker_not_found"`` otherwise.

    Results from steps 1 and 2 have surrounding whitespace stripped, so that
    an answer such as ``** A **`` yields ``"a"`` rather than ``" a "``.

    Args:
        text: Raw response string.

    Returns:
        The extracted answer string, or one of the sentinel values above.
    """
    # First try to extract text between **
    bold_parts = text.split("**")
    if len(bold_parts) > 1:
        # Strip last, so padding left behind by removed quotes is dropped too.
        return bold_parts[1].lower().replace('"', "").strip()

    # If not found, try to extract text between "
    quoted_parts = text.split('"')
    if len(quoted_parts) > 1:
        return quoted_parts[1].lower().strip()

    # Check for special characters or symbols
    special_chars = ["©"]
    for char in special_chars:
        if char in text:
            return char

    # Handle no red oval case; compare lower-cased so a sentence-initial
    # "No red oval ..." is recognised as well.
    if "no red oval" in text.lower():
        return "no_red_oval"

    # Default case if no marker found
    return "marker_not_found"

In [4]:
import pandas as pd
import os

# Define the words
WORDs = [
    "Acknowledgement",
    "Subdermatoglyphic",
    "tHyUiKaRbNqWeOpXcZvM",
]


def _read_text_if_exists(path):
    """Return the full contents of the file at ``path``, or None if it is missing.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the text has been read, instead of being left for the garbage collector.
    """
    if not os.path.exists(path):
        return None
    with open(path, "r") as handle:
        return handle.read()


# Initialize an empty list to store DataFrames
all_data_frames = []

# Loop through each word
for WORD in WORDs:
    gt_data = pd.read_json(f"./images_second_prompt/{WORD}/configurations.json")

    # Replace ./images/ with ./images_second_prompt/
    gt_data["image_path"] = gt_data["image_path"].apply(
        lambda x: x.replace("./images/", "./images_second_prompt/")
    )

    # Generate the output file paths and read the content.
    # NOTE(review): image_path already starts with "./", so the result begins
    # with "././"; kept as-is so the stored column values do not change.
    gt_data["model-output-file"] = gt_data["image_path"].apply(
        lambda x: "./" + x.replace(".png", "") + "-gemini-output.md"
    )
    gt_data["model-output-raw"] = gt_data["model-output-file"].apply(
        _read_text_if_exists
    )

    # Drop rows with missing gemini output. The explicit copy makes the
    # filtered frame independent, so adding columns below cannot trigger
    # chained-assignment warnings.
    gt_data = gt_data.dropna(subset=["model-output-raw"]).copy()

    gt_data["predicted"] = gt_data["model-output-raw"].apply(extract_marked_text)

    print(gt_data["predicted"].value_counts())

    # Prepare the cleaned data
    cleaned_data = gt_data.copy()
    # Ground truth is the lower-cased character of `word` at `circle_index`.
    cleaned_data["gt"] = cleaned_data.apply(
        lambda row: row["word"][row["circle_index"]].lower(), axis=1
    )
    cleaned_data["is_prediction_correct"] = (
        cleaned_data["gt"] == cleaned_data["predicted"]
    )
    cleaned_data["word_label"] = WORD  # Add a column to identify the word

    # Append to the list
    all_data_frames.append(cleaned_data)

# Concatenate all DataFrames into one
final_data_frame = pd.concat(all_data_frames, ignore_index=True)

predicted
e    70
n    48
w    26
a    24
k    24
o    24
d    24
g    24
m    24
t    24
l    22
c    16
©     8
@     2
Name: count, dtype: int64
predicted
o              46
y              29
e              29
h              25
a              24
u              24
b              24
d              24
m              24
p              24
i              23
s              23
g              23
c              19
l              17
r              16
t               9
©               4
no_red_oval     1
Name: count, dtype: int64
predicted
a    44
v    35
b    30
o    29
y    26
c    26
p    26
i    25
t    24
w    24
e    24
m    24
n    24
u    23
q    23
h    22
r    18
x    15
z    13
@     2
k     2
g     1
Name: count, dtype: int64


In [5]:
# Tag every row with the model name (a scalar is broadcast to all rows).
final_data_frame["Model"] = "Gemini-1.5-Pro"

In [6]:
# Persist the combined results. `to_pickle` does not create missing parent
# directories, so make sure ./data exists before writing.
os.makedirs("./data", exist_ok=True)
final_data_frame.to_pickle("./data/gemini-1.5-pro-2.pkl")

In [7]:
# Display the combined frame as this cell's output for a visual sanity check.
final_data_frame

Unnamed: 0,word,font_path,circle_index,thickness,scale_factor,padding,x_offset,y_offset,canvas_width,canvas_height,final_width,final_height,image_path,model-output-file,model-output-raw,predicted,gt,is_prediction_correct,word_label,Model
0,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,25,0,0,10,2,512,512,./images_second_prompt/Acknowledgement/text_im...,././images_second_prompt/Acknowledgement/text_...,The character highlighted with a red oval is t...,a,a,True,Acknowledgement,Gemini-1.5-Pro
1,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,50,0,0,10,2,512,512,./images_second_prompt/Acknowledgement/text_im...,././images_second_prompt/Acknowledgement/text_...,The character highlighted with a red oval is t...,a,a,True,Acknowledgement,Gemini-1.5-Pro
2,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,100,0,0,10,2,512,512,./images_second_prompt/Acknowledgement/text_im...,././images_second_prompt/Acknowledgement/text_...,The character being highlighted is the capital...,a,a,True,Acknowledgement,Gemini-1.5-Pro
3,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,200,0,0,10,2,512,512,./images_second_prompt/Acknowledgement/text_im...,././images_second_prompt/Acknowledgement/text_...,The character highlighted with a red oval is t...,a,a,True,Acknowledgement,Gemini-1.5-Pro
4,Acknowledgement,fonts/Helvetica.ttf,0,5,1.4,25,0,0,10,2,512,512,./images_second_prompt/Acknowledgement/text_im...,././images_second_prompt/Acknowledgement/text_...,The character highlighted with a red oval is t...,a,a,True,Acknowledgement,Gemini-1.5-Pro
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,5,1.4,200,0,0,10,2,512,512,./images_second_prompt/tHyUiKaRbNqWeOpXcZvM/te...,././images_second_prompt/tHyUiKaRbNqWeOpXcZvM/...,The character highlighted with a red oval is t...,m,m,True,tHyUiKaRbNqWeOpXcZvM,Gemini-1.5-Pro
1244,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,6,1.4,25,0,0,10,2,512,512,./images_second_prompt/tHyUiKaRbNqWeOpXcZvM/te...,././images_second_prompt/tHyUiKaRbNqWeOpXcZvM/...,The character highlighted in the red oval is *...,m,m,True,tHyUiKaRbNqWeOpXcZvM,Gemini-1.5-Pro
1245,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,6,1.4,50,0,0,10,2,512,512,./images_second_prompt/tHyUiKaRbNqWeOpXcZvM/te...,././images_second_prompt/tHyUiKaRbNqWeOpXcZvM/...,"The character highlighted with a red oval is ""...",m,m,True,tHyUiKaRbNqWeOpXcZvM,Gemini-1.5-Pro
1246,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,6,1.4,100,0,0,10,2,512,512,./images_second_prompt/tHyUiKaRbNqWeOpXcZvM/te...,././images_second_prompt/tHyUiKaRbNqWeOpXcZvM/...,"The character highlighted with a red oval is ""...",m,m,True,tHyUiKaRbNqWeOpXcZvM,Gemini-1.5-Pro
