In [1]:
import pandas as pd
import json
import os
import re

In [2]:
pickle_files = ["./data/Sonnet.pkl", "./data/GPT-4o.pkl", "./data/gemini-1.5-pro.pkl"]

dfs = [pd.read_pickle(file) for file in pickle_files]
df = pd.concat(dfs)

In [3]:
df_sonnet = df[df["Model"] == "Sonnet"]

In [4]:
# # set pandas to show every row and cell and column
# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_colwidth", None)

# filter for word Acknowledgement and is_prediction_correct is False
MYDF = df_sonnet[
    (df_sonnet["word"] == "Acknowledgement")
    & (df_sonnet["is_prediction_correct"] == False)
]

# only show  model-output-raw, predicted and gt
MYDF = MYDF[["model-output-raw", "predicted", "gt"]]
MYDF

Unnamed: 0,model-output-raw,predicted,gt
27,"The letter ""c"" in the word ""Acknowledgement"" i...",c,k
49,"The letter ""w"" is being circled in the word ""A...",w,o
56,"The letter ""w"" is being circled in the word ""A...",w,o
58,"The letter ""n"" is being circled in the word ""A...",n,o
72,"The letter 'I' is being circled in the word ""A...",i,l
...,...,...,...
346,The letter being circled in the image is the l...,e,n
347,"The letter being circled in the word ""Acknowle...",o,n
348,The letter being circled in the image is the l...,g,t
352,"The letter ""g"" in the word ""Acknowledgement"" i...",g,t


In [5]:
df["is_prediction_correct"] = df["gt"] == df["predicted"]

# Group by model and word, then calculate the accuracy
accuracy_per_model_and_word = df.groupby(["Model", "word"])[
    "is_prediction_correct"
].mean()

# Convert the accuracy to percentage and print the result
accuracy_per_model_and_word = accuracy_per_model_and_word * 100
# round 2 decimal places
accuracy_per_model_and_word = accuracy_per_model_and_word.round(2)

print(accuracy_per_model_and_word)

Model           word                
GPT-4o          Acknowledgement         68.63
                Subdermatoglyphic       63.77
                tHyUiKaRbNqWeOpXcZvM    78.53
Gemini-1.5-Pro  Acknowledgement         98.33
                Subdermatoglyphic       91.42
                tHyUiKaRbNqWeOpXcZvM    92.92
Sonnet          Acknowledgement         82.78
                Subdermatoglyphic       71.57
                tHyUiKaRbNqWeOpXcZvM    65.62
Name: is_prediction_correct, dtype: float64


In [11]:
# get  average per model
df_average = df.groupby("Model")["is_prediction_correct"].mean()
df_average = df_average * 100
df_average = df_average.round(2)
df_average

Model
GPT-4o            70.85
Gemini-1.5-Pro    93.99
Sonnet            72.52
Name: is_prediction_correct, dtype: float64

In [6]:
print(accuracy_per_model_and_word.to_latex())

\begin{tabular}{llr}
\toprule
 &  & is_prediction_correct \\
Model & word &  \\
\midrule
\multirow[t]{3}{*}{GPT-4o} & Acknowledgement & 68.630000 \\
 & Subdermatoglyphic & 63.770000 \\
 & tHyUiKaRbNqWeOpXcZvM & 78.530000 \\
\cline{1-3}
\multirow[t]{3}{*}{Gemini-1.5-Pro} & Acknowledgement & 98.330000 \\
 & Subdermatoglyphic & 91.420000 \\
 & tHyUiKaRbNqWeOpXcZvM & 92.920000 \\
\cline{1-3}
\multirow[t]{3}{*}{Sonnet} & Acknowledgement & 82.780000 \\
 & Subdermatoglyphic & 71.570000 \\
 & tHyUiKaRbNqWeOpXcZvM & 65.620000 \\
\cline{1-3}
\bottomrule
\end{tabular}



In [7]:
# Assuming 'df' has a column named 'font' that contains the font information for each entry.

# Group by model, word, and font, then calculate the accuracy
accuracy_per_model_word_font = df.groupby(["Model", "word", "font_path"])[
    "is_prediction_correct"
].mean()

# Convert the accuracy to percentage and round to 2 decimal places
accuracy_per_model_word_font = (accuracy_per_model_word_font * 100).round(2)

print(accuracy_per_model_word_font)

Model           word                  font_path                                
GPT-4o          Acknowledgement       fonts/Helvetica.ttf                          69.66
                                      fonts/OpenSans-VariableFont_wdth,wght.ttf    67.60
                Subdermatoglyphic     fonts/Helvetica.ttf                          67.98
                                      fonts/OpenSans-VariableFont_wdth,wght.ttf    59.50
                tHyUiKaRbNqWeOpXcZvM  fonts/Helvetica.ttf                          82.28
                                      fonts/OpenSans-VariableFont_wdth,wght.ttf    74.79
Gemini-1.5-Pro  Acknowledgement       fonts/Helvetica.ttf                          98.89
                                      fonts/OpenSans-VariableFont_wdth,wght.ttf    97.78
                Subdermatoglyphic     fonts/Helvetica.ttf                          91.18
                                      fonts/OpenSans-VariableFont_wdth,wght.ttf    91.67
                tHyUiKaRbNqWeO

In [8]:
print(accuracy_per_model_word_font.to_latex())

\begin{tabular}{lllr}
\toprule
 &  &  & is_prediction_correct \\
Model & word & font_path &  \\
\midrule
\multirow[t]{6}{*}{GPT-4o} & \multirow[t]{2}{*}{Acknowledgement} & fonts/Helvetica.ttf & 69.660000 \\
 &  & fonts/OpenSans-VariableFont_wdth,wght.ttf & 67.600000 \\
\cline{2-4}
 & \multirow[t]{2}{*}{Subdermatoglyphic} & fonts/Helvetica.ttf & 67.980000 \\
 &  & fonts/OpenSans-VariableFont_wdth,wght.ttf & 59.500000 \\
\cline{2-4}
 & \multirow[t]{2}{*}{tHyUiKaRbNqWeOpXcZvM} & fonts/Helvetica.ttf & 82.280000 \\
 &  & fonts/OpenSans-VariableFont_wdth,wght.ttf & 74.790000 \\
\cline{1-4} \cline{2-4}
\multirow[t]{6}{*}{Gemini-1.5-Pro} & \multirow[t]{2}{*}{Acknowledgement} & fonts/Helvetica.ttf & 98.890000 \\
 &  & fonts/OpenSans-VariableFont_wdth,wght.ttf & 97.780000 \\
\cline{2-4}
 & \multirow[t]{2}{*}{Subdermatoglyphic} & fonts/Helvetica.ttf & 91.180000 \\
 &  & fonts/OpenSans-VariableFont_wdth,wght.ttf & 91.670000 \\
\cline{2-4}
 & \multirow[t]{2}{*}{tHyUiKaRbNqWeOpXcZvM} & fonts/Helveti