In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob
import os

In [None]:
# Path to the JSONL file
WORDs = [
    "Acknowledgement",
    "Subdermatoglyphic",
    "tHyUiKaRbNqWeOpXcZvM",
]

index = 1
WORD = WORDs[index]

gt_data = pd.read_json(f"./images/{WORD}/configurations.json")

In [None]:
gt_data

In [None]:
# text_image_0a5fd2d1-d0ad-490d-a4d4-a01955a8de8c

gt_data["model-output-file"] = gt_data["image_path"].apply(
    lambda x: x.replace(".png", "") + "-claude-3-sonnet-20240229-output.md"
)

gt_data["model-output-raw"] = gt_data["model-output-file"].apply(
    lambda x: (open(x, "r").read() if os.path.exists(os.path.join(x)) else None)
)

In [None]:
gt_data

In [None]:
# drop rows with missing sonnet output
gt_data = gt_data.dropna(subset=["model-output-raw"])
gt_data

In [None]:
def extract_marked_text(text):
    import re

    # Check if the description explicitly states that no letter is being circled
    no_circled_letter_patterns = [
        "no letter being circled",
        "no individual letter",
        "no circles or other annotations",
        "no circled letter",
        "no letters being circled",
    ]
    if any(phrase in text.lower() for phrase in no_circled_letter_patterns):
        return "none"

    # Use regular expressions to find single characters or explicitly mentioned letters
    # This pattern looks for single letters mentioned after certain phrases or enclosed in quotes
    patterns = [
        r"the letter being circled in the [^\.]* is ['\"]?([a-zA-Z])['\"]?",
        r"['\"]([a-zA-Z])['\"]",
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).lower()  # Return the matched letter in lowercase

    return "marker_not_found" + text


gt_data["predicted"] = gt_data["model-output-raw"].apply(extract_marked_text)

In [None]:
# show full column and row
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)


gt_data["predicted"].value_counts()

In [None]:
cleaned_data = gt_data.copy()

In [None]:
cleaned_data

In [None]:
cleaned_data["gt"] = cleaned_data.apply(
    lambda row: row["word"][row["circle_index"]].lower(), axis=1
)

In [None]:
cleaned_data.head()

In [None]:
cleaned_data["is_prediction_correct"] = cleaned_data["gt"] == cleaned_data["predicted"]
# get accuracy
accuracy = cleaned_data["is_prediction_correct"].mean()
print(f"Overall Accuracy: {accuracy * 100:.2f}%")

In [None]:
len(cleaned_data)

In [None]:
errors = cleaned_data[cleaned_data["is_prediction_correct"] == False]
common_errors = (
    errors.groupby(["predicted", "gt"])
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)
print(common_errors.head(10))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set up the matplotlib figure with a more professional color palette and layout
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(18, 12))
fig.suptitle(f"Detailed Analysis of Model Predictions -- {WORD}", fontsize=16)

# Customize the color palette
sns.set(style="whitegrid", palette="muted")

# Plot Accuracy by Font Path
sns.barplot(
    ax=axes[0, 0],
    x="is_prediction_correct",
    y="font_path",
    data=cleaned_data,
    estimator=lambda x: x.mean(),
    palette="Blues_d",
)
axes[0, 0].set_title("Accuracy by Font Path")
axes[0, 0].set_xlabel("Accuracy")
axes[0, 0].set_ylabel("Font Path")

# Plot Accuracy by Circle Index
sns.barplot(
    ax=axes[0, 1],
    x="circle_index",
    y="is_prediction_correct",
    data=cleaned_data,
    estimator=lambda x: x.mean(),
    palette="Greens_d",
)
axes[0, 1].set_title("Accuracy by Circle Index")
axes[0, 1].set_xlabel("Circle Index")
axes[0, 1].set_ylabel("Accuracy")
# Set x-axis labels to characters from the word
axes[0, 1].set_xticklabels(list("Subdermatoglyphic"))


# Plot Distribution of Incorrect Predictions
sns.countplot(
    ax=axes[1, 0],
    x="predicted",
    data=errors,
    order=errors["predicted"].value_counts().index,
    palette="Reds_d",
)
axes[1, 0].set_title("Distribution of Incorrect Predictions")
axes[1, 0].set_xlabel("Predicted Characters")
axes[1, 0].set_ylabel("Count")

# Plot Distribution of Ground Truth for Incorrect Predictions
sns.countplot(
    ax=axes[1, 1],
    x="gt",
    data=errors,
    order=errors["gt"].value_counts().index,
    palette="Purples_d",
)
axes[1, 1].set_title("Distribution of Ground Truth for Incorrect Predictions")
axes[1, 1].set_xlabel("Ground Truth Characters")
axes[1, 1].set_ylabel("Count")

# Plot Accuracy by Thickness
sns.lineplot(
    ax=axes[2, 0],
    x="thickness",
    y="is_prediction_correct",
    data=cleaned_data,
    estimator=lambda x: x.mean(),
    marker="o",
    color="deepskyblue",
)
axes[2, 0].set_title("Accuracy by Thickness")
axes[2, 0].set_xlabel("Thickness")
axes[2, 0].set_ylabel("Accuracy")


plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust subplots to fit into figure area.
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Extract ground truth and predictions
ground_truth = cleaned_data["gt"]
predictions = cleaned_data["predicted"]

# Define the order of labels based on the word "Acknowledgement" and any extra characters
desired_order = list(
    "Acknowledgement"
)  # Ensure it's in lowercase if your data is in lowercase
all_labels = np.unique(np.concatenate((ground_truth, predictions)))
extra_labels = [label for label in all_labels if label not in desired_order]
final_labels = desired_order + extra_labels

# Create the confusion matrix with the specified label order
conf_matrix = confusion_matrix(ground_truth, predictions, labels=final_labels)

# Plot the confusion matrix with a more professional appearance
plt.figure(figsize=(14, 12))
sns.set(font_scale=1.4)  # Increase font size for readability
heatmap = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=final_labels,
    yticklabels=final_labels,
    cbar_kws={"label": "Frequency"},
)
plt.title(f"Confusion Matrix -- {WORD}", fontsize=18, fontweight="bold")
plt.xlabel("Predicted Label", fontsize=14)
plt.ylabel("True Label", fontsize=14)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()

# Export

In [1]:
import pandas as pd
import json
import os
import re

# Define the words
WORDs = [
    "Acknowledgement",
    "Subdermatoglyphic",
    "tHyUiKaRbNqWeOpXcZvM",
]

# Initialize an empty list to store DataFrames
all_data_frames = []

# Loop through each word
for WORD in WORDs:
    gt_data = pd.read_json(f"./images/{WORD}/configurations.json")

    # Generate model output file paths and read the content if the file exists
    gt_data["model-output-file"] = gt_data["image_path"].apply(
        lambda x: x.replace(".png", "") + "-claude-3-sonnet-20240229-output.md"
    )
    gt_data["model-output-raw"] = gt_data["model-output-file"].apply(
        lambda x: (open(x, "r").read() if os.path.exists(x) else None)
    )

    # Drop rows with missing sonnet output
    gt_data = gt_data.dropna(subset=["model-output-raw"])

    # Extract marked text
    def extract_marked_text(text):
        no_circled_letter_patterns = [
            "no letter being circled",
            "no individual letter",
            "no circles or other annotations",
            "no circled letter",
            "no letters being circled",
        ]
        if any(phrase in text.lower() for phrase in no_circled_letter_patterns):
            return "none"

        patterns = [
            r"the letter being circled in the [^\.]* is ['\"]?([a-zA-Z])['\"]?",
            r"['\"]([a-zA-Z])['\"]",
        ]
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).lower()

        return "marker_not_found"

    gt_data["predicted"] = gt_data["model-output-raw"].apply(extract_marked_text)

    # Calculate ground truth and correctness
    gt_data["gt"] = gt_data.apply(
        lambda row: row["word"][row["circle_index"]].lower(), axis=1
    )
    gt_data["is_prediction_correct"] = gt_data["gt"] == gt_data["predicted"]
    gt_data["word_label"] = WORD  # Add a column to identify the word

    # Append to the list
    all_data_frames.append(gt_data)

# Concatenate all DataFrames into one
final_data_frame = pd.concat(all_data_frames, ignore_index=True)
final_data_frame

Unnamed: 0,word,font_path,circle_index,thickness,scale_factor,padding,x_offset,y_offset,canvas_width,canvas_height,final_width,final_height,image_path,model-output-file,model-output-raw,predicted,gt,is_prediction_correct,word_label
0,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,25,0,0,10,2,512,512,./images/Acknowledgement/text_image_2445b2a9-9...,./images/Acknowledgement/text_image_2445b2a9-9...,"The letter ""A"" is being circled in the word ""A...",a,a,True,Acknowledgement
1,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,50,0,0,10,2,512,512,./images/Acknowledgement/text_image_3bac479b-3...,./images/Acknowledgement/text_image_3bac479b-3...,The letter being circled in the image is 'A' w...,a,a,True,Acknowledgement
2,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,100,0,0,10,2,512,512,./images/Acknowledgement/text_image_7ddbe27c-5...,./images/Acknowledgement/text_image_7ddbe27c-5...,"The letter being circled in the word ""Acknowle...",t,a,False,Acknowledgement
3,Acknowledgement,fonts/Helvetica.ttf,0,4,1.4,200,0,0,10,2,512,512,./images/Acknowledgement/text_image_f22668e3-c...,./images/Acknowledgement/text_image_f22668e3-c...,"The letter 'A' is being circled in the word ""A...",a,a,True,Acknowledgement
4,Acknowledgement,fonts/Helvetica.ttf,0,5,1.4,25,0,0,10,2,512,512,./images/Acknowledgement/text_image_4181a30f-3...,./images/Acknowledgement/text_image_4181a30f-3...,"The letter being circled in the word ""Acknowle...",t,a,False,Acknowledgement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,5,1.4,200,0,0,10,2,512,512,./images/tHyUiKaRbNqWeOpXcZvM/text_image_42f35...,./images/tHyUiKaRbNqWeOpXcZvM/text_image_42f35...,The letter being circled in the image is 'M'.,m,m,True,tHyUiKaRbNqWeOpXcZvM
1244,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,6,1.4,25,0,0,10,2,512,512,./images/tHyUiKaRbNqWeOpXcZvM/text_image_1418c...,./images/tHyUiKaRbNqWeOpXcZvM/text_image_1418c...,The letter 'M' is being circled in the given s...,m,m,True,tHyUiKaRbNqWeOpXcZvM
1245,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,6,1.4,50,0,0,10,2,512,512,./images/tHyUiKaRbNqWeOpXcZvM/text_image_8e326...,./images/tHyUiKaRbNqWeOpXcZvM/text_image_8e326...,The letter 'M' is being circled in the given t...,m,m,True,tHyUiKaRbNqWeOpXcZvM
1246,tHyUiKaRbNqWeOpXcZvM,"fonts/OpenSans-VariableFont_wdth,wght.ttf",19,6,1.4,100,0,0,10,2,512,512,./images/tHyUiKaRbNqWeOpXcZvM/text_image_94466...,./images/tHyUiKaRbNqWeOpXcZvM/text_image_94466...,The letter being circled in the given text str...,t,m,False,tHyUiKaRbNqWeOpXcZvM


In [2]:
final_data_frame["Model"] = ["Sonnet"] * len(final_data_frame)

In [3]:
final_data_frame.to_pickle("./data/Sonnet.pkl")