In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [5]:
# Input CSV locations (relative paths, so they resolve against the kernel's
# working directory). The first file is loaded as df1, the second as df2.
CSV_ONE_PATH, CSV_TWO_PATH = (
    "test_delegation_model.csv",   # rows tagged "model_pred" downstream
    "test_human_annotations.csv",  # rows tagged "human_anno" downstream
)

In [6]:
# Load both result tables with pandas' default parsing options.
# df1 <- CSV_ONE_PATH, df2 <- CSV_TWO_PATH (read in that order).
df1 = pd.read_csv(filepath_or_buffer=CSV_ONE_PATH)
df2 = pd.read_csv(filepath_or_buffer=CSV_TWO_PATH)

In [7]:
# Give both tables the same three helper columns so they can be stacked later:
#   source - which table the row came from
#   y_true - ground-truth class index (copied from true_label_index)
#   y_pred - the label being scored: top1_index for df1, human_label_index for df2
# assign() returns a new frame, so the frames loaded from CSV are not modified
# in place; df1 / df2 are simply rebound to the extended copies.
df1 = df1.assign(
    source="model_pred",
    y_true=df1["true_label_index"],
    y_pred=df1["top1_index"],
)

df2 = df2.assign(
    source="human_anno",
    y_true=df2["true_label_index"],
    y_pred=df2["human_label_index"],
)

In [8]:
# Columns shared by both tables; everything else is dropped before stacking.
common_cols = ["image_path", "true_label_index", "true_label_name", "y_true", "y_pred", "source"]

# reindex(columns=...) selects the columns in this order and fills any column
# a table lacks with NaN instead of raising.
frames = [frame.reindex(columns=common_cols) for frame in (df1, df2)]

# Stack df1's rows first, then df2's, with a fresh 0..n-1 index.
df_combined = pd.concat(frames, ignore_index=True)

In [9]:
# Rows missing either label cannot be scored, so drop them; the two label
# columns are then cast to plain ints for the metric functions.
df_combined = (
    df_combined
    .dropna(subset=["y_true", "y_pred"])
    .astype({"y_true": int, "y_pred": int})
)

In [11]:
# Flag every row whose image_path occurs more than once (keep=False marks all
# occurrences, not just the repeats) and list them grouped by path.
dup_mask = df_combined.duplicated(subset=["image_path"], keep=False)
df_dups = df_combined.loc[dup_mask].sort_values(by="image_path")

if len(df_dups) == 0:
    print("\nNo duplicate image_path entries found.")
else:
    print("\n=== Duplicate image_path rows found ===")
    print(df_dups.to_string(index=False))


=== Duplicate image_path rows found ===
                                                                                                                                                        image_path  true_label_index       true_label_name  y_true  y_pred     source
/Users/kaelytham/Documents/RUG/Yr2 Sem 1B/Hybrid Intelligence/PJ code/hybrid_intelligence_project/test_and_evaluation/data/gtsrb/GTSRB/Final_Test/Images/12628.ppm                 7 Speed limit (100km/h)       7       7 human_anno
/Users/kaelytham/Documents/RUG/Yr2 Sem 1B/Hybrid Intelligence/PJ code/hybrid_intelligence_project/test_and_evaluation/data/gtsrb/GTSRB/Final_Test/Images/12628.ppm                 7 Speed limit (100km/h)       7       7 human_anno


In [12]:
# Keep exactly one row per image so that no example is counted twice in the
# metrics computed from df_combined.
#
# Duplicates are keyed on image_path, the same key the duplicate check uses
# (duplicated(subset=["image_path"])). A bare drop_duplicates() compares every
# column, so two rows for the same image that differ in y_pred or source would
# both survive. Exact duplicate rows are still removed, since they share an
# image_path.
#
# keep="first" retains the earliest row in df_combined's current order.
# NOTE(review): if an image can legitimately appear in both source tables,
# confirm that the earlier row is the one that should be scored.
df_combined = (
    df_combined
    .drop_duplicates(subset=["image_path"], keep="first")
    .reset_index(drop=True)
)

In [13]:
# Score the stacked table: model predictions for rows from csv_one and human
# labels for rows from csv_two, all against the ground-truth class index.
acc = accuracy_score(y_true=df_combined["y_true"], y_pred=df_combined["y_pred"])
f1 = f1_score(
    y_true=df_combined["y_true"],
    y_pred=df_combined["y_pred"],
    average="macro",  # unweighted mean of per-class F1
)

print(
    "=== Combined (csv_one uses top1, csv_two uses human_label) ===",
    f"Rows: {len(df_combined)}",
    f"Accuracy: {acc:.6f}",
    f"F1 (macro): {f1:.6f}",
    sep="\n",
)

=== Combined (csv_one uses top1, csv_two uses human_label) ===
Rows: 12630
Accuracy: 0.961203
F1 (macro): 0.933092


In [14]:
# Score df1 on its own (y_pred is top1_index there), for comparison with the
# combined figures.
acc1 = accuracy_score(y_true=df1["y_true"], y_pred=df1["y_pred"])
f1_1 = f1_score(
    y_true=df1["y_true"],
    y_pred=df1["y_pred"],
    average="macro",  # unweighted mean of per-class F1
)

report_lines = [
    "=== resnet keep ===",
    f"Examples: {len(df1)}",
    f"Accuracy: {acc1:.6f}",
    f"F1 (macro): {f1_1:.6f}",
]
print("\n".join(report_lines))

=== resnet keep ===
Examples: 11955
Accuracy: 0.964701
F1 (macro): 0.935339
