In [25]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [41]:
# Input tables for the evaluation. Paths are relative to the notebook's
# working directory.
CSV_ONE_PATH, CSV_TWO_PATH, CSV_THREE_PATH = (
    "test_delegation_model.csv",   # loaded as df1; its `top1_index` is scored as "resnet keep"
    "test_human_annotations.csv",  # loaded as df2; its `human_label_index` is scored as "human"
    "test_delegation_human.csv",   # loaded as df3; its `top1_index` is scored as "human pre anno"
)

In [42]:
# Read the three input tables in one pass; the unpacking order matches
# CSV_ONE_PATH -> df1, CSV_TWO_PATH -> df2, CSV_THREE_PATH -> df3.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (CSV_ONE_PATH, CSV_TWO_PATH, CSV_THREE_PATH)
)

In [43]:
# Give every table the same three helper columns so they can be scored
# uniformly later:
#   source - tag describing where the prediction column came from
#   y_true - ground-truth class index (copied from `true_label_index`)
#   y_pred - predicted class index (`top1_index` or `human_label_index`)
# `assign` returns a new frame, so the frames loaded from disk are not mutated.

# df1: predictions taken from `top1_index`.
df1 = df1.assign(
    source="model_pred",
    y_true=df1["true_label_index"],
    y_pred=df1["top1_index"],
)

# df2: predictions taken from `human_label_index`.
df2 = df2.assign(
    source="human_anno",
    y_true=df2["true_label_index"],
    y_pred=df2["human_label_index"],
)

# df3: predictions taken from `top1_index`, tagged like df1.
df3 = df3.assign(
    source="model_pred",
    y_true=df3["true_label_index"],
    y_pred=df3["top1_index"],
)

In [38]:
# Columns shared by the stacked table. `reindex` keeps exactly these, in this
# order, and fills any column a frame lacks with NaN instead of raising.
common_cols = [
    "image_path",
    "true_label_index",
    "true_label_name",
    "y_true",
    "y_pred",
    "source",
]

# Stack df1 on top of df2 with a fresh 0..n-1 index. df3 is deliberately not
# part of the combined table.
df_combined = pd.concat(
    [frame.reindex(columns=common_cols) for frame in (df1, df2)],
    ignore_index=True,
)

In [30]:
# Discard rows with a missing label or prediction, then cast both label
# columns to plain integers (NaNs would otherwise force a float dtype).
df_combined = (
    df_combined
    .dropna(subset=["y_true", "y_pred"])
    .astype({"y_true": int, "y_pred": int})
)

In [31]:
# Flag every row whose image_path occurs more than once (keep=False marks all
# members of a duplicate group, not only the repeats).
dup_mask = df_combined.duplicated(subset=["image_path"], keep=False)

# Sort so that rows sharing an image_path are printed next to each other.
df_dups = df_combined.loc[dup_mask].sort_values(by="image_path")

if df_dups.empty:
    print("\nNo duplicate image_path entries found.")
else:
    print("\n=== Duplicate image_path rows found ===")
    print(df_dups.to_string(index=False))


No duplicate image_path entries found.


In [32]:
# Deduplicate on `image_path`, the same key the duplicate report uses.
# Without `subset`, only rows identical in every column are dropped, so an
# image present in both sources (different `source` / `y_pred`) would be
# reported as a duplicate yet still be counted twice in the metrics.
# keep="first" retains the earliest row in concatenation order.
# NOTE(review): that means a df1 row wins over a df2 row for the same image -
# confirm this is the intended precedence.
df_combined = (
    df_combined
    .drop_duplicates(subset=["image_path"], keep="first")
    .reset_index(drop=True)
)

In [33]:
# Score the stacked table (df1 rows use top1_index, df2 rows use
# human_label_index). Macro F1 averages per-class F1 without class weighting.
acc = accuracy_score(y_true=df_combined["y_true"], y_pred=df_combined["y_pred"])
f1 = f1_score(
    y_true=df_combined["y_true"],
    y_pred=df_combined["y_pred"],
    average="macro",
)

print(
    "=== Combined (csv_one uses top1, csv_two uses human_label) ===",
    f"Rows: {len(df_combined)}",
    f"Accuracy: {acc:.6f}",
    f"F1 (macro): {f1:.6f}",
    sep="\n",
)

=== Combined (csv_one uses top1, csv_two uses human_label) ===
Rows: 12630
Accuracy: 0.958907
F1 (macro): 0.932929


In [34]:
# Metrics for df1 alone: `top1_index` against `true_label_index`.
acc1 = accuracy_score(y_true=df1["y_true"], y_pred=df1["y_pred"])
f1_1 = f1_score(y_true=df1["y_true"], y_pred=df1["y_pred"], average="macro")

print(
    "=== resnet keep ===",
    f"Examples: {len(df1)}",
    f"Accuracy: {acc1:.6f}",
    f"F1 (macro): {f1_1:.6f}",
    sep="\n",
)

=== resnet keep ===
Examples: 12415
Accuracy: 0.962384
F1 (macro): 0.934682


In [40]:
# Metrics for df2 alone: `human_label_index` against `true_label_index`.
acc2 = accuracy_score(y_true=df2["y_true"], y_pred=df2["y_pred"])
f1_2 = f1_score(y_true=df2["y_true"], y_pred=df2["y_pred"], average="macro")

print(
    "=== human ===",
    f"Examples: {len(df2)}",
    f"Accuracy: {acc2:.6f}",
    f"F1 (macro): {f1_2:.6f}",
    sep="\n",
)

=== human ===
Examples: 215
Accuracy: 0.758140
F1 (macro): 0.714268


In [44]:
# Metrics for df3 alone: `top1_index` against `true_label_index`.
# NOTE(review): presumably df3 holds the model's predictions on the same
# samples that df2 has human labels for (both report 215 examples) - confirm.
acc3 = accuracy_score(y_true=df3["y_true"], y_pred=df3["y_pred"])
f1_3 = f1_score(y_true=df3["y_true"], y_pred=df3["y_pred"], average="macro")

print(
    "=== human pre anno===",
    f"Examples: {len(df3)}",
    f"Accuracy: {acc3:.6f}",
    f"F1 (macro): {f1_3:.6f}",
    sep="\n",
)

=== human pre anno===
Examples: 215
Accuracy: 0.506977
F1 (macro): 0.293149
