# Model output and overbite classification testing

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import cohen_kappa_score

# === Paths ===
# Test data (translate key), model predictions, and the destination of the
# patient-level summary that is written further down in this cell.
test_path = os.path.join("..", "..", "Data", "Raw Data", "2025-05-08 TRANSLATE_KEY_set7.csv")
pred_path = os.path.join("..", "Output", "Pixel Matrix", "Pixel_Matrix_With_Distance_54999_Nye.csv")
summary_path = os.path.join("..", "Output", "Overbite Detection", "patient_level_summary7.csv")

# === Load data (CSV-only now) ===
# The test file is semicolon-separated; the prediction file uses the default comma.
df_test = pd.read_csv(test_path, delimiter=";")
df_pred = pd.read_csv(pred_path)

# === Clean filenames ===
# Remove the literal ".png" in both tables so "Filename" can serve as the join key.
for frame in (df_test, df_pred):
    frame["Filename"] = frame["Filename"].str.replace(".png", "", regex=False)

# === Keep only rows where model made a prediction ===
has_prediction = df_test["Filename"].isin(df_pred["Filename"])
df_test = df_test[has_prediction].copy()

# === Merge predictions + ground truth into test data ===
prediction_columns = [
    "Filename", "X_Refined", "Y_Refined", "X_True", "Y_True",
    "X_Model", "Y_Model", "Refined_mm_Dist",
]
df = df_test.merge(df_pred[prediction_columns], on="Filename", how="left")

# === Keypoint-level mean radial error (MRE) and successful detection rate (SDR) ===

# Euclidean distance in pixels between the model keypoint and the ground truth.
df["distance_px_model"] = np.sqrt(
    (df["X_Model"] - df["X_True"])**2 +
    (df["Y_Model"] - df["Y_True"])**2
)

# Convert to millimeters (0.08 is the pixel -> mm factor used throughout this cell).
df["distance_mm_model"] = df["distance_px_model"] * 0.08

# Mean radial error (MRE): average model-to-truth distance in mm.
mre = df["distance_mm_model"].mean()

# ".2f" prints two decimals. The previous ".2" meant two *significant digits*,
# so e.g. 1.234 was shown as "1.2" and 12.3 as "1.2e+01".
print(f"\n Mean Radial Error: {mre:.2f} mm")

# Compute per-row success flags and the overall SDR at z = 0.5, 1, 2 mm.
# NOTE(review): SDR is based on "Refined_mm_Dist" while MRE above uses the
# X_Model/Y_Model distance - confirm that this asymmetry is intended.
thresholds = [0.5, 1.0, 2.0]
sdr = {}
for z in thresholds:
    # "g" keeps the fractional part of the threshold in the column name:
    # success_0.5mm, success_1mm, success_2mm (int(z) turned 0.5 into "0mm").
    col = f"success_{z:g}mm"
    # Rows with a missing distance compare as False and therefore count as misses.
    df[col] = df["Refined_mm_Dist"] <= z
    # The mean of a boolean column is the share of successful detections.
    sdr[z] = df[col].mean()

# "\n" replaces the invalid escape "\S", which printed a literal backslash.
print("\nSuccessful Detection Rate:")
for z in thresholds:
    print(f"  • ≤ {z:.1f} mm: {sdr[z]:.2%}")

# === Extract metadata ===
# Filenames are expected to look like "013FHA7K_upper_left": patient ID, jaw, side.
df["Jaw"] = df["Filename"].str.extract(r"_(upper|lower)")
df["Side"] = df["Filename"].str.extract(r"_(left|right)")
# Use the same ID pattern as the patient-level aggregation further down
# ([A-Za-z0-9]+). The upper-case-only pattern would truncate an ID that contains
# lower-case letters and pair upper/lower rows of different patients.
df["Base_ID"] = df["Filename"].str.extract(r"([A-Za-z0-9]+)")

# === Compute Y_star for upper jaw ===
# Mirror the y coordinate (1023 - y), then subtract the per-image vertical
# translation from the test data.
# NOTE(review): 1023 presumably corresponds to 1024-pixel-high images - confirm.
df["Y_flipped"] = 1023 - df["Y_Refined"]
df["Y_star"] = df["Y_flipped"] - df["Y_vertical_translate"]

# === Compute overbite per side ===
# Attach the lower-jaw Y_Refined of the same patient and side to every upper-jaw
# row; upper rows without a matching lower row get NaN (left merge).
df_upper = df[df["Jaw"] == "upper"].copy()
df_lower = df[df["Jaw"] == "lower"][["Base_ID", "Side", "Y_Refined"]].rename(columns={"Y_Refined": "Y_Refined_lower"})
df_upper = pd.merge(df_upper, df_lower, on=["Base_ID", "Side"], how="left")
# Pixel difference converted to millimeters with the 0.08 mm/pixel factor.
df_upper["overbite_mm"] = (df_upper["Y_star"] - df_upper["Y_Refined_lower"]) * 0.08

# === Classify overbite
def classify_overbite(mm):
    """Map an overbite in millimeters to its class letter.

    Returns "" for a missing value, otherwise "A" (< 1), "B" (< 2), "C" (< 3),
    "D" (< 4) or "E" (4 and above).
    """
    if pd.isna(mm):
        return ""
    # Exclusive upper bounds in ascending order; the first one that exceeds
    # the value decides the class.
    for upper_bound, label in ((1, "A"), (2, "B"), (3, "C"), (4, "D")):
        if mm < upper_bound:
            return label
    return "E"

# === Classify overbite based on mm values ===
df_upper["Predicted_Class"] = df_upper["overbite_mm"].apply(classify_overbite)

# === Average overbite per patient (include all, even NONE)
# One row per patient: mean overbite over that patient's upper-jaw rows and the
# first available Class_FINAL.
df_upper["Base_ID"] = df_upper["Filename"].str.extract(r"([A-Za-z0-9]+)")
patient_aggregations = {"overbite_mm": "mean", "Class_FINAL": "first"}
df_avg_all = df_upper.groupby("Base_ID", as_index=False).agg(patient_aggregations)

# === classify overbite for average values ===
df_avg_all["Predicted_Class"] = df_avg_all["overbite_mm"].apply(classify_overbite)
# Back-convert the averaged millimeter value to pixels.
df_avg_all["Overbite_pixel_AVG"] = df_avg_all["overbite_mm"] / 0.08


# === Match column and compare predicted class with Class_FINAL ===
def _match_flag(row):
    """Return whether the predicted class equals Class_FINAL, or "" when Class_FINAL is missing or "NONE"."""
    true_class = row["Class_FINAL"]
    if pd.isna(true_class) or true_class == "NONE":
        return ""
    return row["Predicted_Class"] == true_class


df_avg_all["Match"] = df_avg_all.apply(_match_flag, axis=1)

# === Format final patient-level summary
summary_columns = [
    "Filename", "Overbite_pixel_AVG", "Overbite_mm_AVG",
    "Predicted_Class", "True Class", "Match",
]
renamed_avg = df_avg_all.rename(columns={
    "Base_ID": "Filename",
    "overbite_mm": "Overbite_mm_AVG",
    "Class_FINAL": "True Class"
})
df_patient_summary = renamed_avg[summary_columns]

# === Add Overbite_AVG from test data for comparison
# Take one non-missing Overbite_AVG per patient from the upper-jaw rows.
upper_rows = df[df["Jaw"] == "upper"]
df_overbite_avg = (
    upper_rows[["Base_ID", "Overbite_AVG"]]
    .dropna()
    .drop_duplicates(subset="Base_ID")
)
df_patient_summary = df_patient_summary.merge(
    df_overbite_avg.rename(columns={"Base_ID": "Filename"}),
    on="Filename",
    how="left",
)

# === Difference between predicted and annotated overbite
df_patient_summary["Overbite_pixel_DIFF"] = (
    df_patient_summary["Overbite_pixel_AVG"] - df_patient_summary["Overbite_AVG"]
)

# === Reorder columns
final_column_order = [
    "Filename", "Overbite_pixel_AVG", "Overbite_AVG", "Overbite_pixel_DIFF",
    "Overbite_mm_AVG", "Predicted_Class", "True Class", "Match",
]
df_patient_summary = df_patient_summary[final_column_order]

# === Save final summary CSV
# Create the output folder first so to_csv does not fail when it is missing.
summary_dir = os.path.dirname(summary_path)
if summary_dir:
    os.makedirs(summary_dir, exist_ok=True)
df_patient_summary.to_csv(summary_path, index=False)
print(f"Patient-level summary saved to: {summary_path}")

# === Accuracy and kappa (on evaluable cases only)
# Evaluable = a true class is present, it is not "NONE", and a class was predicted.
df_eval = df_patient_summary[df_patient_summary["True Class"].notna() & (df_patient_summary["True Class"] != "NONE")]
df_eval = df_eval[df_eval["Predicted_Class"] != ""]

# Derive both the accuracy and the reported count from the same comparison
# instead of summing the mixed-type (bool / "") "Match" column.
is_correct = df_eval["Predicted_Class"] == df_eval["True Class"]
n_correct = int(is_correct.sum())

if df_eval.empty:
    # Nothing to score: report NaN rather than calling the metric on empty input.
    accuracy = float("nan")
    kappa = float("nan")
else:
    accuracy = is_correct.mean()
    # Quadratic weights penalise a prediction more the further it is from the
    # true class in the ordered label list A..E.
    kappa = cohen_kappa_score(
        df_eval["True Class"],
        df_eval["Predicted_Class"],
        labels=["A", "B", "C", "D", "E"],
        weights="quadratic"
    )

print(f"\nToothset Classification Accuracy: {accuracy:.2%} ({n_correct} out of {len(df_eval)})")
print(f"Weighted Cohen’s Kappa: {kappa:.4f}")
print(f"Total patients evaluated: {len(df_eval)}")
print(f"Patients excluded (NONE): {df_patient_summary['True Class'].eq('NONE').sum()}")

# Results
2025-05-08 TRANSLATE_KEY_set1.csv

| Field       | Value                 |
| ----------- | --------------------- |
| `Toothset Classification Accuracy`  | `94.37% (67 out of 71)` |
| `Weighted Cohen’s Kappa` | `0.9852`                 |
| `Total patients evaluated`  | `71` |
| `Patients excluded (NONE)` | `3`                 |

2025-05-08 TRANSLATE_KEY_set2.csv

| Field       | Value                 |
| ----------- | --------------------- |
| `Toothset Classification Accuracy`  | `95.77% (68 out of 71)` |
| `Weighted Cohen’s Kappa` | `0.9879`                 |
| `Total patients evaluated`  | `71` |
| `Patients excluded (NONE)` | `3`                 |

2025-05-08 TRANSLATE_KEY_set3.csv

| Field       | Value                 |
| ----------- | --------------------- |
| `Toothset Classification Accuracy`  | `97.22% (70 out of 72)` |
| `Weighted Cohen’s Kappa` | `0.9936`                 |
| `Total patients evaluated`  | `72` |
| `Patients excluded (NONE)` | `2`                 |


2025-05-08 TRANSLATE_KEY_set4.csv

| Field                                      | Value                      |
| ------------------------------------------ | -------------------------- |
| `Toothset Classification Accuracy`         | `95.77% (68 out of 71)`    |
| `Weighted Cohen’s Kappa`                   | `0.9905`                   |
| `Total patients evaluated`                 | `71`                       |
| `Patients excluded (NONE)`                 | `3`                        |

2025-05-08 TRANSLATE_KEY_set5.csv

| Field                                      | Value                      |
| ------------------------------------------ | -------------------------- |
| `Toothset Classification Accuracy`         | `95.95% (71 out of 74)`    |
| `Weighted Cohen’s Kappa`                   | `0.9896`                   |
| `Total patients evaluated`                 | `74`                       |
| `Patients excluded (NONE)`                 | `1`                        |

2025-05-08 TRANSLATE_KEY_set6.csv

| Field                                      | Value                      |
| ------------------------------------------ | -------------------------- |
| `Toothset Classification Accuracy`         | `97.30% (72 out of 74)`    |
| `Weighted Cohen’s Kappa`                   | `0.9938`                   |
| `Total patients evaluated`                 | `74`                       |
| `Patients excluded (NONE)`                 | `1`                        |

2025-05-08 TRANSLATE_KEY_set7.csv

| Field                                      | Value                      |
| ------------------------------------------ | -------------------------- |
| `Toothset Classification Accuracy`         | `94.52% (69 out of 73)`    |
| `Weighted Cohen’s Kappa`                   | `0.9859`                   |
| `Total patients evaluated`                 | `73`                       |
| `Patients excluded (NONE)`                 | `2`                        |

2025-05-08 TRANSLATE_KEY_set8.csv

| Field                                      | Value                      |
| ------------------------------------------ | -------------------------- |
| `Toothset Classification Accuracy`         | `94.52% (69 out of 73)`    |
| `Weighted Cohen’s Kappa`                   | `0.9881`                   |
| `Total patients evaluated`                 | `73`                       |
| `Patients excluded (NONE)`                 | `2`                        |


# MRE and SDR computation
MRE is calculated with the following formula:
\begin{align*}
MRE = \frac{1}{N} \sum_{i=1}^{N} D_i
\end{align*}

Here $D_i$ is the distance in mm between the predicted keypoint for image $i$, before the pixel matrix refinement function is applied, and the ground truth. N is the number of images that the predictions are made on, which in our case is 300.

SDR is calculated with the following formula:
\begin{align*}
SDR = \frac{K}{N} \cdot 100\%
\end{align*}

Here K is the number of correctly predicted keypoints within the error range z, a predetermined distance that the predicted keypoint is allowed to deviate from the ground truth. The values of z were chosen by Ruben as 0.5, 1, and 2 mm. N is the number of images that the predictions are made on, which, as noted above, is 300 in our case. Finally, we multiply by 100 to get the rate in percent.


To determine whether a predicted keypoint is correctly placed with respect to z, we calculate the Euclidean distance between the predicted keypoint and the ground truth and convert it to mm. This lets us simply add another column that is true if the point lies within the Euclidean radius z and false if not. Because true and false are also interpreted as 1 and 0, we can take the mean of the entire column and multiply it by 100\% to obtain the desired SDR.

# Calculations

Patient = 013FHA7K

\------------------------------

Filename	013FHA7K_upper_left

Y_Refined	276

\------------------------------

Y_flipped	747

Y_vertical_translate	347

Y_target	400

Class_FINAL	C

\------------------------------

Y_flipped = 1023 - Y_Refined = 1023 - 276 = 747

Y_star = Y_flipped - Y_vertical_translate = 747 - 347 = 400


| Field       | Value                 |
| ----------- | --------------------- |
| `Filename`  | `013FHA7K_lower_left` |
| `Y_Refined` | `369`                 |

overbite_pixels = Y_star - Y_Refined_lower = 400 - 369 = 31

overbite_mm = 31 * 0.08 = 2.48


013FHA7K_upper_right.png

| Field      | Value |
| ---------- | ----- |
| Y\_Refined | `276` |

| Field                  | Value |
| ---------------------- | ----- |
| Y\_flipped             | `747` |
| Y\_vertical\_translate | `342` |


Y_flipped = 1023 - 276 = 747

Y_star = 747 - 342 = 405


013FHA7K_lower_right.png

overbite_pixels = 405 - 369 = 36

overbite_mm = 36 * 0.08 = 2.88


avg_overbite = (2.48 + 2.88) / 2 = 2.68 mm

Predicted_Class = "C"  # Because 2 mm ≤ x < 3 mm
