In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from IPython.display import display
import json
import datetime
import math

from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score, precision_recall_fscore_support, f1_score, cohen_kappa_score, average_precision_score

In [None]:
# Model/run identifiers. STAGE and MOD_PREFIX spell out the same values that are
# hard-coded in the scores file name 'mod_smallimg3_ss_aggregated_scores.csv' read
# further down; NEPOCH is not referenced in the visible cells.
STAGE = 'ss'
MOD_PREFIX = "mod_smallimg3"
NEPOCH = 'latest'


# Project paths, all taken from the local `config` module.
DATAPATH = config.OUTPUT_PATH
base_dir = config.RAW_DATA_PATH
img_path = config.SCHULTHESS_DATAPATH
proc_dir = config.PROC_DATA_PATH

# Alternative run selection for feature == 'rawq' (kept for reference):
# folder = "2025-11-19_hdbscan"
# run = "run10"  #"run175"

# Which clustering input to analyse; the loading cell branches on this value
# ('rawq' or 'img_features').
feature = 'img_features'
# Run selection for the image features:
folder = "2025-08-11_hdbscan"
folder_date = folder.split('_')[0]  # part of the folder name before the first '_'
run = "run22"

# Not referenced in the visible cells -- presumably used elsewhere; TODO confirm.
anomalyscore_metric = "centre_mean"
cluster_col = "cluster_label"

In [None]:
today = datetime.date.today()
folder_date = folder.split('_')[0]

# Locate the UMAP + HDBSCAN result table for the selected feature type.
# Directory layout and file naming differ between the two pipelines.
if feature == 'rawq':
    filepath = os.path.join(proc_dir, folder, "pipeline", run)
    hdbscan_filename = f'pipeline_{run}_umap_hdbscan_scaled.csv'
elif feature == 'img_features':
    filepath = os.path.join(proc_dir, "radiographic_features", folder, run)
    hdbscan_filename = f'{folder}_{run}_umap_hdbscan_scaled.csv'
else:
    # Fail here rather than with a NameError on `hdbscan_df` in a later cell.
    raise ValueError(
        f"Unknown feature {feature!r}; expected 'rawq' or 'img_features'"
    )
hdbscan_df = pd.read_csv(os.path.join(filepath, hdbscan_filename))

# Two knee annotation tables and the MRI table, all read from the raw-data directory.
kl = pd.read_csv(os.path.join(base_dir,  "brul_knee_annotations.csv"))
kl2 = pd.read_csv(os.path.join(base_dir, "rosand1_knee_annotations.csv"))
mri = pd.read_csv(os.path.join(base_dir, '2025-09-25_mrismall.csv'))

# Optional model metadata for the 'rawq' pipeline (currently not loaded):
# with open(os.path.join(filepath, f'pipeline_{run}_umap_hdbscan_scaled_model_info.json')) as f:
#     model_info= json.load(f)

In [None]:
# Number of rows per cluster label, as a two-column table.
hdbscan_df['cluster_label'].value_counts().reset_index()

In [None]:
# Attach the first annotation table (hdbscan_df.id == kl.name). The left join keeps
# every clustered row; validate='one_to_one' raises if either key is duplicated.
# NOTE: the cell rebinds hdbscan_df, so running it twice merges the table twice.
hdbscan_df = hdbscan_df.merge(kl, left_on = 'id', right_on='name', how='left', validate='one_to_one')

In [None]:
# Attach the second annotation table the same way. Columns it shares with the frame
# get the suffix '2' (e.g. 'KL-Score2'); the existing columns keep their names.
hdbscan_df = hdbscan_df.merge(kl2, left_on = 'id', right_on='name', how='left', validate='one_to_one', suffixes=('', '2'))

In [None]:
# Drop the stray 'Unnamed: 0' column if it exists (typically an index column
# written to the CSV). errors='ignore' makes this a no-op when the column is
# absent, without hiding unrelated errors the way a bare `except: pass` would.
hdbscan_df = hdbscan_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Columns available after merging both annotation tables.
hdbscan_df.columns
# TODO: the MRI data still needs to be added to this table.

In [None]:
# Aggregated anomaly scores for the model selected in the configuration cell.
# The path is built from STAGE and MOD_PREFIX instead of repeating their values,
# so switching the model only needs a change in one place.
df = pd.read_csv(
    os.path.join(DATAPATH, 'outputs', 'dfs', STAGE, f'{MOD_PREFIX}_{STAGE}_aggregated_scores.csv')
)

In [None]:
# Keep only the part of each id after the last '/' (the file name, if the id is a path).
df['id_temp'] = df['id'].apply(lambda x: x.split('/')[-1])

In [None]:
# Cut each name at its FIRST '.', so the id loses its extension.
# NOTE: a name containing further dots is truncated at the first one as well.
df['id'] = df['id_temp'].apply(lambda x: x.split('.')[0])

In [None]:
# Remove the helper column. errors='ignore' keeps the cell re-runnable: the
# in-place variant raised a KeyError once 'id_temp' had already been dropped.
df = df.drop(columns=['id_temp'], errors='ignore')

In [None]:
# Preview of the clustered table with both annotation tables attached.
hdbscan_df.head()

In [None]:
# Sanity check: rows that have no cluster label (presumably expected to be empty -- TODO confirm).
hdbscan_df[hdbscan_df['cluster_label'].isna()]

In [None]:
# Attach the anomaly scores by id. With the left join, rows without a score entry
# keep NaN in the score columns; validate='one_to_one' raises on duplicated ids.
hdbscan_df = hdbscan_df.merge(df, on='id', how='left', validate='one_to_one')

In [None]:
# Compare the sizes of the merged table and the scores table.
print(hdbscan_df.shape)
print(df.shape)


In [None]:
# Columns available after the scores have been merged in.
hdbscan_df.columns

In [None]:
# if 'KL-Score'  is na, fill with 'KL-Score2'
hdbscan_df['KL-Score'] = hdbscan_df['KL-Score'].fillna(hdbscan_df['KL-Score2'])
hdbscan_df['KL-Score2'] = hdbscan_df['KL-Score2'].fillna(hdbscan_df['KL-Score'])

In [None]:
# Rows that still have no KL-Score after the cross-fill, i.e. missing in both columns.
hdbscan_df[hdbscan_df['KL-Score'].isna()]

In [None]:
# Mark rows without any KL grade with the sentinel value -1.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form emits a FutureWarning in
# pandas 2.x and no longer updates the DataFrame under Copy-on-Write.
hdbscan_df['KL-Score'] = hdbscan_df['KL-Score'].fillna(-1)
hdbscan_df['KL-Score2'] = hdbscan_df['KL-Score2'].fillna(-1)

# Quick KL-comparison

In [None]:
# Distribution of KL-Score, ordered by grade (-1 is the sentinel for "no grade").
hdbscan_df['KL-Score'].value_counts().sort_index()

In [None]:
# Distribution of KL-Score2, ordered by grade (-1 is the sentinel for "no grade").
hdbscan_df['KL-Score2'].value_counts().sort_index()

# Quick Cluster Label Overview

In [None]:
# Number of rows per cluster label, ordered by label.
hdbscan_df['cluster_label'].value_counts().sort_index() 

# Comparison with KL-Score (AUC / Spearman rank correlation, SRC)

In [None]:
def get_metrics(df, score, label_name = 'KL-Score'):
    """Spearman correlation and four ROC AUCs of a score column against a graded label.

    The label column is binarised at four cut-offs and, for each of them, the
    score column is evaluated as a ranking of the positive class:

    * ``auc``      -- positives are rows with label > 0
    * ``auc_mid``  -- positives are rows with label > 1
    * ``auc_mid2`` -- positives are rows with label > 2
    * ``auc_sev``  -- positives are rows with label == 4

    The input frame is not modified.

    Args:
        df: DataFrame holding both columns.
        score: name of the column used as the continuous/ordinal score.
        label_name: name of the column used as the reference label.

    Returns:
        Tuple ``(spearman_rho, auc, auc_mid, auc_mid2, auc_sev)``. An AUC is
        undefined when a cut-off leaves only one class; sklearn is expected to
        warn and yield NaN in that case (verify against the installed version).
    """
    labels = df[label_name]
    score_values = np.array(df[score])

    res = stats.spearmanr(df[score].tolist(), labels.tolist())

    def _auc_for(is_positive):
        # Area under the ROC curve for one binarisation of the label column.
        binary_label = np.asarray(is_positive, dtype=int)
        fpr, tpr, _ = roc_curve(binary_label, score_values)
        return metrics.auc(fpr, tpr)

    return (
        res[0],
        _auc_for(labels > 0),
        _auc_for(labels > 1),
        _auc_for(labels > 2),
        _auc_for(labels == 4),
    )

## Compare Cluster Label to KL-Score

In [None]:
# Restrict to rows that carry a grade in both KL columns. Missing grades were
# replaced by the sentinel -1 earlier, so a plain dropna would not catch them.
graded_in_both = (hdbscan_df['KL-Score'] != -1) & (hdbscan_df['KL-Score2'] != -1)
hdbscan_df_dropna = hdbscan_df.copy().loc[graded_in_both]

# Cluster label as the score, each KL column in turn as the reference label.
metrics_clkl, metrics_clkl2 = (
    get_metrics(hdbscan_df_dropna, 'cluster_label', label_name=kl_column)
    for kl_column in ('KL-Score', 'KL-Score2')
)

In [None]:
# Cluster label vs. KL-Score: (spearman, auc, auc_mid, auc_mid2, auc_sev).
print(metrics_clkl)

In [None]:
# Cluster label vs. KL-Score2: (spearman, auc, auc_mid, auc_mid2, auc_sev).
print(metrics_clkl2)

'Osteophytes', 'Joint-Space-Narrowing', 'pain'

In [None]:
def evaluate_all_as(df, as_cols, label_name='KL-Score'):
    """Run ``get_metrics`` for several score columns and summarise the results.

    Args:
        df: DataFrame holding the score columns and the label column.
        as_cols: iterable of score column names to evaluate.
        label_name: name of the reference label column.

    Returns:
        DataFrame with one row per score column and the columns 'spearmanr',
        'auc', 'auc_mid', 'auc_mid2' and 'auc_sev'. For every metric, two extra
        columns '<metric>_mean' and '<metric>_std' hold the mean and standard
        deviation over all rows, repeated on every row.
    """
    metric_names = ['spearmanr', 'auc', 'auc_mid', 'auc_mid2', 'auc_sev']

    # One tuple of metrics per score column; transposed so that rows are score columns.
    per_column = {name: get_metrics(df, name, label_name=label_name) for name in as_cols}
    results_df = pd.DataFrame(per_column).T
    results_df.columns = metric_names

    # Ensemble statistics across the score columns, broadcast to every row.
    for name in metric_names:
        values = results_df[name]
        results_df[f'{name}_mean'] = values.mean()
        results_df[f'{name}_std'] = values.std()

    return results_df


## Compare KL-Score to anomaly scores (AS)

In [None]:
hdbscan_df.columns  # not displayed: only the last expression of a cell is rendered
# Score columns of the scores table: every column except the first one and the
# last three (presumably the id and aggregate columns -- TODO confirm).
as_col = df.iloc[:, 1:-3].columns
print(as_col)

In [None]:
as_col = df.iloc[:, 1:-3].columns

# Evaluate every score column against both KL columns (Spearman plus the four
# AUCs from get_metrics); the ensemble mean/std come from evaluate_all_as.
# Rows with a missing value in any score column are excluded first.
scored_rows = hdbscan_df_dropna.dropna(subset=as_col)
results_df = evaluate_all_as(scored_rows, as_col, label_name='KL-Score')
results_df2 = evaluate_all_as(scored_rows, as_col, label_name='KL-Score2')

In [None]:
def ensemble_results(df):
    """Print the ensemble mean and standard deviation of every metric.

    Args:
        df: DataFrame with the columns '<metric>_mean' and '<metric>_std' for
            the metrics 'spearmanr', 'auc', 'auc_mid', 'auc_mid2' and 'auc_sev'.
            Only the first row is read.

    The Spearman correlation is printed as is, rounded to 3 decimals; the AUCs
    are printed as percentages rounded to 1 decimal.
    """
    # (printed name, column prefix, scale factor, decimals)
    layout = [
        ("SRC", "spearmanr", 1, 3),
        ("AUC", "auc", 100, 1),
        ("AUC Mid", "auc_mid", 100, 1),
        ("AUC Mid2", "auc_mid2", 100, 1),
        ("AUC Sev", "auc_sev", 100, 1),
    ]
    for title, prefix, scale, decimals in layout:
        mean = np.round(df[f"{prefix}_mean"].iloc[0] * scale, decimals)
        std = np.round(df[f"{prefix}_std"].iloc[0] * scale, decimals)
        # No quotes inside the replacement fields: reusing the outer quote
        # character there is a SyntaxError before Python 3.12.
        print(f"{title} mean: {mean} with std {std}")

In [None]:
# Anomaly scores vs. KL-Score: ensemble mean and std over all score columns.
ensemble_results(results_df)

In [None]:
# Anomaly scores vs. KL-Score2: ensemble mean and std over all score columns.
ensemble_results(results_df2)

## Compare Cluster Label to anomaly scores (AS)

In [None]:
# hdbscan_df = hdbscan_df.fillna(-1)

In [None]:
# Anomaly scores (as scores) vs. cluster label (as label), without rows labelled -1.
# NOTE(review): rows are filtered on 'mean' only, whereas the KL comparison drops rows
# with NaN in any as_col column -- confirm a present 'mean' implies all scores are present.
# NOTE(review): get_metrics binarises the label at > 0, > 1, > 2 and == 4, so the AUCs
# here split on cluster numbers -- confirm the cluster numbering is meant to be ordinal.
results_df3 = evaluate_all_as(hdbscan_df[hdbscan_df['cluster_label']!=-1].dropna(subset=['mean']), as_col, label_name='cluster_label')

In [None]:
# Anomaly scores vs. cluster label: ensemble mean and std over all score columns.
ensemble_results(results_df3)

## Compare KL-Score to MRI

In [None]:
# Preview of the MRI table.
mri.head()

In [None]:
# NaN count per column after a left join with the MRI table. For the MRI columns this
# covers rows without an MRI match plus any values already missing in the MRI file.
hdbscan_df_dropna.merge(mri, on ='id', how='left', validate='one_to_one').isna().sum()

In [None]:
# Keep only rows graded in both KL columns that also have an MRI entry (inner join on id).
mri_hdbscan_kl = hdbscan_df_dropna.merge(mri, on ='id', how='inner', validate='one_to_one')
print(mri_hdbscan_kl.shape)

In [None]:
# KL-Score as the score, each MRI column in turn as the reference label.
# With a single score column the printed std is NaN (std of one value).
# NOTE(review): get_metrics binarises the label at > 0, > 1, > 2 and == 4; if the
# *_yn columns are 0/1 flags, only the first AUC is meaningful -- confirm.
for position, (heading, mri_label) in enumerate([
    ('Cartilage:', 'mri_cart_yn'),
    ('Osteophytes:', 'mri_osteo_yn'),
    ('Bone Marrow Lesions:', 'mri_bml_yn'),
]):
    if position > 0:
        print()  # blank line between two result groups
    results_kl_mri = evaluate_all_as(mri_hdbscan_kl, ['KL-Score'], label_name=mri_label)
    print(heading)
    ensemble_results(results_kl_mri)

In [None]:
# KL-Score2 as the score, each MRI column in turn as the reference label.
# With a single score column the printed std is NaN (std of one value).
for position, (heading, mri_label) in enumerate([
    ('Cartilage:', 'mri_cart_yn'),
    ('Osteophytes:', 'mri_osteo_yn'),
    ('Bone Marrow Lesions:', 'mri_bml_yn'),
]):
    if position > 0:
        print()  # blank line between two result groups
    results_kl_mri = evaluate_all_as(mri_hdbscan_kl, ['KL-Score2'], label_name=mri_label)
    print(heading)
    ensemble_results(results_kl_mri)

## Compare Cluster Label to MRI

In [None]:
# All clustered rows with an MRI entry (inner join on id). This starts from hdbscan_df,
# so rows without KL grades are included.
# NOTE(review): rows with cluster_label == -1 are not removed here, unlike in the
# cluster-vs-anomaly-score comparison -- confirm this is intended.
mri_hdbscan_kl2 = hdbscan_df.merge(mri, on ='id', how='inner', validate='one_to_one')

In [None]:
# Number of clustered rows that have an MRI entry.
print(mri_hdbscan_kl2.shape)

In [None]:
# Cluster label as the score, each MRI column in turn as the reference label.
# With a single score column the printed std is NaN (std of one value).
for position, (heading, mri_label) in enumerate([
    ('Cartilage:', 'mri_cart_yn'),
    ('Osteophytes:', 'mri_osteo_yn'),
    ('Bone Marrow Lesions:', 'mri_bml_yn'),
]):
    if position > 0:
        print()  # blank line between two result groups
    results_kl_mri = evaluate_all_as(mri_hdbscan_kl2, ['cluster_label'], label_name=mri_label)
    print(heading)
    ensemble_results(results_kl_mri)