In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from IPython.display import display
import json
import datetime
import math

from utils.data_exploration_utils import kruskal_wallis_analysis, barplots
from utils.hdbscan_utils import plot_hdbscan, plot_hdbscan_highlight_kl, make_cluster_color_map
from utils.plot_utils import plotly_hdbscan_highlight_kl

In [None]:
# Which SS-FewSome result set to load: STAGE selects the sub-folder under outputs/dfs,
# MOD_PREFIX and NEPOCH are substring filters applied to the file names found there.
STAGE = 'ss'
MOD_PREFIX = "mod_smallimg3"
NEPOCH = 'latest'


# Project locations, all taken from the local `config` module.
DATAPATH = config.OUTPUT_PATH
base_dir = config.RAW_DATA_PATH
img_path = config.SCHULTHESS_DATAPATH
proc_dir = config.PROC_DATA_PATH

# HDBSCAN pipeline run to analyse; its files are read from <proc_dir>/<folder>/pipeline/<run>.
folder = "2025-10-17_hdbscan"
run = "run17"

# Column of the per-experiment result CSVs that is used as the anomaly score.
anomalyscore_metric = "centre_mean"
# NOTE(review): defined here, but the cells below hard-code 'cluster_label' instead of using it.
cluster_col = "cluster_label"

## Load HDBSCAN Data

In [None]:
today = datetime.date.today()
folder_date = folder.split('_')[0]  # leading date part of the folder name

# All artefacts of the selected run live in one folder; figures go to its "img" sub-folder.
filepath = os.path.join(proc_dir, folder, "pipeline", run)
save_path = os.path.join(filepath, "img")
os.makedirs(save_path, exist_ok=True)

try:
    # Preferred export (presumably already carries the KL columns, judging by the "_wKL" suffix).
    hdbscan_df = pd.read_csv(os.path.join(filepath, f'pipeline_{run}_umap_hdbscan_scaled_allpoints_wKL.csv'))
except FileNotFoundError:
    # Fall back ONLY when that export does not exist. A bare `except:` would also hide
    # parse errors, typos and keyboard interrupts and silently load different data.
    hdbscan_df = pd.read_csv(os.path.join(filepath, f'pipeline_{run}_umap_hdbscan_scaled.csv'))
    kl = pd.read_csv(os.path.join(proc_dir, "2025-08-11_data_exploration", "inmodi_data_questionnaire_kl_woSC.csv"))
    # validate='one_to_one' makes the merge fail loudly if either key is duplicated.
    hdbscan_df = hdbscan_df.merge(kl, left_on='id', right_on='name', how='left', validate='one_to_one')

with open(os.path.join(filepath, f'pipeline_{run}_umap_hdbscan_scaled_model_info.json')) as f:
    model_info = json.load(f)

In [None]:
# Ids stored in the model-info JSON under files -> ids.
ids = model_info['files']['ids']

## Load Embeddings

In [None]:
# The embeddings are stored as a NumPy array file in the run folder.
embeddings_path = os.path.join(filepath, "X_umap_embeddings.npy")
X_umap = np.load(embeddings_path)


## Load MRI Data

In [None]:
# MRI table, read from the raw-data folder.
mri = pd.read_csv(os.path.join(base_dir, '2025-09-25_mrismall.csv'))

## Load SS-FewSome Results

In [None]:
outputs = os.path.join(DATAPATH, 'outputs', 'dfs', STAGE)

# One CSV per experiment. os.listdir() returns entries in arbitrary order, so sort the
# matches to keep the column order of the aggregated table stable between runs.
filepath2 = sorted(
    os.path.join(outputs, file)
    for file in os.listdir(outputs)
    if MOD_PREFIX in file and str(NEPOCH) in file and '_all' in file
)
if not filepath2:
    # Without this guard the cell dies further down with an uninformative IndexError.
    raise FileNotFoundError(
        f"No result files containing '{MOD_PREFIX}', '{NEPOCH}' and '_all' found in {outputs}"
    )

# Keep only the id and the chosen score per file; name each score column after its source file.
dfs = [
    pd.read_csv(path)[['id', anomalyscore_metric]].rename(
        columns={anomalyscore_metric: os.path.basename(path)}
    )
    for path in filepath2
]

# Inner joins: only ids that occur in every experiment file survive.
combined = dfs[0]
for score_df in dfs[1:]:
    combined = pd.merge(combined, score_df, on='id', how="inner")

# Row-wise mean / standard deviation across the experiment columns.
experiment_cols = [c for c in combined.columns if c != 'id']
combined["mean"] = combined[experiment_cols].mean(axis=1)
combined["std"] = combined[experiment_cols].std(axis=1)
combined.to_csv(os.path.join(outputs, f"{MOD_PREFIX}_{STAGE}_aggregated_scores.csv"), index=False)

# Keep the original id as 'filepath' and reduce 'id' to the last path component without '.png'.
combined['filepath'] = combined['id']
combined['id'] = combined['id'].apply(lambda x: x.split('/')[-1].replace('.png', ''))

In [None]:
# Row counts of the three source tables, in the order: MRI, anomaly scores, HDBSCAN.
for frame in (mri, combined, hdbscan_df):
    print(len(frame))

## Create Combined Data

In [None]:
# Sizes of the two tables that are merged in the next cell.
for n_rows, label in ((len(combined), "combined"), (len(hdbscan_df), "hdbscan")):
    print(n_rows, f"samples in {label} dataframe")

In [None]:
# Right join: every hdbscan_df row is kept; rows without a match in `combined` get NaN scores.
dfc = combined.merge(hdbscan_df, on='id', how = 'right')

In [None]:
# NOTE(review): this counts the merged table `dfc`, although the label says "combined dataframe".
print(len(dfc), "samples in combined dataframe")

In [None]:
# Rows of the merged table that have no mean anomaly score.
dfc[dfc['mean'].isna()]

In [None]:
# Left join on the MRI table: every MRI row is kept; rows without a match in `dfc`
# get NaN in the score and cluster columns.
dfc2 = mri.merge(dfc, on='id', how='left')

In [None]:
# Independent copy of the HDBSCAN table, used by the exploration cells below.
df = hdbscan_df.copy()

In [None]:
# Rows whose 'mri_cart_yn' value is missing.
print(dfc2[dfc2['mri_cart_yn'].isna()])

# Anomaly Score distribution

In [None]:
# comb2 = combined.iloc[:, :-3]

# Histogram (not a boxplot) of the per-sample mean anomaly score, 20 bins.
plt.figure(figsize=(6, 4))
plt.hist(combined['mean'], bins=20)
plt.title('Distribution of Mean Values')
plt.xlabel('Mean')
plt.ylabel('Frequency')
plt.show()


In [None]:
# One box per cluster label showing the spread of the mean anomaly score.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x='cluster_label', y='mean', data=dfc, ax=ax)
ax.set_title('Boxplot of Mean Anomaly Scores by Cluster Label')
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Mean Anomaly Score')
plt.show()

In [None]:
# Range of the aggregated (mean) anomaly score.
mean_scores = combined['mean']
print(f"Min. Anomaly Score: {mean_scores.min():.3f}")
print(f"Max. Anomaly Score: {mean_scores.max():.3f}")

In [None]:
# Summary statistics of the mean anomaly score for every cluster label found in `df`.
for cluster in df['cluster_label'].unique():
    cluster_data = dfc.loc[dfc['cluster_label'] == cluster, 'mean']
    summary = (
        f"n={len(cluster_data)}, mean={cluster_data.mean():.3f}, std={cluster_data.std():.3f}, "
        f"min={cluster_data.min():.3f}, max={cluster_data.max():.3f}"
    )
    print(f"Cluster {cluster}: " + summary)

# Test Grouping of AS

In [None]:
# Create Groups of Anomaly Scores
def assign_as_group(mean_score, low=0.3, high=0.6):
    """Bin a mean anomaly score into 'Low', 'Medium' or 'High'.

    Parameters
    ----------
    mean_score : float
        Score to classify. Missing values (NaN / None) are passed through as NaN.
    low : float, default 0.3
        Scores strictly below this value are 'Low'.
    high : float, default 0.6
        Scores from `low` up to (excluding) this value are 'Medium'; the rest are 'High'.

    Returns
    -------
    str or float
        'Low', 'Medium', 'High', or NaN when the score is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing score
    # would fall through to the final branch and be labelled 'High'.
    if pd.isna(mean_score):
        return np.nan
    if mean_score < low:
        return 'Low'
    if mean_score < high:
        return 'Medium'
    return 'High'

In [None]:
# Names of the MRI-related columns.
mri_columns = [
    'mri_operator',
    'mri_side',
    'mri_bml_yn',
    'mri_cart_yn',
    'mri_osteo_yn',
    'mri_syn_yn',
    'mri_mnsc_yn',
    'mri_lig_yn',
]


In [None]:
# dfc2['AS_Group'] = dfc2['mean'].apply(assign_as_group)

# display(dfc2['AS_Group'].value_counts())

In [None]:
# dfc2_nonan = dfc2.dropna(subset=['AS_Group', 'cluster_label'])
# pd.crosstab(dfc2_nonan['AS_Group'], dfc2_nonan['cluster_label'], normalize='columns')

# Some Exploration

In [None]:
# Number of samples per cluster label, ordered by label.
df['cluster_label'].value_counts().reset_index().sort_values('cluster_label')

In [None]:
# Cluster sizes as a bar chart, ordered by cluster label.
values = (
    df['cluster_label']
    .value_counts()
    .reset_index()
    .sort_values(by='cluster_label')
)

fig, ax = plt.subplots()
ax.bar(values['cluster_label'], values['count'], color='skyblue')
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Count')
plt.show()

### Cluster Label vs KL-Score

In [None]:
# scatterplot
color_map = make_cluster_color_map(df['KL-Score'].unique())
plt.figure(figsize=(20, 10))
sns.catplot(data = df, x='cluster_label', y='probability', hue='KL-Score', palette=color_map, jitter = 0.3)
plt.show()

In [None]:
# One panel per cluster: distribution of the membership probability for each KL-Score.
# Despite the name, `kls` holds cluster labels here, not KL scores.
kls = sorted(df['cluster_label'].unique())
kls = [kl for kl in kls if kl != -1]  # exclude noise

# Two panels per row; as many rows as needed for all clusters.
ncols = 2
nrows = math.ceil(len(kls)/ncols)
fig, ax = plt.subplots(nrows, ncols, figsize=(20, 4*nrows), sharey=True)
ax = np.ravel(ax)  # flatten to 1D

for idx, kl in enumerate(kls):
    # NOTE(review): color_map was built from the KL-Score values but is indexed with a
    # cluster label here — confirm that every cluster label is a valid key of color_map.
    sns.boxplot(
        data=df[df['cluster_label'] == kl],
        x='KL-Score', y='probability',
        ax=ax[idx], color=color_map[kl]
    )
    ax[idx].set_title(f"cluster_label = {kl}")

# hide any unused axes
for j in range(len(kls), len(ax)):
    ax[j].set_visible(False)

plt.tight_layout()
plt.savefig(os.path.join(save_path, f'{folder}_{run}_probability_cluster_klscore_v2_rawq.png'))
plt.show()

In [None]:
# Sample counts per cluster label, with one dodged bar per KL-Score.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='cluster_label', hue='KL-Score', multiple='dodge', palette=color_map, ax=ax)
plt.show()

## Cluster Label vs MRI Data

In [None]:
mri_cols = ['mri_cart_yn', 'mri_osteo_yn', 'mri_bml_yn']

for mri_col in mri_cols:
    # Drop missing findings before enumerating the values: NaN never compares equal to
    # itself, so a NaN entry would yield an empty panel (and makes sorted() unreliable).
    kls = sorted(dfc2[mri_col].dropna().unique())
    if not kls:
        # plt.subplots cannot build a grid with zero rows.
        print(f"No non-missing values for {mri_col}; skipping plots")
        continue

    # Two panels per row, one panel per value of the MRI finding.
    ncols = 2
    nrows = math.ceil(len(kls)/ncols)
    fig, ax = plt.subplots(nrows, ncols, figsize=(20, 4*nrows), sharey=True)
    ax = np.ravel(ax)  # flatten to 1D

    for idx, kl in enumerate(kls):
        # NOTE(review): color_map is built from the KL-Score values elsewhere in this
        # notebook; confirm that the MRI values are valid keys of it.
        sns.boxplot(
            data=dfc2[dfc2[mri_col] == kl],
            x='cluster_label', y='probability',
            ax=ax[idx], color=color_map[kl]
        )
        ax[idx].set_title(f"{mri_col} = {kl}")

    # hide any unused axes
    for j in range(len(kls), len(ax)):
        ax[j].set_visible(False)

    fig.tight_layout()
    fig.savefig(os.path.join(save_path, f'{folder}_{run}_probability_cluster_{mri_col}_rawq.png'))
    plt.show()
    plt.close(fig)  # figures created in a loop otherwise stay open in memory

    # Sample counts per cluster, with one dodged bar per value of the MRI finding.
    fig_hist, ax_hist = plt.subplots(figsize=(10, 6))
    sns.histplot(data=dfc2, x='cluster_label', hue=mri_col, multiple='dodge', palette=color_map, ax=ax_hist)
    fig_hist.savefig(os.path.join(save_path, f'{folder}_{run}_histogram_cluster_{mri_col}_rawq.png'))
    plt.show()
    plt.close(fig_hist)

# Correlation

In [None]:
from scipy.stats import kruskal

def kruskal_wallis(df, feature, cluster_col = 'cluster_label'):
    """Run a Kruskal-Wallis H-test of `feature` across the groups of `cluster_col`.

    Rows with a missing feature value or a missing cluster label are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both columns.
    feature : str
        Name of the column whose values are compared between the groups.
    cluster_col : str, default 'cluster_label'
        Name of the grouping column.

    Returns
    -------
    tuple
        (H statistic, p-value) as returned by scipy.stats.kruskal.

    Raises
    ------
    ValueError
        Raised by scipy.stats.kruskal when fewer than two groups remain.
    """
    # A NaN cluster label never equals itself, so selecting rows with `== label` built an
    # empty group for it, which invalidates the test. groupby only yields real groups.
    valid = df.dropna(subset=[feature, cluster_col])
    groups = [values for _, values in valid.groupby(cluster_col)[feature]]
    stat, p = kruskal(*groups)
    return stat, p

## Correlation with KL-Score and Pain

### Kruskal-Wallis

In [None]:
# Features whose distribution is compared between clusters.
columns_corr = ['pain', 'age', 'ce_bmi', 'ce_fm']

# Per feature and cluster: missing-value count plus size and range of the non-missing values.
for feature in columns_corr:
    for c in dfc2["cluster_label"].unique():
        in_cluster = dfc2["cluster_label"] == c
        vals = dfc2.loc[in_cluster, feature].dropna()
        n_missing = dfc2[feature].isna().sum()
        print(f"NaN values: {n_missing}")
        print(f"For Feature {feature}")
        print(f"Cluster {c}: n={len(vals)}, unique={vals.nunique()}, min={vals.min()}, max={vals.max()}")
    print()


In [None]:
# Kruskal-Wallis test of every feature in columns_corr against the cluster labels.
results = []
for feature in columns_corr:
    # Drop rows lacking either the feature or the cluster label: dfc2 comes from a left
    # join, so cluster labels can be missing, and the test must not see them as a group.
    dfc2_wonan = dfc2.dropna(subset=[feature, 'cluster_label'])
    stat, p = kruskal_wallis(dfc2_wonan, feature, cluster_col = 'cluster_label')
    results.append({'feature': feature, 'H-statistic': stat, 'p-value': p})

results_df = pd.DataFrame(results)

display(results_df.sort_values('p-value').head())
results_df.to_csv(os.path.join(filepath, f"kruskal_wallis_results_{run}.csv"), index=False)

# Features with p >= 0.05. A bare expression in the middle of a cell is never rendered,
# so it has to be displayed explicitly.
display(results_df[results_df['p-value'] >= 0.05])

plt.figure(figsize=(10, 6))
sns.barplot(data = results_df, x='feature', y='H-statistic')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Detailed Kruskal-Wallis analysis per feature, on the rows where that feature is present.
for feature in columns_corr:
    dfc_wonan = dfc2.dropna(subset=[feature])
    print(f"Kruskal-Wallis analysis for feature: {feature}")
    kruskal_wallis_analysis(dfc_wonan, feature, cluster_col='cluster_label')

In [None]:
mri_col = ['mri_cart_yn', 'mri_osteo_yn', 'mri_bml_yn']

# Kruskal-Wallis test of every MRI finding against the cluster labels.
results = []
for feature in mri_col:
    dfc2_wonan = dfc2.dropna(subset=[feature, 'cluster_label'])
    stat, p = kruskal_wallis(dfc2_wonan, feature, cluster_col = 'cluster_label')
    results.append({'feature': feature, 'H-statistic': stat, 'p-value': p})

results_df = pd.DataFrame(results)

display(results_df.sort_values('p-value').head())
# Separate file name: "kruskal_wallis_results_{run}.csv" is already written by the
# pain/age/BMI cell, and reusing it here silently overwrote those results.
results_df.to_csv(os.path.join(filepath, f"kruskal_wallis_results_mri_{run}.csv"), index=False)

# Features with p >= 0.05. A bare expression in the middle of a cell is never rendered,
# so it has to be displayed explicitly.
display(results_df[results_df['p-value'] >= 0.05])

plt.figure(figsize=(10, 6))
sns.barplot(data = results_df, x='feature', y='H-statistic')
plt.xticks(rotation=90)
plt.show()

# Detailed analysis per MRI finding, on rows where both finding and cluster label exist.
for col in mri_col:
    dfc_wonan = dfc2.dropna(subset=[col, 'cluster_label'])
    print(len(dfc_wonan), "samples after dropping NaNs for", col)
    kruskal_wallis_analysis(dfc_wonan, col, cluster_col='cluster_label')

### Plots

In [None]:
# Use a dedicated name: rebinding `columns_corr` here changed which features the
# Kruskal-Wallis cells above analyse whenever they were re-run after this cell.
mri_barplot_cols = ['mri_cart_yn', 'mri_osteo_yn', 'mri_bml_yn']
barplots(dfc2, y_list=mri_barplot_cols, x='cluster_label', hue=None, figsize = (6, 6), savepath=save_path)

# Majority Vote

In [None]:
def majority_vote(df, cluster_col, feature_col):
    """Collect, per cluster, the most frequent value(s) of each feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the cluster column and all feature columns.
    cluster_col : str
        Name of the column that identifies the cluster.
    feature_col : list of str
        Feature columns to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per cluster value (sorted) with a column 'MV_<feature>' per feature.
        Each cell is the list of modes of that feature in the cluster (more than one
        entry means a tie). Rows consisting only of missing values are dropped.
    """
    cluster_values = df[cluster_col].unique()
    cluster_values.sort()
    summary = pd.DataFrame({cluster_col: cluster_values})

    grouped = df.groupby(cluster_col)
    for feature in feature_col:
        modes = (
            pd.DataFrame(grouped[feature].agg(lambda values: list(values.mode())))
            .reset_index()
            .rename(columns={feature: f'MV_{feature}'})
        )
        summary = summary.merge(modes, on=cluster_col, how='left')

    return summary.dropna(axis=0, how='all')

def handle_modes(x, id_):
    """Collapse the mode list of one cluster into a single float.

    Parameters
    ----------
    x : list-like or scalar
        Modes of a feature within a cluster. A scalar is accepted as well, so a value
        that was already collapsed (e.g. when the calling cell is run a second time)
        passes through instead of raising a TypeError on len().
    id_ : hashable
        Identifier used only in the printed diagnostics.

    Returns
    -------
    float or None
        The single mode as float; None when there is no mode, a tie, or the value
        cannot be converted to float.
    """
    if isinstance(x, (list, tuple, np.ndarray, pd.Series)):
        modes = list(x)
    else:
        # Scalar input: either missing or already a single resolved value.
        modes = [] if x is None or pd.isna(x) else [x]

    if not modes:
        # An empty list means "no mode at all", which is not a tie.
        print(f"No mode available for id={id_}")
        return None
    if len(modes) > 1:
        print(f"Tie detected for id={id_}: {x}")
        return None
    try:
        return float(modes[0])
    except (TypeError, ValueError) as e:
        print(f"Conversion error for id={id_}: {e}")
        return None
    
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score
def get_metrics(df, feature):
    """Score the majority-vote prediction of `feature` against its true values.

    Prints macro-averaged precision, recall and F1, shows the confusion matrix as a
    heatmap and saves it as '<feature>_confusionmatrix.png' in `filepath`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the true labels in column `feature` and the predictions in
        column 'MV_<feature>'.
    feature : str
        Name of the feature to evaluate.

    Returns
    -------
    tuple of float
        (precision, recall, f1), or three NaN values when no row has both a true
        label and a prediction.
    """
    pred_col = f'MV_{feature}'
    # sklearn rejects missing labels; predictions are missing for clusters whose vote was
    # a tie, and the true label itself can be missing. Evaluate only complete pairs.
    valid = df.dropna(subset=[feature, pred_col])
    n_dropped = len(df) - len(valid)
    y_true = valid[feature]
    y_pred = valid[pred_col]

    print(f'For Feature {feature}:')
    if n_dropped:
        print(f"Ignoring {n_dropped} rows with a missing label or majority vote")
    if valid.empty:
        print("No rows left to evaluate")
        return np.nan, np.nan, np.nan

    precision = precision_score(y_true, y_pred, average='macro')
    print("Precision:", precision)
    recall = recall_score(y_true, y_pred, average='macro')
    print("Recall:", recall)
    f1 = f1_score(y_true, y_pred, average='macro')
    print("F1 Score:", f1)

    # Fix the label order explicitly so the heatmap ticks match the matrix rows/columns.
    labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print("Confusion Matrix:\n")
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.savefig(os.path.join(filepath, f'{feature}_confusionmatrix.png'))
    plt.show()

    return precision, recall, f1

def get_metrics_percluster(df, feature, cluster_col = 'cluster_label', normalized=False):
    """Show one confusion-matrix heatmap per cluster for the majority vote of `feature`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the true labels in column `feature`, the predictions in column
        'MV_<feature>' and the cluster assignment in `cluster_col`.
    feature : str
        Name of the feature to evaluate.
    cluster_col : str, default 'cluster_label'
        Name of the cluster column.
    normalized : bool, default False
        If True, every row of the matrix is normalised over the true labels.

    Returns
    -------
    None
        The function only prints and plots.
    """
    print(f'For Feature {feature}:')
    pred_col = f'MV_{feature}'
    # confusion_matrix rejects missing labels (tied votes, missing findings): use complete rows.
    valid = df.dropna(subset=[cluster_col, feature, pred_col])

    for c, cluster_rows in valid.groupby(cluster_col):
        n_samples = len(cluster_rows)
        if n_samples < 2:
            print(f'Skipping cluster {c}: only {n_samples} samples')
            continue
        y_true = cluster_rows[feature]
        y_pred = cluster_rows[pred_col]

        # Pass the label order explicitly so the tick labels are guaranteed to match
        # the rows/columns of the matrix.
        labels = sorted(set(y_true) | set(y_pred))
        cm = confusion_matrix(
            y_true,
            y_pred,
            labels=labels,
            normalize='true' if normalized else None
        )

        plt.figure(figsize=(5, 4))
        sns.heatmap(
            cm,
            annot=True,
            fmt='.2f' if normalized else 'd',
            cmap='YlGnBu',
            cbar=False,
            xticklabels=labels,
            yticklabels=labels
        )
        # The dash in the title was mis-encoded ("â€“") in the original string.
        plt.title(f'Confusion Matrix – Cluster {c} ({n_samples} samples)')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        plt.show()

## Plot Distribution per cluster first

In [None]:
feature_col = ['mri_bml_yn', 'mri_cart_yn', 'mri_osteo_yn', 'mri_syn_yn', 'mri_mnsc_yn', 'mri_lig_yn']

for feature in feature_col:
    # Rows = cluster labels, columns = feature values, cells = number of samples.
    counts = dfc2.groupby(['cluster_label', feature]).size().unstack(fill_value=0)

    # DataFrame.plot opens a new figure unless it is given an axis, so a preceding
    # plt.figure(...) only left a blank figure behind and its size was not applied.
    fig, ax = plt.subplots(figsize=(10, 6))
    counts.plot(kind='bar', stacked=False, ax=ax)
    ax.set_title(f'Distribution of {feature} across clusters')
    ax.set_xlabel('Cluster Label')
    ax.set_ylabel('Count')
    plt.show()
    plt.close(fig)  # figures created in a loop otherwise stay open in memory

## Majority Vote official

In [None]:
# Per cluster: the list of modes of every feature in feature_col.
maj_vote = majority_vote(dfc2, 'cluster_label', feature_col)

In [None]:
# Replace every list of modes by a single value (None for ties), column by column.
for feature in feature_col:
    col_name = f'MV_{feature}'
    maj_vote[col_name] = [
        handle_modes(modes, cluster)
        for modes, cluster in zip(maj_vote[col_name], maj_vote['cluster_label'])
    ]

In [None]:
# Show the majority-vote table.
display(maj_vote)

In [None]:
# Persist the majority-vote table in the run folder.
maj_vote.to_csv(os.path.join(filepath, 'maj_vote_eval.csv'), index=False)

## Calculate Precision, Recall, F1-Score etc.

In [None]:
# Attach the per-cluster majority votes to every sample and keep only clustered samples.
dfc3 = (
    dfc2
    .merge(maj_vote, how='left', on='cluster_label')
    .dropna(subset=['cluster_label'])
)

In [None]:
# 'KL-Score' must not be evaluated as a majority-vote feature. Check for membership
# instead of catching the exception: the old try/except printed a spurious
# "x not in list" error whenever the entry was absent.
if 'KL-Score' in feature_col:
    feature_col.remove('KL-Score')
   

In [None]:
# Collect one record per feature and build the table once: concatenating inside the
# loop copies the frame on every iteration and leaves every row with index 0.
metric_rows = []
for feature in feature_col:
    precision, recall, f1 = get_metrics(dfc3, feature)
    metric_rows.append({'feature': feature,
                        'precision': precision,
                        'recall': recall,
                        'f1_score': f1})

metrics = pd.DataFrame(metric_rows, columns=['feature', 'precision', 'recall', 'f1_score'])

In [None]:
# Per-cluster confusion matrices (raw counts) for every evaluated feature.
for feature in feature_col:
    get_metrics_percluster(dfc3, feature, normalized=False)