In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from IPython.display import display
import json
import datetime
import math

from utils.data_exploration_utils import plot_hist, scatterplot, barplots, check_img_resp_cluster_klscore, boxplot
from utils.hdbscan_utils import plot_hdbscan, plot_hdbscan_highlight_kl, make_cluster_color_map

In [None]:
# Run selection and data loading.
# Example of a run directory: data/processed/2025-08-11_hdbscan/run150
# Previously inspected runs (set `folder` / `run` below to switch):
#   folder = "2025-08-12_hdbscan", run = "run110" or "run69"
#   folder = "2025-08-11_hdbscan", run = "run22"

today = datetime.date.today()

proc_dir = config.PROC_DATA_PATH

folder = "2025-09-13_hdbscan_img"
run = "run32"

# The part of the folder name before the first underscore is the run date.
folder_date = folder.split('_')[0]
date = folder_date  # alias kept because both names were defined originally

img_path = config.SCHULTHESS_DATAPATH
# img_path = os.path.join(img_path, "600x600_imgs")

# All outputs of this notebook go below the selected run directory.
filepath = os.path.join(proc_dir, "radiographic_features", folder, run)
save_path = os.path.join(filepath, "img")
os.makedirs(save_path, exist_ok=True)

df = pd.read_csv(os.path.join(filepath, f'{folder}_{run}_umap_hdbscan_scaled.csv'))

# Questionnaire / KL-Score table:
# data/processed/2025-08-11_data_exploration/inmodi_data_questionnaire_kl_woSC.csv
# The path gets its own name so that `kl` only ever refers to the DataFrame.
kl_path = os.path.join(proc_dir, "2025-08-11_data_exploration", "inmodi_data_questionnaire_kl_woSC.csv")
kl = pd.read_csv(kl_path)

print(f"DF Shape before merge: {df.shape}")

# The inner merge silently drops clustered rows whose `id` has no matching
# `name` in the questionnaire table, so report how many rows that affects.
n_unmatched = int((~df['id'].isin(kl['name'])).sum())
if n_unmatched:
    print(f"Warning: {n_unmatched} rows have no match in the questionnaire table and are dropped by the merge.")

df = df.merge(kl, left_on='id', right_on='name', how='inner')
print(f"DF Shape after merge: {df.shape}")
display(df.head())

# Some Exploration

In [None]:
# Number of merged rows per KL-Score value (all clusters, noise included).
kl_score_counts = df['KL-Score'].value_counts()
kl_score_counts

In [None]:
# Number of merged rows per cluster label.
cluster_label_counts = df['cluster_label'].value_counts()
cluster_label_counts

In [None]:
# Label -1 marks noise points; keep only rows assigned to a real cluster.
is_not_noise = df['cluster_label'] != -1
df_filtered = df.loc[is_not_noise]

In [None]:
# Cluster sizes as a table, ordered by ascending cluster label.
cluster_counts_table = df['cluster_label'].value_counts().reset_index()
cluster_counts_table.sort_values(by='cluster_label', ascending=True)

In [None]:
# Cluster sizes (noise label -1 included), ordered by cluster label.
# The column names are set explicitly because the names produced by
# value_counts().reset_index() depend on the installed pandas version.
values = (
    df['cluster_label']
    .value_counts()
    .sort_index()
    .rename_axis('cluster_label')
    .reset_index(name='count')
)

fig, ax = plt.subplots()
ax.bar(values['cluster_label'], values['count'], color='skyblue')
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Count')
ax.set_title('Number of samples per cluster')
fig.savefig(os.path.join(save_path, f'{folder}_{run}_distribution_cluster_Xray.png'))
plt.show()

# Plot

In [None]:
# Load the embeddings archive (.npz) stored in this run's directory.
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code - only load archives from a trusted source.
embeddings_file = os.path.join(filepath, 'embeddings.npz')
embeddings = np.load(embeddings_file, allow_pickle=True)

In [None]:
# Single-column frame holding the `names` array stored in the embeddings archive.
embedding_names = embeddings['names']
ids = pd.DataFrame(embedding_names, columns=['id'])

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index column written by an
# earlier to_csv - TODO confirm).
# errors='ignore' together with reassignment keeps this cell safe to re-run;
# the previous in-place drop raised KeyError on a second execution.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Working copy without noise points (cluster_label == -1), followed by the
# remaining cluster sizes ordered by label.
is_clustered = df['cluster_label'] != -1
df2 = df[is_clustered].copy()
clustered_sizes = df2['cluster_label'].value_counts().reset_index()
display(clustered_sizes.sort_values('cluster_label'))

In [None]:
# Bar chart of cluster sizes, ordered by cluster label, saved into the run's img folder.
cluster_size_table = df['cluster_label'].value_counts().reset_index()
values = cluster_size_table.sort_values(by='cluster_label')

fig, ax = plt.subplots()
ax.bar(values['cluster_label'], values['count'], color='skyblue')
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Count')
fig.savefig(os.path.join(save_path, f'{folder}_{run}_distribution_cluster_Xray.png'))
plt.show()

### Cluster Label vs KL-Score

In [None]:
# Strip plot of `probability` per cluster, coloured by KL-Score.
# `color_map` (keyed by KL-Score value) is reused by the following cells.
color_map = make_cluster_color_map(df2['KL-Score'].unique())

# sns.catplot is figure-level: it always creates its own figure, so a preceding
# plt.figure(figsize=...) only produced an extra empty figure. The intended
# 20x10 size is requested through height/aspect instead (width = height * aspect).
sns.catplot(
    data=df2,
    x='cluster_label',
    y='probability',
    hue='KL-Score',
    palette=color_map,
    jitter=0.3,
    height=10,
    aspect=2,
)
plt.show()

In [None]:
# One subplot per KL-Score value: box plot of `probability` for each cluster.
# NaN scores are dropped first because they cannot be sorted or matched with ==.
kls = sorted(df2['KL-Score'].dropna().unique())
kls = [score for score in kls if score != -1]  # skip a -1 placeholder value if one is present

ncols = 1
nrows = math.ceil(len(kls)/ncols)
fig, ax = plt.subplots(nrows, ncols, figsize=(20, 6*nrows), sharey=True)
ax = np.ravel(ax)  # flatten to 1D so a single subplot can be indexed as well

# The loop variable is deliberately not called `kl`: that name holds the
# questionnaire DataFrame loaded at the top of the notebook.
for idx, kl_score in enumerate(kls):
    sns.boxplot(
        data=df2[df2['KL-Score'] == kl_score],
        x='cluster_label', y='probability',
        ax=ax[idx], color=color_map[kl_score]
    )
    ax[idx].set_title(f"KL-Score = {kl_score}")

# hide any unused axes
for j in range(len(kls), len(ax)):
    ax[j].set_visible(False)

plt.tight_layout()
plt.savefig(os.path.join(save_path, f'{folder}_{run}_probability_cluster_klscore_Xray.png'))
plt.show()

In [None]:
# One subplot per cluster: box plot of `probability` for each KL-Score value.
kls = sorted(df2['cluster_label'].unique())
kls = [label for label in kls if label != -1]  # exclude noise

# `color_map` is keyed by KL-Score values, so indexing it with a cluster label
# either picks an unrelated colour or raises KeyError. Build a map keyed by
# cluster label instead.
cluster_color_map = make_cluster_color_map(df2['cluster_label'].unique())

ncols = 1
nrows = math.ceil(len(kls)/ncols)
fig, ax = plt.subplots(nrows, ncols, figsize=(20, 6*nrows), sharey=True)
ax = np.ravel(ax)  # flatten to 1D so a single subplot can be indexed as well

# The loop variable is deliberately not called `kl`: that name holds the
# questionnaire DataFrame loaded at the top of the notebook.
for idx, label in enumerate(kls):
    sns.boxplot(
        data=df2[df2['cluster_label'] == label],
        x='KL-Score', y='probability',
        ax=ax[idx], color=cluster_color_map[label]
    )
    ax[idx].set_title(f"cluster_label = {label}")

# hide any unused axes
for j in range(len(kls), len(ax)):
    ax[j].set_visible(False)

plt.tight_layout()
plt.savefig(os.path.join(save_path, f'{folder}_{run}_probability_cluster_klscore_v2_Xray.png'))
plt.show()

In [None]:
# Cluster sizes split by KL-Score, bars drawn side by side.
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(
    data=df,
    x='cluster_label',
    hue='KL-Score',
    multiple='dodge',
    palette=color_map,
    ax=ax,
)
plt.show()

### Cluster Label vs KL-Score vs Probability

In [None]:
# Pairwise scatter plots of cluster label, probability and KL-Score.
pair_columns = ['cluster_label', 'probability', 'KL-Score']
sns.pairplot(df.loc[:, pair_columns], kind="scatter")

## Other patterns

In [None]:
# Pairwise scatter plots of the listed variables, coloured by cluster.
base_col = ['cluster_label', 'KL-Score']
cols = ['pain', 'age', 'ce_bmi', 'ce_fm', 'gender']

# The hue is the cluster label, so the palette has to be keyed by cluster
# label; `color_map` from the earlier cell is keyed by KL-Score values.
cluster_palette = make_cluster_color_map(df2['cluster_label'].unique())

# sns.pairplot is figure-level and creates its own figure, so no
# plt.figure(...) call precedes it (that only produced an extra empty figure).
sns.pairplot(df2[base_col + cols], hue='cluster_label', palette=cluster_palette, kind="scatter")
plt.savefig(os.path.join(save_path, f'{folder}_{run}_pairplot_Xray.png'))
plt.show()

In [None]:
# sns.histplot(data=df2, x='KL-Score', y='pain', hue='cluster_label', palette=color_map, element='step')

# Correlation

## Correlation with Questionnaire Scores

### Kruskal-Wallis (non-parametric ANOVA)

Tests whether the distribution of a feature differs significantly across clusters. It only tests for the presence of a difference; it does not measure effect size.

$H_0$: All samples have the same central tendency, i.e. they originate from the same distribution.

$H_1$: At least one sample does not have the same central tendency, i.e. at least one sample stochastically dominates another sample.

In [None]:
from scipy.stats import kruskal

def kruskal_wallis(df, feature, cluster_col = 'cluster_label'):
    """Run a Kruskal-Wallis H-test of `feature` across the groups in `cluster_col`.

    Missing values of `feature` are dropped within each group, and groups left
    without observations are skipped, so a few NaNs no longer turn the whole
    result into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing `feature` and `cluster_col`.
    feature : str
        Column whose distribution is compared between groups.
    cluster_col : str, default 'cluster_label'
        Column that defines the groups.

    Returns
    -------
    (stat, p) : tuple of float
        H statistic and p-value. Both are NaN when fewer than two non-empty
        groups remain, since the test is undefined in that case.
    """
    groups = []
    for _, group_values in df.groupby(cluster_col)[feature]:
        observed = group_values.dropna()
        if len(observed) > 0:
            groups.append(observed)

    # scipy's kruskal needs at least two samples to compare.
    if len(groups) < 2:
        return float('nan'), float('nan')

    stat, p = kruskal(*groups)
    return stat, p

In [None]:
# Kruskal-Wallis test of every questionnaire item across clusters.
columns_corr = [
       'oks_q1', 'oks_q2', 'oks_q3', 'oks_q4',
       'oks_q5', 'oks_q6', 'oks_q7', 'oks_q8', 'oks_q9', 'oks_q10', 'oks_q11',
       'oks_q12', 'koos_s1',
       'koos_s2', 'koos_s3', 'koos_s4', 'koos_s5', 'koos_s6',
       'koos_s7', 'koos_p1', 'koos_p2', 'koos_p3', 'koos_p4', 'koos_p5',
       'koos_p6', 'koos_p7', 'koos_p8', 'koos_p9', 'koos_a1', 'koos_a2',
       'koos_a3', 'koos_a4', 'koos_a5', 'koos_a6', 'koos_a7', 'koos_a8',
       'koos_a9', 'koos_a10', 'koos_a11', 'koos_a12', 'koos_a13', 'koos_a14',
       'koos_a15', 'koos_a16', 'koos_a17',  'koos_sp1', 'koos_sp2', 'koos_sp3',
       'koos_sp4', 'koos_sp5',
       'koos_q1', 'koos_q2', 'koos_q3', 'koos_q4'
       #, 'cluster_label'
       ]

# NOTE(review): one test per item with no multiple-comparison correction -
# confirm whether the p-values should be adjusted before interpreting them.
results = []
for feature in columns_corr:
    # Drop rows with a missing answer first (as the pain/age/BMI cell does);
    # otherwise a single NaN makes the test result NaN.
    df_wonan = df.dropna(subset=[feature])
    stat, p = kruskal_wallis(df_wonan, feature, cluster_col = 'cluster_label')
    results.append({'feature': feature, 'H-statistic': stat, 'p-value': p})

results_df = pd.DataFrame(results)

display(results_df.sort_values('p-value').head())
results_df.to_csv(os.path.join(filepath, f"kruskal_wallis_results_{run}.csv"), index=False)

# Items without a significant difference between clusters at the 5% level.
# This filter used to be a bare expression in the middle of the cell, so its
# result was never shown.
display(results_df[results_df['p-value'] >= 0.05])

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data = results_df, x='feature', y='H-statistic', ax=ax)
ax.set_title('Kruskal-Wallis H-statistic per questionnaire item')
ax.tick_params(axis='x', rotation=90)
plt.show()

## Correlation with KL-Score and Pain

In [None]:
# Sanity check before testing: missing values per feature, and the sample
# size / value range of each feature within every cluster.
for feature in ['pain', 'age', 'ce_bmi', 'ce_fm']:
    # Both lines describe the feature as a whole, so they are printed once per
    # feature instead of once per cluster.
    print(f"For Feature {feature}")
    print(f"NaN values: {df[feature].isna().sum()}")
    for c in sorted(df["cluster_label"].unique()):
        vals = df.loc[df["cluster_label"]==c, feature].dropna()
        print(f"Cluster {c}: n={len(vals)}, unique={vals.nunique()}, min={vals.min()}, max={vals.max()}")
    print()

In [None]:
# Kruskal-Wallis test of pain, age, ce_bmi and ce_fm across clusters.
columns_corr = [
    'pain', 'age',
    'ce_bmi', 'ce_fm'
]

results = []
for feature in columns_corr:
    # Rows with a missing value for this feature are excluded from its test.
    df_wonan = df.dropna(subset=[feature])
    stat, p = kruskal_wallis(df_wonan, feature, cluster_col = 'cluster_label')
    results.append({'feature': feature, 'H-statistic': stat, 'p-value': p})

results_df = pd.DataFrame(results)

display(results_df.sort_values('p-value').head())
# Written to its own file: the previous name, kruskal_wallis_results_{run}.csv,
# is also used for the questionnaire-item results and was overwritten here.
results_df.to_csv(os.path.join(filepath, f"kruskal_wallis_results_clinical_{run}.csv"), index=False)

# Features without a significant difference between clusters at the 5% level.
# This filter used to be a bare expression in the middle of the cell, so its
# result was never shown.
display(results_df[results_df['p-value'] >= 0.05])

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data = results_df, x='feature', y='H-statistic', ax=ax)
ax.set_title('Kruskal-Wallis H-statistic per feature')
ax.tick_params(axis='x', rotation=90)
plt.show()

### Plots

In [None]:
# Bar plots of pain and KL-Score per cluster (noise-free frame), written to save_path.
columns_corr = ['pain', 'KL-Score']
barplots(
    df2,
    y_list=columns_corr,
    x='cluster_label',
    hue=None,
    figsize=(6, 6),
    savepath=save_path,
)

#### Boxplot

In [None]:
# Box plots per cluster for every questionnaire item plus pain, KL-Score, age,
# ce_bmi and ce_fm. The column list is assembled from one sub-list per item prefix.
oks_items = [
    'oks_q1', 'oks_q2', 'oks_q3', 'oks_q4', 'oks_q5', 'oks_q6',
    'oks_q7', 'oks_q8', 'oks_q9', 'oks_q10', 'oks_q11', 'oks_q12',
]
koos_s_items = ['koos_s1', 'koos_s2', 'koos_s3', 'koos_s4', 'koos_s5', 'koos_s6', 'koos_s7']
koos_p_items = [
    'koos_p1', 'koos_p2', 'koos_p3', 'koos_p4', 'koos_p5',
    'koos_p6', 'koos_p7', 'koos_p8', 'koos_p9',
]
koos_a_items = [
    'koos_a1', 'koos_a2', 'koos_a3', 'koos_a4', 'koos_a5', 'koos_a6',
    'koos_a7', 'koos_a8', 'koos_a9', 'koos_a10', 'koos_a11', 'koos_a12',
    'koos_a13', 'koos_a14', 'koos_a15', 'koos_a16', 'koos_a17',
]
koos_sp_items = ['koos_sp1', 'koos_sp2', 'koos_sp3', 'koos_sp4', 'koos_sp5']
koos_q_items = ['koos_q1', 'koos_q2', 'koos_q3', 'koos_q4']
other_items = ['pain', 'KL-Score', 'age', 'ce_bmi', 'ce_fm']

columns_corr = (
    oks_items
    + koos_s_items
    + koos_p_items
    + koos_a_items
    + koos_sp_items
    + koos_q_items
    + other_items
)

boxplot(
    df,
    y_list=columns_corr,
    n_cols=2,
    x='cluster_label',
    hue=None,
    savepath=save_path,
)

# KL-Score Visualization

In [None]:
# For every cluster label (ascending), show how many rows fall into each KL-Score.
labels = sorted(df['cluster_label'].unique())

for i in labels:
    df_temp = df[df['cluster_label'] == i].copy()
    kl_counts = df_temp['KL-Score'].value_counts().reset_index()
    print(f"For label {i}:")
    display(kl_counts.sort_values(by="KL-Score"))
    print()

In [None]:
# `k` is passed as the `k=` argument to check_img_resp_cluster_klscore in the
# cells below; presumably the number of examples shown per call - TODO confirm.
k=2

## Cluster 0

### KL-Score 0

In [None]:
# Inspect cluster 0 / KL-Score 0 with the project helper; the return value is
# bound to `_` so that it is not echoed as cell output.
_ = check_img_resp_cluster_klscore(
    df,
    cluster_label=0,
    klscore=0,
    img_path=img_path,
    k=k,
)

### KL-Score 4

In [None]:
# Inspect cluster 0 / KL-Score 4 with the project helper; the return value is
# bound to `_` so that it is not echoed as cell output.
_ = check_img_resp_cluster_klscore(
    df,
    cluster_label=0,
    klscore=4,
    img_path=img_path,
    k=k,
)

## Cluster 3

### KL-Score 0

In [None]:
# Inspect cluster 3 / KL-Score 0. The first attempt passes the notebook-level
# `k`; if it raises, the error is printed and the call is retried once without
# `k` (i.e. with the helper's default). A second failure is printed as well.
for extra_kwargs in ({'k': k}, {}):
    try:
        _ = check_img_resp_cluster_klscore(
            df, cluster_label=3, klscore=0, img_path=img_path, **extra_kwargs
        )
        break
    except Exception as e:
        print(f"Error: {e}. This cluster might not enough images.")


### KL-Score 4

In [None]:
# Inspect cluster 3 / KL-Score 4 with the project helper; the return value is
# bound to `_` so that it is not echoed as cell output.
_ = check_img_resp_cluster_klscore(
    df,
    cluster_label=3,
    klscore=4,
    img_path=img_path,
    k=k,
)

# Get examples

In [None]:
# # Give me a df showing which clusters have which min max KL-Score
# kl_diffs = df_merged.groupby('cluster_label')['KL-Score'].agg(['min', 'max'])
# kl_diffs.sort_values('max', ascending=False, inplace=True)

In [None]:
# kl_diffs

In [None]:
# #for each cluster label give me 2 examples with different KL-Score

# clusters = df_merged['cluster_label'].unique()
# sorted_clusters = sorted(clusters)
# for cluster in sorted_clusters:
#     print(f"Cluster {cluster}:")
#     cluster_df = df_merged[df_merged['cluster_label'] == cluster]
    
#     if len(cluster_df) > 0:
#         for kl_score in cluster_df['KL-Score'].unique():
#             subset = cluster_df[cluster_df['KL-Score'] == kl_score]
#             if len(subset) >= 2:
#                 examples = subset.sample(n=2, random_state=42)
#                 print(f"  KL-Score {kl_score}:")
#                 display(examples[['name', 'id', 'KL-Score', 'pain', 'age', 'ce_bmi', 'ce_fm']])
#             else:
#                 examples = subset
#                 print(f"  KL-Score {kl_score}:")
#                 display(examples[['name', 'id', 'KL-Score', 'pain', 'age', 'ce_bmi', 'ce_fm']])
#     else:
#         print("  No data available for this cluster.")
