In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from IPython.display import display
import json
import datetime
import math

from utils.data_exploration_utils import plot_hist, scatterplot, barplots, check_img_resp_cluster_klscore, boxplot
from utils.hdbscan_utils import plot_hdbscan, plot_hdbscan_highlight_kl, make_cluster_color_map
from utils.plot_utils import plotly_hdbscan_highlight_kl

In [None]:
# Locate one HDBSCAN run below the processed-data directory:
#   <PROC_DATA_PATH>/<folder>/questionnaire/<run>

today = datetime.date.today()

folder = "2025-08-23_hdbscan"
run = "run2"
folder_date = folder.split('_')[0]

proc_dir = config.PROC_DATA_PATH
filepath = os.path.join(proc_dir, folder, "questionnaire", run)

img_path = config.SCHULTHESS_DATAPATH
# img_path = os.path.join(img_path, "600x600_imgs")

# Figures created by this notebook are written to an "img" folder inside the run folder.
save_path = os.path.join(filepath, "img")
os.makedirs(save_path, exist_ok=True)

clustered_csv = os.path.join(filepath, f'questionnaire_{run}_umap_hdbscan_scaled_wKL.csv')
df = pd.read_csv(clustered_csv)

display(df.head())

In [None]:
# IDs stored next to the run's other outputs (x_umap_ids.csv).
ids_csv = os.path.join(filepath, "x_umap_ids.csv")
ids = pd.read_csv(ids_csv)
print(ids.shape)

# Some Exploration

In [None]:
# Number of rows per KL-Score value.
kl_score_counts = df['KL-Score'].value_counts()
kl_score_counts

In [None]:
# Number of rows per cluster, ordered by cluster label.
cluster_counts = df['cluster_label'].value_counts().reset_index()
cluster_counts.sort_values('cluster_label')

In [None]:
# Cluster sizes as a bar chart, ordered by cluster label.
values = (
    df['cluster_label']
    .value_counts()
    .reset_index()
    .sort_values(by='cluster_label')
)

fig, ax = plt.subplots()
ax.bar(values['cluster_label'], values['count'], color='skyblue')
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Count')
plt.show()

# Plot

In [None]:
folder2 = "2025-08-11_data_exploration"
df_filename = "inmodi_data_questionnaire_kl_woSC.csv"

raw_csv = os.path.join(proc_dir, folder2, df_filename)
df2 = pd.read_csv(raw_csv)

# Inner join: keep only questionnaire rows whose 'name' matches an 'id' listed in `ids`.
df2 = pd.merge(ids, df2, left_on='id', right_on='name', how='inner')

print(df2.shape)

In [None]:
# Embedding array saved alongside the run's CSV files.
embeddings_path = os.path.join(filepath, "X_umap_embeddings.npy")
X_umap = np.load(embeddings_path)
print(X_umap.shape)

In [None]:
# Attach the clustering result (cluster label + membership probability) from `df` to the
# raw questionnaire rows. Left join: rows without a match in `df` get NaN in both columns.
cluster_cols = ['cluster_label', 'probability']
df2 = (
    df2
    # Remove results left over from an earlier execution of this cell; otherwise the merge
    # would create suffixed duplicates (cluster_label_x / cluster_label_y) on a re-run.
    .drop(columns=cluster_cols, errors='ignore')
    .merge(df[['name'] + cluster_cols], on="name", how='left')
    # 'Unnamed: 0' is already gone when the cell is executed a second time.
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
print(df2.shape)

In [None]:
# Rows without a cluster assignment (NaN after the left merge) get the label -1, the value
# the later cells treat as noise.
# The result is assigned back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which pandas warns about and which does not
# update df2 once Copy-on-Write is active.
df2['cluster_label'] = df2['cluster_label'].fillna(-1)

# Renumber the rows 0..n-1; the preview shows that numbering as an explicit 'index' column.
df2 = df2.reset_index(drop=True)
display(df2.reset_index().head())

# The row number is written to the CSV as a column named 'index'.
df2.to_csv(os.path.join(filepath, f"questionnaire_{run}_umap_hdbscan_scaled_wKL_v2.csv"), index=True, index_label = 'index')

In [None]:
# Cluster sizes after the merge, ordered by cluster label.
cluster_counts_v2 = df2['cluster_label'].value_counts().reset_index()
display(cluster_counts_v2.sort_values('cluster_label'))

In [None]:
# Cluster sizes of the merged frame as a bar chart; the figure is also written to save_path.
values = (
    df2['cluster_label']
    .value_counts()
    .reset_index()
    .sort_values(by='cluster_label')
)

fig, ax = plt.subplots()
ax.bar(values['cluster_label'], values['count'], color='skyblue')
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Count')
fig.savefig(os.path.join(save_path, f'{folder}_{run}_distribution_cluster_rawq.png'))
plt.show()

### Cluster Label vs KL-Score

In [None]:
# Strip plot: membership probability per cluster, coloured by KL-Score.
# `color_map` is indexed by KL-Score value from here on (until it is rebuilt further down).
color_map = make_cluster_color_map(df2['KL-Score'].unique())

# catplot is a figure-level function and always creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind. The size is therefore set
# through height/aspect (height=10, aspect=2 -> about the 20x10 canvas that was requested).
sns.catplot(
    data=df2,
    x='cluster_label',
    y='probability',
    hue='KL-Score',
    palette=color_map,
    jitter=0.3,
    height=10,
    aspect=2,
)
plt.show()

In [None]:
# One panel per KL-Score: membership probability split by cluster label.
kls = sorted(df2['KL-Score'].unique())

ncols = 2
nrows = math.ceil(len(kls) / ncols)
fig, ax = plt.subplots(nrows, ncols, figsize=(20, 4 * nrows), sharey=True)
ax = np.ravel(ax)  # 1-D view so panels can be paired with KL-Scores directly

# zip stops after the last KL-Score, so surplus panels are simply not drawn into
for panel, kl in zip(ax, kls):
    subset = df2.loc[df2['KL-Score'] == kl]
    sns.boxplot(data=subset, x='cluster_label', y='probability', color=color_map[kl], ax=panel)
    panel.set_title(f"KL-Score = {kl}")

# panels left over when the number of KL-Scores does not fill the grid
for spare in ax[len(kls):]:
    spare.set_visible(False)

plt.tight_layout()
fig.savefig(os.path.join(save_path, f'{folder}_{run}_probability_cluster_klscore_rawq.png'))
plt.show()

In [None]:
# One panel per cluster (noise label -1 excluded): membership probability split by KL-Score.
cluster_labels = [label for label in sorted(df2['cluster_label'].unique()) if label != -1]

# Colours are looked up by cluster label here. `color_map` was built from the KL-Score
# values, so indexing it with a cluster label picks an unrelated colour (or fails for a
# label that is not a KL-Score value). A palette built from the cluster labels is used instead.
cluster_color_map = make_cluster_color_map(df2['cluster_label'].unique())

ncols = 2
nrows = math.ceil(len(cluster_labels) / ncols)
fig, ax = plt.subplots(nrows, ncols, figsize=(20, 4 * nrows), sharey=True)
ax = np.ravel(ax)  # flatten to 1D

for idx, label in enumerate(cluster_labels):
    sns.boxplot(
        data=df2[df2['cluster_label'] == label],
        x='KL-Score', y='probability',
        ax=ax[idx], color=cluster_color_map[label]
    )
    ax[idx].set_title(f"cluster_label = {label}")

# hide any unused axes
for j in range(len(cluster_labels), len(ax)):
    ax[j].set_visible(False)

plt.tight_layout()
plt.savefig(os.path.join(save_path, f'{folder}_{run}_probability_cluster_klscore_v2_rawq.png'))
plt.show()

In [None]:
# Number of patients per cluster, split by KL-Score.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df2,
    x='cluster_label',
    hue='KL-Score',
    multiple='dodge',
    palette=color_map,
    # cluster labels are numeric ids, not a continuous variable: one unit-wide bin centred
    # on each label instead of automatically chosen bin edges
    discrete=True,
    ax=ax,
)
ax.set_xlabel('Cluster Label')
plt.show()

Cluster 0 contains only one patient with a KL-Score of 3; all other patients in this cluster have a KL-Score between 0 and 2.

### Cluster Label vs KL-Score vs Probability

In [None]:
# Independent copy of df2 without the noise points (cluster_label == -1).
is_clustered = df2['cluster_label'] != -1
df3 = df2.loc[is_clustered].copy()

In [None]:
# Pairwise scatter plots of cluster label, membership probability and KL-Score (noise removed).
pair_columns = ['cluster_label', 'probability', 'KL-Score']
sns.pairplot(df3[pair_columns], kind="scatter")

Probability vs. cluster label: looks relatively even, except that cluster 0 has a few points with low probability.
KL-Score vs. probability: some of the lower KL-Scores also have low probability, while for KL-Score 4 the model seems to be quite certain.
=> I guess this shows again that the higher KL-Scores are easier to distinguish from the incorrect ones; they are all in the same cluster, though.

## Other patterns

In [None]:
base_col = ['cluster_label', 'KL-Score']
cols = [ 'pain', 'age',  'ce_bmi', 'ce_fm', 'gender']

# hue is the cluster label, so the palette has to be indexed by cluster label. `color_map`
# was built from the KL-Score values further up and only becomes a cluster palette in a
# later cell, which means this plot depended on out-of-order execution.
cluster_color_map = make_cluster_color_map(df2['cluster_label'].unique())

# pairplot is figure-level and creates its own figure; a preceding plt.figure(figsize=...)
# would only add an empty figure. Saving through the returned grid targets the right figure.
pair_grid = sns.pairplot(
    df3[base_col + cols],
    hue='cluster_label',
    palette=cluster_color_map,
    kind="scatter",
)
pair_grid.savefig(os.path.join(save_path, f'{folder}_{run}_pairplot_rawq.png'))
plt.show()

In [None]:
# sns.histplot(data=df2, x='KL-Score', y='pain', hue='cluster_label', palette=color_map, element='step')

## Plot HDBSCAN

In [None]:
# From here on `color_map` is built from the cluster labels (it was built from KL-Score before).
cluster_ids = df2['cluster_label'].unique()
color_map = make_cluster_color_map(cluster_ids)

In [None]:
kl = list(df2['KL-Score'].unique())

# Row index of every KL-Score value.
# NOTE(review): `.index` yields index labels while the check below uses `.iloc` (positions);
# the two agree only as long as df2 carries a default 0..n-1 index - confirm.
kl_indexes = {score: df2[df2['KL-Score'] == score].index for score in kl}
for score in kl:
    print(len(kl_indexes[score]))

print("To Double check that the indexing works, should only see on KL-Score in Value Counts:")
for score in kl:
    selected = df2['KL-Score'].iloc[kl_indexes[score]]
    print(selected.value_counts())

See plotly_scatter.py for 3D visualization: src/symptomatic/plotly_scatter.py

# Correlation

## Correlation with Questionnaire Scores

**!!! Careful: this does not actually make sense. The clusters are not ranked, so we need another way to test/measure the relationship between ordinal features (questionnaire data) and categorical values (cluster labels).**

### Kruskal-Wallis (non-parametric ANOVA)

Tests whether the distribution of a feature differs significantly across clusters. It only tests for differences, not for effect size.

H_0: The samples have the same central tendency, so samples originate from the same distribution.
H_1: at least one sample doesn't have the same central tendency, so at least one sample stochastically dominates one other sample.

In [None]:
from scipy.stats import kruskal

def kruskal_wallis(df, feature, cluster_col = 'cluster_label'):
    """Kruskal-Wallis H-test of ``feature`` across the groups defined by ``cluster_col``.

    Rows with a missing ``feature`` value or a missing cluster label are ignored.
    Without this, a single NaN inside any group makes ``scipy.stats.kruskal`` return
    NaN for both statistic and p-value, and a NaN cluster label yields an empty group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature`` and ``cluster_col``.
    feature : str
        Column holding the values to compare.
    cluster_col : str, default 'cluster_label'
        Column holding the group assignment.

    Returns
    -------
    tuple
        ``(stat, p)``: the H-statistic and its p-value.

    Raises
    ------
    ValueError
        Raised by ``scipy.stats.kruskal`` when fewer than two groups remain.
    """
    valid = df.dropna(subset=[feature, cluster_col])
    # one sample per cluster; groupby only yields labels that actually occur
    groups = [values for _, values in valid.groupby(cluster_col)[feature] if len(values) > 0]
    stat, p = kruskal(*groups)
    return stat, p

In [None]:
# Questionnaire items (oks_* and koos_* columns) tested against the cluster assignment.
columns_corr = [
       'oks_q1', 'oks_q2', 'oks_q3', 'oks_q4',
       'oks_q5', 'oks_q6', 'oks_q7', 'oks_q8', 'oks_q9', 'oks_q10', 'oks_q11',
       'oks_q12', 'koos_s1',
       'koos_s2', 'koos_s3', 'koos_s4', 'koos_s5', 'koos_s6',
       'koos_s7', 'koos_p1', 'koos_p2', 'koos_p3', 'koos_p4', 'koos_p5',
       'koos_p6', 'koos_p7', 'koos_p8', 'koos_p9', 'koos_a1', 'koos_a2',
       'koos_a3', 'koos_a4', 'koos_a5', 'koos_a6', 'koos_a7', 'koos_a8',
       'koos_a9', 'koos_a10', 'koos_a11', 'koos_a12', 'koos_a13', 'koos_a14',
       'koos_a15', 'koos_a16', 'koos_a17',  'koos_sp1', 'koos_sp2', 'koos_sp3',
       'koos_sp4', 'koos_sp5',
       'koos_q1', 'koos_q2', 'koos_q3', 'koos_q4'
       #, 'cluster_label'
       ]

results = []
for feature in columns_corr:
    # A NaN inside any group makes the test return NaN, so missing answers are dropped
    # per item before testing.
    df_wonan = df.dropna(subset=[feature])
    stat, p = kruskal_wallis(df_wonan, feature, cluster_col='cluster_label')
    results.append({'feature': feature, 'H-statistic': stat, 'p-value': p})

results_df = pd.DataFrame(results)

display(results_df.sort_values('p-value').head())

# Saved under its own name: a later cell of this notebook writes
# kruskal_wallis_results_{run}.csv, which would overwrite these item-level results.
results_df.to_csv(os.path.join(filepath, f"kruskal_wallis_results_{run}_questionnaire.csv"), index=False)

# Items whose distribution does not differ significantly between clusters (alpha = 0.05).
# A bare expression in the middle of a cell is never rendered, hence the explicit display().
display(results_df[results_df['p-value'] >= 0.05])

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=results_df, x='feature', y='H-statistic', ax=ax)
ax.tick_params(axis='x', rotation=90)
plt.show()

H-statistic:
* small: if all clusters have similar distribution, their average rank will be similar
* large: at least one group's distribution is shifted, relative to the others.

## Correlation with KL-Score and Pain

### Kruskal-Wallis

In [None]:

# Sanity check before testing: per feature, the overall number of missing values and,
# per cluster, how many non-missing values exist and which range they cover.
for feature in ['pain', 'age', 'ce_bmi', 'ce_fm']:
    # Both lines describe the whole feature, so they are printed once per feature rather
    # than being repeated for every cluster.
    print(f"For Feature {feature}")
    print(f"NaN values: {df[feature].isna().sum()}")
    for c in df["cluster_label"].unique():
        vals = df.loc[df["cluster_label"] == c, feature].dropna()
        print(f"Cluster {c}: n={len(vals)}, unique={vals.nunique()}, min={vals.min()}, max={vals.max()}")
    print()


In [None]:
# Kruskal-Wallis test of the non-questionnaire features against the cluster assignment.
columns_corr = [
 'pain', 'age',
       'ce_bmi', 'ce_fm'
       ]

results = []
for feature in columns_corr:
    # rows with a missing value for this feature are excluded from its test
    df_wonan = df.dropna(subset=[feature])
    stat, p = kruskal_wallis(df_wonan, feature, cluster_col='cluster_label')
    results.append({'feature': feature, 'H-statistic': stat, 'p-value': p})

results_df = pd.DataFrame(results)

display(results_df.sort_values('p-value').head())
results_df.to_csv(os.path.join(filepath, f"kruskal_wallis_results_{run}.csv"), index=False)

# Features whose distribution does not differ significantly between clusters (alpha = 0.05).
# A bare expression in the middle of a cell is never rendered, hence the explicit display().
display(results_df[results_df['p-value'] >= 0.05])

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=results_df, x='feature', y='H-statistic', ax=ax)
ax.tick_params(axis='x', rotation=90)
plt.show()

### Plots

In [None]:
# Bar plots of pain and KL-Score per cluster label, written to save_path by the helper.
columns_corr = ['pain', 'KL-Score']
barplots(
    df,
    y_list=columns_corr,
    x='cluster_label',
    hue=None,
    figsize=(6, 6),
    savepath=save_path,
)

#### Boxplot

In [None]:
# Questionnaire item columns follow the pattern <prefix><1..n>; build them from
# (prefix, number of items) pairs in the order the items are plotted.
item_blocks = (
    ('oks_q', 12),
    ('koos_s', 7),
    ('koos_p', 9),
    ('koos_a', 17),
    ('koos_sp', 5),
    ('koos_q', 4),
)
columns_corr = [
    f"{prefix}{number}"
    for prefix, n_items in item_blocks
    for number in range(1, n_items + 1)
]
# non-questionnaire features are appended after the items
columns_corr += ['pain', 'KL-Score', 'age', 'ce_bmi', 'ce_fm']
#, 'cluster_label'

boxplot(
    df,
    y_list=columns_corr,
    n_cols=2,
    x='cluster_label',
    hue=None,
    savepath=save_path,
)

# KL-Score Visualization

In [None]:
# KL-Score distribution inside every cluster, clusters in ascending label order.
labels = sorted(df['cluster_label'].unique())

for label in labels:
    members = df[df['cluster_label'] == label]
    kl_counts = members['KL-Score'].value_counts().reset_index()
    print(f"For label {label}:")
    display(kl_counts.sort_values(by="KL-Score"))
    print()

In [None]:
# Value forwarded as `k` to the check_img_resp_cluster_klscore calls below.
k=2

## Cluster 0

### KL score 0

In [None]:
# Cluster 0 / KL-Score 0; the helper's return value is not used.
_ = check_img_resp_cluster_klscore(
    df,
    cluster_label=0,
    klscore=0,
    img_path=img_path,
    k=k,
)

### KL-Score 3

In [None]:
# Cluster 0 / KL-Score 3; `k` is not passed here, so the helper's default applies.
img = check_img_resp_cluster_klscore(
    df,
    cluster_label=0,
    klscore=3,
    img_path=img_path,
)

## Cluster 1

### KL score 0

In [None]:
# Cluster 1 / KL-Score 0; the helper's return value is not used.
_ = check_img_resp_cluster_klscore(
    df,
    cluster_label=1,
    klscore=0,
    img_path=img_path,
    k=k,
)

### KL-Score 1

In [None]:
# Cluster 1 / KL-Score 1. Uses the shared `k` like the sibling cells instead of a
# hard-coded 2, so changing `k` once affects every check consistently.
_ = check_img_resp_cluster_klscore(df, cluster_label=1, klscore=1, img_path=img_path, k=k)

## Cluster 2

### KL score 0

In [None]:
# Cluster 2 / KL-Score 0; the helper's return value is not used.
_ = check_img_resp_cluster_klscore(
    df,
    cluster_label=2,
    klscore=0,
    img_path=img_path,
    k=k,
)

### KL-Score 3

In [None]:
# Cluster 2 / KL-Score 3; the helper's return value is not used.
_ = check_img_resp_cluster_klscore(
    df,
    cluster_label=2,
    klscore=3,
    img_path=img_path,
    k=k,
)

## Cluster 3

### KL-Score 0

In [None]:
# Cluster 3 / KL-Score 0; the helper's return value is not used.
_ = check_img_resp_cluster_klscore(
    df,
    cluster_label=3,
    klscore=0,
    img_path=img_path,
    k=k,
)

### KL-Score 4

In [None]:
# Cluster 3 / KL-Score 4; the helper's return value is not used.
_ = check_img_resp_cluster_klscore(
    df,
    cluster_label=3,
    klscore=4,
    img_path=img_path,
    k=k,
)

# Get examples

In [None]:
# # Give me a df showing which clusters have which min max KL-Score
# kl_diffs = df_merged.groupby('cluster_label')['KL-Score'].agg(['min', 'max'])
# kl_diffs.sort_values('max', ascending=False, inplace=True)

In [None]:
# kl_diffs

In [None]:
# #for each cluster label give me 2 examples with different KL-Score

# clusters = df_merged['cluster_label'].unique()
# sorted_clusters = sorted(clusters)
# for cluster in sorted_clusters:
#     print(f"Cluster {cluster}:")
#     cluster_df = df_merged[df_merged['cluster_label'] == cluster]
    
#     if len(cluster_df) > 0:
#         for kl_score in cluster_df['KL-Score'].unique():
#             subset = cluster_df[cluster_df['KL-Score'] == kl_score]
#             if len(subset) >= 2:
#                 examples = subset.sample(n=2, random_state=42)
#                 print(f"  KL-Score {kl_score}:")
#                 display(examples[['name', 'id', 'KL-Score', 'pain', 'age', 'ce_bmi', 'ce_fm']])
#             else:
#                 examples = subset
#                 print(f"  KL-Score {kl_score}:")
#                 display(examples[['name', 'id', 'KL-Score', 'pain', 'age', 'ce_bmi', 'ce_fm']])
#     else:
#         print("  No data available for this cluster.")
