In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from IPython.display import display
import json
import datetime
import math

from utils.data_exploration_utils import plot_hist, scatterplot, barplots, check_img_resp_cluster_klscore, boxplot
from utils.hdbscan_utils import plot_hdbscan, plot_hdbscan_highlight_kl, make_cluster_color_map
from utils.plot_utils import plotly_hdbscan_highlight_kl

In [None]:
# Select one HDBSCAN pipeline run and load its outputs.
# Run directory layout: <PROC_DATA_PATH>/<folder>/pipeline/<run>

today = datetime.date.today()  # not referenced in the visible cells

proc_dir = config.PROC_DATA_PATH


folder = "2025-10-12_hdbscan"
run = "run123"
folder_date = folder.split('_')[0]  # text before the first underscore, i.e. "2025-10-12"


img_path = config.SCHULTHESS_DATAPATH

filepath = os.path.join(proc_dir, folder, "pipeline", run)
save_path = os.path.join(filepath, "img")  # figures saved by later cells go here
os.makedirs(save_path, exist_ok=True)
# Clustering output that includes the 'KL-Score' column (file suffix "_wKL").
df = pd.read_csv(os.path.join(filepath, f'pipeline_{run}_umap_hdbscan_scaled_wKL.csv'))

# Clustering output without the "_wKL" suffix; later cells use its 'id' column as merge key.
df_ids = pd.read_csv(os.path.join(filepath, f'pipeline_{run}_umap_hdbscan_scaled.csv'))

display(df.head())

In [None]:
# Load the array stored alongside the pipeline CSVs.
# NOTE(review): presumably the UMAP embedding that was clustered — confirm against the pipeline.
embeddings_path = os.path.join(filepath, "X_umap_embeddings.npy")

X_umap = np.load(embeddings_path)


In [None]:
# Dimensions of the loaded array.
X_umap.shape

In [None]:
# 'id' values of the clustered rows as a plain list (not referenced in the visible cells).
ids = df_ids['id'].to_list()

# Some Exploration

In [None]:
# Number of rows per KL-Score value.
df['KL-Score'].value_counts()

In [None]:
# Number of rows per cluster label, ordered by label.
df['cluster_label'].value_counts().reset_index().sort_values('cluster_label')

In [None]:
# Rows per cluster label, ordered by label.
values = (
    df['cluster_label']
    .value_counts()
    .reset_index()
    .sort_values(by='cluster_label')
)

# Bar chart of the cluster sizes.
plt.bar(x=values['cluster_label'], height=values['count'], color='skyblue')
plt.xlabel('Cluster Label')
plt.ylabel('Count')
plt.show()

In [None]:
# Questionnaire table stored under the data-exploration folder; the next cell joins it
# onto the clustering output through its 'name' column.
folder2 = "2025-08-11_data_exploration"
df_filename = "inmodi_data_questionnaire_kl_woSC.csv"

pi = pd.read_csv(os.path.join(proc_dir, folder2, df_filename))

In [None]:
# Attach the questionnaire columns to every clustered row (df_ids.id matched to pi.name).
# validate='one_to_one' makes the merge raise if either key column contains duplicates.
df2 = (
    df_ids
    .merge(pi, how='left', left_on='id', right_on='name', validate='one_to_one')
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# Row/column count of the merged table.
df2.shape

In [None]:
# Preview of the merged table.
df2.head()

In [None]:
# Write the merged table into the run directory (file suffix "_wKL_v2"), without the index.
df2.to_csv(os.path.join(filepath, f"pipeline_{run}_umap_hdbscan_scaled_wKL_v2.csv"), index=False)

In [None]:
# Rows per cluster label in the merged table, ordered by label.
values = (
    df2['cluster_label']
    .value_counts()
    .reset_index()
    .sort_values(by='cluster_label')
)

# Bar chart of the cluster sizes; the figure is saved before it is shown.
plt.bar(x=values['cluster_label'], height=values['count'], color='skyblue')
plt.xlabel('Cluster Label')
plt.ylabel('Count')
distribution_png = os.path.join(save_path, f'{folder}_{run}_distribution_cluster_rawq.png')
plt.savefig(distribution_png)
plt.show()

### Cluster Label vs KL-Score

In [None]:
# Jittered strip plot of 'probability' per cluster label, coloured by KL-Score.
color_map = make_cluster_color_map(df2['KL-Score'].unique())

# sns.catplot is figure-level: it always creates its own figure. A preceding
# plt.figure(figsize=(20, 10)) therefore only produced an empty extra figure and did
# not size the plot. height * aspect gives the intended 20 x 10 inch canvas instead.
sns.catplot(
    data=df2,
    x='cluster_label',
    y='probability',
    hue='KL-Score',
    palette=color_map,
    jitter=0.3,
    height=10,
    aspect=2,
)
plt.show()

In [None]:
# One panel per KL-Score: distribution of 'probability' across cluster labels.
kls = sorted(df2['KL-Score'].unique())

ncols = 2
nrows = math.ceil(len(kls) / ncols)
fig, ax = plt.subplots(nrows, ncols, figsize=(20, 4 * nrows), sharey=True)
ax = np.ravel(ax)  # 1-D access regardless of the grid shape

for panel, kl in zip(ax, kls):
    rows_for_kl = df2[df2['KL-Score'] == kl]
    sns.boxplot(
        data=rows_for_kl,
        x='cluster_label', y='probability',
        ax=panel, color=color_map[kl]
    )
    panel.set_title(f"KL-Score = {kl}")

# Panels beyond the number of KL-Scores carry no data, so hide them.
for spare in ax[len(kls):]:
    spare.set_visible(False)

plt.tight_layout()
plt.savefig(os.path.join(save_path, f'{folder}_{run}_probability_cluster_klscore_rawq.png'))
plt.show()

In [None]:
# One panel per cluster (noise label -1 excluded): distribution of 'probability'
# across KL-Scores.
cluster_labels = sorted(label for label in df2['cluster_label'].unique() if label != -1)

# Colours have to be keyed by cluster label here. The `color_map` built in the cells
# above is keyed by KL-Score, so indexing it with a cluster label either raises
# KeyError or silently picks the colour of an unrelated KL-Score.
cluster_color_map = make_cluster_color_map(df2['cluster_label'].unique())

ncols = 2
nrows = math.ceil(len(cluster_labels) / ncols)
fig, ax = plt.subplots(nrows, ncols, figsize=(20, 4 * nrows), sharey=True)
ax = np.ravel(ax)  # flatten to 1D

for idx, label in enumerate(cluster_labels):
    sns.boxplot(
        data=df2[df2['cluster_label'] == label],
        x='KL-Score', y='probability',
        ax=ax[idx], color=cluster_color_map[label]
    )
    ax[idx].set_title(f"cluster_label = {label}")

# hide any unused axes
for j in range(len(cluster_labels), len(ax)):
    ax[j].set_visible(False)

plt.tight_layout()
plt.savefig(os.path.join(save_path, f'{folder}_{run}_probability_cluster_klscore_v2_rawq.png'))
plt.show()

In [None]:
# Dodged histogram: rows per cluster label, split by KL-Score.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df2,
    x='cluster_label',
    hue='KL-Score',
    multiple='dodge',
    palette=color_map,
    ax=hist_ax,
)
plt.show()

### Cluster Label vs KL-Score vs Probability

In [None]:
# Independent copy of the merged table without the noise label (-1).
df3 = df2.loc[df2['cluster_label'] != -1].copy()

In [None]:
# Pairwise scatter plots of cluster label, probability and KL-Score (noise rows excluded via df3).
sns.pairplot(df3[['cluster_label', 'probability', 'KL-Score']], kind = "scatter")

## Other patterns

In [None]:
# Pairwise scatter plots of selected covariates, coloured by cluster label.
base_col = ['cluster_label', 'KL-Score']
cols = ['pain', 'age', 'ce_bmi', 'ce_fm', 'gender']

# hue is the cluster label, so the palette must be keyed by cluster label. The
# KL-Score keyed `color_map` from the cells above does not cover these hue levels.
cluster_palette = make_cluster_color_map(df3['cluster_label'].unique())

# sns.pairplot is figure-level and creates its own figure, so no plt.figure() call
# precedes it (that only produced an empty extra figure).
sns.pairplot(df3[base_col + cols], hue='cluster_label', palette=cluster_palette, kind="scatter")
plt.savefig(os.path.join(save_path, f'{folder}_{run}_pairplot_rawq.png'))
plt.show()

## Barplots

In [None]:
# Numeric gender indicator for the plots below: 1.0 for 'male', 0.0 for any other
# recorded value. Rows without a recorded gender stay NaN instead of being silently
# counted as 0, which would bias the per-cluster distribution.
df2['is_male'] = (df2['gender'] == 'male').astype(float).where(df2['gender'].notna())

# Columns plotted per cluster in the following cells.
feature_cols = ['pain', 'age', 'ce_bmi', 'ce_fm', 'is_male', 'OKS_score', 'UCLA_score',
       'FJS_score', 'KOOS_pain', 'KOOS_symptoms', 'KOOS_sport', 'KOOS_adl',
       'KOOS_qol', 'KL-Score', 'oks_q1', 'oks_q2', 'oks_q3', 'oks_q4',
       'oks_q5', 'oks_q6', 'oks_q7', 'oks_q8', 'oks_q9', 'oks_q10', 'oks_q11',
       'oks_q12', 'koos_s1', 'koos_s2', 'koos_s3', 'koos_s4', 'koos_s5',
       'koos_s6', 'koos_s7', 'koos_p1', 'koos_p2', 'koos_p3', 'koos_p4',
       'koos_p5', 'koos_p6', 'koos_p7', 'koos_p8', 'koos_p9', 'koos_a1',
       'koos_a2', 'koos_a3', 'koos_a4', 'koos_a5', 'koos_a6', 'koos_a7',
       'koos_a8', 'koos_a9', 'koos_a10', 'koos_a11', 'koos_a12', 'koos_a13',
       'koos_a14', 'koos_a15', 'koos_a16', 'koos_a17', 'koos_sp1', 'koos_sp2',
       'koos_sp3', 'koos_sp4', 'koos_sp5', 'koos_q1', 'koos_q2', 'koos_q3',
       'koos_q4']
group_by= 'cluster_label'

In [None]:
def boxplot_with_stats(df, group_by, col, order=None, annotate=('min','mean','max')):
    """Draw a box plot of ``col`` per ``group_by`` group, annotated with summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``group_by`` and ``col`` columns.
    group_by : str
        Column whose values define the boxes on the x-axis.
    col : str
        Column plotted on the y-axis.
    order : sequence, optional
        Group values in drawing order. Defaults to the category order when
        ``group_by`` is categorical, otherwise to the sorted group keys returned by
        ``groupby``.
    annotate : iterable of str
        Statistics written above each box; any of 'min', 'max', 'mean', 'median',
        'std', 'count'.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # 1) per-group statistics. Named `summary` so the module-level scipy `stats`
    #    import is not shadowed inside the function.
    summary = (
        df.groupby(group_by, observed=False)[col]
          .agg(['min', 'max', 'mean', 'median', 'std', 'count'])
    )
    if order is None:
        # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
        if isinstance(df[group_by].dtype, pd.CategoricalDtype):
            order = list(df[group_by].cat.categories)
        else:
            order = list(summary.index)
    else:
        order = list(order)

    # reindex (rather than .loc) so a requested group without data yields a NaN row
    # instead of a KeyError
    summary = summary.reindex(order)

    # 2) plot
    plt.figure(figsize=(10, 6))
    ax = sns.boxplot(data=df, x=group_by, y=col, order=order)

    # 3) overlay the mean as a diamond marker
    ax.scatter(range(len(order)), summary['mean'].to_numpy(),
               marker='D', s=60, zorder=3, label='Mean')

    # 4) annotate each box just above that group's maximum; the padding is 2 % of the
    #    overall data range (or of 1.0 when the range is degenerate)
    y_min = df[col].min()
    y_max = df[col].max()
    pad = 0.02 * (y_max - y_min if y_max > y_min else 1.0)

    for i in range(len(order)):
        row = summary.iloc[i]
        if pd.isna(row['max']):
            # group has no valid observations: nothing to anchor the label to
            continue
        text = ", ".join(f"{k}={row[k]:.3g}" for k in annotate)  # 3 significant figures
        ax.annotate(text, xy=(i, row['max'] + pad), xytext=(0, 3),
                    textcoords='offset points', ha='center', va='bottom',
                    fontsize=9, bbox=dict(boxstyle='round,pad=0.2', fc='white', ec='0.6', alpha=0.8))

    ax.set_title(f"{col} by {group_by}")
    ax.legend(loc='best')
    plt.tight_layout()
    plt.show()

In [None]:
# Annotated box plot (min / mean / max per box) for every feature, grouped by `group_by`.
for col in feature_cols:
    boxplot_with_stats(
        df=df2,
        group_by=group_by,
        col=col,
        annotate=('min', 'mean', 'max'),
    )

# Correlation

In [None]:
from scipy.stats import kruskal

def kruskal_wallis(df, feature, cluster_col = 'cluster_label'):
    """Kruskal-Wallis H-test of ``feature`` across the groups defined by ``cluster_col``.

    Missing values of ``feature`` are dropped within each group; otherwise a single
    NaN makes ``scipy.stats.kruskal`` return NaN for the whole test. Groups left
    without any observation are excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``feature`` and ``cluster_col``.
    feature : str
        Column holding the values to compare.
    cluster_col : str
        Column holding the group labels.

    Returns
    -------
    tuple of (float, float)
        H statistic and p-value, or ``(nan, nan)`` when fewer than two groups
        contain data (the test is undefined in that case).
    """
    groups = [
        values.dropna()
        for _, values in df.groupby(cluster_col, sort=False)[feature]
    ]
    groups = [values for values in groups if len(values) > 0]
    if len(groups) < 2:
        return float('nan'), float('nan')
    stat, p = kruskal(*groups)
    return stat, p

In [None]:
# Kruskal-Wallis test of every questionnaire item (oks_* / koos_*) across cluster labels.
columns_corr = [
       'oks_q1', 'oks_q2', 'oks_q3', 'oks_q4',
       'oks_q5', 'oks_q6', 'oks_q7', 'oks_q8', 'oks_q9', 'oks_q10', 'oks_q11',
       'oks_q12', 'koos_s1',
       'koos_s2', 'koos_s3', 'koos_s4', 'koos_s5', 'koos_s6',
       'koos_s7', 'koos_p1', 'koos_p2', 'koos_p3', 'koos_p4', 'koos_p5',
       'koos_p6', 'koos_p7', 'koos_p8', 'koos_p9', 'koos_a1', 'koos_a2',
       'koos_a3', 'koos_a4', 'koos_a5', 'koos_a6', 'koos_a7', 'koos_a8',
       'koos_a9', 'koos_a10', 'koos_a11', 'koos_a12', 'koos_a13', 'koos_a14',
       'koos_a15', 'koos_a16', 'koos_a17',  'koos_sp1', 'koos_sp2', 'koos_sp3',
       'koos_sp4', 'koos_sp5',
       'koos_q1', 'koos_q2', 'koos_q3', 'koos_q4'
       ]

results = []
for feature in columns_corr:
    # Drop rows missing this item first, as the covariate test further down does;
    # a NaN passed to scipy's kruskal makes the whole test return NaN.
    df_wonan = df.dropna(subset=[feature])
    stat, p = kruskal_wallis(df_wonan, feature, cluster_col = 'cluster_label')
    results.append({'feature': feature, 'H-statistic': stat, 'p-value': p})

results_df = pd.DataFrame(results)

display(results_df.sort_values('p-value').head())
results_df.to_csv(os.path.join(filepath, f"kruskal_wallis_results_{run}.csv"), index=False)

# Items without a significant difference between clusters at the 0.05 level.
# display() is required: a bare expression in the middle of a cell shows nothing.
display(results_df[results_df['p-value'] >= 0.05])

plt.figure(figsize=(10, 6))
sns.barplot(data = results_df, x='feature', y='H-statistic')
plt.xticks(rotation=90)
plt.show()

## Correlation with KL-Score and Pain

### Kruskal-Wallis

In [None]:

# Sanity check before testing: missing values per feature and value range per cluster.
for feature in ['pain', 'age', 'ce_bmi', 'ce_fm']:
    # The header and the NaN count describe the whole column, so they are printed
    # once per feature instead of being repeated for every cluster.
    print(f"For Feature {feature}")
    print(f"NaN values: {df[feature].isna().sum()}")
    for c in df["cluster_label"].unique():
        vals = df.loc[df["cluster_label"]==c, feature].dropna()
        print(f"Cluster {c}: n={len(vals)}, unique={vals.nunique()}, min={vals.min()}, max={vals.max()}")
    print()


In [None]:
# Kruskal-Wallis test of pain, age, ce_bmi and ce_fm across cluster labels.
columns_corr = [
       'pain', 'age',
       'ce_bmi', 'ce_fm'
       ]

results = []
for feature in columns_corr:
    # Rows missing this feature are removed so the test does not return NaN.
    df_wonan = df.dropna(subset=[feature])
    stat, p = kruskal_wallis(df_wonan, feature, cluster_col = 'cluster_label')
    results.append({'feature': feature, 'H-statistic': stat, 'p-value': p})

results_df = pd.DataFrame(results)

display(results_df.sort_values('p-value').head())
# Separate file name: writing to kruskal_wallis_results_{run}.csv again would overwrite
# the questionnaire-item results saved by the earlier cell.
results_df.to_csv(os.path.join(filepath, f"kruskal_wallis_results_{run}_pain_age_bmi_fm.csv"), index=False)

# Features without a significant difference between clusters at the 0.05 level.
# display() is required: a bare expression in the middle of a cell shows nothing.
display(results_df[results_df['p-value'] >= 0.05])

plt.figure(figsize=(10, 6))
sns.barplot(data = results_df, x='feature', y='H-statistic')
plt.xticks(rotation=90)
plt.show()

### Plots

In [None]:
# Bar plots of pain and KL-Score per cluster label via the project helper `barplots`.
columns_corr = ['pain', 'KL-Score']
barplots(
    df,
    x='cluster_label',
    y_list=columns_corr,
    hue=None,
    figsize=(6, 6),
    savepath=save_path,
)

# Boxplot

In [None]:
# Box plots per cluster label for every questionnaire item plus pain, KL-Score, age,
# ce_bmi and ce_fm, drawn by the project helper `boxplot`.
# (prefix, number of items) pairs expand to prefix1 .. prefixN in the listed order.
item_groups = [
    ('oks_q', 12),
    ('koos_s', 7),
    ('koos_p', 9),
    ('koos_a', 17),
    ('koos_sp', 5),
    ('koos_q', 4),
]
columns_corr = [
    f'{prefix}{number}'
    for prefix, n_items in item_groups
    for number in range(1, n_items + 1)
]
columns_corr += ['pain', 'KL-Score', 'age', 'ce_bmi', 'ce_fm']

boxplot(df, y_list=columns_corr, n_cols=2, x='cluster_label', hue=None, savepath=save_path)

# Comparison: First vs. Second Visit


In [None]:
# Key every row as "<record_id>_<side>" so the visits of the same record/side can be
# paired. Vectorised string concatenation replaces the row-wise apply.
df2['name_v2'] = df2['record_id'].astype(str) + '_' + df2['side'].astype(str)

# One row per record/side. Without drop_duplicates every visit contributes its own
# row, so each pair would appear once per visit after the merges below and inflate
# the change counts. Building a new frame also avoids assigning into a slice of df2.
df_patient = (
    df2[['record_id', 'side', 'name_v2']]
    .drop_duplicates()
    .rename(columns={'name_v2': 'name'})
    .reset_index(drop=True)
)

visit_cols = ['name_v2', 'cluster_label', 'KL-Score']
first = df2[df2['visit']==1]
second = df2[df2['visit']==2]

# The first merge adds the plain column names; the second merge collides with them, so
# its suffixes turn both sets into *_first / *_second.
df_patient = df_patient.merge(first[visit_cols], left_on=['name'], right_on=['name_v2'], how='left')
df_patient = df_patient.merge(second[visit_cols], left_on=['name'], right_on=['name_v2'], how='left', suffixes=('_first', '_second'))

In [None]:
#count nan values
print(df_patient.isna().sum())

df_patient.dropna(inplace=True)

df_patient['cluster_change'] = df_patient['cluster_label_first'] != df_patient['cluster_label_second']

df_patient

In [None]:
# Rows whose cluster label changed between the two visits.
changed = df_patient[df_patient['cluster_change']==True]

# A bare `.shape` expression in the middle of a cell is never displayed, so print it.
print(changed.shape)
display(changed.sort_values(by=['cluster_label_first']))

# KL-Score Visualization

In [None]:
# KL-Score frequency table for every cluster label, in ascending label order.
labels = sorted(df['cluster_label'].unique())

for i in labels:
    df_temp = df[df['cluster_label'] == i]
    kl_counts = df_temp['KL-Score'].value_counts().reset_index()
    print(f"For label {i}:")
    display(kl_counts.sort_values(by="KL-Score"))
    print()

In [None]:
# Passed as `k` to check_img_resp_cluster_klscore below.
# NOTE(review): presumably the number of images to show per call — confirm against the helper.
k=2

In [None]:
# Show the image root directory handed to check_img_resp_cluster_klscore below.
img_path

In [None]:
# For every cluster, look up its lowest and highest KL-Score and call the image helper
# once for each of the two extremes.
for i in labels:
    cluster_kl = df.loc[df['cluster_label'] == i, 'KL-Score']
    min_kl = cluster_kl.min()
    max_kl = cluster_kl.max()
    print(f"For cluster {i}, min KL-Score: {min_kl}, max KL-Score: {max_kl}")
    for klscore in (min_kl, max_kl):
        _ = check_img_resp_cluster_klscore(df, cluster_label=i, klscore=klscore, img_path=img_path, k=k)