In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from IPython.display import display
import json
import datetime

from utils.data_exploration_utils import plot_hist, scatterplot, barplots, check_img_resp_cluster_klscore

In [None]:
# Clustering run under analysis: data/processed/2025-08-11_hdbscan/run150

today = datetime.date.today()

proc_dir = config.PROC_DATA_PATH

# The date prefix of the run folder name is reused in the CSV file name below.
folder = "2025-08-11_hdbscan"
folder_date = folder.partition('_')[0]
run = "run150"
# run = "run145"

# Image directory (presumably the 600x600 px images, judging by the folder name).
img_path = os.path.join(config.SCHULTHESS_DATAPATH, "600x600_imgs")

filepath = os.path.join(proc_dir, folder, run)

# Per-sample table for this run; the file name suggests UMAP + HDBSCAN output
# with KL scores attached -- TODO confirm against the pipeline that writes it.
csv_name = f'{folder_date}_hdbscan_{run}_umap_hdbscan_scaled_wKL.csv'
df = pd.read_csv(os.path.join(filepath, csv_name))

display(df.head())

# Some Exploration

In [None]:
# How often each KL-Score value occurs (value_counts skips NaN by default).
kl_score_counts = df['KL-Score'].value_counts()
kl_score_counts

In [None]:
# Number of samples per cluster, ordered by cluster label.
# Reading labels from the Series index (instead of value_counts().reset_index())
# avoids depending on the 'count' column name, which only exists in pandas >= 2.0;
# older versions name the columns 'index' / 'cluster_label' and would fail here.
cluster_counts = df['cluster_label'].value_counts().sort_index()

fig, ax = plt.subplots()
ax.bar(cluster_counts.index, cluster_counts.to_numpy(), color='skyblue')
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Count')
ax.set_title('Samples per cluster')
plt.show()

# Correlation

## Correlation with Questionnaire Scores

In [None]:
# Questionnaire score columns, plus the cluster assignment they are compared with.
columns_corr = [
    'OKS_score', 'UCLA_score', 'FJS_score',
    'KOOS_pain', 'KOOS_symptoms', 'KOOS_sport', 'KOOS_adl', 'KOOS_qol',
    'cluster_label',
]

# NOTE(review): a rank correlation treats cluster_label as an ordinal quantity;
# confirm that the numeric order of the cluster labels is meaningful.
corr_types = ['spearman']
for method in corr_types:
    print(f"Calculating {method} correlation...")

    corr_matrix = df[columns_corr].corr(method=method)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        square=True,
        cbar_kws={"shrink": .8},
        ax=ax,
    )
    ax.set_title(f"{method.capitalize()} Correlation Heatmap")
    # fig.savefig(os.path.join(img_save_dir, f"{method}corr.png"))  # img_save_dir is not defined in this notebook
    plt.show()

## Correlation with KL-Score and Pain

In [None]:
# Columns compared with the cluster assignment in this heatmap.
# (KL-Score is not part of this list; it is handled in the cells that follow.)
columns_corr = ['pain', 'age', 'ce_bmi', 'ce_fm', 'cluster_label']

# NOTE(review): a rank correlation treats cluster_label as an ordinal quantity;
# confirm that the numeric order of the cluster labels is meaningful.
corr_types = ['spearman']
for method in corr_types:
    print(f"Calculating {method} correlation...")

    corr_matrix = df[columns_corr].corr(method=method)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        square=True,
        cbar_kws={"shrink": .8},
        ax=ax,
    )
    ax.set_title(f"{method.capitalize()} Correlation Heatmap")
    # fig.savefig(os.path.join(img_save_dir, f"{method}corr.png"))  # img_save_dir is not defined in this notebook
    plt.show()

In [None]:
# Author's note: 'label' is ground truth (that column is not used in this cell).

# Spearman correlation of the cluster label with KL-Score and with pain.
# Missing values are dropped per pair rather than jointly over all three columns,
# so a row lacking only `pain` still contributes to the KL-Score correlation.
# This is the same pairwise handling DataFrame.corr applies in the heatmaps.
for target in ('KL-Score', 'pain'):
    pair = df[['cluster_label', target]].dropna()
    rho, p_value = stats.spearmanr(pair['cluster_label'], pair[target])
    print(
        f"Spearman correlation between cluster label and {target}: {rho:.3f} "
        f"(p={p_value:.3g}, n={len(pair)})"
    )

In [None]:
# Drop the cluster column from the list of columns to plot.
# A filter is used instead of list.remove() so the cell can be re-run:
# remove() raises ValueError once 'cluster_label' is already gone.
columns_corr = [col for col in columns_corr if col != 'cluster_label']

In [None]:
# Pain and KL-Score per cluster, drawn by the project helper `barplots`
# (what it aggregates is defined in utils.data_exploration_utils).
columns_corr = ['pain', 'KL-Score']
barplots(
    df,
    y_list=columns_corr,
    x='cluster_label',
    hue=None,
    figsize=(12, 10),
    savepath=None,
)

In [None]:
# Same plot as the previous cell (reuses `columns_corr` from it), split by gender.
barplots(
    df,
    y_list=columns_corr,
    x='cluster_label',
    hue='gender',
    figsize=(6, 6),
    savepath=None,
)

# KL-Score Visualization

In [None]:
# For every cluster label (ascending), show how often each KL-Score occurs in it.
labels = sorted(df['cluster_label'].unique())

for cluster in labels:
    kl_in_cluster = df.loc[df['cluster_label'] == cluster, 'KL-Score']
    print(f"For label {cluster}:")
    display(kl_in_cluster.value_counts().reset_index())
    print()

In [None]:
# Inspect cluster 0 / KL-Score 0 with the project helper.
# NOTE(review): `k` presumably sets how many examples are shown -- confirm in utils.
_ = check_img_resp_cluster_klscore(
    df,
    cluster_label=0,
    klscore=0,
    img_path=img_path,
    k=5,
)

In [None]:
# Inspect cluster 0 / KL-Score 3 with the project helper (helper's default `k`).
_ = check_img_resp_cluster_klscore(
    df,
    cluster_label=0,
    klscore=3,
    img_path=img_path,
)

In [None]:
# Rows assigned to cluster 0 whose KL-Score is 3, and the ids of those rows.
in_cluster_0 = df['cluster_label'] == 0.0
has_kl_3 = df['KL-Score'] == 3
tocheck = df[in_cluster_0 & has_kl_3]
idtocheck = list(tocheck['id'])


In [None]:
# Collect the image files belonging to the ids in `idtocheck`.
#
# Only one directory is searched: the '3' sub-folder of the first sub-directory
# that os.walk reports for `img_path`. This matches the earlier nested loop,
# which broke out after its first iteration at both levels.
# NOTE(review): '3' presumably is the KL-Score 3 folder -- confirm the directory layout.
#
# Loop names no longer overwrite the notebook-level `folder` or the builtin `id`.
# Ids are formatted into the file name, so non-string ids work as well.
l = []
wanted_files = [f"{sample_id}.png" for sample_id in idtocheck]

# os.walk yields nothing for a missing/unreadable root; fall back to "no sub-directories".
_, subdirs, _ = next(os.walk(img_path), (None, [], []))
if subdirs:
    basedir = os.path.join(img_path, subdirs[0], '3')
    available = set(os.listdir(basedir))  # set lookup instead of repeated list scans
    l = [os.path.join(basedir, name) for name in wanted_files if name in available]

In [None]:
# Show every image collected in `l`, titled with its file name.
# `load_image` is neither imported nor defined in this notebook, so the cell
# raised NameError on a fresh kernel; matplotlib's own reader is used instead.
# NOTE(review): if `load_image` did extra preprocessing, import it explicitly instead.
for path in l:
    img = plt.imread(path)
    fig, ax = plt.subplots()
    ax.imshow(img, cmap='gray')
    ax.set_title(os.path.basename(path))
    plt.show()

# Get examples

In [None]:
# # Build a DataFrame with the minimum and maximum KL-Score found in each cluster
# kl_diffs = df_merged.groupby('cluster_label')['KL-Score'].agg(['min', 'max'])
# kl_diffs.sort_values('max', ascending=False, inplace=True)

In [None]:
# kl_diffs

In [None]:
# # For each cluster label, show up to 2 example rows per KL-Score

# clusters = df_merged['cluster_label'].unique()
# sorted_clusters = sorted(clusters)
# for cluster in sorted_clusters:
#     print(f"Cluster {cluster}:")
#     cluster_df = df_merged[df_merged['cluster_label'] == cluster]
    
#     if len(cluster_df) > 0:
#         for kl_score in cluster_df['KL-Score'].unique():
#             subset = cluster_df[cluster_df['KL-Score'] == kl_score]
#             if len(subset) >= 2:
#                 examples = subset.sample(n=2, random_state=42)
#                 print(f"  KL-Score {kl_score}:")
#                 display(examples[['name', 'id', 'KL-Score', 'pain', 'age', 'ce_bmi', 'ce_fm']])
#             else:
#                 examples = subset
#                 print(f"  KL-Score {kl_score}:")
#                 display(examples[['name', 'id', 'KL-Score', 'pain', 'age', 'ce_bmi', 'ce_fm']])
#     else:
#         print("  No data available for this cluster.")
