In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display
import json
import datetime

from utils.data_exploration_utils import plot_hist, scatterplot, barplots

In [None]:
# Load the KL-score table and the HDBSCAN clustering output of a single run.
today = datetime.date.today()

proc_dir = config.PROC_DATA_PATH

# KL-score table located in the data-exploration folder.
folder = "2025-07-14_data_exploration"
kl_file = "inmodi_data_personalinformation_kl.csv"
kl_filepath = os.path.join(proc_dir, folder, kl_file)

kl = pd.read_csv(kl_filepath)

# NOTE: `folder` is rebound here; from now on it names the HDBSCAN output folder.
folder = "2025-07-23_hdbscan"

# Sibling "<folder>_eval" directory, created up front.
save_dir = os.path.join(proc_dir, folder + "_eval")
os.makedirs(save_dir, exist_ok=True)

filepath = os.path.join(proc_dir, folder)

# Only this run is evaluated; fail loudly if it is missing instead of leaving
# `df` undefined (or stale from a previous kernel run) for the cells below.
run_dir = os.path.join(filepath, "run50")
if not os.path.isdir(run_dir):
    raise FileNotFoundError(f"HDBSCAN run directory not found: {run_dir}")

# Reset so a leftover kernel value can never be mistaken for freshly loaded data.
df = None
df_kl = None
for file in os.listdir(run_dir):
    if file.endswith("hdbscan_scaled.csv"):
        print(f"Processing file: {file}")
        df = pd.read_csv(os.path.join(run_dir, file))
        display(df.head())
    elif file.endswith("hdbscan_scaled_wKL.csv"):
        df_kl = pd.read_csv(os.path.join(run_dir, file))

if df is None:
    raise FileNotFoundError(
        f"No file ending in 'hdbscan_scaled.csv' found in {run_dir}"
    )
if df_kl is None:
    # Not needed by the cells directly below, so only warn.
    print(f"Warning: no file ending in 'hdbscan_scaled_wKL.csv' found in {run_dir}; df_kl is None")

In [None]:
# Unique ids of the rows whose cluster_label is -1 (treated as noise here).
noise_mask = df['cluster_label'] == -1
noise_points = list(set(df.loc[noise_mask, 'id']))

In [None]:
# KL-Score frequency among the kl rows whose name is in noise_points.
is_noise = kl['name'].isin(noise_points)
kl.loc[is_noise, 'KL-Score'].value_counts()

# Preprocessing

In [None]:
# Left join: every row of `kl` is kept; rows whose `name` has no matching `id`
# in `df` get NaN in the columns coming from `df`.
df_merged = kl.merge(
    df,
    how='left',
    left_on='name',
    right_on='id',
)

display(df_merged.head())

# Correlation

## Correlation with KL-Score and Pain

In [None]:
# Columns entering the correlation matrices; this list is reused by later cells.
columns_corr = ['pain', 'age', 'ce_bmi', 'ce_fm', 'cluster_label', 'KL-Score']

# One annotated heatmap per correlation method.
corr_types = ['pearson', 'kendall', 'spearman']
for corr in corr_types:
    print(f"Calculating {corr} correlation...")

    df_merged_corr = df_merged[columns_corr].corr(method=corr)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(
        df_merged_corr,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        square=True,
        cbar_kws={"shrink": .8},
        ax=ax,
    )
    ax.set_title(f"{corr.capitalize()} Correlation Heatmap")
    # Figures are only shown, not saved: the former savefig call was disabled
    # (it referenced `img_save_dir`, which is not defined in the visible cells).
    plt.show()

In [None]:
# 'cluster_label' becomes the grouping variable in the bar plots below, so drop
# it from the plotted columns. Filtering (instead of list.remove) keeps this
# cell idempotent: re-running it no longer raises ValueError.
columns_corr = [col for col in columns_corr if col != 'cluster_label']

In [None]:
# One bar plot per column in columns_corr, grouped by cluster label (no hue split).
barplots(
    df_merged,
    y_list=columns_corr,
    x='cluster_label',
    hue=None,
    figsize=(6, 6),
    savepath=None,
)

In [None]:
# Same bar plots as above, additionally split by gender via the hue argument.
barplots(
    df_merged,
    y_list=columns_corr,
    x='cluster_label',
    hue='gender',
    figsize=(6, 6),
    savepath=None,
)