In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import seaborn as sns
from IPython.display import display
import datetime
from utils.data_exploration_utils import investigate_data

from sklearn.preprocessing import StandardScaler
#import hdbscan
from sklearn.cluster import HDBSCAN

In [None]:
# Run date; it prefixes the output folder name so each day's run is kept apart.
today = datetime.date.today()

# Data locations are taken from the project's config module.
base_dir = config.RAW_DATA_PATH
proc_dir = config.PROC_DATA_PATH

# Output folder for everything this notebook writes (created if missing).
save_dir = os.path.join(proc_dir, f"{today}_hdbscan")
os.makedirs(save_dir, exist_ok=True)

# Input: CSV written by an earlier data-exploration run.
folder = "2025-07-03_data_exploration"
unpivoted = True  # True -> read the "_unpivoted" variant of the file

df = pd.read_csv(
    os.path.join(
        proc_dir,
        folder,
        "inmodi_data_personalinformation_unpivoted.csv"
        if unpivoted
        else "inmodi_data_personalinformation.csv",
    )
)

## Parameters

In [None]:
# Keyword arguments for HDBSCAN, unpacked into the constructor further down.
# All values are the defaults, with the exception of `store_centers`.
params = dict(
    min_cluster_size=5,
    min_samples=None,
    cluster_selection_epsilon=0.0,
    max_cluster_size=None,
    metric='euclidean',
    metric_params=None,
    alpha=1.0,
    algorithm='auto',
    leaf_size=40,
    cluster_selection_method='eom',
    store_centers='centroid',  # not the default, kept on purpose
)

# Functions

In [None]:
def get_unique_filepath(base_path):
    """Return `base_path`, or a numbered variant of it that is not taken yet.

    A free `base_path` is returned unchanged. Otherwise `_2`, `_3`, ... is
    inserted in front of the extension until a path is found that does not
    exist on disk.
    """
    stem, suffix = os.path.splitext(base_path)
    candidate = base_path
    attempt = 1
    while os.path.exists(candidate):
        attempt += 1
        candidate = f"{stem}_{attempt}{suffix}"
    return candidate

def save_results(df, clusterer, params, scaler, save_dir, filename):
    """Write the cluster assignments and the run metadata of a fitted clusterer.

    Two files are created in `save_dir`; neither overwrites an existing file
    (a numeric suffix is appended by `get_unique_filepath` instead):

    * `<filename>.csv` with the columns `record_id`, `cluster_label` and
      `probability`, one row per row of `df`.
    * `<filename>_model_info.json` with the CSV path, `params`, the scaler's
      class name, the number of clusters (noise label -1 excluded) and the
      cluster centroids.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the clusterer was fitted on; must contain a `record_id` column
        with one entry per label.
    clusterer : fitted clusterer
        Must expose `labels_` and `probabilities_`. `centroids_` is stored
        when present and written as null otherwise.
    params : dict
        Parameters used to build the clusterer. Values that are not JSON
        serialisable are written as their string representation.
    scaler : object or None
        Scaler applied before fitting; only its class name is recorded.
        Pass None when no scaling was applied (written as null).
    save_dir : str
        Existing directory to write into.
    filename : str
        Base name (without extension) for both output files.

    Returns
    -------
    str
        Base name, without extension, of the CSV that was actually written
        (including any uniqueness suffix).
    """
    results_df = pd.DataFrame({
        'record_id': df['record_id'],
        'cluster_label': clusterer.labels_,
        'probability': clusterer.probabilities_,
    })
    df_savepath = get_unique_filepath(os.path.join(save_dir, f"{filename}.csv"))
    results_df.to_csv(df_savepath, index=False)

    # `centroids_` is only present when the clusterer was asked to store them;
    # reading it unconditionally would raise AttributeError for other settings.
    centroids = getattr(clusterer, 'centroids_', None)
    has_noise = -1 in clusterer.labels_
    model_info = {
        'df_savepath': df_savepath,
        'params': params,
        'scaler': None if scaler is None else scaler.__class__.__name__,
        'n_clusters': len(set(clusterer.labels_)) - (1 if has_noise else 0),
        'centroids': None if centroids is None else centroids.tolist(),
    }
    model_info_savepath = get_unique_filepath(
        os.path.join(save_dir, f"{filename}_model_info.json")
    )
    with open(model_info_savepath, 'w') as f:
        # default=str keeps the dump from failing on values such as callables.
        json.dump(model_info, f, indent=4, default=str)

    # splitext removes only the final extension, so a base name that itself
    # contains dots is returned intact.
    return os.path.splitext(os.path.basename(df_savepath))[0]

def plot_hdbscan(X, labels, probabilities=None, parameters=None, ground_truth=False, ax=None, save_path = None):
    """Plot a clustering result using the first two columns of `X`.

    Points labelled -1 (noise) are drawn as black "x" markers. Every other
    label gets its own colour and is drawn with circles whose size grows with
    the point's membership probability.

    Parameters
    ----------
    X : array-like of shape (n_points, >=2)
        Data to plot; only columns 0 and 1 are drawn. Anything accepted by
        `np.asarray` works, including a DataFrame.
    labels : array-like of length n_points or None
        Cluster label per point; None puts all points in one cluster.
    probabilities : array-like of length n_points or None
        Membership strength per point; None uses 1 for every point.
    parameters : dict or None
        If given, rendered as "key=value" pairs in the title.
    ground_truth : bool
        Title starts with "True" instead of "Estimated" when set.
    ax : matplotlib Axes or None
        Axes to draw on; a new 10x4 figure is created when omitted.
    save_path : str or None
        If given, the figure that owns `ax` is saved to this path.

    Returns
    -------
    None
    """
    created_fig = ax is None
    if created_fig:
        _, ax = plt.subplots(figsize=(10, 4))
    # Work on the figure that owns `ax` instead of pyplot's "current" figure,
    # which may be a different one when the caller passed in an Axes.
    fig = ax.figure

    # Coerce to arrays so lists and DataFrames support the indexing below.
    X = np.asarray(X)
    n_points = X.shape[0]
    labels = np.ones(n_points) if labels is None else np.asarray(labels)
    probabilities = np.ones(n_points) if probabilities is None else np.asarray(probabilities)

    # Sorted so that the label -> colour assignment is deterministic.
    unique_labels = sorted(set(labels.tolist()))
    colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        class_index = (labels == k).nonzero()[0]
        if k == -1:
            # Noise: fixed size and colour, so one call draws all of them.
            ax.plot(
                X[class_index, 0],
                X[class_index, 1],
                "x",
                markerfacecolor=(0, 0, 0, 1),
                markeredgecolor="k",
                markersize=4,
            )
            continue
        # Marker size differs per point, hence one call per point.
        for ci in class_index:
            ax.plot(
                X[ci, 0],
                X[ci, 1],
                "o",
                markerfacecolor=tuple(col),
                markeredgecolor="k",
                markersize=1 + 5 * probabilities[ci],
            )

    n_clusters_ = len(unique_labels) - (1 if -1 in unique_labels else 0)
    preamble = "True" if ground_truth else "Estimated"
    title = f"{preamble} number of clusters: {n_clusters_}"
    if parameters is not None:
        parameters_str = ", ".join(f"{k}={v}" for k, v in parameters.items())
        title += f" | {parameters_str}"
    ax.set_title(title)

    fig.tight_layout()
    if save_path is not None:
        fig.savefig(save_path)
    # Show figures created here even when they are not saved; a caller-supplied
    # Axes is only shown when saving, as before, so multi-panel figures can
    # still be composed without rendering early.
    if created_fig or save_path is not None:
        plt.show()

# Preprocessing

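Open question: which scaler should be used? (`StandardScaler` is currently applied in the "Scaler" section below.)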

In [None]:
# NaN values: run the project helper `investigate_data` on the full frame and
# keep its return value for inspection in the next cell.
# NOTE(review): the helper is imported from utils.data_exploration_utils; what it
# prints/returns is not visible here - presumably the NaN findings; confirm.
df_nanids = investigate_data(df)

In [None]:
# Display the result of investigate_data(df); per the original note, these are
# the columns with NaN values.
df_nanids

## Choose relevant columns, check them for NaN values, and drop incomplete rows

In [None]:
# Columns carried into the clustering. 'visit', 'side', 'ce_height' and
# 'ce_weight' are currently excluded.
cols = [
    'record_id',  # id column
    'pain',
    'age',
    'ce_bmi',
    'ce_fm',
    'gender',
    'OKS_score',
    'UCLA_score',
    'FJS_score',
    'KOOS_pain',
    'KOOS_symptoms',
    'KOOS_sport',
    'KOOS_adl',
    'KOOS_qol',
]

# Independent copy, so later changes to df2 never touch df.
df2 = df.loc[:, cols].copy()

# Same NaN inspection as above, restricted to the selected columns.
df2_missingna = investigate_data(df2)

In [None]:
# Report the shape first so it can be compared with the shape after the drop.
print("Dataframe before dropping NaN values: ", df2.shape)
# Keep complete rows only; dropna's defaults remove every row with any missing value.
df2 = df2.dropna()

In [None]:
# Shape after the drop; compare with the "before" shape printed by the previous cell.
print("Dataframe after dropping NaN values: ", df2.shape)

## Create dummy variables

In [None]:
# Inspect the column dtypes before 'gender' is encoded in the next cell.
df2.dtypes

In [None]:
# Encode 'gender' as a binary indicator: 1 for 'male', 0 for every other value.
# The indicator is built from df2 itself (the rows that survived the NaN drop)
# rather than from the unfiltered df, so it does not depend on index alignment
# between the two frames.
df2 = (
    df2
    .assign(is_male=df2['gender'].eq('male').astype(int))
    .drop(columns='gender')
)

## Scaler

In [None]:
# Feature matrix: every column of df2 except the id.
df2_scaled = df2.copy()
X = df2_scaled.drop(columns=['record_id'])

# Fit the StandardScaler on X, then transform the same data with it.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# HDBSCAN

## Scaled X

In [None]:
# Build HDBSCAN from `params` and fit it on the scaled features in one step.
clusterer = HDBSCAN(**params).fit(X_scaled)

In [None]:
# Write labels/probabilities (CSV) and run metadata (JSON) into save_dir. The
# returned base name is reused for the plot file name in the next cell.
base_name = save_results(df2, clusterer, params, scaler, save_dir, 'default_hdbscan_scaled')

In [None]:
# plot_hdbscan draws only columns 0 and 1 of the array it receives, i.e. the
# first two features of X_scaled ('pain' and 'age', given the order in `cols`).
# The figure is also written to save_dir as "<base_name>_plot.png".
plot_hdbscan(X_scaled, clusterer.labels_, 
             probabilities=clusterer.probabilities_, 
             parameters={'parameters': 'default'},
             save_path = os.path.join(save_dir, f"{base_name}_plot.png"))

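In the plot above, noise points (cluster label `-1`) are drawn as black `x` markers; clustered points are circles whose size grows with their membership probability.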

## Non-Scaled X

In [None]:
# Same HDBSCAN configuration, fitted on the unscaled feature frame X this time.
# Note that this rebinds `clusterer`, replacing the model fitted on X_scaled.
clusterer = HDBSCAN(**params).fit(X)

In [None]:
# This model was fitted on the unscaled features, so no scaler is passed to
# save_results (recording StandardScaler here would mislabel the run).
base_name = save_results(df2, clusterer, params, None, save_dir, 'default_hdbscan')
# Plot against the data the model was fitted on (X, not X_scaled). X is a
# DataFrame, so it is converted to an array for positional indexing.
plot_hdbscan(X.to_numpy(), clusterer.labels_,
             probabilities=clusterer.probabilities_,
             parameters={'parameters': 'default'},
             save_path = os.path.join(save_dir, f"{base_name}_plot.png"))