In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from IPython.display import display
import json
import datetime
import math

from utils.data_exploration_utils import kruskal_wallis_analysis, barplots
from utils.hdbscan_utils import plot_hdbscan, plot_hdbscan_highlight_kl, make_cluster_color_map
from utils.plot_utils import plotly_hdbscan_highlight_kl

In [None]:
# Settings referenced by the (currently commented-out) SS-FewSome loading cell,
# which uses them to pick result files by stage, model prefix and epoch.
STAGE = 'ss'
MOD_PREFIX = "mod_smallimg3"
NEPOCH = 'latest'


# Base locations, all taken from the project-level `config` module.
DATAPATH = config.OUTPUT_PATH
base_dir = config.RAW_DATA_PATH
img_path = config.SCHULTHESS_DATAPATH
proc_dir = config.PROC_DATA_PATH

feature = 'rawq'
# `folder`, `methods` and `run` select the HDBSCAN run: joined onto `proc_dir` they
# form the run directory, and `methods`/`run` are also part of the input file names.
methods = 'comb_modalities'
folder = "2026-01-17_hdbscan"
run = "run28"
# mapping_list = [0, 4, 2, -1, 1]
# order = [0, 4, 2, 1]
# order_wnoise = [0,4,2,-1,1]

# Score column read by the commented-out SS-FewSome aggregation cell.
anomalyscore_metric = "centre_mean"
# Name of the cluster column; the plotting cells below hard-code 'cluster_label'
# / 'old_cluster' rather than reading this variable.
cluster_col = "cluster_label"

In [None]:
#{methods}_{run}_umap_hdbscan_severity_scores.csv

## Load HDBSCAN Data

In [None]:
# Run date and the date prefix of the results folder name.
today = datetime.date.today()
folder_date = folder.partition('_')[0]

# Run directory and the sub-folder that receives saved figures.
filepath = os.path.join(proc_dir, folder, methods, run)
save_path = os.path.join(filepath, "img")
os.makedirs(save_path, exist_ok=True)

# Earlier runs read f'pipeline_{run}_umap_hdbscan_scaled_allpoints_wKL.csv' instead.
scores_csv = os.path.join(filepath, f'{methods}_{run}_umap_hdbscan_severity_scores.csv')
model_info_json = os.path.join(filepath, f'{methods}_{run}_umap_hdbscan_scaled_model_info.json')

# Per-sample clustering table and the JSON description of the fitted model.
hdbscan_df = pd.read_csv(scores_csv)
with open(model_info_json) as f:
    model_info = json.load(f)

In [None]:
# Sanity check: (rows, columns) of the loaded clustering table.
hdbscan_df.shape

In [None]:
# folder_df = "2025-09-11_data_exploration"
# df_filename = "inmodi_data_questionnaire_kl_woSC.csv"
# qdf = pd.read_csv(os.path.join(proc_dir, folder_df, df_filename))

In [None]:
# Split the clustered samples into train and test partitions.
if methods == 'comb_modalities':
    train = hdbscan_df[hdbscan_df['train_test'] == 'train']
    test = hdbscan_df[hdbscan_df['train_test'] == 'test']
else:
    # Other methods have no split: treat every sample as "train" and keep an
    # empty "test" frame so the per-split plots further down still run instead
    # of failing with a NameError.
    train = hdbscan_df
    test = hdbscan_df.iloc[0:0]

ids = train['id']
ids_test = test['id']


In [None]:
# hdbscan_df = pd.read_csv(os.path.join(filepath, f'pipeline_{run}_umap_hdbscan_scaled.csv'))
# Number of samples per cluster label.
hdbscan_df['cluster_label'].value_counts()

In [None]:
# List the columns available in the clustering table.
hdbscan_df.columns

## Load Embeddings

In [None]:
# embeddings_path = os.path.join(filepath, "X_umap_embeddings.npy")
# X_umap = np.load(embeddings_path)

# ids = model_info['files']['ids']
# id_to_index = {id_: index for index, id_ in enumerate(ids)}

## Load MRI Data

In [None]:
# MRI table; it is left-joined onto hdbscan_df via the 'id' column further below.
mri = pd.read_csv(os.path.join(base_dir, '2025-09-25_mrismall.csv'))

# Plot Embeddings

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
def plot_hdbscan_clusters(
    X,
    labels,
    probabilities=None,
    parameters=None,
    ground_truth=False,
    ax=None,
    size_min=8,
    size_max=80,
    use_first_three_dims=True,
    palette="Set2",
):
    """
    Plot HDBSCAN clustering results as a 2D or 3D scatter plot and show it.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Coordinates to plot; needs at least 2 features. Only the first two
        (or three, see ``use_first_three_dims``) columns are drawn.
    labels : array-like, length n_samples
        Cluster label per sample; ``-1`` is drawn as noise (black 'x').
    probabilities : array-like of length n_samples, optional
        Values used to scale marker sizes between ``size_min`` and
        ``size_max``. Values outside [0, 1] are min-max rescaled first.
        Defaults to 1 for every sample.
    parameters : dict, optional
        Appended to the title as ``key=value`` pairs.
    ground_truth : bool
        Title prefix: "True" instead of "Estimated" number of clusters.
    ax : matplotlib axes, optional
        Axes to draw on; a new 9x5 figure is created when omitted.
    size_min, size_max : float
        Marker-size range; noise points always use ``size_min``.
    use_first_three_dims : bool
        Draw in 3D when X has at least 3 features. When False the plot is
        always 2D (first two columns).
    palette : str
        Seaborn palette name used for the non-noise clusters.

    Raises
    ------
    ValueError
        If X is not 2-dimensional, has fewer than 2 features, or the length of
        ``labels`` / ``probabilities`` does not match the number of samples.

    Notes
    -----
    The legend is placed outside the axes and omitted when there are more
    than 20 entries. The figure is shown, not saved; nothing is returned.
    """

    # --- validate inputs before touching matplotlib ---
    X = np.asarray(X)
    labels = np.asarray(labels).ravel()
    if X.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
    n, d = X.shape
    if d < 2:
        raise ValueError("X must have at least 2 features.")
    if labels.size != n:
        raise ValueError(
            f"labels has length {labels.size} but X has {n} samples."
        )

    # --- choose 2D vs 3D ---
    # 3D only when it was requested AND the data has a third column; previously
    # a 3D axes was created even though only two columns were plotted.
    is_3d = d >= 3 and use_first_three_dims
    Xp = X[:, :3] if is_3d else X[:, :2]

    # --- probabilities -> sizes ---
    if probabilities is None:
        probabilities = np.ones(n)
    else:
        probabilities = np.asarray(probabilities, float).ravel()
        if probabilities.size != n:
            raise ValueError(
                f"probabilities has length {probabilities.size} but X has {n} samples."
            )
        if probabilities.size:
            pmin, pmax = probabilities.min(), probabilities.max()
            # Rescale only when the values are not already within [0, 1];
            # the epsilon avoids a division by zero for constant input.
            if pmax > 1.0 or pmin < 0.0:
                probabilities = (probabilities - pmin) / (pmax - pmin + 1e-12)

    sizes = size_min + (size_max - size_min) * probabilities

    # --- figure / axes ---
    if ax is None:
        fig = plt.figure(figsize=(9, 5))
        ax = fig.add_subplot(111, projection="3d") if is_3d else fig.add_subplot(111)

    unique_labels = np.unique(labels)
    non_noise = [lab for lab in unique_labels if lab != -1]

    # --- seaborn color palette (one colour per non-noise cluster) ---
    n_colors = max(len(non_noise), 1)
    palette_colors = sns.color_palette(palette, n_colors=n_colors)
    color_map = {
        lab: palette_colors[i % n_colors]
        for i, lab in enumerate(sorted(non_noise))
    }

    handles, legend_labels = [], []

    # --- plot clusters (sorted, with noise drawn last) ---
    for k in sorted(unique_labels, key=lambda x: (x == -1, x)):
        mask = labels == k
        if not np.any(mask):
            continue

        if k == -1:
            # noise
            h = ax.scatter(
                *Xp[mask].T,
                marker="x",
                c="k",
                s=size_min,
                linewidths=0.8,
                alpha=0.9,
            )
            handles.append(h)
            legend_labels.append("Noise")
        else:
            h = ax.scatter(
                *Xp[mask].T,
                c=[color_map[k]],
                s=sizes[mask],
                edgecolors="k",
                linewidths=0.2,
                alpha=0.9,
            )
            handles.append(h)
            legend_labels.append(f"Cluster {k}")

    # --- title ---
    n_clusters = len(non_noise)
    prefix = "True" if ground_truth else "Estimated"
    title = f"{prefix} number of clusters: {n_clusters}"

    if isinstance(parameters, dict) and parameters:
        params = ", ".join(f"{k}={v}" for k, v in parameters.items())
        title += f" | {params}"

    ax.set_title(title)

    # --- axis labels ---
    ax.set_xlabel("dim 0")
    ax.set_ylabel("dim 1")
    if is_3d:
        ax.set_zlabel("dim 2")
        ax.view_init(elev=18, azim=35)

    # --- legend outside (skipped when it would get too long) ---
    if len(handles) <= 20:
        ax.legend(
            handles,
            legend_labels,
            title="Clusters",
            fontsize="small",
            loc="center left",
            bbox_to_anchor=(1.02, 0.5),
            borderaxespad=0.0,
        )

    plt.tight_layout()
    plt.show()

In [None]:
# plot_hdbscan_clusters(np.array(X_new), labels)

In [None]:
#save X_new as npy 
# np.save(os.path.join(filepath, f'X_umap_embeddings_v2.npy'), np.array(X_new))


In [None]:
# # SMote
# X_umap_samp = np.load(os.path.join(filepath, "X_umap_samp_embeddings.npy"))
# smote = pd.read_csv(os.path.join(filepath, f'pipeline_{run}_umap_hdbscan_scaled_generated_samples.csv'))

In [None]:
# scaler_path = os.path.join(proc_dir, folder, 'pipeline', run, 'scaler.pkl')
# umapmodel_path = os.path.join(proc_dir, folder, 'pipeline', run, 'umap_model.pkl')
# hdbscan_path = os.path.join(proc_dir, folder, 'pipeline', run, 'pipeline_run10_umap_hdbscan_scaled_clusterer.pkl')

In [None]:
# scaler = joblib.load(scaler_path)
# umap_model = joblib.load(umapmodel_path)
# clusterer = joblib.load(hdbscan_path)

In [None]:
# cluster_labels_smote, strengths = hdbscan.approximate_predict(clusterer, X_umap_samp)
# smote['old_cluster'] = cluster_labels_smote

In [None]:
# plot_hdbscan_clusters(X_umap_samp, smote['cluster_label'])

In [None]:
# smote.to_csv(os.path.join(filepath, f'pipeline_{run}_umap_hdbscan_scaled_generated_samples_wclusters.csv'), index=False)

## Load SS-FewSome Results

In [None]:
# outputs = os.path.join(DATAPATH, 'outputs', 'dfs', STAGE)

# filepath2 =  []
# for file in os.listdir(outputs):
#     if MOD_PREFIX in file and str(NEPOCH) in file and '_all' in file:
#         filepath2.append(os.path.join(outputs, file))
# dfs = []
# for path in filepath2:
#     df = pd.read_csv(path)[['id', anomalyscore_metric]]  # only keep id + target col
#     dfs.append(df.rename(columns={anomalyscore_metric: os.path.basename(path)})) 
# combined = dfs[0]
# for df in dfs[1:]:
#     combined = pd.merge(combined, df, on='id', how="inner")  # 'inner' keeps only common IDs

# experiment_cols = [c for c in combined.columns if c != 'id']
# combined["mean"] = combined[experiment_cols].mean(axis=1)
# combined["std"] = combined[experiment_cols].std(axis=1)
# combined.to_csv(os.path.join(outputs, f"{MOD_PREFIX}_{STAGE}_aggregated_scores.csv"), index = False)
# combined['filepath'] = combined['id']
# combined['id'] = combined['id'].apply(lambda x: x.split('/')[-1].replace('.png', ''))

In [None]:
# print(len(mri))
# print(len(combined))
# print(len(hdbscan_df))

## Create Combined Data

In [None]:
# print(len(combined), "samples in combined dataframe")
# Row count before the MRI table is merged in.
print(len(hdbscan_df), "samples in hdbscan dataframe")

In [None]:
# dfc = combined.merge(hdbscan_df, on='id', how = 'right')

In [None]:
# print(len(dfc), "samples in combined dataframe")

In [None]:
# dfc2 = mri.merge(dfc, on='id', how='left')
# df = hdbscan_df.copy()
# print(dfc2[dfc2['mri_cart_yn'].isna()])
# dfc2.shape

In [None]:
# Left-join the MRI table onto the clustering table by 'id' (all hdbscan rows are kept).
# NOTE(review): this reassigns hdbscan_df, so running the cell a second time merges
# the MRI columns again (pandas then suffixes the overlapping columns with _x/_y);
# restart and run all instead of re-running this cell.
hdbscan_df = hdbscan_df.merge(mri, on = 'id', how='left')

In [None]:
# hdbscan_df.drop(columns=['visit_y', 'side_y','name_y', 'KL-Score_y', 'Unnamed: 0', 'record_id_y'], inplace=True)

In [None]:
# hdbscan_df.rename(columns={'visit_x': 'visit', 'side_x': 'side', 'record_id_x': 'record_id', 'KL-Score_x': 'KL-Score', 'name_x': 'name'}, inplace=True)

In [None]:
# Persist the merged (clusters + MRI) table next to the run's other outputs.
hdbscan_df.to_csv(os.path.join(filepath, f'{methods}_{run}_umap_hdbscan_scaled_allpoints_wKL_mri_newclusters.csv'), index=False)

# Anomaly Score distribution

In [None]:
# comb2 = combined.iloc[:, :-3]

# Histogram of the 'mean' column over all samples.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(hdbscan_df['mean'], bins=20)
ax.set_title('Distribution of Mean Values')
ax.set_xlabel('Mean')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# One box per cluster label (ascending label order) for the 'mean' score.
label_order = sorted(hdbscan_df['cluster_label'].unique())

fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x='cluster_label', y='mean', data=hdbscan_df, order=label_order, ax=ax)
ax.set_title('Boxplot of Mean Anomaly Scores by Cluster Label')
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Mean Anomaly Score')
plt.show()

In [None]:
# Overall range of the 'mean' score.
anomaly_scores = hdbscan_df['mean']
print(f"Min. Anomaly Score: {anomaly_scores.min():.3f}")
print(f"Max. Anomaly Score: {anomaly_scores.max():.3f}")

In [None]:
# Print size and basic statistics of the 'mean' score for every cluster,
# in the order the labels first appear in the table.
for cluster in hdbscan_df['cluster_label'].unique():
    cluster_data = hdbscan_df.loc[hdbscan_df['cluster_label'] == cluster, 'mean']
    summary_parts = [f"n={len(cluster_data)}"]
    summary_parts += [
        f"{stat_name}={getattr(cluster_data, stat_name)():.3f}"
        for stat_name in ("mean", "std", "min", "max")
    ]
    print(f"Cluster {cluster}: " + ", ".join(summary_parts))

# Test Grouping of Anomaly Scores (AS)

In [None]:
# Create Groups of Anomaly Scores
def assign_as_group(mean_score, low_max=0.3, medium_max=0.6):
    """
    Bin an anomaly score into 'Low', 'Medium' or 'High'.

    Parameters
    ----------
    mean_score : float
        Score to classify.
    low_max : float, default 0.3
        Scores strictly below this value are 'Low'.
    medium_max : float, default 0.6
        Scores in ``[low_max, medium_max)`` are 'Medium'; anything at or above
        this value is 'High'.

    Returns
    -------
    str or float
        The group name, or NaN when the score is missing (NaN/None) so that
        missing scores can be dropped with ``dropna`` instead of being counted
        as 'High'.
    """
    # Every comparison with NaN is False, so without this guard a missing score
    # would fall through to the final 'High' branch.
    if pd.isna(mean_score):
        return np.nan
    if mean_score < low_max:
        return 'Low'
    if mean_score < medium_max:
        return 'Medium'
    return 'High'

In [None]:
# Names of the MRI-related columns. Not referenced again in the visible part of
# the notebook; the plots below spell out the columns they use.
mri_columns = [ 'mri_operator',
 'mri_side',
 'mri_bml_yn',
 'mri_cart_yn',
 'mri_osteo_yn',
 'mri_syn_yn',
 'mri_mnsc_yn',
 'mri_lig_yn']


In [None]:
# dfc2['AS_Group'] = dfc2['mean'].apply(assign_as_group)

# display(dfc2['AS_Group'].value_counts())

In [None]:
# dfc2_nonan = dfc2.dropna(subset=['AS_Group', 'cluster_label'])
# pd.crosstab(dfc2_nonan['AS_Group'], dfc2_nonan['cluster_label'], normalize='columns')

# Some Exploration

In [None]:
# Cluster sizes as a table, ordered by cluster label.
hdbscan_df['cluster_label'].value_counts().reset_index().sort_values('cluster_label')

In [None]:
# `df` is an alias for hdbscan_df (same object, not a copy); the cells below use `df`.
df=hdbscan_df

In [None]:
# Cluster sizes, ordered by label, drawn as a simple bar chart.
values = (
    df['cluster_label']
    .value_counts()
    .reset_index()
    .sort_values(by='cluster_label')
)

fig, ax = plt.subplots()
ax.bar(values['cluster_label'], values['count'], color='skyblue')
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Count')
plt.show()

## Feature Analysis Clusters

In [None]:
import math

In [None]:
# Violin plots of basic patient characteristics, one panel per variable,
# grouped by the 'old_cluster' column.
cols = ['pain', 'age', 'ce_bmi', 'ce_fm']
col_names = ['Pain', 'Age', 'BMI', 'Body Fat Percentage']

n = 2                                   # panels per row
rows = math.ceil(len(cols) / n)

fig = plt.figure(figsize=(5*n, 5))

for i, (col, col_name) in enumerate(zip(cols, col_names), 1):
    ax = fig.add_subplot(rows, n, i)
    sns.violinplot(
        data=df,
        x='old_cluster',
        y=col,
        palette='Set3',
        inner='quartile',
        ax=ax,
    )
    ax.set_title(f'{col_name} by Cluster Label')
    ax.set_xlabel('cluster_label')
    ax.set_ylabel(col_name)

# Strip the top/right spines of every panel.
sns.despine(fig=fig)

fig.tight_layout()
plt.show()

In [None]:
# Violin plots of the questionnaire scores (OKS / KOOS), one panel per score,
# grouped by the 'old_cluster' column.
cols = ['OKS_score',
       'KOOS_pain', 'KOOS_symptoms', 'KOOS_sport', 'KOOS_adl',
       'KOOS_qol']
n = 2                                   # panels per row
rows = math.ceil(len(cols) / n)

fig = plt.figure(figsize=(5*n, 10))

for i, col in enumerate(cols, 1):
    ax = fig.add_subplot(rows, n, i)
    sns.violinplot(
        data=df,
        x='old_cluster',
        y=col,
        palette='Set3',
        inner='quartile',
        ax=ax,
    )
    ax.set_title(f'{col} by Cluster Label')
    ax.set_xlabel('cluster_label')
    ax.set_ylabel(col)

# Strip the top/right spines of every panel.
sns.despine(fig=fig)

fig.tight_layout()
plt.show()

In [None]:
# Box plots of the questionnaire scores (OKS / KOOS), one panel per score,
# grouped by the 'old_cluster' column.
cols = ['OKS_score',
       'KOOS_pain', 'KOOS_symptoms', 'KOOS_sport', 'KOOS_adl',
       'KOOS_qol']
n = 2                                   # panels per row
rows = math.ceil(len(cols) / n)

fig = plt.figure(figsize=(5*n, 10))

for i, col in enumerate(cols, 1):
    ax = fig.add_subplot(rows, n, i)
    sns.boxplot(
        data=df,
        x='old_cluster',
        y=col,
        palette='Set3',
        ax=ax,
    )
    ax.set_title(f'{col} by Cluster Label')
    ax.set_xlabel('cluster_label')
    ax.set_ylabel(col)

# Strip the top/right spines of every panel.
sns.despine(fig=fig)

fig.tight_layout()
plt.show()

In [None]:
# Questionnaire scores against KL score, coloured by cluster, noise excluded.
df_wonoise = df[df['old_cluster'] != -1]
cols = [
    'OKS_score',
    'KOOS_pain', 'KOOS_symptoms', 'KOOS_sport',
    'KOOS_adl', 'KOOS_qol'
]

n_cols = 2
n_rows = math.ceil(len(cols) / n_cols)

# Jitter the KL score slightly to avoid overlap. It is drawn once from a seeded
# generator so the figure is reproducible and every sample keeps the same
# x-position in all panels (previously it was redrawn, unseeded, per panel).
jitter_rng = np.random.default_rng(42)
jitter = jitter_rng.normal(0, 0.08, size=len(df_wonoise))
kl_jittered = df_wonoise['KL-Score'] + jitter

fig = plt.figure(figsize=(6 * n_cols, 4 * n_rows))

for i, col in enumerate(cols, 1):
    ax = fig.add_subplot(n_rows, n_cols, i)

    sns.scatterplot(
        data=df_wonoise,
        x=kl_jittered,
        y=col,
        hue='old_cluster',
        palette='Set2',
        alpha=0.5,
        s=60,
        edgecolor='none',
        ax=ax,
    )

    ax.set_title(f'{col} vs KL-score')
    ax.set_xlabel('KL score')
    ax.set_ylabel(col)

    # clean legend (keep it on the first panel only)
    if i != 1:
        ax.legend([], [], frameon=False)

sns.despine(fig=fig)

fig.tight_layout()
plt.show()


In [None]:
# Display the configured cluster column name.
cluster_col

In [None]:
# Median questionnaire score per KL score, one bar per cluster, with a single
# shared legend above the panels.
cols = [
    'OKS_score',
    'KOOS_pain', 'KOOS_symptoms', 'KOOS_sport',
    'KOOS_adl', 'KOOS_qol'
]

# make cluster labels explicit + ordered
cluster_order = sorted(df_wonoise["old_cluster"].unique())

# Fixed per-cluster colours. Not used for this figure, but the MRI bar-plot
# cell below reads `cluster_colors`, so the definition stays here.
cluster_colors ={
    0: '#bebada',
    1: '#80b1d3',
    2: '#fccde5',
    3: '#bc80bd',
    4: '#ffed6f'}

# Set2 colours assigned by POSITION in cluster_order. Indexing the palette by
# the label value itself gives wrong colours (or an IndexError) as soon as the
# labels are not exactly 0..k-1.
palette = sns.color_palette("Set2", n_colors=len(cluster_order))
cluster_palette = dict(zip(cluster_order, palette))

n_cols = 2
n_rows = math.ceil(len(cols) / n_cols)

fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(6 * n_cols, 4.5 * n_rows),
    squeeze=False
)

axes = axes.flatten()

for i, col in enumerate(cols):
    ax = axes[i]

    sns.barplot(
        data=df_wonoise,
        x="KL-Score",
        y=col,
        hue="old_cluster",
        hue_order=cluster_order,
        estimator=np.median,
        errorbar=("pi", 50),   # IQR
        palette=cluster_palette,   # same mapping the shared legend uses
        capsize=0.15,
        ax=ax
    )

    ax.set_title(col, fontsize=13)
    ax.set_xlabel("KL score")
    ax.set_ylabel(col)
    sns.despine(ax=ax)

    # remove per-axis legend (there is none if nothing was drawn)
    axis_legend = ax.get_legend()
    if axis_legend is not None:
        axis_legend.remove()

# -----------------------------
# Global legend (from palette)
# -----------------------------
handles = [
    plt.Line2D([0], [0], marker='s', linestyle='',
               color=cluster_palette[c], markersize=12)
    for c in cluster_order
]

labels = [f"Cluster {c}" for c in cluster_order]

fig.legend(
    handles,
    labels,
    title="Cluster",
    loc="upper center",
    bbox_to_anchor=(0.5, 1.02),
    ncol=len(cluster_order),
    frameon=False
)

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
# List the columns of the noise-free table.
df_wonoise.columns

In [None]:
# MRI findings per KL score, one bar per cluster, with a single shared legend.
# Relies on `df_wonoise`, `cluster_order` and `cluster_colors` from the cells above.
# NOTE(review): the *_yn columns are presumably 0/1 indicators (the MRI summary
# cells treat their mean as "proportion_positive"); a median with IQR of such a
# column can only be 0, 0.5 or 1 — consider estimator=np.mean here. TODO confirm.
cols = ['mri_cart_yn', 'mri_osteo_yn', 'mri_bml_yn']
n_cols = 2
n_rows = math.ceil(len(cols) / n_cols)

fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(6 * n_cols, 4.5 * n_rows),
    squeeze=False
)

axes = axes.flatten()

for i, col in enumerate(cols):
    ax = axes[i]

    sns.barplot(
        data=df_wonoise,
        x="KL-Score",
        y=col,
        hue="old_cluster",
        hue_order=cluster_order,
        estimator=np.median,
        errorbar=("pi", 50),   # IQR
        palette=cluster_colors,
        capsize=0.15,
        ax=ax
    )

    ax.set_title(col, fontsize=13)
    ax.set_xlabel("KL score")
    ax.set_ylabel(col)
    sns.despine(ax=ax)

    # remove per-axis legend (there is none if nothing was drawn)
    axis_legend = ax.get_legend()
    if axis_legend is not None:
        axis_legend.remove()

# hide any unused axes (three panels in a 2x2 grid leave one empty)
for unused_ax in axes[len(cols):]:
    unused_ax.set_visible(False)

# -----------------------------
# Global legend (from palette)
# -----------------------------
handles = [
    plt.Line2D([0], [0], marker='s', linestyle='',
               color=cluster_colors[c], markersize=12)
    for c in cluster_order
]

labels = [f"Cluster {c}" for c in cluster_order]

fig.legend(
    handles,
    labels,
    title="Cluster",
    loc="upper center",
    bbox_to_anchor=(0.5, 1.02),
    ncol=len(cluster_order),
    frameon=False
)

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

### Cluster Label vs KL-Score

In [None]:
# Strip plot of membership probability per cluster, coloured by KL score,
# with noise points (label -1) left out.
# `color_map` is not used by this plot; a later cell in this section reads it.
color_map = make_cluster_color_map(df['KL-Score'].unique())
sns.set_theme(style="whitegrid", font_scale=1.2)
# catplot is figure-level and always creates its own figure, so a preceding
# plt.figure(figsize=(20, 10)) only produced an empty extra figure. The intended
# 20x10 size is requested through height/aspect instead.
sns.catplot(
    data=df[df['cluster_label'] != -1],
    x='cluster_label',
    y='probability',
    hue='KL-Score',
    palette='Set3',
    jitter=0.3,
    height=10,
    aspect=2,
)
plt.show()

In [None]:
sns.set_theme(style="whitegrid", font_scale=1.2)

# Leave out noise points (label -1).
df2 = df[df['cluster_label'] != -1]

# One facet per cluster with a box of the membership probabilities.
# The filtered frame is plotted; previously `df2` was built but the unfiltered
# `df` was passed, so the noise "cluster" got its own facet.
g = sns.catplot(
    data=df2,
    x='cluster_label',
    y='probability',
    hue='cluster_label',          # <- adds legend
    kind='box',
    col='cluster_label',
    col_wrap=2,
    palette='Set3',
    height=5,
    aspect=1.2,
)

g.set_titles("Cluster {col_name}")
g.set_axis_labels("cluster_label", "Probability")
g.tight_layout()
plt.show()



In [None]:
# One panel per cluster: membership probability split by KL score.
# NOTE: despite its name, `kls` holds the cluster labels (noise excluded).
kls = sorted(df['cluster_label'].unique())
kls = [kl for kl in kls if kl != -1]  # exclude noise

# Colours keyed by cluster label. The shared `color_map` is built from the
# KL-Score values, so indexing it with a cluster label only worked while both
# happened to share the same set of values.
# NOTE(review): assumes make_cluster_color_map returns a mapping keyed by the
# values it is given, as its use with the KL scores suggests — confirm.
cluster_color_map = make_cluster_color_map(np.array(kls))

ncols = 2
nrows = math.ceil(len(kls)/ncols)
fig, ax = plt.subplots(nrows, ncols, figsize=(20, 4*nrows), sharey=True)
ax = np.ravel(ax)  # flatten to 1D

for idx, kl in enumerate(kls):
    sns.boxplot(
        data=df[df['cluster_label'] == kl],
        x='KL-Score', y='probability',
        ax=ax[idx], color=cluster_color_map[kl]
    )
    ax[idx].set_title(f"cluster_label = {kl}")

# hide any unused axes
for j in range(len(kls), len(ax)):
    ax[j].set_visible(False)

plt.tight_layout()
plt.savefig(os.path.join(save_path, f'{folder}_{run}_probability_cluster_klscore_v2_rawq.png'))
plt.show()

In [None]:
# KL-score counts within each cluster (all samples).
sns.set_theme(style="whitegrid", font_scale=1.2)
fig, ax = plt.subplots(figsize=(12, 7))

sns.countplot(
    data=df,
    x='cluster_label',
    hue='KL-Score',
    palette='Set3',
    ax=ax,
)

# Write the count above every bar.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=2)

ax.set_xlabel("Cluster Label")
ax.set_ylabel("Count")
ax.set_title("Distribution of KL-Score Across Clusters")

plt.show()

In [None]:
# KL-score counts within each cluster, using the 'old_cluster' column.
sns.set_theme(style="whitegrid", font_scale=1.2)
fig, ax = plt.subplots(figsize=(12, 7))

sns.countplot(
    data=df,
    x='old_cluster',
    hue='KL-Score',
    palette='Set3',
    ax=ax,
)

# Write the count above every bar.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=2)

ax.set_xlabel("Cluster Label")
ax.set_ylabel("Count")
ax.set_title("Distribution of KL-Score Across Clusters")

plt.show()

In [None]:
# KL-score counts within each cluster, train partition only.
sns.set_theme(style="whitegrid", font_scale=1.2)
fig, ax = plt.subplots(figsize=(12, 7))

sns.countplot(
    data=train,
    x='cluster_label',
    hue='KL-Score',
    palette='Set3',
    ax=ax,
)

# Write the count above every bar.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=2)

ax.set_xlabel("Cluster Label")
ax.set_ylabel("Count")
ax.set_title("Distribution of KL-Score Across Clusters - Train")

plt.show()

In [None]:
# KL-score counts within each cluster, test partition only.
sns.set_theme(style="whitegrid", font_scale=1.2)
fig, ax = plt.subplots(figsize=(12, 7))

sns.countplot(
    data=test,
    x='cluster_label',
    hue='KL-Score',
    palette='Set3',
    ax=ax,
)

# Write the count above every bar.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=2)

ax.set_xlabel("Cluster Label")
ax.set_ylabel("Count")
ax.set_title("Distribution of KL-Score Across Clusters - Test")

plt.show()

In [None]:
# for cluster in df['cluster_label'].unique():
#     cluster_data = df[df['cluster_label'] == cluster]
#     print(f"Cluster {cluster} KL-Score Distribution:")
#     display(cluster_data.value_counts('KL-Score').reset_index())

## Cluster Label vs MRI Data

In [None]:
# mri_cols = ['mri_cart_yn', 'mri_osteo_yn', 'mri_bml_yn']

# for mri_col in mri_cols:
#     kls = sorted(df[mri_col].unique())
#     print(kls)

#     ncols = 2
#     nrows = math.ceil(len(kls)/ncols)
#     fig, ax = plt.subplots(nrows, ncols, figsize=(20, 4*nrows), sharey=True)
#     ax = np.ravel(ax)  # flatten to 1D

#     for idx, kl in enumerate(kls):
#         sns.boxplot(
#             data=df[df[mri_col] == kl],
#             x='cluster_label', y='probability',
#             ax=ax[idx], color=color_map[kl]
#         )
#         ax[idx].set_title(f"{mri_col} = {kl}")

#     # hide any unused axes
#     for j in range(len(kls), len(ax)):
#         ax[j].set_visible(False)

#     plt.tight_layout()
#     plt.savefig(os.path.join(save_path, f'{folder}_{run}_probability_cluster_{mri_col}_rawq.png'))
#     plt.show()

#     plt.figure(figsize=(10, 6))
#     sns.histplot(data=df, x='cluster_label', hue=mri_col, multiple='dodge', palette='Set3')
#     plt.savefig(os.path.join(save_path, f'{folder}_{run}_histogram_cluster_{mri_col}_rawq.png'))
#     plt.show()

In [None]:
# Per-cluster mean of three MRI *_yn columns; the plot below labels these means
# "proportion_positive" (presumably the columns are 0/1 indicators — TODO confirm).
summary = df.groupby('cluster_label')[['mri_cart_yn', 'mri_osteo_yn', 'mri_bml_yn']].mean()

In [None]:
# Display the per-cluster MRI summary table.
summary

In [None]:
# Reshape the per-cluster MRI summary to long form (one row per cluster and
# MRI column) and draw it as grouped bars.
sns.set_theme(style="whitegrid", font_scale=1.2)

summary_long = pd.melt(
    summary.reset_index(),
    id_vars="cluster_label",
    value_vars=["mri_cart_yn", "mri_osteo_yn", "mri_bml_yn"],
    var_name="MRI_feature",
    value_name="proportion_positive",
)

fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(
    data=summary_long,
    x="cluster_label",
    y="proportion_positive",
    hue="MRI_feature",
    palette="Set3",
    width=0.6,
    ax=ax,
)

ax.set_xlabel("Cluster Label", fontsize=14)
ax.set_ylabel("Proportion Positive", fontsize=14)
ax.set_title("MRI Pathology Prevalence per Cluster", fontsize=16)
# Legend sits outside the axes, to the right.
ax.legend(
    title="MRI Finding",
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
)
fig.tight_layout()
plt.show()

## Check different order

In [None]:
# df['old_cluster2'] = df['cluster_label']
# new_orderl = [0, 4, 1, 3, 2]
# new_order = {old: new for new, old in enumerate(new_orderl)}
# df['cluster_label'] = df['cluster_label'].map(new_order)

In [None]:
# sns.set_theme(style="whitegrid", font_scale=1.2)
# plt.figure(figsize=(12, 7))

# ax = sns.countplot(
#     data=df,
#     x='cluster_label',
#     hue='KL-Score',
#     palette='Set3'
# )

# # Add counts above bars
# for container in ax.containers:
#     ax.bar_label(container, fmt='%d', padding=2)

# plt.xlabel("Cluster Label")
# plt.ylabel("Count")
# plt.title("Distribution of KL-Score Across Clusters")

# plt.show()

In [None]:
# summary = df.groupby('cluster_label')[['mri_cart_yn', 'mri_osteo_yn', 'mri_bml_yn']].mean()
# summary_long = summary.reset_index().melt(
#     id_vars="cluster_label",
#     value_vars=["mri_cart_yn", "mri_osteo_yn", "mri_bml_yn"],
#     var_name="MRI_feature",
#     value_name="proportion_positive"
# )
# sns.set_theme(style="whitegrid", font_scale=1.2)

# plt.figure(figsize=(15, 6))

# sns.barplot(
#     data=summary_long,
#     x="cluster_label",
#     y="proportion_positive",
#     hue="MRI_feature",
#     palette="Set3"
#     , width=0.6
# )

# plt.xlabel("Cluster Label", fontsize=14)
# plt.ylabel("Proportion Positive", fontsize=14)
# plt.title("MRI Pathology Prevalence per Cluster", fontsize=16)
# plt.legend(
#     title="MRI Finding",
#     bbox_to_anchor=(1.05, 1),
#     loc="upper left"
# )
# plt.tight_layout()
# plt.show()

In [None]:
# df.to_csv(os.path.join(filepath, f'{methods}_{run}_hdbscan_new_clusters.csv'), index=False)

In [None]:
# Same KL-score count plot, grouped by the original ordering ('old_cluster').
sns.set_theme(style="whitegrid", font_scale=1.2)
fig, ax = plt.subplots(figsize=(12, 7))

sns.countplot(
    data=df,
    x='old_cluster',
    hue='KL-Score',
    palette='Set3',
    ax=ax,
)

# Write the count above every bar.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', padding=2)

ax.set_xlabel("Cluster Label")
ax.set_ylabel("Count")
ax.set_title("Distribution of KL-Score Across Clusters")

plt.show()

In [None]:
# Mean of three MRI *_yn columns per 'old_cluster', reshaped to long form and
# drawn as grouped bars. Note that this overwrites `summary` / `summary_long`.
sns.set_theme(style="whitegrid", font_scale=1.2)

summary = df.groupby('old_cluster')[['mri_cart_yn', 'mri_osteo_yn', 'mri_bml_yn']].mean()
summary_long = pd.melt(
    summary.reset_index(),
    id_vars="old_cluster",
    value_vars=["mri_cart_yn", "mri_osteo_yn", "mri_bml_yn"],
    var_name="MRI_feature",
    value_name="proportion_positive",
)

fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(
    data=summary_long,
    x="old_cluster",
    y="proportion_positive",
    hue="MRI_feature",
    palette="Set3",
    width=0.6,
    ax=ax,
)

ax.set_xlabel("Cluster Label", fontsize=14)
ax.set_ylabel("Proportion Positive", fontsize=14)
ax.set_title("MRI Pathology Prevalence per Cluster", fontsize=16)
# Legend sits outside the axes, to the right.
ax.legend(
    title="MRI Finding",
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
)
fig.tight_layout()
plt.show()

### KL-Score MRI Data

In [None]:
# Mean of three MRI *_yn columns per KL score, reshaped to long form and drawn
# as grouped bars. Note that this overwrites `summary` / `summary_long`.
sns.set_theme(style="whitegrid", font_scale=1.2)

summary = df.groupby('KL-Score')[['mri_cart_yn', 'mri_osteo_yn', 'mri_bml_yn']].mean()
summary_long = pd.melt(
    summary.reset_index(),
    id_vars="KL-Score",
    value_vars=["mri_cart_yn", "mri_osteo_yn", "mri_bml_yn"],
    var_name="MRI_feature",
    value_name="proportion_positive",
)

fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(
    data=summary_long,
    x="KL-Score",
    y="proportion_positive",
    hue="MRI_feature",
    palette="Set3",
    width=0.6,
    ax=ax,
)

ax.set_xlabel("KL-Score", fontsize=14)
ax.set_ylabel("Proportion Positive", fontsize=14)
ax.set_title("MRI Pathology Prevalence per KL-Score", fontsize=16)
# Legend sits outside the axes, to the right.
ax.legend(
    title="MRI Finding",
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
)
fig.tight_layout()
plt.show()

### Plots

In [None]:
# Draw the three MRI columns against the cluster label with the project's
# `barplots` helper (imported from utils.data_exploration_utils), without a hue split.
columns_corr =  ['mri_cart_yn', 'mri_osteo_yn', 'mri_bml_yn'] 
barplots(df, y_list=columns_corr, x='cluster_label', hue=None, figsize = (6, 6))

## Plot Distribution per cluster first

In [None]:
# For every MRI finding, plot how many samples of each value fall into each cluster.
feature_col = ['mri_bml_yn', 'mri_cart_yn', 'mri_osteo_yn', 'mri_syn_yn', 'mri_mnsc_yn', 'mri_lig_yn']

# Sorted cluster labels; computed once, since they do not depend on the column
# being plotted (the plots themselves do not need them).
clusters = df['cluster_label'].unique()
clusters.sort()

# The loop variable is deliberately NOT called `feature`: that name holds a
# configuration value set at the top of the notebook and must not be overwritten.
for mri_feature in feature_col:
    # Rows: cluster labels, columns: values of the MRI column, cells: sample counts.
    counts = df.groupby(['cluster_label', mri_feature]).size().unstack(fill_value=0)

    # DataFrame.plot opens its own figure unless it is given an axes, so the
    # sized axes is passed explicitly; a bare plt.figure(...) beforehand only
    # left an empty figure behind and its figsize was never applied.
    fig, ax = plt.subplots(figsize=(10, 6))
    counts.plot(kind='bar', stacked=False, ax=ax)
    ax.set_title(f'Distribution of {mri_feature} across clusters')
    ax.set_xlabel('Cluster Label')
    ax.set_ylabel('Count')
    plt.show()