In [None]:
import itertools
import numpy as np
from sklearn.metrics import precision_recall_curve, auc
import pandas as pd
import config
import os
import json

In [None]:
STAGE = 'ss'
MOD_PREFIX = "mod_smallimg3"
NEPOCH = 'latest'


DATAPATH = config.OUTPUT_PATH
base_dir = config.RAW_DATA_PATH
img_path = config.SCHULTHESS_DATAPATH
proc_dir = config.PROC_DATA_PATH

# #for rawq:
feature = 'rawq'
folder = "2025-11-20_hdbscan"
run = "run10"  

# feature = 'img_features'
# #for img features:
# folder = "2025-09-12_hdbscan"
# folder_date = folder.split('_')[0]
# run = "run92"

# feature = 'img_raw'
# folder = "2025-09-13_hdbscan_img"
# run = "run32"

# feature = 'agg'
# folder = "2025-08-11_hdbscan"
# run = 'run150'

anomalyscore_metric = "centre_mean"
cluster_col = "cluster_label"

folder_date = folder.split('_')[0]



# Resolve the run directory and the clustering CSV for the selected feature
# set. Only the 'rawq' pipeline stores its files under "<folder>/pipeline/<run>"
# with a "pipeline_<run>" prefix; the other feature sets use "<folder>_<run>".
if feature == 'rawq':
    filepath = os.path.join(proc_dir, folder, "pipeline", run)
    hdbscan_csv = f'pipeline_{run}_umap_hdbscan_scaled.csv'
elif feature in ('img_features', 'img_raw'):
    filepath = os.path.join(proc_dir, "radiographic_features", folder, run)
    hdbscan_csv = f'{folder}_{run}_umap_hdbscan_scaled.csv'
elif feature == 'agg':
    filepath = os.path.join(proc_dir, folder, run)
    hdbscan_csv = f'{folder}_{run}_umap_hdbscan_scaled.csv'
else:
    # Fail here instead of with a NameError on `filepath` several cells later.
    raise ValueError(
        f"Unknown feature {feature!r}; expected 'rawq', 'img_features', 'img_raw' or 'agg'."
    )

hdbscan_df = pd.read_csv(os.path.join(filepath, hdbscan_csv))
        
kl = pd.read_csv(os.path.join(base_dir,  "brul_knee_annotations.csv"))
kl2 = pd.read_csv(os.path.join(base_dir, "rosand1_knee_annotations.csv"))
mri = pd.read_csv(os.path.join(base_dir, '2025-09-25_mrismall.csv'))



In [None]:
folder_df = "2025-09-11_data_exploration"
df_filename = "inmodi_data_questionnaire_kl_woSC.csv"
df = pd.read_csv(os.path.join(proc_dir, folder_df, df_filename))
df2 = hdbscan_df[hdbscan_df['id'].isin(df['name'])]

In [None]:
print(len(df2))

In [None]:
hdbscan_df = df2

In [None]:
l = ['IM1512_2_left',
 'IM1512_2_right',
 'IM1567_2_left',
 'IM1567_2_right',
 'IM1578_2_left',
 'IM1578_2_right',
 'IM2511_1_left',
 'IM2511_1_right',
 'IM2569_1_left',
 'IM2569_1_right',
 'IM3013_1_right',
 'IM3019_1_right',
 'IM3020_1_left',
 'IM3020_1_right']

In [None]:
ques = pd.read_csv(os.path.join(base_dir, 'questionnaires_raw.csv'))

In [None]:
# Show the clustering rows of the hand-picked ids. A bare expression inside a
# `for` loop is never rendered by the notebook, so filter once and let the
# cell display the result as its last expression.
hdbscan_df[hdbscan_df['id'].isin(l)]

In [None]:
# Attach the KL annotations from both annotation files to every clustered knee.
# Left joins keep knees without an annotation; validate='one_to_one' makes the
# merge raise if an id/name is duplicated on either side. Overlapping columns
# coming from the second file get the suffix "2" (e.g. 'KL-Score2').
hdbscan_df = hdbscan_df.merge(kl, left_on='id', right_on='name', how='left', validate='one_to_one')
hdbscan_df = hdbscan_df.merge(kl2, left_on='id', right_on='name', how='left', validate='one_to_one', suffixes=('', '2'))

# Drop the stray CSV index column when it exists; errors='ignore' replaces the
# former bare `except: pass`, which would also have hidden unrelated errors.
hdbscan_df = hdbscan_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Aggregated scores for the configured model/stage (STAGE and MOD_PREFIX are
# set in the configuration cell).
df = pd.read_csv(os.path.join(DATAPATH, 'outputs', 'dfs', STAGE, f'{MOD_PREFIX}_{STAGE}_aggregated_scores.csv'))

# The score file stores ids as file paths: keep the last path component and cut
# it at the first dot so it matches the ids used in hdbscan_df.
df['id'] = df['id'].str.split('/').str[-1].str.split('.').str[0]
hdbscan_df = hdbscan_df.merge(df, on='id', how='left', validate='one_to_one')

# If one of the two KL columns is missing, fill it from the other one.
hdbscan_df['KL-Score'] = hdbscan_df['KL-Score'].fillna(hdbscan_df['KL-Score2'])
hdbscan_df['KL-Score2'] = hdbscan_df['KL-Score2'].fillna(hdbscan_df['KL-Score'])

# Knees without any annotation get the sentinel -1. Assign the result back:
# `fillna(..., inplace=True)` on a selected column is chained assignment and
# does not reliably modify the frame in recent pandas versions.
hdbscan_df['KL-Score'] = hdbscan_df['KL-Score'].fillna(-1)
hdbscan_df['KL-Score2'] = hdbscan_df['KL-Score2'].fillna(-1)

# Independent copy used by the following cells; rows with the -1 sentinel are
# still included here.
hdbscan_df_dropna = hdbscan_df.copy()

In [None]:
with open(os.path.join(filepath, f'pipeline_{run}_umap_hdbscan_scaled_model_info.json')) as f:
    model_info= json.load(f)

In [None]:
ids = model_info['files']['ids']

In [None]:
len(ids)

In [None]:
# check what is not in df['id'] but in list ids
set(ids) - set(df['id'])

In [None]:
# Keep only knees that have a KL grade in both columns (-1 marks "no annotation").
# .copy() yields an independent frame, so the columns assigned to `df` in later
# cells do not trigger SettingWithCopyWarning.
has_kl = (hdbscan_df_dropna['KL-Score'] != -1) & (hdbscan_df_dropna['KL-Score2'] != -1)
df = hdbscan_df_dropna[has_kl].copy()

In [None]:
embeddings_path = os.path.join(filepath, "X_umap_embeddings.npy")
embeddings = np.load(embeddings_path)

In [None]:
# Create lookup: id -> row index in embeddings
# (relies on `ids` listing the samples in the same order as the rows of `embeddings`)
id_to_idx = {id_: i for i, id_ in enumerate(ids)}

# Report every id without an embedding at once instead of a bare KeyError on the first one.
missing_ids = [i for i in df['id'] if i not in id_to_idx]
if missing_ids:
    raise KeyError(
        f"{len(missing_ids)} ids in df have no embedding row, e.g. {missing_ids[:5]}"
    )

# Reorder embeddings to match df rows (row k of the matrix belongs to row k of df).
# Integer-array indexing also works when df is empty, where np.vstack would raise.
embedding_matrix = embeddings[[id_to_idx[i] for i in df['id']]]

In [None]:
embedding_matrix.shape

In [None]:
# Severity score = Euclidean distance in the embedding space between each knee
# and the centroid of cluster 0, which this notebook uses as the healthy reference.
healthy_mask = (df['cluster_label'] == 0).to_numpy()
if not healthy_mask.any():
    # The mean of an empty selection is NaN and would silently turn every score into NaN.
    raise ValueError("No rows with cluster_label == 0; cannot compute the healthy centroid.")
healthy_centroid = embedding_matrix[healthy_mask].mean(axis=0)
df['severity_score'] = np.linalg.norm(embedding_matrix - healthy_centroid, axis=1)

In [None]:
df.groupby("KL-Score")["severity_score"].describe()


KL-Score 0 has the lowest mean severity score.
KL-Scores 3 and 4 show an increase in the mean severity score, which is what we would expect.

In [None]:
import seaborn as sns
sns.violinplot(data=df, x="KL-Score", y="severity_score")


In [None]:
from scipy.stats import spearmanr

rho, p = spearmanr(df['severity_score'], df['KL-Score'])
print(rho, p)


# Metrics Severity Score to KL-Score

In [None]:
from utils.clustering_eval_utils import get_metrics, get_metrics_pr

In [None]:
results_auc = get_metrics(df, 'severity_score', 'KL-Score')
print(results_auc)

In [None]:
results_pr = get_metrics_pr(df, 'severity_score', 'KL-Score')
print(results_pr)

# Severity Score Behaviour for each cluster label

In [None]:
df['cluster_label'].value_counts()

In [None]:
df.groupby("cluster_label")["severity_score"].describe()

In [None]:
order = [0, 4, 2, 1]
order_wnoise = [0,4,2,-1,1]

In [None]:
# Raw HDBSCAN label (as string) -> short rating code.
# NOTE(review): the codes presumably stand for healthy / mild / severe /
# very severe - confirm with the author.
rating = {
    '1': 's+',
    '2': 's',
    '4': 'm',
    '0': 'h',
    '-1': 'noise'
}

# Rating code -> ordinal rank used as the new cluster_label (noise stays -1).
rating_order = {
    'h': 0,
    'm': 1,
    's': 2,
    's+': 3,
    'noise': -1
}

# Preserve the raw labels only once and always derive the new labels from
# them. Mapping from `cluster_label` itself made the cell non-idempotent: a
# second run would remap labels that had already been remapped.
if 'old_cluster' not in df.columns:
    df['old_cluster'] = df['cluster_label']
df['cluster_label_name'] = df['old_cluster'].astype(str).map(rating)
df['cluster_label'] = df['cluster_label_name'].map(rating_order)

In [None]:
sns.violinplot(data=df, x="cluster_label", y="severity_score")

# SMOTE

In [None]:
smotefolder = '2025-08-11_data_exploration'
smote = pd.read_csv(os.path.join(proc_dir, smotefolder, f'smote_oversampled_data_Borderline_SMOTE2.csv'))

In [None]:
from sklearn.preprocessing import StandardScaler
import hdbscan
from umap import UMAP
import joblib

In [None]:
# Locations of the fitted scaler, UMAP model and HDBSCAN clusterer of the
# selected run. The clusterer filename is built from `run` instead of the
# previously hard-coded "run10", so changing `run` in the config cell loads
# matching artefacts.
pipeline_dir = os.path.join(proc_dir, folder, 'pipeline', run)
scaler_path = os.path.join(pipeline_dir, 'scaler.pkl')
umapmodel_path = os.path.join(pipeline_dir, 'umap_model.pkl')
hdbscan_path = os.path.join(pipeline_dir, f'pipeline_{run}_umap_hdbscan_scaled_clusterer.pkl')

In [None]:
scaler = joblib.load(scaler_path)
umap_model = joblib.load(umapmodel_path)
clusterer = joblib.load(hdbscan_path)

In [None]:
# Columns that are not model inputs: the KL target plus the columns this
# notebook adds to `smote` further down. Excluding them (errors='ignore' when
# they do not exist yet) keeps the cell re-runnable; otherwise a second run
# would feed the derived columns into the scaler.
non_feature_cols = ['KL-Score', 'cluster_label', 'old_cluster', 'cluster_label_name', 'severity_score']
X_scaled = scaler.transform(smote.drop(columns=non_feature_cols, errors='ignore'))
X_umap = umap_model.transform(X_scaled)

# Assign each oversampled point to one of the clusters of the fitted HDBSCAN model.
cluster_labels, strengths = hdbscan.approximate_predict(clusterer, X_umap)
smote['cluster_label'] = cluster_labels

In [None]:
# Derive the renamed/ordinal labels from the raw predictions (`cluster_labels`)
# rather than from smote['cluster_label'], which this cell overwrites. That
# keeps the cell idempotent: re-running it no longer remaps already remapped labels.
smote['old_cluster'] = cluster_labels
smote['cluster_label_name'] = smote['old_cluster'].astype(str).map(rating)
smote['cluster_label'] = smote['cluster_label_name'].map(rating_order)

In [None]:
# Severity score of the oversampled points: distance in UMAP space to the
# centroid of the SMOTE points labelled 0. Dedicated names are used so that
# `embedding_matrix` / `healthy_centroid`, which belong to `df`, are not
# overwritten (re-running the earlier severity cell would otherwise use an
# array whose rows do not match `df`).
smote_healthy_mask = (smote['cluster_label'] == 0).to_numpy()
smote_healthy_centroid = X_umap[smote_healthy_mask].mean(axis=0)
smote['severity_score'] = np.linalg.norm(X_umap - smote_healthy_centroid, axis=1)

In [None]:
summary = smote.groupby('cluster_label')['severity_score'].describe().reset_index()

In [None]:
summary.sort_values('mean')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Box plot of the SMOTE severity scores, one box per (remapped) cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=smote,
    x="cluster_label",
    y="severity_score",
    palette="Set3",
    width=0.6,
    showfliers=False,     # outliers are not drawn
    linewidth=1.5,        # thicker box borders
    ax=ax,
)

# # add jittered points for visibility
# sns.stripplot(
#     data=smote,
#     x="cluster_label",
#     y="severity_score",
#     color="black",
#     alpha=0.4,
#     size=3,
#     jitter=0.25
# )

ax.set_title("Severity Score Distribution per Cluster", fontsize=16, pad=15)
ax.set_xlabel("Cluster Label", fontsize=13)
ax.set_ylabel("Severity Score", fontsize=13)
ax.grid(axis="y", alpha=0.25)
fig.tight_layout()
plt.show()


# Plot original data & SMOTE in embedding space

In [None]:
df

In [None]:
import shutil


In [None]:
output_path = os.path.join(DATAPATH, 'outputs', 'clusterimages')
os.makedirs(output_path, exist_ok=True)
img_path = os.path.join(base_dir, 'images_knee')

In [None]:
# Copy every available image into a per-cluster folder under `output_path`.
# The per-cluster id list is named `cluster_ids` so that it does not overwrite
# the global `ids` (embedding row order) used by the embedding lookup above.
for cluster in hdbscan_df['cluster_label'].unique():
    cluster_ids = hdbscan_df.loc[hdbscan_df['cluster_label'] == cluster, 'id'].tolist()
    cluster_dir = os.path.join(output_path, f'cluster_{cluster}')
    os.makedirs(cluster_dir, exist_ok=True)
    for id_ in cluster_ids:
        img_file = os.path.join(img_path, f'{id_}.png')
        dest_file = os.path.join(cluster_dir, f'{id_}.png')
        # Ids without an image are skipped; existing copies are left untouched.
        if os.path.exists(img_file) and not os.path.exists(dest_file):
            shutil.copy2(img_file, dest_file)

In [None]:
# hdbscan_doc = pd.read_csv(os.path.join(filepath, f'pipeline_run10_umap_hdbscan_scaled_allpoints_wKL.csv'))

In [None]:
# hdbscan_doc['clusterlabelold'] = hdbscan_doc['cluster_label']
# hdbscan_doc['clusterlabeltext'] = hdbscan_doc['cluster_label'].astype(str).map(rating)
# hdbscan_doc['cluster_label'] = hdbscan_doc['clusterlabeltext'].map(rating_order)

In [None]:
hdbscan_df = hdbscan_df.dropna(subset=['cluster_label'])

In [None]:
hdbscan_df.to_csv(os.path.join(filepath, f'pipeline_{run}_umap_hdbscan_scaled_allpoints_wKL_v2.csv'))

# Get images per cluster

In [None]:
import random
from PIL import Image

def show_cluster_examples(output_path, max_grid=3, seed=42):
    """Plot a grid of randomly sampled example images for every cluster folder.

    Args:
        output_path: Directory that contains one ``cluster_<label>`` sub-folder
            per cluster.
        max_grid: Maximum number of rows and of columns of the grid; at most
            ``max_grid * max_grid`` images are shown per cluster.
        seed: Value passed to ``random.seed`` before sampling.

    Returns:
        list[list[str]]: For every cluster folder that contains at least one
        image (in sorted folder-name order), the list of sampled image paths.

    Raises:
        ValueError: If ``max_grid`` is smaller than 1.
    """
    if max_grid < 1:
        raise ValueError("max_grid must be >= 1")

    random.seed(seed)

    # Only real directories qualify; a stray file named "cluster_*" is ignored.
    clusters = sorted(
        c for c in os.listdir(output_path)
        if c.startswith("cluster_") and os.path.isdir(os.path.join(output_path, c))
    )

    img_names_l = []

    for cluster in clusters:
        cluster_dir = os.path.join(output_path, cluster)

        # os.listdir returns entries in arbitrary order; sorting makes the
        # seeded sample reproducible across machines and file systems.
        imgs = sorted(
            os.path.join(cluster_dir, f)
            for f in os.listdir(cluster_dir)
            if f.lower().endswith((".png", ".jpg", ".jpeg"))
        )

        if not imgs:
            print(f"No images found in {cluster}")
            continue

        # how many images to show
        n_images = min(len(imgs), max_grid * max_grid)
        sample_imgs = random.sample(imgs, n_images)
        img_names_l.append(sample_imgs)

        # grid shape: at most max_grid columns, enough rows to fit all images
        cols = min(max_grid, n_images)
        rows = (n_images + cols - 1) // cols

        # squeeze=False always returns a 2D array of axes, so one ravel()
        # covers the 1x1, 1xN and MxN cases alike.
        fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows), squeeze=False)
        fig.suptitle(f"Cluster {cluster.replace('cluster_', '')}: Example Images", fontsize=18)
        axes = axes.ravel()

        for ax, img_file in zip(axes, sample_imgs):
            # Close the file handle as soon as the grayscale copy is drawn.
            with Image.open(img_file) as img:
                ax.imshow(img.convert("L"), cmap="gray")
            ax.axis("off")

        # hide unused axes
        for ax in axes[len(sample_imgs):]:
            ax.axis("off")

        plt.tight_layout()
        plt.show()
    return img_names_l



In [None]:
img_names_l = show_cluster_examples(output_path, max_grid=3, seed=101)

In [None]:
df2 = df.merge(mri, on='id', how='left', validate='one_to_one')

In [None]:
df2[df2['cluster_label']==1][['id', 'mri_cart_yn', 'mri_osteo_yn', 'mri_bml_yn']]

In [None]:
img_names_2 = show_cluster_examples(output_path, max_grid=3, seed=42)

In [None]:
# Turn the per-cluster lists of both sampling runs into flat lists of paths.
img_names_path = list(itertools.chain.from_iterable(img_names_l))
img_names2_path = list(itertools.chain.from_iterable(img_names_2))

# Image name = file name without directory and extension. os.path handles the
# platform's path separator and strips any extension (.png, .jpg, .jpeg), not
# only ".png".
img_names = [
    os.path.splitext(os.path.basename(path))[0]
    for path in img_names_path + img_names2_path
]

In [None]:
img_names = list(set(img_names))  # remove duplicates if any

In [None]:
len(img_names)

In [None]:
df.head()

In [None]:
kl[kl['name']=='IM3003_2_right']

In [None]:
# Base ids (the part before the first "_") of knees in df for which no image
# was sampled above. img_names holds full image names such as "IM1512_2_left",
# so they are reduced to their base id first; subtracting the full names from
# the base ids would never remove anything.
shown_base_ids = {name.split('_')[0] for name in img_names}
not_in_imgname = set(df['id'].str.split('_').str[0]) - shown_base_ids

In [None]:
hdbscan_df.head()

In [None]:
def sample_images_by_kl(df, not_in_imgname, kl_targets=(3, 4), random_state=None):
    """Pick one random image id per requested KL score.

    Only rows whose base id (the part of ``df["id"]`` before the first "_")
    is contained in ``not_in_imgname`` are considered.

    Args:
        df: DataFrame with the columns "id" and "KL-Score".
        not_in_imgname: Collection of base ids (e.g. "IM0001") to draw from.
        kl_targets: KL scores to sample an image for (default: 3 and 4).
        random_state: Forwarded to pandas ``sample``; pass an int for a
            reproducible choice. ``None`` keeps the previous unseeded behaviour.

    Returns:
        dict: ``{kl_score: id}`` with the value of the "id" column of the
        chosen row, or ``None`` for a KL score without any matching row.
    """
    # Keep only rows that match the base image IDs
    base_ids = df["id"].str.split("_").str[0]
    df_filtered = df[base_ids.isin(not_in_imgname)]

    results = {}

    for kl_score in kl_targets:
        candidates = df_filtered.loc[df_filtered["KL-Score"] == kl_score, "id"]

        if candidates.empty:
            print(f"No images found for KL={kl_score}")
            results[kl_score] = None
            continue

        # sample one id
        results[kl_score] = candidates.sample(1, random_state=random_state).iloc[0]

    return results

def display_sampled_images(image_paths, title="Selected KL Images", max_grid=3, seed=None):
    """Show the given images in a grid of at most ``max_grid`` x ``max_grid``.

    Args:
        image_paths: List of image paths, e.g. [path_to_KL3, path_to_KL4];
            ``None`` entries are ignored.
        title: Figure title.
        max_grid: Maximum number of rows and of columns of the grid.
        seed: Optional seed for the random selection/ordering of the images.
            ``None`` (default) uses the global ``random`` state as before.

    Returns:
        None. Prints a message and returns early when there is nothing to show.

    Raises:
        ValueError: If there are images to show and ``max_grid`` is smaller than 1.
    """
    # Filter out None values
    imgs = [p for p in image_paths if p is not None]

    if not imgs:
        print("No images to display.")
        return

    if max_grid < 1:
        raise ValueError("max_grid must be >= 1")

    rng = random if seed is None else random.Random(seed)
    n_images = min(len(imgs), max_grid * max_grid)
    sample_imgs = rng.sample(imgs, n_images)
    print(sample_imgs)

    # grid shape: at most max_grid columns, enough rows to fit all images
    cols = min(max_grid, n_images)
    rows = (n_images + cols - 1) // cols

    # squeeze=False always returns a 2D array of axes, so one ravel() covers
    # the 1x1, 1xN and MxN cases alike.
    fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows), squeeze=False)
    fig.suptitle(title, fontsize=18)
    axes = axes.ravel()

    for ax, img_file in zip(axes, sample_imgs):
        # Close the file handle as soon as the grayscale copy is drawn.
        with Image.open(img_file) as img:
            ax.imshow(img.convert("L"), cmap="gray")
        ax.axis("off")

    # turn off unused axes
    for ax in axes[len(sample_imgs):]:
        ax.axis("off")

    plt.tight_layout()
    plt.show()


In [None]:
res = sample_images_by_kl(hdbscan_df, not_in_imgname)

In [None]:
# One entry per sampled KL score, in the order of `res`: the path of the chosen
# image inside `img_path`, or None when no image was found for that score.
res_paths = [
    os.path.join(img_path, f'{image_id}.png') if image_id is not None else None
    for image_id in res.values()
]

In [None]:
display_sampled_images(res_paths, title="Cluster 3: Example Images", max_grid=3)

In [None]:
hdbscan_df[hdbscan_df['cluster_label']==3]

In [None]:
path = './data/raw/images_knee/train'

In [None]:
subf = ['train', 'test']
kl = [0 , 1,2,3,4]

In [None]:
import os
import config

base_dir = config.RAW_DATA_PATH

In [None]:
for sf in subf:
    for k in kl:
        l = os.listdir(os.path.join(base_dir,'images_knee',sf, str(k)))
        print(f'Subfolder: {sf}, KL-score: {k}, Number of images: {len(l)}')

In [None]:
patients = []
for sf in subf:
    for k in kl:
        l = os.listdir(os.path.join(base_dir,'images_knee',sf, str(k)))
        for img in l:
            img_name = img.split('.')[0]
            patient_id = img_name.split('_')[0]
            patients.append(patient_id)

patients = list(set(patients))
print(f'Number of patients: {len(patients)}')
        # print(f'Subfolder: {sf}, KL-score: {k}, Number of patients: {len(patients)}')


In [None]:

def show_klscore_example(klpath, max_grid=3, seed=42):
    """Plot a grid of randomly sampled example images for every KL score.

    Args:
        klpath: Directory that contains one sub-folder per KL score, named
            "0" to "4".
        max_grid: Maximum number of rows and of columns of the grid; at most
            ``max_grid * max_grid`` images are shown per KL score.
        seed: Value passed to ``random.seed`` before sampling.

    Returns:
        list[list[str]]: For every KL score whose folder exists and contains at
        least one image (in ascending score order), the sampled image paths.

    Raises:
        ValueError: If ``max_grid`` is smaller than 1.
    """
    if max_grid < 1:
        raise ValueError("max_grid must be >= 1")

    random.seed(seed)

    clusters = [0, 1, 2, 3, 4]

    img_names_l = []

    for cluster in clusters:
        cluster_dir = os.path.join(klpath, str(cluster))

        # A data set may lack a grade folder; report it and go on instead of
        # aborting the whole overview with FileNotFoundError.
        if not os.path.isdir(cluster_dir):
            print(f"KL-Score folder not found: {cluster_dir}")
            continue

        # os.listdir returns entries in arbitrary order; sorting makes the
        # seeded sample reproducible across machines and file systems.
        imgs = sorted(
            os.path.join(cluster_dir, f)
            for f in os.listdir(cluster_dir)
            if f.lower().endswith((".png", ".jpg", ".jpeg"))
        )

        if not imgs:
            print(f"No images found in {cluster}")
            continue

        # how many images to show
        n_images = min(len(imgs), max_grid * max_grid)
        sample_imgs = random.sample(imgs, n_images)
        img_names_l.append(sample_imgs)

        # grid shape: at most max_grid columns, enough rows to fit all images
        cols = min(max_grid, n_images)
        rows = (n_images + cols - 1) // cols

        # squeeze=False always returns a 2D array of axes, so one ravel()
        # covers the 1x1, 1xN and MxN cases alike.
        fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows), squeeze=False)
        fig.suptitle(f"KL-Score {cluster}: Example Images", fontsize=18)
        axes = axes.ravel()

        for ax, img_file in zip(axes, sample_imgs):
            # Close the file handle as soon as the grayscale copy is drawn.
            with Image.open(img_file) as img:
                ax.imshow(img.convert("L"), cmap="gray")
            ax.axis("off")

        # hide unused axes
        for ax in axes[len(sample_imgs):]:
            ax.axis("off")

        plt.tight_layout()
        plt.show()
    return img_names_l



In [None]:
_ =show_klscore_example(os.path.join(base_dir,'images_knee', 'train'), max_grid=3, seed=42)