In [None]:
import datetime
import os
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import glob
import config
from pathlib import Path

import joblib

from utils.load_utils import load_image_folder_as_array

In [None]:
def load_image(img_path, size = (256, 256)):
    """Read an image from disk as grayscale and resize it.

    Args:
        img_path: Location of the image file (``str`` or path-like).
        size: Target size handed to ``cv2.resize`` (OpenCV's ``dsize``,
            i.e. ``(width, height)``).

    Returns:
        The resized single-channel image array.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``img_path`` (missing file,
            empty path, or unsupported/corrupt image).
    """
    # cv2.imread does not raise on failure; it returns None, which would
    # otherwise only surface as an opaque error inside cv2.resize.
    img = cv2.imread(os.fspath(img_path), cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path!r}")
    return cv2.resize(img, size)

In [None]:
# --- Run selection ---------------------------------------------------------
today = datetime.date.today()

folder = "2025-09-12_hdbscan"
folder_date = folder.partition('_')[0]  # leading date part of the folder name
run = "run92"

# --- Input / output locations ----------------------------------------------
proc_dir = config.PROC_DATA_PATH
img_path = os.path.join(config.RAW_DATA_PATH, 'images_knee', '600x600_imgs')
filepath = os.path.join(proc_dir, "radiographic_features", folder, run)

# --- Cluster assignments for this run --------------------------------------
df_path = os.path.join(filepath, f"{folder}_{run}_umap_hdbscan_scaled.csv")
df = pd.read_csv(df_path)
display(df.head())

# --- Scaler stored alongside the run ----------------------------------------
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load scaler.pkl files produced by a trusted run.
scaler = joblib.load(os.path.join(filepath, "scaler.pkl"))

In [None]:
#Load Images/image filepaths
# Walk <img_path>/<train|test>/<label folder>/ once and remember, for every
# file, the part of its name before the first '.' together with its path.
# If the same name occurs more than once, the last one visited wins.
subdirs = ['train', 'test']
id_to_path = {}
for subdir in subdirs:
    feature_dir = os.path.join(img_path, subdir)

    for label_folder in sorted(Path(feature_dir).iterdir()):
        if not label_folder.is_dir():
            continue
        for i_path in label_folder.iterdir():
            image_id = i_path.stem.split('.')[0]
            id_to_path[image_id] = str(i_path)

# Assign the whole column in one step instead of writing through
# df['filepath'].iloc[...]: that chained assignment may act on a temporary
# copy and mixes index labels with positional indexing.
# Ids are compared as strings because the file stems are strings; rows with
# no matching file keep the "" placeholder.
df['filepath'] = df['id'].astype(str).map(id_to_path).fillna("")

In [None]:
# Show the rows whose 'filepath' is still the empty string
df.loc[df['filepath'].eq("")]

In [None]:
#df.to_csv(os.path.join(filepath, f"{folder}_{run}_umap_hdbscan_scaled.csv"), index=False)

In [None]:
# Distinct cluster labels present in df, in ascending order
clusters = np.sort(df['cluster_label'].unique())
print(clusters)

## Without Scaler

In [None]:
# One figure per cluster label: a histogram of per-image mean intensity
# ("brightness") next to a histogram of per-image intensity standard
# deviation ("contrast"), computed on the images as loaded by load_image.
for cluster in clusters:
    cluster_rows = df.loc[df['cluster_label'] == cluster]

    fig, (ax_brightness, ax_contrast) = plt.subplots(1, 2, figsize=(30, 15))

    brightness_vals, contrast_vals = [], []
    for image_file in cluster_rows['filepath']:
        pixels = load_image(image_file)
        brightness_vals.append(np.mean(pixels))
        contrast_vals.append(np.std(pixels))

    ax_brightness.set_title(f'Cluster {cluster} - Brightness')
    ax_brightness.hist(brightness_vals, bins=20, color='skyblue', alpha=0.7, edgecolor='black')

    ax_contrast.set_title(f'Cluster {cluster} - Contrast')
    ax_contrast.hist(contrast_vals, bins=20, color='lightgreen', alpha=0.7, edgecolor='black')

    fig.tight_layout()
    plt.show()


In [None]:
# Keep only rows whose cluster label is not -1 (presumably the HDBSCAN noise
# label - confirm). df_filtered is reused by the following cell.
df_filtered = df[df['cluster_label']!=-1]

fig, (ax_brightness, ax_contrast) = plt.subplots(1, 2, figsize=(14, 6))

# Every image is read once and both statistics are taken from that single
# array, so the brightness and contrast panels no longer each trigger their
# own full pass over the image files.
for cluster, df_temp in df_filtered.groupby("cluster_label"):
    brightness_vals = []
    contrast_vals = []
    for image_file in df_temp['filepath']:
        img = load_image(image_file)
        brightness_vals.append(np.mean(img))
        contrast_vals.append(np.std(img))

    hist_style = dict(bins=20, alpha=0.5, histtype='barstacked', linewidth=2, label=f"Cluster {cluster}")
    ax_brightness.hist(brightness_vals, **hist_style)
    ax_contrast.hist(contrast_vals, **hist_style)

# Brightness histogram (all clusters together)
ax_brightness.set_title("Brightness distribution by cluster")
ax_brightness.set_xlabel("Mean intensity")
ax_brightness.set_ylabel("Count")
ax_brightness.legend()

# Contrast histogram (all clusters together)
ax_contrast.set_title("Contrast distribution by cluster")
ax_contrast.set_xlabel("Std dev intensity")
ax_contrast.set_ylabel("Count")
ax_contrast.legend()

fig.tight_layout()
# Save before show(): the figure may be closed once it has been displayed.
fig.savefig(os.path.join(filepath, "histogram_cluster_labels.png"))
plt.show()

In [None]:
# Same comparison as the per-cluster overlay, but grouped by the 'label'
# column (shown as "KL-Score" in the legend) of df_filtered.
fig, (ax_brightness, ax_contrast) = plt.subplots(1, 2, figsize=(14, 6))

# Every image is read once and both statistics are taken from that single
# array, so the two panels no longer each trigger their own full pass over
# the image files.
for kl_score, df_temp in df_filtered.groupby("label"):
    brightness_vals = []
    contrast_vals = []
    for image_file in df_temp['filepath']:
        img = load_image(image_file)
        brightness_vals.append(np.mean(img))
        contrast_vals.append(np.std(img))

    hist_style = dict(bins=20, alpha=0.5, histtype='barstacked', linewidth=2, label=f"KL-Score {kl_score}")
    ax_brightness.hist(brightness_vals, **hist_style)
    ax_contrast.hist(contrast_vals, **hist_style)

# Brightness histogram (all KL scores together)
ax_brightness.set_title("Brightness distribution by KL-Score")
ax_brightness.set_xlabel("Mean intensity")
ax_brightness.set_ylabel("Count")
ax_brightness.legend()

# Contrast histogram (all KL scores together)
ax_contrast.set_title("Contrast distribution by KL-Score")
ax_contrast.set_xlabel("Std dev intensity")
ax_contrast.set_ylabel("Count")
ax_contrast.legend()

fig.tight_layout()
# Save before show(): the figure may be closed once it has been displayed.
fig.savefig(os.path.join(filepath, "histogram_kl_scores.png"))
plt.show()

In [None]:
# df_filtered = df_filtered.copy()
# df_filtered["brightness"] = df_filtered["filepath"].apply(
#     lambda fp: np.mean(load_image(fp))
# )
# df_filtered["contrast"] = df_filtered["filepath"].apply(
#     lambda fp: np.std(load_image(fp))
# )

# plt.figure(figsize=(14, 6))

# # Brightness density
# plt.subplot(1, 2, 1)
# sns.kdeplot(
#     data=df_filtered,
#     x="brightness",
#     hue="cluster_label",
#     common_norm=False,   # keeps distributions comparable instead of normalizing to 1
#     fill=False,          # line only, no fill -> avoids color merging
#     linewidth=2
# )
# plt.title("Brightness distribution by cluster")

# # Contrast density
# plt.subplot(1, 2, 2)
# sns.kdeplot(
#     data=df_filtered,
#     x="contrast",
#     hue="cluster_label",
#     common_norm=False,
#     fill=False,
#     linewidth=2
# )
# plt.title("Contrast distribution by cluster")

# plt.tight_layout()
# plt.show()

## With Scaler

In [None]:
# One figure per cluster label, as in the unscaled section, except that each
# image is passed through scaler.transform before its mean ("brightness")
# and standard deviation ("contrast") are taken.
# NOTE(review): the image goes into scaler.transform as the 2-D array returned
# by load_image; if the scaler is a scikit-learn transformer, each image row
# is then treated as one sample - confirm this matches how it was fitted.
for cluster in clusters:
    cluster_rows = df.loc[df['cluster_label'] == cluster]

    fig, (ax_brightness, ax_contrast) = plt.subplots(1, 2, figsize=(30, 15))

    brightness_vals, contrast_vals = [], []
    for image_file in cluster_rows['filepath']:
        scaled_pixels = scaler.transform(load_image(image_file))
        brightness_vals.append(np.mean(scaled_pixels))
        contrast_vals.append(np.std(scaled_pixels))

    ax_brightness.set_title(f'Cluster {cluster} - Brightness')
    ax_brightness.hist(brightness_vals, bins=20, color='skyblue', alpha=0.7, edgecolor='black')

    ax_contrast.set_title(f'Cluster {cluster} - Contrast')
    ax_contrast.hist(contrast_vals, bins=20, color='lightgreen', alpha=0.7, edgecolor='black')

    fig.tight_layout()
    plt.show()

In [None]:
# Keep only rows whose cluster label is not -1 (presumably the HDBSCAN noise
# label - confirm). df_filtered is reused by the following cell.
df_filtered = df[df['cluster_label']!=-1]

fig, (ax_brightness, ax_contrast) = plt.subplots(1, 2, figsize=(14, 6))

# Every image is loaded and passed through scaler.transform once; both
# statistics come from that single scaled array, so the two panels no longer
# each repeat the load and the transform.
for cluster, df_temp in df_filtered.groupby("cluster_label"):
    brightness_vals = []
    contrast_vals = []
    for image_file in df_temp['filepath']:
        scaled_img = scaler.transform(load_image(image_file))
        brightness_vals.append(np.mean(scaled_img))
        contrast_vals.append(np.std(scaled_img))

    hist_style = dict(bins=20, alpha=0.5, histtype='barstacked', linewidth=2, label=f"Cluster {cluster}")
    ax_brightness.hist(brightness_vals, **hist_style)
    ax_contrast.hist(contrast_vals, **hist_style)

# Brightness histogram (all clusters together)
ax_brightness.set_title("Brightness distribution by cluster")
ax_brightness.set_xlabel("Mean intensity")
ax_brightness.set_ylabel("Count")
ax_brightness.legend()

# Contrast histogram (all clusters together)
ax_contrast.set_title("Contrast distribution by cluster")
ax_contrast.set_xlabel("Std dev intensity")
ax_contrast.set_ylabel("Count")
ax_contrast.legend()

fig.tight_layout()
# Save before show(): the figure may be closed once it has been displayed.
fig.savefig(os.path.join(filepath, "histogram_standardscaler_cluster_labels.png"))
plt.show()

In [None]:
# Scaled-image comparison grouped by the 'label' column (shown as "KL-Score"
# in the legend) of df_filtered.
fig, (ax_brightness, ax_contrast) = plt.subplots(1, 2, figsize=(14, 6))

# Every image is loaded and passed through scaler.transform once; both
# statistics come from that single scaled array, so the two panels no longer
# each repeat the load and the transform.
for kl_score, df_temp in df_filtered.groupby("label"):
    brightness_vals = []
    contrast_vals = []
    for image_file in df_temp['filepath']:
        scaled_img = scaler.transform(load_image(image_file))
        brightness_vals.append(np.mean(scaled_img))
        contrast_vals.append(np.std(scaled_img))

    hist_style = dict(bins=20, alpha=0.5, histtype='barstacked', linewidth=2, label=f"KL-Score {kl_score}")
    ax_brightness.hist(brightness_vals, **hist_style)
    ax_contrast.hist(contrast_vals, **hist_style)

# Brightness histogram (all KL scores together)
ax_brightness.set_title("Brightness distribution by KL-Score")
ax_brightness.set_xlabel("Mean intensity")
ax_brightness.set_ylabel("Count")
ax_brightness.legend()

# Contrast histogram (all KL scores together)
ax_contrast.set_title("Contrast distribution by KL-Score")
ax_contrast.set_xlabel("Std dev intensity")
ax_contrast.set_ylabel("Count")
ax_contrast.legend()

fig.tight_layout()
# Save before show(): the figure may be closed once it has been displayed.
fig.savefig(os.path.join(filepath, "histogram_standardscaler_kl_scores.png"))
plt.show()