In [None]:
import config
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from IPython.display import display
import json
import datetime
import math
import joblib
from PIL import Image
from sklearn.preprocessing import StandardScaler

In [None]:
# Date on which the notebook is executed.
today = datetime.date.today()

# Root directories come from the project-level config module.
base_dir = config.RAW_DATA_PATH
proc_dir = config.PROC_DATA_PATH

# Folder holding the knee images.
img_path = os.path.join(base_dir, 'images_knee')

# --- Inactive: locations of earlier radiographic-feature runs ---------------
# rawimg_folder = "2025-09-13_hdbscan_img"
# rawimg_folder_date = rawimg_folder.split('_')[0]
# rawimg_run = "run32"

# feature_folder = "2025-08-11_hdbscan"
# feature_folder_date = feature_folder.split('_')[0]
# feature_run = "run22"

# rawimg_filepath =  os.path.join(proc_dir, "radiographic_features", rawimg_folder, rawimg_run)
# feature_filepath =  os.path.join(proc_dir, "radiographic_features", feature_folder, feature_run)

# --- Questionnaire clustering run that is analysed in this notebook ---------
ques_folder = "2025-08-23_hdbscan"
ques_folder_date = ques_folder.partition('_')[0]  # text before the first underscore
ques_run = "run2"

# Directory of the selected run; the averaged images are written here as well.
ques_filepath = os.path.join(proc_dir, ques_folder, "questionnaire", ques_run)

# Questionnaire table; later cells read its 'cluster_label' and 'id' columns.
ques_csv_path = os.path.join(ques_filepath, "questionnaire_run2_umap_hdbscan_scaled_wKL_v2.csv")
ques = pd.read_csv(ques_csv_path)
display(ques.head())

In [None]:
# scaler = joblib.load(os.path.join(rawimg_filepath, "scaler.pkl"))
from utils.load_utils import load_image_folder_as_array

In [None]:
# Distinct cluster labels found in the questionnaire table, in ascending order.
clusters = np.sort(ques['cluster_label'].unique())

print(f"Number of clusters: {len(clusters)}")

In [None]:
# Names of the PNG files located directly inside the image directory.
files = [entry for entry in os.listdir(img_path) if entry.endswith('.png')]
print(f"Number of image files: {len(files)}")

In [None]:
# Gather the image vectors of both splits and fit a standard scaler on them.
names = []
X = []
for subdir in ('train', 'test'):
    feature_dir = os.path.join(img_path, subdir)
    # Only the first element of the returned triple (the image data) is used here.
    X_sub, _, _ = load_image_folder_as_array(
        feature_dir,
        image_size=(256, 256),
    )
    X.extend(X_sub)

scaler = StandardScaler()
scaler.fit(X)

In [None]:
def load_image(im_path, image_size=(256, 256)):
    """Load one image file as a flattened grayscale row vector.

    Args:
        im_path: Path of the image file to open.
        image_size: ``(width, height)`` the image is resized to before
            it is flattened.

    Returns:
        numpy array of shape ``(1, width * height)`` containing the pixel
        values of the resized grayscale image.
    """
    # Image.open is lazy and leaves the file handle open; the context manager
    # closes it deterministically, including when decoding raises.
    with Image.open(im_path) as src:
        gray = src.convert('L').resize(image_size)  # 'L' = 8-bit grayscale

    # reshape(1, -1) turns the 2-D pixel grid into a single (1, n_pixels) row.
    return np.array(gray).reshape(1, -1)

In [None]:
# Build, save and show one pixel-wise mean image per questionnaire cluster.
w, h = 256, 256  # every image is resized to this size before averaging

for cluster in clusters:
    # Subject ids that were assigned to the current cluster.
    df_temp = ques[ques['cluster_label'] == cluster]
    cluster_ids = df_temp['id'].tolist()

    # Keep only the subjects whose "<id>.png" actually exists in img_path.
    filenames = [f"{subject_id}.png" for subject_id in cluster_ids]
    candidate_paths = [os.path.join(img_path, fname) for fname in filenames]
    images = [path for path in candidate_paths if os.path.exists(path)]

    n = len(images)
    if n == 0:
        # np.stack raises ValueError on an empty list; skip this cluster
        # instead of aborting the remaining ones.
        print(f"Cluster {cluster}: no matching images found, skipping")
        continue

    # load_image returns a (1, w*h) row; restore the 2-D layout for averaging.
    imgs = [
        np.array(load_image(im, image_size=(w, h)), dtype=float).reshape(h, w)
        for im in images
    ]

    # Mean over the image axis, then round back to 8-bit grayscale.
    arr = np.mean(np.stack(imgs, axis=0), axis=0)
    arr = np.array(np.round(arr), dtype=np.uint8)
    out = Image.fromarray(arr)
    out.save(os.path.join(ques_filepath, f"average_cluster_{cluster}.png"))
    out.show()
    

In [None]:
# Reload the image vectors together with their labels from both splits.
X, y = [], []
for subdir in ('train', 'test'):
    feature_dir = os.path.join(img_path, subdir)
    X_sub, y_sub, _ = load_image_folder_as_array(
        feature_dir,
        image_size=(256, 256),
    )
    X.extend(X_sub)
    y.extend(y_sub)

# Report the shape of the collected data, then of the collected labels.
for collected in (X, y):
    print(np.array(collected).shape)

In [None]:
# Build, save and show one pixel-wise mean image per label value 0..4.
for kl in range(0, 5):
    print(f"KL-Score {kl}")

    # Pair each image vector with its label and keep those matching `kl`.
    # assumes every entry of X holds 256*256 values — TODO confirm against
    # load_image_folder_as_array
    imgs = [
        np.array(pixels, dtype=float).reshape(256, 256)
        for pixels, label in zip(X, y)
        if label == kl
    ]

    if not imgs:
        # np.stack raises ValueError on an empty list; skip grades that have
        # no samples instead of aborting the remaining ones.
        print(f"No images with KL-Score {kl}, skipping")
        continue

    arr = np.mean(np.stack(imgs, axis=0), axis=0)
    # Clip before the uint8 cast so out-of-range values cannot wrap around.
    # NOTE(review): presumably X holds raw 0-255 pixel values — confirm.
    arr = np.clip(np.round(arr), 0, 255).astype(np.uint8)
    out = Image.fromarray(arr)
    out.save(os.path.join(ques_filepath, f"average_cluster_{kl}_byKL.png"))
    out.show()