In [None]:
import os
import random
import matplotlib.pyplot as plt
from PIL import Image
import config
import numpy as np

base_dir = config.RAW_DATA_PATH
root_dir = os.path.join(base_dir, 'kaggle')  # folder containing subfolders 0,1,2,3,4

clusters = ["0", "1", "2", "3", "4"]
num_samples = 3
sample_seed = 42  # set to None to get a different random draw on every run

# Dedicated generator so the draw is reproducible without touching the
# global `random` state that other cells may rely on.
sample_rng = random.Random(sample_seed)

# One figure per KL-score folder, showing up to `num_samples` random images.
for cluster in clusters:
    folder_path = os.path.join(root_dir, cluster)
    # os.listdir returns entries in arbitrary order; sort so that a fixed
    # seed always selects the same files.
    img_files = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith(('.png','.jpg','.jpeg'))
    )

    if not img_files:
        print(f"KL-Score {cluster}: no images found in {folder_path}")
        continue

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available images.
    sampled = sample_rng.sample(img_files, min(num_samples, len(img_files)))

    # squeeze=False keeps `axes` two-dimensional even when num_samples == 1.
    fig, axes = plt.subplots(1, num_samples, figsize=(num_samples*4, 4), squeeze=False)
    row_axes = axes[0]

    # Hide every axis up front so unused slots stay blank.
    for ax in row_axes:
        ax.axis("off")

    for ax, img_name in zip(row_axes, sampled):
        img_path = os.path.join(folder_path, img_name)
        # Context manager releases the file handle as soon as the pixels
        # have been handed to matplotlib.
        with Image.open(img_path) as img:
            ax.imshow(img, cmap='gray')

    fig.suptitle(f"KL-Score {cluster}: Example Images", fontsize=14)
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # avoid accumulating open figures across iterations


In [None]:
def load_image(path):
    """Load an image file and return it as a grayscale NumPy array.

    Parameters
    ----------
    path : str or path-like
        Location of the image file; passed straight to ``PIL.Image.open``.

    Returns
    -------
    numpy.ndarray
        Pixel data of the image after conversion to Pillow mode ``"L"``.

    Raises
    ------
    Whatever ``PIL.Image.open`` raises for a missing or unreadable file.
    """
    # The context manager guarantees the underlying file handle is released
    # even if the conversion fails part-way.
    with Image.open(path) as img:
        return np.array(img.convert("L"))  # grayscale array


clusters = ["0", "1", "2", "3", "4"]

# Per-cluster lists of image statistics, keyed by the cluster folder name:
#   brightness -> mean pixel intensity of each image
#   contrast   -> standard deviation of pixel intensity of each image
brightness_per_cluster = {}
contrast_per_cluster = {}

for cluster in clusters:
    folder_path = os.path.join(root_dir, cluster)

    # Full paths of every image file in this cluster's folder.
    img_files = []
    for name in os.listdir(folder_path):
        if name.lower().endswith(('.png','.jpg','.jpeg')):
            img_files.append(os.path.join(folder_path, name))

    # `map` is lazy, so each image is loaded once and only one is held in
    # memory at a time while its (mean, std) pair is computed.
    stats = [
        (np.mean(pixels), np.std(pixels))
        for pixels in map(load_image, img_files)
    ]

    brightness_per_cluster[cluster] = [mean for mean, _ in stats]
    contrast_per_cluster[cluster] = [std for _, std in stats]

In [None]:
def _plot_distribution_by_cluster(ax, values_per_cluster, title, xlabel, bins=20):
    """Overlay one semi-transparent histogram per cluster on ``ax``.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    values_per_cluster : dict
        Maps a cluster label to the list of values measured for that cluster.
    title : str
        Title of the panel.
    xlabel : str
        Label of the x axis.
    bins : int, optional
        Number of equal-width bins spanning the pooled range of all clusters.
    """
    # Bin edges are derived from the pooled values so that every cluster is
    # binned identically; letting each hist() call choose its own 20 bins
    # yields bars of different widths that cannot be compared.
    pooled = [v for values in values_per_cluster.values() for v in values]
    bin_edges = np.histogram_bin_edges(pooled, bins=bins)

    for cluster, values in values_per_cluster.items():
        # 'barstacked' only stacks datasets passed in a single call; with one
        # dataset per call it renders like 'bar', so state that directly.
        ax.hist(values, bins=bin_edges, alpha=0.5, histtype='bar',
                linewidth=2, label=f"KL-Score {cluster}")

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")
    ax.legend()


fig, (ax_brightness, ax_contrast) = plt.subplots(1, 2, figsize=(14, 6))

# Brightness
_plot_distribution_by_cluster(
    ax_brightness,
    brightness_per_cluster,
    title="Brightness distribution by KL-Score",
    xlabel="Mean intensity",
)

# Contrast
_plot_distribution_by_cluster(
    ax_contrast,
    contrast_per_cluster,
    title="Contrast distribution by KL-Score",
    xlabel="Std intensity",
)

fig.tight_layout()
plt.show()
