In [None]:
import os
import cv2
import numpy as np
from skimage.feature import hog
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import shutil

In [1]:
# Function to load images and preprocess them
def load_images_from_folder(folder, batch_size=1000):
    """Yield batches of grayscale, 64x64-resized images read from ``folder``.

    Only files whose names end in ``.png`` or ``.jpg`` are considered.
    Files that OpenCV cannot decode are skipped with a warning instead of
    aborting the whole run.

    Args:
        folder: Path of the directory to scan (non-recursive).
        batch_size: Maximum number of images per yielded batch.

    Yields:
        Tuples ``(images, filenames)`` of two parallel lists: the resized
        grayscale arrays and the file names (relative to ``folder``) they
        were read from. The final batch may be shorter than ``batch_size``.
    """
    import warnings  # local import: only used on the unreadable-file path

    images = []
    filenames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # batches (and everything derived from the sample order) reproducible.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(('.png', '.jpg')):
            continue
        img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread reports failure by returning None rather than
            # raising; passing None on to cv2.resize would crash.
            warnings.warn(f"Skipping unreadable image: {filename}")
            continue
        # Resize image to ensure it meets the minimum size requirement for HOG
        images.append(cv2.resize(img, (64, 64)))
        filenames.append(filename)
        if len(images) == batch_size:
            yield images, filenames
            images, filenames = [], []
    if images:  # Yield the remaining images if any
        yield images, filenames

# Function to extract HOG features from images
def extract_hog_features(images):
    """Compute one HOG descriptor per image.

    Each image is passed to ``hog`` with 8x8-pixel cells and 2x2-cell
    blocks, and the result is requested as a flat feature vector.

    Args:
        images: Iterable of image arrays accepted by ``hog``.

    Returns:
        A list with one feature vector per input image, in input order.
    """
    return [
        hog(image, pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True)
        for image in images
    ]

# Function to perform HAC clustering with PCA for feature reduction
def cluster_and_save_images_hac_with_pca(images_folder, output_folder, num_clusters=100, n_components=100, batch_size=1000):
    """Cluster the images in ``images_folder`` and copy them into per-cluster folders.

    Pipeline: load images in batches, extract HOG features, reduce them with
    PCA, run agglomerative (hierarchical) clustering, then copy every image
    to ``<output_folder>/cluster_<label>/``.

    Args:
        images_folder: Directory containing the input images.
        output_folder: Directory under which the ``cluster_<label>`` folders
            are created (created itself if missing).
        num_clusters: Requested number of clusters. Reduced to the number of
            loaded images when fewer images are available.
        n_components: Requested number of PCA components. An integer value
            is reduced to ``min(n_samples, n_features)`` when the data is
            smaller than that.
        batch_size: Number of images loaded per batch.

    Returns:
        None. Results are written to disk and a status line is printed.
    """
    # Initialize lists to collect features and filenames
    all_features = []
    all_filenames = []

    for batch_images, batch_filenames in load_images_from_folder(images_folder, batch_size=batch_size):
        all_features.extend(extract_hog_features(batch_images))
        all_filenames.extend(batch_filenames)

    if not all_features:
        # PCA would fail on an empty array; report and stop cleanly instead.
        print(f"No images found in '{images_folder}'; nothing to cluster.")
        return

    all_features = np.array(all_features)
    n_samples, n_features = all_features.shape

    if n_samples < 2:
        # Neither PCA nor agglomerative clustering is meaningful for a
        # single sample; it trivially forms its own cluster.
        labels = np.zeros(n_samples, dtype=int)
    else:
        # Apply PCA to reduce feature dimensions. An integer component count
        # may not exceed min(n_samples, n_features), so clamp it for small
        # datasets (non-integer settings are passed through untouched).
        if isinstance(n_components, (int, np.integer)):
            n_components = min(n_components, n_samples, n_features)
        reduced_features = PCA(n_components=n_components).fit_transform(all_features)

        # Perform HAC clustering; it cannot form more clusters than samples.
        if num_clusters is not None:
            num_clusters = min(num_clusters, n_samples)
        hac = AgglomerativeClustering(n_clusters=num_clusters)
        labels = hac.fit_predict(reduced_features)

    # Create folders for each cluster and copy images into the respective folders
    os.makedirs(output_folder, exist_ok=True)

    for filename, label in zip(all_filenames, labels):
        cluster_folder = os.path.join(output_folder, f'cluster_{label}')
        os.makedirs(cluster_folder, exist_ok=True)
        shutil.copy(os.path.join(images_folder, filename), os.path.join(cluster_folder, filename))

    print("Clustering completed and images are saved in respective folders.")

# Source folder of images and destination folder for the clustered copies
images_folder = "segmented_data"
output_folder = "cluster_hac_pca"

# Run the pipeline with its default cluster count, PCA size and batch size
cluster_and_save_images_hac_with_pca(
    images_folder=images_folder,
    output_folder=output_folder,
)


Clustering completed and images are saved in respective folders.
