Initial Clustering

In [2]:
import os
import cv2
import numpy as np
from skimage.feature import hog
from sklearn.cluster import MiniBatchKMeans
import shutil

In [2]:
# Function to load images and preprocess them in batches
def load_images_from_folder(folder, batch_size=1000):
    """Yield batches of 64x64 grayscale images read from ``folder``.

    Only files whose names end in ``.png`` or ``.jpg`` are considered
    (the check is case-sensitive and not recursive). The directory listing
    is sorted so that batches are produced in the same order on every run;
    ``os.listdir`` itself returns entries in arbitrary order.

    Files that OpenCV cannot decode are reported and skipped instead of
    aborting the whole run.

    Parameters
    ----------
    folder : str
        Directory to scan for images.
    batch_size : int, default 1000
        Maximum number of images per yielded batch.

    Yields
    ------
    tuple of (list, list of str)
        The resized images of one batch and their file names, aligned by
        index. The last batch may hold fewer than ``batch_size`` images.
    """
    images = []
    filenames = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(('.png', '.jpg')):
            continue
        img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread reports an unreadable/corrupt file by returning None
            # rather than raising; handing that to cv2.resize would crash.
            print(f"Skipping unreadable image: {filename}")
            continue
        # Resize image to ensure it meets the minimum size requirement for HOG
        images.append(cv2.resize(img, (64, 64)))
        filenames.append(filename)
        if len(images) == batch_size:
            yield images, filenames
            # Start fresh lists so the batch just handed out is not mutated.
            images = []
            filenames = []
    if images:  # Yield the remaining images if any
        yield images, filenames

# Function to extract HOG features from images
def extract_hog_features(images):
    """Compute one HOG descriptor per image.

    Each image is passed to ``skimage.feature.hog`` with 8x8-pixel cells,
    2x2-cell blocks and ``feature_vector=True``.

    Parameters
    ----------
    images : iterable
        Images to describe, one at a time.

    Returns
    -------
    list
        The descriptors, in the same order as ``images``.
    """
    return [
        hog(image, pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True)
        for image in images
    ]

# Function to perform K-means clustering and save the clustered images
def cluster_and_save_images(images_folder, output_folder, num_clusters=100, batch_size=1000):
    """Cluster the images in ``images_folder`` and copy each into its cluster folder.

    The work is done in two passes over the folder so that only one batch of
    images is held in memory at a time:

    1. A ``MiniBatchKMeans`` model (``random_state=42``) is fitted
       incrementally with ``partial_fit`` on the HOG features of each batch.
    2. Every batch is loaded again, its labels are predicted, and each
       original file is copied to ``output_folder/cluster_<label>/``.

    Both passes go through ``load_images_from_folder`` and
    ``extract_hog_features`` so that fitting and prediction always see
    identically preprocessed images.

    Parameters
    ----------
    images_folder : str
        Directory holding the images to cluster.
    output_folder : str
        Directory that receives one ``cluster_<label>`` sub-folder per
        non-empty cluster. Created if missing; existing content is kept.
    num_clusters : int, default 100
        Number of k-means clusters.
    batch_size : int, default 1000
        Number of images processed per batch.

    Raises
    ------
    ValueError
        If the first batch contains fewer images than ``num_clusters``; the
        model cannot initialise its centroids from fewer samples.
    """
    kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42)

    # Pass 1: fit the centroids incrementally, one batch at a time.
    is_fitted = False
    for batch_images, _ in load_images_from_folder(images_folder, batch_size=batch_size):
        features = np.array(extract_hog_features(batch_images))
        if not is_fitted and len(features) < num_clusters:
            # The first partial_fit call initialises the centroids from this
            # batch and rejects fewer samples than clusters; fail here with a
            # message that says what to change.
            raise ValueError(
                f"The first batch holds {len(features)} image(s), but at least "
                f"{num_clusters} are needed to initialise {num_clusters} clusters; "
                "lower num_clusters, raise batch_size, or provide more images."
            )
        kmeans.partial_fit(features)
        is_fitted = True

    # Create the output root (kept even when no image was found).
    os.makedirs(output_folder, exist_ok=True)

    # Pass 2: label every image with the fitted model and copy the original
    # file (not the resized grayscale version) into its cluster folder.
    for batch_images, batch_filenames in load_images_from_folder(images_folder, batch_size=batch_size):
        features = np.array(extract_hog_features(batch_images))
        labels = kmeans.predict(features)
        for filename, label in zip(batch_filenames, labels):
            cluster_folder = os.path.join(output_folder, f'cluster_{label}')
            os.makedirs(cluster_folder, exist_ok=True)
            shutil.copy(os.path.join(images_folder, filename), os.path.join(cluster_folder, filename))

    print("Clustering completed and images are saved in respective folders.")

# Input and output locations (relative to the current working directory)
images_folder = "segmented_data"
output_folder = "Clusters"

# Run the initial clustering and copy every image into its cluster folder
cluster_and_save_images(images_folder=images_folder, output_folder=output_folder)

Clustering completed and images are saved in respective folders.


Refine Clusters Created

In [4]:
# Function to load images and preprocess them
def load_images_from_folder(folder, batch_size=1000):
    """Yield batches of 64x64 grayscale images read from ``folder``.

    NOTE(review): this cell re-defines the helper of the same name from the
    initial-clustering cell; once this cell runs, this definition is the one
    in effect. Consider keeping a single definition.

    Only files whose names end in ``.png`` or ``.jpg`` are considered
    (the check is case-sensitive and not recursive). The directory listing
    is sorted so that batches are produced in the same order on every run;
    ``os.listdir`` itself returns entries in arbitrary order.

    Files that OpenCV cannot decode are reported and skipped instead of
    aborting the whole run.

    Parameters
    ----------
    folder : str
        Directory to scan for images.
    batch_size : int, default 1000
        Maximum number of images per yielded batch.

    Yields
    ------
    tuple of (list, list of str)
        The resized images of one batch and their file names, aligned by
        index. The last batch may hold fewer than ``batch_size`` images.
    """
    images = []
    filenames = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(('.png', '.jpg')):
            continue
        img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread reports an unreadable/corrupt file by returning None
            # rather than raising; handing that to cv2.resize would crash.
            print(f"Skipping unreadable image: {filename}")
            continue
        # Resize image to ensure it meets the minimum size requirement for HOG
        images.append(cv2.resize(img, (64, 64)))
        filenames.append(filename)
        if len(images) == batch_size:
            yield images, filenames
            # Start fresh lists so the batch just handed out is not mutated.
            images = []
            filenames = []
    if images:  # Yield the remaining images if any
        yield images, filenames

# Function to extract HOG features from images
def extract_hog_features(images):
    """Return the HOG descriptor of every image in ``images``.

    NOTE(review): this cell re-defines the helper of the same name from the
    initial-clustering cell with the same parameters.

    Parameters
    ----------
    images : iterable
        Images to describe.

    Returns
    -------
    list
        One descriptor per image, in input order, computed with 8x8-pixel
        cells, 2x2-cell blocks and ``feature_vector=True``.
    """
    def describe(image):
        # Single place for the HOG parameters used throughout this cell.
        return hog(image, pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True)

    return list(map(describe, images))

# Function to perform K-means clustering on images in a folder
def refine_clusters(input_folder, output_base_folder, num_clusters=2, batch_size=1000):
    """Split every cluster folder under ``input_folder`` into sub-clusters.

    For each sub-directory of ``input_folder`` a separate ``MiniBatchKMeans``
    model (``random_state=42``) is fitted on the HOG features of the images it
    contains, and each image is copied to
    ``output_base_folder/<cluster folder>/cluster_<label>/``.

    Sub-directories holding fewer ``.png``/``.jpg`` files than
    ``num_clusters`` are reported and skipped; nothing is copied for them.
    Entries of ``input_folder`` that are not directories are ignored.

    Fitting and prediction both go through ``load_images_from_folder`` and
    ``extract_hog_features`` so the two passes always see identically
    preprocessed images.

    Parameters
    ----------
    input_folder : str
        Directory containing one sub-directory per initial cluster.
    output_base_folder : str
        Directory that receives the refined clusters. Created as needed;
        existing content is kept.
    num_clusters : int, default 2
        Number of sub-clusters to form inside each initial cluster.
    batch_size : int, default 1000
        Number of images processed per batch.
    """
    # Sorted so folders are processed (and messages printed) in a stable order.
    for cluster_folder in sorted(os.listdir(input_folder)):
        cluster_path = os.path.join(input_folder, cluster_folder)
        if not os.path.isdir(cluster_path):
            continue

        # Count the number of images in the cluster
        num_images = sum(
            1 for name in os.listdir(cluster_path) if name.endswith(('.png', '.jpg'))
        )

        # Skip clusters with fewer images than the number of clusters
        if num_images < num_clusters:
            print(f"Skipping {cluster_folder} as it contains fewer images ({num_images}) than the number of clusters ({num_clusters})")
            continue

        # Create a subfolder in the output folder
        refined_output_folder = os.path.join(output_base_folder, cluster_folder)
        os.makedirs(refined_output_folder, exist_ok=True)

        # A fresh model per initial cluster: sub-clusters are independent.
        kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42)

        # Pass 1: fit the centroids incrementally, one batch at a time.
        for batch_images, _ in load_images_from_folder(cluster_path, batch_size=batch_size):
            kmeans.partial_fit(np.array(extract_hog_features(batch_images)))

        # Pass 2: label each batch and copy the original files.
        for batch_images, batch_filenames in load_images_from_folder(cluster_path, batch_size=batch_size):
            labels = kmeans.predict(np.array(extract_hog_features(batch_images)))
            for filename, label in zip(batch_filenames, labels):
                final_cluster_folder = os.path.join(refined_output_folder, f'cluster_{label}')
                os.makedirs(final_cluster_folder, exist_ok=True)
                shutil.copy(os.path.join(cluster_path, filename), os.path.join(final_cluster_folder, filename))

    print("Refined clustering completed and images are saved in respective folders.")

# Where the initial clusters live and where the refined ones are written
input_base_folder = "Clusters"
output_base_folder = "Refined_Clusters"

# Split each initial cluster into sub-clusters
refine_clusters(input_folder=input_base_folder, output_base_folder=output_base_folder)

Skipping cluster_0 as it contains fewer images (1) than the number of clusters (2)
Refined clustering completed and images are saved in respective folders.
