# In [23]:
import os
import cv2
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Function to extract features from an image
def extract_features(image_path: str) -> np.ndarray:
    """Return a flat, normalised pixel vector for the image at *image_path*.

    The image is read with ``cv2.imread`` (default flags), resized to
    224x224, converted to ``float32``, scaled from [0, 255] to [0, 1] and
    flattened to one dimension.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A 1-D ``float32`` array holding every pixel value of the resized
        image.

    Raises:
        ValueError: If the file is missing or cannot be decoded as an image.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than with an opaque assertion error
    # from cv2.resize.
    if img is None:
        raise ValueError(f"Could not read image: {image_path!r}")

    img = cv2.resize(img, (224, 224)).astype('float32')
    img /= 255.0
    # Flattening makes any batch axis irrelevant, so none is added.
    return img.ravel()

# Folder containing images
folder_path = '/mnt/DATA/Ankita/mmpose_images'

# Number of clusters
num_clusters = 4

# List regular files only, in sorted order. Sub-directories (including the
# cluster_* folders left by a previous run) are not images, and a stable
# order keeps the clustering reproducible for a fixed random_state.
files = sorted(
    name
    for name in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, name))
)

# Feature vectors and the filenames they came from, kept index-aligned so
# that each cluster label can be mapped back to the right image.
features_list = []
valid_files = []

# Iterate through the images and extract features
for filename in files:
    image_path = os.path.join(folder_path, filename)
    try:
        features = extract_features(image_path)
    except (cv2.error, ValueError) as exc:
        # Unreadable or non-image file: skip it rather than abort the run.
        print(f"Skipping {filename}: could not extract features ({exc})")
        continue
    features_list.append(features)
    valid_files.append(filename)

if features_list:
    # Convert the list to a 2-D float array (one row per image)
    features_array = np.array(features_list, dtype=float)

    # Drop every row that contains a NaN, and drop the matching filename too
    nan_free = ~np.isnan(features_array).any(axis=1)
    if nan_free.all():
        features_array_filtered = features_array
    else:
        features_array_filtered = features_array[nan_free]
        valid_files = [
            name for name, keep in zip(valid_files, nan_free) if keep
        ]
else:
    features_array = np.empty((0, 0))
    features_array_filtered = features_array

# Check if there are any valid features left after filtering NaN values
if len(features_array_filtered) == 0:
    print("No valid features found after filtering NaN values.")
elif len(features_array_filtered) < num_clusters:
    # KMeans cannot form more clusters than there are samples.
    print(
        f"Only {len(features_array_filtered)} valid image(s) found; "
        f"at least {num_clusters} are needed to form {num_clusters} clusters."
    )
else:
    # Scale the features (optional but usually beneficial)
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features_array_filtered)

    # Initialize KMeans with the desired number of clusters
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)

    # Perform KMeans clustering
    cluster_labels = kmeans.fit_predict(features_scaled)

    # Create a directory for each cluster
    for i in range(num_clusters):
        cluster_dir = os.path.join(folder_path, f'cluster_{i}')
        os.makedirs(cluster_dir, exist_ok=True)

    # Move (not copy) each image into its cluster directory. valid_files is
    # index-aligned with cluster_labels, unlike the raw directory listing.
    for filename, cluster_label in zip(valid_files, cluster_labels):
        src_path = os.path.join(folder_path, filename)
        dst_path = os.path.join(folder_path, f'cluster_{cluster_label}', filename)
        os.rename(src_path, dst_path)
