In [1]:
# Step 1: Import necessary libraries
import numpy as np
import os
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from shutil import copy
from concurrent.futures import ThreadPoolExecutor
from collections import Counter
import pandas as pd

In [2]:
# Step 2: Create labels from directory structure
# This function assigns a unique label to each folder and lists all image files within.
def create_labels_from_paths(base_dir):
    """Collect image paths under ``base_dir`` and label each by its class folder.

    Every sub-directory of ``base_dir`` is treated as one class. Sub-directories
    are visited in sorted name order and numbered 0, 1, 2, ... in that order.
    Plain files sitting directly in ``base_dir`` (e.g. a LICENSE or README) are
    ignored and do not consume a label index.

    Args:
        base_dir: Directory whose sub-directories each hold the images of one class.

    Returns:
        tuple: ``(labels, image_files)`` where ``labels`` is an integer NumPy
        array and ``image_files`` is a list of full paths; ``labels[i]`` is the
        class index of ``image_files[i]``.
    """
    # Filter to directories *before* enumerating, so that stray files do not
    # shift the label numbering of the class folders that follow them.
    class_folders = [
        name for name in sorted(os.listdir(base_dir))
        if os.path.isdir(os.path.join(base_dir, name))
    ]

    labels = []
    image_files = []
    for idx, cluster_folder in enumerate(class_folders):
        folder_path = os.path.join(base_dir, cluster_folder)
        # NOTE(review): os.listdir returns entries in arbitrary order. The order
        # is deliberately left untouched here because callers pair these paths
        # with rows of a precomputed feature matrix - confirm the features were
        # extracted in this same order before relying on the pairing.
        for image_file in os.listdir(folder_path):
            image_files.append(os.path.join(folder_path, image_file))
            labels.append(idx)  # Every file inherits its folder's class index

    return np.array(labels, dtype=int), image_files

In [3]:
# Step 3: Load feature data
# Load pre-extracted features from a NumPy file.
# The location defaults to the original author's path but can be overridden with
# the XCEPTION_FEATURES_PATH environment variable, so the notebook can be re-run
# on another machine without editing this cell.
FEATURES_PATH = os.environ.get(
    'XCEPTION_FEATURES_PATH',
    'C:\\Users\\Berkay\\PycharmProjects\\DDYM\\model\\xception_flower.npy',
)
features = np.load(FEATURES_PATH)

In [4]:
# Step 4: Scale the dataset
# Learn the per-feature statistics first, then apply them, so every feature
# column ends up with zero mean and unit variance.
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

In [5]:
# Step 5: Apply PCA
# Reduce dimensions to 9 principal components.
# random_state is fixed (same seed as the GMM step) because, with the default
# svd_solver='auto', scikit-learn may select its randomized SVD solver for large
# inputs; without a seed the components - and every downstream cluster label -
# could change between runs.
pca = PCA(n_components=9, random_state=0)
principal_components = pca.fit_transform(scaled_features)

# Print explained variance ratio for each principal component
print(f"Explained variance ratios: {pca.explained_variance_ratio_}")

Explained variance ratios: [0.07718904 0.0529648  0.03668069 0.02669235 0.02371409 0.02206665
 0.01783295 0.01737857 0.01357454]


In [6]:
# Step 6: Cluster using Gaussian Mixture Model (GMM)
# Build a seeded, full-covariance mixture with one component per cluster, then
# fit it on the PCA-reduced data and read back each sample's component index.
n_clusters = 5
gmm = GaussianMixture(
    random_state=0,
    covariance_type='full',
    n_components=n_clusters,
)
gmm_labels = gmm.fit_predict(principal_components)

In [7]:
# Step 7: Calculate silhouette score
# Score the GMM assignment in the same PCA space it was fitted on.
silhouette_avg = silhouette_score(
    X=principal_components,
    labels=gmm_labels,
)
print(f"Silhouette Score: {silhouette_avg:.4f}")

Silhouette Score: 0.2031


In [8]:
# Step 8: Save clusters to separate folders
# This function copies images into new folders based on their cluster labels.
def save_clusters_to_folders(image_files, labels, output_dir, max_workers=8):
    """Copy each image into ``output_dir/cluster_<label>`` according to its label.

    Args:
        image_files: Sequence of source file paths.
        labels: Sequence (list or NumPy array) of cluster labels;
            ``labels[i]`` is the cluster of ``image_files[i]``.
        output_dir: Destination root; created if it does not exist.
        max_workers: Number of threads used for copying.

    Raises:
        ValueError: If ``image_files`` and ``labels`` differ in length.
        OSError: Propagated from the first failing copy (e.g. missing source file).

    Note:
        Files are copied under their original base name, so two sources with the
        same base name landing in the same cluster overwrite each other.
    """
    labels = list(labels)
    if len(image_files) != len(labels):
        raise ValueError(
            f"image_files ({len(image_files)}) and labels ({len(labels)}) "
            "must have the same length"
        )

    # Create all destination folders up front, in the calling thread. Doing a
    # check-then-create inside the workers lets two threads race on the same
    # folder, and the loser's FileExistsError would abort that image's copy.
    os.makedirs(output_dir, exist_ok=True)
    cluster_folders = {
        label: os.path.join(output_dir, f"cluster_{label}") for label in set(labels)
    }
    for cluster_folder in cluster_folders.values():
        os.makedirs(cluster_folder, exist_ok=True)

    def copy_file(pair):
        source_path, label = pair
        copy(source_path, cluster_folders[label])

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map only re-raises a worker's exception when its result is
        # retrieved, so the iterator must be drained or failures go unnoticed.
        for _ in executor.map(copy_file, zip(image_files, labels)):
            pass

# Define paths and save clustered images
base_dir = 'C:\\Users\\Berkay\\Desktop\\flower\\flower_photos'
output_dir = 'C:\\Users\\Berkay\\Desktop\\flower_ensemble_clustered'
_, image_files = create_labels_from_paths(base_dir)

# gmm_labels[i] is applied to image_files[i], so the two must line up one-to-one.
# Stop before copying anything if the image folder no longer matches the
# feature file the labels were computed from.
if len(image_files) != len(gmm_labels):
    raise ValueError(
        f"Found {len(image_files)} image files under {base_dir} but "
        f"{len(gmm_labels)} cluster labels; they must match one-to-one."
    )

save_clusters_to_folders(image_files, gmm_labels, output_dir)
print(f"Clustered images saved to {output_dir}")

Clustered images saved to C:\Users\Berkay\Desktop\flower_ensemble_clustered


In [9]:
# Step 9: Extract original class labels from folders
# This function extracts true labels based on original folder structure.
def extract_class_labels(folder_path):
    """Derive one integer class label per image from the folder layout.

    Each sub-directory of ``folder_path`` is one class; classes are numbered in
    sorted name order. Non-directory entries directly under ``folder_path``
    (e.g. a LICENSE file) are skipped rather than being listed as a class.

    Args:
        folder_path: Directory containing one sub-directory per class.

    Returns:
        tuple: ``(class_labels, class_names)`` where ``class_labels`` is an
        integer NumPy array with one entry per file found in the class folders
        (grouped by class, in class order) and ``class_names[label]`` is the
        folder name of that class.
    """
    class_names = sorted(
        entry for entry in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, entry))
    )

    class_labels = []
    for label, class_name in enumerate(class_names):
        # Only the number of files matters here; their names are not used.
        image_count = len(os.listdir(os.path.join(folder_path, class_name)))
        class_labels.extend([label] * image_count)

    return np.array(class_labels, dtype=int), class_names

In [10]:
# Step 10: Compare label lengths
# Check if true labels and clustered labels have the same length.
def check_label_lengths(true_labels, clustered_labels):
    """Report whether the two label sequences have the same number of entries.

    Prints a confirmation when the lengths agree and an error message showing
    both lengths when they do not.

    Returns:
        bool: True when the lengths are equal, False otherwise.
    """
    lengths_match = len(true_labels) == len(clustered_labels)
    if lengths_match:
        print("True labels and Clustered labels have the same length.")
        return True

    print(f"Error: True labels ({len(true_labels)}) and Clustered labels ({len(clustered_labels)}) are not the same length!")
    return False

In [11]:
# Step 11: Display dominant flower types in each cluster
# Calculate the distribution of original flower types within each cluster.
def display_flower_types_in_clusters(true_labels, cluster_labels, class_names, threshold=50):
    """Print, for every cluster, the flower types that dominate it.

    Args:
        true_labels: Integer class label per image; used as an index into
            ``class_names``.
        cluster_labels: Cluster label per image, aligned with ``true_labels``.
        class_names: Sequence mapping a class label to its display name.
        threshold: Percentage a flower type must strictly exceed within a
            cluster to be reported as dominant. Defaults to 50, which
            reproduces the original fixed cut-off.

    Clusters are reported in the order in which they first appear in
    ``cluster_labels``.
    """
    df = pd.DataFrame({'TrueLabel': true_labels, 'Cluster': cluster_labels})
    for cluster in df['Cluster'].unique():
        cluster_df = df[df['Cluster'] == cluster]
        total_images = len(cluster_df)
        flower_counts = Counter(cluster_df['TrueLabel'])
        flower_percentages = {
            class_names[key]: (value / total_images) * 100
            for key, value in flower_counts.items()
        }

        dominant_flowers = {
            flower: percentage
            for flower, percentage in flower_percentages.items()
            if percentage > threshold
        }
        print(f"Cluster {cluster} (Size: {total_images}):")
        if dominant_flowers:
            print(f" Dominant flower types (>{threshold:g}%):")
            # With thresholds below 50 several types can qualify; list the
            # largest share first. At the default at most one type qualifies.
            ranked = sorted(dominant_flowers.items(), key=lambda item: item[1], reverse=True)
            for flower, percentage in ranked:
                print(f"  {flower}: {percentage:.2f}%")
        else:
            print(f" No dominant flower type over {threshold:g}%.")
        print()

In [12]:
# Step 12: Run comparison and display results
# Read the ground-truth labels off the original folder layout, then report the
# per-cluster composition - but only when both label sets have equal length.
# NOTE(review): the comparison pairs true_labels[i] with gmm_labels[i]; this
# presumes the feature rows follow the same folder order - confirm.
folder_path = r"C:\Users\Berkay\Desktop\flower\flower_photos"
true_labels, class_names = extract_class_labels(folder_path)

lengths_match = check_label_lengths(true_labels, gmm_labels)
if not lengths_match:
    print("Cannot display clusters because label lengths do not match.")
else:
    display_flower_types_in_clusters(true_labels, gmm_labels, class_names)

True labels and Clustered labels have the same length.
Cluster 1 (Size: 538):
 Dominant flower types (>50%):
  daisy: 82.53%

Cluster 4 (Size: 1007):
 Dominant flower types (>50%):
  sunflowers: 50.84%

Cluster 0 (Size: 657):
 Dominant flower types (>50%):
  roses: 70.62%

Cluster 3 (Size: 832):
 Dominant flower types (>50%):
  tulips: 72.48%

Cluster 2 (Size: 636):
 Dominant flower types (>50%):
  dandelion: 98.43%
