In [12]:
import numpy as np

""" Clustering Algorithm """
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

""" 
Metrics for choosing the number of clusters.
Note:
Silhouette Score: the best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.
Davies-Bouldin Score: the minimum score is zero, with lower values indicating better clustering.
Calinski-Harabasz Score: the score is a positive floating-point value, where higher values indicate better clustering.
"""
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [13]:
# """ The clustering algorithm inside the dictionary datatype or a key-value pair data structure """
# random_state=42
# CLUSTERS_DICT = {
#     'kmeans': KMeans(random_state=random_state),
#     'spectral': SpectralClustering(random_state=random_state),
#     'hierarchical': AgglomerativeClustering(linkage='ward'),
#     'agglomerative': AgglomerativeClustering(),
#     'gaussian': GaussianMixture(random_state=random_state)
# }

In [16]:
""" The clustering algorithm and metrics score inside the dictionary datatype or a key-value pair data structure """
# Clustering estimators, keyed by a short name. The values are the classes
# themselves (not instances), so constructor arguments are chosen by the caller.
CLUSTERS_DICT = dict(
    kmeans=KMeans,
    spectral=SpectralClustering,
    hierarchical=AgglomerativeClustering,
    agglomerative=AgglomerativeClustering,
    gaussian=GaussianMixture,
)

# Cluster-quality scoring functions, keyed by a short name.
METRICS_DICT = dict(
    silhouette=silhouette_score,
    davies=davies_bouldin_score,
    calinski=calinski_harabasz_score,
)

In [51]:
class Clustering:
    """Search for the best number of clusters for one clustering algorithm.

    For every candidate ``k`` the chosen algorithm is fitted on ``X`` and the
    resulting labels are scored with a cluster-quality metric; the ``k`` with
    the best score is reported.

    Parameters
    ----------
    X
        Data handed unchanged to the estimator and to the metric.
    cluster_algo
        The estimator *class* (not an instance). Supported: ``KMeans``,
        ``SpectralClustering``, ``AgglomerativeClustering`` and
        ``GaussianMixture``.
    random_state : int, default=42
        Seed forwarded to every estimator that accepts one.

    Attributes
    ----------
    metric
        The metric passed to the latest ``find_n_cluster`` call (``None`` before).
    n_cluster
        Best ``k`` found by the latest ``find_n_cluster`` call (``None`` before).
    metric_best_score
        Score obtained for ``n_cluster`` (``None`` before the first search).
    metric_scores
        List with one score per candidate ``k``, in order, starting at ``k=2``
        (``None`` before the first search).
    """

    # Names (short aliases and sklearn function names) of metrics for which a
    # LOWER score means a better clustering. Everything else is maximised.
    _LOWER_IS_BETTER = frozenset({"davies", "davies_bouldin", "davies_bouldin_score"})

    def __init__(self, X, cluster_algo, random_state=42):
        self.X = X
        self.cluster_algo = cluster_algo  # reference to an estimator class
        self.metric = None
        self.n_cluster = None
        self.metric_best_score = None
        self.metric_scores = None
        self.random_state = random_state

    def find_n_cluster(self, metric, max_range=10):
        """Evaluate ``k = 2 .. max_range`` and return the best one.

        Parameters
        ----------
        metric
            Either a scoring callable ``metric(X, labels) -> float`` (for
            example ``silhouette_score``) or one of the names ``'silhouette'``,
            ``'davies'`` / ``'davies_bouldin'``, ``'calinski'`` (the full
            sklearn function names are accepted too).
        max_range : int, default=10
            Largest number of clusters to try (inclusive).

        Returns
        -------
        tuple
            ``(best_k, best_score)``. When ``max_range < 2`` nothing is
            evaluated and ``best_k`` is ``None``.

        Raises
        ------
        ValueError
            If ``metric`` is an unknown name, or the clustering algorithm is
            not one of the supported classes.
        """
        self.metric = metric
        self.n_cluster, self.metric_best_score = self._compute_n(self.cluster_algo, metric, max_range)
        return self.n_cluster, self.metric_best_score

    def _resolve_metric(self, metric):
        """Return ``(score_function, lower_is_better)`` for ``metric``."""
        if callable(metric):
            # Direction is derived from the function name so that passing
            # ``davies_bouldin_score`` itself is minimised correctly.
            return metric, getattr(metric, "__name__", "") in self._LOWER_IS_BETTER

        # The name table is built only on this path, so callers that pass a
        # callable never depend on it.
        by_name = {
            "silhouette": silhouette_score,
            "silhouette_score": silhouette_score,
            "davies": davies_bouldin_score,
            "davies_bouldin": davies_bouldin_score,
            "davies_bouldin_score": davies_bouldin_score,
            "calinski": calinski_harabasz_score,
            "calinski_harabasz": calinski_harabasz_score,
            "calinski_harabasz_score": calinski_harabasz_score,
        }
        if not isinstance(metric, str) or metric not in by_name:
            raise ValueError(f"Unsupported metric: {metric!r}")
        return by_name[metric], metric in self._LOWER_IS_BETTER

    def _compute_n(self, cluster_algo, metric, max_range):
        """Score every candidate ``k`` and pick the best.

        ``cluster_algo`` is kept for signature compatibility; the estimator
        actually used is ``self.cluster_algo``. Fills ``self.metric_scores``
        and returns ``(best_k, best_score)``.
        """
        score_fn, lower_is_better = self._resolve_metric(metric)

        self.metric_scores = []
        best_score = float('inf') if lower_is_better else float('-inf')
        best_n_cluster = None
        for k in range(2, max_range + 1):
            y_labels = self._create_cluster_instance_fit_predict(k)
            # Score with the requested metric (not a hard-coded one).
            score = score_fn(self.X, y_labels)
            self.metric_scores.append(score)
            improved = score < best_score if lower_is_better else score > best_score
            if improved:
                best_score = score
                best_n_cluster = k
        return best_n_cluster, best_score

    def _create_cluster_instance_fit_predict(self, k):
        """Fit ``self.cluster_algo`` with ``k`` clusters on ``self.X``; return the labels.

        Raises ``ValueError`` for an estimator class that is not supported.
        """
        algo = self.cluster_algo

        # KMeans
        if algo is KMeans:
            model = algo(n_clusters=k, random_state=self.random_state)
            return model.fit_predict(self.X)

        # SpectralClustering
        if algo is SpectralClustering:
            model = algo(n_clusters=k, random_state=self.random_state, affinity='nearest_neighbors')
            return model.fit_predict(self.X)

        # Ward hierarchical clustering. AgglomerativeClustering takes no
        # random_state argument, so passing one raises a TypeError.
        if algo is AgglomerativeClustering:
            model = algo(n_clusters=k, linkage='ward')
            return model.fit_predict(self.X)

        # GaussianMixture (uses n_components rather than n_clusters)
        if algo is GaussianMixture:
            model = algo(n_components=k, random_state=self.random_state)
            model.fit(self.X)
            return model.predict(self.X)

        # Unknown clustering algorithm
        raise ValueError(f"Unsupported clustering algorithm: {algo}")

In [52]:
from sklearn.datasets import make_blobs
import pandas as pd

# Synthetic dataset settings: number of data points, features and blob centres
n_samples, n_features, n_clusters = 500, 5, 3

# Draw the blobs with a fixed seed so the cell is reproducible
X, y = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_clusters,
    random_state=42,
)

# Wrap the feature matrix in a labelled DataFrame for easier inspection
columns = ['Voltage', 'Current', 'Power Factor', 'Frequency', 'Power']
df = pd.DataFrame(data=X, columns=columns)
# df['Cluster'] = y  # Add cluster labels for reference (optional)

# Preview the first few rows
print(df.head())


    Voltage   Current  Power Factor  Frequency     Power
0 -9.767521  9.480710      6.487488  -6.462790 -7.361530
1 -8.560889  8.734477      6.489258  -4.808246 -6.589762
2 -3.327672  7.511521      4.379144   0.847751 -7.028664
3 -7.431058 -8.789943      6.635979   1.891384  3.126497
4 -8.664148  8.975563      7.325451  -6.289177 -5.823476


In [53]:
# Sweep the candidate cluster counts with KMeans, scoring each by silhouette.
kmeans = Clustering(X=X, cluster_algo=CLUSTERS_DICT['kmeans'])
n_cluster, best_score = kmeans.find_n_cluster(metric=METRICS_DICT['silhouette'])
print(f'Number of best cluster: {n_cluster}; Best Score: {best_score};')

Number of best cluster: 3; Best Score: 0.7718348236319091;


In [54]:
# One score per candidate k tried by find_n_cluster, in order, starting at k=2.
kmeans.metric_scores

[np.float64(0.7422772235562364),
 np.float64(0.7718348236319091),
 np.float64(0.5790579710600725),
 np.float64(0.3904968274566857),
 np.float64(0.15392219409234362),
 np.float64(0.15249154274279758),
 np.float64(0.15143073182593317),
 np.float64(0.15517915916121405),
 np.float64(0.14892206257592097)]