In [6]:
import numpy as np

""" Clustering Algorithm """
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

""" 
Metrics for Number of Clusters
Note 
Silhouette Score: The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. 
Davies Bouldin Score: The minimum score is zero, with lower values indicating better clustering.
Calinski Harabasz Score: The score is a positive floating-point value, where higher values indicate better clustering.
"""
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

"""
Scaler for preprocessing
"""
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [7]:
# """ The clustering algorithm inside the dictionary datatype or a key-value pair data structure """
# random_state=42
# CLUSTERS_DICT = {
#     'kmeans': KMeans(random_state=random_state),
#     'spectral': SpectralClustering(random_state=random_state),
#     'hierarchical': AgglomerativeClustering(linkage='ward'),
#     'agglomerative': AgglomerativeClustering(),
#     'gaussian': GaussianMixture(random_state=random_state)
# }

In [8]:
""" The clustering algorithm and metrics score inside the dictionary datatype or a key-value pair data structure """
#clusters
# CLUSTERS_DICT = {
#     'kmeans': KMeans,
#     'spectral': SpectralClustering,
#     'hierarchical': AgglomerativeClustering,
#     'agglomerative': AgglomerativeClustering,
#     'gaussian': GaussianMixture
# }



' The clustering algorithm and metrics score inside the dictionary datatype or a key-value pair data structure '

In [9]:
class Clustering:
    """Search for the best number of clusters for one algorithm and metric.

    For every ``k`` from 2 up to a caller-supplied maximum, the selected
    algorithm is fitted on the (optionally scaled) data, the resulting labels
    are scored with the selected metric, and the ``k`` with the best score is
    kept.

    Attributes:
        random_state: Seed forwarded to the algorithms that accept one.
        cluster_algo: Key into ``CLUSTERS_DICT`` naming the algorithm.
        metric: Key into ``METRICS_DICT`` naming the scoring function.
        n_cluster: Best ``k`` found by the last ``find_n_cluster`` call, or
            ``None`` before the first call.
        metric_best_score: Score obtained at ``n_cluster``, or ``None`` before
            the first call.
        metric_scores: Scores of the last search in ascending ``k`` order
            (index 0 corresponds to ``k == 2``).
        scaler: Fitted scaler instance, or ``None`` when no scaling was requested.
        X: Data the algorithms are fitted on (scaled when a scaler was given).
    """

    # Factories taking (k, random_state) and returning an unfitted estimator.
    # Every factory shares the same signature so they can be called uniformly,
    # even though the agglomerative variants have no random_state to set.
    CLUSTERS_DICT = {
        'kmeans': lambda k, random_state: KMeans(n_clusters=k, random_state=random_state),
        'spectral': lambda k, random_state: SpectralClustering(n_clusters=k, random_state=random_state, affinity='nearest_neighbors'),
        'hierarchical': lambda k, random_state: AgglomerativeClustering(n_clusters=k, linkage='ward'),
        # NOTE(review): scikit-learn documents 'ward' as the default linkage, so this
        # entry appears to build the same model as 'hierarchical' - confirm intent.
        'agglomerative': lambda k, random_state: AgglomerativeClustering(n_clusters=k),
        'gaussian': lambda k, random_state: GaussianMixture(n_components=k, random_state=random_state),
    }

    # Scoring functions called as score(X, labels).
    METRICS_DICT = {
        'silhouette': silhouette_score,
        'davies': davies_bouldin_score,
        'calinski': calinski_harabasz_score
    }

    # Metrics for which a smaller score means a better clustering; every other
    # metric in METRICS_DICT is maximised.
    _LOWER_IS_BETTER = frozenset({'davies'})

    def __init__(self, X, model, metric='silhouette', scaler_class=None, random_state=42):
        """Store the configuration and optionally scale the data.

        Args:
            X: Feature matrix to cluster.
            model: Key of ``CLUSTERS_DICT`` selecting the algorithm.
            metric: Key of ``METRICS_DICT`` selecting the score.
            scaler_class: Scaler class (not instance) exposing ``fit_transform``;
                when given, it is instantiated with no arguments and fitted on ``X``.
            random_state: Seed passed to the estimator factories.
        """
        self.random_state = random_state
        self.cluster_algo = model
        self.metric = metric
        # Defined up front so the attribute exists before find_n_cluster runs.
        self.n_cluster = None
        self.metric_best_score = None
        self.metric_scores = []

        if scaler_class is None:
            self.scaler = None
            self.X = X
        else:
            self.scaler = scaler_class()
            self.X = self.scaler.fit_transform(X)

    def find_n_cluster(self, max_range=10):
        """Evaluate ``k = 2 .. max_range`` and remember the best result.

        Args:
            max_range: Largest number of clusters to try (inclusive).

        Returns:
            Tuple ``(best_k, best_score)``; both are also stored on the instance.

        Raises:
            ValueError: If ``max_range`` is smaller than 2.
            KeyError: If ``self.cluster_algo`` or ``self.metric`` is not a known key.
        """
        self.n_cluster, self.metric_best_score = self._compute_n(self.cluster_algo, self.metric, max_range)
        return self.n_cluster, self.metric_best_score

    def _compute_n(self, cluster_algo, metric, max_range):
        """Run the search over ``k`` and return ``(best_k, best_score)``.

        ``self.metric_scores`` is replaced by the scores of this run.
        """
        if max_range < 2:
            # An empty search would otherwise return (None, +/-inf) silently.
            raise ValueError(f"max_range must be at least 2, got {max_range}")

        # Resolve the scorer before fitting anything so that an unknown metric
        # name fails immediately rather than after the first model is trained.
        score_fn = self.METRICS_DICT[metric]
        lower_is_better = metric in self._LOWER_IS_BETTER

        self.metric_scores = []
        best_score = float('inf') if lower_is_better else float('-inf')
        best_n_cluster = None

        for k in range(2, max_range + 1):
            y_labels = self._create_cluster_instance_fit_predict(k, cluster_algo)
            score = score_fn(self.X, y_labels)
            self.metric_scores.append(score)

            # Strict comparison: on ties the smallest k seen first is kept.
            improved = score < best_score if lower_is_better else score > best_score
            if improved:
                best_score = score
                best_n_cluster = k

        return best_n_cluster, best_score

    def _create_cluster_instance_fit_predict(self, k, cluster_algo=None):
        """Build the estimator for ``k`` clusters, fit it on ``self.X`` and return labels.

        Args:
            k: Number of clusters / mixture components.
            cluster_algo: Key of ``CLUSTERS_DICT``; defaults to ``self.cluster_algo``.
        """
        algo_key = self.cluster_algo if cluster_algo is None else cluster_algo
        cluster_instance = self.CLUSTERS_DICT[algo_key](k, self.random_state)

        if hasattr(cluster_instance, 'fit_predict'):
            return cluster_instance.fit_predict(self.X)

        # Estimators without fit_predict fall back to fit followed by predict.
        cluster_instance.fit(self.X)
        return cluster_instance.predict(self.X)

    def get_scaled_data(self):
        """
        Returns:
            - The scaled (or raw) data
            - Name of the scaler used, or 'None'
        """
        scaler_name = "None" if self.scaler is None else type(self.scaler).__name__
        return self.X, scaler_name

In [10]:
from sklearn.datasets import make_blobs
import pandas as pd

# Synthetic dataset: number of data points, number of features, number of blob centers.
n_samples, n_features, n_clusters = 500, 5, 3

X, y = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_clusters,
    random_state=42,
)

# Wrap the feature matrix in a labelled DataFrame so it is easier to read.
columns = ['Voltage', 'Current', 'Power Factor', 'Frequency', 'Power']
df = pd.DataFrame(data=X, columns=columns)
# df['Cluster'] = y  # Optional: keep the generated cluster labels for reference

# Preview the first few rows.
print(df.head())


    Voltage   Current  Power Factor  Frequency     Power
0 -9.767521  9.480710      6.487488  -6.462790 -7.361530
1 -8.560889  8.734477      6.489258  -4.808246 -6.589762
2 -3.327672  7.511521      4.379144   0.847751 -7.028664
3 -7.431058 -8.789943      6.635979   1.891384  3.126497
4 -8.664148  8.975563      7.325451  -6.289177 -5.823476


In [33]:
import pandas as pd

# Load the sensor export, then keep the first 1000 rows that have no missing
# value in any of the five selected feature columns.
df_sensor_readings = pd.read_csv('Lorega(raw)_anomalous.csv')
# df_sensor_readings.head()

data_X = (
    df_sensor_readings[['current', 'frequency', 'power', 'powerFactor', 'voltage']]
    .dropna()
    .iloc[:1000]
)
len(data_X)

1000

In [34]:
# Try k = 2..10 with k-means on standardized features; no metric is passed,
# so the class default ('silhouette') is used to pick the best k.
kmeans = Clustering(
    X=data_X,
    model='kmeans',
    scaler_class=StandardScaler,
)
n_cluster, best_score = kmeans.find_n_cluster(max_range=10)
print(f'Number of best cluster: {n_cluster}; Best Score: {best_score};')

Number of best cluster: 9; Best Score: 0.6861919127435481;


In [35]:
# Scores recorded by the search above, one per candidate k in ascending order
# (index 0 is k=2, the last entry is k=max_range).
kmeans.metric_scores

[np.float64(0.5184147225013972),
 np.float64(0.5521798703552381),
 np.float64(0.6485967052822541),
 np.float64(0.649231973785181),
 np.float64(0.650886668730593),
 np.float64(0.5949171162031216),
 np.float64(0.6352412101006863),
 np.float64(0.6861919127435481),
 np.float64(0.6629215082413686)]

In [36]:
# get_scaled_data() returns the data the models were fitted on together with
# the class name of the scaler that produced it ('None' when no scaler was used).
scaled_data, scaler = kmeans.get_scaled_data()
print(f'Scaler: {scaler}')

Scaler: StandardScaler


In [38]:
# Display the scaled feature matrix returned by get_scaled_data().
# NOTE(review): column order should match the columns selected for data_X - confirm.
scaled_data

array([[-1.75249189e-02, -7.83807980e-01,  2.86838605e-02,
         4.61199504e-02,  6.38107591e-01],
       [-6.96022754e+00, -7.83807980e-01, -6.35192589e+00,
         6.33635243e-01,  3.29843054e-01],
       [-1.98246471e+01, -7.83807980e-01, -1.93914765e+01,
        -2.87421294e+01,  8.43617282e-01],
       ...,
       [ 3.90869353e-01, -7.83807980e-01,  2.49248148e-01,
         4.61199504e-02, -1.82800870e+00],
       [ 4.02880949e-01, -7.83807980e-01,  2.43996617e-01,
         4.61199504e-02, -2.03351839e+00],
       [ 4.14892545e-01, -7.83807980e-01,  2.49248148e-01,
         4.61199504e-02, -2.03351839e+00]])