In [1]:
import pandas as pd
#load the dataset
data = pd.read_csv("/content/balance-scale.data")

In [2]:
import numpy as np
from sklearn.cluster import KMeans, MeanShift, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


In [4]:
X = data.iloc[:, 1:].values
# Preprocessing methods
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

transformer = PowerTransformer()
X_transformed = transformer.fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

X_t_n = transformer.fit_transform(X_normalized)
X_t_n_pca = pca.fit_transform(X_t_n)

In [5]:

# Define clustering methods
def perform_clustering(X, method="kmeans", clusters=[3, 4, 5]):
    results = []
    for c in clusters:
        if method == "kmeans":
            model = KMeans(n_clusters=c, random_state=42, n_init=10)
        elif method == "hierarchical":
            model = AgglomerativeClustering(n_clusters=c)
        elif method == "meanshift":
            model = MeanShift()
        else:
            raise ValueError("Invalid method")

        labels = model.fit_predict(X)

        silhouette = silhouette_score(X, labels) if len(set(labels)) > 1 else np.nan
        calinski = calinski_harabasz_score(X, labels)
        davies = davies_bouldin_score(X, labels) if len(set(labels)) > 1 else np.nan

        results.append([c, silhouette, calinski, davies])

    return pd.DataFrame(results, columns=["Clusters", "Silhouette", "Calinski-Harabasz", "Davies-Bouldin"])


In [6]:
# Perform clustering on all preprocessing variations
methods = ["kmeans", "hierarchical", "meanshift"]
datasets = {
    "No Data Processing": X,
    "Using Normalization": X_normalized,
    "Using Transform": X_transformed,
    "Using PCA": X_pca,
    "Using T+N": X_t_n,
    "T+N+PCA": X_t_n_pca,
}

In [7]:
# Generate Results
final_results = {}

for method in methods:
    method_results = {}
    for name, data in datasets.items():
        try:
            df_result = perform_clustering(data, method=method)
            method_results[name] = df_result
        except:
            method_results[name] = None  # If clustering fails
    final_results[method] = method_results

In [8]:
# Display formatted table
for method, results in final_results.items():
    print(f"\n### {method.upper()} CLUSTERING RESULTS ###\n")
    for name, df_result in results.items():
        print(f"\n**{name}**")
        if df_result is not None:
            print(df_result.to_string(index=False))


### KMEANS CLUSTERING RESULTS ###


**No Data Processing**
 Clusters  Silhouette  Calinski-Harabasz  Davies-Bouldin
        3    0.174466         136.674065        1.690472
        4    0.190545         136.108490        1.480735
        5    0.188296         134.064355        1.389558

**Using Normalization**
 Clusters  Silhouette  Calinski-Harabasz  Davies-Bouldin
        3    0.174466         136.674065        1.690472
        4    0.190986         136.221537        1.482004
        5    0.188311         133.993363        1.385375

**Using Transform**
 Clusters  Silhouette  Calinski-Harabasz  Davies-Bouldin
        3    0.184227         142.482152        1.685186
        4    0.187092         137.842444        1.530177
        5    0.191315         136.889557        1.376939

**Using PCA**
 Clusters  Silhouette  Calinski-Harabasz  Davies-Bouldin
        3    0.343051         405.367021        0.919147
        4    0.338763         438.087470        0.878220
        5    0.329847   