<a href="https://colab.research.google.com/github/aditisinha427/Clustering/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [2]:
# Load the iris dataset and expose its feature matrix as a DataFrame whose
# columns carry the dataset's own feature names.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=list(iris.feature_names))

In [3]:
def preprocess_data(X, method):
    """Apply one of the named preprocessing pipelines to ``X``.

    Parameters
    ----------
    X : feature matrix
        Passed straight to numpy / scikit-learn. In this notebook it is the
        iris feature DataFrame.
    method : str
        Name of the pipeline to apply:

        * ``"none"``      - return ``X`` itself (same object, no copy).
        * ``"normalize"`` - ``MinMaxScaler`` fitted on ``X``.
        * ``"transform"`` - element-wise ``np.log1p``.
        * ``"pca"``       - ``PCA`` keeping 2 components.
        * ``"t+n"``       - ``log1p`` followed by ``MinMaxScaler``.
        * ``"t+n+pca"``   - ``log1p``, ``MinMaxScaler``, then 2-component ``PCA``.

    Returns
    -------
    The preprocessed data. ``"none"`` hands back the input object and
    ``"transform"`` returns whatever ``np.log1p`` produces for the input
    type; every other pipeline returns the array produced by the
    scikit-learn ``fit_transform`` call.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names listed above. Without this
        check an unknown name would silently yield ``None`` and only fail
        later, inside the clustering call.
    """
    if method == "none":
        return X
    if method == "normalize":
        return MinMaxScaler().fit_transform(X)
    if method == "transform":
        return np.log1p(X)
    if method == "pca":
        return PCA(n_components=2).fit_transform(X)
    if method == "t+n":
        return MinMaxScaler().fit_transform(np.log1p(X))
    if method == "t+n+pca":
        X_tn = MinMaxScaler().fit_transform(np.log1p(X))
        return PCA(n_components=2).fit_transform(X_tn)

    valid_methods = ("none", "normalize", "transform", "pca", "t+n", "t+n+pca")
    raise ValueError(
        f"Unknown preprocessing method {method!r}; expected one of {valid_methods}"
    )

In [8]:
def run_clustering(X, algo, n_clusters=None, random_state=42):
    """Fit one clustering algorithm on ``X`` and score the resulting labels.

    Parameters
    ----------
    X : feature matrix
        Data to cluster; also passed to the three scoring functions.
    algo : str
        ``"kmeans"``, ``"hierarchical"`` (``AgglomerativeClustering``) or
        ``"meanshift"``.
    n_clusters : int, optional
        Number of clusters. Required for ``"kmeans"`` and ``"hierarchical"``;
        not used for ``"meanshift"``, which is built with default settings.
    random_state : int, optional
        Seed forwarded to ``KMeans`` so repeated runs give the same labels
        and scores. The other two estimators are not given a seed.

    Returns
    -------
    dict or None
        ``None`` when ``algo`` is not recognised. Otherwise a dict with keys
        ``"silhouette"``, ``"calinski"``, ``"davies"`` and ``"labels"``.
        The three scores are ``nan`` when the labelling has fewer than two
        clusters or as many clusters as samples, because the scikit-learn
        metrics reject such labellings.

    Raises
    ------
    ValueError
        If ``algo`` needs a cluster count and ``n_clusters`` is ``None``.
    """
    # Fail early with a clear message instead of an estimator-level error.
    if algo in ("kmeans", "hierarchical") and n_clusters is None:
        raise ValueError(f"n_clusters must be provided for algo={algo!r}")

    if algo == "kmeans":
        model = KMeans(n_clusters=n_clusters, n_init='auto', random_state=random_state)
    elif algo == "hierarchical":
        model = AgglomerativeClustering(n_clusters=n_clusters)
    elif algo == "meanshift":
        model = MeanShift()
    else:
        return None

    labels = model.fit_predict(X)

    # The metrics need 2 <= n_labels <= n_samples - 1; report nan otherwise
    # so one degenerate configuration does not abort a whole sweep.
    n_found = len(np.unique(labels))
    if 1 < n_found < len(labels):
        silhouette = silhouette_score(X, labels)
        calinski = calinski_harabasz_score(X, labels)
        davies = davies_bouldin_score(X, labels)
    else:
        silhouette = calinski = davies = np.nan

    return {
        "silhouette": silhouette,
        "calinski": calinski,
        "davies": davies,
        "labels": labels
    }

In [9]:
# Sweep configuration: the preprocessing pipeline names to try and the
# cluster counts used for the algorithms that take one.
preprocess_methods = [
    "none",
    "normalize",
    "transform",
    "pca",
    "t+n",
    "t+n+pca",
]
cluster_range = list(range(3, 6))

# Accumulates one row per clustering run.
results = list()


In [10]:
# Run every (algorithm, preprocessing, cluster count) combination and record
# the three quality scores of each run as one row in `results`.

# Start from an empty list so that re-running this cell does not append a
# second copy of every row to whatever a previous execution left behind.
results = []

# Preprocessing does not depend on the algorithm, so compute each variant once.
preprocessed = {method: preprocess_data(X, method) for method in preprocess_methods}

for algo in ["kmeans", "hierarchical", "meanshift"]:
    for method in preprocess_methods:
        X_pre = preprocessed[method]
        if algo == "meanshift":
            # No cluster count is passed for meanshift; the row is labelled "auto".
            res = run_clustering(X_pre, algo)
            results.append([algo, method, "auto", res['silhouette'], res['calinski'], res['davies']])
        else:
            for c in cluster_range:
                res = run_clustering(X_pre, algo, n_clusters=c)
                results.append([algo, method, c, res['silhouette'], res['calinski'], res['davies']])

In [11]:
# Turn the collected rows into a table and round the scores to three
# decimals for display.
results_df = pd.DataFrame(
    data=results,
    columns=[
        "Algorithm",
        "Preprocessing",
        "Clusters",
        "Silhouette",
        "Calinski",
        "Davies",
    ],
).round(decimals=3)
results_df

Unnamed: 0,Algorithm,Preprocessing,Clusters,Silhouette,Calinski,Davies
0,kmeans,none,3,0.551,561.594,0.666
1,kmeans,none,4,0.498,530.766,0.78
2,kmeans,none,5,0.364,454.876,0.91
3,kmeans,normalize,3,0.483,351.295,0.787
4,kmeans,normalize,4,0.444,313.893,0.908
5,kmeans,normalize,5,0.438,252.956,0.894
6,kmeans,transform,3,0.572,974.176,0.628
7,kmeans,transform,4,0.502,839.179,0.774
8,kmeans,transform,5,0.461,683.666,0.914
9,kmeans,pca,3,0.598,693.708,0.565
