In [3]:
from pycaret.clustering import *
import pandas as pd

In [4]:
data = pd.read_csv("./Customers.csv")

In [5]:
num_clusters = [3, 4, 5]

In [6]:
results_dict = {
    "kmeans": [],
    "hclust": [],
    "meanshift": [],
}

In [7]:
def collect_metrics(model):
    performance = pull()
    return {
        "Silhouette": performance["Silhouette"].iloc[0],
        "Calinski-Harabasz": performance["Calinski-Harabasz"].iloc[0],
        "Davies-Bouldin": performance["Davies-Bouldin"].iloc[0],
    }

In [8]:
def run_clustering(clustering_type, data, preprocess_desc, **kwargs):
    for clusters in num_clusters:
        setup_data = setup(data=data, verbose=False, **kwargs)
        model = create_model(clustering_type, num_clusters=clusters)
        metrics = collect_metrics(model)
        results_dict[clustering_type].append(
            {"Preprocessing": preprocess_desc, "Clusters": clusters, **metrics}
        )

K MEANS CLUSTERING

In [9]:
run_clustering("kmeans", data, preprocess_desc="No Data Processing")
run_clustering("kmeans", data, preprocess_desc="Using Normalization", normalize=True)
run_clustering("kmeans", data, preprocess_desc="Using Transformation", transformation=True)
run_clustering("kmeans", data, preprocess_desc="Using PCA", pca=True)
run_clustering("kmeans", data, preprocess_desc="Using T+N", transformation=True, normalize=True)
run_clustering("kmeans", data, preprocess_desc="Using T+N+PCA", transformation=True, normalize=True, pca=True)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5567,6012.5623,0.5565,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5309,6977.0857,0.5804,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5585,9048.4127,0.5008,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1363,164.715,2.3272,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1506,190.3317,2.3197,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1696,195.1291,1.678,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5561,5807.1667,0.5612,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5392,6752.4719,0.5636,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5613,8989.2378,0.4988,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5567,6012.5623,0.5565,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5549,6806.1002,0.5056,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5583,9054.6569,0.5016,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1303,171.2776,2.6034,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1409,196.1121,2.4284,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1912,207.5749,2.0878,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0942,144.5202,2.8686,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.146,184.8677,2.1637,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1705,206.3691,2.0211,0,0,0


Hierarchical Clustering

In [10]:
run_clustering("hclust", data, preprocess_desc="No Data Processing")
run_clustering("hclust", data, preprocess_desc="Using Normalization", normalize=True)
run_clustering("hclust", data, preprocess_desc="Using Transformation", transformation=True)
run_clustering("hclust", data, preprocess_desc="Using PCA", pca=True)
run_clustering("hclust", data, preprocess_desc="Using T+N", transformation=True, normalize=True)
run_clustering("hclust", data, preprocess_desc="Using T+N+PCA", transformation=True, normalize=True, pca=True)


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5012,4448.9974,0.5678,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5193,6465.2449,0.5535,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5312,7687.4171,0.5015,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1037,188.735,2.5588,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1498,198.0505,2.375,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1898,211.929,2.1875,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4587,4185.8002,0.5562,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4866,5483.6902,0.5451,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5204,7302.1568,0.4788,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5012,4448.9974,0.5678,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5193,6465.2449,0.5535,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5312,7687.4171,0.5015,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1036,188.745,2.5595,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1498,198.0706,2.3753,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1895,211.9212,2.1875,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1036,188.745,2.5595,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1498,198.0706,2.3753,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1895,211.9212,2.1875,0,0,0


Mean-Shift Clustering

In [11]:
run_clustering("meanshift", data, preprocess_desc="No Data Processing")
run_clustering("meanshift", data, preprocess_desc="Using Normalization", normalize=True)
run_clustering("meanshift", data, preprocess_desc="Using Transformation", transformation=True)
run_clustering("meanshift", data, preprocess_desc="Using PCA", pca=True)
run_clustering("meanshift", data, preprocess_desc="Using T+N", transformation=True, normalize=True)
run_clustering("meanshift", data, preprocess_desc="Using T+N+PCA", transformation=True, normalize=True, pca=True)


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6167,5227.1638,0.5092,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6167,5227.1638,0.5092,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6167,5227.1638,0.5092,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.299,140.0445,1.0656,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.299,140.0445,1.0656,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.299,140.0445,1.0656,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6098,4876.7662,0.506,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6098,4876.7662,0.506,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6098,4876.7662,0.506,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6167,5227.1638,0.5092,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6167,5227.1638,0.5092,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6167,5227.1638,0.5092,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2982,140.2256,1.0591,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2982,140.2256,1.0591,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2982,140.2256,1.0591,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2982,140.2256,1.0591,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2982,140.2256,1.0591,0,0,0


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2982,140.2256,1.0591,0,0,0


In [12]:
kmeans_df = pd.DataFrame(results_dict["kmeans"])
hclust_df = pd.DataFrame(results_dict["hclust"])
meanshift_df = pd.DataFrame(results_dict["meanshift"])

In [13]:
print("K-Means Results")
print(kmeans_df)

print("\nHierarchical Clustering Results")
print(hclust_df)

print("\nMean-Shift Clustering Results")
print(meanshift_df)

K-Means Results
           Preprocessing  Clusters  Silhouette  Calinski-Harabasz  \
0     No Data Processing         3      0.5567          6012.5623   
1     No Data Processing         4      0.5309          6977.0857   
2     No Data Processing         5      0.5585          9048.4127   
3    Using Normalization         3      0.1363           164.7150   
4    Using Normalization         4      0.1506           190.3317   
5    Using Normalization         5      0.1696           195.1291   
6   Using Transformation         3      0.5561          5807.1667   
7   Using Transformation         4      0.5392          6752.4719   
8   Using Transformation         5      0.5613          8989.2378   
9              Using PCA         3      0.5567          6012.5623   
10             Using PCA         4      0.5549          6806.1002   
11             Using PCA         5      0.5583          9054.6569   
12             Using T+N         3      0.1303           171.2776   
13             Usi