# 1.0 Installing neccessary libraries

In [1]:
%pip install -r requirements.txt

Collecting anyio>=3.1.0 (from jupyter-server<3,>=2.4.0->notebook->jupyter==1.0.0->-r requirements.txt (line 5))
  Using cached anyio-3.7.1-py3-none-any.whl.metadata (4.7 kB)
Using cached anyio-3.7.1-py3-none-any.whl (80 kB)
Installing collected packages: anyio
  Attempting uninstall: anyio
    Found existing installation: anyio 4.4.0
    Uninstalling anyio-4.4.0:
      Successfully uninstalled anyio-4.4.0
Successfully installed anyio-3.7.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip list

Package                   Version
------------------------- --------------
anyio                     3.7.1
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 2.4.1
async-lru                 2.0.4
attrs                     23.2.0
Babel                     2.15.0
beautifulsoup4            4.13.0b2
bleach                    6.1.0
certifi                   2024.7.4
cffi                      1.17.0rc1
charset-normalizer        3.3.2
colorama                  0.4.6
comm                      0.2.2
contourpy                 1.2.1
cycler                    0.12.1
debugpy                   1.8.2
decorator                 5.1.1
defusedxml                0.8.0rc2
executing                 2.0.1
fastjsonschema            2.20.0
fonttools                 4.53.1
fqdn                      1.5.1
h11                       0.12.0
httpcore                  0.13.7
httpx                     1.0.0b0
idna                      3.7
ipykern

In [1]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import pickle

from sklearn.cluster import AgglomerativeClustering, MeanShift, Birch, HDBSCAN, OPTICS
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

df_umap = pd.read_csv(r'./dimension_reduced_data/knn3_umap3.csv', index_col = 'CUST_ID')

df_clusters = pd.DataFrame(index = df_umap.index)
df_scores = pd.DataFrame(index = ['silhouette', 'davies_bouldin', 'calinski_harabasz'])


for n_clusters in range(2, 11):
    for linkage in ['ward', 'complete', 'average', 'single']:
        agglomerative_model = AgglomerativeClustering(n_clusters = n_clusters, metric = 'euclidean', linkage = linkage).fit(df_umap)
        
        df_clusters[f'agg_ncluster{n_clusters}_linkage{linkage}'] = agglomerative_model.labels_
        
        df_scores[f'agg_ncluster{n_clusters}_linkage{linkage}'] = [silhouette_score(df_umap, agglomerative_model.labels_), davies_bouldin_score(df_umap, agglomerative_model.labels_), calinski_harabasz_score(df_umap, agglomerative_model.labels_)]

In [9]:
from sklearn.cluster import estimate_bandwidth
import numpy as np

for quantile in [0.1, 0.2, 0.3, 0.4, 0.5]:
    # Bandwidth setting
    bandwidth = estimate_bandwidth(df_umap, quantile=quantile)

    # Creating and fitting the Mean Shift model
    ms_model= MeanShift(bandwidth=bandwidth).fit(df_umap)

    df_clusters[f'ms_quantile{quantile}'] = ms_model.labels_

    if len(np.unique(ms_model.labels_)) > 1:
        df_scores[f'ms_quantile{quantile}'] = [silhouette_score(df_umap, ms_model.labels_), davies_bouldin_score(df_umap, ms_model.labels_), calinski_harabasz_score(df_umap, ms_model.labels_)]
    else:
        df_scores[f'ms_quantile{quantile}'] = [-1, -1, -1]

In [12]:
for threshold in [0.1, 0.3, 0.5, 0.7, 0.9]:
    for factor in [10, 20, 30, 40, 50]:
        for ncluster in [2, 3, 4, 5, 6]:
            # Creating and fitting the Birch model
            birch_model = Birch(threshold = threshold, branching_factor = factor, n_clusters = ncluster).fit(df_umap)

            df_clusters[f'birch_threshold{threshold}_factor{factor}_ncluster{ncluster}'] = birch_model.labels_

            df_scores[f'birch_threshold{threshold}_factor{factor}_ncluster{ncluster}'] = [silhouette_score(df_umap, birch_model.labels_), davies_bouldin_score(df_umap, birch_model.labels_), calinski_harabasz_score(df_umap, birch_model.labels_)]

  df_clusters[f'birch_threshold{threshold}_factor{factor}_ncluster{ncluster}'] = birch_model.labels_
  df_scores[f'birch_threshold{threshold}_factor{factor}_ncluster{ncluster}'] = [silhouette_score(df_umap, birch_model.labels_), davies_bouldin_score(df_umap, birch_model.labels_), calinski_harabasz_score(df_umap, birch_model.labels_)]
  df_clusters[f'birch_threshold{threshold}_factor{factor}_ncluster{ncluster}'] = birch_model.labels_
  df_scores[f'birch_threshold{threshold}_factor{factor}_ncluster{ncluster}'] = [silhouette_score(df_umap, birch_model.labels_), davies_bouldin_score(df_umap, birch_model.labels_), calinski_harabasz_score(df_umap, birch_model.labels_)]
  df_clusters[f'birch_threshold{threshold}_factor{factor}_ncluster{ncluster}'] = birch_model.labels_
  df_scores[f'birch_threshold{threshold}_factor{factor}_ncluster{ncluster}'] = [silhouette_score(df_umap, birch_model.labels_), davies_bouldin_score(df_umap, birch_model.labels_), calinski_harabasz_score(df_umap, birch_model.la

In [15]:
for min_cluster_size in range(5, 51, 5):
    for min_sample in range(5, 11):
        # Creating and fitting the HDBSCAN model
        hdbscan_model = HDBSCAN(min_cluster_size = min_cluster_size, min_samples = min_sample).fit(df_umap)

        df_clusters[f'hdbscan_mincluster{min_cluster_size}_minsample{min_sample}'] = hdbscan_model.labels_

        df_scores[f'hdbscan_mincluster{min_cluster_size}_minsample{min_sample}'] = [silhouette_score(df_umap[hdbscan_model.labels_ != -1], hdbscan_model.labels_[hdbscan_model.labels_ != -1]), davies_bouldin_score(df_umap[hdbscan_model.labels_ != -1], hdbscan_model.labels_[hdbscan_model.labels_ != -1]), calinski_harabasz_score(df_umap[hdbscan_model.labels_ != -1], hdbscan_model.labels_[hdbscan_model.labels_ != -1])]

  df_clusters[f'hdbscan_mincluster{min_cluster_size}_minsample{min_sample}'] = hdbscan_model.labels_
  df_scores[f'hdbscan_mincluster{min_cluster_size}_minsample{min_sample}'] = [silhouette_score(df_umap[hdbscan_model.labels_ != -1], hdbscan_model.labels_[hdbscan_model.labels_ != -1]), davies_bouldin_score(df_umap[hdbscan_model.labels_ != -1], hdbscan_model.labels_[hdbscan_model.labels_ != -1]), calinski_harabasz_score(df_umap[hdbscan_model.labels_ != -1], hdbscan_model.labels_[hdbscan_model.labels_ != -1])]
  df_clusters[f'hdbscan_mincluster{min_cluster_size}_minsample{min_sample}'] = hdbscan_model.labels_
  df_scores[f'hdbscan_mincluster{min_cluster_size}_minsample{min_sample}'] = [silhouette_score(df_umap[hdbscan_model.labels_ != -1], hdbscan_model.labels_[hdbscan_model.labels_ != -1]), davies_bouldin_score(df_umap[hdbscan_model.labels_ != -1], hdbscan_model.labels_[hdbscan_model.labels_ != -1]), calinski_harabasz_score(df_umap[hdbscan_model.labels_ != -1], hdbscan_model.labels_[hdb

In [17]:
for min_samp in [30, 60, 80, 120, 140]:
    for xi in [0.0005, 0.005, 0.01, 0.02, 0.05]:
        # Creating and fitting the OPTICS model
        optics_model = OPTICS(min_samples = min_samp, xi = xi).fit(df_umap)

        df_clusters[f'optics_minsamp{min_samp}_xi{xi}'] = optics_model.labels_
        
        df_scores[f'optics_minsamp{min_samp}_xi{xi}'] = [silhouette_score(df_umap[optics_model.labels_ != -1], optics_model.labels_[optics_model.labels_ != -1]), davies_bouldin_score(df_umap[optics_model.labels_ != -1], optics_model.labels_[optics_model.labels_ != -1]), calinski_harabasz_score(df_umap[optics_model.labels_ != -1], optics_model.labels_[optics_model.labels_ != -1])]

  df_clusters[f'optics_minsamp{min_samp}_xi{xi}'] = optics_model.labels_
  df_scores[f'optics_minsamp{min_samp}_xi{xi}'] = [silhouette_score(df_umap[optics_model.labels_ != -1], optics_model.labels_[optics_model.labels_ != -1]), davies_bouldin_score(df_umap[optics_model.labels_ != -1], optics_model.labels_[optics_model.labels_ != -1]), calinski_harabasz_score(df_umap[optics_model.labels_ != -1], optics_model.labels_[optics_model.labels_ != -1])]
  df_clusters[f'optics_minsamp{min_samp}_xi{xi}'] = optics_model.labels_
  df_scores[f'optics_minsamp{min_samp}_xi{xi}'] = [silhouette_score(df_umap[optics_model.labels_ != -1], optics_model.labels_[optics_model.labels_ != -1]), davies_bouldin_score(df_umap[optics_model.labels_ != -1], optics_model.labels_[optics_model.labels_ != -1]), calinski_harabasz_score(df_umap[optics_model.labels_ != -1], optics_model.labels_[optics_model.labels_ != -1])]
  df_clusters[f'optics_minsamp{min_samp}_xi{xi}'] = optics_model.labels_
  df_scores[f'optics_minsam

In [18]:
df_clusters

Unnamed: 0_level_0,agg_ncluster2_linkageward,agg_ncluster2_linkagecomplete,agg_ncluster2_linkageaverage,agg_ncluster2_linkagesingle,agg_ncluster3_linkageward,agg_ncluster3_linkagecomplete,agg_ncluster3_linkageaverage,agg_ncluster3_linkagesingle,agg_ncluster4_linkageward,agg_ncluster4_linkagecomplete,...,optics_minsamp120_xi0.0005,optics_minsamp120_xi0.005,optics_minsamp120_xi0.01,optics_minsamp120_xi0.02,optics_minsamp120_xi0.05,optics_minsamp140_xi0.0005,optics_minsamp140_xi0.005,optics_minsamp140_xi0.01,optics_minsamp140_xi0.02,optics_minsamp140_xi0.05
CUST_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C17875,0,0,0,0,2,2,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
C16296,0,1,0,0,1,1,0,0,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
C17219,0,1,0,0,1,1,0,0,0,1,...,4,4,4,1,-1,-1,-1,-1,-1,-1
C13108,0,0,0,0,1,2,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
C13576,0,0,0,0,1,2,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C14210,1,0,0,0,0,0,2,0,1,2,...,5,5,5,2,1,4,-1,-1,1,-1
C17858,0,0,0,0,2,2,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
C12903,0,0,0,0,2,2,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
C10444,1,0,0,0,0,0,2,0,1,2,...,6,6,-1,2,1,5,5,-1,1,-1


In [19]:
df_scores

Unnamed: 0,agg_ncluster2_linkageward,agg_ncluster2_linkagecomplete,agg_ncluster2_linkageaverage,agg_ncluster2_linkagesingle,agg_ncluster3_linkageward,agg_ncluster3_linkagecomplete,agg_ncluster3_linkageaverage,agg_ncluster3_linkagesingle,agg_ncluster4_linkageward,agg_ncluster4_linkagecomplete,...,optics_minsamp120_xi0.0005,optics_minsamp120_xi0.005,optics_minsamp120_xi0.01,optics_minsamp120_xi0.02,optics_minsamp120_xi0.05,optics_minsamp140_xi0.0005,optics_minsamp140_xi0.005,optics_minsamp140_xi0.01,optics_minsamp140_xi0.02,optics_minsamp140_xi0.05
silhouette,0.366102,0.088799,0.309151,0.309151,0.387961,0.269658,0.320291,0.184405,0.428147,0.321796,...,0.548577,0.574236,0.593377,0.604613,0.60985,0.577771,0.580939,0.63718,0.60835,0.692485
davies_bouldin,1.139835,2.02431,0.541395,0.541395,0.929431,1.377573,0.922123,0.795726,0.756479,1.180261,...,0.658468,0.614286,0.593893,0.515698,0.530535,0.588762,0.586676,0.491374,0.531678,0.445922
calinski_harabasz,2088.770904,430.282661,106.937648,106.937648,2007.486411,1640.478566,1174.836952,362.911648,2142.713459,1729.09723,...,3558.345344,3389.844066,3307.671563,3109.942236,3381.030928,3770.36243,3688.454503,3412.694977,3363.395816,3656.80345


In [21]:
df_clusters.to_csv(r'./cluster_data/clusters.csv')
df_scores.to_csv(r'./cluster_data/scores.csv')