In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score
import umap
import hdbscan
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [14]:
df = pd.read_csv("../dataset/final.csv")
X_scaled = df.values

In [15]:
results = []

n_neighbors_list = [5, 10, 15, 30]
n_components_list = [5, 10, 15, 20]
min_cluster_size_list = [5, 10, 15]

total_iterations = (
    len(n_neighbors_list)
    * len(n_components_list)
    * len(min_cluster_size_list)
)

with tqdm(total=total_iterations) as pbar:
    
    for n_neighbors in n_neighbors_list:
        for n_components in n_components_list:
            for min_cluster_size in min_cluster_size_list:
                
                X_umap = umap.UMAP(
                    n_neighbors=n_neighbors,
                    n_components=n_components,
                    min_dist=0.0,
                    random_state=42
                ).fit_transform(X_scaled)
                
                clusterer = hdbscan.HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_cluster_size,
                    cluster_selection_method='leaf'
                )
                labels = clusterer.fit_predict(X_umap)
                
                cluster_count = len(set(labels) - {-1})
                
                if cluster_count == 0:
                    pbar.update(1)
                    continue
                
                mask = labels != -1
                
                try:
                    sil = silhouette_score(X_umap[mask], labels[mask])
                    dbi = davies_bouldin_score(X_umap[mask], labels[mask])
                except:
                    sil = np.nan
                    dbi = np.nan
                
                results.append({
                    "n_neighbors": n_neighbors,
                    "n_components": n_components,
                    "min_cluster_size": min_cluster_size,
                    "clusters": cluster_count,
                    "silhouette": sil,
                    "dbi": dbi
                })
                
                pbar.update(1)

df_results = pd.DataFrame(results)
df_results = df_results.sort_values(
    by=["silhouette", "clusters"], ascending=[False, True]
)

print(df_results.head(10))

100%|██████████| 48/48 [02:51<00:00,  3.57s/it]

    n_neighbors  n_components  min_cluster_size  clusters  silhouette  \
41           30            10                15        12    0.591466   
44           30            15                15        12    0.588342   
20           10            15                15        18    0.577301   
0             5             5                 5        88    0.570767   
32           15            15                15        15    0.566135   
47           30            20                15        12    0.565342   
23           10            20                15        18    0.564896   
10            5            20                10        31    0.561091   
29           15            10                15        14    0.561065   
38           30             5                15        13    0.560925   

         dbi  
41  0.559537  
44  0.544835  
20  0.608275  
0   0.546334  
32  0.606788  
47  0.557546  
23  0.651105  
10  0.594356  
29  0.589755  
38  0.566191  





Too many clusters

In [16]:
results = []

n_neighbors_list = [50, 75, 100]
n_components_list = [10, 15, 20]
min_cluster_size_list = [20, 25, 30]

total_iterations = (
    len(n_neighbors_list)
    * len(n_components_list)
    * len(min_cluster_size_list)
)

with tqdm(total=total_iterations) as pbar:
    
    for n_neighbors in n_neighbors_list:
        for n_components in n_components_list:
            for min_cluster_size in min_cluster_size_list:
                
                X_umap = umap.UMAP(
                    n_neighbors=n_neighbors,
                    n_components=n_components,
                    min_dist=0.0,
                    random_state=42
                ).fit_transform(X_scaled)
                
                clusterer = hdbscan.HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_cluster_size,
                    cluster_selection_method = 'eom'
                )
                labels = clusterer.fit_predict(X_umap)
                
                cluster_count = len(set(labels) - {-1})
                
                if cluster_count == 0:
                    pbar.update(1)
                    continue
                
                mask = labels != -1
                
                try:
                    sil = silhouette_score(X_umap[mask], labels[mask])
                    dbi = davies_bouldin_score(X_umap[mask], labels[mask])
                except:
                    sil = np.nan
                    dbi = np.nan
                
                results.append({
                    "n_neighbors": n_neighbors,
                    "n_components": n_components,
                    "min_cluster_size": min_cluster_size,
                    "clusters": cluster_count,
                    "silhouette": sil,
                    "dbi": dbi
                })
                
                pbar.update(1)

df_results = pd.DataFrame(results)
df_results = df_results.sort_values(
    by=["silhouette", "clusters"], ascending=[False, True]
)

print(df_results.head(10))

100%|██████████| 27/27 [07:39<00:00, 17.00s/it]

    n_neighbors  n_components  min_cluster_size  clusters  silhouette  \
9            75            10                20         2    0.842025   
10           75            10                25         2    0.842025   
11           75            10                30         2    0.842025   
12           75            15                20         2    0.830090   
13           75            15                25         2    0.830090   
14           75            15                30         2    0.830090   
15           75            20                20         2    0.810888   
16           75            20                25         2    0.810888   
17           75            20                30         2    0.810888   
18          100            10                20         2    0.807319   

         dbi  
9   0.224142  
10  0.224142  
11  0.224142  
12  0.241177  
13  0.241177  
14  0.241177  
15  0.266716  
16  0.266716  
17  0.266716  
18  0.271214  





Only 2 Clusters