In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score
import umap
import hdbscan
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [10]:
df = pd.read_csv("../dataset/final.csv")
X_scaled = df.values

In [11]:
results = []

n_neighbors_list = [5, 10, 15, 30]
n_components_list = [5, 10, 15, 20]
min_cluster_size_list = [5, 10, 15]

total_iterations = (
    len(n_neighbors_list)
    * len(n_components_list)
    * len(min_cluster_size_list)
)

with tqdm(total=total_iterations) as pbar:
    
    for n_neighbors in n_neighbors_list:
        for n_components in n_components_list:
            for min_cluster_size in min_cluster_size_list:
                
                X_umap = umap.UMAP(
                    n_neighbors=n_neighbors,
                    n_components=n_components,
                    min_dist=0.0,
                    random_state=42
                ).fit_transform(X_scaled)
                
                clusterer = hdbscan.HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_cluster_size,
                    cluster_selection_method='leaf'
                )
                labels = clusterer.fit_predict(X_umap)
                
                cluster_count = len(set(labels) - {-1})
                
                if cluster_count == 0:
                    pbar.update(1)
                    continue
                
                mask = labels != -1
                
                try:
                    sil = silhouette_score(X_umap[mask], labels[mask])
                    dbi = davies_bouldin_score(X_umap[mask], labels[mask])
                except:
                    sil = np.nan
                    dbi = np.nan
                
                results.append({
                    "n_neighbors": n_neighbors,
                    "n_components": n_components,
                    "min_cluster_size": min_cluster_size,
                    "clusters": cluster_count,
                    "silhouette": sil,
                    "dbi": dbi
                })
                
                pbar.update(1)

df_results = pd.DataFrame(results)
df_results = df_results.sort_values(
    by=["silhouette", "clusters"], ascending=[False, True]
)

print(df_results.head(10))

100%|██████████| 48/48 [02:43<00:00,  3.41s/it]

    n_neighbors  n_components  min_cluster_size  clusters  silhouette  \
38           30             5                15        13    0.635233   
44           30            15                15        14    0.622799   
40           30            10                10        14    0.613833   
47           30            20                15        12    0.609399   
35           15            20                15        12    0.603254   
5             5            10                15        21    0.602688   
8             5            15                15        22    0.591396   
37           30             5                10        17    0.586987   
26           15             5                15        17    0.585410   
23           10            20                15        16    0.581100   

         dbi  
38  0.513496  
44  0.514855  
40  0.533049  
47  0.505334  
35  0.577666  
5   0.537447  
8   0.548448  
37  0.558642  
26  0.561865  
23  0.572342  





Too many clusters

In [12]:
results = []

n_neighbors_list = [50, 75, 100]
n_components_list = [10, 15, 20]
min_cluster_size_list = [20, 25, 30]

total_iterations = (
    len(n_neighbors_list)
    * len(n_components_list)
    * len(min_cluster_size_list)
)

with tqdm(total=total_iterations) as pbar:
    
    for n_neighbors in n_neighbors_list:
        for n_components in n_components_list:
            for min_cluster_size in min_cluster_size_list:
                
                X_umap = umap.UMAP(
                    n_neighbors=n_neighbors,
                    n_components=n_components,
                    min_dist=0.0,
                    random_state=42
                ).fit_transform(X_scaled)
                
                clusterer = hdbscan.HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_cluster_size,
                    cluster_selection_method = 'eom'
                )
                labels = clusterer.fit_predict(X_umap)
                
                cluster_count = len(set(labels) - {-1})
                
                if cluster_count == 0:
                    pbar.update(1)
                    continue
                
                mask = labels != -1
                
                try:
                    sil = silhouette_score(X_umap[mask], labels[mask])
                    dbi = davies_bouldin_score(X_umap[mask], labels[mask])
                except:
                    sil = np.nan
                    dbi = np.nan
                
                results.append({
                    "n_neighbors": n_neighbors,
                    "n_components": n_components,
                    "min_cluster_size": min_cluster_size,
                    "clusters": cluster_count,
                    "silhouette": sil,
                    "dbi": dbi
                })
                
                pbar.update(1)

df_results = pd.DataFrame(results)
df_results = df_results.sort_values(
    by=["silhouette", "clusters"], ascending=[False, True]
)

print(df_results.head(10))

100%|██████████| 27/27 [07:08<00:00, 15.87s/it]

    n_neighbors  n_components  min_cluster_size  clusters  silhouette  \
15           75            20                20         2    0.771150   
16           75            20                25         2    0.771150   
17           75            20                30         2    0.771150   
12           75            15                20         2    0.769782   
13           75            15                25         2    0.769782   
14           75            15                30         2    0.769782   
3            50            15                20         2    0.716305   
4            50            15                25         2    0.716305   
5            50            15                30         2    0.716305   
0            50            10                20         2    0.714623   

         dbi  
15  0.321279  
16  0.321279  
17  0.321279  
12  0.323350  
13  0.323350  
14  0.323350  
3   0.390801  
4   0.390801  
5   0.390801  
0   0.392681  





Only 2 Clusters