In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score
import umap
import hdbscan
import tqdm
import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("../dataset/final.csv")
X_scaled = df.values

In [None]:
results = []

n_neighbors_list = [5, 10, 15, 30]
n_components_list = [5, 10, 15, 20]
min_cluster_size_list = [5, 10, 15]

total_iterations = (
    len(n_neighbors_list)
    * len(n_components_list)
    * len(min_cluster_size_list)
)

with tqdm(total=total_iterations) as pbar:
    
    for n_neighbors in n_neighbors_list:
        for n_components in n_components_list:
            for min_cluster_size in min_cluster_size_list:
                
                X_umap = umap.UMAP(
                    n_neighbors=n_neighbors,
                    n_components=n_components,
                    min_dist=0.0,
                    random_state=42
                ).fit_transform(X_scaled)
                
                clusterer = hdbscan.HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_cluster_size,
                    cluster_selection_method='leaf'
                )
                labels = clusterer.fit_predict(X_umap)
                
                cluster_count = len(set(labels) - {-1})
                
                if cluster_count == 0:
                    pbar.update(1)
                    continue
                
                mask = labels != -1
                
                try:
                    sil = silhouette_score(X_umap[mask], labels[mask])
                    dbi = davies_bouldin_score(X_umap[mask], labels[mask])
                except:
                    sil = np.nan
                    dbi = np.nan
                
                results.append({
                    "n_neighbors": n_neighbors,
                    "n_components": n_components,
                    "min_cluster_size": min_cluster_size,
                    "clusters": cluster_count,
                    "silhouette": sil,
                    "dbi": dbi
                })
                
                pbar.update(1)

df_results = pd.DataFrame(results)
df_results = df_results.sort_values(
    by=["silhouette", "clusters"], ascending=[False, True]
)

print(df_results.head(10))


TypeError: 'module' object is not callable