# Clustering Models

First of all, we are going to import the necessary libraries.

In [63]:
import pandas as pd
from cuml import NearestNeighbors
import numpy as np
import optuna
import cuml
from cuml.cluster import DBSCAN as cuDBSCAN
from dask.array import asarray
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from cuml.metrics.cluster.silhouette_score import cython_silhouette_score as cu_silhouette_score



First of all, we are going to create a class to compute all the metrics. This class will be used to evaluate the performance of the models using the K Fold method.


In [64]:
class ClusteringMetrics:
    def __init__(self, X, model):
        self.X = pd.DataFrame(X.get()).reset_index(drop=True)
        self.model = model


    def compute_internalEvaluation(self):
        self.model.fit(self.X)
        y_pred = self.model.labels_

        self.shiloette_score = cu_silhouette_score(cp.asarray(self.X), y_pred)
        self.calinski_harabasz_score = calinski_harabasz_score(self.X, y_pred)
        self.davies_bouldin_score = davies_bouldin_score(self.X, y_pred)

    def print_metrics(self):
        print(f"Silhouette Score: {self.shiloette_score}")
        print(f"Calinski Harabasz Score: {self.calinski_harabasz_score}")
        print(f"Davies Bouldin Score: {self.davies_bouldin_score}")



Loading the data

In [65]:
df = pd.read_parquet('../data/processed/selected_features_df.parquet')

We are going to scale the data using the StandardScaler.

In [66]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

Let's calculate the Hopkins statistic to check if the data is suitable for clustering.

In [5]:
def hopkins(X):
    d = X.shape[1]
    n = len(X)
    m = int(0.1 * n)
    nbrs = NearestNeighbors(n_neighbors=1).fit(X.values)

    rand_X = np.random.rand(n, d)
    ujd = []
    wjd = []

    for j in range(m):
        u_dist, _ = nbrs.kneighbors(rand_X[j].reshape(1, -1), 2, return_distance=True)
        ujd.append(u_dist[0][1])
        w_dist, _ = nbrs.kneighbors(X.iloc[j].values.reshape(1, -1), 2, return_distance=True)
        wjd.append(w_dist[0][1])

    H = sum(ujd) / (sum(ujd) + sum(wjd))
    if np.isnan(H):
        print(ujd, wjd)
        H = 0

    return H

The Hopkins statistic ranges from 0 to 1. The closer to 1, the more suitable the data is for clustering. Let's calculate the Hopkins statistic for the data.

In [14]:
hopkins(df)

0.9645847043701986

The Hopkins statistic is close to 1, so the data is suitable for clustering.

# DBSCAN

We are going to use the DBSCAN algorithm to cluster the data. We are going to use the Optuna library to find the best hyperparameters.

In [6]:
import cupy as cp

def objective(trial):
    eps = trial.suggest_float('eps', 0.1, 1.0)
    min_samples = trial.suggest_int('min_samples', 5, 10)
    metric = trial.suggest_categorical('metric', ['euclidean', 'cosine'])

    model = cuDBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    model.fit(df_scaled)
    y_pred = model.labels_

    # Verificar si hay al menos 2 clusters
    if len(cp.unique(y_pred)) > 1:  # Usamos cupy para manejar el arreglo en GPU
        return silhouette_score(df_scaled, y_pred)  # Devuelve el Silhouette score si hay más de 1 cluster
    else:
        return -1  # Devuelve un valor negativo si solo se encuentra un único cluster
    study = optuna.create_study(direction='maximize', study_name='dbscan')
    study.optimize(objective, n_trials=5)
    print(f"Best parameters: {study.best_params}")

[I 2024-12-06 14:29:53,352] A new study created in memory with name: dbscan
[I 2024-12-06 14:46:08,489] Trial 0 finished with value: 0.06344437831203903 and parameters: {'eps': 0.45560417786879226, 'min_samples': 9, 'metric': 'cosine'}. Best is trial 0 with value: 0.06344437831203903.
[I 2024-12-06 15:01:01,110] Trial 1 finished with value: -0.08649164881054881 and parameters: {'eps': 0.2812818251401415, 'min_samples': 9, 'metric': 'cosine'}. Best is trial 0 with value: 0.06344437831203903.
[I 2024-12-06 15:05:37,931] Trial 2 finished with value: -1.0 and parameters: {'eps': 0.15472718863535367, 'min_samples': 10, 'metric': 'euclidean'}. Best is trial 0 with value: 0.06344437831203903.
[I 2024-12-06 15:20:50,951] Trial 3 finished with value: 0.12279910760197582 and parameters: {'eps': 0.529945149343017, 'min_samples': 10, 'metric': 'cosine'}. Best is trial 3 with value: 0.12279910760197582.
[I 2024-12-06 15:36:01,502] Trial 4 finished with value: 0.15272329032663132 and parameters: {'e

Best parameters: {'eps': 0.9095320672300284, 'min_samples': 8, 'metric': 'cosine'}


We can see that the bigger the eps, the better the silhouette score. Also, the cosine metric is better than the euclidean metric for this problem. Taking this into account we are going to create another Optuna study to find the best hyperparameters.

In [11]:
import cupy as cp
import gc

df_scaled = cp.asarray(df_scaled)

def objective(trial):
    gc.collect()
    cp._default_memory_pool.free_all_blocks()

    eps = trial.suggest_float('eps', 1, 1.5)
    min_samples = trial.suggest_int('min_samples', 5, 10)

    model = cuDBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
    model.fit(df_scaled)
    y_pred = model.labels_

    # Verificar si hay al menos 2 clusters
    if len(cp.unique(y_pred)) > 1:  # Usamos cupy para manejar el arreglo en GPU
        return cu_silhouette_score(df_scaled, y_pred)  # Devuelve el Silhouette score si hay más de 1 cluster
    else:
        return -1  # Devuelve un valor negativo si solo se encuentra un único cluster
study = optuna.create_study(direction='maximize', study_name='dbscan')
study.optimize(objective, n_trials=5)
print(f"Best parameters: {study.best_params}")

[I 2024-12-07 10:09:25,235] A new study created in memory with name: dbscan
[I 2024-12-07 10:18:00,067] Trial 0 finished with value: 0.35992020990680346 and parameters: {'eps': 1.468031449624661, 'min_samples': 9}. Best is trial 0 with value: 0.35992020990680346.
[I 2024-12-07 10:26:35,946] Trial 1 finished with value: 0.37549860352774733 and parameters: {'eps': 1.2027419903680259, 'min_samples': 8}. Best is trial 1 with value: 0.37549860352774733.
[I 2024-12-07 10:35:13,500] Trial 2 finished with value: 0.38383985601186843 and parameters: {'eps': 1.2125524450750875, 'min_samples': 7}. Best is trial 2 with value: 0.38383985601186843.
[I 2024-12-07 10:43:59,112] Trial 3 finished with value: 0.35992020990680346 and parameters: {'eps': 1.4344474832899103, 'min_samples': 6}. Best is trial 2 with value: 0.38383985601186843.
[I 2024-12-07 10:52:46,669] Trial 4 finished with value: 0.1540651024846537 and parameters: {'eps': 1.001327955535295, 'min_samples': 6}. Best is trial 2 with value: 0.3

Best parameters: {'eps': 1.2125524450750875, 'min_samples': 7}


In [16]:
model = cuDBSCAN(eps=1.2125524450750875, min_samples=7, metric='cosine')
metrics = ClusteringMetrics(df_scaled, model)
metrics.compute_internalEvaluation()
metrics.print_metrics()

Silhouette Score: 0.38383985601186843
Calinski Harabasz Score: 6.71393677535788
Davies Bouldin Score: 1.5755187609143613


# BIRCH

In [67]:
import optuna
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score
import numpy as np

# df_scaled = asarray(df_scaled)

# Definir la función objetivo para Optuna
def objective(trial):
    # Sugerir valores para los hiperparámetros de BIRCH
    threshold = trial.suggest_float('threshold', 0.01, 1.0)  # Umbral para la construcción del árbol
    n_clusters = trial.suggest_int('n_clusters', 2, 10)  # Número de clusters
    # branching_factor = trial.suggest_int('branching_factor', 10, 100)  # Número de hijos por nodo

    # Crear el modelo BIRCH con los hiperparámetros sugeridos
    model = Birch(threshold=threshold, n_clusters=n_clusters)

    # df_sparse = csr_matrix(df_scaled.get())
    # model.fit(df_sparse)
    # Entrenar el modelo
    model.fit(df_scaled)
    labels = model.labels_

    # Calcular el Silhouette Score para evaluar la calidad del clustering
    if len(np.unique(labels)) > 1:
        score = silhouette_score(df_scaled, labels)
        return score
    else:
        return -1

study = optuna.create_study(direction='maximize', study_name='birch_tuning')
study.optimize(objective, n_trials=5)
print(f"Best parameters: {study.best_params}")


[I 2024-12-07 11:47:58,934] A new study created in memory with name: birch_tuning
[W 2024-12-07 11:48:12,517] Trial 0 failed with parameters: {'threshold': 0.29956887580750957, 'n_clusters': 3} because of the following error: MemoryError((35502197811,), dtype('float64')).
Traceback (most recent call last):
  File "/home/aitor/anaconda3/envs/rapids-24.10/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_9771/3399477042.py", line 22, in objective
    model.fit(df_scaled)
  File "/home/aitor/anaconda3/envs/rapids-24.10/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/aitor/anaconda3/envs/rapids-24.10/lib/python3.10/site-packages/sklearn/cluster/_birch.py", line 524, in fit
    return self._fit(X, partial=False)
  File "/home/aitor/anaconda3/envs/rapids-24.10/lib/python3.10/site-packages/sklearn/cluster/_birch.py", line 

MemoryError: Unable to allocate 265. GiB for an array with shape (35502197811,) and data type float64

In [61]:
df_scaled = df_scaled.astype('float32')  # o 'float16' si es aceptable

In [60]:
df_scaled = np.asarray(df_scaled)