In [5]:
import pandas as pd
from cuml.cluster import KMeans as cuKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from cuml.metrics import adjusted_rand_score
import cupy as cp
import optuna
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [2]:
df = pd.read_parquet('../data/processed/selected_features_df.parquet')
X = df.drop(columns=['TARGET'])
df.shape

(266469, 97)

We will standardize the data before applying KMeans algorithms since it is sensitive to the scale of the data due to the use of the Euclidean distance.

In [3]:
# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## HOPKINS STATISTIC

First, we are going to compute the Hopkins statistic to check if data present a cluster tendency or if it is uniformly distributed.

In [6]:
def hopkins_statistic(X, n_samples=100):
    n,d = X.shape
    if n_samples > n:
        n_samples = n

    # Randomly select n_samples from the data
    random_indices = np.random.choice(n, n_samples, replace=False)
    X_sample = X[random_indices]

    # Generate uniformly distributed random data
    X_uniform = np.random.uniform(np.min(X,axis=0), np.max(X,axis=0), size=(n_samples, d))

    # Compute the distance between the points
    nearest_neighbours = NearestNeighbors(n_neighbors=1).fit(X)

    uniform_distances = nearest_neighbours.kneighbors(X_sample, return_distance=True)[0].sum()
    sample_distances = nearest_neighbours.kneighbors(X_uniform, return_distance=True)[0].sum()

    # Compute the Hopkins statistic
    return sample_distances / (uniform_distances + sample_distances)

In [9]:
h_statistic = hopkins_statistic(X_scaled, n_samples=30000)
print(f"Hopkins Statistic: {h_statistic}")

Hopkins Statistic: 0.9999999968708171


## KMeans

Optuna for hyperparameter tuning

In [23]:
def objective(trial):
    n_clusters = trial.suggest_int('n_clusters', 2, 10)
    init = trial.suggest_categorical('init', ['random', 'k-means++'])
    n_init = trial.suggest_int('n_init', 1, 10)
    max_iter = trial.suggest_int('max_iter', 100, 500)
    tol = trial.suggest_float('tol', 1e-5, 1e-1)
    kmeans = cuKMeans(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol)
    labels = kmeans.fit_predict(cp.array(X_scaled))
    score = adjusted_rand_score
    return score

In [24]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[I 2024-12-05 20:15:37,269] A new study created in memory with name: no-name-c1075f8a-274d-40a2-b408-510a555dfbf9
[W 2024-12-05 20:15:38,341] Trial 0 failed with parameters: {'n_clusters': 5, 'init': 'random', 'n_init': 3, 'max_iter': 371, 'tol': 0.010523854288125406} because of the following error: TypeError('Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly.').
Traceback (most recent call last):
  File "/home/ineguiluz/anaconda3/envs/MLKernel/lib/python3.12/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_32249/3438170868.py", line 9, in objective
    score = silhouette_score(X_scaled, labels)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ineguiluz/anaconda3/envs/MLKernel/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           ^^

TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly.