In [1]:
import cudf
import numpy as np
import pandas as pd
import plotly.express as px

# Locations of the input data and of the intermediate artefacts for this run.
DATA_PATH = '/data/'
TMP_PATH = f'{DATA_PATH}tmp/sm2_128/'

# Table stored alongside the arrays.
df = pd.read_csv(f'{TMP_PATH}df.csv')

# X comes from X_norm.npy and dists from dists.npy, both under TMP_PATH.
X, dists = (np.load(TMP_PATH + fname) for fname in ('X_norm.npy', 'dists.npy'))

1) low davies_bouldin_score is better (minimum is 0)
2) high silhouette_score is better (range is -1 to 1)
3) high calinski_harabasz_score is better

In [2]:
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from cuml.cluster import KMeans


# Labels for the metrics computed by calc_scrore, listed in the same order as
# the values it returns. They double as DataFrame column names and plot titles.
calc_scrore_names = [
    'Davies Bouldin',
    'Silhouette',
    'Calinskiâ€“Harabasz',
]


def calc_scrore(n):
    """Cluster ``X`` into ``n`` groups with KMeans and score the partition.

    Reads the module-level arrays ``X`` and ``dists``.

    Parameters:
        n: number of clusters passed to KMeans.

    Returns:
        ``[davies_bouldin, silhouette, calinski_harabasz]`` - the same order
        as ``calc_scrore_names``.
    """
    # Fit once and reuse the label vector for all three metrics.
    labels = KMeans(n_clusters=n, n_init=5).fit(X).labels_
    n_samples = len(labels)

    # NOTE(review): dists is presumably a precomputed pairwise distance matrix
    # (square, one row/column per sample) - confirm against the code that
    # wrote dists.npy. If so, silhouette_score must be told
    # metric='precomputed'; its default metric='euclidean' would treat each
    # row of the matrix as a feature vector and score the wrong geometry.
    # Any other shape keeps the previous call unchanged.
    if dists.ndim == 2 and dists.shape == (n_samples, n_samples):
        sils = silhouette_score(dists, labels, metric='precomputed')
    else:
        sils = silhouette_score(dists, labels)

    dbs = davies_bouldin_score(X, labels)
    chs = calinski_harabasz_score(X, labels)
    return [dbs, sils, chs]


def n_scores(ns: list):
    """Score a KMeans clustering for every cluster count in ``ns``.

    Returns a pandas DataFrame with one row per entry of ``ns``: the
    ``n_clusters`` column holds the cluster count and the remaining columns,
    named after ``calc_scrore_names``, hold the values from ``calc_scrore``.
    """
    header = ['n_clusters', *calc_scrore_names]
    rows = [[k, *calc_scrore(k)] for k in ns]
    return pd.DataFrame(rows, columns=header)

In [None]:
# Cluster counts to evaluate in this run.
samples = list(range(120, 150))
scores_path = TMP_PATH + 'scores2.csv'

new_scores = n_scores(samples)

# Extend the scores saved by earlier runs, if any. Reading them from disk
# (instead of relying on a df_scores variable left over in the kernel) keeps
# this cell working after Restart & Run All.
try:
    previous_scores = pd.read_csv(scores_path)
except FileNotFoundError:
    df_scores = new_scores
else:
    df_scores = pd.concat([previous_scores, new_scores], ignore_index=True)

# Re-running the cell must not pile up duplicate rows: keep the most recent
# result per cluster count, ordered by n_clusters.
df_scores = (
    df_scores
    .drop_duplicates(subset='n_clusters', keep='last')
    .sort_values('n_clusters')
    .reset_index(drop=True)
)
df_scores.to_csv(scores_path, index=False)

In [None]:
# Reload the saved scores and draw one line chart per metric,
# with the number of clusters on the x axis.
df_scores = pd.read_csv(f'{TMP_PATH}scores2.csv')

for name in calc_scrore_names:
    fig = px.line(data_frame=df_scores, x='n_clusters', y=name, title=name)
    fig.show()