# Выбор числа кластеров

In [None]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

from sklearn.preprocessing import StandardScaler  # z-score standardisation
# from sklearn.preprocessing import MinMaxScaler, RobustScaler  # alternative scaling methods

# Suppress all warnings for the rest of the session: `Warning` is the base class
# of every warning category, so this also hides deprecation notices raised by
# the libraries imported above.
import warnings
warnings.simplefilter(action='ignore', category=Warning)

In [None]:
# Read the source table; the file is ';'-separated and uses ',' as the decimal mark.
df = pd.read_csv('./datasets/Countries.csv', decimal=',', sep=';')

# Drop the non-numeric 'Страны' (countries) column so that only feature columns
# remain -- presumably every other column is numeric; verify against the CSV.
X = df.drop('Страны', axis=1)

# Choose the normalisation method. z-score standardisation is active; the
# alternatives below need their import re-enabled in the imports cell.
# scaler = MinMaxScaler()
# scaler = RobustScaler()
scaler = StandardScaler(with_mean=True, with_std=True)

# Learn the per-column statistics, then apply them to the same data.
X_norm = scaler.fit(X).transform(X)

[Основные метрики качества кластеризации](https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation)

| Метрика |Метод |Критерий|
|-|-|-|
|Silhouette Coefficient|silhouette_score| max|
|Calinski-Harabasz|calinski_harabasz_score|max|
|Davies-Bouldin|davies_bouldin_score|min|
## Метод k-средних

In [None]:
# Sweep k-means over k = 2..max_n_clusters and record three internal validity
# metrics for each k (Silhouette: higher is better, Calinski-Harabasz: higher
# is better, Davies-Bouldin: lower is better).
max_n_clusters = 10

# silhouette_score is only defined for 2 <= n_labels <= n_samples - 1, so cap
# the sweep by the number of rows instead of crashing on a small dataset.
n_clusters = list(range(2, min(max_n_clusters, len(X_norm) - 1) + 1))
metric_silhouette = []
metric_ch = []
metric_db = []

for n in n_clusters:
    # A fixed random_state keeps the centroid initialisation reproducible.
    cluster = KMeans(n_clusters=n, n_init='auto', random_state=2)
    # fit_predict fits the model and returns its labels_, fetched once for all metrics.
    labels = cluster.fit_predict(X_norm)
    metric_silhouette.append(silhouette_score(X_norm, labels))
    metric_ch.append(calinski_harabasz_score(X_norm, labels))
    metric_db.append(davies_bouldin_score(X_norm, labels))

# One row per candidate k. The index is named so the table header (and the
# default x-axis label of pandas plots) says what the row labels mean.
cluster_metrics = pd.DataFrame(
    {
        'Silhouette': metric_silhouette,
        'Calinski-Harabasz': metric_ch,
        'Davies-Bouldin': metric_db,
    },
    index=pd.Index(n_clusters, name='n_clusters'),
)

cluster_metrics.round(3)

In [None]:
# One line panel per metric against the number of clusters for the k-means sweep.
# NOTE: `cluster_metrics` is reassigned by the hierarchical section further down,
# so re-run the k-means cell first if this cell is executed out of order.
# The trailing semicolon hides the Axes-array repr in the cell output.
cluster_metrics.plot(
    subplots=True,
    title='Метрики качества кластеризации: k-средних',
    xlabel='Число кластеров',
);

## Иерархическая кластеризация

In [None]:
# Sweep Ward agglomerative clustering over k = 2..max_n_clusters and record
# three internal validity metrics for each k (Silhouette: higher is better,
# Calinski-Harabasz: higher is better, Davies-Bouldin: lower is better).
max_n_clusters = 10

# silhouette_score is only defined for 2 <= n_labels <= n_samples - 1, so cap
# the sweep by the number of rows instead of crashing on a small dataset.
n_clusters = list(range(2, min(max_n_clusters, len(X_norm) - 1) + 1))
metric_silhouette = []
metric_ch = []
metric_db = []

for n in n_clusters:
    # Ward linkage only accepts the Euclidean metric; distance_threshold=None
    # means the tree is cut by n_clusters rather than by a merge distance.
    cluster = AgglomerativeClustering(n_clusters=n, metric='euclidean', linkage='ward', compute_full_tree=True, distance_threshold=None)
    # fit_predict fits the model and returns its labels_, fetched once for all metrics.
    labels = cluster.fit_predict(X_norm)
    metric_silhouette.append(silhouette_score(X_norm, labels))
    metric_ch.append(calinski_harabasz_score(X_norm, labels))
    metric_db.append(davies_bouldin_score(X_norm, labels))

# One row per candidate k. The index is named so the table header (and the
# default x-axis label of pandas plots) says what the row labels mean.
# NOTE: this reuses the name `cluster_metrics` from the k-means section above.
cluster_metrics = pd.DataFrame(
    {
        'Silhouette': metric_silhouette,
        'Calinski-Harabasz': metric_ch,
        'Davies-Bouldin': metric_db,
    },
    index=pd.Index(n_clusters, name='n_clusters'),
)

cluster_metrics.round(3)

In [None]:
# One line panel per metric against the number of clusters for the hierarchical
# (Ward) sweep; `cluster_metrics` here is the frame built by the cell just above.
# The trailing semicolon hides the Axes-array repr in the cell output.
cluster_metrics.plot(
    subplots=True,
    title='Метрики качества кластеризации: иерархическая (Ward)',
    xlabel='Число кластеров',
);