# Выбор числа кластеров

In [None]:
import pandas as pd
import numpy as np

from sklearn import preprocessing  # методы для предварительной обработки данных (нормирование etc)
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn import metrics

In [None]:
# Load the countries table (semicolon-separated, decimal comma) and drop
# the non-numeric 'Страны' column in the same step.
data = pd.read_csv('Countries.csv', sep=';', decimal=',').drop(columns=['Страны'])

# Normalise the data with a standard scaler (centering and scaling both enabled).
scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
data_norm = scaler.fit_transform(data)
# Alternative scalers to experiment with:
# data_norm = preprocessing.MinMaxScaler().fit_transform(data)
# data_norm = preprocessing.RobustScaler().fit_transform(data)

[Основные метрики](https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation)
| Метрика |Метод |Критерий|
|-|-|-|
|Silhouette Coefficient|silhouette_score| max|
|Calinski-Harabasz|calinski_harabasz_score|max|
|Davies-Bouldin|davies_bouldin_score|min|
## Метод k-средних

In [None]:
# Run k-means for every cluster count from 2 to max_n_clusters (inclusive)
# and record three internal quality metrics for each solution.
max_n_clusters = 10
# k-means starts from randomly chosen centroids; fixing the seed makes the
# metrics below reproducible on every "Restart & Run All".
random_state = 42

n_clusters = []
metric_silhouette = []
metric_ch = []
metric_db = []

for n in range(2, max_n_clusters + 1):
    cluster = KMeans(n_clusters=n, n_init='auto', random_state=random_state)
    cluster.fit(data_norm)
    n_clusters.append(n)
    # Silhouette and Calinski-Harabasz are maximised; Davies-Bouldin is minimised.
    metric_silhouette.append(metrics.silhouette_score(data_norm, cluster.labels_))
    metric_ch.append(metrics.calinski_harabasz_score(data_norm, cluster.labels_))
    metric_db.append(metrics.davies_bouldin_score(data_norm, cluster.labels_))

# One row per cluster count (used as the index), one column per metric.
cluster_metrics = pd.DataFrame(
    {
        'Silhouette': metric_silhouette,
        'Calinski-Harabasz': metric_ch,
        'Davies-Bouldin': metric_db,
    },
    index=n_clusters,
)
cluster_metrics

In [None]:
# Draw each metric in its own panel against the number of clusters (the
# frame's index). The title and x-axis label let the figure stand on its
# own; the trailing semicolon suppresses the repr of the returned Axes array.
cluster_metrics.plot(
    subplots=True,
    title='Метод k-средних: метрики качества',
    xlabel='Число кластеров',
);

## Иерархическая кластеризация

In [None]:
# Candidate cluster counts: 2 .. max_n_clusters inclusive.
max_n_clusters = 10
n_clusters = list(range(2, max_n_clusters + 1))

# One list per quality metric, filled in the same order as n_clusters.
metric_silhouette, metric_ch, metric_db = [], [], []

for n in n_clusters:
    # Ward-linkage agglomerative clustering cut at exactly n clusters.
    cluster = AgglomerativeClustering(
        n_clusters=n,
        metric='euclidean',
        linkage='ward',
        compute_full_tree=True,
        distance_threshold=None,
    ).fit(data_norm)
    labels = cluster.labels_
    metric_silhouette.append(metrics.silhouette_score(data_norm, labels))
    metric_ch.append(metrics.calinski_harabasz_score(data_norm, labels))
    metric_db.append(metrics.davies_bouldin_score(data_norm, labels))

# Assemble the summary table: rows are cluster counts, columns are metrics.
cluster_metrics = pd.DataFrame(
    {
        'Silhouette': metric_silhouette,
        'Calinski-Harabasz': metric_ch,
        'Davies-Bouldin': metric_db,
    },
    index=n_clusters,
)
cluster_metrics

In [None]:
# Draw each metric in its own panel against the number of clusters (the
# frame's index). The title and x-axis label let the figure stand on its
# own; the trailing semicolon suppresses the repr of the returned Axes array.
cluster_metrics.plot(
    subplots=True,
    title='Иерархическая кластеризация: метрики качества',
    xlabel='Число кластеров',
);