# Выбор параметров кластеризации

In [None]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.neighbors import NearestNeighbors  # расстояния до ближайших соседей для DBSCAN

from sklearn.preprocessing import StandardScaler  # z-нормировка
# from sklearn.preprocessing import MinMaxScaler, RobustScaler # другие способы нормировки

import matplotlib.pyplot as plt

# Silence every warning category for the rest of the session to keep the
# notebook output clean.
import warnings
warnings.simplefilter('ignore', Warning)

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./datasets/countries.csv')

# Drop the 'Страны' column so that only feature columns remain
# (presumably it is the only non-numeric column - the scaler needs numeric input).
X = df.drop('Страны', axis=1)

# z-score normalisation: centre each column and scale it to unit variance.
# MinMaxScaler() or RobustScaler() can be substituted here once their import
# at the top of the notebook is uncommented.
scaler = StandardScaler(with_mean=True, with_std=True)

# Learn the per-column statistics, then apply them to the same data.
X_norm = scaler.fit(X).transform(X)

## Выбор числа кластеров (k-means, agglomerative)

[Основные метрики](https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation)

| Метрика | Метод | Критерий |
|-|-|-|
|Silhouette Coefficient|silhouette_score| max|
|Calinski-Harabasz|calinski_harabasz_score|max|
|Davies-Bouldin|davies_bouldin_score|min|

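__Важно__: применяем только к k-means и agglomerative! Для плотностных методов вроде DBSCAN (кластеры произвольной формы, шумовые точки) эти метрики малоинформативны.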

### Метод  k-средних

In [None]:
# Sweep k = 2..max_n_clusters with k-means and record three internal validity
# scores per k (Silhouette and Calinski-Harabasz: larger is better,
# Davies-Bouldin: smaller is better).
max_n_clusters = 10

# Candidate numbers of clusters; also used as the index of the result table.
n_clusters = list(range(2, max_n_clusters + 1))
metric_silhouette = []
metric_ch = []
metric_db = []

for n in n_clusters:
    # A fixed random_state keeps the run reproducible.
    cluster = KMeans(n_clusters=n, n_init='auto', random_state=2)
    labels = cluster.fit_predict(X_norm)
    metric_silhouette.append(silhouette_score(X_norm, labels))
    metric_ch.append(calinski_harabasz_score(X_norm, labels))
    metric_db.append(davies_bouldin_score(X_norm, labels))

# One row per candidate k, one column per score.
cluster_metrics = pd.DataFrame(
    {
        'Silhouette': metric_silhouette,
        'Calinski-Harabasz': metric_ch,
        'Davies-Bouldin': metric_db,
    },
    index=n_clusters,
)

cluster_metrics.round(3)

In [None]:
# One line chart per score, each in its own panel; the x-axis is the frame's
# index, i.e. the number of clusters.
cluster_metrics.plot.line(subplots=True)
plt.show()

"оптимальное" число кластеров

In [None]:
# cluster_metrics is indexed by the number of clusters, so idxmax/idxmin
# return the best k for each score directly.
best_by_silhouette = cluster_metrics['Silhouette'].idxmax()
best_by_ch = cluster_metrics['Calinski-Harabasz'].idxmax()
best_by_db = cluster_metrics['Davies-Bouldin'].idxmin()  # lower is better

print('Silhouette:', best_by_silhouette)
print('Calinski-Harabasz:', best_by_ch)
print('Davies-Bouldin:', best_by_db)

### Иерархическая кластеризация

In [None]:
# Sweep the number of clusters 2..max_n_clusters with agglomerative
# (hierarchical) clustering and record three internal validity scores per
# value (Silhouette and Calinski-Harabasz: larger is better,
# Davies-Bouldin: smaller is better).
max_n_clusters = 10

# Candidate numbers of clusters; also used as the index of the result table.
n_clusters = list(range(2, max_n_clusters + 1))
metric_silhouette = []
metric_ch = []
metric_db = []

for n in n_clusters:
    # Euclidean distances; the linkage criterion is left at the library default.
    cluster = AgglomerativeClustering(n_clusters=n, metric='euclidean')
    labels = cluster.fit_predict(X_norm)
    metric_silhouette.append(silhouette_score(X_norm, labels))
    metric_ch.append(calinski_harabasz_score(X_norm, labels))
    metric_db.append(davies_bouldin_score(X_norm, labels))

# One row per candidate number of clusters, one column per score.
cluster_metrics = pd.DataFrame(
    {
        'Silhouette': metric_silhouette,
        'Calinski-Harabasz': metric_ch,
        'Davies-Bouldin': metric_db,
    },
    index=n_clusters,
)

cluster_metrics.round(3)

In [None]:
# One line chart per score, each in its own panel; the x-axis is the frame's
# index, i.e. the number of clusters.
cluster_metrics.plot.line(subplots=True)
plt.show()

"оптимальное" число кластеров

In [None]:
# cluster_metrics is indexed by the number of clusters, so idxmax/idxmin
# return the best value for each score directly.
best_by_silhouette = cluster_metrics['Silhouette'].idxmax()
best_by_ch = cluster_metrics['Calinski-Harabasz'].idxmax()
best_by_db = cluster_metrics['Davies-Bouldin'].idxmin()  # lower is better

print('Silhouette:', best_by_silhouette)
print('Calinski-Harabasz:', best_by_ch)
print('Davies-Bouldin:', best_by_db)

## Оптимальные параметры DBSCAN

In [None]:
# Neighbourhood size heuristic: twice the number of features, minus one.
# The same value is passed to DBSCAN as min_samples further down.
n_features = X_norm.shape[1]
num_neighbors = 2 * n_features - 1

# Query the fitted index with the training points themselves.
# NOTE(review): with X passed explicitly, each point should come back as its
# own nearest neighbour (distance 0) - confirm against the kneighbors docs.
neigh = NearestNeighbors(n_neighbors=num_neighbors).fit(X_norm)
distances, indices = neigh.kneighbors(X_norm)

In [None]:
# Sort every column in ascending order; only the last column is plotted,
# i.e. the sorted distances to the farthest of the num_neighbors neighbours.
distances = np.sort(distances, axis=0)
k_distance = distances[:, -1]

# The bend ("elbow") of this curve is the usual candidate for DBSCAN's eps.
fig, ax = plt.subplots()
ax.plot(k_distance)
ax.set_title('K-distance Graph')
ax.set_xlabel('Data Points sorted by distance')
ax.set_ylabel('Epsilon')
plt.show()

"оптимальное" $\varepsilon\approx 3.2$

In [None]:
# Configure DBSCAN with the eps read off the k-distance graph and the same
# neighbourhood size that was used to build that graph.
# NOTE(review): the accompanying text quotes eps of about 3.2 while 3.3 is
# used here - confirm which value is intended.
cluster = DBSCAN(eps=3.3, min_samples=num_neighbors, metric='euclidean')

# Run the clustering; the label -1 marks noise points.
labels = cluster.fit_predict(X_norm)
is_noise = labels == -1

# Distinct labels among the non-noise points, and the number of noise points.
n_clusters_ = len(set(labels[~is_noise]))
n_noise_ = int(is_noise.sum())

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)