# Кластеризация и преобразование данных

In [None]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.neighbors import NearestNeighbors # расстояние до ближайших соседей для DBSCAN
from sklearn.preprocessing import QuantileTransformer, PowerTransformer # преобразование данных
from sklearn.preprocessing import StandardScaler  # z-нормировка
# from sklearn.preprocessing import MinMaxScaler, RobustScaler, MaxAbsScaler # другие способы нормировки

import seaborn as sns # 2D-визуализация
import matplotlib.pyplot as plt
import plotly.express as px # 3D-визуализация

# Suppress every warning (the base Warning category covers all subclasses)
# so library notices do not clutter the notebook output.
import warnings
warnings.simplefilter('ignore', Warning)

In [None]:
# Load the country table; X is the same data with the 'Страны' column moved
# from the columns into the row index. df itself keeps the column.
df = pd.read_csv('./datasets/countries.csv')
X = df.set_index('Страны')

In [None]:
# Pairwise relationship plots for every column of the untransformed data.
sns.pairplot(data=X)
plt.show()

In [None]:
# Choose the transformation applied to every column before clustering.
# Alternatives to try instead of Yeo-Johnson (Box-Cox needs strictly
# positive input):
# transformer = PowerTransformer(method='box-cox')
# transformer = QuantileTransformer(output_distribution='normal', random_state=0)
transformer = PowerTransformer(method='yeo-johnson')

# fit_transform returns a plain array, so wrap it back into a DataFrame
# with the original column names.
# NOTE(review): X's index is not passed on here, so X_tr gets a default
# integer index rather than the country names.
X_tr = pd.DataFrame(transformer.fit_transform(X), columns=X.columns)


In [None]:
# Same pairwise plots, now for the transformed columns.
sns.pairplot(data=X_tr)
plt.show()

In [None]:
# Специфицируем метод нормировки
scaler = StandardScaler(with_mean=True, with_std=True)
# scaler = MinMaxScaler()
# scaler = RobustScaler()
# scaler = MaxAbsScaler()

X_norm = scaler.fit_transform(X_tr)

In [None]:
# Sweep KMeans over 2..max_n_clusters clusters and record three internal
# quality scores for each partition of X_norm.
max_n_clusters = 10
n_clusters = list(range(2, max_n_clusters + 1))

metric_silhouette, metric_ch, metric_db = [], [], []

for n in n_clusters:
    # Fixed random_state keeps the partitions reproducible between runs.
    cluster = KMeans(n_clusters=n, n_init='auto', random_state=2).fit(X_norm)
    labels = cluster.labels_
    metric_silhouette.append(silhouette_score(X_norm, labels))
    metric_ch.append(calinski_harabasz_score(X_norm, labels))
    metric_db.append(davies_bouldin_score(X_norm, labels))

# One row per cluster count (used as the index), one column per metric.
cluster_metrics = pd.DataFrame(
    {
        'Silhouette': metric_silhouette,
        'Calinski-Harabasz': metric_ch,
        'Davies-Bouldin': metric_db,
    },
    index=n_clusters,
)

cluster_metrics.round(3)

In [None]:
# cluster_metrics is indexed by cluster count, so the index label of the
# best score is the recommended number of clusters. Silhouette and
# Calinski-Harabasz are maximised; Davies-Bouldin is minimised.
print('Silhouette:', cluster_metrics['Silhouette'].idxmax())
print('Calinski-Harabasz:', cluster_metrics['Calinski-Harabasz'].idxmax())
print('Davies-Bouldin:', cluster_metrics['Davies-Bouldin'].idxmin())

In [None]:
# Sweep agglomerative clustering (Euclidean distance) over
# 2..max_n_clusters clusters and record three internal quality scores for
# each partition of X_norm.
max_n_clusters = 10
n_clusters = list(range(2, max_n_clusters + 1))

metric_silhouette, metric_ch, metric_db = [], [], []

for n in n_clusters:
    cluster = AgglomerativeClustering(n_clusters=n, metric='euclidean').fit(X_norm)
    labels = cluster.labels_
    metric_silhouette.append(silhouette_score(X_norm, labels))
    metric_ch.append(calinski_harabasz_score(X_norm, labels))
    metric_db.append(davies_bouldin_score(X_norm, labels))

# One row per cluster count (used as the index), one column per metric.
cluster_metrics = pd.DataFrame(
    {
        'Silhouette': metric_silhouette,
        'Calinski-Harabasz': metric_ch,
        'Davies-Bouldin': metric_db,
    },
    index=n_clusters,
)

cluster_metrics.round(3)

In [None]:
# cluster_metrics is indexed by cluster count, so the index label of the
# best score is the recommended number of clusters. Silhouette and
# Calinski-Harabasz are maximised; Davies-Bouldin is minimised.
print('Silhouette:', cluster_metrics['Silhouette'].idxmax())
print('Calinski-Harabasz:', cluster_metrics['Calinski-Harabasz'].idxmax())
print('Davies-Bouldin:', cluster_metrics['Davies-Bouldin'].idxmin())