# Кластеризация данных: Сравнение KMeans, Agglomerative и GMM

## 1. Импорт библиотек

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
%matplotlib inline

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

## 2. Загрузка и предобработка данных

In [None]:
# Load the tab-separated marketing campaign dataset.
df = pd.read_csv('../data/marketing_campaign.csv', sep='\t')

# Collapse rare category labels into broader groups.
df["Education"] = df["Education"].replace({"2n Cycle": "Pre-Graduate", "Basic": "Pre-Graduate"})
df["Marital_Status"] = df["Marital_Status"].replace({
    "Married": "Married/Together", "Together": "Married/Together",
    "Single": "Single", "Divorced": "Other", "Widow": "Other",
    "Alone": "Other", "Absurd": "Other", "YOLO": "Other"
})

# Source columns that are aggregated below and dropped afterwards.
child_cols = ["Kidhome", "Teenhome"]
spend_cols = ["MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts",
              "MntSweetProducts", "MntGoldProds"]
campaign_cols = ["AcceptedCmp1", "AcceptedCmp2", "AcceptedCmp3", "AcceptedCmp4", "AcceptedCmp5"]
purchase_cols = ["NumWebPurchases", "NumCatalogPurchases", "NumStorePurchases", "NumDealsPurchases"]

# Feature engineering: one total per group of related columns.
df["Kids"] = df["Kidhome"].astype("int8") + df["Teenhome"].astype("int8")
df["Expenses"] = df[spend_cols].sum(axis=1)
df["TotalAcceptedCmp"] = df[campaign_cols].astype("int8").sum(axis=1)
df["TotalNumPurchases"] = df[purchase_cols].sum(axis=1)

# Drop the raw columns that are now represented by the aggregates.
df.drop(columns=[*child_cols, *spend_cols, *campaign_cols, *purchase_cols], inplace=True)

# Binarize the counts with a threshold rather than an explicit value mapping:
# a count missing from such a mapping would stay numeric, leaving a mixed
# int/str column that OneHotEncoder refuses to encode.
df["Kids"] = np.where(df["Kids"] > 0, "Has Kids", "No Kid")
df["TotalAcceptedCmp"] = np.where(df["TotalAcceptedCmp"] > 0, ">0", "0")

num_features = ["Income", "Recency", "NumWebVisitsMonth", "Expenses", "TotalNumPurchases"]
cat_features = ["Education", "Marital_Status", "Response", "Complain", "Kids", "TotalAcceptedCmp"]

# Clean Income: impute missing values with the median, then remove extreme
# outliers. The explicit copy makes df an independent frame, so columns added
# to it later do not trigger pandas' SettingWithCopyWarning.
df['Income'] = df['Income'].fillna(df['Income'].median())
df = df[df['Income'] < 600000].copy()

## 3. Создание Pipeline и обучение моделей

In [None]:
n_clusters = 4

# Scale numeric columns and one-hot encode categorical ones.
# sparse_threshold=0 makes the transformer always return a dense array:
# AgglomerativeClustering and GaussianMixture below do not accept sparse
# input, and the default threshold only yields a dense result while the
# encoded matrix happens to be dense enough.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ],
    sparse_threshold=0,
)

# Main pipeline: preprocessing -> 2-component PCA -> KMeans.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=2)),
    ('cluster', KMeans(n_clusters=n_clusters, random_state=42, n_init=10))
])

# Fit the whole pipeline and store the KMeans labels.
df['Cluster_KM'] = pipeline.fit_predict(df)

# Reuse the fitted steps to obtain the preprocessed matrix and its PCA projection.
X_processed = pipeline.named_steps['preprocessor'].transform(df)
X_pca = pipeline.named_steps['pca'].transform(X_processed)

# NOTE(review): KMeans above is fitted on the 2-D PCA projection, whereas the
# two models below are fitted on the full preprocessed matrix, so the three
# labelings are not produced from the same feature space. Confirm which space
# is intended before drawing conclusions from the comparison.

# Model 2: Agglomerative Clustering
agg = AgglomerativeClustering(n_clusters=n_clusters)
df['Cluster_Agg'] = agg.fit_predict(X_processed)

# Model 3: Gaussian Mixture Model (GMM)
gmm = GaussianMixture(n_components=n_clusters, random_state=42)
df['Cluster_GMM'] = gmm.fit_predict(X_processed)

## 4. Сравнение результатов и визуализация

In [None]:
# Label column produced by each model, in display order.
model_columns = {
    "KMeans": "Cluster_KM",
    "Agglomerative": "Cluster_Agg",
    "GMM": "Cluster_GMM",
}

# Silhouette score of every labeling, measured on the preprocessed matrix.
scores = {
    model_name: silhouette_score(X_processed, df[label_col])
    for model_name, label_col in model_columns.items()
}
score_km = scores["KMeans"]
score_agg = scores["Agglomerative"]
score_gmm = scores["GMM"]

for model_name, model_score in scores.items():
    print(f"Silhouette Score ({model_name}): {model_score:.3f}")

# One scatter plot per model, drawn on the two PCA components.
palettes = ["viridis", "magma", "plasma"]
component_1 = X_pca[:, 0]
component_2 = X_pca[:, 1]

fig, axes = plt.subplots(1, 3, figsize=(20, 6))
for ax, (model_name, label_col), palette in zip(axes, model_columns.items(), palettes):
    sns.scatterplot(x=component_1, y=component_2, hue=df[label_col], palette=palette, ax=ax)
    ax.set_title(f'{model_name} (Score: {scores[model_name]:.2f})')

plt.tight_layout()
plt.show()

## 5. Анализ средних показателей (на примере GMM)

In [None]:
# Mean of each numeric feature within every GMM cluster.
gmm_cluster_means = df.groupby('Cluster_GMM')[num_features].mean()
print("Средние показатели для кластеров GMM:")
display(gmm_cluster_means)