
# 🧪 K-means Clustering on Wholesale Customers Dataset

This notebook demonstrates how to apply **K-means clustering** to the **UCI Wholesale Customers** dataset.

## 📋 Steps:
- Load and preprocess the data
- Apply K-means clustering for k = 2 to 10 and select the best k using the **Silhouette Score**
- Evaluate the selected clustering with the **Davies-Bouldin Index**
- Visualize clusters using **PCA**


In [None]:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt

# Read the dataset from the CSV file expected in the notebook's working directory
df = pd.read_csv("Wholesale customers data.csv")

# Exclude Region and Channel: the notebook treats them as categorical
# identifiers that are not needed as clustering features
X = df.drop(columns=['Region', 'Channel'])

# Rescale every remaining feature to the [0, 1] range
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [None]:

# Compute the silhouette score for every k from 2 to 10.
# The candidate range is defined once so the loop and the plot cannot drift apart.
k_values = range(2, 11)
silhouette_scores = []
for k in k_values:
    # n_init is set explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4 (and warns about it on 1.2/1.3), so relying on the
    # default makes the scores depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    silhouette_scores.append(score)

# Plot the silhouette score against the number of clusters
plt.plot(k_values, silhouette_scores, marker='o')
plt.title("Silhouette Score vs Number of Clusters (k)")
plt.xlabel("k")
plt.ylabel("Silhouette Score")
plt.grid(True)
plt.show()


In [None]:

# Choose the k with the highest silhouette score.
# silhouette_scores[0] belongs to k = 2 (the sweep starts at 2), hence the +2 offset.
best_k = silhouette_scores.index(max(silhouette_scores)) + 2
print(f"Best k based on silhouette score: {best_k}")

# Refit with best_k. n_init is set explicitly because scikit-learn changed its
# default from 10 to 'auto' in version 1.4, which would otherwise make the
# final labels depend on the installed version.
kmeans_final = KMeans(n_clusters=best_k, n_init=10, random_state=42)
labels_final = kmeans_final.fit_predict(X_scaled)

# Project the scaled features onto the first two principal components (for plotting only;
# the clustering itself was fitted on the full scaled feature space)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter plot of the 2-D projection, coloured by cluster label
plt.figure(figsize=(8,6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels_final, cmap='viridis')
# Legend maps each colour to its cluster label so the figure can be read on its own
plt.legend(*scatter.legend_elements(), title="Cluster")
plt.title(f"K-means Clustering with k={best_k}")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.show()


In [None]:

# Davies-Bouldin Index of the final clustering, computed on the scaled features
# (per the scikit-learn documentation, lower values indicate better separation)
db_score = davies_bouldin_score(X=X_scaled, labels=labels_final)
print(f"Davies-Bouldin Index: {db_score:.2f}")
