
# 🧪 K-means Clustering on Wholesale Customers Dataset

This notebook demonstrates how to apply **K-means clustering** to the **UCI Wholesale Customers** dataset.

## 📋 Steps:
- Load and preprocess the data
- Apply K-means clustering with different k values
- Evaluate using **Silhouette Score** and **Davies-Bouldin Index**
- Visualize clusters using **PCA**


In [None]:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt

# Read the Wholesale Customers CSV, expected to sit in the same directory
# as this notebook.
df = pd.read_csv("Wholesale customers data.csv")

# Region and Channel are categorical columns that the clustering does not
# need, so only the remaining columns are kept as features.
X = df.drop(columns=['Region', 'Channel'])

# Rescale every feature to the [0, 1] range.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:

# Compute the silhouette score of a K-means fit for each k from 2 to 10.
k_values = range(2, 11)
silhouette_scores = []
for k in k_values:
    # n_init is pinned explicitly: scikit-learn changed its default from 10
    # to 'auto' in version 1.4, so leaving it implicit makes the scores
    # depend on the installed version (and raises a FutureWarning on 1.2/1.3).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    silhouette_scores.append(score)

# Plot the silhouette score against k.
plt.plot(k_values, silhouette_scores, marker='o')
plt.title("Silhouette Score vs Number of Clusters (k)")
plt.xlabel("k")
plt.ylabel("Silhouette Score")
plt.grid(True)
plt.show()


In [None]:

# Pick the k with the highest silhouette score. The scores were collected
# starting at k = 2, so the list index is shifted by 2 to recover k.
best_k = silhouette_scores.index(max(silhouette_scores)) + 2
print(f"Best k based on silhouette score: {best_k}")

# Refit with best_k. n_init is pinned explicitly: scikit-learn changed its
# default from 10 to 'auto' in version 1.4, so leaving it implicit makes the
# final labels depend on the installed version.
kmeans_final = KMeans(n_clusters=best_k, n_init=10, random_state=42)
labels_final = kmeans_final.fit_predict(X_scaled)

# Reduce to 2D with PCA. This projection is used for plotting only; the
# clustering above was fitted on the full scaled feature matrix.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter plot of the two PCA components, coloured by cluster label.
plt.figure(figsize=(8,6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels_final, cmap='viridis')
plt.title(f"K-means Clustering with k={best_k}")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.show()


In [None]:

# Compute the Davies-Bouldin Index for the final cluster labels and print it
# rounded to two decimals.
db_score = davies_bouldin_score(X=X_scaled, labels=labels_final)
print(f"Davies-Bouldin Index: {db_score:.2f}")
