In [None]:
# ============================
# K-MEANS CLUSTERING EXAMPLE
# Wholesale customers dataset
# ============================

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# --- Step 1: read the wholesale-customers CSV from its GitHub raw URL ---
url = (
    "https://raw.githubusercontent.com/akay6483/mmml-ecommerce/main/"
    "dataset/Wholesale%20customers%20data.csv"
)
df = pd.read_csv(url)

# A stale 'index' column is not a feature; remove it when present and
# do nothing when the CSV has no such column.
df = df.drop(columns=['index'], errors='ignore')

# --- Step 2: the six spending columns that K-Means will cluster on ---
features = [
    'Fresh',
    'Milk',
    'Grocery',
    'Frozen',
    'Detergents_Paper',
    'Delicassen',
]
X = df.loc[:, features]

# --- Step 3: rescale every feature to zero mean / unit variance ---
# K-Means is distance based, so unscaled columns with large magnitudes
# would otherwise dominate the cluster assignment.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# --- Step 4: fit K-Means (minimizes within-cluster variance) ---
# k is fixed here; the original note says it was picked via a
# silhouette/elbow analysis that is not part of this cell.
k = 3
kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42).fit(X_scaled)
labels = kmeans.labels_

# --- Step 5: store each row's cluster id next to the raw (unscaled) data ---
df = df.assign(Cluster_KMeans=labels)

# --- Step 6: report cluster sizes and per-cluster average spending ---
cluster_sizes = df['Cluster_KMeans'].value_counts()
print("Cluster counts:", cluster_sizes, sep="\n")

# Means are taken on the original units so the profile stays interpretable.
cluster_profile = df.groupby('Cluster_KMeans')[features].mean()
print("\nCluster-wise mean spending (cluster profile):")
print(cluster_profile)

# --- Step 7 (optional): silhouette score as a partition-quality indicator ---
sil = silhouette_score(X_scaled, labels)
print(f"\nSilhouette score for k={k}: {sil:.3f}")


Cluster counts:
Cluster_KMeans
0    350
1     53
2     37
Name: count, dtype: int64

Cluster-wise mean spending (cluster profile):
                       Fresh          Milk       Grocery       Frozen  \
Cluster_KMeans                                                          
0                8935.500000   4228.528571   5848.034286  2167.231429   
1               34540.113208   5860.358491   6122.622642  9841.735849   
2                8704.864865  20534.405405  30466.243243  1932.621622   

                Detergents_Paper   Delicassen  
Cluster_KMeans                                 
0                    1913.605714  1102.120000  
1                     981.471698  3664.245283  
2                   14758.837838  2459.351351  

Silhouette score for k=3: 0.458
