In [18]:
import pandas as pd
import numpy as np
import os
import warnings

# Cap OpenMP at a single thread. This is set before the scikit-learn import
# below — presumably to avoid KMeans' MKL/OpenMP warning on Windows; TODO confirm.
os.environ["OMP_NUM_THREADS"] = "1"
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries used here. Consider
# filtering only the specific categories that are known noise.
warnings.filterwarnings("ignore")

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score

In [19]:
# Read the mall-customer CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../Data/Mall_Customers.csv")
df.head(n=5)


Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [20]:
# CustomerID is a row identifier, not a feature, so remove it before modelling.
# errors="ignore" keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the column is already gone.
df = df.drop(columns="CustomerID", errors="ignore")


In [21]:
# Encode Gender as integers. LabelEncoder numbers the classes in sorted order,
# so "Female" becomes 0 and "Male" becomes 1.
le = LabelEncoder()
df = df.assign(Gender=le.fit_transform(df["Gender"]))


In [22]:
# Columns that feed the clustering, in the order they appear in df.
# Selecting them explicitly (instead of scaling the whole frame) keeps the
# "Cluster" label column, which a later cell adds to df, out of the feature
# matrix when this cell is re-run.
# NOTE(review): in the cluster summary at the end of the notebook, four of the
# five clusters are single-gender, which suggests the standardized binary
# Gender column dominates the distances — confirm it should be a feature.
feature_columns = ["Gender", "Age", "Annual Income (k$)", "Spending Score (1-100)"]

# Standardize each feature to zero mean and unit variance so no single
# column dominates the Euclidean distances purely because of its units.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[feature_columns])


In [23]:
# Score candidate cluster counts k = 2..10: fit KMeans on the scaled features
# and record the mean silhouette coefficient of the resulting labelling.
# silhouette_scores[i] belongs to k = i + 2.
silhouette_scores = []

for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(scaled_data)
    labels = kmeans.labels_
    score = silhouette_score(scaled_data, labels)
    silhouette_scores += [score]

silhouette_scores


[np.float64(0.2776035125578272),
 np.float64(0.2576199805135528),
 np.float64(0.29010917402310876),
 np.float64(0.27191023466188324),
 np.float64(0.3347543475669217),
 np.float64(0.3457397879257699),
 np.float64(0.3727647594212929),
 np.float64(0.3876146244442297),
 np.float64(0.42076374869477745)]

In [24]:
# Fit the final model with five clusters and store each row's cluster label
# on df.
# NOTE(review): k=5 is hard-coded. In the silhouette output above, k=5 scores
# about 0.27 while the score keeps rising up to k=10 (about 0.42) — confirm
# that choosing 5 is intentional.
kmeans = KMeans(n_clusters=5, random_state=42).fit(scaled_data)
df["Cluster"] = kmeans.labels_


In [25]:
# Work on a copy so df keeps its integer-encoded Gender column; the copy gets
# the readable labels back (0 -> "Female", 1 -> "Male").
df_display = df.assign(Gender=df["Gender"].map({0: "Female", 1: "Male"}))

# Per-cluster mean of every remaining column, rounded to two decimals.
numeric_summary = (
    df_display.drop(columns=["Gender"]).groupby("Cluster").mean().round(2)
)

# Per-cluster gender split as percentages: one column per gender, with 0 for
# a gender that does not occur in a cluster.
gender_percentage = (
    (df_display.groupby("Cluster")["Gender"].value_counts(normalize=True) * 100)
    .round(2)
    .unstack(fill_value=0)
)

# Both tables are indexed by Cluster, so a plain index join lines them up.
final_cluster_summary = numeric_summary.join(gender_percentage)
final_cluster_summary


Unnamed: 0_level_0,Age,Annual Income (k$),Spending Score (1-100),Female,Male
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,56.47,46.1,39.31,49.02,50.98
1,39.5,85.15,14.05,0.0,100.0
2,28.69,60.9,70.24,0.0,100.0
3,37.9,82.12,54.45,100.0,0.0
4,27.32,38.84,56.21,100.0,0.0
