In [180]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set the default template to 'plotly_dark' for all Plotly figures
import plotly.io as pio
pio.templates.default = 'plotly_dark'

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

In [181]:
path = '../data/clean/cleaned.csv'
df = pd.read_csv(path).set_index('id')
df.head()

Unnamed: 0_level_0,age,gender,income,spending_score,membership_years,purchase_frequency,preferred_category,last_purchase_amount,age_range,income_level,spending_score_category,purchase_frequency_category,membership_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,38,Female,99342,90,3,24,Groceries,113.53,"(30, 40]",Medium,Premium,Medium,Bronze
2,21,Female,78852,60,2,42,Sports,41.93,"(20, 30]",Medium,High,High,Bronze
3,60,Female,126573,30,2,28,Clothing,424.36,"(50, 60]",High,Medium,Medium,Bronze
4,40,Other,47099,74,9,5,Home & Garden,991.93,"(30, 40]",Low,High,Low,Gold
5,65,Female,140621,21,3,25,Electronics,347.08,"(60, 70]",High,Low,Medium,Bronze


In [182]:
df.dtypes

age                              int64
gender                          object
income                           int64
spending_score                   int64
membership_years                 int64
purchase_frequency               int64
preferred_category              object
last_purchase_amount           float64
age_range                       object
income_level                    object
spending_score_category         object
purchase_frequency_category     object
membership_score                object
dtype: object

### Test 1: Kmeans clustering with encoded categorical features

In [183]:
# Use LabelEncoder to encode the categorical groupings
le = LabelEncoder()

feature_cols = df.select_dtypes('object').columns

# Create the labeled features
features = pd.DataFrame()
for col in feature_cols:
    features[f'{col}_enc'] = le.fit_transform(df[col])

# Scale the features
ss = StandardScaler()
features_scaled = ss.fit_transform(features)

# Range of k values to test
k_values = range(1, 11)
wcss = []

# Run KMeans for each k and calculate WCSS
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(features_scaled)
    wcss.append(kmeans.inertia_)  # Inertia is the WCSS

res = np.array([k_values, wcss]).T

# Plot the elbow plot
fig = px.line(res, x=res[:,0], y=res[:,1])
fig.show()

In [184]:
# Based on the above elbow plot, the ideal number of clusters is 3 (or 4).
kmeans = KMeans(n_clusters=7, random_state=0)
df['cluster'] = kmeans.fit_predict(features_scaled)
silhouette_score(features_scaled, df['cluster'])

np.float64(0.12363039158859411)

In [185]:
display(df.groupby('cluster')[['age', 'income', 'spending_score', 'membership_years', 'purchase_frequency', 'last_purchase_amount']].mean().round(0))
display(df.groupby('cluster')[['gender', 'preferred_category', 'age_range', 'income_level', 'spending_score_category', 'purchase_frequency_category', 'membership_score']].agg(pd.Series.mode))

Unnamed: 0_level_0,age,income,spending_score,membership_years,purchase_frequency,last_purchase_amount
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,40.0,76966.0,45.0,6.0,24.0,450.0
1,47.0,79902.0,57.0,6.0,33.0,509.0
2,50.0,103856.0,49.0,5.0,35.0,489.0
3,34.0,80022.0,61.0,5.0,22.0,527.0
4,44.0,122032.0,50.0,5.0,21.0,465.0
5,52.0,75204.0,47.0,6.0,21.0,561.0
6,40.0,76752.0,46.0,5.0,30.0,462.0


Unnamed: 0_level_0,gender,preferred_category,age_range,income_level,spending_score_category,purchase_frequency_category,membership_score
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Female,Sports,"(40, 50]",Medium,Medium,Low,Silver
1,Other,Clothing,"[(40, 50], (50, 60]]",Medium,Premium,High,Silver
2,Male,Sports,"(50, 60]",High,Low,High,Bronze
3,Male,Groceries,"(20, 30]",Medium,Premium,Medium,Bronze
4,Male,Electronics,"(50, 60]",High,Medium,Medium,Silver
5,Other,Sports,"(60, 70]",Medium,High,Medium,Silver
6,Female,Clothing,"(20, 30]",Medium,High,High,Bronze


### Test 2: Kmeans clustering with numerical features

In [186]:
features = df.select_dtypes('number')

# Scale the features
ss = StandardScaler()
features_scaled = ss.fit_transform(features)

# Range of k values to test
k_values = range(1, 11)
wcss = []

# Run KMeans for each k and calculate WCSS
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(features_scaled)
    wcss.append(kmeans.inertia_)  # Inertia is the WCSS

res = np.array([k_values, wcss]).T

# Plot the elbow plot
fig = px.line(res, x=res[:,0], y=res[:,1])
fig.show()

In [187]:
# Based on the above elbow plot, the ideal number of clusters is 3 (or 4).
kmeans = KMeans(n_clusters=5, random_state=0)
df['cluster'] = kmeans.fit_predict(features_scaled)
silhouette_score(features_scaled, df['cluster'])

np.float64(0.12125846458732488)

### Test 3: Kmeans with PCA

In [188]:
features = df.select_dtypes('number')

# Scale the features
ss = StandardScaler()
features_scaled = ss.fit_transform(features)

# Apply dimension reduction (PCA)
pca = PCA(n_components=2)
features_reduced = pca.fit_transform(features_scaled)

# Range of k values to test
k_values = range(1, 11)
wcss = []

# Run KMeans for each k and calculate WCSS
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(features_reduced)
    wcss.append(kmeans.inertia_)  # Inertia is the WCSS

res = np.array([k_values, wcss]).T

# Plot the elbow plot
fig = px.line(res, x=res[:,0], y=res[:,1])
fig.show()

In [189]:
# Based on the above elbow plot, the ideal number of clusters is 3 (or 4).
kmeans = KMeans(n_clusters=3, random_state=0)
df['cluster'] = kmeans.fit_predict(features_reduced)
silhouette_score(features_reduced, df['cluster'])

np.float64(0.39329558513424484)

In [190]:
frdf = pd.DataFrame(features_reduced)
frdf['cluster'] = kmeans.fit_predict(features_reduced)
frdf

Unnamed: 0,0,1,cluster
0,1.061983,-1.224977,1
1,0.611551,-1.802815,1
2,0.427081,0.012539,1
3,0.034066,1.618647,2
4,0.397550,-0.162549,1
...,...,...,...
995,1.019813,-0.114909,1
996,-0.709358,-0.215477,0
997,-1.274178,-2.143864,0
998,-0.151515,-1.231066,1


In [191]:
fig = px.scatter_matrix(frdf, dimensions=[0, 1], color='cluster')
fig.show()

In [192]:
print("Mean based summary: numerical features")
display(df.groupby('cluster')[['age', 'income', 'spending_score', 'membership_years', 'purchase_frequency', 'last_purchase_amount']].mean().round(0))
print("Median based summary: numerical features")
display(df.groupby('cluster')[['age', 'income', 'spending_score', 'membership_years', 'purchase_frequency', 'last_purchase_amount']].median().round(0))
print("Mode based summary: categorical features")
display(df.groupby('cluster')[['gender', 'preferred_category', 'age_range', 'income_level', 'spending_score_category', 'purchase_frequency_category', 'membership_score']].agg(pd.Series.mode))

Mean based summary: numerical features


Unnamed: 0_level_0,age,income,spending_score,membership_years,purchase_frequency,last_purchase_amount
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,45.0,86525.0,53.0,7.0,37.0,479.0
1,37.0,103324.0,50.0,4.0,19.0,334.0
2,51.0,73114.0,47.0,4.0,19.0,716.0


Median based summary: numerical features


Unnamed: 0_level_0,age,income,spending_score,membership_years,purchase_frequency,last_purchase_amount
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,45.0,84764.0,55.0,8.0,38.0,492.0
1,33.0,105600.0,48.0,4.0,18.0,284.0
2,54.0,67704.0,46.0,4.0,17.0,762.0


Mode based summary: categorical features


Unnamed: 0_level_0,gender,preferred_category,age_range,income_level,spending_score_category,purchase_frequency_category,membership_score
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Male,Home & Garden,"(40, 50]",Medium,Premium,High,Gold
1,Female,Electronics,"(20, 30]",Medium,Medium,Low,Bronze
2,Other,Sports,"(60, 70]",Medium,Medium,Low,Bronze


In [193]:
fig = px.scatter_matrix(features, dimensions=['spending_score', 'purchase_frequency'], color='cluster')
fig.show()

In [194]:
df.to_csv('../data/clean/clustered.csv', index=True)