In [1]:
import pandas as pd
from pathlib import Path
import hvplot.pandas
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn import metrics


In [2]:
# Load the mall-customer dataset and discard the CustomerID column in one chained step.
data = pd.read_csv('./Mall_Customers.csv').drop(columns='CustomerID')
data.head()


Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns of `data`.
data.describe()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0
mean,38.85,60.56,50.2
std,13.969007,26.264721,25.823522
min,18.0,15.0,1.0
25%,28.75,41.5,34.75
50%,36.0,61.5,50.0
75%,49.0,78.0,73.0
max,70.0,137.0,99.0


In [4]:
# Column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Gender                  200 non-null    object
 1   Age                     200 non-null    int64 
 2   Annual Income (k$)      200 non-null    int64 
 3   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 6.4+ KB


In [5]:
# Number of missing values per column.
data.isna().sum()

Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

In [6]:
# Frequency of each distinct row (combination of all columns).
data.value_counts()

Gender  Age  Annual Income (k$)  Spending Score (1-100)
Female  18   65                  48                        1
Male    29   28                  82                        1
        24   60                  52                        1
        25   24                  73                        1
             77                  12                        1
                                                          ..
Female  41   99                  39                        1
             103                 17                        1
        42   34                  17                        1
        43   48                  50                        1
Male    70   49                  55                        1
Name: count, Length: 200, dtype: int64

In [7]:
# Number of fully duplicated rows. The earlier `value_counts()` call was removed:
# it was not the cell's last expression, so its result was computed and thrown away.
data.duplicated().sum()

0

In [8]:
# One-hot encode Gender and build the unscaled feature matrix used for clustering
# (no standardization or normalization is applied here).
# The numeric feature columns are selected by name so that cluster-label columns
# which later cells add to `data` cannot leak into the features if this cell is re-run.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_array = encoder.fit_transform(data[['Gender']])
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(['Gender']),
    index=data.index,  # keep rows aligned with `data` for the concat below
)
encoded_data_df = pd.concat(
    [data[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']], encoded_df],
    axis=1,
)
encoded_data_df.head()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Gender_Female,Gender_Male
0,19,15,39,0.0,1.0
1,21,15,81,0.0,1.0
2,20,16,6,1.0,0.0
3,23,16,77,1.0,0.0
4,31,17,40,1.0,0.0


In [9]:
# Elbow method: fit K-Means for k = 1..10 on the unscaled encoded features and
# record the inertia (within-cluster sum of squares) of each fit.
# n_init is passed explicitly (10 is the default reported in the warning output)
# so the result stays the same if scikit-learn changes that default, and the
# per-fit n_init warning is no longer emitted.
k = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    .fit(encoded_data_df)
    .inertia_
    for n_clusters in k
]

elbow_data = pd.DataFrame({'k': k, 'inertia': inertia})
elbow_data.hvplot.line(
    x = 'k',
    y = 'inertia',
    title = 'K-Means elbow curve (unscaled features)'
)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [10]:
# K-Means with 4 clusters on the unscaled, one-hot-encoded features.
# n_init is explicit (10 is the default reported in the warning output) to pin
# the behavior across scikit-learn versions and silence the n_init warning.
k_model = KMeans(n_clusters=4, n_init=10, random_state=42)
# fit_predict fits the model and returns the training-set labels in one call.
k_predictions = k_model.fit_predict(encoded_data_df)
data['cust_seg_4'] = k_predictions
data.hvplot.scatter(
    x = 'Annual Income (k$)',
    y = 'Spending Score (1-100)',
    by = 'cust_seg_4'
)

  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
# K-Means with 5 clusters on the unscaled, one-hot-encoded features.
# The previous fit_transform() call computed a distance matrix that was discarded,
# and a mid-cell data.head() was never displayed; both are removed.
# n_init is explicit (10 is the default reported in the warning output).
k_model_2 = KMeans(n_clusters=5, n_init=10, random_state=42)
k_predictions_2 = k_model_2.fit_predict(encoded_data_df)
data['cust_seg_5'] = k_predictions_2
data.hvplot.scatter(
    x = 'Annual Income (k$)',
    y = 'Spending Score (1-100)',
    by = 'cust_seg_5'
)

  super()._check_params_vs_input(X, default_n_init=10)


In [12]:
# K-Means with 6 clusters on the unscaled, one-hot-encoded features.
# fit_predict replaces fit_transform + predict: the transformed distances were
# never used. n_init is explicit (10 is the default reported in the warning output).
k_model_3 = KMeans(n_clusters=6, n_init=10, random_state=42)
k_predictions_3 = k_model_3.fit_predict(encoded_data_df)
data['cust_seg_6'] = k_predictions_3
data.hvplot.scatter(
    x = 'Annual Income (k$)',
    y = 'Spending Score (1-100)',
    by = 'cust_seg_6'
)

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# The KMeans model without any standardization or normalization, using 5 clusters, seems to work best.

In [17]:
# StandardScaler + K-Means (4 clusters).
# K-Means assumes roughly spherical, similarly sized clusters, so it may benefit
# from standard scaling of the numeric features.
scaler = StandardScaler()
# Take the numeric columns straight from `data` by name instead of re-reading the
# CSV; selecting by name ignores the cluster-label columns added to `data` earlier.
data_copy = data[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].copy()
standard_data = scaler.fit_transform(data_copy)
s_data_df = pd.DataFrame(standard_data, columns=data_copy.columns)
# The one-hot gender columns are appended unscaled.
standard_data_df = pd.concat([s_data_df, encoded_df], axis=1)

# n_init is explicit (10 is the default reported in the warning output);
# fit_predict replaces the fit_transform + predict pair whose transform output was unused.
k_standard_model = KMeans(n_clusters=4, n_init=10, random_state=78)
k_standard_pred = k_standard_model.fit_predict(standard_data_df)
data['standard_cust_seg_4'] = k_standard_pred
data.hvplot.scatter(
    x = 'Annual Income (k$)',
    y = 'Spending Score (1-100)',
    by = 'standard_cust_seg_4'
)


  super()._check_params_vs_input(X, default_n_init=10)


In [18]:
# K-Means (5 clusters) on the standard-scaled features.
# n_init is explicit (10 is the default reported in the warning output);
# fit_predict replaces fit_transform + predict, whose transform output was discarded.
k_standard_model_2 = KMeans(n_clusters=5, n_init=10, random_state=78)
k_standard_pred_2 = k_standard_model_2.fit_predict(standard_data_df)
data['standard_cust_seg_5'] = k_standard_pred_2
data.hvplot.scatter(
    x = 'Annual Income (k$)',
    y = 'Spending Score (1-100)',
    by = 'standard_cust_seg_5'
)

  super()._check_params_vs_input(X, default_n_init=10)


In [19]:
# K-Means (6 clusters) on the standard-scaled features.
# n_init is explicit (10 is the default reported in the warning output);
# fit_predict replaces fit_transform + predict, whose transform output was discarded.
k_standard_model_3 = KMeans(n_clusters=6, n_init=10, random_state=78)
k_standard_pred_3 = k_standard_model_3.fit_predict(standard_data_df)
data['standard_cust_seg_6'] = k_standard_pred_3
data.hvplot.scatter(
    x = 'Annual Income (k$)',
    y = 'Spending Score (1-100)',
    by = 'standard_cust_seg_6'
)

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# Clustering on the standard-scaled data performed worse than clustering on the encoded (unscaled) data alone.

In [26]:
# MinMaxScaler + K-Means (4 clusters).
# The numeric columns in `data_copy` are rescaled; the one-hot gender columns
# are appended unscaled.
min_scaler = MinMaxScaler()
min_data = min_scaler.fit_transform(data_copy)
# Reuse the column labels of the frame that was scaled instead of retyping them.
m_data_df = pd.DataFrame(min_data, columns=data_copy.columns)
min_data_df = pd.concat([m_data_df, encoded_df], axis=1)

# n_init is explicit (10 is the default reported in the warning output);
# fit_predict replaces fit_transform + predict, whose transform output was discarded.
k_min_model = KMeans(n_clusters=4, n_init=10, random_state=78)
k_min_pred = k_min_model.fit_predict(min_data_df)
data['min_cust_seg_4'] = k_min_pred
data.hvplot.scatter(
    x = 'Annual Income (k$)',
    y = 'Spending Score (1-100)',
    by = 'min_cust_seg_4'
)

  super()._check_params_vs_input(X, default_n_init=10)


In [27]:
# K-Means (5 clusters) on the min-max-scaled features.
# n_init is explicit (10 is the default reported in the warning output);
# fit_predict replaces fit_transform + predict, whose transform output was discarded.
k_min_model_2 = KMeans(n_clusters=5, n_init=10, random_state=78)
k_min_pred_2 = k_min_model_2.fit_predict(min_data_df)
data['min_cust_seg_5'] = k_min_pred_2
data.hvplot.scatter(
    x = 'Annual Income (k$)',
    y = 'Spending Score (1-100)',
    by = 'min_cust_seg_5'
)

  super()._check_params_vs_input(X, default_n_init=10)


In [28]:
# K-Means (6 clusters) on the min-max-scaled features.
# n_init is explicit (10 is the default reported in the warning output);
# fit_predict replaces fit_transform + predict, whose transform output was discarded.
k_min_model_3 = KMeans(n_clusters=6, n_init=10, random_state=78)
k_min_pred_3 = k_min_model_3.fit_predict(min_data_df)
data['min_cust_seg_6'] = k_min_pred_3
data.hvplot.scatter(
    x = 'Annual Income (k$)',
    y = 'Spending Score (1-100)',
    by = 'min_cust_seg_6'
)

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# The KMeans model with MinMaxScaler did not perform well with 4, 5, or 6 clusters either.

In [29]:
# Birch, 4 clusters, fitted on the unscaled encoded features.
birch_model = Birch(n_clusters=4).fit(encoded_data_df)
# Labels assigned to the training rows by the fit.
birch_model_predictions = birch_model.labels_
data['birch_cust_seg_4'] = birch_model_predictions
data.hvplot.scatter(x='Annual Income (k$)', y='Spending Score (1-100)', by='birch_cust_seg_4')

In [31]:
# Birch, 5 clusters, fitted on the unscaled encoded features.
birch_model_2 = Birch(n_clusters=5).fit(encoded_data_df)
# Labels assigned to the training rows by the fit.
birch_model_predictions_2 = birch_model_2.labels_
data['birch_cust_seg_5'] = birch_model_predictions_2
data.hvplot.scatter(x='Annual Income (k$)', y='Spending Score (1-100)', by='birch_cust_seg_5')

In [32]:
# Birch, 6 clusters, fitted on the unscaled encoded features.
birch_model_3 = Birch(n_clusters=6).fit(encoded_data_df)
# Labels assigned to the training rows by the fit.
birch_model_predictions_3 = birch_model_3.labels_
data['birch_cust_seg_6'] = birch_model_predictions_3
data.hvplot.scatter(x='Annual Income (k$)', y='Spending Score (1-100)', by='birch_cust_seg_6')

In [39]:
# Birch model with standard-scaled features and 5 clusters.
birch_standard_model = Birch(n_clusters = 5)
birch_standard_pred = birch_standard_model.fit_predict(standard_data_df)
# Store the labels on `data`: the scatter below groups by this column, and
# previously it was never created, so the plot referenced a missing column.
data['birch_standard_cust_seg_5'] = birch_standard_pred
data.hvplot.scatter(
    x = 'Annual Income (k$)',
    y = 'Spending Score (1-100)',
    by = 'birch_standard_cust_seg_5'
)

In [37]:
# Agglomerative clustering, 5 clusters, on the unscaled encoded features.
agglo_model = AgglomerativeClustering(n_clusters=5).fit(encoded_data_df)
# Labels assigned to the training rows by the fit.
agglo_model_predictions = agglo_model.labels_
data['a_cust_seg_5'] = agglo_model_predictions
data.hvplot.scatter(x='Annual Income (k$)', y='Spending Score (1-100)', by='a_cust_seg_5')

In [40]:
# Agglomerative clustering, 5 clusters, on the standard-scaled features.
agglo_standard_model = AgglomerativeClustering(n_clusters=5).fit(standard_data_df)
# Labels assigned to the training rows by the fit.
agglo_standard_pred = agglo_standard_model.labels_
data['a_standard_cust_seg_5'] = agglo_standard_pred
data.hvplot.scatter(x='Annual Income (k$)', y='Spending Score (1-100)', by='a_standard_cust_seg_5')

In [None]:
# Using only the one-hot-encoded data (OneHotEncoder.fit_transform, with no scaling) works better for the KMeans, Birch, and Agglomerative clustering algorithms.

In [41]:
# Calinski-Harabasz score for KMeans, AgglomerativeClustering, and Birch,
# each fitted on the unscaled encoded features with 4, 5 and 6 clusters.
k = list(range(4, 7))


def _ch_score(model):
    """Fit `model` on `encoded_data_df` and return its Calinski-Harabasz score.

    `model` is an unfitted scikit-learn clusterer; the labels it assigns to the
    training rows are scored against the same feature matrix.
    """
    labels = model.fit_predict(encoded_data_df)
    return metrics.calinski_harabasz_score(encoded_data_df, labels)


# n_init is explicit (10 is the default reported in the warning output) so the
# KMeans scores do not shift if scikit-learn changes that default.
score_kmeans = [
    _ch_score(KMeans(n_clusters=n_clusters, n_init=10, random_state=0))
    for n_clusters in k
]
score_agglomerative = [
    _ch_score(AgglomerativeClustering(n_clusters=n_clusters)) for n_clusters in k
]
score_birch = [_ch_score(Birch(n_clusters=n_clusters)) for n_clusters in k]



  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [42]:
# Calinski-Harabasz scores for KMeans, one per cluster count in `k`.
score_kmeans

[127.8658764903549, 150.84889824848167, 166.44672420825694]

In [43]:
# Calinski-Harabasz scores for AgglomerativeClustering, one per cluster count in `k`.
score_agglomerative

[124.18985945299062, 147.4871493864345, 159.20181693659265]

In [44]:
# Calinski-Harabasz scores for Birch, one per cluster count in `k`.
score_birch

[125.59313686337879, 148.53845639041177, 158.41746114166995]

In [None]:
# KMeans achieves the highest Calinski-Harabasz score at every cluster count tested (4, 5, and 6), so it seems to perform best by this metric.

In [None]:
#Hierarchical clustering uses different linkage methods that affect the distance calculation, so it might benefit from normalizing the data. Additionally, if you want to visualize or report the cluster centroids, min-max scaling may be preferable because it preserves the original range and units of the data; if you want to perform further analysis or modeling on the clustered data, standard scaling may be more suitable because it facilitates comparison and combination of different variables.