In [43]:
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [44]:
# Read the shopper feature table from disk and preview its first five rows.
# NOTE(review): the file name suggests the values were already scaled with
# StandardScaler -- confirm against the notebook that produced the CSV.
shopper_df = pd.read_csv(
    filepath_or_buffer='shopper_data_scaled_with_StandardScaler.csv',
)
shopper_df.head(n=5)

Unnamed: 0,card_member,Age,annual_income,spending_score
0,1.128152,-1.424569,-1.738999,-0.434801
1,1.128152,-1.281035,-1.738999,1.195704
2,-0.886405,-1.352802,-1.70083,-1.715913
3,-0.886405,-1.137502,-1.70083,1.040418
4,-0.886405,-0.563369,-1.66266,-0.39598


In [45]:
# Elbow-method sweep: fit K-Means for k = 1..10 and record each model's inertia
# (within-cluster sum of squared distances).
#
# Fit only on the four feature columns that get_clusters() uses. get_clusters()
# adds label columns to shopper_df in place, so fitting on the whole frame would
# treat those labels as features if this cell is re-run later.
feature_columns = ['card_member', 'Age', 'annual_income', 'spending_score']
k_values = list(range(1,11))
inertia = []
for i in k_values:
    km_model = KMeans(n_clusters = i, random_state = 0)
    km_model.fit(shopper_df[feature_columns])
    inertia.append(km_model.inertia_)
print(inertia)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



[800.0000000000003, 588.8026767824932, 476.7875544135158, 386.71478577406884, 325.44857048543486, 275.1950046939481, 236.25761095237866, 200.31388217165548, 174.12493921599284, 152.02983429775693]


In [46]:
# Pair every candidate k with its inertia and draw the elbow curve,
# with one x-axis tick per k value tried.
elbow_data = dict(k_val=k_values, inertia=inertia)
elbow_df = pd.DataFrame(data=elbow_data)
elbow_df.hvplot.line(
    x='k_val',
    y='inertia',
    title='Elbow Plot',
    xticks=k_values,
)

In [47]:
def get_clusters(data_df, k_value):
    """Cluster the shopper features with K-Means and record the labels.

    Fits a ``KMeans`` model (``random_state=0``) on the columns
    ``card_member``, ``Age``, ``annual_income`` and ``spending_score`` and
    stores each row's cluster label in a column named
    ``'predictions k =<k_value>'``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame containing the four feature columns listed above. It is
        modified in place: the label column is added, or overwritten if a
        column with that name already exists.
    k_value : int
        Number of clusters to fit.

    Returns
    -------
    pandas.DataFrame
        The same ``data_df`` object that was passed in, now carrying the
        label column.
    """
    feature_columns = ['card_member', 'Age', 'annual_income', 'spending_score']
    km_model = KMeans(n_clusters = k_value, random_state = 0)
    km_model.fit(data_df[feature_columns])
    # fit() already exposes the training-row assignments as labels_, so no
    # separate predict() pass over the same rows is needed.
    column_name = 'predictions k =' + str(k_value)
    data_df[column_name] = km_model.labels_
    return data_df


In [48]:
# Cluster with six groups; this adds the 'predictions k =6' column to
# shopper_df in place and displays the returned frame.
get_clusters(data_df=shopper_df, k_value=6)

Unnamed: 0,card_member,Age,annual_income,spending_score,predictions k =6
0,1.128152,-1.424569,-1.738999,-0.434801,3
1,1.128152,-1.281035,-1.738999,1.195704,3
2,-0.886405,-1.352802,-1.700830,-1.715913,1
3,-0.886405,-1.137502,-1.700830,1.040418,1
4,-0.886405,-0.563369,-1.662660,-0.395980,1
...,...,...,...,...,...
195,-0.886405,-0.276302,2.268791,1.118061,5
196,-0.886405,0.441365,2.497807,-0.861839,0
197,1.128152,-0.491602,2.497807,0.923953,5
198,1.128152,-0.491602,2.917671,-1.250054,0


In [40]:
# Cluster again with five groups; this adds the 'predictions k =5' column to
# shopper_df in place and displays the returned frame.
get_clusters(data_df=shopper_df, k_value=5)

Unnamed: 0,card_member,Age,annual_income,spending_score,predictions,predictions k =6,predictions k =5
0,1.128152,-1.424569,-1.738999,-0.434801,3,3,4
1,1.128152,-1.281035,-1.738999,1.195704,3,3,4
2,-0.886405,-1.352802,-1.700830,-1.715913,1,1,2
3,-0.886405,-1.137502,-1.700830,1.040418,1,1,0
4,-0.886405,-0.563369,-1.662660,-0.395980,1,1,2
...,...,...,...,...,...,...,...
195,-0.886405,-0.276302,2.268791,1.118061,5,5,0
196,-0.886405,0.441365,2.497807,-0.861839,0,0,3
197,1.128152,-0.491602,2.497807,0.923953,5,5,4
198,1.128152,-0.491602,2.917671,-1.250054,0,0,3


In [49]:
# 2-D view of the six-cluster result: annual income against spending score,
# with points grouped by their 'predictions k =6' label.
shopper_df.hvplot.scatter(
    x='annual_income',
    y='spending_score',
    by='predictions k =6',
)

In [51]:
# 3-D view of the six-cluster result across age, spending score and income.
# The cluster label drives both the marker colour and the marker symbol.
fig = px.scatter_3d(
    shopper_df,
    x='Age',
    y='spending_score',
    z='annual_income',
    color='predictions k =6',
    symbol='predictions k =6',
    width=800,
)
# Pin the legend to position x=0, y=1.
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()