In [None]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans

import plotly.express as px
import plotly.graph_objs as go

In [None]:
# Load the customer data from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="SelfShiksha_ULB_FC47_Mall_Customers.csv")

In [None]:
# Display the loaded DataFrame to inspect its columns and values.
df

In [None]:
# The CustomerID column is not relevant for the analysis, so remove it.
# The result is reassigned (rather than dropped with inplace=True) and
# errors="ignore" is passed so that this cell is safe to re-run: a second
# execution is a no-op instead of a KeyError for the already-missing column.
df = df.drop(columns="CustomerID", errors="ignore")

In [None]:
# Display the DataFrame again to confirm that CustomerID is no longer present.
df

In [None]:
# Scatter plot to check for a relation between annual income and spending score.
income_col = "Annual Income (k$)"
score_col = "Spending Score (1-100)"

fig = px.scatter(x=df[income_col], y=df[score_col])
fig.update_layout(xaxis_title=income_col, yaxis_title=score_col)
fig.show()

In [None]:
# Scatter plot to check for a relation between age and spending score.
axis_titles = dict(xaxis_title="Age", yaxis_title="Spending Score (1-100)")

fig = px.scatter(
    x=df["Age"],
    y=df["Spending Score (1-100)"],
)
fig.update_layout(**axis_titles)
fig.show()

In [None]:
# Scatter plot to check for a relation between age and annual income.
ages = df["Age"]
incomes = df["Annual Income (k$)"]

fig = px.scatter(x=ages, y=incomes)
fig.update_layout(xaxis_title="Age", yaxis_title="Annual Income (k$)")
fig.show()

In [None]:
# From the plots, there does not seem to be any clear relation between these 3 numerical variables.
# Let's also compute the correlation values to be sure.
# We can see that all the off-diagonal values are very close to zero,
# except for the age and spending score which have a small negative correlation with each other.
#
# Only the numeric columns are correlated: df still contains the Gender column
# (assumed to hold non-numeric values), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, whereas older versions silently skipped
# such columns. Selecting numeric dtypes first gives the same table on both.

df.select_dtypes(include="number").corr()

Does Gender have a significant bearing on the three numerical variables?

In [None]:
# Summary statistics over the whole dataset, as a baseline for the
# per-gender breakdowns in the following cells.
df.describe()

In [None]:
# Summary statistics of Age, computed separately for each Gender.
df.groupby("Gender")["Age"].describe()

In [None]:
# Summary statistics of annual income, computed separately for each Gender.
df.groupby("Gender")["Annual Income (k$)"].describe()

In [None]:
# Summary statistics of spending score, computed separately for each Gender.
df.groupby("Gender")["Spending Score (1-100)"].describe()

In [None]:
# 3D view of the three numerical variables, with points coloured by Gender.
fig = px.scatter_3d(
    data_frame=df,
    x="Age",
    y="Annual Income (k$)",
    z="Spending Score (1-100)",
    color="Gender",
)
fig.show()

In [None]:
# Visual inspection of the 3D plot suggests that clusters can be formed from
# the annual income and spending score columns, so only these two columns
# are used for the rest of the analysis.
feature_columns = ["Annual Income (k$)", "Spending Score (1-100)"]

# X is a 2-column NumPy array: column 0 = annual income, column 1 = spending score.
X = np.array(df[feature_columns])

In [None]:
# Plot the two selected features against each other.
# X is a plain NumPy array and carries no column names, so the axis titles are
# set explicitly; without them the axes are labelled only "x" and "y".
# Column 0 of X is annual income and column 1 is spending score.
fig = px.scatter(x=X[:,0],y=X[:,1])
fig.update_layout(xaxis_title="Annual Income (k$)",yaxis_title="Spending Score (1-100)")
fig.show()

In [None]:
# Set up k-means and use it to assign a cluster label to every customer.
# Input features should normally be scaled first, but annual income and
# spending score have a similar range here, so scaling is skipped.
#
# KMeans arguments:
#   n_clusters    - number of clusters to form
#   init, n_init  - help in finding a good initialisation for the cluster centers
#   max_iter      - maximum number of iterations to perform
#   random_state  - fixed so that every run of this code gives the same result
kmeans = KMeans(
    n_clusters=3,
    init="k-means++",
    max_iter=300,
    n_init=10,
    random_state=0,
)

# Fit on X; y is the predicted cluster label for each point in the dataset.
y = kmeans.fit(X).labels_

In [None]:
# Display the array of predicted cluster labels.
y

In [None]:
# Store the predicted cluster labels in the DataFrame as a "label" column,
# one label per row in row order.
df["label"] = pd.Series(y, index=df.index)

In [None]:
# Plot the customers coloured by their cluster label, together with the
# fitted cluster centers (green markers).
income_col = "Annual Income (k$)"
score_col = "Spending Score (1-100)"

points_trace = go.Scatter(
    name="Clusters",
    x=df[income_col],
    y=df[score_col],
    mode="markers",
    marker=dict(size=10, color=df["label"], symbol="circle"),
)

centers = kmeans.cluster_centers_
centers_trace = go.Scatter(
    name="Cluster Centers",
    x=centers[:, 0],
    y=centers[:, 1],
    mode="markers",
    marker=dict(size=10, color="rgb(0, 255, 0)", symbol="circle"),
)

fig = go.Figure(data=[points_trace, centers_trace])
fig.update_layout(xaxis_title=income_col, yaxis_title=score_col)
fig.show()

# What if we chose n_clusters = 5 instead of 3?

In [None]:
# Refit k-means with five clusters and overwrite the stored labels.
kmeans = KMeans(
    n_clusters=5,
    init="k-means++",
    max_iter=300,
    n_init=10,
    random_state=0,
)
df["label"] = kmeans.fit_predict(X)

# First trace: every customer, coloured by its cluster label.
customers = go.Scatter(
    name="Clusters",
    x=df["Annual Income (k$)"],
    y=df["Spending Score (1-100)"],
    mode="markers",
    marker=dict(size=10, color=df["label"], symbol="circle"),
)

# Second trace: the fitted cluster centers, in green.
center_x, center_y = kmeans.cluster_centers_.T
centroids = go.Scatter(
    name="Cluster Centers",
    x=center_x,
    y=center_y,
    mode="markers",
    marker=dict(size=10, color="rgb(0, 255, 0)", symbol="circle"),
)

fig = go.Figure()
fig.add_trace(customers)
fig.add_trace(centroids)
fig.update_layout(
    xaxis_title="Annual Income (k$)",
    yaxis_title="Spending Score (1-100)",
)
fig.show()

# What if we chose n_clusters = 7?

In [None]:
# Refit k-means with seven clusters and overwrite the stored labels.
kmeans = KMeans(n_clusters=7, init="k-means++", max_iter=300, n_init=10, random_state=0)
df["label"] = kmeans.fit_predict(X)

axis_titles = {
    "xaxis_title": "Annual Income (k$)",
    "yaxis_title": "Spending Score (1-100)",
}

# Two marker traces: the customers (coloured by label) and the cluster centers (green).
fig = go.Figure(
    data=[
        go.Scatter(
            name="Clusters",
            x=df["Annual Income (k$)"],
            y=df["Spending Score (1-100)"],
            mode="markers",
            marker=dict(size=10, color=df["label"], symbol="circle"),
        ),
        go.Scatter(
            name="Cluster Centers",
            x=kmeans.cluster_centers_[:, 0],
            y=kmeans.cluster_centers_[:, 1],
            mode="markers",
            marker=dict(size=10, color="rgb(0, 255, 0)", symbol="circle"),
        ),
    ]
)
fig.update_layout(**axis_titles)
fig.show()

# How to decide the appropriate number of clusters?

The best way is to do it visually or using domain knowledge, and see what makes the most sense. But when that is not possible, for example when the number of features is larger than 3, we can use various empirical methods, one of which is known as the Elbow method. It's not guaranteed to work well, but it can be used as a guide.

In [None]:
# WCSS (Within-Cluster Sum of Squares) is the sum of squared distances between
# each point and the centroid of its cluster.
# The idea is to pick the number of clusters at which the WCSS curve changes
# slope; increasing the number of clusters beyond that point is not useful.
#
# Fit one model per candidate cluster count (1 to 10) and record its inertia_.
cluster_counts = range(1, 11)

wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(
        n_clusters=n_clusters,
        init="k-means++",
        max_iter=300,
        n_init=10,
        random_state=0,
    ).fit(X)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot: WCSS against the number of clusters.
# x is passed explicitly as 1..len(wcss), the cluster counts the models were
# fitted with. With only y given, px.line plots against the list positions
# 0..len(wcss)-1, so every point sits one to the left of its true cluster count.
# In this plot of WCSS, there is no clear elbow point.
# NOTE(review): the earlier choice of "n_clusters = 4" was read off the 0-based
# axis and so probably corresponds to 5 clusters; re-read the elbow from this
# corrected plot.
cluster_counts = list(range(1, len(wcss) + 1))
fig = px.line(x=cluster_counts, y=wcss)
fig.update_layout(xaxis_title="Number of Clusters",yaxis_title="WCSS")
fig.show()

# What if we chose n_clusters = 4?

In [None]:
# Refit k-means with four clusters and overwrite the stored labels.
model_options = dict(init="k-means++", max_iter=300, n_init=10, random_state=0)
kmeans = KMeans(n_clusters=4, **model_options)
df["label"] = kmeans.fit_predict(X)

x_title = "Annual Income (k$)"
y_title = "Spending Score (1-100)"

fig = go.Figure()

# Customers, coloured by their cluster label.
fig.add_trace(
    go.Scatter(
        name="Clusters",
        x=df[x_title],
        y=df[y_title],
        mode="markers",
        marker=dict(size=10, color=df["label"], symbol="circle"),
    )
)

# Fitted cluster centers, in green.
centers = kmeans.cluster_centers_
fig.add_trace(
    go.Scatter(
        name="Cluster Centers",
        x=centers[:, 0],
        y=centers[:, 1],
        mode="markers",
        marker=dict(size=10, color="rgb(0, 255, 0)", symbol="circle"),
    )
)

fig.update_layout(xaxis_title=x_title, yaxis_title=y_title)
fig.show()