In [1]:
import pandas as pd
import numpy as np
import plotly as py
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings


from itertools import product
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN 
from sklearn.metrics import silhouette_score

# NOTE(review): this silences every warning for the rest of the session, so
# warnings raised by later cells will not be shown.
warnings.filterwarnings("ignore")

  shapely_geos_version, geos_capi_version_string


In [2]:
# Load the mall-customers CSV from the Kaggle input directory and preview it.
csv_path = "../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [3]:
# Show the dtype pandas inferred for each column.
df.dtypes

CustomerID                 int64
Gender                    object
Age                        int64
Annual Income (k$)         int64
Spending Score (1-100)     int64
dtype: object

In [4]:
# Report the dataset dimensions.
n_rows, n_cols = df.shape
print(f"Size of Dataset: {n_rows} rows , {n_cols} columns")

Size of Dataset: 200 rows , 5 columns


In [5]:
# Column-level summary: non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the dataset.
df.describe()

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2
std,57.879185,13.969007,26.264721,25.823522
min,1.0,18.0,15.0,1.0
25%,50.75,28.75,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


In [7]:
#checking for missing values
def missing(df):
    '''
    Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns:
    'Missing_Number' (count of null entries) and 'Missing_Percent'
    (null entries divided by the number of rows, i.e. a fraction rather
    than a 0-100 percentage). Each series is sorted in descending order
    before the two are combined.
    '''
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    null_fraction = null_counts / null_mask.count()

    summary_parts = {
        'Missing_Number': null_counts.sort_values(ascending=False),
        'Missing_Percent': null_fraction.sort_values(ascending=False),
    }
    return pd.concat(
        list(summary_parts.values()),
        axis=1,
        keys=list(summary_parts.keys()),
    )

# Check every column of the loaded dataset for missing values.
missing(df)

Unnamed: 0,Missing_Number,Missing_Percent
CustomerID,0,0.0
Gender,0,0.0
Age,0,0.0
Annual Income (k$),0,0.0
Spending Score (1-100),0,0.0


In [8]:
# Bubble chart of spending score vs. annual income; marker colour and size
# both encode the customer's age.
fig = px.scatter(
    df,
    x="Spending Score (1-100)",
    y="Annual Income (k$)",
    color="Age",
    size="Age",
)
fig.show()

**Our data looks quite complicated**

First, we will apply the K-means algorithm.

In [9]:
# Let's start with two features: spending score and annual income.
# .copy() gives an independent frame, so adding a cluster-label column to it
# later does not write into a selection taken from `df`.
x2 = df[['Spending Score (1-100)', "Annual Income (k$)"]].copy()

# Within-cluster sum of squares (KMeans inertia) for k = 1..10, collected for
# the elbow plot. A fixed random_state makes the k-means++ initialisation, and
# therefore the curve, reproducible from run to run.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(x2)
    wcss.append(kmeans.inertia_)

In [10]:
# Elbow method: plot the WCSS score against k and pick the k where the curve
# stops dropping sharply.

fig = go.Figure(
    # The x values are derived from the number of recorded scores so the axis
    # always lines up with `wcss`, whatever range of k was evaluated.
    data=[go.Scatter(x=np.arange(1, len(wcss) + 1), y=wcss)],
    layout=go.Layout(
        title=go.layout.Title(text="Elbow Method"),
        xaxis_title="Number of Clusters",
        yaxis_title="WCSS Score",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="RebeccaPurple",
        ),
    ),
)

fig.show()

5 is the optimal value of K.

In [11]:
# Fit the final two-feature model with k=5.
# The feature columns are selected explicitly so that re-running this cell
# after a cluster-label column has been added to x2 does not feed the labels
# back into the fit. random_state keeps the label numbering stable across runs.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
pred_y = kmeans.fit_predict(x2[['Spending Score (1-100)', 'Annual Income (k$)']])

In [12]:
# Attach the K-Means labels as a new column. assign() returns a new frame, so
# nothing is written in place into a frame that was selected from `df`.
x2 = x2.assign(clusters=kmeans.labels_)

In [13]:
# List the distinct cluster labels stored in x2.
np.unique(x2['clusters'])

array([0, 1, 2, 3, 4], dtype=int32)

In [14]:
# Scatter plot of the K-Means clusters with the fitted centroids overlaid.

fig = go.Figure(
    layout=go.Layout(
        title=go.layout.Title(text="Clusters of Customers"),
        xaxis_title="Spending Score (1-100)",
        yaxis_title="Annual Income (k$)",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="RebeccaPurple",
        ),
    )
)

# Customers, coloured by their assigned cluster label.
fig.add_trace(go.Scatter(x=x2['Spending Score (1-100)'], y=x2['Annual Income (k$)'],
                    mode='markers',
                    marker_color=kmeans.labels_))

# Centroids. The model was fitted on [spending score, annual income] in that
# order, so column 0 of cluster_centers_ belongs on the x axis and column 1 on
# the y axis, matching the customer trace above.
fig.add_trace(go.Scatter(x=kmeans.cluster_centers_[:, 0], y=kmeans.cluster_centers_[:, 1],
                    mode='markers',
                    name='centroids',
                    # A scalar size applies to every centroid, whatever k is.
                    marker_size=30))

fig.update(layout_showlegend=False)

fig.show()

K-Means algorithm generated the following 5 clusters:

1. clients with low annual income and high spending score
2. clients with medium annual income and medium spending score
3. clients with high annual income and low spending score
4. clients with high annual income and high spending score
5. clients with low annual income and low spending score

In [15]:
# Now use three features: spending score, annual income and age.
# .copy() gives an independent frame, so adding a cluster-label column to it
# later does not write into a selection taken from `df`.
x3 = df[['Spending Score (1-100)', "Annual Income (k$)", "Age"]].copy()

# Within-cluster sum of squares (KMeans inertia) for k = 1..10, collected for
# the elbow plot. A fixed random_state makes the curve reproducible.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(x3)
    wcss.append(kmeans.inertia_)

In [16]:
# Elbow plot for the three-feature model: WCSS score against k.
fig = go.Figure(
    # The x values are derived from the number of recorded scores so the axis
    # always lines up with `wcss`, whatever range of k was evaluated.
    data=[go.Scatter(x=np.arange(1, len(wcss) + 1), y=wcss)],
    layout=go.Layout(
        title=go.layout.Title(text="Elbow Method"),
        xaxis_title="Number of Clusters",
        yaxis_title="WCSS Score",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="RebeccaPurple",
        ),
    ),
)

fig.show()

6 is the optimal value of k.

In [17]:
# Fit the final three-feature model with k=6 and attach the labels.
# The feature columns are selected explicitly so that re-running this cell
# (when x3 already carries a 'cluster' column) does not feed the old labels
# back into the fit. random_state keeps the label numbering stable across runs.
kmeans = KMeans(n_clusters=6, init='k-means++', random_state=42)
pred_y = kmeans.fit_predict(x3[['Spending Score (1-100)', 'Annual Income (k$)', 'Age']])
# assign() returns a new frame instead of writing into a selection of `df`.
x3 = x3.assign(cluster=pred_y)

In [18]:
# List the distinct cluster labels predicted by the three-feature K-Means model.
np.unique(pred_y)

array([0, 1, 2, 3, 4, 5], dtype=int32)

In [19]:
def tracer(db, n, name):
    '''
    Build the Plotly 3-D scatter trace for one cluster.

    db   -- DataFrame with a 'cluster' column plus 'Age',
            'Spending Score (1-100)' and 'Annual Income (k$)'.
    n    -- cluster label whose rows are plotted.
    name -- legend entry for the trace.
    '''
    members = db[db['cluster'] == n]
    return go.Scatter3d(
        x=members['Age'],
        y=members['Spending Score (1-100)'],
        z=members['Annual Income (k$)'],
        mode='markers',
        name=name,
        marker={'size': 5},
    )

# One trace per cluster label actually present in x3, so the figure stays in
# sync with the model if the number of clusters is changed.
data = [
    tracer(x3, label, f'Cluster {label}')
    for label in sorted(x3['cluster'].unique())
]

layout = go.Layout(
    title = 'Clusters by K-Means',
    scene = dict(
            xaxis = dict(title = 'Age'),
            yaxis = dict(title = 'Spending Score'),
            zaxis = dict(title = 'Annual Income')
        )
)

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

**Now we will use a density-based clustering algorithm (DBSCAN)**

In [20]:
# Hyper-parameter grid to investigate for DBSCAN.
eps_values = np.arange(8, 12.75, 0.25)   # eps candidates: 8.0, 8.25, ..., 12.5
min_samples = np.arange(3, 10)           # min_samples candidates: 3..9

# Every (eps, min_samples) combination, with eps varying slowest.
DBSCAN_params = [
    (eps_candidate, min_pts_candidate)
    for eps_candidate in eps_values
    for min_pts_candidate in min_samples
]

In [21]:
# Two-feature frame for DBSCAN. .copy() gives an independent frame, so adding
# a cluster-label column to it later does not write into a selection of `df`.
x4 = df[['Spending Score (1-100)', "Annual Income (k$)"]].copy()

In [22]:
# Grid-search DBSCAN on the two-feature data: for every (eps, min_samples)
# pair, record the number of distinct labels produced and the silhouette
# score of the resulting labelling.

no_of_clusters, sil_score = [], []

for eps, min_pts in DBSCAN_params:
    DBS_clustering = DBSCAN(eps=eps, min_samples=min_pts).fit(x4)
    labels = DBS_clustering.labels_
    no_of_clusters.append(len(np.unique(labels)))
    sil_score.append(silhouette_score(x4, labels))

In [23]:
# Arrange the silhouette scores as a Min_samples x Eps grid and show it as a
# heatmap.
tmp = (
    pd.DataFrame.from_records(DBSCAN_params, columns=['Eps', 'Min_samples'])
    .assign(Sil_score=sil_score)
)

pivot_1 = tmp.pivot_table(values='Sil_score', index='Min_samples', columns='Eps')

fig = px.imshow(
    pivot_1,
    text_auto=".2f",
    color_continuous_scale='Purples',
    aspect="auto",
)
fig.show()

The optimal values for epsilon and min points are 9 and 3, respectively.

In [24]:
# Fit DBSCAN with the chosen parameters on the two feature columns only, so
# that re-running this cell (when x4 already carries a 'cluster' column) does
# not cluster on the previous labels.
db = DBSCAN(eps=9, min_samples=3).fit(x4[['Spending Score (1-100)', 'Annual Income (k$)']])

# assign() returns a new frame instead of writing into a selection of `df`.
x4 = x4.assign(cluster=db.labels_)

In [25]:
# List the distinct DBSCAN labels stored in x4.
np.unique(x4['cluster'])

array([-1,  0,  1,  2,  3,  4,  5,  6])

In [26]:
def tracer(db, n, name):
    '''
    Build the Plotly 2-D scatter trace for one cluster.

    db   -- DataFrame with a 'cluster' column plus
            'Spending Score (1-100)' and 'Annual Income (k$)'.
    n    -- cluster label whose rows are plotted.
    name -- legend entry for the trace.
    '''
    members = db[db['cluster'] == n]
    return go.Scatter(
        x=members['Spending Score (1-100)'],
        y=members['Annual Income (k$)'],
        mode='markers',
        name=name,
        marker={'size': 10},
    )

# Build one trace per label present in x4 so that no cluster is left off the
# chart; rows labelled -1 are plotted under the name 'Outliers'.
data = [
    tracer(x4, label, 'Outliers' if label == -1 else f'Cluster {label}')
    for label in sorted(x4['cluster'].unique())
]

layout = go.Layout(
    title = 'Clusters by Density Based',
    xaxis_title="Spending Score (1-100)",
    yaxis_title="Annual Income (k$)",
    font=dict(
    family="Courier New, monospace",
    size=18,
    color="RebeccaPurple"
    )
)

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

In [27]:
# Now use three features: spending score, annual income and age.
# .copy() gives an independent frame, so adding a cluster-label column to it
# later does not write into a selection taken from `df`.

x5 = df[['Spending Score (1-100)', "Annual Income (k$)", "Age"]].copy()

In [28]:
# Grid-search DBSCAN on the three-feature data: for every (eps, min_samples)
# pair, record the number of distinct labels produced and the silhouette
# score of the resulting labelling.
# NOTE(review): the silhouette is scored on x5, the same data the model was
# fitted on. Any parameter choice read from the resulting heatmap should be
# re-checked against the corrected scores.
no_of_clusters = []
sil_score = []

for p in DBSCAN_params:
    DBS_clustering = DBSCAN(eps=p[0], min_samples=p[1]).fit(x5)
    no_of_clusters.append(len(np.unique(DBS_clustering.labels_)))
    sil_score.append(silhouette_score(x5, DBS_clustering.labels_))

In [29]:
# Arrange the silhouette scores as a Min_samples x Eps grid and show it as a
# heatmap.
tmp = (
    pd.DataFrame.from_records(DBSCAN_params, columns=['Eps', 'Min_samples'])
    .assign(Sil_score=sil_score)
)

pivot_1 = tmp.pivot_table(values='Sil_score', index='Min_samples', columns='Eps')

fig = px.imshow(
    pivot_1,
    text_auto=".2f",
    color_continuous_scale='BuGn',
    aspect="auto",
)
fig.show()

The optimal values for epsilon and min points are 12.25 and 5, respectively.

In [30]:
# Fit DBSCAN with the chosen parameters on the three feature columns only, so
# that re-running this cell (when x5 already carries a 'cluster' column) does
# not cluster on the previous labels.
db = DBSCAN(eps=12.25, min_samples=5).fit(
    x5[['Spending Score (1-100)', 'Annual Income (k$)', 'Age']]
)

# assign() returns a new frame instead of writing into a selection of `df`.
x5 = x5.assign(cluster=db.labels_)

In [31]:
# List the distinct DBSCAN labels stored in x5.
np.unique(x5['cluster'])

array([-1,  0,  1,  2,  3])

In [32]:
def tracer(db, n, name):
    '''
    Build the Plotly 3-D scatter trace for one cluster.

    db   -- DataFrame with a 'cluster' column plus
            'Spending Score (1-100)', 'Annual Income (k$)' and 'Age'.
    n    -- cluster label whose rows are plotted.
    name -- legend entry for the trace.
    '''
    members = db[db['cluster'] == n]
    return go.Scatter3d(
        x=members['Spending Score (1-100)'],
        y=members['Annual Income (k$)'],
        z=members['Age'],
        mode='markers',
        name=name,
        marker={'size': 5},
    )

# Build one trace per label present in x5; rows labelled -1 are plotted under
# the name 'Outliers'.
data = [
    tracer(x5, label, 'Outliers' if label == -1 else f'Cluster {label}')
    for label in sorted(x5['cluster'].unique())
]

# Axis titles follow the coordinates tracer() assigns:
# x = spending score, y = annual income, z = age.
layout = go.Layout(
    title = 'Clusters by Density Based',
    scene = dict(
            xaxis = dict(title = 'Spending Score'),
            yaxis = dict(title = 'Annual Income (k$)'),
            zaxis = dict(title = 'Age')
        )
)

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

From the above results, we can conclude that density-based clustering performs well on this data.

References:

1. https://www.kaggle.com/code/datark1/customers-clustering-k-means-dbscan-and-ap

2. https://www.kaggle.com/code/niteshyadav3103/customer-segmentation-using-kmeans-hc-dbscan