# Customer Segmentation using K-means clustering

In [None]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

In [None]:
# Read the mall customer records from the CSV file into a dataframe
DATA_PATH = "/content/Mall_Customers.csv"
customer_data = pd.read_csv(DATA_PATH)

In [None]:
# Peek at the first five rows of the customer dataframe
customer_data.head(n=5)

In [None]:
# Dimensions of the dataframe as a (rows, columns) tuple
customer_data.shape

200 customers with 5 attributes 

In [None]:
# Descriptive statistics for the object-dtype (non-numeric) columns only;
# 'O' is the dtype code for Python objects
customer_data.describe(include='O')

In [None]:
# Descriptive statistics with describe()'s default column selection
customer_data.describe()

In [None]:
# Print a concise summary of the dataframe (columns, non-null counts, dtypes)
customer_data.info()

In [None]:
# Count the missing values in each column
customer_data.isnull().sum()

## Choosing the annual income and spending score for clustering

In [None]:
# Feature matrix for clustering: all rows, columns selected by position.
# NOTE(review): positions 3 and 4 are assumed to be annual income and
# spending score, as the notebook heading states -- confirm against the CSV.
feature_positions = [3, 4]
X = customer_data.iloc[:, feature_positions].values

In [None]:
# Inspect the feature matrix that will be passed to KMeans
print(X)

In [None]:
# x-axis -> annual income ; y-axis -> spending score

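# Choosing the number of clusters using WCSS
# WCSS - within-cluster sum of squares: the sum of squared distances between each data point and its cluster centroid

# Compute the WCSS value for different numbers of clusters
# WCSS always decreases as clusters are added, so the idea is to find the "elbow": the number of clusters after which WCSS stops dropping significantly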

In [None]:
# Compute the within-cluster sum of squares (WCSS) for 1 to 10 clusters.
wcss = []
for k in range(1, 11):  # range's upper bound is exclusive, so k runs 1..10
    # n_init is pinned explicitly: scikit-learn's default changed from 10 to
    # 'auto' (a single run for k-means++) in version 1.4, so relying on the
    # default gives version-dependent results and a FutureWarning on 1.2/1.3.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)  # inertia_ is the WCSS of the fitted model

# Plot an elbow graph to find the number of clusters after which the WCSS value stops dropping significantly

In [None]:
sns.set()
# Plot WCSS against the actual cluster counts 1..len(wcss). The x values must
# be passed explicitly: without them matplotlib plots wcss against its list
# indices (0, 1, 2, ...), and the elbow would be read one cluster too low.
cluster_counts = range(1, len(wcss) + 1)
plt.plot(cluster_counts, wcss)
plt.title("The elbow graph")
plt.xlabel('Number of clusters')
plt.ylabel("WCSS")
plt.show()

### We see two drops in the graph: one at 2 clusters and the other at 4
### Beyond 4 there is no significant drop
### Hence we take the number of clusters to be 4

## Optimum number of clusters = 4

In [None]:
# training the k-means clustering model

In [None]:
# Final model with the number of clusters chosen from the elbow graph.
# n_init is pinned explicitly: scikit-learn's default changed from 10 to
# 'auto' (a single run for k-means++) in version 1.4, so relying on the
# default gives version-dependent cluster assignments.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)

# fit_predict computes the cluster centers and returns the cluster index of
# each sample (with 4 clusters the labels are 0, 1, 2 and 3). It is a
# convenience method equivalent to calling fit(X) followed by predict(X).
Y = kmeans.fit_predict(X)

In [None]:
# Inspect the cluster label assigned to each data point
print(Y)

# Visualizing all the clusters - with their centroids

In [None]:
# Scatter plot of every cluster (matplotlib), with the centroids on top
plt.figure(figsize=(8,8))

# One (colour, legend label) pair per cluster label 0, 1, 2, 3
cluster_styles = [
    ('blue', 'Cluster1'),
    ('red', 'Cluster2'),
    ('yellow', 'Cluster3'),
    ('green', 'Cluster4'),
]
for cluster_id, (color, label) in enumerate(cluster_styles):
    # X[Y == cluster_id] keeps the rows assigned to this cluster;
    # column 0 is the income feature and column 1 the spending score
    members = X[Y == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=50, c=color, label=label)

# plot the centroid
plt.scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1], s = 100, c='black',label='Centeroid')
plt.title('Customer Clusters')
plt.xlabel("Annual Income")
plt.ylabel("Spending Score")
# The label= arguments above are only displayed once a legend is drawn
plt.legend()
plt.show()

In [None]:
# Cluster 1 (blue)   - low income and low spending score
# Cluster 2 (red)    - high income and high spending score (mall savvy)
# Cluster 3 (yellow) - average to high income but less mall savvy
# Cluster 4 (green)  - low to average income but mall centric