# CUSTOMER SEGMENTATION OF MALL CUSTOMERS
## Using unsupervised learning (KMeans)
### Importing the Libraries and the Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Load the mall-customer records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Shorten the verbose column headers so later cells can refer to them easily.
df = df.rename(columns={
    'Annual Income (k$)': 'Annual Income',
    'Spending Score (1-100)': 'Spend Score',
})
# Decode numeric gender codes (0 / 1) into readable labels. The result is
# assigned back to the column: calling replace(..., inplace=True) on the
# temporary returned by df['Gender'] triggers a chained-assignment warning in
# pandas 2.x and does not modify df at all under Copy-on-Write.
df['Gender'] = df['Gender'].replace({0: 'Female', 1: 'Male'})

In [None]:
# Preview the first rows again to check the renamed columns.
df.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Descriptive statistics for the columns of the frame.
df.describe()

In [None]:
# Show the rows whose spend score equals 99.
df.loc[df['Spend Score'].eq(99)]

## Data Cleaning

In [None]:
# Visualize missing entries: the heatmap is drawn from the boolean null mask.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap='Reds', cbar=False, yticklabels=False)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows flagged as duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove the 'CustomerID' column from the frame in place.
df.drop(columns=['CustomerID'], inplace=True)

In [None]:
# Preview the first rows after dropping 'CustomerID'.
df.head()

## Data Visualization
### Univariate Analysis

In [None]:
# Distribution plots (step histogram + KDE) for the columns at positions 1-3.
# The grid has exactly one row per plotted column, so no subplot slot is left
# empty, and the deprecated sns.distplot is replaced by histplot + kdeplot.
plt.figure(figsize=(5, 15))
columns_to_plot = df.columns[1:4]
for position, column in enumerate(columns_to_plot, start=1):
    ax = plt.subplot(len(columns_to_plot), 1, position)
    # Density-normalized histogram so it shares the y scale with the KDE curve.
    sns.histplot(x=df[column], stat='density', element='step', fill=False,
                 color='g', linewidth=3, ax=ax)
    sns.kdeplot(x=df[column], color='b', linewidth=3, label='KDE', ax=ax)
    ax.set_title(column)
    ax.legend()

plt.tight_layout()

In [None]:
# Horizontal bar chart with the number of rows per 'Gender' value.
plt.figure(figsize=(6, 3))
sns.countplot(y='Gender', data=df)

### Bivariate Analysis

In [None]:
# Box plots of income and spend score, split by gender, on a 2x2 grid.
plt.figure(figsize=(20, 10))
for slot, feature in enumerate(['Annual Income', 'Spend Score'], start=1):
    plt.subplot(2, 2, slot)
    sns.boxplot(x=feature, y='Gender', data=df)
plt.show()


In [None]:
# Violin plots of income and spend score, split by gender, on a 2x2 grid.
plt.figure(figsize=(10, 10))
violin_features = ('Annual Income', 'Spend Score')
for slot, feature in zip(range(1, len(violin_features) + 1), violin_features):
    plt.subplot(2, 2, slot)
    sns.violinplot(x=feature, y='Gender', data=df)
plt.show()

### Multivariate Analysis

In [None]:
# Pairwise relationships between the columns of the frame.
sns.pairplot(data=df)
plt.show()

In [None]:
# Correlation heatmap.
# Only numeric columns are correlated: the frame still holds the string-valued
# 'Gender' column, which makes DataFrame.corr() raise on pandas >= 2.0 (older
# versions silently skipped it, so the resulting matrix is the same).
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 5))
sns.heatmap(corr, annot=True)

## Data Modelling
### Applying Elbow Method

In [None]:
# Build the feature matrix from every column except 'Gender', whose string
# values cannot be converted to float by the model.
# NOTE(review): this keeps all remaining columns, not only 'Annual Income' and
# 'Spend Score' -- confirm that clustering on the extra column(s) is intended,
# since the scatter plots further down only show those two dimensions.
X = df.drop(columns='Gender')

In [None]:
# Apply the elbow method to find an appropriate number of clusters: fit KMeans
# for k = 1..10 and record the within-cluster sum of squares (inertia).
cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plot against the real cluster counts; plotting wcss alone would use a 0-based
# x axis and shift every reading one cluster too low.
plt.plot(cluster_counts, wcss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method')
#init = KMeans++ in order to prevent falling into random initialization trap.
#from the below graph we can say that the two possible elbow points are 3 and 5.
# NOTE(review): the x axis now shows the actual number of clusters -- re-check
# the elbow readings above against the corrected plot.

### Applying KMeans to the dataset

In [None]:
# Fit KMeans with 3 clusters and keep the cluster label assigned to each row.
kmeans3 = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=0)
labels = kmeans3.fit_predict(X)

In [None]:
# Scatter plot of income vs. spend score, colored by the 3-cluster labels.
plt.figure(figsize=(15, 10))
sns.set_style('whitegrid')
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments with a TypeError.
sns.scatterplot(
    x=X['Annual Income'],
    y=X['Spend Score'],
    hue=labels,
    palette=sns.color_palette('hls', 3),
)
plt.title('KMeans with 3 Clusters')
plt.show()

In [None]:
# Fit KMeans with 5 clusters and keep the cluster label assigned to each row.
kmeans5 = KMeans(n_clusters=5, init='k-means++', n_init=10, max_iter=300, random_state=0)
labels = kmeans5.fit_predict(X)

In [None]:
# Scatter plot of income vs. spend score, colored by the 5-cluster labels.
plt.figure(figsize=(15, 10))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments with a TypeError.
sns.scatterplot(
    x=X['Annual Income'],
    y=X['Spend Score'],
    hue=labels,
    palette=sns.color_palette('hsv', 5),
)
plt.title('KMeans with 5 Clusters')
plt.show()

#### Hence, we can analyze the 5 clusters as follows:
##### a. Label  : Low Income and Low Spending
##### b. Label  : High Income and High Spending
##### c. Label  : Medium Income and Medium Spending
##### d. Label : High Income and Low Spending
##### e. Label : Low Income and High Spending

In [None]:
# Side-by-side swarm plots of each feature grouped by the current cluster labels.
fig = plt.figure(figsize=(20, 8))
panels = (
    (121, 'Annual Income', 'Labels According to Annual Income'),
    (122, 'Spend Score', 'Labels According to Scoring History'),
)
for position, column, title in panels:
    ax = fig.add_subplot(position)
    sns.swarmplot(x=labels, y=column, data=X, ax=ax)
    ax.set_title(title)

plt.show()