In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set(style="darkgrid")

from sklearn.cluster import KMeans 
from sklearn.preprocessing import StandardScaler

from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_samples, silhouette_score

%matplotlib inline
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

In [None]:
# EDA: load the customer table and take a first look at it.
# The frame is bound to `data`; every inspection below must use that name
# (the previous `df` was never defined and raised NameError on a fresh kernel).

data = pd.read_csv("Mall_Customers.csv")
print(data.head())
print(data.shape)
# describe must be *called*; printing the bare attribute only shows the bound method.
print(data.describe())
print(data.dtypes)


In [None]:
# Data visualization: one point per row of `data`,
# annual income on the x-axis and spending score on the y-axis.

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(
    'Annual Income (k$)',
    'Spending Score (1-100)',
    data=data,
    s=30,
    color="red",
    alpha=0.8,
)
ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score')
fig.savefig('scatter_plot.png')

In [None]:

# Histogram (with KDE overlay) of each numeric feature, side by side in one figure.
plt.figure(figsize=(15, 6))
# Subplot spacing does not depend on the loop variable, so set it once.
plt.subplots_adjust(hspace=0.5, wspace=0.5)
for n, feature in enumerate(['Age', 'Annual Income (k$)', 'Spending Score (1-100)'], start=1):
    plt.subplot(1, 3, n)
    # Read from `data` (the loaded frame); `df` is not defined anywhere in the notebook.
    sns.histplot(data[feature], bins=20, kde=True)
    plt.title('histplot of {}'.format(feature))
    # NOTE(review): savefig writes the *whole* figure as it stands at this point,
    # so each file holds the panels drawn so far, not just this feature's panel.
    plt.savefig('histplot_{}.png'.format(feature))
plt.show()

In [None]:
# Feature matrix for clustering: the columns at positions 3 and 4 of `data`
# (presumably annual income and spending score; verify against the CSV header).
x = data.iloc[:, 3:5]

# Plain NumPy copy of the selected columns, used as input to the scaler.
x_array = x.to_numpy(copy=True)
print(x_array)

In [None]:
# Standardise the feature columns with StandardScaler before clustering:
# learn the scaling statistics from x_array, then apply them to the same array.
scaler = StandardScaler()
x_scaled = scaler.fit(x_array).transform(x_array)
x_scaled

In [None]:
# Elbow method: fit K-Means for every k in range(1, 11) and record its inertia.

SSD = []
K = range(1, 11)

for k in K:
    # Fixed seed so the elbow curve is identical on every Restart & Run All.
    km = KMeans(n_clusters=k, random_state=123).fit(x_scaled)
    SSD.append(km.inertia_)

# Inertia: the SUM of squared distances between each sample and its closest centroid.
# It always shrinks as k grows, so a lower value alone does not mean a better model;
# look for the "elbow" where adding clusters stops giving a large reduction.

In [None]:
# Plot the inertia recorded for each k so the elbow can be read off visually.
plt.figure(figsize=(8,5))
plt.plot(K, SSD, 'bx-')
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal K')
# Save BEFORE show: once show() has rendered the figure (and the inline backend
# has closed it), savefig would write a new, empty canvas instead of the plot.
plt.savefig('elbow_plot.png')
plt.show()


Silhouette Coefficient Method:
The silhouette coefficient of a data point measures how well that point fits its own cluster compared with how far it is from the other clusters.

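A silhouette coefficient close to 1 means the data point is in an appropriate cluster.
A silhouette coefficient close to −1 implies the data point is in the wrong cluster.
Silhouette Coefficient = (x − y) / max(x, y), where y is the mean distance from the point to the other points in its own cluster and x is the mean distance from the point to the points of the nearest neighbouring cluster.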

In [None]:
# Final model: 5 clusters on the standardised features.
# The seed pins the cluster numbering; the plotting cell further down assigns a
# fixed name to each cluster id, which is only meaningful if the ids are stable.
KMean = KMeans(n_clusters=5, random_state=123)
# fit_predict fits the model and returns the cluster id of every training row.
label = KMean.fit_predict(x_scaled)

print("Silhouette Score(n=5):", silhouette_score(x_scaled, label))

In [None]:
# Silhouette-based choice of k using Yellowbrick's KElbowVisualizer.
model = KMeans(random_state=123)

# Create the sized figure first so the visualizer draws onto it.
plt.figure(figsize=(8,5))

# NOTE(review): with a 2-tuple, Yellowbrick documents k as np.arange(k[0], k[1]),
# i.e. 2..5 here; confirm that k=6 is meant to be excluded.
Visualizer = KElbowVisualizer(model, k=(2,6), metric='silhouette', timings=False)

Visualizer.fit(x_scaled)
# show() is the current name of the finalize-and-render call; poof() is deprecated.
Visualizer.show()

In [None]:
# Centroids of the fitted model; KMean was fitted on x_scaled, so these
# coordinates are in the standardised feature space, not the original units.
centroids = KMean.cluster_centers_
print(centroids)

In [None]:
# Cluster id assigned to each row of x_scaled by the fitted model.
assigned_ids = KMean.labels_
print(assigned_ids)

In [None]:
# Store each customer's cluster id next to the original columns, then preview.
cluster_ids = KMean.labels_
data["cluster"] = cluster_ids
data.head(n=5)

In [None]:
# Scatter of the standardised features, one colour and legend entry per cluster.
plt.figure(figsize=(8,5))

# (cluster id, marker colour, legend name).
# NOTE(review): the names are tied to cluster ids, which depend on the KMeans
# initialisation; confirm the mapping against the fitted centroids.
segments = [
    (0, 'red', 'Careless'),
    (1, 'blue', 'Target'),
    (2, 'green', 'Planner'),
    (3, 'cyan', 'Sensible'),
    (4, 'magenta', 'Moderate'),
]
for cluster_id, colour, name in segments:
    members = label == cluster_id  # boolean mask of rows in this cluster
    plt.scatter(x_scaled[members, 0], x_scaled[members, 1], s=100, c=colour, label=name)

plt.title('Cluster of Clients')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.legend()
# Write the file first, then actually call show(); the bare `plt.show`
# without parentheses was a no-op expression.
plt.savefig('clustered_clients.png')
plt.show()
