<a href="https://colab.research.google.com/github/anomishra/Techademy_Artificial_intelligence/blob/master/ML_Models/Kmeans_techademy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The aim of this problem is to segment the clients of a wholesale distributor based on their annual spending on diverse product categories (fresh, milk, grocery, frozen, etc.), together with their sales channel and region.<br>
Data Source: https://archive.ics.uci.edu/ml/machine-learning-databases/00292/Wholesale%20customers%20data.csv

In [0]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the wholesale-customers CSV from the working directory and preview it
csv_path = 'Wholesale customers data.csv'
data = pd.read_csv(csv_path)
data.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
5,2,3,9413,8259,5126,666,1795,1451
6,2,3,12126,3199,6975,480,3140,545
7,2,3,7579,4956,9426,1669,3321,2566
8,1,3,5963,3648,6192,425,1716,750
9,2,3,6006,11093,18881,1159,7425,2098


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
data.describe()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
count,440.0,440.0,440.0,440.0,440.0,440.0,440.0,440.0
mean,1.322727,2.543182,12000.297727,5796.265909,7951.277273,3071.931818,2881.493182,1524.870455
std,0.468052,0.774272,12647.328865,7380.377175,9503.162829,4854.673333,4767.854448,2820.105937
min,1.0,1.0,3.0,55.0,3.0,25.0,3.0,3.0
25%,1.0,2.0,3127.75,1533.0,2153.0,742.25,256.75,408.25
50%,1.0,3.0,8504.0,3627.0,4755.5,1526.0,816.5,965.5
75%,2.0,3.0,16933.75,7190.25,10655.75,3554.25,3922.0,1820.25
max,2.0,3.0,112151.0,73498.0,92780.0,60869.0,40827.0,47943.0


#### Normalizing over the standard deviation
Now let's normalize the dataset. But why do we need normalization in the first place? K-means groups points by distance, so a feature with a large numeric range would dominate the result. Standardization rescales every feature to zero mean and unit variance, so that features with different magnitudes and distributions contribute equally. We use __StandardScaler()__ to normalize our dataset.

In [4]:
# Standardize every feature to zero mean and unit variance so that no single
# large-magnitude column dominates the distances used by k-means.
# A later cell appends a 'Clus_km' label column to `data`; drop it if present
# so that re-running this cell never feeds old cluster labels back in as a
# feature. On a fresh kernel the column does not exist and nothing is dropped.
# NOTE(review): Channel and Region look like categorical codes (values 1-2 and
# 1-3) -- confirm that scaling them as numeric features is intended.
features = data.drop(columns=['Clus_km'], errors='ignore')
scaler = StandardScaler()
data_scaled = scaler.fit_transform(features)

# Summary statistics of the scaled data; columns are positional integers
# because the scaler output is wrapped in a frame without column names.
pd.DataFrame(data_scaled).describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,440.0,440.0,440.0,440.0,440.0,440.0,440.0,440.0
mean,-2.452584e-16,-5.737834e-16,-2.422305e-17,-1.5896380000000002e-17,-6.030530000000001e-17,1.1354550000000001e-17,-1.917658e-17,-8.276208000000001e-17
std,1.001138,1.001138,1.001138,1.001138,1.001138,1.001138,1.001138,1.001138
min,-0.6902971,-1.995342,-0.9496831,-0.7787951,-0.8373344,-0.628343,-0.6044165,-0.5402644
25%,-0.6902971,-0.7023369,-0.7023339,-0.5783063,-0.6108364,-0.4804306,-0.5511349,-0.3964005
50%,-0.6902971,0.5906683,-0.2767602,-0.294258,-0.3366684,-0.3188045,-0.4336004,-0.1985766
75%,1.448652,0.5906683,0.3905226,0.1890921,0.2849105,0.09946441,0.2184822,0.1048598
max,1.448652,0.5906683,7.927738,9.18365,8.936528,11.919,7.967672,16.47845


In [0]:
# K-means with 5 clusters and k-means++ initialization.
# The cluster count is fixed at 5, i.e. the model must split the customers
# into exactly 5 groups.
# - `n_jobs` is intentionally not passed: it was deprecated in scikit-learn
#   0.23 and removed in 1.0, where passing it raises a TypeError.
# - `n_init=10` pins the number of restarts explicitly, because the library
#   default changed between scikit-learn versions.
# - `random_state` makes the run reproducible. The cluster ids themselves are
#   arbitrary, so they need not match the numbering from an earlier unseeded run.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)

# Fit the model and get the cluster id of every row in a single step
pred = kmeans.fit_predict(data_scaled)
# training is done

In [9]:
# Cluster id assigned to each row, as stored on the fitted model
labels = kmeans.labels_
print(labels)

[3 3 3 2 3 3 3 3 2 3 3 3 3 3 3 2 3 2 3 2 3 2 2 0 3 3 2 2 3 2 2 2 2 2 2 3 2
 3 3 2 2 2 3 3 3 3 3 4 3 3 2 2 3 3 2 2 4 3 2 2 3 4 3 3 2 4 2 3 2 2 2 0 2 3
 3 2 2 3 2 2 2 3 3 2 3 4 4 0 2 2 2 2 4 2 3 2 3 2 2 2 3 3 3 2 2 2 3 3 3 3 2
 3 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2
 2 2 2 2 2 2 2 3 3 2 3 3 3 2 2 3 3 3 3 2 2 2 3 3 2 3 2 3 2 2 2 2 2 0 2 0 2
 2 2 2 3 3 2 2 2 3 2 2 1 3 1 1 3 3 1 1 1 3 1 1 1 3 1 4 1 1 3 1 3 1 3 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 4 1 1 1 1 1 1 1
 1 1 1 1 1 3 1 3 1 3 1 1 1 1 2 2 2 2 2 2 3 2 3 2 2 2 2 2 2 2 2 2 2 2 3 1 3
 1 3 3 1 3 3 3 3 3 3 3 1 1 3 1 1 3 1 1 3 1 1 1 3 1 1 1 1 1 0 1 1 1 1 1 3 1
 4 1 3 1 1 1 1 3 3 2 3 2 2 3 3 2 3 2 3 2 3 2 2 2 3 2 2 2 2 2 2 2 3 2 2 2 2
 3 2 2 3 2 2 3 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2
 3 3 2 2 2 2 2 2 3 3 2 3 2 2 3 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2]


In [10]:
# Attach each customer's cluster id to the dataset as a new 'Clus_km' column
data = data.assign(Clus_km=labels)
data.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Clus_km
0,2,3,12669,9656,7561,214,2674,1338,3
1,2,3,7057,9810,9568,1762,3293,1776,3
2,2,3,6353,8808,7684,2405,3516,7844,3
3,1,3,13265,1196,4221,6404,507,1788,2
4,2,3,22615,5410,7198,3915,1777,5185,3


In [11]:
# Average of every column within each cluster, in the original (unscaled) units
data.groupby('Clus_km').mean()

Unnamed: 0_level_0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
Clus_km,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1.166667,2.833333,44940.666667,22179.166667,17187.0,21831.166667,2758.0,17908.0
1,1.054945,1.307692,12183.945055,3254.714286,4130.923077,3458.252747,860.263736,1149.934066
2,1.0,3.0,13129.057971,3168.52657,3635.864734,3413.729469,749.038647,1135.468599
3,2.0,2.674603,8130.031746,8874.071429,14139.150794,1339.47619,6104.936508,1542.706349
4,2.0,2.5,15964.9,34708.5,48536.9,3054.6,24875.2,2942.8


In [12]:
# Cluster centers learned by the model: one row per cluster, one column per
# feature. The model was fit on data_scaled, so these coordinates are in
# standardized units rather than the original spending amounts.
kmeans.cluster_centers_

array([[-3.33805633e-01,  3.75167418e-01,  2.60749645e+00,
         2.22231860e+00,  9.72964050e-01,  3.86855866e+00,
        -2.59306898e-02,  5.81601504e+00],
       [-5.72772431e-01, -1.59749436e+00,  1.45371704e-02,
        -3.44758082e-01, -4.02466315e-01,  7.96677044e-02,
        -4.24411072e-01, -1.33102511e-01],
       [-6.90297086e-01,  5.90668285e-01,  8.93504967e-02,
        -3.56449331e-01, -4.54619674e-01,  7.04860444e-02,
        -4.47765749e-01, -1.38237755e-01],
       [ 1.44865163e+00,  1.69928497e-01, -3.06362835e-01,
         4.17500207e-01,  6.51879518e-01, -3.57269701e-01,
         6.76847966e-01,  6.33174701e-03],
       [ 1.44865163e+00, -5.58343155e-02,  3.13830315e-01,
         3.92190593e+00,  4.27561037e+00, -3.57419457e-03,
         4.61816580e+00,  5.03365339e-01]])

In [13]:
# Count how many data points fall into each cluster
frame = pd.DataFrame(data_scaled).assign(cluster=pred)
frame.cluster.value_counts()

2    207
3    126
1     91
4     10
0      6
Name: cluster, dtype: int64