## Step 1 : Load the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Step 2 : Load the data

In [2]:
data = pd.read_csv('Mall_Customers.csv')
data.head()

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


## Note : Just Check all the columns in the data are independent of each other indicating no target. Hence, it is a perfect case to apply Clustering algorithms on this data.

In [3]:
data.shape

(200, 5)

## Step 3 : Data Cleaning, Data Wrangling and Data Preprocessing

In [4]:
data = data.drop('CustomerID', axis = 1)
data.head()

Unnamed: 0,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


In [6]:
data = data.rename({'Genre' : 'Gender'}, axis = 1)
data.head()

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


In [7]:
## Q. Is it necessary to consider all the columns present in the data or Can i select few columns to create 
## groups?

## Step 4 : Selecting any two columns from the data to perform K Means clustering

- Atleast 2 columns must be selected. More than 2 is also fine

In [9]:
X = data[['Annual Income (k$)', 'Spending Score (1-100)']]
X.head()

Unnamed: 0,Annual Income (k$),Spending Score (1-100)
0,15,39
1,15,81
2,16,6
3,16,77
4,17,40


## Step 5 : Creating the numpy array of the dataframe from step 4

In [10]:
X = X.iloc[:, :].values
X

array([[ 15,  39],
       [ 15,  81],
       [ 16,   6],
       [ 16,  77],
       [ 17,  40],
       [ 17,  76],
       [ 18,   6],
       [ 18,  94],
       [ 19,   3],
       [ 19,  72],
       [ 19,  14],
       [ 19,  99],
       [ 20,  15],
       [ 20,  77],
       [ 20,  13],
       [ 20,  79],
       [ 21,  35],
       [ 21,  66],
       [ 23,  29],
       [ 23,  98],
       [ 24,  35],
       [ 24,  73],
       [ 25,   5],
       [ 25,  73],
       [ 28,  14],
       [ 28,  82],
       [ 28,  32],
       [ 28,  61],
       [ 29,  31],
       [ 29,  87],
       [ 30,   4],
       [ 30,  73],
       [ 33,   4],
       [ 33,  92],
       [ 33,  14],
       [ 33,  81],
       [ 34,  17],
       [ 34,  73],
       [ 37,  26],
       [ 37,  75],
       [ 38,  35],
       [ 38,  92],
       [ 39,  36],
       [ 39,  61],
       [ 39,  28],
       [ 39,  65],
       [ 40,  55],
       [ 40,  47],
       [ 40,  42],
       [ 40,  42],
       [ 42,  52],
       [ 42,  60],
       [ 43,

## Step 6 : Apply K Means clustering algorithm on the data

In [12]:
from sklearn.cluster import KMeans
kmeans = KMeans()
kmeans

In [13]:
kmeans.fit(X)

## Step 7 :Checking the centre values of the clusters

In [14]:
kmeans.cluster_centers_

array([[ 63.72093023,  46.1627907 ],
       [108.18181818,  82.72727273],
       [ 25.0952381 ,  80.04761905],
       [109.7       ,  22.        ],
       [ 25.14285714,  19.52380952],
       [ 80.18181818,  12.68181818],
       [ 47.29545455,  51.40909091],
       [ 78.03571429,  81.89285714]])

## Applying Kmeans clustering with a different k value

In [19]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 15)
kmeans

In [20]:
kmeans.fit(X)

In [21]:
kmeans.cluster_centers_

array([[ 62.16666667,  49.5       ],
       [ 75.1       ,   6.6       ],
       [ 78.28571429,  90.5       ],
       [ 25.375     ,  74.        ],
       [ 28.18181818,  33.27272727],
       [ 24.58333333,   9.58333333],
       [102.        ,  22.57142857],
       [100.875     ,  84.25      ],
       [ 47.68292683,  50.73170732],
       [127.66666667,  20.66666667],
       [ 77.78571429,  73.28571429],
       [ 73.14285714,  35.57142857],
       [127.66666667,  78.66666667],
       [ 26.66666667,  93.66666667],
       [ 84.41666667,  17.75      ]])