In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans

warnings.filterwarnings("ignore")

In [2]:
# Load the mall-customer records; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer="Mall_Customers.csv")
# Preview the first five rows (head's default count, written out explicitly).
data.head(n=5)

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [3]:
# (row count, column count) of the freshly loaded frame.
data.shape

(200, 5)

# Observations:
1. There are 200 rows and 5 columns in the data
2. Each row describes one customer
3. No column in the dataset is a dependent (target) variable; all of them are independent features. This means that no target is present in this data.
4. When the data has no target column, the problem is a case of Unsupervised ML.
5. Some Unsupervised ML algorithms are: KMeans Clustering, Hierarchical Clustering, DBSCAN Clustering, etc.

# KMeans Clustering :
- The idea of this algorithm is to partition the data into groups (clusters) such that the points within each group have similar characteristics.

In [4]:
# Data preprocessing

In [5]:
# Count missing values per column; isna() performs the same check as isnull().
data.isna().sum()

CustomerID                0
Genre                     0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

In [6]:
# CustomerID looks like a plain row identifier (1, 2, 3, ...), so it is removed
# before clustering.
# errors="ignore" keeps this cell re-runnable: `data` is reassigned to itself,
# so on a second run the column is already gone and a plain drop would raise
# KeyError.
data = data.drop(columns="CustomerID", errors="ignore")
data.head()

Unnamed: 0,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


In [7]:
# Old-label -> new-label mapping used to rename the "Genre" column.
dic = dict(Genre="Gender")

In [8]:
# Apply the mapping to the column labels; columns= is the keyword form of
# passing the mapper with axis=1.
data = data.rename(columns=dic)
data.head()

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


#### When applying the KMeans clustering algorithm, it is not necessary to use every column of the data

In [9]:
# Keep only the two numeric columns that will be clustered (all rows retained).
data = data.loc[:, ["Annual Income (k$)", "Spending Score (1-100)"]]
data

Unnamed: 0,Annual Income (k$),Spending Score (1-100)
0,15,39
1,15,81
2,16,6
3,16,77
4,17,40
...,...,...
195,120,79
196,126,28
197,126,74
198,137,18


# Create the numpy array of the dataframe from the above step

In [10]:
# Convert the whole two-column frame into a 2-D NumPy array for the estimator.
X = data.to_numpy()
X

array([[ 15,  39],
       [ 15,  81],
       [ 16,   6],
       [ 16,  77],
       [ 17,  40],
       [ 17,  76],
       [ 18,   6],
       [ 18,  94],
       [ 19,   3],
       [ 19,  72],
       [ 19,  14],
       [ 19,  99],
       [ 20,  15],
       [ 20,  77],
       [ 20,  13],
       [ 20,  79],
       [ 21,  35],
       [ 21,  66],
       [ 23,  29],
       [ 23,  98],
       [ 24,  35],
       [ 24,  73],
       [ 25,   5],
       [ 25,  73],
       [ 28,  14],
       [ 28,  82],
       [ 28,  32],
       [ 28,  61],
       [ 29,  31],
       [ 29,  87],
       [ 30,   4],
       [ 30,  73],
       [ 33,   4],
       [ 33,  92],
       [ 33,  14],
       [ 33,  81],
       [ 34,  17],
       [ 34,  73],
       [ 37,  26],
       [ 37,  75],
       [ 38,  35],
       [ 38,  92],
       [ 39,  36],
       [ 39,  61],
       [ 39,  28],
       [ 39,  65],
       [ 40,  55],
       [ 40,  47],
       [ 40,  42],
       [ 40,  42],
       [ 42,  52],
       [ 42,  60],
       [ 43,

## Apply KMeans on the data

In [11]:
# Baseline model with the library's default cluster count (8) spelled out.
# random_state pins the centroid initialisation so the fitted centers are
# reproducible on Restart & Run All; n_init is fixed because its default
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=42)
kmeans

In [12]:
# Fit the baseline model on the (annual income, spending score) array.
kmeans.fit(X)

# Check the values of the cluster center

In [13]:
# Learned centroids: one row per cluster, columns in X's order
# (annual income, spending score).
kmeans.cluster_centers_

array([[ 78.03571429,  81.89285714],
       [ 25.14285714,  19.52380952],
       [109.7       ,  22.        ],
       [ 43.96969697,  51.12121212],
       [108.18181818,  82.72727273],
       [ 24.95      ,  81.        ],
       [ 61.30188679,  48.24528302],
       [ 79.70833333,  14.29166667]])

## Applying KMeans with a different value of k

In [14]:
# Same estimator with k = 3 for comparison. KMeans is already imported, so the
# repeated import is dropped; random_state makes the fitted centers
# reproducible across runs, and n_init is fixed because its default differs
# between scikit-learn releases.
kmeans_3 = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_3

In [15]:
# Fit the 3-cluster model on the same (annual income, spending score) array.
kmeans_3.fit(X)

In [16]:
# Learned centroids of the 3-cluster model: one row per cluster, columns in
# X's order (annual income, spending score).
kmeans_3.cluster_centers_

array([[87.        , 18.63157895],
       [86.53846154, 82.12820513],
       [44.15447154, 49.82926829]])

In [19]:
# 35 & 59
inp = np.array([35, 59])