## PCA Implementation on Shop dataset

#### Load necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Load the dataset

In [2]:
# Read the shop customer table; 'shop.csv' is a relative path, so it is
# resolved against the notebook's current working directory.
data = pd.read_csv('shop.csv')

# Preview the first rows to check the columns that were loaded
data.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100),cluster
0,1,Male,19,15,39,4
1,2,Male,21,15,81,3
2,3,Female,20,16,6,4
3,4,Female,23,16,77,3
4,5,Female,31,17,40,4


### Apply Hierarchical Clustering on the data

#### Load the sklearn library

In [3]:
from sklearn.cluster import AgglomerativeClustering

#### Get the features and targets

In [4]:
# Feature table: every column except the customer identifier, the text-valued
# Gender column, and the stored cluster label.
x = data.drop(columns=['CustomerID', 'Gender', 'cluster'])

# Reference labels that the clustering results are scored against.
# NOTE(review): how the `cluster` column was produced is not shown in this
# notebook -- confirm it is a suitable ground truth.
y = data['cluster']

x.head()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
0,19,15,39
1,21,15,81
2,20,16,6
3,23,16,77
4,31,17,40


#### Create an instance of the model

In [5]:
# Agglomerative (bottom-up hierarchical) clustering asked to produce 6 clusters;
# every other option is left at scikit-learn's default.
clustering = AgglomerativeClustering(n_clusters = 6)

#### Fit the model to the data

In [6]:
# Fit on the feature table `x`; the cluster assigned to each row is then
# available on the estimator as `labels_`.
clustering.fit(x)

#### Get the predicted labels

In [7]:
# Cluster index assigned to each row of `x` by the fitted model
y_pred = clustering.labels_

# Peek at the first five assignments
y_pred[:5]

array([4, 3, 4, 3, 4], dtype=int64)

#### Calculate performance metrics

In [8]:
# adjusted_rand_score is exposed by the public sklearn.metrics namespace
from sklearn.metrics import adjusted_rand_score

# Adjusted Rand index between the stored `cluster` labels and the labels
# produced by hierarchical clustering on the original features.
rand_score = adjusted_rand_score(labels_true=y, labels_pred=y_pred)

print('Rand Score =', rand_score)

Rand Score = 0.8595631882218997


## Apply PCA on the data

#### Load the sklearn library

In [9]:
from sklearn.decomposition import PCA

#### Create an instance of the model

In [10]:
# Keep two principal components (the feature table `x` has three columns)
pca = PCA(n_components = 2)

#### Fit PCA and project the data onto two components

In [11]:
# Learn the principal components from `x` and project `x` onto them in one step.
# NOTE(review): `x` is passed in unscaled; PCA is sensitive to feature scale, so
# consider standardizing the columns first -- confirm whether this is intended.
x_new = pca.fit_transform(x)

#### Fit the clustering model on the PCA-reduced data

In [12]:
# Same configuration as the first model (6 clusters) so the two runs are comparable
clustering_new = AgglomerativeClustering(n_clusters = 6)

# Fit on the 2-component PCA projection instead of the original feature table
clustering_new.fit(x_new)

#### Get the predicted labels

In [13]:
# Cluster index assigned to each row of the PCA-reduced data.
# The integer ids are arbitrary, so they need not match the numbering used by
# the first model; the adjusted Rand index ignores label permutations.
y_pred_new = clustering_new.labels_

# Peek at the first five assignments
y_pred_new[:5]

array([1, 3, 1, 3, 1], dtype=int64)

#### Calculate the performance metrics

In [14]:
# Adjusted Rand index for the clustering fitted on the PCA-reduced data.
# The value is stored under its own name so the baseline `rand_score` computed
# earlier in the notebook is not overwritten and both results stay available
# for comparison.
rand_score_pca = adjusted_rand_score(y, y_pred_new)

print('Rand Score =', rand_score_pca)

Rand Score = 0.6410628917257121


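### The adjusted Rand score drops (from about 0.86 to about 0.64) after applying PCA in this case, most likely because the data has only three features to begin with, so reducing it to two components discards useful information.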

In [16]:
# Inspection only: displays the repr of the fitted estimator and has no effect
# on any result above.
# NOTE(review): the execution count jumps from 14 to 16, so a cell was run and
# removed in between -- re-run the notebook on a fresh kernel, and drop this
# cell if it is a leftover.
clustering_new