## Principal Component Analysis

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the Iris measurements; the path is relative, so Iris.csv must sit in
# the notebook's working directory.
df = pd.read_csv('Iris.csv')

In [11]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Separate the target from the features: `label` holds the species name and
# `data` keeps every remaining column except the row identifier `Id`.
label = df["Species"]
data = df.drop(columns=["Species", "Id"])

In [4]:
# Report the feature-matrix shape and the label-vector shape, one per line.
print(data.shape, label.shape, sep="\n")

(150, 4)
(150,)


In [5]:
# Data preprocessing: standardize the features before PCA so that every
# column is on a comparable scale.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
standardized_data = scaler.fit_transform(data)

# Standardizing rescales values only; the shape should match `data`.
print(standardized_data.shape)


(150, 4)


## PCA using SCIKIT-LEARN

In [6]:
# Create the PCA estimator. Leaving n_components unset (None) keeps every
# component, which lets us inspect the full variance breakdown first.
from sklearn.decomposition import PCA

pca = PCA(n_components=None)

In [7]:
# Fit on the standardized features; this learns the principal axes and the
# variance captured along each of them.
pca.fit(standardized_data)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [9]:
# Principal axes: one row per component, one column per input feature.
pca.components_

array([[ 0.52237162, -0.26335492,  0.58125401,  0.56561105],
       [ 0.37231836,  0.92555649,  0.02109478,  0.06541577],
       [-0.72101681,  0.24203288,  0.14089226,  0.6338014 ],
       [-0.26199559,  0.12413481,  0.80115427, -0.52354627]])

In [10]:
# Share of the total variance captured by each component, in decreasing order.
pca.explained_variance_ratio_

array([0.72770452, 0.23030523, 0.03683832, 0.00515193])

In [17]:
# Running total: entry k is the variance explained by the first k + 1 components.
np.cumsum(pca.explained_variance_ratio_)

array([0.72770452, 0.95800975, 0.99484807, 1.        ])

In [25]:
# Zero-based component positions (0 is the first principal component).
np.arange(len(pca.explained_variance_ratio_))

array([0, 1, 2, 3])

In [24]:
import plotly.express as px

# Cumulative explained variance against the number of components retained.
# The x-axis is 1-based so that x = k reads as "the first k components";
# a 0-based axis would place the two-component total at x = 1.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

px.line(
    x=component_counts,
    y=cumulative_variance,
    labels={
        "x": "Number of principal components",
        "y": "Cumulative explained variance ratio",
    },
    title="Cumulative explained variance by number of components",
)

<b>OBSERVATION:</b> The first two principal components together explain 95.8% of the variance, so let's go ahead and keep only these two components.

In [31]:
# Configure the parameters: keep only the first two principal components,
# then fit and project the standardized data in one step.
pca_new = PCA(n_components = 2)
pca_new_data = pca_new.fit_transform(standardized_data)

# Let's look at the shape of the data after PCA
print("shape = ", pca_new_data.shape)


shape =  (150, 2)


In [52]:
# Wrap the projected coordinates in a DataFrame and attach the species as a
# `Label` column. `label.values` is used so the labels are matched by
# position rather than by index.
pca_df = pd.DataFrame(
    pca_new_data,
    columns=["1st_principal", "2nd_principal"],
).assign(Label=label.values)

pca_df.head()

Unnamed: 0,1st_principal,2nd_principal
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.36795,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767


In [55]:
# Scatter of the samples in the plane of the first two principal components,
# coloured by species. The axis labels carry each component's share of the
# variance so the figure can be read without the surrounding cells.
pc1_share, pc2_share = pca_new.explained_variance_ratio_

px.scatter(
    pca_df,
    x="1st_principal",
    y="2nd_principal",
    color="Label",
    labels={
        "1st_principal": "1st principal component ({:.1%} of variance)".format(pc1_share),
        "2nd_principal": "2nd principal component ({:.1%} of variance)".format(pc2_share),
        "Label": "Species",
    },
    title="Iris samples projected onto the first two principal components",
)