# Principal Component Analysis

### Importing Libraries

In [39]:
import numpy as np
import pandas as pd
from numpy.linalg import eig

### Define a Matrix

In [40]:
A = np.array([[1,2,3,4],
           [5,5,4,7],
           [1,5,2,3],
           [5,2,1,1],
           [8,3,2,2]])

print(A)

[[1 2 3 4]
 [5 5 4 7]
 [1 5 2 3]
 [5 2 1 1]
 [8 3 2 2]]


In [41]:
data = pd.DataFrame(A,columns  = ['f1','f2','f3','f4'])
data

Unnamed: 0,f1,f2,f3,f4
0,1,2,3,4
1,5,5,4,7
2,1,5,2,3
3,5,2,1,1
4,8,3,2,2


### Standardized the dataset

In [42]:
std_data  = (data - data.mean()) / (data.std())
std_data

Unnamed: 0,f1,f2,f3,f4
0,-1.0,-0.923133,0.526235,0.260623
1,0.333333,1.055009,1.403293,1.56374
2,-1.0,1.055009,-0.350823,-0.173749
3,0.333333,-0.923133,-1.227881,-1.042493
4,1.333333,-0.263752,-0.350823,-0.608121


### Find the covariance matrix for the given dataset

There are **2 methods** for this:
1. Sample Formula
2. Population Formula

We can use any of the formula,

### 1. Covariance sample formula (divide by N-1 )

In [43]:
V = np.cov(std_data.T , bias = 0)
print(V)

[[ 1.         -0.10989675 -0.14617634 -0.18098843]
 [-0.10989675  1.          0.46265195  0.58715398]
 [-0.14617634  0.46265195  1.          0.97147263]
 [-0.18098843  0.58715398  0.97147263  1.        ]]


### 2. Covariance population formula (divide by N)

In [44]:
V1 = np.cov(std_data.T , bias = 1)
print(V1)

[[ 0.8        -0.0879174  -0.11694107 -0.14479075]
 [-0.0879174   0.8         0.37012156  0.46972318]
 [-0.11694107  0.37012156  0.8         0.7771781 ]
 [-0.14479075  0.46972318  0.7771781   0.8       ]]


### Calculate Eigenvalue and Eigen Vector

In [45]:
values, vectors = eig(V)
print(vectors)


[[ 0.17757819 -0.9833757   0.02266973  0.0304023 ]
 [-0.46755449 -0.11411183 -0.12370483 -0.86779514]
 [-0.59870266 -0.11010221 -0.66549306  0.43191621]
 [-0.62563081 -0.08847725  0.73573241  0.24383536]]


In [46]:
print(values)

[2.41982989 0.95459699 0.01697271 0.60860041]


### Project Data

In [47]:
P = vectors.T.dot(std_data.T)
print(P.T)

[[-0.22407555  1.00771721 -0.06693033  1.06152627]
 [-2.25256047 -0.74104194  0.09365951  0.08200267]
 [-0.35211082  0.91698596 -0.04754169 -1.13982613]
 [ 1.87815929  0.00497784  0.17190281  0.02668559]
 [ 0.95058755 -1.18863906 -0.1510903  -0.0303884 ]]


### Result Using sklearn library

In [52]:
from sklearn.decomposition import PCA

pca = PCA(2)
pca.fit_transform(A)

print(pca.components_)

[[ 0.85757205 -0.19504244 -0.19734083 -0.43311109]
 [ 0.51420394  0.34072566  0.34293369  0.70844676]]


In [53]:
print(pca.explained_variance_)

[9.86007991 6.61303244]


In [54]:
# transform data
B = pca.transform(A)
print(B)

[[-2.6779279  -1.38879949]
 [-1.3294411   4.15846725]
 [-2.6326033  -1.41800294]
 [ 2.44637524 -2.14319138]
 [ 4.19359705  0.79152656]]


### I hope you liked my notebook