# Principal Component Analysis

In [1]:
from sklearn import datasets

In [2]:
iris=datasets.load_iris()

In [3]:
dir(iris)

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']

In [4]:
A0=iris.data

In [5]:
print("Dimensions:")
print(A0.shape)

Dimensions:
(150, 4)


In [8]:
print("-----")
print("First 5 samples:")
print(A0[:5,:])

-----
First 5 samples:
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


In [9]:
print("-----")
print("Feature names:")
print(iris.feature_names)

-----
Feature names:
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [13]:
# Eigen - decomposition : 5 - step process

# 1. Normalize columns of A so that each feature has zero mean
import numpy as np
mu=np.mean(A0,axis=0)
A=A0-mu
print("Does A have zero mean across rows?")
print(np.mean(A,axis=0))

Does A have zero mean across rows?
[-1.12502600e-15 -7.60872846e-16 -2.55203266e-15 -4.48530102e-16]


$\Sigma={A^TA}/{(m-1)}$

In [29]:
# 2. Compute sample covariance matrix 

m,n=A.shape
sigma=(A.T@A)/(m-1)
print('-----')
print('Sigma:')
print(sigma)

-----
Sigma:
[[ 4.  6.]
 [ 6. 10.]]


In [32]:
# 3. Perform eigen-decomposition of sigma using 'np.linalg.eig(sigma)'
l,X = np.linalg.eig(sigma)
print('-----')
print('Evalues:')
print(l)

-----
Evalues:
[ 0.29179607 13.70820393]


In [34]:
print('-----')
print('Evectors:')
print(X)

-----
Evectors:
[[-0.85065081 -0.52573111]
 [ 0.52573111 -0.85065081]]


In [36]:
# 4. Compress by ordering k evectors according to larfest evalues 
print('----')
print('Compressed - 4D to 2D:')
Acomp=A@X[:,:2]
print(Acomp)

----
Compressed - 4D to 2D:
[[ 0.52573111 -0.85065081]
 [-0.12410828 -3.60341465]]


In [40]:
# 5. Reconstruct from compressed version by computing 
print('-----')
print('Reconstructed version - 2D to 4D:')
Arec=A@X[:,:2]@X[:,:2].T # first 2 evectors
print(Arec)

-----
Reconstructed version - 2D to 4D:
[[0. 1.]
 [2. 3.]]


In [42]:
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
pca.fit(A0)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [43]:
print("Principla components")
print(pca.components_)

Principla components
[[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
 [ 0.65658877  0.73016143 -0.17337266 -0.07548102]]


In [44]:
print('-----')
print("Compressed - 4D to 2D:")
print(pca.transform(A0)[:5,:])

-----
Compressed - 4D to 2D:
[[-2.68412563  0.31939725]
 [-2.71414169 -0.17700123]
 [-2.88899057 -0.14494943]
 [-2.74534286 -0.31829898]
 [-2.72871654  0.32675451]]


In [48]:
print('-----')
print('reconstruced - 2D to 4D')
print(pca.inverse_transform(pca.transform(A0)))[:5,:]

-----
reconstruced - 2D to 4D
[[5.08303897 3.51741393 1.40321372 0.21353169]
 [4.7462619  3.15749994 1.46356177 0.24024592]
 [4.70411871 3.1956816  1.30821697 0.17518015]
 [4.6422117  3.05696697 1.46132981 0.23973218]
 [5.07175511 3.52655486 1.36373845 0.19699991]
 [5.50581049 3.79140823 1.67552816 0.32616959]
 [4.76528947 3.23041102 1.35723837 0.19551776]
 [5.00155648 3.39859911 1.47993231 0.2460815 ]
 [4.42052031 2.87903672 1.3855842  0.20882514]
 [4.80273233 3.20016781 1.48805402 0.2503016 ]
 [5.36090126 3.74023124 1.4985348  0.25243081]
 [4.90879014 3.28892521 1.51717562 0.26209953]
 [4.6820989  3.12115258 1.41198408 0.21884697]
 [4.34251794 2.95641673 1.08492393 0.08287986]
 [5.66151963 4.14156276 1.28795452 0.16277348]
 [5.85960752 4.23600886 1.48196707 0.24344301]
 [5.4275086  3.87100742 1.36995112 0.19816072]
 [5.09103106 3.50887425 1.43521594 0.22693854]
 [5.62144408 3.88058108 1.72215216 0.34527869]
 [5.24526768 3.65105838 1.4519108  0.23332171]
 [5.26539106 3.53834771 1.7110

TypeError: 'NoneType' object is not subscriptable