## PCA - Iris 사례
- PCA (Principal Component Analysis, 주성분 분석)

In [1]:
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
# Load the Iris dataset bundled with scikit-learn; `iris.data` is the feature
# matrix that the following cells standardize and decompose.
from sklearn.datasets import load_iris
iris = load_iris()

In [5]:
# Standardize every feature column (zero mean, unit variance) before PCA.
# Centering matters here: the manual covariance below is computed as
# X^T X / (n - 1), which is only the covariance when the columns have zero mean.
from sklearn.preprocessing import StandardScaler
iris_std = StandardScaler().fit_transform(iris.data)

#### 1. Numpy 로 직접 구하기 
- 공분산 (Covariance) 행렬

In [6]:
# Sample covariance matrix of the standardized features, computed by hand:
# X^T X divided by (number of samples - 1). Valid because the columns of
# iris_std are already centered.
iris_cov = iris_std.T.dot(iris_std) / (iris_std.shape[0] - 1)
iris_cov

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [7]:
# Cross-check the hand-computed covariance with NumPy's own routine.
# rowvar=False tells np.cov that each column (not each row) is a variable.
np.cov(iris_std, rowvar=False)

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

- 고유벡터(eigenvector), 고유값(eigenvalue)

In [10]:
# Eigen-decomposition of the covariance matrix. Column i of `eig_vecs` is the
# eigenvector that belongs to `eig_vals[i]`.
eig_vals, eig_vecs = np.linalg.eig(iris_cov)

# np.linalg.eig makes no promise about the order of the eigenvalues, but the
# following cells take columns 0 and 1 as PC1 and PC2. Reorder the eigenpairs by
# descending eigenvalue so the leading columns are always the directions of
# largest variance.
desc = np.argsort(eig_vals)[::-1]
eig_vals, eig_vecs = eig_vals[desc], eig_vecs[:, desc]
eig_vecs

array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
       [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
       [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
       [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]])

In [11]:
# Eigenvalues of the covariance matrix; eig_vals[i] pairs with column i of eig_vecs.
eig_vals

array([2.93808505, 0.9201649 , 0.14774182, 0.02085386])

- 주성분: PC1, PC2

In [14]:
# PC1 scores: project each standardized sample onto the first eigenvector.
# The 0:1 slice keeps the eigenvector as a 2-D column, so the result is a
# column of scores (one row per sample).
pc1 = iris_std.dot(eig_vecs[:, 0:1])
pc1[:5]

array([[-2.26470281],
       [-2.08096115],
       [-2.36422905],
       [-2.29938422],
       [-2.38984217]])

In [16]:
# PC2 scores: project each standardized sample onto the second eigenvector
# (column index 1, kept 2-D by the 1:2 slice).
pc2 = iris_std.dot(eig_vecs[:, 1:2])
pc2[:5]

array([[-0.4800266 ],
       [ 0.67413356],
       [ 0.34190802],
       [ 0.59739451],
       [-0.64683538]])

In [17]:
# The four standardized feature values of each sample are reduced to two values (PC1, PC2).
# e.g. [-0.90068117, 1.01900435, -1.34022653, -1.3154443] -> PC1 = -2.26470281, PC2 = -0.4800266
iris_std[:5]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

#### 2. Scikit-Learn으로 구하기

In [18]:
# Create a scikit-learn PCA estimator that keeps two principal components.
from sklearn.decomposition import PCA 
pca = PCA(n_components=2)

In [19]:
# The second component has the opposite sign from the NumPy-derived PC2 above.
# Only its direction is flipped by 180 degrees; it is still orthogonal to PC1.
iris_pca = pca.fit_transform(iris_std)
iris_pca[:5]

array([[-2.26470281,  0.4800266 ],
       [-2.08096115, -0.67413356],
       [-2.36422905, -0.34190802],
       [-2.29938422, -0.59739451],
       [-2.38984217,  0.64683538]])

In [20]:
# How much of the original data's variance the retained principal components
# explain: the per-component ratios, followed by their total.
pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum()

(array([0.72962445, 0.22850762]), 0.9581320720000164)

 ####