In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load the Iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
X = iris.data                      # feature matrix (one row per sample)
y = iris.target                    # integer class label per sample
target_names = iris.target_names   # class names, used for plot legends later

# Hold out 30% of the samples for testing. A fixed random_state makes the
# split -- and therefore the covariance matrix, eigenvectors and plots derived
# from it -- reproducible across "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### 데이터 표준화(d 차원)

In [None]:
from sklearn.preprocessing import StandardScaler
# Estimate the scaling statistics from the training split only, then apply
# that same fitted scaler to both splits so the test data never influences
# the fit.
sc = StandardScaler()
X_train_std = sc.fit(X_train).transform(X_train)
X_test_std = sc.transform(X_test)

### 공분산 행렬 만들기

In [None]:
import numpy as np
# Covariance matrix of the standardized training features.
# rowvar=False tells NumPy that each column (not each row) is a variable,
# which is equivalent to passing the transposed matrix.
cov_mat = np.cov(X_train_std, rowvar=False)
cov_mat

### 공분산 행렬을 고유값과 고유벡터로 분해 

In [None]:
# A covariance matrix is symmetric, so use eigh (the solver for symmetric /
# Hermitian matrices) rather than the general-purpose eig: it always returns
# real eigenvalues and is numerically more stable.
# Note: eigh returns eigenvalues in ascending order; column eigen_vecs[:, i]
# is the eigenvector belonging to eigen_vals[i].
eigen_vals, eigen_vecs = np.linalg.eigh(cov_mat)
print(eigen_vals)
print(eigen_vecs)

### 가장 큰 k개의 고유값에 대응하는 k개의 고유벡터 선택 (k는 새로운 feature 부분공간의 차원, k <= d)

In [None]:
from operator import itemgetter
# Rank the eigenvalue positions from largest to smallest eigenvalue:
# `indices` holds the original positions, `eigen_sorted` the sorted values.
indices, eigen_sorted = zip(*sorted(enumerate(eigen_vals), reverse=True, key=itemgetter(1)))
# Position of the largest eigenvalue.
topk = indices[0]
# NumPy returns eigenvectors as the COLUMNS of eigen_vecs, so the vector that
# belongs to eigen_vals[topk] is eigen_vecs[:, topk]; indexing a row
# (eigen_vecs[topk]) would not yield an eigenvector at all.
top_eigenvectors = eigen_vecs[:, topk]
top_eigenvectors

### k개의 고유벡터로부터 투영행렬 W 생성

In [None]:
# Pair each eigenvalue magnitude with its eigenvector. Eigenvectors are the
# columns of eigen_vecs, so iterating over the transpose yields them one by
# one. The pairs are ordered from the largest magnitude to the smallest.
eigen_pairs = sorted(
    zip(np.abs(eigen_vals), eigen_vecs.T),
    key=lambda pair: pair[0],
    reverse=True,
)
eigen_pairs

In [None]:
# Projection matrix W: the eigenvectors of the two largest eigenvalues,
# placed side by side as columns.
w = np.column_stack((eigen_pairs[0][1], eigen_pairs[1][1]))
w

### 투영행렬 W를 사용하여 d 차원의 입력 데이터를 변환, k 차원 데이터 생성

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Project the standardized training data onto the two selected eigenvectors.
X_train_pca = X_train_std @ w

# One scatter series per class so each gets its own colour and legend entry.
colors = ['navy', 'turquoise', 'darkorange']
for class_id, color, class_name in zip(np.unique(y_train), colors, target_names):
    in_class = y_train == class_id
    plt.scatter(
        X_train_pca[in_class, 0],
        X_train_pca[in_class, 1],
        c=color,
        label=class_name,
    )

plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='best')
plt.show()
                

### scikit-learn

In [None]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the full (unscaled) feature matrix, then project
# that same matrix onto the fitted components.
pca = PCA(n_components=2)
pca.fit(X)
X_r = pca.transform(X)

# Fraction of the total variance captured by each retained component.
print('explained variance ratio (first two components): %s'
      % str(pca.explained_variance_ratio_))

plt.figure()
colors = ['navy', 'turquoise', 'darkorange']
lw = 2

# Class ids are 0, 1, 2 in the same order as the colours and class names.
for class_id, (shade, class_name) in enumerate(zip(colors, target_names)):
    members = y == class_id
    plt.scatter(
        X_r[members, 0],
        X_r[members, 1],
        color=shade,
        alpha=.8,
        lw=lw,
        label=class_name,
    )
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of IRIS dataset')

plt.show()