# Week 6 Session :: PCA

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

### eigen decomposition 을 이용한 PCA 

In [None]:
# generate toy dataset
np.random.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)

X.shape

In [None]:
# PCA를 수행하기 위해 먼저 공분산행렬을 구합니다.
X_cen = X - X.mean(axis=0)  # scaling
X_cov = np.dot(X_cen.T, X_cen) / 59 # covariance matrix

print(X_cov)

In [None]:
# 다음으로 공분산행렬 X_cov에 대하여 eigenvalue와 eigenvector를 구합니다.
w, v = np.linalg.eig(X_cov)

print('eigenvalue :', w)
print('eigenvector :', v)

In [None]:
# 마지막으로 데이터에 대한 eigenvector들의 설명력을 확인합니다.
print('explained variance ratio :', w / w.sum())

### Singular Value Decomposition을 이용한 PCA

In [None]:
# generate toy dataset
np.random.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)

X.shape

In [None]:
# PCA를 수행하기 위해 먼저 데이터셋 X를 스케일링 합니다.
X_cen = X - X.mean(axis=0)  # scaling

print(X[0:5])
print(X_cen[0:5])

In [None]:
# 다음으로 X_cen에 대하여 singular value와 singular vector를 구합니다.
U, D, V_t = np.linalg.svd(X_cen)

print('singular value :', D)
print('singular vector :\n', V_t.T)

In [None]:
# 마지막으로 설명력을 확인합니다.
print('explained variance ratio :', D ** 2 / np.sum(D**2))

# Week 6 LAB :: PCA & FA

In [None]:
# load iris dataset
iris = load_iris()
df = pd.DataFrame(data=np.c_[iris['data'], iris['target']], columns=iris['feature_names']+['target'])
df['target'] = df['target'].map({0:'setosa', 1:'versicolor', 2:'virginica'})
df.head()

In [None]:
df.shape

In [None]:
df.describe()

### Principal Component Analysis

In [None]:
# 먼저 scaling을 실시합니다.
# Transform scales onto unit scale (mean=0 and variance=1)
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
X = df.loc[:, features].values
y = df.loc[:, ['target']].values

X = StandardScaler().fit_transform(X) # Standardization using Standard Scaler
X

In [None]:
pd.DataFrame(data=X, columns=features).head()

In [None]:
# 다음으로 공분산행렬을 구합니다.
covariance_matrix = np.cov(X.T)

print(covariance_matrix)

In [None]:
# 공분산행렬에 대하여 eigenvalue와 eigenvector를 구합니다.
eig_vals, eig_vecs = np.linalg.eig(covariance_matrix)

print('eigenvalue :', eig_vals) # eigen values
print('eigenvector :', eig_vecs) # eigen vectors

가장 큰 고유값인 2.93808505을 갖는 eigen vector가 제1주성분이 됩니다.

In [None]:
# PCA를 진행합니다.
pca = PCA(n_components=2) # PCA 선언 : 2차원으로 차원 축소 => 제1주성분, 제2주성분 총 2개의 주성분 반환
X_pca = pca.fit_transform(X) # PCA 적합

df_pca = pd.DataFrame(data=X_pca, columns=['Principal Component 1', 'Principal Component 2'])
df_pca = pd.concat([df_pca, df[['target']]], axis=1)

In [None]:
# PCA 결과를 시각화를 통해 확인합니다.
plt.figure(figsize=(10, 5))
sns.scatterplot(x = df_pca['Principal Component 1'], y = df_pca['Principal Component 2'], hue = finalDf['target'])

In [None]:
# 주성분들의 설명력을 확인합니다.
pca.explained_variance_ratio_

PC1은 데이터를 약 72% 정도, PC2는 23% 정도 설명합니다.

### Factor Analysis : scikit-learn의 FactorAnalysis 이용

In [None]:
# Create factor analysis object and perform factor analysis
X = df.loc[:, features].values
fa = FactorAnalysis(n_components=10)
X_fa = fa.fit_transform(X)

In [None]:
df_fa = pd.DataFrame(data=X_fa, columns=['Factor1', 'Factor2', 'Factor3', 'Factor4'])
df_fa.head()

Factor1과 Factor2만 유의미합니다.

In [None]:
plt.figure(figsize=(10, 5))
sns.scatterplot(x = df_fa['Factor1'], y = df_fa['Factor2'], hue=df['target'])

### Factor Analysis : factor_analyzer의 FactorAnalyzer 이용

In [None]:
# Import the required modules
import pandas as pd
import sklearn.datasets
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt

In [None]:
# Import the dataset
fpath = './bfi.csv'
df = pd.read_csv(fpath, index_col=0)
df.head()

In [None]:
# Drop unnecessary columns
df.drop(['gender', 'education', 'age'], axis=1, inplace=True)
df.head()

In [None]:
df = df.dropna()

In [None]:
# create a factor analyzer variable and perform factor analysis
fa = FactorAnalyzer(n_factors=6, rotation='varimax')

In [None]:
fa.fit(df)

In [None]:
# get the eigenvectors and eigenvalues
ev, v = fa.get_eigenvalues()

In [None]:
# Do a scree plot
xvals = range(1, df.shape[1]+1)

plt.scatter(xvals, ev)
plt.plot(xvals, ev)
plt.title('Scree Plot')
plt.xlabel('Factor')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()

# Week6 HW

wine dataset을 활용하여 PCA 혹은 FA를 적용한 후, 적용하기 전과 적용한 후 분류모델의 성능을 비교해주세요.

### 0) Import Data

In [1]:
from sklearn.datasets import load_wine
wine = load_wine()

In [6]:
data = pd.DataFrame(wine.data, columns=wine.feature_names)
target = pd.Series(wine.target, dtype="category")
target = target.cat.rename_categories(wine.target_names)
data['class'] = target

In [8]:
data.rename(columns = {'od280/od315_of_diluted_wines' : 'diluted_wines'}, inplace = True)

In [14]:
data = data.replace({'class':'class_0'}, {'class':1}).copy()
data = data.replace({'class':'class_1'}, {'class':2}).copy()
data = data.replace({'class':'class_2'}, {'class':3}).copy()

In [15]:
data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,1
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,1
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,1
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,1
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,1


### 1) Train-Test Split

In [16]:
data.iloc[:, 0:14]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,1
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,1
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,1
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,1
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,3
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,3
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,3
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,3


In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(data.iloc[:, 0:14], 
                                                    data['class'],
                                                   test_size = 0.3,
                                                   random_state = 26)

In [29]:
print(X_train.shape) ;print( X_test.shape)

(124, 14)
(54, 14)


In [30]:
from sklearn.preprocessing import StandardScaler
# Standardization using Standard Scaler
X_train_f = StandardScaler().fit_transform(X_train) 
X_test_f = StandardScaler().fit_transform(X_test)

In [31]:
Y_train.reset_index(drop=True, inplace=True)
Y_test.reset_index(drop=True, inplace=True)

In [32]:
Y_train

0      2
1      1
2      1
3      2
4      2
      ..
119    2
120    2
121    1
122    2
123    1
Name: class, Length: 124, dtype: category
Categories (3, int64): [1, 2, 3]

### PCA, FA 사용 전 Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [36]:
forest = RandomForestClassifier(n_estimators = 100)
forest.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [39]:
Y_pred = forest.predict(X_test)

print(metrics.accuracy_score(Y_test,Y_pred))

1.0


### FA를 돌려보겠다.

In [41]:
pip install factor_analyzer

Collecting factor_analyzer
  Downloading factor_analyzer-0.3.2.tar.gz (40 kB)
Building wheels for collected packages: factor-analyzer
  Building wheel for factor-analyzer (setup.py): started
  Building wheel for factor-analyzer (setup.py): finished with status 'done'
  Created wheel for factor-analyzer: filename=factor_analyzer-0.3.2-py3-none-any.whl size=40383 sha256=5297d786f09b8ea9ac3b32f1988c9d2ac19640618f16854e48c4df2a15e57d6f
  Stored in directory: c:\users\dhxog\appdata\local\pip\cache\wheels\8d\9e\4c\fd4cb92cecf157b13702cc0907e5c56ddc48e5388134dc9f1a
Successfully built factor-analyzer
Installing collected packages: factor-analyzer
Successfully installed factor-analyzer-0.3.2
Note: you may need to restart the kernel to use updated packages.


In [42]:
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt

In [45]:
cov_mat = np.cov(X_train_f.T)
eig_val, eig_vec = np.linalg.eig(cov_mat)

In [46]:
eig_val

array([5.41705365, 2.45823383, 1.50963167, 0.98872873, 0.96559724,
       0.71189244, 0.53409898, 0.05761138, 0.14058512, 0.16759196,
       0.35637604, 0.30425134, 0.23854965, 0.26361911])

#### Eigen Value가 1 이상인 것이 세 개라 세 개로 하겠다.

In [47]:
fa = FactorAnalyzer(n_factors=3, rotation='varimax')

In [48]:
fa_fit = fa.fit(X_train_f)
X_fa = fa_fit.transform(X_train_f)

In [49]:
X_fa_test = fa_fit.transform(X_test_f)

In [50]:
famodel = RandomForestClassifier(n_estimators=100)
famodel.fit(X_fa, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [51]:
print(famodel.score(X_fa, Y_train))

1.0


In [52]:
print(famodel.score(X_fa_test, Y_test))

1.0
