In [1]:
# 데이터 로드
from sklearn.datasets import fetch_openml
mnist=fetch_openml('mnist_784', version=1)

In [2]:
# train, test split
X_train = mnist['data'][:60000]
y_train = mnist['target'][:60000]
X_test = mnist['data'][60000:]
y_test = mnist['target'][60000:]

In [4]:
y_train

0        5
1        0
2        4
3        1
4        9
        ..
59995    8
59996    3
59997    5
59998    6
59999    8
Name: class, Length: 60000, dtype: category
Categories (10, object): ['0', '1', '2', '3', ..., '6', '7', '8', '9']

* 랜덤포레스트에 적용

In [5]:
# 랜덤포레스트에 적용 (n_estimators=10, random_state=42)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

import numpy as np

rcf = RandomForestClassifier(random_state=42,n_estimators = 10)
scores = cross_val_score(rcf,X_train,y_train,scoring='accuracy',cv=3)
print('원본 데이터 교차 검증 개별 정확도:',scores)
print('원본 데이터 평균 정확도:',np.mean(scores))

[0.94095 0.9376  0.94055]
0.9396999999999999


In [6]:
# 정확도 결과
# 랜덤 포레스트 학습 및 별도의 테스트 세트로 예측 성능 평가
from sklearn.metrics import accuracy_score
rcf.fit(X_train, y_train)
pred = rcf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print('랜덤 포레스트 정확도: {:.4f}'.format(accuracy))

랜덤 포레스트 정확도: 0.9492


* PCA로 변환한 데이터 세트에 랜덤 포레스트 적용

In [7]:
from sklearn.preprocessing import StandardScaler

X_train_scaled = StandardScaler().fit_transform(X_train)
X_test_scaled = StandardScaler().fit_transform(X_test)

In [32]:
# PCA 적용 95% : n_components=0.95
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
pca.fit(X_train_scaled)
train_pca = pca.transform(X_train_scaled)
print(train_pca.shape)

(60000, 331)


In [27]:
X_train.shape

(60000, 784)

In [11]:
print(pca.explained_variance_ratio_)
np.sum(pca.explained_variance_ratio_)

[0.05646717 0.04078272 0.0373938  0.02885115 0.02521109 0.0219427
 0.01923344 0.01745799 0.01535092 0.0140172  0.01341743 0.01203742
 0.0111457  0.01089924 0.01028649 0.00994487 0.00936383 0.00921046
 0.00893437 0.00869913 0.00827363 0.00803417 0.00764846 0.00741772
 0.00715293 0.00691847 0.00684136 0.00656675 0.00631677 0.0061292
 0.00596255 0.00587716 0.00571592 0.00562307 0.00554682 0.00538418
 0.00531182 0.00519606 0.00508211 0.00480006 0.00476456 0.00469139
 0.00454349 0.00451346 0.00446963 0.00443383 0.00438215 0.00430382
 0.00426878 0.00423647 0.00404696 0.00399447 0.00397456 0.00393821
 0.00385814 0.00379043 0.00375403 0.00370776 0.00364944 0.00359301
 0.00352382 0.00347794 0.00344411 0.00339868 0.00335955 0.00334886
 0.00331864 0.00323026 0.00316277 0.00313244 0.00310731 0.00307243
 0.00304914 0.00302717 0.00299485 0.00297761 0.00295052 0.00290438
 0.00286856 0.00285678 0.00283398 0.00282627 0.00279551 0.00279305
 0.00278519 0.00277455 0.00275901 0.00274227 0.00271411 0.002692

0.9502951572319143

In [14]:
# PCA 이후 랜덤포레스트 훈련
rcf = RandomForestClassifier(random_state=42,n_estimators = 10)
scores = cross_val_score(rcf,data_pca,y_train,scoring='accuracy',cv=3)
print('원본 데이터 교차 검증 개별 정확도:',scores)
print('원본 데이터 평균 정확도:',np.mean(scores))

원본 데이터 교차 검증 개별 정확도: [0.87095 0.86565 0.87315]
원본 데이터 평균 정확도: 0.8699166666666667


In [34]:
# 정확도 결과 (X_test도 PCA 적용해야합니다.)
test_pca = pca.transform(X_test_scaled)

In [21]:
test_pca.shape

(10000, 331)

In [35]:
rcf.fit(train_pca, y_train)
pred = rcf.predict(test_pca)
accuracy = accuracy_score(y_test, pred)
print('랜덤 포레스트 정확도: {:.4f}'.format(accuracy))

랜덤 포레스트 정확도: 0.8900
