In [1]:
import pandas as pd
import numpy as np
import numpy.linalg as lin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import time

df = pd.read_csv('train.csv')

In [2]:
X_train = df.drop('label', axis=1).values
target_label = df['label'].values

X_tr, X_test, y_tr, y_test = train_test_split(X_train, target_label, test_size=0.2, random_state = 2019)

## original data standardization

In [3]:
standardizer = StandardScaler().fit(X_tr)

X_tr_std = standardizer.transform(X_tr)
X_test_std = standardizer.transform(X_test)
X_tr_std.shape #(33600, 784)



(33600, 784)

## PCA (eigenvalue > 1)

In [4]:
cov_mat = np.cov(X_tr.T)
cov_mat.shape
explain_values_raw, components_raw = lin.eig(cov_mat)
pca_k = len(explain_values_raw[explain_values_raw > 1])

In [5]:
pca = PCA(pca_k).fit(X_tr_std)
pca_X_tr = pca.transform(X_tr_std)
pca_X_test = pca.transform(X_test_std)
components = pca.components_
pca_X_tr.shape #(33600, 649)

(33600, 649)

### knn with original data

In [6]:
time_start = time.time()
real_knn_k = int(np.sqrt(X_tr.shape[0]))
real_knn = KNeighborsClassifier(n_neighbors = real_knn_k, n_jobs = -1)
real_knn.fit(X_tr_std, y_tr)
#clusters = real_knn.predict(X_test_std)
print( 'Accuracy: {} '.format(real_knn.score(X_test_std, y_test)))
print( 'Time elapsed: {} seconds'.format(time.time() - time_start))
'''
Accuracy: 0.9182142857142858 
Time elapsed: 183.388032913208 seconds
'''

Accuracy: 0.9182142857142858 
Time elapsed: 183.388032913208 seconds


### knn with pca data

In [7]:
time_start = time.time()
pca_knn_k = int(np.sqrt(pca_X_tr.shape[0]))
pca_knn = KNeighborsClassifier(n_neighbors = pca_knn_k, n_jobs = -1)
pca_knn.fit(pca_X_tr, y_tr)
#clusters = pca_knn.predict(pca_X_test)
print( 'Accuracy: {} '.format(pca_knn.score(pca_X_test, y_test)))
print( 'Time elapsed: {} seconds'.format(time.time() - time_start))
'''
Accuracy: 0.9205952380952381 
Time elapsed: 162.1506233215332 seconds
'''

Accuracy: 0.9205952380952381 
Time elapsed: 162.1506233215332 seconds


### random forest with original data

In [8]:
time_start = time.time()
real_rf = RandomForestClassifier(n_estimators =10,  n_jobs = -1, random_state = 2019)
real_rf.fit(X_tr_std, y_tr)
#clusters = real_rf.predict(X_test_std)
print( 'Accuracy: {} '.format(real_rf.score(X_test_std, y_test)))
print( 'Time elapsed: {} seconds'.format(time.time() - time_start))
'''
Accuracy: 0.9364285714285714 
Time elapsed: 1.7449746131896973 seconds
'''

Accuracy: 0.9364285714285714 
Time elapsed: 1.7449746131896973 seconds


### random forest with pca data

In [9]:
time_start = time.time()
pca_rf = RandomForestClassifier(n_estimators = 10,  n_jobs = -1, random_state = 2019)
pca_rf.fit(pca_X_tr, y_tr)
#clusters = pca_rf.predict(pca_X_test)
print( 'Accuracy: {} '.format(pca_rf.score(pca_X_test, y_test)))
print( 'Time elapsed: {} seconds'.format(time.time() - time_start))
'''
Accuracy: 0.8263095238095238 
Time elapsed: 6.1536946296691895 seconds
'''

Accuracy: 0.8263095238095238 
Time elapsed: 6.1536946296691895 seconds


knn의 k 개수는 통상적으로 사용되는 전체 차원수의 제곱근을 사용하였다.

knn의 경우 original data에 비해 pca 데이터가 시간이 20초 가량 빨라졌다. ( test accuracy 약 0.01 하락)

random forest의 경우 test accuracy 약 0.1 감소하며 오히려 소요시간이 증가하는 모습을 보인다.

이는 original data의 원소들의 상당수가 0인것에 비해 pca 데이터는 모든 원소가 특정한 수를 가지고 있어서 연산량이 늘어난 것으로 보인다.

## PCA (10)

In [6]:
pca = PCA(10).fit(X_tr_std)
pca_X_tr = pca.transform(X_tr_std)
pca_X_test = pca.transform(X_test_std)
components = pca.components_
pca_X_tr.shape #(33600, 10)

(33600, 10)

### knn with pca (10) data

In [8]:
time_start = time.time()
pca_knn_k = int(np.sqrt(pca_X_tr.shape[0]))
pca_knn = KNeighborsClassifier(n_neighbors = pca_knn_k, n_jobs = -1)
pca_knn.fit(pca_X_tr, y_tr)
#clusters = pca_knn.predict(pca_X_test)
print( 'Accuracy: {} '.format( pca_knn.score(pca_X_test, y_test)))
print( 'Time elapsed: {} seconds'.format(time.time() - time_start))
'''
Accuracy: 0.8990476190476191 
Time elapsed: 0.6804764270782471 seconds
'''

Accuracy: 0.8676190476190476 
Time elapsed: 2.8538858890533447 seconds


'\nAccuracy: 0.8990476190476191 \nTime elapsed: 0.6804764270782471 seconds\n'

### random forest with pca (10) data

In [12]:
time_start = time.time()
pca_rf = RandomForestClassifier(n_estimators = 10,  n_jobs = -1, random_state = 2019)
pca_rf.fit(pca_X_tr, y_tr)
#clusters = pca_rf.predict(pca_X_test)
print( 'Accuracy: {} '.format( pca_rf.score(pca_X_test, y_test)))
print( 'Time elapsed: {} seconds'.format(time.time() - time_start))
'''
Accuracy: 0.8775 
Time elapsed: 0.8635261058807373 seconds
'''

Accuracy: 0.8775 
Time elapsed: 0.8635261058807373 seconds


pca 의 시간 감소를 확인하기위해 차원 수를 10으로 줄였다.

knn의 경우 original data가 183초 걸렸던 것에 비해 0.7초로 연산이 매우 빨라졌고 accuracy는 0.92에서 0.90으로 줄었다.

random forest의 경우 original data의 1.7초에서 0.86초로 빨라졌고 accuracy는 0.93에서 0.88로 감소하였다.