In [1]:
import pandas as pd

url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/features.txt'
feature_name_df = pd.read_csv(url, sep='\s+', header=None, names=['column_index', 'column_name'])
feature_names = feature_name_df.iloc[:, 1].values.tolist()

X_train_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/X_train.txt'
X_test_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/X_test.txt'

X_train = pd.read_csv(X_train_url, delim_whitespace=True, header=None)
X_test = pd.read_csv(X_test_url, delim_whitespace=True, header=None)

X_train.columns = feature_names
X_test.columns = feature_names

In [2]:
y_train_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/train/y_train.txt'
y_test_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/HAR_dataset/test/y_test.txt'

y_train = pd.read_csv(y_train_url, delim_whitespace=True, header=None, names=['action'])
y_test = pd.read_csv(y_test_url, delim_whitespace=True, header=None, names=['action'])

In [3]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time
import warnings

warnings.filterwarnings('ignore')

In [4]:
start_time = time.time()
gb_clf = GradientBoostingClassifier(random_state=13)
gb_clf.fit(X_train, y_train)

gb_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
print('GBM 정확도: {0:.4f}'.format(gb_accuracy))
print('Fit time: {}'.format(time.time() - start_time))

GBM 정확도: 0.9389
Fit time: 942.4561104774475


In [5]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100, 500],
    'learning_rate': [0.05, 0.1],
}

start_time = time.time()
grid_cv = GridSearchCV(gb_clf, param_grid=params, cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print('Fit time: {}'.format(time.time() - start_time))

Fitting 2 folds for each of 4 candidates, totalling 8 fits
Fit time: 12045.124438762665


In [6]:
print(grid_cv.best_params_)
grid = grid_cv.best_estimator_
grid_pred = grid.predict(X_test.values)
grid_accuracy = accuracy_score(y_test, grid_pred)
print('GridSearhCV Best Estimator 정확도: {0:.4f}'.format(grid_accuracy))

{'learning_rate': 0.05, 'n_estimators': 500}
GridSearhCV Best Estimator 정확도: 0.9393


In [12]:
import numpy as np

np.unique(y_train)

array([1, 2, 3, 4, 5, 6])

In [13]:
from xgboost import XGBClassifier

start_time = time.time()

# eta: learning rate, num_boost_rounds: n_estimators
xgb = XGBClassifier(eta=0.1, num_boost_rounds=400, max_depth=3)

# xgb에선 입력데이터를 array로 만들어야 함
# xgb에선 레이블이 0부터 시작하는 숫자이길 기대하기 때문에 레이블을 조정해야 함
y_train_adjust = y_train - 1
xgb.fit(X_train.values, y_train_adjust)
print('Fit time: {}'.format(time.time() - start_time))

xgb_pred = xgb.predict(X_test.values)

y_test_adjust = y_test - 1
xgb_accuracy = accuracy_score(y_test_adjust, xgb_pred)

print('XGBoost 정확도: {0:.4f}'.format(xgb_accuracy))

Fit time: 31.33473539352417
XGBoost 정확도: 0.9359


In [18]:
# 조기 종료 조건 추가
evals = [(X_test.values, y_test_adjust)]

start_time = time.time()
xgb = XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=3) #sklearn wrapper

# 조기 종료 조건을 추가하기 위해선 eval_set을 설정해야 함
xgb.fit(X_train.values, y_train_adjust, early_stopping_rounds=10, eval_set=evals)
print('Fit time: {}'.format(time.time() - start_time))

xgb_pred = xgb.predict(X_test.values)
xgb_accuracy = accuracy_score(y_test_adjust, xgb_pred)

print('XGBoost 정확도: {0:.4f}'.format(xgb_accuracy))

[0]	validation_0-mlogloss:1.58912
[1]	validation_0-mlogloss:1.43298
[2]	validation_0-mlogloss:1.30579
[3]	validation_0-mlogloss:1.19398
[4]	validation_0-mlogloss:1.10151
[5]	validation_0-mlogloss:1.01952
[6]	validation_0-mlogloss:0.94821
[7]	validation_0-mlogloss:0.88468
[8]	validation_0-mlogloss:0.82846
[9]	validation_0-mlogloss:0.77660
[10]	validation_0-mlogloss:0.73051
[11]	validation_0-mlogloss:0.68873
[12]	validation_0-mlogloss:0.65163
[13]	validation_0-mlogloss:0.61809
[14]	validation_0-mlogloss:0.58776
[15]	validation_0-mlogloss:0.55936
[16]	validation_0-mlogloss:0.53447
[17]	validation_0-mlogloss:0.51131
[18]	validation_0-mlogloss:0.49076
[19]	validation_0-mlogloss:0.47043
[20]	validation_0-mlogloss:0.45119
[21]	validation_0-mlogloss:0.43441
[22]	validation_0-mlogloss:0.41777
[23]	validation_0-mlogloss:0.40352
[24]	validation_0-mlogloss:0.38949
[25]	validation_0-mlogloss:0.37684
[26]	validation_0-mlogloss:0.36371
[27]	validation_0-mlogloss:0.35286
[28]	validation_0-mlogloss:0.3

In [32]:
from lightgbm import LGBMClassifier
from lightgbm import early_stopping

evals = [(X_test.values, y_test)]

start_time = time.time()
lgbm = LGBMClassifier(n_estimators=400)
lgbm.fit(X_train.values, y_train,
         eval_set=evals,
         callbacks=[early_stopping(stopping_rounds=100)])
print('Fit time: {}'.format(time.time() - start_time))

lgbm_pred = lgbm.predict(X_test.values)
lgbm_accuracy = accuracy_score(y_test, lgbm_pred)

print('LGBM 정확도: {0:.4f}'.format(lgbm_accuracy))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059376 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140170
[LightGBM] [Info] Number of data points in the train set: 7352, number of used features: 561
[LightGBM] [Info] Start training from score -1.791216
[LightGBM] [Info] Start training from score -1.924514
[LightGBM] [Info] Start training from score -2.009071
[LightGBM] [Info] Start training from score -1.743436
[LightGBM] [Info] Start training from score -1.677246
[LightGBM] [Info] Start training from score -1.653513
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[38]	valid_0's multi_logloss: 0.233106
Fit time: 55.45976281166077
LGBM 정확도: 0.9260
