In [2]:
# GBM based classification using GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time
import warnings
warnings.filterwarnings('ignore')

In [3]:
#### UTIL ####

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# In features.txt, each line holds a feature index and a feature name separated by whitespace.
# The separator regex is written as a raw string so that `\s` is not parsed as an (invalid) string escape.
feature_name_df = pd.read_csv('./human_activity/features.txt', sep=r'\s+', header=None, names=['column_index', 'column_name'])

def get_new_feature_name_df(old_feature_name_df):
    """Return a copy of the feature-name table with duplicate names made unique.

    The first occurrence of a name is kept unchanged; every later occurrence
    gets a ``_<n>`` suffix, where ``n`` is how many times that name has
    already appeared (``'a', 'a', 'a'`` becomes ``'a', 'a_1', 'a_2'``).

    Parameters
    ----------
    old_feature_name_df : pandas.DataFrame
        Table with a ``column_name`` column of string feature names.

    Returns
    -------
    pandas.DataFrame
        The input columns (with ``column_name`` de-duplicated) followed by a
        ``dup_cnt`` column holding each row's occurrence counter. The result
        has a fresh 0..n-1 index; the input frame is not modified.
    """
    # reset_index returns a new frame, so the caller's DataFrame is left untouched.
    new_feature_name_df = old_feature_name_df.reset_index(drop=True)

    # 0 for the first occurrence of a name, 1 for the second, and so on.
    dup_cnt = new_feature_name_df.groupby('column_name').cumcount()
    new_feature_name_df['dup_cnt'] = dup_cnt

    # Vectorised rename: keep the name where it is a first occurrence, otherwise
    # append the counter. This avoids row-wise apply and positional x[0]/x[1]
    # access on a label-indexed Series, which pandas has deprecated.
    names = new_feature_name_df['column_name']
    new_feature_name_df['column_name'] = names.where(dup_cnt == 0, names + '_' + dup_cnt.astype(str))
    return new_feature_name_df

def get_human_dataset(data_dir='./human_activity'):
    """Load the human-activity train/test features and labels.

    Parameters
    ----------
    data_dir : str or path-like, default './human_activity'
        Directory containing ``features.txt`` plus the ``train/`` and
        ``test/`` sub-directories with the ``X_*.txt`` / ``y_*.txt`` files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``. The feature frames use the
        de-duplicated feature names as column labels; the label frames have
        a single ``action`` column.
    """
    from pathlib import Path

    data_dir = Path(data_dir)
    # Every data file is whitespace-separated. The regex is a raw string so
    # that `\s` is not parsed as an (invalid) string escape sequence.
    whitespace = r'\s+'

    feature_name_df = pd.read_csv(data_dir / 'features.txt', sep=whitespace, header=None,
                                  names=['column_index', 'column_name'])
    # The raw feature names contain duplicates, so make them unique before
    # using them as column labels.
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    # Select the names by label rather than by column position.
    feature_name = new_feature_name_df['column_name'].tolist()

    # Read the train and test sets of features and labels.
    X_train = pd.read_csv(data_dir / 'train' / 'X_train.txt', sep=whitespace, header=None, names=feature_name)
    X_test = pd.read_csv(data_dir / 'test' / 'X_test.txt', sep=whitespace, header=None, names=feature_name)
    y_train = pd.read_csv(data_dir / 'train' / 'y_train.txt', sep=whitespace, header=None, names=['action'])
    y_test = pd.read_csv(data_dir / 'test' / 'y_test.txt', sep=whitespace, header=None, names=['action'])

    return X_train, X_test, y_train, y_test

In [4]:
# Load the human-activity dataset: feature frames (X_*) and single-column 'action' label frames (y_*).

X_train, X_test, y_train, y_test = get_human_dataset()

In [5]:
# Measure how long the GBM classifier takes end to end (fit, predict and scoring).
start_time = time.time()

gb_clf = GradientBoostingClassifier(random_state=0)
# y_train is a one-column DataFrame; flatten it to the 1-D label vector that fit() expects
# instead of relying on scikit-learn's column-vector conversion (and its suppressed warning).
gb_clf.fit(X_train, y_train.values.ravel())
gb_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

print('GBM 정확도: {0:.4f}'.format(gb_accuracy))
print('GBM 수행 시간: {0:.1f}'.format(time.time() - start_time))

GBM 정확도: 0.9389
GBM 수행 시간: 472.3


In [6]:
# As seen above, GBM takes a very long time to fit the data.
# GradientBoostingClassifier trains step by step, each stage correcting the prediction errors of the previous one.
# Therefore, multi-threaded training is not applicable to GBM, unlike random forest, which is often much faster than GBM.

# The hyperparameters of this model can also be optimised using GridSearchCV.
# The code below will find the best set of params, but it takes about an hour or two to finish.

from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100, 500],
    'learning_rate': [0.05, 0.1]
}

start_time = time.time()

grid_cv = GridSearchCV(gb_clf, param_grid=params, cv=2, verbose=1)
# y_train is a one-column DataFrame; flatten it to the 1-D label vector that fit() expects
# instead of relying on scikit-learn's column-vector conversion (and its suppressed warning).
grid_cv.fit(X_train, y_train.values.ravel())
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))
print('GBM 수행 시간: {0:.1f}'.format(time.time() - start_time))

Fitting 2 folds for each of 4 candidates, totalling 8 fits
최적 하이퍼 파라미터:
 {'learning_rate': 0.1, 'n_estimators': 500}
최고 예측 정확도: 0.9011
GBM 수행 시간: 7718.7


In [None]:
# As shown, it took about two hours to finish the task.