In [1]:
import pandas as pd
import os

# Current user's home directory ('~' expanded); used as the root for the dataset paths.
HOME = os.path.expanduser('~')

def get_new_feature_name_df(old_feature_name_df):
    """Return a copy of the feature table with duplicate names made unique.

    The first occurrence of a name in ``column_name`` is kept unchanged; the
    k-th repeat (k >= 1) is renamed to ``<name>_<k>``.

    Parameters
    ----------
    old_feature_name_df : pandas.DataFrame
        Must contain a ``column_name`` column of strings. Other columns are
        carried over untouched. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The input columns (with ``column_name`` de-duplicated) followed by a
        ``dup_cnt`` column holding each row's 0-based occurrence number of
        its original name. Rows keep their input order and the index is
        reset to 0..n-1.
    """
    # reset_index(drop=True) returns a new frame, so the caller's frame is
    # left untouched by the column assignments below.
    new_feature_name_df = old_feature_name_df.reset_index(drop=True)

    # 0 for the first time a name is seen, 1 for the second, and so on.
    dup_cnt = new_feature_name_df.groupby('column_name').cumcount()
    new_feature_name_df['dup_cnt'] = dup_cnt

    # Vectorized replacement for a row-wise apply that indexed each row by
    # position (x[0], x[1]); positional lookup on a string-labelled Series is
    # deprecated in pandas, and the per-row lambda was needlessly slow.
    names = new_feature_name_df['column_name']
    suffixed = names + '_' + dup_cnt.astype(str)
    new_feature_name_df['column_name'] = suffixed.where(dup_cnt > 0, names)
    return new_feature_name_df


def get_human_dataset(data_dir=None):
    """Load the human-activity feature matrices and labels from text files.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing ``features.txt`` plus ``train/`` and ``test/``
        sub-directories. Defaults to ``<HOME>/temp/human_activity``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``. The X frames use the
        de-duplicated names from ``features.txt`` as column labels; the y
        frames have a single column named ``'action'``.
    """
    if data_dir is None:
        data_dir = os.path.join(HOME, 'temp', 'human_activity')

    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    whitespace = r'\s+'

    def read_table(relative_parts, columns):
        # All files are header-less and whitespace-delimited.
        path = os.path.join(data_dir, *relative_parts)
        return pd.read_csv(path, sep=whitespace, header=None, names=columns)

    feature_name_df = read_table(['features.txt'], ['column_index', 'column_name'])

    # features.txt repeats some names; make them unique before using them as
    # column labels for the X frames.
    feature = get_new_feature_name_df(feature_name_df)['column_name'].tolist()

    X_train = read_table(['train', 'X_train.txt'], feature)
    X_test = read_table(['test', 'X_test.txt'], feature)
    y_train = read_table(['train', 'y_train.txt'], ['action'])
    y_test = read_table(['test', 'y_test.txt'], ['action'])

    return X_train, X_test, y_train, y_test

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time
import warnings
# NOTE(review): this silences every warning for the rest of the kernel
# session, including deprecation and convergence warnings -- consider
# narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = get_human_dataset()

# Wall-clock time for fitting and predicting with the default hyper-parameters.
start_time = time.time()

gb = GradientBoostingClassifier(random_state=0)
# y_train is a single-column DataFrame; hand fit() the 1-D label vector it
# expects instead of relying on an implicit (and warned-about) conversion.
gb.fit(X_train, y_train.values.ravel())
pred = gb.predict(X_test)

time.time() - start_time

586.7200148105621

In [7]:
# Test-set accuracy of the default gradient-boosting model.
accuracy_score(y_true=y_test, y_pred=pred)

0.9385816084153377

In [8]:
from sklearn.model_selection import GridSearchCV

# 2 x 2 hyper-parameter grid: number of boosting stages and learning rate.
params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1]
}

# cv=2 gives 4 candidates x 2 folds = 8 fits; n_jobs=-1 runs them in parallel.
grid = GridSearchCV(gb, param_grid=params, cv=2, verbose=1, n_jobs=-1)
# y_train is a single-column DataFrame; pass the 1-D label vector rather
# than relying on an implicit (and warned-about) column-vector conversion.
grid.fit(X_train, y_train.values.ravel())
grid.best_params_

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed: 10.2min remaining: 30.5min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed: 16.2min finished


{'learning_rate': 0.05, 'n_estimators': 200}

In [9]:
# Mean cross-validated score of the best parameter combination found by the
# search (computed on the training folds, not on X_test).
grid.best_score_

0.9007072905331883

In [10]:
# `grid` was built with the default refit behaviour, so grid.predict
# delegates to the best estimator refit on the full training set.
best = grid.predict(X_test)

# Held-out accuracy of the tuned model.
accuracy = accuracy_score(y_true=y_test, y_pred=best)
accuracy

0.9345096708517137