In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time 
import warnings
import pandas as pd
warnings.filterwarnings('ignore')

In [6]:
# 중복된 feature명을 수정해주는 함수 제작

def get_new_feature_name_df(old_feature_name_df):
    feature_dup_df = pd.DataFrame(data=old_feature_name_df.groupby('column_name').cumcount(),
                                 columns=['dup_cnt'])  # 중복값을 카운트하여 피쳐들의 누적 개수를 담은 데이터프레임을 만듦
    feature_dup_df = feature_dup_df.reset_index()  # 인덱스값은 모두 리셋함
    
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how='outer')
    # 아우터조인(합집합): 각 피쳐들의 누적 갯수와 피쳐 이름을 병합
    
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(lambda x : x[0]+'_'+str(x[1]) if x[1] > 0 else x[0], axis=1)
    # 누적 중복값이 1개 이상인 경우 피쳐 이름에 _n으로 숫자를 적어줌
    
    new_feature_name_df = new_feature_name_df.drop(['index'], axis=1)
    # indext 열 삭제
    
    return new_feature_name_df

In [7]:
# 데이터셋 불러오기
def get_human_dataset():
    feature_name_df = pd.read_csv('./human_activity/features.txt', sep='\s+',
                                     header=None, names=['column_index', 'column_name'])

    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    
    feature_name = new_feature_name_df.iloc[:, 1].values.tolist()
    
    X_train = pd.read_csv('./human_activity/train/X_train.txt', sep='\s+', names=feature_name)
    X_test = pd.read_csv('./human_activity/test/X_test.txt', sep='\s+', names=feature_name)
    
    y_train = pd.read_csv('./human_activity/train/y_train.txt', sep='\s+', header=None, names=['action'])
    y_test = pd.read_csv('./human_activity/test/y_test.txt', sep='\s+', header=None, names=['action'])
    
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = get_human_dataset()

In [9]:
# 시간 측정
start_time = time.time()

In [12]:
gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, y_train)

pred = gb.predict(X_test)
accuracy = accuracy_score(y_test, pred)

print('accuracy: {0:.4f}'.format(accuracy))
print('running time: {0:.1f} s'.format(time.time() - start_time))

accuracy: 0.9389
running time: 5549.7 s


In [13]:
# 너무 오래걸려 돌리다가 그냥 끊어버림
# 하이퍼파라미터 최적화
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators' : [100, 500], 
    'learning_rate' : [0.05, 0.1]
}

grid_cv = GridSearchCV(gb, param_grid=params, cv=2, verbose=1)
grid_cv.fit(X_train, y_train)

print('최적 하이퍼파라미터: \n', grid_cv.best_params_)
print('최고 정확도: {0:.4f}'.format(grid_cv.best_score_))

Fitting 2 folds for each of 4 candidates, totalling 8 fits


KeyboardInterrupt: 

In [None]:
# 최적화된 파라미터를 가지고 다시 학습시켜봄
gb_pred = grid_cv.best_estimator_.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
print('GBM 정확도: {0:.4f}'.format(gb_accuracy))