In [None]:
import xgboost
import hyperopt
import lightgbm

print(f'xgboost : {xgboost.__version__}')
print(f'lightgbm : {lightgbm.__version__}')
print(f'hyperopt : {hyperopt.__version__}')

xgboost : 1.5.0
lightgbm : 3.3.2
hyperopt : 0.2.7


### HyperOpt를 이요한 하이퍼 파리미터 튜닝

1. 검색 공간 설정
2. 대체모델을 위한 목적함수 지정
3. 최적의 파라미터를 유추

In [9]:
from hyperopt import hp

In [19]:
### 검색 공간
search_space = {
    'x' : hp.quniform('x', -10, 10, 1) # -10에서 10사이의 1
    , 'y' : hp.quniform('y', -15, 15, 1)
}

In [22]:
### 목적함수 생성

from hyperopt import STATUS_OK
def objective_func(search_space):
    x = search_space['x']
    y = search_space['y']
    retval = x**2 - 20*y

    return retval

In [27]:
### 최적 입력값을 유추

from hyperopt import fmin, tpe, Trials
import numpy as np

#### fmin - 목적함수가 최적의 값을 찾도록 도움
#### Trials - 실제로 실행함

trial_val = Trials()

best_01 = fmin(fn=objective_func
               , space = search_space
               , algo = tpe.suggest
               , max_evals = 5
               , trials = trial_val
               , rstate = np.random.default_rng(seed = 0)
               )

100%|██████████| 5/5 [00:00<00:00, 999.83trial/s, best loss: -224.0]


{'x': -4.0, 'y': 12.0}

In [30]:
trial_val = Trials()

best_02 = fmin(fn=objective_func
               , space = search_space
               , algo = tpe.suggest
               , max_evals = 20
               , trials = trial_val
               , rstate = np.random.default_rng(seed= 0)
               )

best_02

100%|██████████| 20/20 [00:00<00:00, 1052.60trial/s, best loss: -296.0]


{'x': 2.0, 'y': 15.0}

In [36]:
trial_val.vals

{'x': [-6.0,
  -4.0,
  4.0,
  -4.0,
  9.0,
  2.0,
  10.0,
  -9.0,
  -8.0,
  -0.0,
  -0.0,
  1.0,
  9.0,
  6.0,
  9.0,
  2.0,
  -2.0,
  -4.0,
  7.0,
  -0.0],
 'y': [5.0,
  10.0,
  -2.0,
  12.0,
  1.0,
  15.0,
  7.0,
  -10.0,
  0.0,
  -5.0,
  -3.0,
  2.0,
  4.0,
  10.0,
  3.0,
  3.0,
  -14.0,
  -8.0,
  11.0,
  -0.0]}

In [37]:
trial_val.results

[{'loss': -64.0, 'status': 'ok'},
 {'loss': -184.0, 'status': 'ok'},
 {'loss': 56.0, 'status': 'ok'},
 {'loss': -224.0, 'status': 'ok'},
 {'loss': 61.0, 'status': 'ok'},
 {'loss': -296.0, 'status': 'ok'},
 {'loss': -40.0, 'status': 'ok'},
 {'loss': 281.0, 'status': 'ok'},
 {'loss': 64.0, 'status': 'ok'},
 {'loss': 100.0, 'status': 'ok'},
 {'loss': 60.0, 'status': 'ok'},
 {'loss': -39.0, 'status': 'ok'},
 {'loss': 1.0, 'status': 'ok'},
 {'loss': -164.0, 'status': 'ok'},
 {'loss': 21.0, 'status': 'ok'},
 {'loss': -56.0, 'status': 'ok'},
 {'loss': 284.0, 'status': 'ok'},
 {'loss': 176.0, 'status': 'ok'},
 {'loss': -171.0, 'status': 'ok'},
 {'loss': 0.0, 'status': 'ok'}]

In [39]:
losses = [loss_dict['loss'] for loss_dict in trial_val.results]
losses

[-64.0,
 -184.0,
 56.0,
 -224.0,
 61.0,
 -296.0,
 -40.0,
 281.0,
 64.0,
 100.0,
 60.0,
 -39.0,
 1.0,
 -164.0,
 21.0,
 -56.0,
 284.0,
 176.0,
 -171.0,
 0.0]

In [41]:
import pandas as pd

result_df = pd.DataFrame({'x' : trial_val.vals['x']
                        , 'y' : trial_val.vals['y']
                        , 'losses' : losses})
result_df

Unnamed: 0,x,y,losses
0,-6.0,5.0,-64.0
1,-4.0,10.0,-184.0
2,4.0,-2.0,56.0
3,-4.0,12.0,-224.0
4,9.0,1.0,61.0
5,2.0,15.0,-296.0
6,10.0,7.0,-40.0
7,-9.0,-10.0,281.0
8,-8.0,0.0,64.0
9,-0.0,-5.0,100.0


In [55]:
### XGBoost 하이퍼 파리미터 최적화

from hyperopt import fmin, tpe, Trials, STATUS_OK
from hyperopt.pyll.base import scope

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import pandas as pd
import numpy as np

dataset = load_breast_cancer()
cancer_df = pd.DataFrame(data = dataset.data, columns = dataset.feature_names)
cancer_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [43]:
X_features = dataset.data
y_label = dataset.target

# 1단계
X_train, X_test, y_train, y_test = \
train_test_split(X_features, y_label, test_size = 0.8, random_state= 156)

# 2단계
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=156)

In [102]:
# 검색 공간
search_space = {
    'n_estimator' : scope.int(hp.quniform('n_estimator', 50, 300, 10))
    , 'max_depth' : scope.int(hp.quniform('max_depth', 5, 20, 1))
    , 'learning_rate' : hp.uniform('learning_rate', 0.01, 0.3) # 지수로 뽑음
    , 'subsample' : hp.uniform('subsample', 0.5, 1.0)
    , 'colsample_bytree' :hp.uniform('colsample_bytree', 0.5, 1.0)
}

### 목적 함수

In [110]:
def objective_func_xgb(params):
    model = XGBClassifier(
        n_estimator = params['n_estimator']
        , max_depth = params['max_depth']
        , learning_rate = params['learning_rate']
        , subsample = params['subsample']
        , colsample_bytree = params['colsample_bytree']
        , random_state = 42
        , eval_metric = 'logloss'
    )

    score_mean = cross_val_score(model
                                , X_train
                                , y_train
                                , cv = 5
                                , scoring = 'accuracy'
                                ).mean()
    
    return {'loss' : -1 * score_mean, 'status' : STATUS_OK}
                   


### 파라미터 유추

In [107]:
import warnings

warnings.filterwarnings('ignore')

In [111]:
trials = Trials()

best_params = fmin(
                fn = objective_func_xgb
                , space = search_space
                , algo = tpe.suggest
                , max_evals = 50
                , trials = trials
            )
best_params

Parameters: { "n_estimator" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimator" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimator" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimator" } might not be use

{'colsample_bytree': 0.9736023910000469,
 'learning_rate': 0.20555040309652475,
 'max_depth': 9.0,
 'n_estimator': 240.0,
 'subsample': 0.9107832477446638}

In [122]:
best_model = XGBClassifier(n_estimators = int(best_params['n_estimator'])
                           , max_depth = int(best_params['max_depth'])
                           , learning_rate = best_params['learning_rate']
                           , subsample = best_params['subsample']
                           , colsample_bytree = best_params['colsample_bytree']
                           , random_state = 42
                           , eval_metrics = 'logloss'
                           )

best_model

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.9736023910000469,
              enable_categorical=False, eval_metrics='logloss', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=0.20555040309652475, max_delta_step=None,
              max_depth=9, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=240, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=42,
              reg_alpha=None, reg_lambda=None, scale_pos_weight=None,
              subsample=0.9107832477446638, tree_method=None,
              validate_parameters=None, verbosity=None)

In [123]:
best_model.fit(X_train, y_train)

Parameters: { "eval_metrics" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9736023910000469,
              enable_categorical=False, eval_metrics='logloss', gamma=0,
              gpu_id=-1, importance_type=None, interaction_constraints='',
              learning_rate=0.20555040309652475, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=240, n_jobs=16, num_parallel_tree=1,
              predictor='auto', random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.9107832477446638,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [124]:
from sklearn.metrics import accuracy_score

pred = best_model.predict(X_test)

accuracy_score(y_test, pred)

0.9517543859649122