In [40]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the digits dataset and flatten every image into a single feature row.
digits = load_digits()
n_samples = len(digits.images)
data = digits.images.reshape(n_samples, -1)

# Hold out 25% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    digits.target,
    test_size=0.25,
    random_state=0,
)

# Base random forest with a fixed seed.
clf = RandomForestClassifier(random_state=0)

# Candidate values for each random-forest hyperparameter.
param_grid = dict(
    n_estimators=[100, 150, 200],
    criterion=['gini', 'entropy'],
    min_samples_split=[2, 3, 4, 5],
    min_samples_leaf=[1, 2, 3, 4, 5],
    max_features=['sqrt', 'log2'],
    max_depth=[5, 6, 7, 8],
)

# GridSearch

In [44]:
import time
from sklearn.model_selection import GridSearchCV

# Exhaustive search: every combination in param_grid is scored with 5-fold CV,
# using all available cores (n_jobs=-1).
grid = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, verbose=1)

# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()
grid.fit(X_train, y_train)
print(time.perf_counter() - start, '초')

# Show the winning candidate. Selecting the row through best_index_ keeps this
# summary in agreement with grid.best_params_ / grid.best_score_ even when
# several candidates tie on mean_test_score (sorting with the default unstable
# sort could surface a different tied row).
summary_columns = [
    'param_criterion', 'param_max_depth',
    'param_max_features', 'param_min_samples_leaf',
    'param_min_samples_split', 'param_n_estimators',
    'mean_test_score', 'std_test_score',
]
pd.DataFrame(grid.cv_results_).loc[grid.best_index_, summary_columns]

Fitting 5 folds for each of 960 candidates, totalling 4800 fits
75.50424480438232 초


param_criterion             entropy
param_max_depth                   8
param_max_features             log2
param_min_samples_leaf            1
param_min_samples_split           3
param_n_estimators              200
mean_test_score            0.976247
std_test_score             0.002945
Name: 0, dtype: object

# RandomSearch

In [45]:
from sklearn.model_selection import RandomizedSearchCV

# n_iter sets how many candidates are sampled from param_grid and can be tuned.
# random_state is fixed so the same candidates are drawn on every run, matching
# the seeding already used for the train/test split and the classifier.
rand = RandomizedSearchCV(clf, param_grid, cv=5, n_jobs=-1, verbose=1,
                          n_iter=100, random_state=0)

# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()
rand.fit(X_train, y_train)
print(time.perf_counter() - start, '초')

# Show the winning candidate. Selecting the row through best_index_ keeps this
# summary in agreement with rand.best_params_ / rand.best_score_ even when
# several candidates tie on mean_test_score.
summary_columns = [
    'param_criterion', 'param_max_depth',
    'param_max_features', 'param_min_samples_leaf',
    'param_min_samples_split', 'param_n_estimators',
    'mean_test_score', 'std_test_score',
]
pd.DataFrame(rand.cv_results_).loc[rand.best_index_, summary_columns]

Fitting 5 folds for each of 100 candidates, totalling 500 fits
8.271660566329956 초


param_criterion             entropy
param_max_depth                   8
param_max_features             sqrt
param_min_samples_leaf            2
param_min_samples_split           5
param_n_estimators              100
mean_test_score            0.973273
std_test_score             0.002793
Name: 0, dtype: object

# Bayesian Optimization
Optuna는 sklearn과 syntax가 다르지만 같은 기능(hyperparameter 탐색)을 수행한다.

In [46]:
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import cross_val_score

# The study maximizes the objective's return value; the TPE sampler is seeded
# so repeated runs propose the same trials.
tpe_sampler = TPESampler(seed=0)
study = optuna.create_study(sampler=tpe_sampler, direction='maximize')

In [49]:
# study.optimize(objective, n_trials) runs the Bayesian optimization.
# It needs a function that defines the search space.
# Search ranges used by the grid search, for reference:
#   'n_estimators': [100,150,200],
#   'criterion': ['gini', 'entropy'],
#   'min_samples_split': [2, 3, 4, 5],
#   'min_samples_leaf': [1, 2, 3, 4, 5],
#   'max_features': ['sqrt', 'log2'],
#   'max_depth': [5, 6, 7, 8]

def objective(trial):
    """Score one Optuna trial's random-forest configuration.

    Hyperparameters are drawn through the trial's ``suggest_*`` methods, a
    ``RandomForestClassifier`` is built from them (with ``random_state=0``),
    and the mean 5-fold cross-validation score on ``X_train`` / ``y_train``
    is returned as the value for the study to optimize.
    """
    # Each hyperparameter is declared with trial.suggest_*; the call order is
    # kept fixed so a seeded sampler draws values in the same sequence.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 200),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'max_depth': trial.suggest_int('max_depth', 5, 8),
    }

    model = RandomForestClassifier(**params, random_state=0)

    # The mean cross-validation score is the quantity being optimized.
    fold_scores = cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1)
    return fold_scores.mean()

In [50]:
# Only emit warnings and above, which silences the per-trial log messages.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()
study.optimize(objective, n_trials=100)
print(time.perf_counter() - start, '초')

57.715790033340454 초


In [56]:
# Show the best hyperparameters found by the study, then the best score it reached.
best_params = pd.Series(study.best_params)
display(best_params)
print('best score : ', study.best_value)

n_estimators             192
criterion            entropy
min_samples_split          3
min_samples_leaf           1
max_features            log2
max_depth                  8
dtype: object

best score :  0.9762439763183257
