credits: https://github.com/DrMatters/hyperoptsearchcv

In [53]:
import numpy as np
from script.hpscv import HyperoptSearchCV

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

In [54]:
# Synthetic classification data set; the fixed seed makes every run generate
# the same samples.
X, y = make_classification(random_state=123, n_samples=1000)

print(X.shape, y.shape)

(1000, 20) (1000,)


In [55]:
# Univariate feature selection followed by a decision tree. With k='all' the
# selector keeps every feature, so this is the untuned starting configuration.
pipe = Pipeline(
    steps=[
        ('selector', SelectKBest(score_func=f_classif, k='all')),
        ('estimator', DecisionTreeClassifier(random_state=123)),
    ]
)

In [56]:
# Shared 10-fold splitter; the fixed seed means every evaluation that uses `cv`
# sees the same shuffled folds.
cv = KFold(shuffle=True, random_state=123, n_splits=10)

In [58]:
# Cross-validated accuracy of `pipe` on the folds defined by `cv`.
scores = cross_val_score(
    estimator=pipe,
    X=X,
    y=y,
    cv=cv,
    scoring='accuracy',
    n_jobs=2,
    verbose=0,
)

# Per-fold accuracies, then their average.
print(scores)
print(scores.mean())

[0.99 0.96 0.98 0.97 1.   0.95 0.98 1.   0.97 0.97]
0.9770000000000001


In [45]:
from hyperopt import hp
from hyperopt.pyll.base import scope

# Hyperopt search space. Keys follow scikit-learn's `<step>__<param>` naming so
# each sampled value can be applied to the matching pipeline step.
search = {
    # Number of features kept by the selector; bounded by the column count of X
    # so the space stays valid if the data set changes.
    'selector__k': scope.int(hp.quniform('selector__k', 1, X.shape[1], 1)),

    'estimator__criterion': hp.choice('estimator__criterion', ['gini', 'entropy']),
    # hp.quniform yields floats, so integer-valued parameters are cast with scope.int.
    'estimator__max_depth': scope.int(hp.quniform('estimator__max_depth', 1, 30, 1)),
    'estimator__min_samples_split': scope.int(hp.quniform('estimator__min_samples_split', 2, 100, 1)),
    'estimator__min_samples_leaf': scope.int(hp.quniform('estimator__min_samples_leaf', 1, 100, 1)),
    # 'auto' is intentionally absent: for classification trees it was an alias of
    # 'sqrt' (deprecated in scikit-learn 1.1, removed in 1.3), so listing it either
    # fails on current releases or samples 'sqrt' twice as often as 'log2'.
    'estimator__max_features': hp.choice('estimator__max_features', ['sqrt', 'log2']),
    # For a two-class problem node impurity is at most 0.5 (gini) or 1.0 (entropy),
    # so a threshold above 1 can never be met and the tree would not split at all.
    # Capping the range at 1 keeps trials from being spent on such degenerate trees.
    'estimator__min_impurity_decrease': hp.quniform('estimator__min_impurity_decrease', 0, 1, 1e-3)
}

In [46]:
# Hyperparameter search over `search` for the pipeline, evaluated by accuracy on
# the folds in `cv`.
# NOTE(review): no seed is passed for the search itself — confirm whether
# HyperoptSearchCV exposes one; otherwise reruns may select different parameters.
hpscv = HyperoptSearchCV(
    # what is tuned and over which space
    estimator=pipe,
    search_space=search,
    # how each candidate is evaluated
    scoring='accuracy',
    greater_is_better=True,
    cv=cv,
    # search budget and execution settings
    n_iter=1000,
    n_jobs=-1,
    verbose=0,
)

In [47]:
# Run the search on the full data set. In the recorded run the 1000 trials took
# a little over two minutes.
hpscv.fit(X, y)

100%|███████████████████████████████████████████| 1000/1000 [02:12<00:00,  7.54trial/s, best loss: -0.9780000000000001]


HyperoptSearchCV(cv=KFold(n_splits=10, random_state=123, shuffle=True),
                 estimator=Pipeline(steps=[('selector', SelectKBest(k=4)),
                                           ('estimator',
                                            DecisionTreeClassifier(criterion='entropy',
                                                                   max_depth=22,
                                                                   max_features='log2',
                                                                   min_impurity_decrease=0.007,
                                                                   min_samples_leaf=14,
                                                                   min_samples_split=88,
                                                                   random_state=123))]),
                 greater_is_better=True, n_iter=1000, n_jobs=-1,
                 s...
                               'estimator__max_features': <hyperopt.pyll.base.Apply object

In [48]:
# NOTE(review): the recorded value is negative and equals the "best loss" shown in
# the fit progress bar, so this looks like the minimised loss (negated accuracy)
# rather than the accuracy itself — confirm against HyperoptSearchCV.
hpscv.best_score_

-0.9780000000000001

In [49]:
# Best parameter combination found by the search, keyed by the same
# `<step>__<param>` names used in the search space.
hpscv.best_params_

{'estimator__criterion': 'entropy',
 'estimator__max_depth': 22,
 'estimator__max_features': 'log2',
 'estimator__min_impurity_decrease': 0.007,
 'estimator__min_samples_leaf': 14,
 'estimator__min_samples_split': 88,
 'selector__k': 4}

In [59]:
# set_params updates `pipe` in place (and returns it), so every later use of
# `pipe` evaluates the tuned configuration instead of the original one.
pipe.set_params(**hpscv.best_params_)

Pipeline(steps=[('selector', SelectKBest(k=4)),
                ('estimator',
                 DecisionTreeClassifier(criterion='entropy', max_depth=22,
                                        max_features='log2',
                                        min_impurity_decrease=0.007,
                                        min_samples_leaf=14,
                                        min_samples_split=88,
                                        random_state=123))])

In [60]:
# Re-evaluate `pipe`, which by now carries the tuned parameters, on the same
# folds in `cv` so the result is comparable with the earlier evaluation.
scores = cross_val_score(
    estimator=pipe,
    X=X,
    y=y,
    cv=cv,
    scoring='accuracy',
    n_jobs=2,
    verbose=0,
)

# Per-fold accuracies, then their average.
print(scores)
print(scores.mean())

[0.98 0.98 0.99 0.95 1.   0.97 0.98 0.99 0.96 0.98]
0.9780000000000001
