In [1]:
import pandas as pd
import numpy as np
import os


# visualization: matplotlib with the 'ggplot' style sheet applied to all figures
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from IPython.display import display
# None lifts the column limit, so wide DataFrames are displayed with every column
pd.set_option('display.max_columns', None)

%matplotlib inline

# general tooling (`partial` is used to bind the fixed arguments of the hyperopt objective)
from functools import partial

# Extend the module search path with the project's data and source directories.
# NOTE(review): presumably the `transformers` module imported further down lives
# in ../src/ -- confirm.
import sys
sys.path.append('../data/')
sys.path.append('../src/')

# 0. Load Data

In [2]:
from sklearn.datasets import load_boston,\
                            load_diabetes,\
                            load_iris,\
                            fetch_california_housing,\
                            fetch_species_distributions

In [3]:
# Fetch the species-distribution dataset, using '../data' as scikit-learn's data_home.
# NOTE(review): `species` is not referenced by any other cell visible here -- confirm
# it is still needed before keeping this (potentially slow) fetch.
species = fetch_species_distributions(data_home='../data')

In [4]:
# Load the iris dataset and expose it as a feature frame `X` with
# snake_case column names plus the integer class labels `y`.
iris = load_iris()

X = pd.DataFrame(
    data=iris.data,
    columns=[
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
    ],
)
y = iris.target

In [5]:
# Preview the first rows of X together with the full target array (shown as a tuple).
X.head(),y

(   sepal_length  sepal_width  petal_length  petal_width
 0           5.1          3.5           1.4          0.2
 1           4.9          3.0           1.4          0.2
 2           4.7          3.2           1.3          0.2
 3           4.6          3.1           1.5          0.2
 4           5.0          3.6           1.4          0.2,
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))

# 1. Create missing values

# 2. Pipeline

In [6]:
# Column groups, one per imputation strategy.
features_mean_imputation = ['sepal_length']
features_median_imputation = ['sepal_width']
features_no_imputation = ['petal_length', 'petal_width']

# All numerical features: the three groups above, in the original column order.
features_numericals = (
    features_mean_imputation
    + features_median_imputation
    + features_no_imputation
)

In [7]:
#Pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
#Transformers
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,Imputer,StandardScaler
from transformers import ColumnSelector,CustomFeatureSelector
#Estimators
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# Candidate classifiers keyed by a short name; each is built with its default settings.
model_dictionary = {
    'xgb': XGBClassifier(),
    'lgbm': LGBMClassifier(),
}

model_name = 'xgb'

# Alternative 'features' step, kept for reference only: a FeatureUnion that handles
# each column group in its own sub-pipeline instead of using CustomFeatureSelector.
#   FeatureUnion([
#       ('mean_imputations', Pipeline([
#           ('selector', ColumnSelector(columns=features_mean_imputation)),
#           ('imputer', Imputer(strategy='mean')),
#       ])),
#       ('median_imputations', Pipeline([
#           ('selector', ColumnSelector(columns=features_median_imputation)),
#           ('imputer', Imputer(strategy='median')),
#       ])),
#       ('no_imputations', Pipeline([
#           ('selector', ColumnSelector(columns=features_no_imputation)),
#       ])),
#   ])

# Two-step pipeline: column-group selection followed by the chosen classifier.
estimator = Pipeline(steps=[
    (
        'features',
        CustomFeatureSelector(columns=[
            features_mean_imputation,
            features_median_imputation,
            features_no_imputation,
        ]),
    ),
    ('classifier', model_dictionary[model_name]),
])

estimator

Pipeline(memory=None,
     steps=[('features', CustomFeatureSelector(columns=[['sepal_length'], ['sepal_width'], ['petal_length', 'petal_width']])), ('classifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, m...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

## 3.0. Scoring

In [8]:
from sklearn.metrics import r2_score, mean_squared_error, make_scorer,accuracy_score

def MSE(y_true,y_pred):
    """Return the mean squared error between `y_true` and `y_pred`.

    Thin wrapper that forwards both arguments to `mean_squared_error`.
    """
    return mean_squared_error(y_true, y_pred)

def R2(y_true,y_pred):
    """Return the R^2 score of `y_pred` against `y_true`.

    Thin wrapper that forwards both arguments to `r2_score`.
    """
    return r2_score(y_true, y_pred)

def current_scorer():
    """Build the scorer used for model selection.

    Wraps `accuracy_score` with `make_scorer`, flagged as a metric where
    larger values are better.
    """
    # Switch greater_is_better to False when plugging in an error metric such as MSE.
    scorer = make_scorer(accuracy_score, greater_is_better=True)
    return scorer


## 3.1. TPE

In [9]:
from hyperopt import hp, tpe,Trials,STATUS_OK
from hyperopt.fmin import fmin

from sklearn.model_selection import cross_val_score,RepeatedKFold

In [18]:
n_iteration = 0  # running count of objective evaluations; only used in the log line


def objective(params,X,y,estimator,scorer,cv):
    """Hyperopt objective: cross-validate `estimator` configured with `params`.

    Parameters
    ----------
    params : dict
        Sampled pipeline parameters in ``step__param`` form. Must contain
        'classifier__n_estimators' and 'classifier__max_depth'. The dict passed
        in is not modified.
    X, y :
        Features and targets, forwarded unchanged to `cross_val_score`.
    estimator :
        Object exposing `set_params`; it is reconfigured in place with `params`.
    scorer :
        Forwarded as `scoring`. The loss is computed as ``1 - mean score``, so
        this assumes a higher-is-better score with an upper bound of 1
        (accuracy in this notebook).
    cv :
        Forwarded as `cv` to `cross_val_score`.

    Returns
    -------
    dict
        ``{'loss': 1 - mean CV score, 'status': STATUS_OK}``.

    Raises
    ------
    KeyError
        If one of the two integer-valued keys is missing from `params`.
    """
    global n_iteration
    # Work on a shallow copy so the caller's dict is left untouched.
    params = dict(params)
    # The search space samples these as floats, but they must be integers.
    for int_key in ('classifier__n_estimators', 'classifier__max_depth'):
        params[int_key] = int(params[int_key])

    estimator.set_params(**params)
    mean_score = cross_val_score(estimator, X, y, scoring=scorer, cv=cv).mean()
    print("- iter {}: mean_score {:.5f}".format(n_iteration, mean_score))
    n_iteration += 1
    # hyperopt minimizes, so convert the higher-is-better score into a loss.
    return {'loss': 1. - mean_score, 'status': STATUS_OK}

# Fixed seed so that every trial is scored on the same folds; without it each
# call re-draws the splits and trial losses are not directly comparable.
CV_SEED = 42
cv_dev = RepeatedKFold(n_splits=3, n_repeats=5, random_state=CV_SEED)

trials_data = Trials()  # database which retains every evaluated trial

# Search space for the classifier step. hp.quniform always returns floats, so
# integer-valued parameters (n_estimators, max_depth) are cast inside `objective`.
xgb_space = {
    'classifier__n_estimators': hp.quniform('classifier__n_estimators', 50, 500, 1),
    'classifier__learning_rate': hp.quniform('classifier__learning_rate', 0.025, 0.25, 0.025),
    'classifier__max_depth': hp.quniform('classifier__max_depth', 1, 20, 1),
    'classifier__min_child_weight': hp.quniform('classifier__min_child_weight', 1, 10, 1),
    'classifier__subsample': hp.quniform('classifier__subsample', 0.7, 1, 0.05),
    'classifier__gamma': hp.quniform('classifier__gamma', 0., 1, 0.05),
    'classifier__colsample_bytree': hp.quniform('classifier__colsample_bytree', 0.7, 1, 0.05),
    'classifier__reg_alpha': hp.quniform('classifier__reg_alpha', 0, 10, 1),
    'classifier__reg_lambda': hp.quniform('classifier__reg_lambda', 0, 10, 1),
}

# Full space: a choice between candidate column partitions for the 'features'
# step, combined with the classifier hyperparameters above.
space = hp.choice('0',
    [{'features__columns':
      hp.choice('features__columns',
                [
                    [['sepal_length','sepal_width'],['petal_length'],['petal_width']],
                    [['sepal_length'],['sepal_width','petal_length'],['petal_width']]
                    # Note that you can easily write code to generate all the partitions
                ]),
      **xgb_space}]
)

# NOTE(review): the TPE sampling itself is still unseeded; pass `rstate=` to fmin
# (its expected type depends on the installed hyperopt version) for fully
# reproducible runs.
best = fmin(fn=partial(objective, X=X, y=y, estimator=estimator, scorer=current_scorer(), cv=cv_dev),
            space=space,
            algo=tpe.suggest,
            trials=trials_data,
            max_evals=20)
print('best result with accuracy score {}'.format(1.-trials_data.best_trial['result']['loss']))
# For hp.choice nodes, `best` holds the index of the selected option, not the option itself.
print(best)

- iter 0: mean_score 0.94533
- iter 1: mean_score 0.94533
- iter 2: mean_score 0.94133
- iter 3: mean_score 0.93867
- iter 4: mean_score 0.93867
- iter 5: mean_score 0.94000
- iter 6: mean_score 0.93200
- iter 7: mean_score 0.93867
- iter 8: mean_score 0.94000
- iter 9: mean_score 0.94133
- iter 10: mean_score 0.95867
- iter 11: mean_score 0.94400
- iter 12: mean_score 0.94267
- iter 13: mean_score 0.93733
- iter 14: mean_score 0.93867
- iter 15: mean_score 0.94533
- iter 16: mean_score 0.94400
- iter 17: mean_score 0.94000
- iter 18: mean_score 0.94000
- iter 19: mean_score 0.95067
best result with accuracy score 0.9586666666666668
{'features__columns': 1, 'classifier__min_child_weight': 1.0, 'classifier__learning_rate': 0.225, 'classifier__n_estimators': 275.0, '0': 0, 'classifier__reg_lambda': 1.0, 'classifier__reg_alpha': 1.0, 'classifier__gamma': 0.9, 'classifier__subsample': 0.8500000000000001, 'classifier__colsample_bytree': 0.8500000000000001, 'classifier__max_depth': 11.0}


In [17]:
# Tabulate the recorded hyperopt trials (one row per trial) and preview the first five.
results = pd.DataFrame(data=trials_data.trials)
results.head(n=5)

Unnamed: 0,book_time,exp_key,misc,owner,refresh_time,result,spec,state,tid,version
0,2018-05-01 14:51:53.284,,"{'cmd': ('domain_attachment', 'FMinIter_Domain...",,2018-05-01 14:51:53.762,"{'loss': 0.06222222222222229, 'status': 'ok'}",,2,0,0
1,2018-05-01 14:51:53.770,,"{'cmd': ('domain_attachment', 'FMinIter_Domain...",,2018-05-01 14:51:54.205,"{'loss': 0.05111111111111122, 'status': 'ok'}",,2,1,0
2,2018-05-01 14:51:54.250,,"{'cmd': ('domain_attachment', 'FMinIter_Domain...",,2018-05-01 14:51:54.683,"{'loss': 0.06000000000000005, 'status': 'ok'}",,2,2,0
3,2018-05-01 14:51:54.689,,"{'cmd': ('domain_attachment', 'FMinIter_Domain...",,2018-05-01 14:51:54.837,"{'loss': 0.046666666666666856, 'status': 'ok'}",,2,3,0
4,2018-05-01 14:51:54.843,,"{'cmd': ('domain_attachment', 'FMinIter_Domain...",,2018-05-01 14:51:55.217,"{'loss': 0.06222222222222229, 'status': 'ok'}",,2,4,0
