In [1]:
import numpy as np
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import KFold 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, mean_squared_error
from seqmm import SeqUDSklearn, SeqUDOptimizer, GPEISklearn,\
        SMACSklearn, TPESklearn, GridSklearn, RandSklearn, LHSSklearn, SobolSklearn, UDSklearn

dt = datasets.load_diabetes()
sx = MinMaxScaler()
sy = MinMaxScaler()
x = sx.fit_transform(dt.data)
y = sy.fit_transform(dt.target.reshape([-1,1]))

ParaSpace = {'booster':          {'Type': 'categorical', 'Mapping': ['gbtree', 'gblinear']},
             'max_depth':        {'Type': 'integer',     'Mapping': np.linspace(2,10,9)}, 
             'n_estimators':     {'Type': 'integer',     'Mapping': np.linspace(100,500,401)},
             'min_child_weight': {'Type': 'integer',     'Mapping': np.linspace(1,100,100)},
             'subsample':        {'Type': 'continuous',  'Range': [0, 1],  'Wrapper': lambda x:x},
             'colsample_bytree': {'Type': 'continuous',  'Range': [0, 1],  'Wrapper': lambda x:x},
             'learning_rate':    {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x},
             'gamma':            {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x},
             'reg_lambda':       {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x},
             'reg_alpha':         {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x}}

estimator = xgb.XGBRegressor()
score_metric = make_scorer(mean_squared_error, False)
cv = KFold(n_splits=5, random_state=0, shuffle=True)

In [2]:
model_list = [SeqUDSklearn, GPEISklearn, SMACSklearn, TPESklearn, GridSklearn,
              RandSklearn, LHSSklearn, SobolSklearn, UDSklearn]
for model in model_list:
    clf = model(estimator, cv, ParaSpace, max_runs = 20, refit = True, verbose = False)
    clf.fit(x, y)
    print(model, clf.best_score_)

<class 'seqmm.pysequd.sk_sequd.SeqUDSklearn'> 0.47613848805132014
<class 'seqmm.pybayopt.sk_gpei.GPEISklearn'> 0.489883413049473
<class 'seqmm.pybayopt.sk_smac.SMACSklearn'> 0.4794488408083157
<class 'seqmm.pybayopt.sk_tpe.TPESklearn'> 0.4649643092863407


HBox(children=(IntProgress(value=0, max=1024), HTML(value='')))

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/r7user1/anaconda2_local/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-7bb5e2d41c34>", line 5, in <module>
    clf.fit(x, y)
  File "/home/r7user1/anaconda2_local/lib/python3.7/site-packages/seqmm/pybatdoe/batch_base.py", line 153, in fit
    self._run(obj_func)
  File "/home/r7user1/anaconda2_local/lib/python3.7/site-packages/seqmm/pybatdoe/sk_grid.py", line 127, in _run
    for parameters in tqdm(candidate_params))
  File "/home/r7user1/anaconda2_local/lib/python3.7/site-packages/joblib/parallel.py", line 924, in __call__
    while self.dispatch_one_batch(iterator):
  File "/home/r7user1/anaconda2_local/lib/python3.7/site-packages/joblib/parallel.py", line 759, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/r7user1/anaconda2_local/lib/python3.7/site-packages/joblib/parallel.py", line 716, in _dispatch
   

KeyboardInterrupt: 

In [6]:
clf = GridSklearn(estimator, cv, ParaSpace, max_runs = 20, refit = True, verbose = False)
clf.fit(x, y)

KeyError: 'score'

In [4]:
import warnings
warnings.filterwarnings("ignore")

import os
import time
import shutil
import numpy as np
import pandas as pd
from joblib import delayed
from joblib import Parallel
from itertools import product
from matplotlib import pylab as plt
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import cross_val_score
from seqmm.pybatdoe.batch_base import BatchSklearn

class GridSklearn(BatchSklearn):
    """ 
    Sklearn Hyperparameter optimization interface based on Random Search. 

    Parameters
    ----------
    :type  estimator: estimator object
    :param estimator: This is assumed to implement the scikit-learn estimator interface.
    
    :type  cv: cross-validation method, an sklearn object.
    :param cv: e.g., `StratifiedKFold` and KFold` is used.
    
    :type  para_space: dict or list of dictionaries
    :param para_space: It has three types:
    
        Continuous: 
            Specify `Type` as `continuous`, and include the keys of `Range` (a list with lower-upper elements pair) and
            `Wrapper`, a callable function for wrapping the values.  
        Integer:
            Specify `Type` as `integer`, and include the keys of `Mapping` (a list with all the sortted integer elements).
        Categorical:
            Specify `Type` as `categorical`, and include the keys of `Mapping` (a list with all the possible categories).
    
    :type max_runs: int, optional, default = 100
    :param max_runs: The maximum number of trials to be evaluated. When this values is reached, 
        then the algorithm will stop. 
        
    :type scoring: string, callable, list/tuple, dict or None, optional, default = None
    :param scoring: A sklearn type scoring function. 
        If None, the estimator's default scorer (if available) is used. See the package `sklearn` for details.
    
    :type refit: boolean, or string, optional, default = True
    :param refit: It controls whether to refit an estimator using the best found parameters on the whole dataset.
    
    :type rand_seed: int, optional, default = 0
    :param rand_seed: The random seed for optimization.
    
    :type verbose: boolean, optional, default = False
    :param verbose: It controls whether the searching history will be printed. 

    Examples
    ----------
    >>> import numpy as np
    >>> from sklearn import svm
    >>> from sklearn import datasets
    >>> from seqmm.pybatdoe import GridSklearn
    >>> from sklearn.model_selection import KFold
    >>> iris = datasets.load_iris()
    >>> ParaSpace = {'C':{'Type': 'continuous', 'Range': [-6, 16], 'Wrapper': np.exp2}, 
               'gamma': {'Type': 'continuous', 'Range': [-16, 6], 'Wrapper': np.exp2}}
    >>> estimator = svm.SVC()
    >>> cv = KFold(n_splits=5, random_state=0, shuffle=True)
    >>> clf = GridSklearn(estimator, cv, ParaSpace, max_runs = 100, refit = True, verbose = True)
    >>> clf.fit(iris.data, iris.target)

    Attributes
    ----------
    :ivar best_score_: float
        The best average cv score among the evaluated trials.  

    :ivar best_params_: dict
        Parameters that reaches `best_score_`.

    :ivar best_estimator_: 
        The estimator refitted based on the `best_params_`. 
        Not available if `refit=False`.

    :ivar search_time_consumed_: float
        Seconds used for whole searching procedure.

    :ivar refit_time_: float
        Seconds used for refitting the best model on the whole dataset.
        Not available if `refit=False`.
    """    

    def __init__(self, estimator, cv, para_space, max_runs = 100, 
                 scoring = None, n_jobs = None, refit = False, rand_seed = 0, verbose = False):

        super(GridSklearn,self).__init__(estimator, cv, para_space, max_runs, scoring, 
                               n_jobs, refit, rand_seed, verbose)
        
    def _run(self, obj_func):
        """
        Main loop for searching the best hyperparameters. 
        
        """  
        discrete_runs = 1
        discrete_count = 0
        grid_para = {}
        for item, values in self.para_space.items():
            if (values['Type']=="categorical"):
                grid_para[item] = values['Mapping']
                discrete_runs = discrete_runs * len(values['Mapping'])
                discrete_count = discrete_count + 1
        
        grid_number = np.ceil((self.max_runs/discrete_runs)**(1/(self.factor_number-discrete_count)))        
        for item, values in self.para_space.items():
            if (values['Type']=="continuous"):
                grid_para[item] = values['Wrapper'](np.linspace(values['Range'][0],values['Range'][1], grid_number))
            if (values['Type']=="integer"):
                grid_para[item] = np.round(np.linspace(min(values['Mapping']),max(values['Mapping']),grid_number)).astype(int)
        # generate grid
        para_set = pd.DataFrame([item for item in product(*grid_para.values())], columns = self.para_names)
        if (para_set.shape[0]>self.max_runs): 
            return 0

        candidate_params = [{para_set.columns[j]: para_set.iloc[i,j] 
                             for j in range(para_set.shape[1])} 
                            for i in range(para_set.shape[0])] 
        if self.verbose:
            out = Parallel(n_jobs=self.n_jobs)(delayed(obj_func)(parameters)
                                for parameters in tqdm(candidate_params))
        else:
            out = Parallel(n_jobs=self.n_jobs)(delayed(obj_func)(parameters)
                                for parameters in candidate_params)

        self.logs = para_set.to_dict()
        self.logs.update(pd.DataFrame(out, columns = ["score"]))
        self.logs = pd.DataFrame(self.logs).reset_index(drop=True)
        if self.verbose:
            print("Search completed (%d/%d) with best score: %.5f."
                %(self.logs.shape[0], self.max_runs, self.logs["score"].max()))