# Example 1: SVM for Classification

In [1]:
import numpy as np
from sklearn import svm
from sklearn import datasets
from matplotlib import pylab as plt
from sklearn.model_selection import KFold 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
from seqmm.pybayopt import GPEISklearn

sx = MinMaxScaler()
dt = datasets.load_breast_cancer()
x = sx.fit_transform(dt.data)
y = dt.target

ParaSpace = {'C':     {'Type': 'continuous', 'Range': [-6, 16], 'Wrapper': np.exp2}, 
             'gamma': {'Type': 'continuous', 'Range': [-16, 6], 'Wrapper': np.exp2}}

estimator = svm.SVC()
score_metric = make_scorer(accuracy_score, True)
cv = KFold(n_splits=5, random_state=0, shuffle=True)

clf = GPEISklearn(estimator, cv, ParaSpace, max_runs = 100, time_out = 10, refit = True, verbose = True)
clf.fit(x, y)
clf.plot_scores()

HBox(children=(IntProgress(value=0), HTML(value='')))

Early Stop!
Search completed in 24.08 seconds.
The best score is: 0.97719.
The best configurations are:
C                   : 32.0
gamma               : 0.03125



<Figure size 600x400 with 1 Axes>

In [1]:
import os
import time
import signal
import shutil
import collections
import numpy as np
import pandas as pd

from spearmint.ExperimentGrid import GridMap
import spearmint.chooser.GPEIOptChooser as module
grid_size = 20000


def spmint_opt(chooser, grid, values, grid_status):
    
    ## The status of jobs, 0 - candidate, 1 - pending, 2 - complete. 
    ## Here we only have two status: 0 or 2 available. 
    job_id = chooser.next(grid, np.squeeze(values), [],
                          np.nonzero(grid_status == 0)[0],
                          np.nonzero(grid_status == 1)[0],
                          np.nonzero(grid_status == 2)[0])
    return job_id


ParaSpace = {'booster':          {'Type': 'categorical', 'Mapping': ['gbtree', 'gblinear']},
             'max_depth':        {'Type': 'integer',     'Mapping': np.linspace(2,10,9)}, 
             'n_estimators':     {'Type': 'integer',     'Mapping': np.linspace(100,500,401)},
             'min_child_weight': {'Type': 'integer',     'Mapping': np.linspace(1,100,100)},
             'subsample':        {'Type': 'continuous',  'Range': [0, 1],  'Wrapper': lambda x:x},
             'colsample_bytree': {'Type': 'continuous',  'Range': [0, 1],  'Wrapper': lambda x:x},
             'learning_rate':    {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x},
             'gamma':            {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x},
             'reg_lambda':       {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x},
             'reg_alpha':         {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x}}



import numpy as np
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import KFold 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score


estimator = xgb.XGBRegressor()
score_metric = make_scorer(mean_squared_error, False)
cv = KFold(n_splits=5, random_state=0, shuffle=True)
def obj_func(cfg):
    next_params = pd.DataFrame(np.array([cfg]), columns = ParaSpace)
    parameters = {}
    for item, values in ParaSpace.items():
        if (values['Type']=="continuous"):
            parameters[item] = values['Wrapper'](float(next_params[item].iloc[0]))
        elif (values['Type']=="integer"):
            parameters[item] = int(next_params[item].iloc[0]) 
        elif (values['Type']=="categorical"):
            parameters[item] = next_params[item][0]
    estimator.set_params(**parameters)
    out = cross_val_score(estimator, x, y, cv = cv)
    score = np.mean(out)

    return -score, score, score
    
dt = datasets.load_diabetes()
sx = MinMaxScaler()
sy = MinMaxScaler()
x = sx.fit_transform(dt.data)
y = sy.fit_transform(dt.target.reshape([-1,1]))

In [2]:
Max_Runs = 100
Rand_Seed = 1
variables = {}
np.random.seed(Rand_Seed)
factor_number = len(ParaSpace)
for i, label in enumerate(ParaSpace.keys()):
    if ParaSpace[label]['Type'] =="continuous":
        variables[label] =  collections.OrderedDict({'name': label, 
                         'type':'float',
                         'min': ParaSpace[label]['Range'][0],
                         'max': ParaSpace[label]['Range'][1],
                         'size': 1})
    elif ParaSpace[label]['Type'] =="integer":
        variables[label] = collections.OrderedDict({'name': label, 
                         'type':'int',
                         'min': min(ParaSpace[label]['Mapping']),
                         'max': max(ParaSpace[label]['Mapping']),
                         'size': 1})
    elif ParaSpace[label]['Type'] =="categorical":
        variables[label] = collections.OrderedDict({'name': label, 
                         'type':'enum',
                         'options': ParaSpace[label]['Mapping'],
                         'size': 1})

param_unit = []; Params = []; Val = []; Test = []; Time = []
file_dir = "./Benchmark/temp/" + str(time.time()) + str(np.random.rand(1)[0]) + "/"
if not os.path.exists(file_dir):
    os.makedirs(file_dir)

chooser = module.init(file_dir, "mcmc_iters=10")
vkeys = [k for k in variables]
gmap = GridMap([variables[k] for k in vkeys], grid_size)
grid = np.asarray(gmap.hypercube_grid(grid_size, 1)) 
values = np.zeros(grid_size) + np.nan
grid_status = np.zeros(grid.shape[0])

for i in range(np.int(Max_Runs)):
    try:
        job_id = spmint_opt(chooser, grid, values, grid_status)
    except:
        print('Spearmint Early Stop!')
        break


    if isinstance(job_id, tuple):
        (job_id, candidate) = job_id
        grid = np.vstack((grid, candidate))
        grid_status = np.append(grid_status, 2)
        values = np.append(values, np.zeros(1)+np.nan)
        job_id = grid.shape[0]-1
    else:
        candidate = grid[job_id,:]
        grid_status[job_id] = 2

    print(job_id, candidate)
    next_params = gmap.unit_to_list(candidate)
    valacc, testacc, tm = obj_func(next_params)
    values[job_id] = valacc

0 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
0 gbtree
1 [0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]
0 gbtree
20000 [0.53151872 0.5126023  0.55828276 0.70306628 0.5051355  0.50268264
 1.         0.91035543 0.52234061 0.99193004]
1 gblinear
20001 [0.70472684 0.53067201 0.35905349 0.50490099 0.48998212 0.49386614
 0.27512002 0.03095751 0.81374183 0.27266062]
1 gblinear
20002 [0.36399014 0.47144928 0.61743317 0.49596104 0.50950466 0.50108895
 0.4978231  0.72040467 0.36901153 0.57301609]
0 gbtree
20003 [0.39793078 0.27830787 0.58309346 0.23047892 0.7375476  0.50089345
 0.50138538 0.71168139 0.23692757 0.52568991]
0 gbtree
20004 [0.         0.36097738 0.79963174 0.36444964 1.         0.50156653
 0.51807454 0.62022023 0.         0.54347659]
0 gbtree
20005 [0.10720899 0.38223987 1.         0.25946626 1.         1.
 0.49673133 1.         0.         0.68162256]
0 gbtree
20006 [0.         1.         1.         0.         1.         0.
 0.         0.82703719 0.         0.        ]
0 gbtree
20007 [0.         0.

20064 [0.8379983  0.22128081 1.         0.36716645 0.         1.
 1.         0.         0.18214869 0.        ]
1 gblinear
20065 [1.         0.         1.         0.72296693 0.33195247 0.57870935
 0.90724747 0.46389895 0.37408581 0.        ]
1 gblinear
20066 [0.62528114 0.         1.         0.86202645 0.         1.
 0.9733715  0.37179514 0.35185579 0.        ]
1 gblinear
20067 [1.         0.         1.         1.         0.         0.
 1.         0.         0.41626236 0.        ]
1 gblinear
20068 [1.         0.         1.         0.43205766 0.         0.33155666
 1.         0.         0.79041064 0.        ]
1 gblinear
20069 [1.         0.         1.         1.         0.         0.48218579
 1.         0.         0.11718364 0.        ]
1 gblinear
20070 [1.         0.         1.         1.         0.62398024 0.
 0.97185204 0.25259663 0.27475582 0.        ]
1 gblinear
20071 [1.         0.65960638 1.         1.         0.         0.
 1.         0.         0.02256648 0.        ]
1 gblinear


In [None]:
gmap.unit_to_list(np.array([1.,         0. ,        1.  ,       1. ,        1. ,        0.,
 1.    ,     0.     ,    0.13944059 , 0.        ]))

A contour plot based on a thorough grid search

In [None]:
grid_num = 25
xlist = np.linspace(-6, 16, grid_num)
ylist = np.linspace(-16, 6, grid_num)
X, Y = np.meshgrid(xlist, ylist)
Z = np.zeros((grid_num,grid_num))
for i, C in enumerate(xlist):
    for j, gamma in enumerate(ylist):
        estimator = svm.SVC(C=2**C,gamma = 2**gamma)
        out = cross_val_score(estimator, x, y, cv = cv, scoring = score_metric)
        Z[j,i] = np.mean(out)
        
levels = [0.2, 0.4, 0.8, 0.9, 0.92, 0.94, 0.96, 0.98, 1.0]
cp = plt.contourf(X, Y, Z, levels)
plt.colorbar(cp)
plt.xlabel('C')
plt.ylabel('gamma')
plt.scatter(np.log2(clf.logs.loc[:,['C']]), 
            np.log2(clf.logs.loc[:,['gamma']]), color = "red")
plt.show()

# Example 2: Xgboost for Regression

In [None]:
import numpy as np
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import KFold 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, mean_squared_error
from seqmm.pybayopt import GPEISklearn

dt = datasets.load_diabetes()
sx = MinMaxScaler()
sy = MinMaxScaler()
x = sx.fit_transform(dt.data)
y = sy.fit_transform(dt.target.reshape([-1,1]))

ParaSpace = {'booster':          {'Type': 'categorical', 'Mapping': ['gbtree', 'gblinear']},
             'max_depth':        {'Type': 'integer',     'Mapping': np.linspace(2,10,9)}, 
             'n_estimators':     {'Type': 'integer',     'Mapping': np.linspace(100,500,401)},
             'min_child_weight': {'Type': 'integer',     'Mapping': np.linspace(1,100,100)},
             'subsample':        {'Type': 'continuous',  'Range': [0, 1],  'Wrapper': lambda x:x},
             'colsample_bytree': {'Type': 'continuous',  'Range': [0, 1],  'Wrapper': lambda x:x},
             'learning_rate':    {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x},
             'gamma':            {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x},
             'reg_lambda':       {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x},
             'reg_alpha':         {'Type': 'continuous',  'Range': [-5, 0], 'Wrapper': lambda x: 10**x}}

estimator = xgb.XGBRegressor()
score_metric = make_scorer(mean_squared_error, False)
cv = KFold(n_splits=5, random_state=0, shuffle=True)

clf = GPEISklearn(estimator, cv, ParaSpace, max_runs = 100, scoring = score_metric, time_out = 30, refit = True, verbose = True)
clf.fit(x, y)
clf.plot_scores()

 # Example 3: Kmeans for Unsupervised Clustering  

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold 
from sklearn.preprocessing import MinMaxScaler
from seqmm.pybayopt import GPEISklearn

sx = MinMaxScaler()
dt = datasets.load_iris()
x = sx.fit_transform(dt.data)
y = dt.target.reshape([-1,1])

ParaSpace = {'n_clusters':  {'Type': 'integer',    'Mapping': np.linspace(2,9,8)}, 
             'tol':         {'Type': 'continuous', 'Range': [-6, -3], 'Wrapper': lambda x: 10**x}}

estimator = KMeans()
cv = KFold(n_splits=5, random_state=0, shuffle=True)

clf = GPEISklearn(estimator, cv, ParaSpace, max_runs = 100, refit = True, verbose = True)
clf.fit(x, y)
clf.plot_scores()