In [13]:
#Import libraries
import pandas as pd
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [1]:
pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25ldone
[?25h  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp37-none-any.whl size=11686 sha256=f1f2b682f32cc7543b2af837a6c116d8a1632ed80c3e63e69292ee8e7f6031da
  Stored in directory: /Users/liuchuqiao/Library/Caches/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [39]:
#Bayesian optimization
def bayesian_optimization(dataset, function, parameters,
                          n_iterations=5, random_state=42):
    """Maximise ``function`` over the search box ``parameters``.

    Parameters
    ----------
    dataset : tuple
        Not used by the optimiser. It is accepted only so that existing
        callers, which pass ``(X_train, y_train, X_test, y_test)``, keep
        working unchanged.
    function : callable
        Objective to maximise; handed to ``BayesianOptimization`` as ``f``.
    parameters : dict
        Search bounds, handed to ``BayesianOptimization`` as ``pbounds``
        (parameter name -> ``(lower, upper)``).
    n_iterations : int, optional
        Number of optimisation steps passed to ``maximize`` as ``n_iter``.
    random_state : int or None, optional
        Seed for the optimiser so repeated runs explore the same points.
        Pass ``None`` to get an unseeded run.

    Returns
    -------
    dict
        ``BO.max`` -- the best point found; callers read its ``"params"``
        entry.
    """
    # Settings for the optimiser's internal Gaussian-process model.
    gp_params = {"alpha": 1e-4}

    BO = BayesianOptimization(f=function,
                              pbounds=parameters,
                              random_state=random_state)
    # Configure the GP explicitly instead of tunnelling the settings through
    # maximize(**kwargs): newer bayes_opt releases no longer accept them there.
    BO.set_gp_params(**gp_params)
    BO.maximize(n_iter=n_iterations)

    return BO.max


In [40]:
def rfc_optimization(cv_splits, X=None, y=None, scoring="neg_mean_squared_error"):
    """Build the random-forest objective and its search bounds.

    Parameters
    ----------
    cv_splits : int or CV splitter
        Forwarded to ``cross_val_score`` as ``cv``.
    X, y : optional
        Training features / targets to cross-validate on. When left as
        ``None`` the notebook-level ``X_train`` / ``y_train`` are looked up
        each time the objective is called (the original behaviour).
    scoring : str, optional
        Scorer name for ``cross_val_score``. It must be a regression metric
        where larger is better, because the estimator is a
        ``RandomForestRegressor`` and the optimiser maximises the result.
        The previous hard-coded ``"roc_auc"`` is a classification metric and
        produced ``nan`` for every evaluation.

    Returns
    -------
    (function, parameters)
        ``function(n_estimators, max_depth, min_samples_split)`` returns the
        mean cross-validated score; ``parameters`` maps each of those names
        to its ``(lower, upper)`` search bound.
    """
    def function(n_estimators, max_depth, min_samples_split):
        # Resolve the data at call time so later changes to the notebook
        # globals are still picked up when no explicit data was supplied.
        features = X_train if X is None else X
        targets = y_train if y is None else y

        # The optimiser proposes floats: truncate to int and clamp each value
        # to the smallest setting the forest accepts.
        model = RandomForestRegressor(
            n_estimators=int(max(n_estimators, 1)),
            max_depth=int(max(max_depth, 1)),
            min_samples_split=int(max(min_samples_split, 2)),
            n_jobs=-1,
            random_state=42)

        return cross_val_score(
            model,
            X=features,
            y=targets,
            cv=cv_splits,
            scoring=scoring,
            n_jobs=-1).mean()

    parameters = {"n_estimators": (10, 1000),
                  "max_depth": (1, 150),
                  "min_samples_split": (2, 10)}

    return function, parameters

In [5]:
def xgb_optimization(cv_splits, eval_set):
    """Build the XGBoost objective and its search bounds.

    ``cv_splits`` is forwarded to ``cross_val_score`` as ``cv`` and
    ``eval_set`` is forwarded to the estimator's ``fit`` through
    ``fit_params``. The objective reads the notebook-level ``X_train`` /
    ``y_train`` each time it is called.

    Returns ``(function, parameters)``: ``function(eta, gamma, max_depth)``
    gives the mean cross-validated ROC AUC, and ``parameters`` maps each of
    those names to its ``(lower, upper)`` search bound.
    """
    def function(eta, gamma, max_depth):
        # Ratio of label-0 to label-1 rows, used to re-weight the positive class.
        negatives = len(y_train[y_train == 0])
        positives = len(y_train[y_train == 1])

        classifier = xgb.XGBClassifier(
            objective="binary:logistic",
            learning_rate=max(eta, 0),
            gamma=max(gamma, 0),
            max_depth=int(max_depth),
            seed=42,
            nthread=-1,
            scale_pos_weight=negatives / positives,
        )

        # Extra keyword arguments handed to classifier.fit() on every fold.
        fit_options = {
            "early_stopping_rounds": 10,
            "eval_metric": "auc",
            "eval_set": eval_set,
        }

        fold_scores = cross_val_score(
            classifier,
            X=X_train,
            y=y_train,
            cv=cv_splits,
            scoring="roc_auc",
            fit_params=fit_options,
            n_jobs=-1,
        )
        return fold_scores.mean()

    parameters = dict(
        eta=(0.001, 0.4),
        gamma=(0, 20),
        max_depth=(1, 2000),
    )

    return function, parameters

In [44]:
from sklearn import datasets

# Build n independent synthetic regression problems (Friedman #1) and split
# each one in half: first 50 rows for training, remaining rows for testing.
# X_train[i], y_train[i], X_test[i], y_test[i] all belong to dataset i.
n = 5
X_train, X_test = [], []
y_train, y_test = [], []

for i in range(n):
    # A fixed, per-dataset seed keeps the data identical across kernel
    # restarts while still giving every dataset different samples.
    data = datasets.make_friedman1(n_samples=100, n_features=5, noise=1,
                                   random_state=42 + i)
    features, targets = data
    X_train.append(features[:50])
    X_test.append(features[50:])
    y_train.append(targets[:50])
    y_test.append(targets[50:])

In [41]:
#Train model
def train(X_train, y_train, X_test, y_test, function, parameters):
    """Tune a random forest with Bayesian optimisation, then fit it.

    Parameters
    ----------
    X_train, y_train
        Data the final model is fitted on.
    X_test, y_test
        Only bundled into the ``dataset`` tuple handed to
        ``bayesian_optimization``; not otherwise used here.
    function : callable
        Objective passed to ``bayesian_optimization``.
    parameters : dict
        Search bounds passed to ``bayesian_optimization``. The best point
        must contain ``"n_estimators"``, ``"max_depth"`` and
        ``"min_samples_split"``.

    Returns
    -------
    RandomForestRegressor
        Model built from the best hyper-parameters and fitted on
        ``X_train`` / ``y_train``.
    """
    dataset = (X_train, y_train, X_test, y_test)

    best_solution = bayesian_optimization(dataset, function, parameters)
    params = best_solution["params"]

    # The optimiser returns floats: truncate to int and clamp each value to the
    # smallest setting the forest accepts (a forest needs at least one tree).
    model = RandomForestRegressor(
        n_estimators=int(max(params["n_estimators"], 1)),
        max_depth=int(max(params["max_depth"], 1)),
        min_samples_split=int(max(params["min_samples_split"], 2)),
        n_jobs=-1,
        random_state=42)

    model.fit(X_train, y_train)

    return model


In [45]:
# Build the random-forest objective (cv=5) together with its search bounds,
# then tune and fit. The fitted model is the cell's displayed value.
# NOTE(review): X_train / y_train / X_test / y_test are built earlier as lists
# of n arrays (one per generated dataset), not a single dataset -- confirm
# that passing the whole lists here is intended.
function, parameters = rfc_optimization(cv_splits=5)
train(X_train, y_train, X_test, y_test,
      function=function, parameters=parameters)

|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 71.19   [0m | [0m 4.99    [0m | [0m 268.2   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 141.3   [0m | [0m 5.581   [0m | [0m 712.0   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 19.85   [0m | [0m 7.428   [0m | [0m 395.0   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 88.82   [0m | [0m 7.972   [0m | [0m 966.7   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 111.2   [0m | [0m 4.377   [0m | [0m 201.9   [0m |


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').