<a href="https://colab.research.google.com/github/Zhangfeyy/Bayesian_Optimizers/blob/main/BayesianOptimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, train_test_split, cross_validate
from sklearn.metrics import accuracy_score,precision_score,f1_score,confusion_matrix,make_scorer
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from scipy.stats import randint,uniform
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
import time


In [7]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-3.1.0-py3-none-any.whl.metadata (11 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading bayesian_optimization-3.1.0-py3-none-any.whl (36 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-3.1.0 colorama-0.4.6


In [10]:
from bayes_opt import BayesianOptimization

# Load the dataset

In [9]:
# Read the raw table from the CSV that sits next to the notebook.
data1 = pd.read_csv(r"./data.csv")

# Replace the header with short labels; the assignment expects exactly nine columns.
data1.columns = ['D', 'H', 'M', 'co', 'EC', 'T', 'E', 'h', 'fc']

# Feature matrix: indexing with a list of labels returns a 2-D DataFrame.
X = data1[['D', 'H', 'M', 'co', 'EC', 'T', 'E', 'h']]

# Target: selecting a single label through .loc returns a 1-D Series.
y = data1.loc[:, 'fc']

FileNotFoundError: [Errno 2] No such file or directory: './data.csv'

# Define the objective function f(x)

In [11]:
def bayesopt_objective(a,b,c,d):
  """Score one hyperparameter candidate for the Bayesian optimizer.

  The optimizer proposes every dimension as a float, so integer-valued
  hyperparameters are cast before the model is built.

  Parameters
  ----------
  a : float
      Candidate for ``n_estimators`` (truncated to int).
  b : float
      Candidate for ``max_depth`` (truncated to int).
  c : float
      Candidate for ``max_features`` (truncated to int and clamped to the
      number of columns in the global ``X``).
  d : float
      Candidate for ``min_impurity_decrease`` (kept as a float).

  Returns
  -------
  float
      Mean of the 5-fold ``neg_root_mean_squared_error`` test scores, i.e.
      the negative RMSE. Larger is better, which suits a maximizer.

  Notes
  -----
  Reads the module-level ``X`` and ``y``.
  """
  # max_features cannot usefully exceed the number of feature columns, so
  # clamp it to [1, n_features] instead of passing an out-of-range value.
  n_features = X.shape[1]
  max_features = max(1, min(int(c), n_features))

  reg = RFR(n_estimators=int(a),
        max_depth=int(b),
        max_features=max_features,
        # min_impurity_decrease is a continuous threshold; the previous int()
        # cast collapsed every sample in [0, 1) to 0 and made `d` a dead dimension.
        min_impurity_decrease=float(d),
        random_state=60,
        verbose=False,# suppresses output during training
  )
  # Cross-validated score is the quantity being optimized.
  # The optimizer maximizes, so the negated RMSE is used.
  cv = KFold(n_splits=5,shuffle=True,random_state=60)
  validation_loss = cross_validate(reg,
                    X,
                    y,
                    scoring="neg_root_mean_squared_error",
                    cv=cv,
                    verbose=False,
                    n_jobs=-1,# use all available CPU cores for faster computation
                    error_score="raise" # raises an exception if an error occurs
                    )
  return np.mean(validation_loss["test_score"])



# Define x domain

In [12]:
# The search space must be a dict that maps each objective argument to a
# (lower, upper) tuple; both ends are included and sampled values are floats.

# The ranges below are chosen by hand.
# "c" feeds max_features, so it is bounded by the 8 feature columns in X;
# the earlier (10, 20) range lay entirely above the number of available features.
param_grid_simple = {"a":(20,300),  # n_estimators
            "b":(2,20),   # max_depth
            "c":(1,8),    # max_features
            "d":(0,1)     # min_impurity_decrease
           }

# Bayes optimization

In [13]:

def param_bayes_opt(init_points,n_iter):
  """Run a Bayesian search over ``param_grid_simple`` and report the best trial.

  Parameters
  ----------
  init_points : int
      Passed to ``maximize`` as ``init_points``.
  n_iter : int
      Passed to ``maximize`` as ``n_iter``.

  Returns
  -------
  tuple
      ``(params, target)`` taken from the optimizer's ``max`` entry.
  """
  optimizer = BayesianOptimization(f=bayesopt_objective,
                   pbounds=param_grid_simple,
                   random_state=60)

  # Launch the search with the requested budget.
  optimizer.maximize(init_points=init_points, n_iter=n_iter)

  # `max` holds the best observation seen so far as a dict.
  best = optimizer.max
  return best['params'], best['target']

# Implementation

In [None]:
# Time the whole search: 20 initial points plus 200 iterations,
# i.e. 220 objective evaluations in total.
start = time.time()
params_best, score_best = param_bayes_opt(20,200)
print("it takes %s second" % (time.time()-start))

# No `bayes_opt_validation` helper is defined in this notebook, so calling it
# raised NameError. Re-score the winning parameters with the existing objective;
# its keys ('a', 'b', 'c', 'd') match the objective's argument names.
validation_score = bayesopt_objective(**params_best)
print("\n","\n",validation_score)


In [None]:
# Random forest regressor with hand-written hyperparameters.
# NOTE(review): max_features=20 is larger than the 8 feature columns selected
# into X — confirm the intended value.
reg2 = RFR(
    n_estimators=68,
    max_depth=20,
    max_features=20,
    min_impurity_decrease=0,
    random_state=60,
    n_jobs=-1,
    verbose=False,
)

In [None]:
# Metrics to collect; each key becomes a `test_<key>` entry in the results.
# The error metrics are scikit-learn's negated scorers (higher is better).
scoring= {
    'r2' : 'r2',
    'mae' : 'neg_mean_absolute_error',
    'mse' : 'neg_mean_squared_error',
    # was misspelled with hyphens, which is not a registered scorer name
    'rmse' : 'neg_root_mean_squared_error'
}

cv = KFold(n_splits=5,shuffle=True,random_state=6)
# `cv` is keyword-only in cross_validate; passing it positionally fails.
results = cross_validate(reg2, X, y, cv=cv, scoring=scoring)

# Negate the error scorers to report them as positive errors.
print(f"R^2:{np.mean(results['test_r2'])}")
print(f"MAE:{-np.mean(results['test_mae'])}")
print(f"MSE:{-np.mean(results['test_mse'])}")
# Use the RMSE scorer directly: taking sqrt of the negated MSE values gave NaN.
print(f"RMSE:{-np.mean(results['test_rmse'])}")


# Plot

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict

# `y_pred` was never computed in this notebook. Build out-of-fold predictions
# so every sample is predicted by a model that did not see it during fitting;
# reuses the `cv` splitter defined with the evaluation metrics.
y_pred = cross_val_predict(reg2, X, y, cv=cv)

plt.figure(figsize=(10,5))
plt.plot(y,color='red',label='True value')
plt.plot(y_pred, color='blue',label='Predictive value',linestyle='--')
plt.legend()

plt.title('True vs. cross-validated predicted values')
plt.xlabel('sample points')
plt.ylabel('value')
plt.show()