## Bayesian optimization using Gaussian Processes

In [11]:
import pandas as pd
import numpy as np


from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

from functools import partial
from skopt import space, gp_minimize

In [12]:
df = pd.read_csv("./mobile_price_data/train.csv")
X = df.drop("price_range", axis=1).values
y = df["price_range"].values

## 定義代理函數

In [13]:
def optimize_model(params, param_names, x, y):
    params = dict(zip(param_names, params))
    model = RandomForestClassifier(**params)
    kf = StratifiedKFold(n_splits=5)
    accuracies = []

    for idx in kf.split(X=x, y=y):
        train_idx, test_idx = idx[0], idx[1]
        x_train = x[train_idx]
        y_train = y[train_idx]

        x_test = x[test_idx]
        y_test = y[test_idx]

        model.fit(x_train, y_train)
        preds = model.predict(x_test)

        fold_acc = metrics.accuracy_score(y_test, preds)
        accuracies.append(fold_acc)

        return -1.0 * np.mean(accuracies)


## 參數空間選定

In [17]:
param_space = [
    space.Integer(3, 15, name="max_depth"),
    space.Integer(100, 600, name="n_estimators"),
    space.Categorical(["gini", "entropy"], name="criterion"),
    space.Real(0.01, 1, prior="uniform", name="max_features")
]

param_names = [ "max_depth", "n_estimators", "criterion", "max_features"]

In [18]:
optimize_function = partial(
    optimize_model,
    param_names=param_names,
    x=X,
    y=y
)


## 高斯優化

In [19]:
result = gp_minimize(
    optimize_function,
    dimensions=param_space,
    n_calls=15,
    n_random_starts=10,
    verbose=10
)


Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.3171
Function value obtained: -0.8325
Current minimum: -0.8325
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 1.9206
Function value obtained: -0.8800
Current minimum: -0.8800
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.7970
Function value obtained: -0.8700
Current minimum: -0.8800
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 2.3497
Function value obtained: -0.9025
Current minimum: -0.9025
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.4380
Function value obtained: -0.8575
Current minimum: -0.9025
Iteration No: 6 started. 

In [25]:
print("Best parameters found:", dict(zip(param_names, result.x)))
print("Best accuracy:", -result.fun)

Best parameters found: {'max_depth': 11, 'n_estimators': 594, 'criterion': 'entropy', 'max_features': 0.9780310947415617}
Best accuracy: 0.9175
