In [1]:
import time

import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from sklearn.datasets import make_classification
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    roc_auc_score,
)
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from xgboost import XGBClassifier

Bayesian optimization using [Hyperopt](https://mlwhiz.com/blog/2017/12/28/hyperopt_tuning_ml_model/).

In [2]:
# Make a toy dataset: 300 samples, 25 features, binary target.
# make_classification is already imported in the imports cell, so it is not
# re-imported here.
X, y = make_classification(
    n_samples=300,
    n_features=25,
    n_informative=2,
    n_redundant=10,
    n_classes=2,
    random_state=8,  # fixed seed so the generated data is the same on every run
)

In [3]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=465,
    test_size=0.2,
)

In [7]:
def objective(space):
    """Hyperopt objective: score one XGBoost configuration.

    Reads the module-level ``X_train`` / ``y_train`` and carves a validation
    split out of them. The held-out test set is deliberately NOT touched here:
    using it for early stopping or for the trial score would leak test
    information into the hyperparameter search and make the final test
    evaluation optimistic.

    Parameters
    ----------
    space : dict
        One sampled point of the search space, with the keys
        ``n_estimators``, ``colsample_bytree``, ``learning_rate``,
        ``max_depth``, ``min_child_weight``, ``subsample`` and ``gamma``.

    Returns
    -------
    dict
        ``{"loss": -validation_auc, "status": STATUS_OK}``. The AUC is negated
        because ``fmin`` minimises the loss, while a higher AUC is better.
    """
    clf = XGBClassifier(
        n_estimators=space["n_estimators"],
        colsample_bytree=space["colsample_bytree"],
        learning_rate=space["learning_rate"],
        max_depth=space["max_depth"],
        min_child_weight=space["min_child_weight"],
        subsample=space["subsample"],
        gamma=space["gamma"],
        # Passed to the constructor rather than fit(): the fit() keywords were
        # deprecated in xgboost 1.6 and removed in 2.0.
        eval_metric="auc",
        early_stopping_rounds=10,
    )

    # Fixed seed so every trial is scored on the same validation rows;
    # stratify keeps both classes present in the small validation fold.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=0, stratify=y_train
    )

    clf.fit(
        X_fit,
        y_fit,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    # ROC AUC is a ranking metric: feed it positive-class probabilities,
    # not thresholded 0/1 predictions.
    val_scores = clf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_scores)

    return {"loss": -auc, "status": STATUS_OK}


# Search space handed to hyperopt. Integer-valued parameters are drawn from an
# explicit list of candidates; the remaining ones are sampled uniformly from a
# continuous interval.
space = dict(
    # Integer candidates (upper bound of np.arange is exclusive).
    max_depth=hp.choice("max_depth", np.arange(2, 11, dtype=int)),
    min_child_weight=hp.choice("min_child_weight", np.arange(2, 11, dtype=int)),
    n_estimators=hp.choice("n_estimators", np.arange(10, 1001, dtype=int)),
    # Continuous ranges.
    subsample=hp.uniform("subsample", 0.5, 1),
    gamma=hp.uniform("gamma", 0, 5),
    learning_rate=hp.uniform("learning_rate", 0.01, 0.4),
    colsample_bytree=hp.uniform("colsample_bytree", 0.7, 1),
)


# Trials keeps the record of every evaluation so the search history can be
# inspected after the run.
trials = Trials()

# Run 100 evaluations of `objective` over `space`, with tpe.suggest proposing
# each next candidate.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    trials=trials,
    max_evals=100,
)

{'colsample_bytree': 0.8772890295173994, 'gamma': 0.36455399113729536, 'learning_rate': 0.10890762328307076, 'max_depth': 6.0, 'min_child_weight': 9.0, 'reg_lambda': 0.8676589084988392, 'subsample': 0.9411745386011886}
{'colsample_bytree': 0.8058941661780578, 'gamma': 0.38219199413803573, 'learning_rate': 0.10555663815331348, 'max_depth': 7.0, 'min_child_weight': 6.0, 'reg_lambda': 0.4307733123112034, 'subsample': 0.8608179944915223}
{'colsample_bytree': 0.7894410579466669, 'gamma': 0.10149102452879136, 'learning_rate': 0.04865044955768253, 'max_depth': 4.0, 'min_child_weight': 2.0, 'reg_lambda': 0.924084758283155, 'subsample': 0.8841456459388891}
{'colsample_bytree': 0.919603911972503, 'gamma': 0.1741317714564891, 'learning_rate': 0.28228265199736585, 'max_depth': 6.0, 'min_child_weight': 4.0, 'reg_lambda': 0.7756492078808049, 'subsample': 0.9873349413399752}
{'colsample_bytree': 0.7656008001213244, 'gamma': 0.42100781524023445, 'learning_rate': 0.45335886355889776, 'max_depth': 6.0, 

In [None]:
# Show the raw result returned by fmin.
# NOTE(review): for hp.choice dimensions hyperopt reports the index of the
# selected option, not the option's value — confirm with
# space_eval(space, best) before reusing these numbers as hyperparameters.
best

In [None]:
# fmin reports hp.choice dimensions as the index of the chosen option, so
# `best` cannot be passed to the model directly (an index of 0 would become
# max_depth=0, for example). space_eval maps the result back to real values.
best_params = space_eval(space, best)
xgb_opt = XGBClassifier(**best_params)

In [None]:
# Function: Train classifier and evaluate performance
def model_fit(
    model,
    data_train,
    data_test,
    target_train,
    target_test,
    display_labels=("survived", "dead"),
):
    """Fit ``model``, report cross-validated AUC, and evaluate on test data.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    data_train, target_train
        Training features and labels, used for the fit and for the 10-fold
        cross-validation.
    data_test, target_test
        Held-out features and labels, used for the classification report and
        the confusion matrix.
    display_labels : sequence of str, optional
        Tick labels for the confusion matrix, ordered like the class labels.
        Defaults to the labels this function has always used.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    # Fit on training data
    model.fit(data_train, target_train)

    # Cross validate on training data
    scores = cross_val_score(model, data_train, target_train, cv=10, scoring="roc_auc")
    print("AUC in each fold: {}; mean: {}\n".format(scores, scores.mean()))

    # Predict outcome for testing data
    y_pred = model.predict(data_test)

    # Display performance metrics
    xgb_eval = classification_report(target_test, y_pred, labels=[0, 1])
    print(xgb_eval)

    # Confusion matrix. plot_confusion_matrix was removed in scikit-learn 1.2;
    # ConfusionMatrixDisplay.from_estimator is its replacement. The colormap
    # is given by name and the title is set on the returned axes, so no
    # pyplot import is needed.
    matrix_display = ConfusionMatrixDisplay.from_estimator(
        model,
        data_test,
        target_test,
        display_labels=display_labels,
        cmap="Blues",
        normalize=None,
    )
    matrix_display.ax_.set_title("Confusion matrix")

In [None]:
# Train the tuned classifier and report its performance on the held-out set.
model_fit(
    model=xgb_opt,
    data_train=X_train,
    data_test=X_test,
    target_train=y_train,
    target_test=y_test,
)