## 1. 贝叶斯优化

In [1]:
# import package
import xgboost as xgb
import pandas as pd
import numpy as np
# !pip install bayesian-optimization
from bayes_opt import BayesianOptimization
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn import model_selection

import json

In [2]:
# read data
# data link: https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
wine_df = pd.read_csv('./winequality-red.csv', sep=';')
wine_df['ID'] = range(1, len(wine_df)+1)
# Y = wine_df.quality.values
# wine_df.quality = np.asarray([1 if i>=6 else 0 for i in Y])
wine_df['quality'] = wine_df['quality'].apply(lambda x: 1 if x>=6 else 0)

# split train_test data
msk = np.random.rand(len(wine_df)) < 0.8
train = wine_df[msk]
test = wine_df[~msk]

label = 'quality'
IDcol = 'ID'
features = [x for x in train.columns if x not in [label, IDcol]]

# convert data format
dtrain = xgb.DMatrix(train[features].values, train[label].values)

In [3]:
# define optimize function
def xgb_optimize(learning_rate, n_estimators, min_child_weight, max_depth, subsample, gamma, alpha):
    params = {}
    params['learning_rate'] = float(learning_rate)
    params['min_child_weight'] = int(min_child_weight)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)
    params['objective'] = 'binary:logistic'

    cv_result = xgb.cv( params, 
                        dtrain, 
                        num_boost_round=int(n_estimators),
                        nfold=5,
                        seed=10,
                        metrics=['auc'],
                        callbacks=[xgb.callback.EarlyStopping(50)])
    return cv_result['test-auc-mean'].iloc[-1]

In [4]:
# instantiation and input params
xgb_opt = BayesianOptimization(xgb_optimize, {  'learning_rate': (0.05, 0.5),
                                                'n_estimators': (50, 500),
                                                'min_child_weight': (1, 10),
                                                'max_depth': (4, 10),
                                                'subsample': (0.5, 1),
                                                'gamma': (0, 10),
                                                'alpha':(0, 10)})

In [5]:
xgb_opt.maximize(init_points=5, n_iter=30)

|   iter    |  target   |   alpha   |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8258  [0m | [0m 1.728   [0m | [0m 9.429   [0m | [0m 0.1192  [0m | [0m 9.302   [0m | [0m 6.505   [0m | [0m 177.6   [0m | [0m 0.9912  [0m |
| [0m 2       [0m | [0m 0.8228  [0m | [0m 3.834   [0m | [0m 7.81    [0m | [0m 0.17    [0m | [0m 7.969   [0m | [0m 2.224   [0m | [0m 426.6   [0m | [0m 0.5931  [0m |
| [95m 3       [0m | [95m 0.8549  [0m | [95m 1.731   [0m | [95m 1.274   [0m | [95m 0.2261  [0m | [95m 5.692   [0m | [95m 1.147   [0m | [95m 398.9   [0m | [95m 0.5138  [0m |
| [0m 4       [0m | [0m 0.8168  [0m | [0m 4.727   [0m | [0m 8.562   [0m | [0m 0.1233  [0m | [0m 7.645   [0m | [0m 3.508   [0m | [0m 205.2   [0m | [0m 0.5345  [0m |
| [0m 5       [0m | [0m 0.8372  [0m | 

In [6]:
print(json.dumps(xgb_opt.max['params'], indent=4, sort_keys=True))

{
    "alpha": 0.16558540376060082,
    "gamma": 0.0003657217002028812,
    "learning_rate": 0.07551952882529753,
    "max_depth": 9.850471727665845,
    "min_child_weight": 3.0281294265751795,
    "n_estimators": 238.0502069289133,
    "subsample": 0.5174565818606155
}


In [7]:
# define fit function
def model_fit(bst, train, test, features, cv_result):
    bst.set_params(n_estimators=cv_result.shape[0], eval_metric=['auc'])
    bst.fit(train[features], train[label])
    train_predict_prob = bst.predict_proba(train[features])[:,1]
    train_auc = metrics.roc_auc_score(train[label], train_predict_prob)
    print("AUC得分 (训练集): %f" % train_auc)
    
    test['prob'] = bst.predict_proba(test[features])[:,1]
    test_auc = metrics.roc_auc_score(test[label], test['prob'])
    print('AUC得分 (测试集): %f' % test_auc)

In [8]:
# define cross validation
def model_cv(bst, train, features, nfold=5, early_stopping_rounds=30):
    params = bst.get_xgb_params()
    train = xgb.DMatrix(train[features].values, train[label].values)
    
    cv_result = xgb.cv(
                        params,
                        train,
                        num_boost_round=bst.get_params()['n_estimators'],
                        nfold=nfold,
                        metrics=['auc'],
                        early_stopping_rounds = early_stopping_rounds)
    print("最优轮数 : %d" % cv_result.shape[0])
    print("最优轮详情：")
    print(cv_result[cv_result.shape[0] - 1:])
    return cv_result

In [9]:
model = XGBClassifier(  learning_rate =0.184,
                        n_estimators=176,
                        max_depth=10,
                        min_child_weight=1.8963,
                        objective= 'binary:logistic',
                        subsample=0.8627,
                        nthread=4,
                        gamma=0.25,
                        alpha=0.05342,
                        seed=10)
cv_result = model_cv(model, train, features)

model_fit(model, train, test, features, cv_result)

最优轮数 : 50
最优轮详情：
    train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
49        0.999984        0.00001       0.851399      0.021094
AUC得分 (训练集): 1.000000
AUC得分 (测试集): 0.871088


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
