In [52]:
# import package
import xgboost as xgb
import pandas as pd
import numpy as np
# !pip install bayesian-optimization
from bayes_opt import BayesianOptimization
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV

In [53]:
# read data
# data link: https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
wine_df = pd.read_csv('./winequality-red.csv', sep=';')
wine_df['ID'] = range(1, len(wine_df)+1)
# Y = wine_df.quality.values
# wine_df.quality = np.asarray([1 if i>=6 else 0 for i in Y])
wine_df['quality'] = wine_df['quality'].apply(lambda x: 1 if x>=6 else 0)

# split train_test data
msk = np.random.rand(len(wine_df)) < 0.8
train = wine_df[msk]
test = wine_df[~msk]

label = 'quality'
IDcol = 'ID'
features = [x for x in train.columns if x not in [label, IDcol]]

# convert data format
dtrain = xgb.DMatrix(train[features].values, train[label].values)

In [54]:
# define cross validation
def model_cv(bst, train, features, nfold=5, early_stopping_rounds=30):
    params = bst.get_xgb_params()
    train = xgb.DMatrix(train[features].values, train[label].values)
    
    cv_result = xgb.cv(
                        params,
                        train,
                        num_boost_round=bst.get_params()['n_estimators'],
                        nfold=nfold,
                        metrics=['auc'],
                        early_stopping_rounds = early_stopping_rounds)
    print("最优轮数 : %d" % cv_result.shape[0])
    print("最优轮详情：")
    print(cv_result[cv_result.shape[0] - 1:])
    return cv_result

In [55]:
# define fit function
def model_fit(bst, train, test, features, cv_result):
    bst.set_params(n_estimators=cv_result.shape[0], eval_metric=['auc'])
    bst.fit(train[features], train[label])
    train_predict_prob = bst.predict_proba(train[features])[:,1]
    train_auc = metrics.roc_auc_score(train[label], train_predict_prob)
    print("AUC得分 (训练集): %f" % train_auc)
    
    test['prob'] = bst.predict_proba(test[features])[:,1]
    test_auc = metrics.roc_auc_score(test[label], test['prob'])
    print('AUC得分 (测试集): %f' % test_auc)

In [56]:
model1 = XGBClassifier( learning_rate =0.1,
                        n_estimators=1000,
                        max_depth=4,
                        min_child_weight=1,
                        objective= 'binary:logistic',
                        subsample=0.8,
                        colsample_bytree=0.8,
                        nthread=4,
                        scale_pos_weight=1,
                        seed=10)
cv_result = model_cv(model1, train, features)
model_fit(model1, train, test, features, cv_result)

最优轮数 : 129
最优轮详情：
     train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
128        0.994242       0.001389       0.871962      0.010059
AUC得分 (训练集): 0.989219
AUC得分 (测试集): 0.845669


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


#### max_depth、min_child_wright参数调优

In [57]:
param1 = {
    "max_depth":range(3, 10, 2),
    "min_child_weight":range(1, 6, 2)
}

bst1 = XGBClassifier(
                        learning_rate=0.1,
                        n_estimators=136,
                        max_depth=4,
                        min_child_weight=1,
                        objective='binary:logistic',
                        nthread=8,
                        scale_pos_weight=1,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        seed=10)

grid_search1 = GridSearchCV(estimator=bst1,
                            param_grid=param1,
                            scoring='roc_auc',
                            n_jobs=8,
                            cv=5)

grid_search1.fit(train[features], train[label])
print(grid_search1.best_score_,grid_search1.best_params_)

0.815400689902854 {'max_depth': 3, 'min_child_weight': 1}


In [63]:
param2 = {
    "max_depth":[2,3,4],
    "min_child_weight":[1,2]
}

bst2 = XGBClassifier(
                        learning_rate=0.1,
                        n_estimators=136,
                        objective='binary:logistic',
                        nthread=8,
                        scale_pos_weight=1,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        seed=10)

grid_search2 = GridSearchCV(estimator=bst2,
                            param_grid=param1,
                            scoring='roc_auc',
                            n_jobs=8,
                            cv=5)

grid_search2.fit(train[features], train[label])
print(grid_search2.best_score_,grid_search2.best_params_)

0.815400689902854 {'max_depth': 3, 'min_child_weight': 1}


#### gamma参数调优

In [59]:
param3 = {
    "gamma":[i/10.0  for i in range(0,5)]
}

bst3 = XGBClassifier(
                        learning_rate=0.1,
                        n_estimators=136,
                        max_depth=3,
                        min_child_weight=1,
                        objective='binary:logistic',
                        nthread=8,
                        scale_pos_weight=1,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        seed=10)

grid_search3 = GridSearchCV(estimator=bst3,
                            param_grid=param3,
                            scoring='roc_auc',
                            n_jobs=8,
                            cv=5)

grid_search3.fit(train[features], train[label])
print(grid_search3.best_score_,grid_search3.best_params_)

0.8179423788955168 {'gamma': 0.1}


#### 以最优参数训啦

In [60]:
model2 = XGBClassifier( learning_rate =0.1,
                        n_estimators=500,
                        max_depth=3,
                        min_child_weight=1,
                        gamma=0.1,
                        objective= 'binary:logistic',
                        subsample=0.8,
                        colsample_bytree=0.8,
                        nthread=4,
                        scale_pos_weight=1,
                        seed=10)
cv_result = model_cv(model2, train, features)
model_fit(model2, train, test, features, cv_result)

最优轮数 : 175
最优轮详情：
     train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
174         0.98037       0.001266       0.866058      0.011882
AUC得分 (训练集): 0.973058
AUC得分 (测试集): 0.846356


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


#### subsample和colsample_bytree参数调优

In [62]:
param4 = {
    "subsample":[i/10.0  for i in range(6,11)],
    "colsample_bytree":[i/10.0  for i in range(6,11)]
}

bst4 = XGBClassifier(
                        learning_rate=0.1,
                        n_estimators=175,
                        max_depth=3,
                        min_child_weight=1,
                        objective='binary:logistic',
                        nthread=8,
                        gamma=0.1,
                        scale_pos_weight=1,
                        seed=10)

grid_search4 = GridSearchCV(estimator=bst4,
                            param_grid=param4,
                            scoring='roc_auc',
                            n_jobs=8,
                            cv=5)

grid_search4.fit(train[features], train[label])
print(grid_search4.best_score_,grid_search4.best_params_)

0.8200104473833933 {'colsample_bytree': 0.6, 'subsample': 0.9}


In [64]:
param5 = {
    "subsample":[i/100.0  for i in range(55,65,5)],
    "colsample_bytree":[i/100.0  for i in range(85,95,5)]
}

bst5 = XGBClassifier(
                        learning_rate=0.1,
                        n_estimators=175,
                        max_depth=3,
                        min_child_weight=1,
                        objective='binary:logistic',
                        nthread=8,
                        gamma=0.1,
                        scale_pos_weight=1,
                        seed=10)

grid_search5 = GridSearchCV(estimator=bst5,
                            param_grid=param5,
                            scoring='roc_auc',
                            n_jobs=8,
                            cv=5)

grid_search5.fit(train[features], train[label])
print(grid_search5.best_score_,grid_search5.best_params_)

0.8111210718075009 {'colsample_bytree': 0.85, 'subsample': 0.55}


#### 正则参数调优

In [65]:
param6 = {
    "reg_alpha":[0, 1e-5, 1e-2, 0.1, 1, 100]
}

bst6 = XGBClassifier(
                        learning_rate=0.1,
                        n_estimators=175,
                        max_depth=3,
                        min_child_weight=1,
                        objective='binary:logistic',
                        nthread=8,
                        gamma=0.1,
                        scale_pos_weight=1,
                        colsample_bytree=0.85,
                        subsample=0.55,
                        seed=10)

grid_search6 = GridSearchCV(estimator=bst6,
                            param_grid=param6,
                            scoring='roc_auc',
                            n_jobs=8,
                            cv=5)

grid_search6.fit(train[features], train[label])
print(grid_search6.best_score_,grid_search6.best_params_)

0.8111210718075009 {'reg_alpha': 0}


In [69]:
param7 = {
    "reg_alpha":[0, 1e-08, 1e-07, 1e-06]
}

bst7 = XGBClassifier(
                        learning_rate=0.1,
                        n_estimators=175,
                        max_depth=3,
                        min_child_weight=1,
                        objective='binary:logistic',
                        nthread=8,
                        gamma=0.1,
                        scale_pos_weight=1,
                        colsample_bytree=0.85,
                        subsample=0.55,
                        seed=10)

grid_search7 = GridSearchCV(estimator=bst7,
                            param_grid=param7,
                            scoring='roc_auc',
                            n_jobs=8,
                            cv=5)

grid_search7.fit(train[features], train[label])
print(grid_search7.best_score_,grid_search7.best_params_)

0.8111210718075009 {'reg_alpha': 0}


#### reg_alpha and learning_rate

In [70]:
model3 = XGBClassifier( learning_rate =0.05,
                        n_estimators=500,
                        max_depth=3,
                        min_child_weight=1,
                        gamma=0.1,
                        reg_alpha=0,
                        objective= 'binary:logistic',
                        subsample=0.8,
                        colsample_bytree=0.8,
                        nthread=4,
                        scale_pos_weight=1,
                        seed=10)
cv_result = model_cv(model3, train, features)
model_fit(model3, train, test, features, cv_result)

最优轮数 : 225
最优轮详情：
     train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
224        0.959825       0.001379       0.863418      0.010058
AUC得分 (训练集): 0.953947
AUC得分 (测试集): 0.849751


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
