<a href="https://colab.research.google.com/github/aigonna/ML_Skills/blob/main/3_XGboost_CV_tuning_parameters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
cp -r /content/drive/Shareddrives/mm/kaggles/tatanic/data /content/

In [22]:
import os
import time
import re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

import xgboost as xgb
from pathlib import Path

In [4]:
data_dir = Path("/content/data")

train_data = pd.read_csv(data_dir/'train.csv')
test_data = pd.read_csv(data_dir/'test.csv')
sample = pd.read_csv(data_dir/'gender_submission.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train_data.shape

(891, 12)

In [6]:
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)  # 搜索字母后接.像Mr.这种敬称
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""


def create_extra_features(data):
    data['Ticket_type'] = data['Ticket'].map(lambda x: x[0:3])  # Ticket_type只要Ticket前3字符
    data['Name_Words_Count'] = data['Name'].map(lambda x: len(x.split()))  # name 词数
    # 如果不nan就是有Cabin,转换为1,0int型数据
    data['Has_Cabin'] = data["Cabin"].map(lambda x: 1 - int(type(x) == float))
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean()).astype('int')
    data['Age'] = data['Age'].fillna(data['Age'].mean()).astype('int')

    data['Title'] = data['Name'].apply(get_title).replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    data['Title'] = data['Title'].replace('Mlle', 'Miss')
    data['Title'] = data['Title'].replace('Ms', 'Miss')
    data['Title'] = data['Title'].replace('Mme', 'Mrs')
    data['Title'] = data['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}).fillna(0)
    data['Sex'] = data['Sex'].map({'female': 1, 'male':0}).astype(int)
    data['Embarked'] = data['Embarked'].map({"C":0,"Q":1,"S":2}).fillna(0).astype(int)

    return data

In [7]:
train_data = create_extra_features(train_data)
test_data = create_extra_features(test_data)

In [8]:
train_data.drop(['PassengerId', 'Name', 'Ticket', 'Ticket_type', 'Cabin'], axis=1, inplace=True)
test_data.drop(['PassengerId', 'Name', 'Ticket', 'Ticket_type', 'Cabin'], axis=1, inplace=True)
X = train_data.drop(['Survived'], axis=1, inplace=False)
y = train_data['Survived']

In [9]:
def f1_metric(y_true, y_pred):
    return f1_score(y_true, (y_pred > 0.5).astype(int))

##1.先调n_estimators

In [10]:
SEED = 2020
cv_params = {'n_estimators': [50, 100, 200, 300, 500, 700]}
fixed_params = {'eta': 1e-4,
                'max_depth': 8,
                'min_child_weight': 1,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'reg_lambda': 0.1,
                'reg_alpha': 0.1,
                'gamma': 0.1,
                'seed':SEED,
                'tree_method': 'gpu_hist',
                'predictor': 'gpu_predictor'
                }

In [11]:
model = xgb.XGBClassifier(**fixed_params)
optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1, )
optimized_XGB.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, eta=0.0001,
                                     gamma=0.1, learning_rate=0.1,
                                     max_delta_step=0, max_depth=8,
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     predictor='gpu_predictor', random_state=0,
                                     reg_alpha=0.1, reg_lambda=0.1,
                                     scale_pos_weight=1, seed=2020, silent=None,
                                     subsample=0.8, tree_method='gpu_hist',
                                     verbosity=1),
             iid='deprecated', n_jobs

In [12]:
cv_result = optimized_XGB.cv_results_
print("每轮结果:", cv_result)
print("参数最佳值{}".format(optimized_XGB.best_params_))
print("最佳参数时模型分数:{}".format(optimized_XGB.best_score_))

每轮结果: {'mean_fit_time': array([ 2.51054478,  3.47501941,  6.49996791,  9.39237518, 14.83499751,
       17.15951338]), 'std_fit_time': array([0.36991085, 0.0988221 , 0.20566824, 0.23797885, 0.38571457,
       3.348788  ]), 'mean_score_time': array([0.00460768, 0.00748944, 0.00533576, 0.00774097, 0.01062379,
       0.00810347]), 'std_score_time': array([0.0002009 , 0.00233454, 0.00010953, 0.0012977 , 0.00260538,
       0.00257486]), 'param_n_estimators': masked_array(data=[50, 100, 200, 300, 500, 700],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 50}, {'n_estimators': 100}, {'n_estimators': 200}, {'n_estimators': 300}, {'n_estimators': 500}, {'n_estimators': 700}], 'split0_test_score': array([0.87068511, 0.87160738, 0.8671278 , 0.86475626, 0.86488801,
       0.86291173]), 'split1_test_score': array([0.8138369 , 0.81330214, 0.81717914, 0.8177139 , 0.81450535,
       0.81610963]), 'split2_test_sc

- 现在还可把n_estimators 更细致调一调





In [16]:
SEED = 2020
cv_params = {'n_estimators': [50, 100, 150, 200]}
fixed_params = {'eta': 1e-4,
                'max_depth': 8,
                'min_child_weight': 1,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'reg_lambda': 0.1,
                'reg_alpha': 0.1,
                'gamma': 0.1,
                'seed':SEED,
                'tree_method': 'gpu_hist',
                'predictor': 'gpu_predictor'
                }


In [17]:
model = xgb.XGBClassifier(**fixed_params)
optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1, )
optimized_XGB.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   21.3s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, eta=0.0001,
                                     gamma=0.1, learning_rate=0.1,
                                     max_delta_step=0, max_depth=8,
                                     min_child_weight=1, missing=None,
                                     n_estimators=100, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     predictor='gpu_predictor', random_state=0,
                                     reg_alpha=0.1, reg_lambda=0.1,
                                     scale_pos_weight=1, seed=2020, silent=None,
                                     subsample=0.8, tree_method='gpu_hist',
                                     verbosity=1),
             iid='deprecated', n_jobs

In [18]:
cv_result = optimized_XGB.cv_results_
print("每轮结果:", cv_result)
print("参数最佳值{}".format(optimized_XGB.best_params_))
print("最佳参数时模型分数:{}".format(optimized_XGB.best_score_))

每轮结果: {'mean_fit_time': array([1.97086782, 3.47999945, 5.0122467 , 5.73091154]), 'std_fit_time': array([0.03435313, 0.10213554, 0.15638116, 0.75338172]), 'mean_score_time': array([0.00501428, 0.00728388, 0.00750213, 0.00423846]), 'std_score_time': array([0.00109571, 0.00188978, 0.00133593, 0.00098611]), 'param_n_estimators': masked_array(data=[50, 100, 150, 200],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 50}, {'n_estimators': 100}, {'n_estimators': 150}, {'n_estimators': 200}], 'split0_test_score': array([0.87068511, 0.87160738, 0.87147563, 0.8671278 ]), 'split1_test_score': array([0.8138369 , 0.81330214, 0.81610963, 0.81717914]), 'split2_test_score': array([0.90127005, 0.89699198, 0.89725936, 0.89993316]), 'split3_test_score': array([0.8540107 , 0.86296791, 0.86390374, 0.86176471]), 'split4_test_score': array([0.91304348, 0.90692727, 0.89868369, 0.89602446]), 'mean_test_score': array([0.87056925, 0.870

##2.再来调eta

In [48]:
SEED = 2020
cv_params = {'eta': [1e-4, 5e-3, 1e-3, 5e-2, 1e-2, 0.1]}
fixed_params = {
        'n_estimators': 50,
        'max_depth': 8,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_lambda': 0.1,
        'reg_alpha': 0.1,
        'gamma': 0.1,
        'seed':SEED,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor'
        }

In [49]:
model = xgb.XGBClassifier(**fixed_params)
optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1, )
optimized_XGB.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   12.1s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, gamma=0.1,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=8, min_child_weight=1,
                                     missing=None, n_estimators=50, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     predictor='gpu_predictor', random_state=0,
                                     reg_alpha=0.1, reg_lambda=0.1,
                                     scale_pos_weight=1, seed=2020, silent=None,
                                     subsample=0.8, tree_method='gpu_hist',
                                     verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'eta': [0.0001, 0.00

In [50]:
cv_result = optimized_XGB.cv_results_
print("每轮结果:", cv_result)
print("参数最佳值{}".format(optimized_XGB.best_params_))
print("最佳参数时模型分数:{}".format(optimized_XGB.best_score_))

每轮结果: {'mean_fit_time': array([0.78393226, 0.79432197, 0.78639483, 0.78861899, 0.78311682,
       0.79058104]), 'std_fit_time': array([0.03190882, 0.01153002, 0.03931565, 0.01593621, 0.02149259,
       0.01677429]), 'mean_score_time': array([0.00556259, 0.00541902, 0.00622802, 0.00527864, 0.00539079,
       0.00454397]), 'std_score_time': array([0.00057804, 0.00082089, 0.00103792, 0.00071185, 0.00080593,
       0.00078153]), 'param_eta': masked_array(data=[0.0001, 0.005, 0.001, 0.05, 0.01, 0.1],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'eta': 0.0001}, {'eta': 0.005}, {'eta': 0.001}, {'eta': 0.05}, {'eta': 0.01}, {'eta': 0.1}], 'split0_test_score': array([0.87068511, 0.87068511, 0.87068511, 0.87068511, 0.87068511,
       0.87068511]), 'split1_test_score': array([0.8138369, 0.8138369, 0.8138369, 0.8138369, 0.8138369, 0.8138369]), 'split2_test_score': array([0.90127005, 0.90127005, 0.90127005, 0.90127005, 0

##3.调节max_depth

In [51]:
SEED = 2020
cv_params = {'max_depth': [3, 5, 7, 9, 11, 13, 15]}
fixed_params = {
        'eta': 1e-4,
        'n_estimators': 50,
        # 'max_depth': 8,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_lambda': 0.1,
        'reg_alpha': 0.1,
        'gamma': 0.1,
        'seed':SEED,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor'
        }

In [52]:
model = xgb.XGBClassifier(**fixed_params)
optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1, )
optimized_XGB.fit(X, y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   13.5s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, eta=0.0001,
                                     gamma=0.1, learning_rate=0.1,
                                     max_delta_step=0, max_depth=3,
                                     min_child_weight=1, missing=None,
                                     n_estimators=50, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     predictor='gpu_predictor', random_state=0,
                                     reg_alpha=0.1, reg_lambda=0.1,
                                     scale_pos_weight=1, seed=2020, silent=None,
                                     subsample=0.8, tree_method='gpu_hist',
                                     verbosity=1),
             iid='deprecated', n_jobs=

In [53]:
cv_result = optimized_XGB.cv_results_
print("每轮结果:", cv_result)
print("参数最佳值{}".format(optimized_XGB.best_params_))
print("最佳参数时模型分数:{}".format(optimized_XGB.best_score_))

每轮结果: {'mean_fit_time': array([0.34111514, 0.49554572, 0.68136458, 0.850109  , 0.94185147,
       0.98529663, 0.92862816]), 'std_fit_time': array([0.01236393, 0.02044822, 0.01989193, 0.03562242, 0.02319972,
       0.043426  , 0.151972  ]), 'mean_score_time': array([0.00752678, 0.00601721, 0.00519514, 0.00505009, 0.00537682,
       0.00584059, 0.00517554]), 'std_score_time': array([0.00505329, 0.00065057, 0.00051231, 0.00019179, 0.00043296,
       0.00062884, 0.00115313]), 'param_max_depth': masked_array(data=[3, 5, 7, 9, 11, 13, 15],
             mask=[False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 3}, {'max_depth': 5}, {'max_depth': 7}, {'max_depth': 9}, {'max_depth': 11}, {'max_depth': 13}, {'max_depth': 15}], 'split0_test_score': array([0.87549407, 0.87793149, 0.87068511, 0.86725955, 0.86884058,
       0.87384717, 0.87200264]), 'split1_test_score': array([0.80100267, 0.81470588, 0.81437166, 0.81717914, 0.8

##4.调节min_child_weight




In [54]:
SEED = 2020
cv_params = {'min_child_weight': [1, 3, 5, 7, 9, 11, 13, 15]}
fixed_params = {
        'eta': 1e-4,
        'n_estimators': 50,
        'max_depth': 15,
        # 'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_lambda': 0.1,
        'reg_alpha': 0.1,
        'gamma': 0.1,
        'seed':SEED,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor'
        }

In [55]:
model = xgb.XGBClassifier(**fixed_params)
optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1, )
optimized_XGB.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   11.0s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, eta=0.0001,
                                     gamma=0.1, learning_rate=0.1,
                                     max_delta_step=0, max_depth=15,
                                     min_child_weight=1, missing=None,
                                     n_estimators=50, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     predictor='gpu_predictor', random_state=0,
                                     reg_alpha=0.1, reg_lambda=0.1,
                                     scale_pos_weight=1, seed=2020, silent=None,
                                     subsample=0.8, tree_method='gpu_hist',
                                     verbosity=1),
             iid='deprecated', n_jobs

In [56]:
cv_result = optimized_XGB.cv_results_
print("每轮结果:", cv_result)
print("参数最佳值{}".format(optimized_XGB.best_params_))
print("最佳参数时模型分数:{}".format(optimized_XGB.best_score_))

每轮结果: {'mean_fit_time': array([1.02854362, 0.6071228 , 0.50521631, 0.46808972, 0.42857432,
       0.41834879, 0.39690003, 0.35146747]), 'std_fit_time': array([0.0605476 , 0.03285861, 0.01796376, 0.01169402, 0.01275798,
       0.01065689, 0.01766939, 0.07448436]), 'mean_score_time': array([0.0052206 , 0.00548019, 0.00534625, 0.00483546, 0.00493088,
       0.00478988, 0.00476856, 0.00461946]), 'std_score_time': array([0.00050926, 0.00049977, 0.00096154, 0.00061707, 0.00069148,
       0.00060287, 0.00054449, 0.00089622]), 'param_min_child_weight': masked_array(data=[1, 3, 5, 7, 9, 11, 13, 15],
             mask=[False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'min_child_weight': 1}, {'min_child_weight': 3}, {'min_child_weight': 5}, {'min_child_weight': 7}, {'min_child_weight': 9}, {'min_child_weight': 11}, {'min_child_weight': 13}, {'min_child_weight': 15}], 'split0_test_score': array([0.87200264, 0.878722  , 0.8726614

##5.调节subsample和colsample_bytree

In [57]:
SEED = 2020
cv_params = {'subsample': [0.3, 0.5, 0.6, 0.7, 0.8, 0.9]}
fixed_params = {
        'eta': 1e-4,
        'n_estimators': 50,
        'max_depth': 15,
        'min_child_weight': 3,
        # 'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_lambda': 0.1,
        'reg_alpha': 0.1,
        'gamma': 0.1,
        'seed':SEED,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor'
        }

In [58]:
model = xgb.XGBClassifier(**fixed_params)
optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1, )
optimized_XGB.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    8.7s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, eta=0.0001,
                                     gamma=0.1, learning_rate=0.1,
                                     max_delta_step=0, max_depth=15,
                                     min_child_weight=3, missing=None,
                                     n_estimators=50, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     predictor='gpu_predictor', random_state=0,
                                     reg_alpha=0.1, reg_lambda=0.1,
                                     scale_pos_weight=1, seed=2020, silent=None,
                                     subsample=1, tree_method='gpu_hist',
                                     verbosity=1),
             iid='deprecated', n_jobs=-

In [59]:
cv_result = optimized_XGB.cv_results_
print("每轮结果:", cv_result)
print("参数最佳值{}".format(optimized_XGB.best_params_))
print("最佳参数时模型分数:{}".format(optimized_XGB.best_score_))

每轮结果: {'mean_fit_time': array([0.43708768, 0.52094293, 0.54455209, 0.58706832, 0.62721395,
       0.60642319]), 'std_fit_time': array([0.01049597, 0.02297427, 0.01201353, 0.02792849, 0.00807993,
       0.08976797]), 'mean_score_time': array([0.00647306, 0.00596566, 0.00530581, 0.00672669, 0.00521293,
       0.0047245 ]), 'std_score_time': array([0.00263034, 0.00051566, 0.00041786, 0.00120522, 0.00035705,
       0.00094035]), 'param_subsample': masked_array(data=[0.3, 0.5, 0.6, 0.7, 0.8, 0.9],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'subsample': 0.3}, {'subsample': 0.5}, {'subsample': 0.6}, {'subsample': 0.7}, {'subsample': 0.8}, {'subsample': 0.9}], 'split0_test_score': array([0.85592885, 0.86963109, 0.87239789, 0.87147563, 0.878722  ,
       0.87964427]), 'split1_test_score': array([0.82045455, 0.81463904, 0.82339572, 0.81878342, 0.81637701,
       0.81838235]), 'split2_test_score': array([0.89324866, 

In [61]:
SEED = 2020
cv_params = {'colsample_bytree': [0.3, 0.5, 0.6, 0.7, 0.8, 0.9]}
fixed_params = {
        'eta': 1e-4,
        'n_estimators': 50,
        'max_depth': 15,
        'min_child_weight': 3,
        'subsample': 0.8,
        # 'colsample_bytree': 0.8,
        'reg_lambda': 0.1,
        'reg_alpha': 0.1,
        'gamma': 0.1,
        'seed':SEED,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor'
        }

In [62]:
model = xgb.XGBClassifier(**fixed_params)
optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1, )
optimized_XGB.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    9.2s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, eta=0.0001, gamma=0.1,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=15, min_child_weight=3,
                                     missing=None, n_estimators=50, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     predictor='gpu_predictor', random_state=0,
                                     reg_alpha=0.1, reg_lambda=0.1,
                                     scale_pos_weight=1, seed=2020, silent=None,
                                     subsample=0.8, tree_method='gpu_hist',
                                     verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'colsampl

In [63]:
cv_result = optimized_XGB.cv_results_
print("每轮结果:", cv_result)
print("参数最佳值{}".format(optimized_XGB.best_params_))
print("最佳参数时模型分数:{}".format(optimized_XGB.best_score_))

每轮结果: {'mean_fit_time': array([0.55026426, 0.59251771, 0.60416899, 0.60876675, 0.6167788 ,
       0.57918034]), 'std_fit_time': array([0.01694185, 0.02190984, 0.00455646, 0.00988367, 0.00929614,
       0.07615666]), 'mean_score_time': array([0.00577297, 0.00568962, 0.00548048, 0.00547042, 0.00570989,
       0.00556664]), 'std_score_time': array([0.00141506, 0.0008376 , 0.00045586, 0.00057581, 0.00052207,
       0.00153038]), 'param_colsample_bytree': masked_array(data=[0.3, 0.5, 0.6, 0.7, 0.8, 0.9],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'colsample_bytree': 0.3}, {'colsample_bytree': 0.5}, {'colsample_bytree': 0.6}, {'colsample_bytree': 0.7}, {'colsample_bytree': 0.8}, {'colsample_bytree': 0.9}], 'split0_test_score': array([0.86805007, 0.87160738, 0.87147563, 0.87503294, 0.878722  ,
       0.88043478]), 'split1_test_score': array([0.81530749, 0.82052139, 0.81898396, 0.81731283, 0.81637701,
       0.823

##6.调节lambda和alpha

In [64]:
SEED = 2020
cv_params = {'reg_lambda': [0.01, 0.1, 10, 50, 100]}
fixed_params = {
        'eta': 1e-4,
        'n_estimators': 50,
        'max_depth': 15,
        'min_child_weight': 3,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        # 'reg_lambda': 0.1,
        'reg_alpha': 0.1,
        'gamma': 0.1,
        'seed':SEED,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor'
        }

In [65]:
model = xgb.XGBClassifier(**fixed_params)
optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1, )
optimized_XGB.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    6.7s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, eta=0.0001,
                                     gamma=0.1, learning_rate=0.1,
                                     max_delta_step=0, max_depth=15,
                                     min_child_weight=3, missing=None,
                                     n_estimators=50, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     predictor='gpu_predictor', random_state=0,
                                     reg_alpha=0.1, reg_lambda=1,
                                     scale_pos_weight=1, seed=2020, silent=None,
                                     subsample=0.8, tree_method='gpu_hist',
                                     verbosity=1),
             iid='deprecated', n_jobs=-

In [66]:
cv_result = optimized_XGB.cv_results_
print("每轮结果:", cv_result)
print("参数最佳值{}".format(optimized_XGB.best_params_))
print("最佳参数时模型分数:{}".format(optimized_XGB.best_score_))

每轮结果: {'mean_fit_time': array([0.64272504, 0.62641859, 0.48316941, 0.41912675, 0.36391215]), 'std_fit_time': array([0.0339422 , 0.01794728, 0.02095507, 0.01671348, 0.08484494]), 'mean_score_time': array([0.00590215, 0.00538368, 0.005235  , 0.00487971, 0.00475225]), 'std_score_time': array([0.00140502, 0.0007724 , 0.00086332, 0.00081788, 0.00092795]), 'param_reg_lambda': masked_array(data=[0.01, 0.1, 10, 50, 100],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'reg_lambda': 0.01}, {'reg_lambda': 0.1}, {'reg_lambda': 10}, {'reg_lambda': 50}, {'reg_lambda': 100}], 'split0_test_score': array([0.87055336, 0.878722  , 0.86455863, 0.86772069, 0.86791831]), 'split1_test_score': array([0.82025401, 0.81637701, 0.81584225, 0.81276738, 0.82372995]), 'split2_test_score': array([0.90233957, 0.89886364, 0.89318182, 0.88850267, 0.8822861 ]), 'split3_test_score': array([0.87459893, 0.87887701, 0.87486631, 0.85989305, 0.86089572]), 's

In [67]:
SEED = 2020
cv_params = {'reg_alpha': [0.01, 0.1, 10, 50, 100]}
fixed_params = {
        'eta': 1e-4,
        'n_estimators': 50,
        'max_depth': 15,
        'min_child_weight': 3,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_lambda': 0.1,
        # 'reg_alpha': 0.1,
        'gamma': 0.1,
        'seed':SEED,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor'
        }

In [68]:
model = xgb.XGBClassifier(**fixed_params)
optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1, )
optimized_XGB.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    6.0s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, eta=0.0001,
                                     gamma=0.1, learning_rate=0.1,
                                     max_delta_step=0, max_depth=15,
                                     min_child_weight=3, missing=None,
                                     n_estimators=50, n_jobs=1, nthread=None,
                                     objective='binary:logistic',
                                     predictor='gpu_predictor', random_state=0,
                                     reg_alpha=0, reg_lambda=0.1,
                                     scale_pos_weight=1, seed=2020, silent=None,
                                     subsample=0.8, tree_method='gpu_hist',
                                     verbosity=1),
             iid='deprecated', n_jobs=-

In [69]:
cv_result = optimized_XGB.cv_results_
print("每轮结果:", cv_result)
print("参数最佳值{}".format(optimized_XGB.best_params_))
print("最佳参数时模型分数:{}".format(optimized_XGB.best_score_))

每轮结果: {'mean_fit_time': array([0.63597422, 0.62608085, 0.3942028 , 0.34525862, 0.28737803]), 'std_fit_time': array([0.01477973, 0.01032729, 0.01537306, 0.01516913, 0.0484985 ]), 'mean_score_time': array([0.00603008, 0.00536957, 0.00480642, 0.00484819, 0.00459123]), 'std_score_time': array([0.00097383, 0.00071669, 0.00055792, 0.00038274, 0.00098055]), 'param_reg_alpha': masked_array(data=[0.01, 0.1, 10, 50, 100],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'reg_alpha': 0.01}, {'reg_alpha': 0.1}, {'reg_alpha': 10}, {'reg_alpha': 50}, {'reg_alpha': 100}], 'split0_test_score': array([0.87714097, 0.878722  , 0.86936759, 0.8583004 , 0.82602108]), 'split1_test_score': array([0.81276738, 0.81637701, 0.81537433, 0.81590909, 0.81477273]), 'split2_test_score': array([0.89872995, 0.89886364, 0.88629679, 0.84766043, 0.82152406]), 'split3_test_score': array([0.87727273, 0.87887701, 0.85828877, 0.82600267, 0.75534759]), 'split4_

In [70]:
SEED = 2020
cv_params = {'gamma': [0.01, 0.1, 5, 10, 20]}
fixed_params = {
        'eta': 1e-4,
        'n_estimators': 50,
        'max_depth': 15,
        'min_child_weight': 3,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_lambda': 0.1,
        'reg_alpha': 0.1,
        # 'gamma': 0.1,
        'seed':SEED,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor'
        }

In [71]:
model = xgb.XGBClassifier(**fixed_params)
optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1, )
optimized_XGB.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    7.9s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, eta=0.0001, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=15, min_child_weight=3,
                                     missing=None, n_estimators=50, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     predictor='gpu_predictor', random_state=0,
                                     reg_alpha=0.1, reg_lambda=0.1,
                                     scale_pos_weight=1, seed=2020, silent=None,
                                     subsample=0.8, tree_method='gpu_hist',
                                     verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'gamma': 

In [72]:
cv_result = optimized_XGB.cv_results_
print("每轮结果:", cv_result)
print("参数最佳值{}".format(optimized_XGB.best_params_))
print("最佳参数时模型分数:{}".format(optimized_XGB.best_score_))

每轮结果: {'mean_fit_time': array([0.62123842, 0.62062287, 0.63552232, 0.62420659, 0.55225496]), 'std_fit_time': array([0.04079127, 0.01526135, 0.01708355, 0.03024109, 0.11191989]), 'mean_score_time': array([0.00608149, 0.00545659, 0.00532842, 0.00533862, 0.00491285]), 'std_score_time': array([0.00124101, 0.00090832, 0.00056707, 0.00057685, 0.0011712 ]), 'param_gamma': masked_array(data=[0.01, 0.1, 5, 10, 20],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'gamma': 0.01}, {'gamma': 0.1}, {'gamma': 5}, {'gamma': 10}, {'gamma': 20}], 'split0_test_score': array([0.878722, 0.878722, 0.878722, 0.878722, 0.878722]), 'split1_test_score': array([0.81637701, 0.81637701, 0.81637701, 0.81637701, 0.81637701]), 'split2_test_score': array([0.89886364, 0.89886364, 0.89886364, 0.89886364, 0.89886364]), 'split3_test_score': array([0.87887701, 0.87887701, 0.87887701, 0.87887701, 0.87887701]), 'split4_test_score': array([0.91769711, 0.9176

##7.最终参数

In [23]:
SEED = 2020

fixed_params = {
        'eta': 1e-4,
        'n_estimators': 50,
        'max_depth': 15,
        'min_child_weight': 3,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_lambda': 0.1,
        'reg_alpha': 0.1,
        'gamma': 0.01,
        'seed':SEED,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor'
        }

In [24]:
n_splits = 10
test_preds = None
kf_f1 = [] #每个分fold的验证集f1列表

for fold, (train_idx, valid_idx) in enumerate(KFold(n_splits=n_splits, shuffle=True).split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    model = xgb.XGBClassifier(**fixed_params)
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='error', verbose=False)

    valid_pred = model.predict(X_val)
    f1 = f1_metric(y_val, valid_pred)
    print(f'kFold {fold + 1} / {n_splits} f1:{f1:.4f}')
    kf_f1.append(f1)

    if test_preds is None:
        test_preds = model.predict(test_data)
    else:
        test_preds += model.predict(test_data)



kFold 1 / 10 f1:0.7576
kFold 2 / 10 f1:0.7273
kFold 3 / 10 f1:0.8837
kFold 4 / 10 f1:0.7733
kFold 5 / 10 f1:0.7692
kFold 6 / 10 f1:0.8358
kFold 7 / 10 f1:0.6800
kFold 8 / 10 f1:0.6383
kFold 9 / 10 f1:0.7368
kFold 10 / 10 f1:0.7246


In [26]:
test_preds = test_preds / n_splits
print(f'Average KFold f1 score :{np.mean(np.array(kf_f1)):.5f}')
sample['Survived'] = (test_preds > 0.5).astype(int)
sample.to_csv('/content/xgb_CV_tuning.csv', index=False)

Average KFold f1 score :0.75267


In [27]:
!rm -f /root/.kaggle/kaggle.json
!mkdir /root/.kaggle/
!cp /content/drive/MyDrive/kaggle.json /root/.kaggle/kaggle.json

In [28]:
!kaggle competitions submit -c titanic -f /content/xgb_CV_tuning.csv -m "optuna_CV_tuning"

100% 2.77k/2.77k [00:03<00:00, 868B/s]
Successfully submitted to Titanic - Machine Learning from Disaster