# Implementing GridSearchCV for XGBoost

In [14]:
import pandas as pd
import numpy as np
import modelinghelper as helper
import xgboost as xgb
import os
import joblib
# suppress future warnings
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [2]:
# Random seed (handed to the classifier as its random_state).
rng = 42

# Results folder and run identifiers.
output_path = 'gridsearch_all_models'
dataset, mod_name, run_num = 'import', "XGBoost", "run_1"

# Create the results folder when it is missing; otherwise just report it.
if os.path.exists(output_path):
    print("Folder already exists")
else:
    os.makedirs(output_path)

# Path prefix for saved files (currently the results folder itself).
prefix = output_path

Folder already exists


In [3]:
# Relative location and file name of the cleaned dataset.
p = '../data/'
input_name = 'cleaned.csv'

# Load the CSV, then store the shipping month as text rather than a number
# (presumably so it is treated as a categorical feature; verify in modelinghelper).
df = pd.read_csv(p + input_name)
df['ship_date_mm'] = df['ship_date_mm'].astype(str)

In [4]:
# Columns that are NOT used as model inputs.
# 'wildlf_cat' is deliberately absent: it is passed to the model as a feature
# (see feature_cols below), so listing it here as well would be contradictory.
non_feature_cols = ['control_number', 'disp_date', 'i_e', 'ship_date',
                    'cartons', 'qty', 'unit', 'specific_generic_name',
                    'genus', 'species',
                    'disp_date_yyyy', 'disp_date_mm', 'disp_ship_date']

# Candidate target columns (not referenced again in this notebook;
# presumably consumed inside modelinghelper — TODO confirm).
target = ['act', 'dp_cd']

# Columns handed to the helper as model features.
feature_cols = ['species_code', 'wildlf_desc', 'wildlf_cat',
                'ctry_org', 'ctry_ie','purp', 'src', 'trans_mode', 'pt_cd',
                'value', 'ship_date_mm']

# Fail fast if a column is ever declared both a feature and a non-feature.
overlap = set(feature_cols) & set(non_feature_cols)
assert not overlap, f"columns listed as both feature and non-feature: {sorted(overlap)}"

# Filter the full frame with i_e = 'I' (import records, per the `dataset` name
# used in this notebook) and the chosen feature columns.
import_df = helper.df_filtering(df, i_e = 'I', f_cols = feature_cols)

In [5]:
# Split the filtered import data into train and test parts via the project helper.
# NOTE(review): the notebook seed `rng` is not passed here — confirm that
# helper.data_split fixes its own random state so the split is reproducible.
X_train, X_test, y_train, y_test = helper.data_split(import_df)

### XGBoost

In [13]:
# Base estimator. `verbosity=0` already silences XGBoost's logging, so the
# deprecated `silent` flag (its predecessor) is no longer passed.
# NOTE(review): `use_label_encoder` is itself deprecated in newer XGBoost
# releases — confirm against the installed version before removing it.
clf_xgb = xgb.XGBClassifier(random_state=rng, verbosity=0,
                            use_label_encoder=False, tree_method="hist")

# Hyper-parameter grid: 3 x 6 x 5 = 90 candidates.
# The `clf__` prefix presumably targets a pipeline step named "clf" built
# inside helper.gridsearch_pipeline — verify against the helper.
xgb_params = {
    'clf__n_estimators' : [100, 500, 1000],
    "clf__max_depth" : [1, 6, 10, 20, 30, 40],
    # five evenly spaced learning rates from 0.03 to 0.3 inclusive
    "clf__learning_rate" : np.linspace(.03, .3, 5),
    # used to prevent overfitting
    #"clf__subsample": [.75],
    #"clf__colsample_bylevel" : [.5,.6,.7],

     #early stopping?
}

# Run the grid search on the training split (long-running: the captured log
# below shows 5 folds per candidate, i.e. 450 fits).
xgb_pipe = helper.gridsearch_pipeline(X_train, y_train, clf_xgb, xgb_params)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END clf__learning_rate=0.03, clf__max_depth=1, clf__n_estimators=100;, score=0.212 total time=  13.3s
[CV 2/5] END clf__learning_rate=0.03, clf__max_depth=1, clf__n_estimators=100;, score=0.229 total time=  13.2s
[CV 3/5] END clf__learning_rate=0.03, clf__max_depth=1, clf__n_estimators=100;, score=0.215 total time=  13.0s
[CV 4/5] END clf__learning_rate=0.03, clf__max_depth=1, clf__n_estimators=100;, score=0.209 total time=  12.6s
[CV 5/5] END clf__learning_rate=0.03, clf__max_depth=1, clf__n_estimators=100;, score=0.215 total time=  12.5s
[CV 1/5] END clf__learning_rate=0.03, clf__max_depth=1, clf__n_estimators=500;, score=0.430 total time=  15.5s
[CV 2/5] END clf__learning_rate=0.03, clf__max_depth=1, clf__n_estimators=500;, score=0.426 total time=  15.9s
[CV 3/5] END clf__learning_rate=0.03, clf__max_depth=1, clf__n_estimators=500;, score=0.423 total time=  15.3s
[CV 4/5] END clf__learning_rate=0.03, clf__max_dep

In [15]:
# Persist the object returned by helper.gridsearch_pipeline into the results
# folder so it can be reloaded without repeating the search.
joblib.dump(xgb_pipe, f'{prefix}/xgb_pipe_import.joblib')

['gridsearch_all_models/xgb_pipe_import.joblib']