# Implementing GridSearchCV for LGBM

In [27]:
import pandas as pd
import numpy as np
import modelinghelper as helper
import lightgbm as lgb
import os
import joblib
# suppress future warnings
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [3]:
# random seed (handed to the classifier as random_state further down)
rng = 42

# results folder and labels describing this run
output_path = 'gridsearch_all_models'
dataset = 'import'
mod_name = "LGBM"
run_num = "run_1"

# make the results folder on first use; otherwise just report that it is already there
if os.path.exists(output_path):
    print("Folder already exists")
else:
    os.makedirs(output_path)

# leading path component for every file written by this notebook
prefix = f'{output_path}'

Folder already exists


In [5]:
# location of the cleaned dataset, relative to this notebook
p = '../data/'
input_name = 'cleaned.csv'

# load the CSV into a DataFrame
df = pd.read_csv(p + input_name)
# cast the shipment-month column to str
# (presumably so it is handled as a categorical feature rather than a number — confirm)
month_as_text = df['ship_date_mm'].astype(str)
df['ship_date_mm'] = month_as_text

In [8]:
# columns named as non-features (this list is not referenced again in this cell)
# NOTE(review): 'wildlf_cat' appears both here and in feature_cols below —
# confirm which list it really belongs to
non_feature_cols = [
    'control_number',
    'disp_date',
    'i_e',
    'ship_date',
    'cartons',
    'qty',
    'unit',
    'specific_generic_name',
    'genus',
    'species',
    'wildlf_cat',
    'disp_date_yyyy',
    'disp_date_mm',
    'disp_ship_date',
]

# label columns (not passed to df_filtering here; presumably picked up inside the helper — verify)
target = [
    'act',
    'dp_cd',
]

# columns handed to the helper as model inputs
feature_cols = [
    'species_code',
    'wildlf_desc',
    'wildlf_cat',
    'ctry_org',
    'ctry_ie',
    'purp',
    'src',
    'trans_mode',
    'pt_cd',
    'value',
    'ship_date_mm',
]

# filter the full frame with i_e='I' (presumably the import records — confirm in modelinghelper)
import_df = helper.df_filtering(df, f_cols=feature_cols, i_e='I')

In [9]:
# helper.data_split returns four objects, unpacked here as the train/test X and y sets.
# NOTE(review): no seed or test-size argument is passed — confirm the helper fixes its own
# random state, otherwise the split (and every score below) changes between runs.
X_train, X_test, y_train, y_test = helper.data_split(import_df)

### LGBM

In [26]:
# base LightGBM classifier, seeded with rng; verbose=-1 keeps LightGBM's own logging quiet
clf_lgbm = lgb.LGBMClassifier(verbose=-1, random_state=rng)

# hyper-parameter grid: 2 * 5 * 4 * 2 * 2 = 160 combinations
# the `clf__` prefix presumably addresses a pipeline step named `clf` built inside
# helper.gridsearch_pipeline — confirm against modelinghelper
# NOTE(review): with max_depth=5 a tree cannot exceed 2**5 = 32 leaves, so num_leaves=50 and 70
# likely produce the same model for that depth — consider pruning those combinations
# NOTE(review): LightGBM documents is_unbalance for binary / multiclassova objectives —
# verify it has an effect for this target
lgbm_params = dict(
    clf__n_estimators=[500, 1000],
    clf__max_depth=[5, 10, 15, 20, -1],
    clf__num_leaves=[10, 20, 50, 70],
    clf__learning_rate=[0.05, 0.1],
    clf__is_unbalance=[True, False],
)

# run the grid search on the training split via the project helper and keep what it returns
lgbm_pipe = helper.gridsearch_pipeline(X_train, y_train, clf_lgbm, lgbm_params)

Fitting 5 folds for each of 160 candidates, totalling 800 fits
[CV 1/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=500, clf__num_leaves=10;, score=0.823 total time=  19.4s
[CV 2/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=500, clf__num_leaves=10;, score=0.815 total time=  24.0s
[CV 3/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=500, clf__num_leaves=10;, score=0.819 total time=  17.5s
[CV 4/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=500, clf__num_leaves=10;, score=0.821 total time=  16.9s
[CV 5/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=500, clf__num_leaves=10;, score=0.797 total time=  16.8s
[CV 1/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=5, clf__n_estimators=500, clf__num_leaves=20;, score=0.829 total time=  18.9s
[CV 2/5

In [28]:
# Persist the grid-search result to the results folder so it can be reloaded later
# without repeating the search.
# NOTE(review): 'import' is hard-coded in the file name although a `dataset` variable holds
# the same value in the config cell — keep the two in sync if the dataset changes.
joblib.dump(lgbm_pipe, f'{prefix}/lgbm_pipe_import.joblib')

['gridsearch_all_models/lgbm_pipe_import.joblib']