# Implementing GridSearchCV for LightGBM (LGBM)

In [11]:
import pandas as pd
import numpy as np
import modelinghelper as helper
import lightgbm as lgb
import os
import joblib
# suppress future warnings
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [2]:
# Integer seed; passed as `random_state` to the estimator so reruns are comparable
rng = 42

# Folder where results are saved, plus run identifiers
# (dataset / mod_name / run_num are not referenced in the visible cells)
output_path = 'gridsearch_all_models'
dataset = 'export'
mod_name = "LGBM"
run_num = "run_1"

# Create the output folder. `exist_ok=True` avoids the check-then-create race,
# and a regular file sitting at this path now raises FileExistsError here
# instead of being reported as an existing folder and failing later on save.
folder_existed = os.path.isdir(output_path)
os.makedirs(output_path, exist_ok=True)
if folder_existed:
    print("Folder already exists")

# Path prefix prepended to saved file names
prefix = output_path

Folder already exists


In [3]:
# Location of the cleaned input file (relative to the notebook's working directory)
p = '../data/'
input_name = 'cleaned.csv'

# Load the CSV, then replace the shipment-month column with its string form
# (presumably so the month is treated as a categorical feature — TODO confirm)
df = pd.read_csv(p + input_name)
df = df.assign(ship_date_mm=df['ship_date_mm'].astype(str))

In [4]:
# Columns labelled as non-features (this list is not referenced in the visible cells).
# NOTE(review): 'wildlf_cat' appears both here and in feature_cols — confirm which is intended.
non_feature_cols = [
    'control_number',
    'disp_date',
    'i_e',
    'ship_date',
    'cartons',
    'qty',
    'unit',
    'specific_generic_name',
    'genus',
    'species',
    'wildlf_cat',
    'disp_date_yyyy',
    'disp_date_mm',
    'disp_ship_date',
]

# Candidate target columns (not referenced in the visible cells)
target = ['act', 'dp_cd']

# Columns handed to the filtering helper as model features
feature_cols = [
    'species_code',
    'wildlf_desc',
    'wildlf_cat',
    'ctry_org',
    'ctry_ie',
    'purp',
    'src',
    'trans_mode',
    'pt_cd',
    'value',
    'ship_date_mm',
]

# presumably keeps only export ('E') records and the listed feature columns —
# verify against modelinghelper.df_filtering
export_df = helper.df_filtering(
    df,
    i_e='E',
    f_cols=feature_cols,
)

In [5]:
# Split the filtered export frame into four parts; judging by the names these are
# train/test features and train/test labels (split logic lives in modelinghelper).
# NOTE(review): the seed `rng` is not passed here — confirm helper.data_split
# fixes its own random state, otherwise the split is not reproducible.
X_train, X_test, y_train, y_test = helper.data_split(export_df)

### LGBM

In [9]:
# Base estimator: only the seed is fixed here, the rest comes from the grid below
clf_lgbm = lgb.LGBMClassifier(random_state=rng)

# Hyper-parameter grid: 3 * 4 * 3 * 2 * 2 = 144 combinations.
# Keys carry a `clf__` prefix (presumably the pipeline step name used inside
# helper.gridsearch_pipeline — verify against the helper).
lgbm_params = dict(
    clf__n_estimators=[100, 500, 1000],
    clf__max_depth=[10, 20, 30, -1],
    clf__num_leaves=[10, 30, 50],
    clf__learning_rate=[0.05, 0.1],
    clf__is_unbalance=[True, False],
    # open question: add early stopping?
)

# Run the grid search through the project helper and keep what it returns
lgbm_pipe = helper.gridsearch_pipeline(
    X_train,
    y_train,
    clf_lgbm,
    lgbm_params,
)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__num_leaves=10;, score=0.610 total time=   3.5s
[CV 2/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__num_leaves=10;, score=0.652 total time=   3.3s
[CV 3/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__num_leaves=10;, score=0.683 total time=   3.6s
[CV 4/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__num_leaves=10;, score=0.648 total time=   3.3s
[CV 5/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__num_leaves=10;, score=0.653 total time=   3.4s
[CV 1/5] END clf__is_unbalance=True, clf__learning_rate=0.05, clf__max_depth=10, clf__n_estimators=100, clf__num_leaves=30;, score=0.629 total time=   3.5s
[

In [12]:
# Persist the object returned by helper.gridsearch_pipeline into the output
# folder as lgbm_export.joblib; joblib.dump returns the list of written file names,
# which is what the cell displays.
joblib.dump(lgbm_pipe, f'{prefix}/lgbm_export.joblib')

['gridsearch_all_models/lgbm_export.joblib']