### Decision Tree and Random Forest Model Tuning for 2016 - 2020 Data (exports)

In [16]:
import pandas as pd
import numpy as np
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
# from sklearn.decomposition import PCA, SparsePCA
# from sklearn.covariance import empirical_covariance
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# from matplotlib.colors import ListedColormap, BoundaryNorm
# import matplotlib.patches as mpatches

import seaborn as sns
import joblib
import os
#from adspy_shared_utilities import plot_decision_tree
from datetime import datetime

In [3]:
# Random seed shared by the train/test split and the estimators below
rng = 42

# Folder that collects this run's saved artifacts.
# exist_ok=True keeps the cell re-runnable: without it a second execution
# (or Restart & Run All) raises FileExistsError because the folder already exists.
results_dir = 'export_run4_TargetEncoding_gridsearchCV'
os.makedirs(results_dir, exist_ok=True)

# Common path prefix for every file written by this notebook
prefix = f'{results_dir}/export'

In [4]:
# Directory and file name of the input CSV
p = '../data/'
input_name = 'cleaned.csv'

# Load the whole file into a single DataFrame
df = pd.read_csv(p + input_name)

In [5]:
# filter for exports

def df_filtering(df, i_e = 'I', f_cols = None):
    """Keep the rows for one ``i_e`` value and the requested columns plus ``act``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``i_e`` column, an ``act`` column and every column
        named in ``f_cols``.
    i_e : str, default 'I'
        Value of the ``i_e`` column to keep.
    f_cols : iterable of str, optional
        Feature columns to keep, in the given order. ``None`` (the default)
        or an empty iterable keeps only ``act``.

    Returns
    -------
    pandas.DataFrame
        The matching rows with columns ``f_cols`` followed by ``act``;
        ``act`` is always the last column.
    """
    # None replaces the mutable [] default; iterating also accepts tuples or
    # an Index, which the former `f_cols + ['act']` concatenation rejected.
    requested = f_cols if f_cols is not None else []

    # Leave 'act' out of the feature part so it is never duplicated and
    # always ends up last.
    features = [col for col in requested if col != 'act']

    return df.loc[df['i_e'] == i_e, features + ['act']]

# Columns that are deliberately left out of the feature set
non_feature_cols = [
    'control_number', 'disp_date', 'i_e', 'ship_date',
    'cartons', 'qty', 'unit', 'specific_generic_name',
    'genus', 'species', 'wildlf_cat',
    'disp_date_yyyy', 'disp_date_mm', 'disp_ship_date',
]

# Outcome columns; only 'act' is appended by df_filtering below
target = ['act', 'dp_cd']

# Model inputs, in the column order handed to the pipeline
feature_cols = [
    'species_code', 'wildlf_desc',
    'ctry_org', 'ctry_ie', 'purp', 'src', 'trans_mode', 'pt_cd',
    'value', 'ship_date_mm',
]

# Row/column counts recorded from an earlier run:
#   import: 590505 rows × 11 columns
#   export: 299340 rows × 11 columns
#import_df = df_filtering(df, i_e = 'I', f_cols = feature_cols)
export_df = df_filtering(df, 'E', feature_cols)

#### Modeling Pipeline

In [7]:
def data_split(df):
    """Split a filtered frame into stratified train and test parts.

    Every column except the last one is used as a feature; the last column
    must be named ``act``. The label is 1 where ``act == 'R'`` and 0
    otherwise. The split is seeded with the notebook-level ``rng`` and
    stratified on the label.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the ``y`` parts are NumPy
        arrays of 0/1.
    """
    features = df.iloc[:, :-1]
    outcome = df.iloc[:, -1:]

    # Binary label: 'R' -> 1, anything else -> 0
    is_positive = outcome['act'] == 'R'
    y = np.where(is_positive, 1, 0)

    parts = train_test_split(features, y, random_state=rng, stratify=y)
    return tuple(parts)


X_train, X_test, y_train, y_test = data_split(export_df)

In [42]:
def gridsearch_pipeline(classifer, grid_param, X=None, y=None):
    """Grid-search a preprocessing + classifier pipeline, scored on recall.

    The pipeline standard-scales ``value``, target-encodes the categorical
    columns listed below, passes every remaining column through unchanged,
    and then fits ``classifer``. The search uses 5-fold cross-validation.

    Parameters
    ----------
    classifer : estimator
        Classifier placed in the pipeline step named ``'clf'``.
    grid_param : dict
        Parameter grid for GridSearchCV. Classifier parameters must be
        prefixed with ``clf__`` to address the ``'clf'`` step.
    X, y : optional
        Training features and labels. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, so existing two-argument calls
        behave exactly as before.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # Explicit data makes the function reusable on another split; the
    # fallback keeps the original behaviour for callers that pass nothing.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    categorical_var = ['species_code', 'wildlf_desc', 'ctry_org', 'ctry_ie','purp', 'src', 
                   'trans_mode', 'pt_cd']
    numerical_var = ['value']
    
    # remainder='passthrough' keeps columns not named above (unmodified)
    ct_target = make_column_transformer(
                            (StandardScaler(), numerical_var),
                            (TargetEncoder(), categorical_var),
                            remainder='passthrough')

    pipe = Pipeline([('transformer', ct_target), 
                     ('clf', classifer)
                    ], verbose=False)

    grid_pipe = GridSearchCV(pipe,
                             param_grid=grid_param,
                             scoring='recall',
                             cv=5,
                             verbose=3)

    grid_pipe.fit(X, y)
    print('Grid best parameter (max. recall): ', grid_pipe.best_params_)
    print('Grid best score (recall): ', grid_pipe.best_score_)
    
    return grid_pipe

In [31]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning) # suppress all FutureWarning messages from this point on

#### Decision Tree

In [43]:
clf = DecisionTreeClassifier(random_state=rng)
dt_params={
    'clf__class_weight':['balanced', {0:100, 1:1}, {0:50, 1:1}],
    'clf__max_depth': [5, 6, 7, 8, None]
}

%time dt_pipe = gridsearch_pipeline(clf, dt_params) 
dt_pipe

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END clf__class_weight=balanced, clf__max_depth=5;, score=0.610 total time=   1.4s
[CV 2/5] END clf__class_weight=balanced, clf__max_depth=5;, score=0.681 total time=   1.4s
[CV 3/5] END clf__class_weight=balanced, clf__max_depth=5;, score=0.725 total time=   1.3s
[CV 4/5] END clf__class_weight=balanced, clf__max_depth=5;, score=0.630 total time=   1.3s
[CV 5/5] END clf__class_weight=balanced, clf__max_depth=5;, score=0.653 total time=   1.3s
[CV 1/5] END clf__class_weight=balanced, clf__max_depth=6;, score=0.579 total time=   1.4s
[CV 2/5] END clf__class_weight=balanced, clf__max_depth=6;, score=0.642 total time=   1.4s
[CV 3/5] END clf__class_weight=balanced, clf__max_depth=6;, score=0.647 total time=   1.4s
[CV 4/5] END clf__class_weight=balanced, clf__max_depth=6;, score=0.645 total time=   1.4s
[CV 5/5] END clf__class_weight=balanced, clf__max_depth=6;, score=0.640 total time=   1.4s
[CV 1/5] END clf__class_weigh

In [39]:
# Save the decision-tree grid-search object under the run folder prefix.
# NOTE(review): this cell's execution count (39) is lower than the fit cell's (43);
# re-run it after fitting so the file on disk matches the latest dt_pipe.
joblib.dump(dt_pipe, f'{prefix}_dt_pipe.joblib')

['export_run4_TargetEncoding_gridsearchCV/export_dt_pipe.joblib']

#### Random Forest

In [None]:
clf = RandomForestClassifier(random_state=rng)
rf_params={
    'clf__n_estimators':[100, 200, 500],
    'clf__max_depth': [5, 6, 7, 8, None],
    'clf__class_weight':['balanced', {0:100, 1:1}, {0:50, 1:1}]
}

%time rf_pipe = gridsearch_pipeline(clf, rf_params) 
rf_pipe

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV 1/5] END clf__class_weight=balanced, clf__max_depth=5, clf__n_estimators=100;, score=0.548 total time=   6.3s
[CV 2/5] END clf__class_weight=balanced, clf__max_depth=5, clf__n_estimators=100;, score=0.595 total time=   6.2s
[CV 3/5] END clf__class_weight=balanced, clf__max_depth=5, clf__n_estimators=100;, score=0.621 total time=   6.3s
[CV 4/5] END clf__class_weight=balanced, clf__max_depth=5, clf__n_estimators=100;, score=0.552 total time=   6.2s
[CV 5/5] END clf__class_weight=balanced, clf__max_depth=5, clf__n_estimators=100;, score=0.598 total time=   6.3s
[CV 1/5] END clf__class_weight=balanced, clf__max_depth=5, clf__n_estimators=200;, score=0.558 total time=  11.2s
[CV 2/5] END clf__class_weight=balanced, clf__max_depth=5, clf__n_estimators=200;, score=0.610 total time=  11.3s
[CV 3/5] END clf__class_weight=balanced, clf__max_depth=5, clf__n_estimators=200;, score=0.629 total time=  11.3s
[CV 4/5] END clf__class_we

In [None]:
# Save the random-forest grid-search object under the run folder prefix.
joblib.dump(rf_pipe, f'{prefix}_rf_pipe.joblib')