### Decision Tree Model Tuning for 2016 - 2020 Data (imports)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import modelinghelper as helper

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from adspy_shared_utilities import plot_feature_importances
#from adspy_shared_utilities import plot_decision_tree


import joblib
import os
from datetime import datetime

# suppress future warnings
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [2]:
# Seed passed as random_state to the classifiers built in this notebook.
rng = 42

# Folder that receives this run's saved artifacts, plus the dataset label
# used when naming them.
output_path = 'import_run8_TargetEncoding_gridsearchCV'
dataset = 'import'

# Create the results folder on first run; otherwise just report that it is there.
if os.path.exists(output_path):
    print("Folder already exists")
else:
    os.makedirs(output_path)

# Common path prefix for every file written by this notebook.
prefix = f'{output_path}/{dataset}'

In [3]:
# Location and file name of the cleaned input CSV.
p = '../data/'
input_name = 'cleaned.csv'

# Build the full path once, then load the file into a DataFrame.
csv_path = f'{p}{input_name}'
df = pd.read_csv(csv_path)

In [4]:
# Cast the shipment-month column to strings (in place on df).
# NOTE(review): presumably so the month is encoded as a categorical feature
# rather than treated as a number — confirm against the helper's encoder.
df['ship_date_mm'] = df['ship_date_mm'].astype(str)

In [5]:
# Summary statistics of the numeric columns, as a quick sanity check of the loaded data.
df.describe()

Unnamed: 0,control_number,qty,value,qty_new,ship_date_yyyy
count,1890132.0,1890132.0,1890132.0,1890132.0,1890132.0
mean,2018142000.0,1955213.0,9658.016,1954949.0,2017.558
std,1263350.0,264407800.0,935505.3,264407800.0,1.279415
min,2014539000.0,-1.0,0.0,-1.0,2016.0
25%,2016978000.0,1.5,51.0,1.0,2016.0
50%,2017984000.0,9.0,318.0,8.0,2017.0
75%,2019505000.0,93.0,2104.0,86.0,2019.0
max,2020755000.0,99907080000.0,1242236000.0,99907080000.0,2020.0


In [6]:
# Columns that are not meant to be model inputs.
# NOTE(review): 'wildlf_cat' is listed here AND in feature_cols below —
# confirm which list is correct.
non_feature_cols = [
    'control_number', 'disp_date', 'i_e', 'ship_date',
    'cartons', 'qty', 'unit', 'specific_generic_name',
    'genus', 'species', 'wildlf_cat',
    'disp_date_yyyy', 'disp_date_mm', 'disp_ship_date',
]

# Candidate label columns.
target = ['act', 'dp_cd']

# The 11 columns handed to the filtering helper as model features.
feature_cols = [
    'species_code', 'wildlf_desc', 'wildlf_cat',
    'ctry_org', 'ctry_ie', 'purp', 'src', 'trans_mode', 'pt_cd',
    'value', 'ship_date_mm',
]

# Filter the raw frame with i_e='I' and the feature list above.
# NOTE(review): the result is named export_df although i_e='I' and
# dataset='import' suggest these are import records — the name is kept
# because the next cell reads it; consider renaming both together.
export_df = helper.df_filtering(df, i_e='I', f_cols=feature_cols)
# Recorded sizes from earlier runs:
# import: 590505 rows × 11 columns
# export: 299340 rows × 11 columns

#### Modeling Pipeline

In [7]:
# Split the filtered frame into train/test features and labels via the project helper.
# NOTE(review): split ratio, stratification and seeding are decided inside
# helper.data_split and are not visible here — confirm the split is seeded.
X_train, X_test, y_train, y_test = helper.data_split(export_df)

#### Decision Tree

In [8]:
clf = DecisionTreeClassifier(random_state=rng)
dt_params={
    'clf__class_weight':[None,
                         'balanced', 
                         #{0:100, 1:1}, 
                         {0:50, 1:1}],
    'clf__max_depth': [2,3,4,5,6,7,8,9,10, None]
}

%time dt_pipe = helper.gridsearch_pipeline(X_train, y_train, clf, dt_params) 
dt_pipe

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END clf__class_weight=None, clf__max_depth=2;, score=0.300 total time=  10.2s
[CV 2/5] END clf__class_weight=None, clf__max_depth=2;, score=0.303 total time=  10.3s
[CV 3/5] END clf__class_weight=None, clf__max_depth=2;, score=0.303 total time=  10.4s
[CV 4/5] END clf__class_weight=None, clf__max_depth=2;, score=0.231 total time=  10.2s
[CV 5/5] END clf__class_weight=None, clf__max_depth=2;, score=0.222 total time=  10.2s
[CV 1/5] END clf__class_weight=None, clf__max_depth=3;, score=0.420 total time=  10.5s
[CV 2/5] END clf__class_weight=None, clf__max_depth=3;, score=0.431 total time=  10.4s
[CV 3/5] END clf__class_weight=None, clf__max_depth=3;, score=0.428 total time=  10.7s
[CV 4/5] END clf__class_weight=None, clf__max_depth=3;, score=0.392 total time=  10.5s
[CV 5/5] END clf__class_weight=None, clf__max_depth=3;, score=0.386 total time=  10.3s
[CV 1/5] END clf__class_weight=None, clf__max_depth=4;, score=0.410 

In [9]:
# Persist dt_pipe under the run prefix so it can be reloaded with joblib.load without refitting.
joblib.dump(dt_pipe, f'{prefix}_dt_pipe.joblib')

['import_run8_TargetEncoding_gridsearchCV/import_dt_pipe.joblib']

#### Random Forest

In [None]:
clf = RandomForestClassifier(random_state=rng)
rf_params={
    'clf__n_estimators':[4,6,8,10, 20, 30, 50, 100],
    'clf__max_depth': [None, 3, 4, 5,7,8,9, 10],
    'clf__class_weight':['balanced', None]
}

%time rf_pipe = helper.gridsearch_pipeline(X_train, y_train, clf, rf_params) 
rf_pipe

Fitting 5 folds for each of 128 candidates, totalling 640 fits
[CV 1/5] END clf__class_weight=balanced, clf__max_depth=None, clf__n_estimators=4;, score=0.553 total time=  12.9s
[CV 2/5] END clf__class_weight=balanced, clf__max_depth=None, clf__n_estimators=4;, score=0.571 total time=  12.7s
[CV 3/5] END clf__class_weight=balanced, clf__max_depth=None, clf__n_estimators=4;, score=0.560 total time=  13.1s
[CV 4/5] END clf__class_weight=balanced, clf__max_depth=None, clf__n_estimators=4;, score=0.567 total time=  13.0s
[CV 5/5] END clf__class_weight=balanced, clf__max_depth=None, clf__n_estimators=4;, score=0.557 total time=  12.9s
[CV 1/5] END clf__class_weight=balanced, clf__max_depth=None, clf__n_estimators=6;, score=0.571 total time=  14.5s
[CV 2/5] END clf__class_weight=balanced, clf__max_depth=None, clf__n_estimators=6;, score=0.586 total time=  14.3s
[CV 3/5] END clf__class_weight=balanced, clf__max_depth=None, clf__n_estimators=6;, score=0.574 total time=  14.2s
[CV 4/5] END clf_

In [11]:
clf = RandomForestClassifier(random_state=rng)
rf_params={
    'clf__n_estimators':[100, 200, 500],
    'clf__max_depth': [5, 6, 7, 8, None],
    'clf__class_weight':['balanced', {0:100, 1:1}, {0:50, 1:1}]
}

%time rf_pipe = gridsearch_pipeline(clf, rf_params) 
rf_pipe

NameError: name 'gridsearch_pipeline' is not defined

NameError: name 'rf_pipe' is not defined

In [None]:
# Persist rf_pipe under the run prefix; requires a random forest grid-search cell
# to have completed so that rf_pipe is defined.
joblib.dump(rf_pipe, f'{prefix}_rf_pipe.joblib')

In [None]:
# Evaluate the decision tree search result on the test split.
# `outputs` toggles whether the matrix is printed and plotted.
outputs = True
dt_predicted = dt_pipe.predict(X_test)

# Class order is [1, 0] for both the computed matrix and its display labels.
confusion = confusion_matrix(y_test, dt_predicted, labels=[1, 0])
cm_display = ConfusionMatrixDisplay(confusion, display_labels=[1, 0])

if outputs:
    print('Decision Tree (best model)\n', confusion)
    cm_display.plot()
    
    