### Baseline Models Template for 2016 - 2020 Data

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA, SparsePCA
from sklearn.covariance import empirical_covariance
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap, BoundaryNorm
import matplotlib.patches as mpatches

# additional library for boosted trees
import xgboost as xgb

#FIGS
from imodels import FIGSClassifier

# imbalanced dataset tool
import imblearn

# target encoding
from category_encoders.target_encoder import TargetEncoder

import seaborn as sns
import joblib
import os
#from adspy_shared_utilities import plot_decision_tree
from datetime import datetime

In [5]:
rng = 42

In [6]:
p = '../data/'
input_name = 'cleaned.csv'

df = pd.read_csv(f'{p}{input_name}')

#### Preprocessing

In [7]:
def df_filtering(df, i_e = 'I', f_cols = []):

    filtered_df = df[df.i_e == i_e]
    filtered_df = filtered_df[f_cols+['act']]
    
    return filtered_df

In [8]:
df.columns

Index(['control_number', 'species_code', 'genus', 'species', 'wildlf_desc',
       'wildlf_cat', 'cartons', 'qty', 'unit', 'value', 'ctry_org', 'ctry_ie',
       'purp', 'src', 'trans_mode', 'act', 'dp_cd', 'disp_date', 'ship_date',
       'i_e', 'pt_cd', 'specific_generic_name', 'disp_date_yyyy',
       'disp_date_mm', 'ship_date_yyyy', 'ship_date_mm', 'disp_ship_date'],
      dtype='object')

In [9]:
non_feature_cols = ['control_number', 'disp_date', 'i_e', 'ship_date', 
                    'cartons', 'qty', 'unit', 'specific_generic_name',
                    'genus', 'species', 'wildlf_cat', 
                    'disp_date_yyyy', 'disp_date_mm', 'disp_ship_date']

target = ['act', 'dp_cd']

feature_cols = ['species_code', 'wildlf_desc', 
                'ctry_org', 'ctry_ie','purp', 'src', 'trans_mode', 'pt_cd', 
                 'value', 'ship_date_mm']

In [10]:
import_df = df_filtering(df, i_e = 'I', f_cols = feature_cols)
#export_df = df_filtering(df, i_e = 'E', f_cols = feature_cols)
# import: 590505 rows × 11 columns
# export: 299340 rows × 11 columns

In [11]:
# set up folder to save results
os.makedirs('import_run3_TargetEncoding_JP')

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'import_run3_TargetEncoding_JP'

In [12]:
prefix = 'import_run3_TargetEncoding_JP/import'

In [13]:
def data_transformation(df):
    X, y = df.iloc[:,:-1], df.iloc[:,-1:]
    # X = pd.get_dummies(X, sparse=True)
    y = np.where(y['act']=='R',1,0)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y) 
    
    return X_train, X_test, y_train, y_test
    

In [14]:
X_train, X_test, y_train, y_test = data_transformation(import_df)

#### column transformer w/ TargetEncoding

In [15]:
categorical_var = ['species_code', 'wildlf_desc', 'ctry_org', 'ctry_ie','purp', 'src', 
                   'trans_mode', 'pt_cd']
numerical_var = ['value']

In [16]:
ct_target = make_column_transformer(
    (StandardScaler(), numerical_var),
    (TargetEncoder(), categorical_var),
    remainder='passthrough')
le = LabelEncoder()



In [17]:
X_train = ct_target.fit_transform(X_train, y_train)
joblib.dump(X_train, 'X_train_targetEncoding.joblib')

X_test = ct_target.transform(X_test)
joblib.dump(X_test, 'X_test_targetEncoding.joblib')
# Found unknown categories 




['X_test_targetEncoding.joblib']

#### Dummy Classifiers

In [18]:
start = datetime.now()
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
end = datetime.now()
print(f'model run time: {end - start}')

# dummy_majority.score(X_test, y_test)
dummy_predicted = dummy_majority.predict(X_test)

model run time: 0:00:00.025819


In [19]:
joblib.dump(dummy_majority, f'{prefix}_dummy_majority_clf.joblib')

['import_run3_TargetEncoding_JP/import_dummy_majority_clf.joblib']

#### Logistic Regression

In [20]:
start = datetime.now()
lr = LogisticRegression(random_state=rng, max_iter=1000).fit(X_train, y_train)
end = datetime.now()
print(f'model run time: {end - start}')

lr_predicted = lr.predict(X_test)
confusion = confusion_matrix(y_test, lr_predicted)
print('Logistic regression classifier (default settings)\n', confusion)

model run time: 0:00:03.829476
Logistic regression classifier (default settings)
 [[390548    568]
 [  3575   2936]]


In [21]:
start = datetime.now()
lr_balanced = LogisticRegression(random_state=rng, class_weight='balanced', max_iter=1000).fit(X_train, y_train)
end = datetime.now()
print(f'model run time: {end - start}')

lr_balanced_predicted = lr_balanced.predict(X_test)
confusion = confusion_matrix(y_test, lr_balanced_predicted)
print('Logistic regression classifier (balanced)\n', confusion)

model run time: 0:00:02.891928
Logistic regression classifier (balanced)
 [[364434  26682]
 [  1587   4924]]


In [22]:
joblib.dump(lr, f'{prefix}_lr_clf.joblib')
joblib.dump(lr_balanced, f'{prefix}_lr_balanced_clf.joblib')

['import_run3_TargetEncoding_JP/import_lr_balanced_clf.joblib']

#### Decision Tree

In [23]:
start = datetime.now()
decision_tree_clf = DecisionTreeClassifier(random_state=rng).fit(X_train, y_train)
end = datetime.now()
print(f'model run time: {end - start}')

model run time: 0:00:05.054390


In [24]:
tree_predicted = decision_tree_clf.predict(X_test)
confusion = confusion_matrix(y_test, tree_predicted)
print('Decision Tree Classifier\n', confusion)

Decision Tree Classifier
 [[388786   2330]
 [  2296   4215]]


In [25]:
joblib.dump(decision_tree_clf, f'{prefix}_decision_tree_clf.joblib')

['import_run3_TargetEncoding_JP/import_decision_tree_clf.joblib']

#### Random Forest

In [26]:
start = datetime.now()
rf_clf = RandomForestClassifier(random_state=rng).fit(X_train, y_train)
end = datetime.now()
print(f'model run time: {end - start}')

model run time: 0:02:13.502730


In [27]:
rf_predicted = rf_clf.predict(X_test)
confusion = confusion_matrix(y_test, rf_predicted)
print('Random Forest Classifier\n', confusion)

Random Forest Classifier
 [[390526    590]
 [  2352   4159]]


In [28]:
joblib.dump(rf_clf, f'{prefix}_rf_clf.joblib')

['import_run3_TargetEncoding_JP/import_rf_clf.joblib']

#### XGBoost Classifier
(default settings)

In [29]:
start = datetime.now()
xgb_params = {"seed": rng }
xgb_clf = xgb.XGBClassifier(random_state = rng).fit(X_train,y_train)
end = datetime.now()
print(f'model run time: {end - start}')

model run time: 0:00:13.937397


In [30]:
xgb_predicted = xgb_clf.predict(X_test)
confusion = confusion_matrix(y_test, xgb_predicted)
print('XGB Classifier Confusion Matrix\n', confusion)

XGB Classifier Confusion Matrix
 [[390739    377]
 [  2746   3765]]


In [31]:
joblib.dump(rf_clf, f'{prefix}_xgb_clf.joblib')

['import_run3_TargetEncoding_JP/import_xgb_clf.joblib']

#### FIGS

In [32]:
start = datetime.now()
figs_clf = FIGSClassifier(random_state=rng).fit(X_train,y_train)
end = datetime.now()
print(f'model run time: {end - start}')

model run time: 0:00:33.773844


In [33]:
figs_predicted = figs_clf.predict(X_test)
confusion = confusion_matrix(y_test, figs_predicted)
print('FIGS Classifier Confusion Matrix\n', confusion)

FIGS Classifier Confusion Matrix
 [[390558    558]
 [  3731   2780]]


In [34]:
joblib.dump(rf_clf, f'{prefix}_figs_clf.joblib')

['import_run3_TargetEncoding_JP/import_figs_clf.joblib']

#### Model Evaluation

In [35]:
print('Random class-proportional (dummy)\n', 
      classification_report(y_test, dummy_predicted, target_names=['clear', 'seized']))
print('Decision Tree \n', 
      classification_report(y_test, tree_predicted, target_names=['clear', 'seized']))
print('Random Forest \n', 
      classification_report(y_test, rf_predicted, target_names=['clear', 'seized']))
print('Logistic Regression \n', 
      classification_report(y_test, lr_predicted, target_names=['clear', 'seized']))
print('Logistic Regression (balanced) \n', 
      classification_report(y_test, lr_balanced_predicted, target_names=['clear', 'seized']))
print('XGBoost Classifier \n',
      classification_report(y_test, xgb_predicted, target_names=['clear', 'seized']))
print('FIGS Classifier \n',
      classification_report(y_test, figs_predicted, target_names=['clear', 'seized']))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random class-proportional (dummy)
               precision    recall  f1-score   support

       clear       0.98      1.00      0.99    391116
      seized       0.00      0.00      0.00      6511

    accuracy                           0.98    397627
   macro avg       0.49      0.50      0.50    397627
weighted avg       0.97      0.98      0.98    397627

Decision Tree 
               precision    recall  f1-score   support

       clear       0.99      0.99      0.99    391116
      seized       0.64      0.65      0.65      6511

    accuracy                           0.99    397627
   macro avg       0.82      0.82      0.82    397627
weighted avg       0.99      0.99      0.99    397627

Random Forest 
               precision    recall  f1-score   support

       clear       0.99      1.00      1.00    391116
      seized       0.88      0.64      0.74      6511

    accuracy                           0.99    397627
   macro avg       0.93      0.82      0.87    397627
weighte

## Variable Selection

In [39]:
X_train.shape, y_train.shape

((1192878, 10), (1192878,))

#### Forward Selection

In [2]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

In [41]:
# Logistic Regression
lr = LogisticRegression(random_state=rng, max_iter=1000)
sfs_lr = sfs(lr, k_features=10, forward=True, floating=False)
sfs_lr = sfs_lr.fit(X_train, y_train)
sfs_lr_features = list(sfs_lr.k_feature_names_)

In [43]:
# Logistic Regression - Balanced
lrb = LogisticRegression(random_state=rng, class_weight='balanced', max_iter=1000)
sfs_lrb = sfs(lrb, k_features=10, forward=True, floating=False)
sfs_lrb = sfs_lrb.fit(X_train, y_train)
sfs_lrb_features = list(sfs_lrb.k_feature_names_)

In [44]:
# Decision Tree 
dtree = DecisionTreeClassifier(random_state=rng)
sfs_dtree = sfs(dtree, k_features=10, forward=True, floating=False)
sfs_dtree = sfs_dtree.fit(X_train, y_train)
sfs_dtree_features = list(sfs_dtree.k_feature_names_)

In [45]:
# Random Forest
rf = RandomForestClassifier(random_state=rng)
sfs_rf = sfs(rf, k_features=10, forward=True, floating=False)
sfs_rf = sfs_rf.fit(X_train, y_train)
sfs_rf_features = list(sfs_rf.k_feature_names_)

In [46]:
# XGBoost
xgbst = xgb.XGBClassifier(random_state = rng)
sfs_xgbst = sfs(xgbst, k_features=10, forward=True, floating=False)
sfs_xgbst = sfs_xgbst.fit(X_train, y_train)
sfs_xgbst_features = list(sfs_xgbst.k_feature_names_)

In [50]:
# FIGS -- Doesn't work

In [48]:
print('Forward Selection Results for Logistic Regression:', sfs_lr_features)
print('Forward Selection Results for Logistic Regression - Balanced:', sfs_lrb_features)
print('Forward Selection Results for Decision Tree:', sfs_dtree_features)
print('Forward Selection Results for Random Forest:', sfs_rf_features)
print('Forward Selection Results for XGBoost:', sfs_xgbst_features)

Forward Selection Results for Logistic Regression: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Forward Selection Results for Logistic Regression - Balanced: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Forward Selection Results for Decision Tree: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Forward Selection Results for Random Forest: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Forward Selection Results for XGBoost: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


NameError: name 'sfs_figs_features' is not defined

#### Backward Selection

In [51]:
# Logistic Regression
lr = LogisticRegression(random_state=rng, max_iter=1000)
sfs_lr = sfs(lr, k_features=10, forward=False, floating=False)
sfs_lr = sfs_lr.fit(X_train, y_train)
sfs_lr_features = list(sfs_lr.k_feature_names_)

In [52]:
# Logistic Regression - Balanced
lrb = LogisticRegression(random_state=rng, class_weight='balanced', max_iter=1000)
sfs_lrb = sfs(lrb, k_features=10, forward=False, floating=False)
sfs_lrb = sfs_lrb.fit(X_train, y_train)
sfs_lrb_features = list(sfs_lrb.k_feature_names_)

In [53]:
# Decision Tree 
dtree = DecisionTreeClassifier(random_state=rng)
sfs_dtree = sfs(dtree, k_features=10, forward=False, floating=False)
sfs_dtree = sfs_dtree.fit(X_train, y_train)
sfs_dtree_features = list(sfs_dtree.k_feature_names_)

In [54]:
# Random Forest
rf = RandomForestClassifier(random_state=rng)
sfs_rf = sfs(rf, k_features=10, forward=True, floating=False)
sfs_rf = sfs_rf.fit(X_train, y_train)
sfs_rf_features = list(sfs_rf.k_feature_names_)

In [55]:
# XGBoost
xgbst = xgb.XGBClassifier(random_state = rng)
sfs_xgbst = sfs(xgbst, k_features=10, forward=False, floating=False)
sfs_xgbst = sfs_xgbst.fit(X_train, y_train)
sfs_xgbst_features = list(sfs_xgbst.k_feature_names_)

In [56]:
# FIGS -- Doesn't work

In [57]:
print('Backward Selection Results for Logistic Regression:', sfs_lr_features)
print('Backward Selection Results for Logistic Regression - Balanced:', sfs_lrb_features)
print('Backward Selection Results for Decision Tree:', sfs_dtree_features)
print('Backward Selection Results for Random Forest:', sfs_rf_features)
print('Backward Selection Results for XGBoost:', sfs_xgbst_features)

Backward Selection Results for Logistic Regression: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Backward Selection Results for Logistic Regression - Balanced: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Backward Selection Results for Decision Tree: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Backward Selection Results for Random Forest: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Backward Selection Results for XGBoost: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


## Standard Resampling with replacement?

In [30]:
# https://www.analyticsvidhya.com/blog/2021/06/5-techniques-to-handle-imbalanced-data-for-a-classification-problem/
from sklearn.utils import resample

# probably need to take the original dataframe, resample it

## Resampling with SMOTE ?

In [None]:
# Accuracy = TP + TN / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)  Also known as sensitivity, or True Positive Rate
# F1 = 2 * Precision * Recall / (Precision + Recall) 

#### PCA

def plot_labelled_scatter(X, y, class_labels):
    num_labels = len(class_labels)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    marker_array = ['o', '^', '*']
    color_array = ['#FFFF00', '#00AAFF', '#000000', '#FF00AA']
    cmap_bold = ListedColormap(color_array)
    bnorm = BoundaryNorm(np.arange(0, num_labels + 1, 1), ncolors=num_labels)
    plt.figure()

    plt.scatter(X[:, 0], X[:, 1], s=65, c=y, cmap=cmap_bold, norm = bnorm, alpha = 0.40, edgecolor='black', lw = 1)

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

    h = []
    for c in range(0, num_labels):
        h.append(mpatches.Patch(color=color_array[c], label=class_labels[c]))
    plt.legend(handles=h)

    plt.show()
    
def plot_pca(pca, f_names, top_k = 10):
    fig, ax = plt.subplots(figsize=(10, 6))
    plt.imshow(pca.components_[0:top_k], interpolation = 'none', cmap = 'plasma')
    feature_names=f_names
    plt.xticks(np.arange(-0., len(feature_names), 1) , feature_names, rotation = 90, fontsize=12)
    plt.yticks(np.arange(0., 2, 1), ['First PC', 'Second PC'], fontsize = 16)
    plt.colorbar()

ct.named_transformers_
transformed_features = ct.named_transformers_['onehotencoder'].get_feature_names_out()

pca = PCA(n_components=2, random_state=rng)
X_train_pca = pca.fit_transform(X_train_t.toarray())
print(X_train_t.shape, X_train_pca.shape)

joblib.dump(X_train_pca, f'{prefix}_X_train_pca.joblib')

sns.heatmap(empirical_covariance(X_train_pca))

plot_labelled_scatter(X_train_pca, y_train_t, ['clear', 'seized'])

plot_pca(pca, transformed_features)

pca_sparse = SparsePCA(n_components=2, random_state=rng)
X_train_pca_sparse = pca_sparse.fit_transform(X_train_t.toarray())

joblib.dump(X_train_pca_sparse, f'{prefix}_X_train_pca_sparse.joblib')

sns.heatmap(empirical_covariance(X_train_pca_sparse))

plot_pca(pca_sparse, transformed_features)