### Baseline Models Template for 2016 - 2020 Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA, SparsePCA
from sklearn.covariance import empirical_covariance
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap, BoundaryNorm
import matplotlib.patches as mpatches

import seaborn as sns
import joblib
import os
#from adspy_shared_utilities import plot_decision_tree
from datetime import datetime

In [2]:
rng = 42

In [3]:
p = '../data/'
input_name = 'cleaned.csv'

df = pd.read_csv(f'{p}{input_name}')

#### Preprocessing

In [4]:
def df_filtering(df, i_e = 'I', f_cols = []):

    filtered_df = df[df.i_e == i_e]
    filtered_df = filtered_df[f_cols+['act']]
    
    return filtered_df

In [5]:
df.columns

Index(['control_number', 'species_code', 'genus', 'species', 'wildlf_desc',
       'wildlf_cat', 'cartons', 'qty', 'unit', 'value', 'ctry_org', 'ctry_ie',
       'purp', 'src', 'trans_mode', 'act', 'dp_cd', 'disp_date', 'ship_date',
       'i_e', 'pt_cd', 'specific_generic_name', 'disp_date_yyyy',
       'disp_date_mm', 'ship_date_yyyy', 'ship_date_mm', 'disp_ship_date'],
      dtype='object')

In [6]:
non_feature_cols = ['control_number', 'disp_date', 'i_e', 'ship_date', 
                    'cartons', 'qty', 'unit', 'specific_generic_name',
                    'genus', 'species', 'wildlf_cat', 
                    'disp_date_yyyy', 'disp_date_mm', 'disp_ship_date']

target = ['act', 'dp_cd']

feature_cols = ['species_code', 'wildlf_desc', 
                'ctry_org', 'ctry_ie','purp', 'src', 'trans_mode', 'pt_cd', 
                 'value', 'ship_date_mm']

In [7]:
#import_df = df_filtering(df, i_e = 'I', f_cols = feature_cols)
export_df = df_filtering(df, i_e = 'E', f_cols = feature_cols)
# import: 590505 rows × 11 columns
# export: 299340 rows × 11 columns

In [10]:
# set up folder to save results
os.makedirs('export_run3_TargetEncoding')

In [23]:
prefix = 'export_run3_TargetEncoding/export'

In [12]:
def data_transformation(df):
    X, y = df.iloc[:,:-1], df.iloc[:,-1:]
    # X = pd.get_dummies(X, sparse=True)
    y = np.where(y['act']=='R',1,0)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng, stratify=y) 
    
    return X_train, X_test, y_train, y_test
    

In [13]:
X_train, X_test, y_train, y_test = data_transformation(export_df)

#### column transformer w/ TargetEncoding

In [11]:
from category_encoders.target_encoder import TargetEncoder

In [14]:
categorical_var = ['species_code', 'wildlf_desc', 'ctry_org', 'ctry_ie','purp', 'src', 
                   'trans_mode', 'pt_cd']
numerical_var = ['value']

In [15]:
ct_target = make_column_transformer(
    (StandardScaler(), numerical_var),
    (TargetEncoder(), categorical_var),
    remainder='passthrough')
le = LabelEncoder()



In [20]:
X_train = ct_target.fit_transform(X_train, y_train)
joblib.dump(X_train, 'X_train_tragetEncoding.joblib')

X_test = ct_target.transform(X_test)
joblib.dump(X_test, 'X_test_tragetEncoding.joblib')
# Found unknown categories 




['X_test_tragetEncoding.joblib']

#### column transformer w/ OneHotEncoding

export_df.info()

categorical_var = ['species_code', 'wildlf_desc', 'ctry_org', 'ctry_ie','purp', 'src', 
                   'trans_mode', 'pt_cd']
numerical_var = ['value']

ct = make_column_transformer(
    (StandardScaler(), numerical_var),
    (OneHotEncoder(), categorical_var),
    remainder='passthrough')
le = LabelEncoder()

X_train_ohe = ct.fit_transform(X_train)
joblib.dump(X_train_ohe , 'X_train_ohe.joblib')

X_test_ohe = ct.transform(X_test)
joblib.dump(X_test_ohe , 'X_test_ohe.joblib')
-- Found unknown categories 

y_train_t = le.fit_transform(y_train.values.ravel())
y_test_t = le.transform(y_test.values.ravel())

print(f'{len(y_test_t[y_test_t > 0])},  {len(y_test_t[y_test_t ==0])}, total {len(y_test_t)}')
print(f'{len(y_train_t[y_train_t > 0])},  {len(y_train_t[y_train_t ==0])}, total {len(y_train_t)}')

len(['NYSC', 'HETB', 'HIHK', 'DIGR', 'STRH', 'CNCR', 'MUWR', 'CSOR', 'PIO?', 'WHIB', 'PHMI', 'CBQ?', 'RHFR', 'PENG', 'MRUA', 'ARB*', 'EUAU', 'CTGU', 'NENO', 'GBAN', 'PASB', 'SEPC', 'GEAC', 'TUMA', 'BUBI', 'BES?', 'OCCI', 'ONCA', 'IST?', 'CRAQ', 'MADC', 'SPRH', 'CHNE', 'PHPP', 'LTY?', 'LASP', 'AGAA', 'UAFE', 'DSP?', 'SSPH', 'COB*', 'OTGA', 'DEIJ', 'SEAV', 'CCEL', 'PGAL', 'PDMO', 'PLI*', 'TYS?', 'COJP', 'MIS?', 'CPSE', 'TABA', 'NHT?', 'NTP*', 'BANI', 'TOER', 'BOMA', 'CLAE', 'CCM?', 'PEXI', 'TANC', 'APHS', 'GAR?', 'PELF', 'AAMM', 'DDP?', 'GAS*', 'BORI', 'WOTH', 'PLAS', 'CMOR', 'PUN?', 'BBRA', 'SFLV', 'MARS', 'GOR?', 'SPHI', 'MIBA', 'ELA*', 'PAGM', 'THPS', 'ICGR', 'OXPI', 'STDO', 'PANH', 'TODE', 'NOMU', 'CONG', 'AMFU', 'THRE', 'BAVS', 'AMST', 'TRK?', 'BTGW', 'OEPI', 'PHSY', 'CMGR', 'BBAL', 'LIAB', 'RAN*', 'CUL*', 'CAMT', 'ZON?', 'CONV', 'NEP*', 'CINL', 'NCIN', 'MVIO', 'HDA?', 'BUSE', 'CLRV', 'NEH*', 'AMPH', 'ZOQU', 'PRFF', 'CEO*', 'CUP?', 'MANC', 'LITL', 'FADE', 'MAMA', 'SACL', 'TCS?', 'LURU', 'FALR', 'ABSM', 'THPY', 'NOFL', 'DDOR', 'MCB?', 'MAAB', 'CCAO', 'TOPO', 'SCEF', 'EID?', 'YETO', 'LSOU', 'OCS?', 'ANOT', 'NOSK', 'VEER', 'SURF', 'MRI?', 'PATA', 'BA##', 'CETD', 'HNC?', 'MEFE', 'CUND', 'FAR*', 'BALU', 'CCNS', 'TRQU', 'BLSW', 'OLFU', 'SDEP', 'MGI?', 'AUT?', 'BLSK', 'DETY', 'PHWI', 'ODJU', 'ERLI', 'PIP*', 'HEWA', 'OMYK', 'ARXA', 'COIE', 'TOAS', 'ELOA', 'AEUG', 'LAJA', 'CAUO', 'APRO', 'YELL', 'EILE', 'AFRU', 'PIVI', 'ADL?', 'NYPE', 'SUPH', 'TOSA', 'TUPH', 'CARA', 'PLDA', 'PIE*', 'BHGR', 'ORIO', 'BO00', 'SIBO', 'MMNK', 'TRMU', 'HOP*', 'SERH', 'POO*', 'ANHC', 'COAR', 'CSPZ', 'RSTE', 'SUDI', 'EUC*', 'PMOC', 'DPU?', 'NASI', 'MYA?', 'LYN?', 'ZANA', 'SSTO', 'TRUF', 'STPI', 'SON?', 'TOY?', 'GLOP', 'SVS?', 'MYOM', 'CNSE', 'CACO', 'GGTA', 'ATVA', 'PUCO', 'BYC?', 'DNP?', 'TOXS', 'CLNG', 'SASA', 'CASN', 'EOP?', 'CGY?', 'THAA', 'PAP?', 'ANN?', 'GHER', 'TAGN', 'VAGL', 'GAAB', 'VAMC', 'AUBM', 'ARSP', 'PPUO', 'MLS*', 'OSFL', 'MCUI', 'TAYA', 'CYR?', 'SASE', 'APFL', 'MABP', 'LIH*', 'TOSO', 'RAME', 'PYLU', 'TAMI', 'DRGL', 'SBA*', 'AMG*', 'BSPE', 'INGE', 'THOT', 'GYML', 'LAMS', 'EIM?', 'CGRO', 'HCNT', 'MRE?', 'MALP', 'ASNL', 'ECCO', 'CHOT', 'CAAS', 'NEUC', 'MCC*', 'HEMS', 'TNGC', 'SPBR', 'THUN', 'ACAC', 'CRAM', 'GECO', 'CCFF', 'WYGW', 'GOLA', 'SQU?', 'CEEE', 'TOFL', 'MYRA', 'MMIP', 'PSXA', 'LIUS', 'MIMC', 'SBOI', 'AVRT', 'TPOP', 'LEPK', 'CBNK', 'AGYL', 'BUFB', 'AVIR', 'MARU', 'LRA?', 'GUYO', 'HYD?', 'ABLO', 'GLTR', 'HSOR', 'MCV*', 'APMA', 'PYED', 'RATL', 'HECI', 'DBU?', 'HAWF', 'SYLL', 'NEOB', 'PIAR', 'BALP', 'TMRA', 'TMP?', 'BOOM', 'ATCE', 'LAES', 'MYMN', 'LALO', 'LBLO', 'BOGU', 'HEAT', 'PNOT', 'ORPA', 'PYHE', 'OCUN', 'PHCN', 'PAHC', 'CAUB', 'EPIF', 'NENI', 'ECRI', 'ENTL', 'MPS?', 'CSCA', 'PIS*', 'MLML', 'MYAB', 'CFIG', 'APD?', 'SPP*', 'BEHI', 'MUSC', 'PANO', 'LYBE', 'NALO', 'DIPP', 'COPA', 'PEH?', 'PLHE', 'SUC?', 'PRJE', 'HEFR', 'EMEL', 'CRGL', 'PEO*', 'PGAB', 'FALF', 'RHIM', 'WIWD', 'CDID', 'PCHI', 'VIN*', 'NATE', 'GYCA', 'PMR?', 'CYMM', 'CHAG', 'CROG', 'HYPH', 'HMG*', 'KIRA', 'ELI*', 'PYGB', 'LIW?', 'CNRA', 'TESG', 'COEC', 'TYA*', 'PABE', 'VAOR', 'GLMA', 'APOA', 'CCSR', 'LETA', 'BSEP', 'MYSC', 'HAWH', 'HATH', 'ATS*', 'CNC*', 'SCAS', 'VAMT', 'CAE?', 'CPSL', 'COOS', 'LBE?', 'REKN', 'RHIN', 'URB?', 'CIAN', 'MLUC', 'WEHU', 'MLA?', 'CNRU', 'ANLA', 'BTRS', 'OLWA', 'CAPN', 'HELD', 'PIN*', 'PPRZ', 'NAC?', 'MCU?', 'NISE', 'LESU', 'EAME', 'BOIM', 'DUTA', 'DCNC', 'APD*', 'PML?', 'BLUE', 'RHFI', 'MUNM', 'EMSP', 'GATR', 'OTRO', 'PIBI', 'DDU?', 'XEMI', 'NRUF', 'GLOI', 'HBRA', 'HCAM', 'BACN', 'NESP', 'LUME', 'MENM', 'EUPF', 'TATT', 'PET*', 'PLCP', 'IDE*', 'NEA?', 'ACNM', 'PADN', 'EPRO', 'PRON', 'ACSL', 'SPMA', 'MMGT', 'ACCV', 'STST', 'PAMP', 'TEER', 'TRDR', 'SERF', 'MTP?', 'URAE', 'SPHN', 'SBR*', 'ACD?', 'MED?', 'TMS?', 'AMFR', 'MYGR', 'ARLO', 'CEGY', 'COPE', 'CAMI', 'GOMV', 'ACBR', 'BTC?', 'ISPA', 'ABIM', 'PYBA', 'ONZB', 'BCP?', 'GAI?', 'CVRI', 'TDD?', 'ENEN', 'DAUA', 'ARTE', 'HAMP', 'BLTU', 'BMES', 'LAOM', 'LYCD', 'SSEN', 'CLM?', 'THTO', 'MYIR', 'HRL?', 'OVOO', 'URSE', 'TRI?', 'FELP', 'WNRA', 'ARGB', 'POT*', 'DASL', 'APY?', 'RAEX', 'RFBO', 'AMAJ', 'AMRR', 'SSTT', 'SVIN', 'CSPN', 'TYBR', 'PPDX', 'PTOR', 'EMBL', 'PPO?', 'COU*', 'AMGR', 'TBRN', 'CUPA', 'PNCP', 'PCUI', 'HIFI', 'AEXT', 'THX*', 'PND?', 'OVO?', 'LNIG', 'EPSS', 'PTYH', 'CVIG', 'LGS*', 'LBPI', 'BONI', 'PHCR', 'FERA', 'NCS?', 'HOSE', 'HANG', 'HEHE', 'PTGL', 'CCST', 'HDM?', 'PSO*', 'CCCR', 'ARMX', 'STYG', 'FIFI', 'HOG?', 'DICY', 'AAPA', 'BODU', 'SCIP', 'COCC', 'ESUB', 'PSRF', 'PHEY', 'IND*', 'ZMN?', 'WHB?', 'AGVA', 'BRPE', 'TAE*', 'APVE', 'LYPH', 'GRSN', 'ETMS', 'AS00', 'CEOC', 'ECOC', 'CRQ?', 'TURC', 'ANTE', 'TCHR', 'PSRE', 'NAPI', 'COGO', 'FURE', 'ECAE', 'BTH?', 'SMAX', 'CHO*', 'AMPR', 'MYEL', 'ASFL', 'ARU?', 'BLAC', 'HBOL', 'CRYA', 'LCTH', 'CBDT', 'COEF', 'ENG?', 'PRFR', 'TPER', 'CCCL', 'PHDE', 'CUGA', 'PTO?', 'TCRL', 'TAKY', 'TYCO', 'APHC', 'ELP*', 'EL??', 'FLO?', 'ASAX', 'HYCY', 'WBMB', 'OSMO', 'TUFS', 'TRJA', 'LEOW', 'MOT?', 'PHDA', 'DIMB', 'RIHU', 'CRML', 'AFAT', 'CTT?', 'PFVC', 'JUTM', 'ACAE', 'CHTU', 'ASET', 'NEFL', 'RAEV', 'PYGO', 'EPAL', 'URN?', 'LLAV', 'AGPE', 'XIOC', 'GESC', 'CYPN', 'SSCT', 'VAME', 'PMLL', 'SPMC', 'CSIE', 'DNO?', 'BMOL', 'AGC?', 'PIRF', 'EUDC', 'PPLU', 'XAFL', 'SHAH', 'CHTR', 'RUPE', 'KNIP', 'ARVA', 'SEBA', 'ACG?', 'GCEC', 'MIM?', 'LEAS', 'MEHP', 'PEAT', 'BRAB', 'ASN?', 'VAIN', 'STJA', 'PNN?', 'MIDU', 'MIOL', 'MYFS', 'PLRH', 'PLFL', 'REDP', 'EMTA', 'OSG?', 'LGU?', 'BZCO', 'SSTA', 'CAB*', 'HYA?', 'ECYA', 'DGL?', 'COLS', 'PHUR', 'PHRC', 'HCLC', 'PNE?', 'RGN?', 'GRAG', 'PPAN', 'BALT', 'AIP?', 'FERE', 'SAH?', 'PEOC', 'RDP?', 'ODGU', 'WEFL', 'LIUN', 'CYQU', 'SYCO', 'MDC*', 'GUPL', 'ASIS', 'PR00', 'BLEP', 'GORB', 'NATN', 'SCS?', 'PISE', 'GOPO', 'DVAG', 'ADME', 'ACSO', 'ANHU', 'FUHY', 'ECS*', 'NUWO', 'NTL?', 'SYLB', 'STEX', 'RPC?', 'SAXR', 'COGS', 'RPUM', 'TE00', 'HELI', 'TRIP', 'STCU', 'SAT?', 'DMEX', 'COI?', 'CRDU', 'SYAT', 'OPD?', 'VALE', 'CPO*', 'NUCA', 'AMAV', 'CMRQ', 'SYNR', 'HOSA', 'DSA?', 'PYN?', 'YTWA', 'JWW?', 'NNAS', 'COTE', 'DOBM', 'BRAU', 'GRPE', 'OUL?', 'OLVI', 'PAML', 'PLDU', 'STU*', 'GPT?', 'AGAT', 'LEUF', 'LEBU', 'HAGO', 'ARLA', 'BLWA', 'VECR', 'BTBW', 'ICME', 'CLYL', 'PYCF', 'CYX?', 'PDUN', 'HEAX', 'MIFU', 'GOSL', 'ATRV', 'SEXP', 'GRYE', 'EMHO', 'PTSD', 'CMGU', 'HAPU', 'VUGR', 'TGER', 'HHAI', 'GBUN', 'MUL*', 'CMCC', 'AMXC', 'BON?', 'ATH*', 'LNO*', 'SYST', 'TIOB', 'LIGY', 'PAGL', 'XIEL', 'AGI?', 'BSEC', 'LBMW', 'LMEL', 'MEEV', 'MECP', 'CEPN', 'CACH', 'PMAS', 'PYCY', 'POEC', 'BABJ', 'CSCR', 'SIY?', 'ORPP', 'EPTS', 'SPOD', 'CHN*', 'TPEL', 'SIEI', 'NOUR', 'NASG', 'XEIM', 'LOLI', 'GROR', 'PFSH', 'IGFR', 'CGB?', 'ANMS', 'BRCR', 'CSP?', 'CNCA', 'SPDE', 'ELPA', 'ARO*', 'EHE?', 'NYNO'])

#### Dummy Classifers

In [21]:
start = datetime.now()
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
end = datetime.now()
print(f'model run time: {end - start}')

# dummy_majority.score(X_test, y_test)
dummy_predicted = dummy_majority.predict(X_test)

model run time: 0:00:00.016193


In [24]:
joblib.dump(dummy_majority, f'{prefix}_dummy_majority_clf.joblib')

['export_run3_TargetEncoding/export_dummy_majority_clf.joblib']

#### Logistic Regression

In [25]:
start = datetime.now()
lr = LogisticRegression(random_state=rng).fit(X_train, y_train)
end = datetime.now()
print(f'model run time: {end - start}')

lr_predicted = lr.predict(X_test)
confusion = confusion_matrix(y_test, lr_predicted)
print('Logistic regression classifier (default settings)\n', confusion)

model run time: 0:00:00.865735
Logistic regression classifier (default settings)
 [[74162    30]
 [  568    75]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
start = datetime.now()
lr_balanced = LogisticRegression(random_state=rng, class_weight='balanced').fit(X_train, y_train)
end = datetime.now()
print(f'model run time: {end - start}')

lr_balanced_predicted = lr_balanced.predict(X_test)
confusion = confusion_matrix(y_test, lr_balanced_predicted)
print('Logistic regression classifier (balanced)\n', confusion)

model run time: 0:00:01.010160
Logistic regression classifier (balanced)
 [[67745  6447]
 [  295   348]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
joblib.dump(lr, f'{prefix}_lr_clf.joblib')
joblib.dump(lr_balanced, f'{prefix}_lr_balanced_clf.joblib')

['export_run3_TargetEncoding/export_lr_balanced_clf.joblib']

#### Decision Tree

In [28]:
start = datetime.now()
decision_tree_clf = DecisionTreeClassifier(random_state=rng).fit(X_train, y_train)
end = datetime.now()
print(f'model run time: {end - start}')

model run time: 0:00:00.786365


In [29]:
tree_predicted = decision_tree_clf.predict(X_test)
confusion = confusion_matrix(y_test, tree_predicted)
print('Logistic regression classifier (balanced)\n', confusion)

Logistic regression classifier (balanced)
 [[73846   346]
 [  353   290]]


In [30]:
joblib.dump(decision_tree_clf, f'{prefix}_decision_tree_clf.joblib')

['export_run3_TargetEncoding/export_decision_tree_clf.joblib']

#### Random Forest

In [31]:
start = datetime.now()
rf_clf = RandomForestClassifier(random_state=rng).fit(X_train, y_train)
end = datetime.now()
print(f'model run time: {end - start}')

model run time: 0:00:18.235343


In [32]:
rf_predicted = rf_clf.predict(X_test)
confusion = confusion_matrix(y_test, rf_predicted)
print('Logistic regression classifier (balanced)\n', confusion)

Logistic regression classifier (balanced)
 [[74105    87]
 [  376   267]]


In [33]:
joblib.dump(rf_clf, f'{prefix}_rf_clf.joblib')

['export_run3_TargetEncoding/export_rf_clf.joblib']

#### Model Evaluation

In [34]:
print('Random class-proportional (dummy)\n', 
      classification_report(y_test, dummy_predicted, target_names=['clear', 'seized']))
print('Decision Tree \n', 
      classification_report(y_test, tree_predicted, target_names=['clear', 'seized']))
print('Random Forest \n', 
      classification_report(y_test, rf_predicted, target_names=['clear', 'seized']))
print('Logistic Regression \n', 
      classification_report(y_test, lr_predicted, target_names=['clear', 'seized']))
print('Logistic Regression (balanced) \n', 
      classification_report(y_test, lr_balanced_predicted, target_names=['clear', 'seized']))

Random class-proportional (dummy)
               precision    recall  f1-score   support

       clear       0.99      1.00      1.00     74192
      seized       0.00      0.00      0.00       643

    accuracy                           0.99     74835
   macro avg       0.50      0.50      0.50     74835
weighted avg       0.98      0.99      0.99     74835

Decision Tree 
               precision    recall  f1-score   support

       clear       1.00      1.00      1.00     74192
      seized       0.46      0.45      0.45       643

    accuracy                           0.99     74835
   macro avg       0.73      0.72      0.72     74835
weighted avg       0.99      0.99      0.99     74835

Random Forest 
               precision    recall  f1-score   support

       clear       0.99      1.00      1.00     74192
      seized       0.75      0.42      0.54       643

    accuracy                           0.99     74835
   macro avg       0.87      0.71      0.77     74835
weighte

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression 
               precision    recall  f1-score   support

       clear       0.99      1.00      1.00     74192
      seized       0.71      0.12      0.20       643

    accuracy                           0.99     74835
   macro avg       0.85      0.56      0.60     74835
weighted avg       0.99      0.99      0.99     74835

Logistic Regression (balanced) 
               precision    recall  f1-score   support

       clear       1.00      0.91      0.95     74192
      seized       0.05      0.54      0.09       643

    accuracy                           0.91     74835
   macro avg       0.52      0.73      0.52     74835
weighted avg       0.99      0.91      0.95     74835



In [35]:
# Accuracy = TP + TN / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)  Also known as sensitivity, or True Positive Rate
# F1 = 2 * Precision * Recall / (Precision + Recall) 

#### PCA

def plot_labelled_scatter(X, y, class_labels):
    num_labels = len(class_labels)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    marker_array = ['o', '^', '*']
    color_array = ['#FFFF00', '#00AAFF', '#000000', '#FF00AA']
    cmap_bold = ListedColormap(color_array)
    bnorm = BoundaryNorm(np.arange(0, num_labels + 1, 1), ncolors=num_labels)
    plt.figure()

    plt.scatter(X[:, 0], X[:, 1], s=65, c=y, cmap=cmap_bold, norm = bnorm, alpha = 0.40, edgecolor='black', lw = 1)

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

    h = []
    for c in range(0, num_labels):
        h.append(mpatches.Patch(color=color_array[c], label=class_labels[c]))
    plt.legend(handles=h)

    plt.show()
    
def plot_pca(pca, f_names, top_k = 10):
    fig, ax = plt.subplots(figsize=(10, 6))
    plt.imshow(pca.components_[0:top_k], interpolation = 'none', cmap = 'plasma')
    feature_names=f_names
    plt.xticks(np.arange(-0., len(feature_names), 1) , feature_names, rotation = 90, fontsize=12)
    plt.yticks(np.arange(0., 2, 1), ['First PC', 'Second PC'], fontsize = 16)
    plt.colorbar()

ct.named_transformers_
transformed_features = ct.named_transformers_['onehotencoder'].get_feature_names_out()

pca = PCA(n_components=2, random_state=rng)
X_train_pca = pca.fit_transform(X_train_t.toarray())
print(X_train_t.shape, X_train_pca.shape)

joblib.dump(X_train_pca, f'{prefix}_X_train_pca.joblib')

sns.heatmap(empirical_covariance(X_train_pca))

plot_labelled_scatter(X_train_pca, y_train_t, ['clear', 'seized'])

plot_pca(pca, transformed_features)

pca_sparse = SparsePCA(n_components=2, random_state=rng)
X_train_pca_sparse = pca_sparse.fit_transform(X_train_t.toarray())

joblib.dump(X_train_pca_sparse, f'{prefix}_X_train_pca_sparse.joblib')

sns.heatmap(empirical_covariance(X_train_pca_sparse))

plot_pca(pca_sparse, transformed_features)