# Product Matching
## Level 5: Model Evaluation Script

### Notes:
1. Evaluate the Voting Ensemble model on train and test data

### References: 

## Notebook Config

In [1]:
# Display settings
## Auto reload modules & inline plots
## autoreload mode 2: re-import all modules (except those excluded via %aimport) before each cell runs
%reload_ext autoreload
%autoreload 2
## Render matplotlib figures inside the notebook output
%matplotlib inline

## Package Import & Initialization

In [2]:
# Import packages

import datetime as dt                #For date objects and implemetations
import os                            #For work with native operating system and directories
import warnings                      #To tweak warning options
from pathlib import Path             #For working with file paths and directories

import joblib                        #For saving objects, more efficient at serializing large numpy arrays than pickle module
import matplotlib.pyplot as plt      #Standard plotting package
import numpy as np                   #For scientific computation
import pandas as pd                  #For data manipulation and bgq --> pandas conversion
from sklearn.base import BaseEstimator               #For creating a simple baseline model
from sklearn.base import clone                       #For making unfitted copies of estimators
from sklearn.ensemble import RandomForestClassifier  #For the random forest used in the ROC comparison
from sklearn.model_selection import cross_val_score, cross_val_predict  #For performing model cross-validation
from sklearn.metrics import accuracy_score  #For evaluating classifier model
from sklearn.metrics import confusion_matrix, precision_score,\
    recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score  #For evaluating classifier model
# from sklearn import tree             #For visualizing decision trees
# from graphviz import Source               #For storing decision tree visualizations

In [3]:
# Show every column when a DataFrame is rendered and format floats with four decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.4f}'.format)
# Suppress all warnings for the rest of the session
warnings.filterwarnings(action='ignore')

## Function Library

In [4]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold on the current axes.

    Parameters
    ----------
    precisions, recalls : sequence
        Curves whose last element has no matching threshold; that element is
        dropped (``[:-1]``) so both curves align with ``thresholds``.
    thresholds : sequence
        Threshold values used for the x-axis.

    The caller creates the figure and calls ``plt.show()``.
    """
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.legend(loc='center right', fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)

In [5]:
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y-axis) against recall (x-axis) on the current axes.

    Both axes are fixed to the [0, 1] range. The caller creates the figure
    and calls ``plt.show()``.
    """
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])
    plt.grid(True)

In [6]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve (TPR against FPR) plus the dashed chance diagonal.

    Parameters
    ----------
    fpr, tpr : sequence
        False-positive and true-positive rates of the curve.
    label : str, optional
        Legend label for the curve.

    Both axes are fixed to the [0, 1] range. The caller creates the figure
    and calls ``plt.show()``.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16)
    plt.ylabel('True Positive Rate (Recall)', fontsize=16)
    plt.grid(True)

## Custom Transformers

## Set Directories

In [7]:
#cwd = os.getcwd()
#os.chdir(cwd)
# Build the project root beneath the current user's home directory
home = str(Path.home())
proj_path = os.path.join(
    home,
    *('Cardinal Health', 'Enterprise Data Remediation - Documents', 'General', '01_Projects', '23_Kinaxis')
)

In [8]:
# A trailing '' component makes os.path.join append the platform separator, so each
# directory string still ends with a separator (file names are concatenated onto it
# directly) while no longer hard-coding the Windows backslash.
main_dir = os.path.join(proj_path, '')
iput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')
oput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')

## Set Output File Names

## Load Model

In [9]:
# Load the four persisted classifiers from the input directory.
# NOTE: joblib.load unpickles the files, so only load artifacts from a trusted source.
sgd_clf, tree_clf, rnd_clf, voting_clf = (
    joblib.load(iput_dir + mdl_file)
    for mdl_file in (
        "L400_mdl_2022-08-01.pkl",
        "L401_mdl_2022-08-01.pkl",
        "L402_mdl_2022-08-03.pkl",
        "L420_mdl_2022-08-05.pkl",
    )
)

## Load Data

In [10]:
## Training Data
# Features (X) and target (y) persisted as joblib pickles in the input directory
X, y = (
    joblib.load(iput_dir + data_file)
    for data_file in ("L300_prd_df_2022-07-15.pkl", "L300_tgt_df_2022-07-15.pkl")
)

In [11]:
## Test Data
# Read the pickled test frame and give the generic feature columns descriptive names
tst_df = pd.read_pickle(iput_dir + "L100_tst_df_2022-07-15.pkl").rename(
    columns={
        'Feat_1': 'Feat_Exact_SuppItemNum',
        'Feat_2': 'Feat_Fuzzy_SuppName',
        'Feat_3': 'Feat_Fuzzy_ItemDesc',
        'Feat_4': 'Feat_Exact_ItemNum',
        'Feat_5': 'Feat_Exact_UNSPSC',
        'Feat_6': 'Feat_Fuzzy_SuppItemNum',
    }
)

# Feature matrix: every column whose name starts with "Feat"
X_tst = tst_df.filter(regex='^Feat', axis="columns").values
# Binary target: 1 where Match == 'Yes', otherwise 0
y_tst = pd.Series(np.where(tst_df['Match'] == 'Yes', 1, 0), index=tst_df.index, name='Match')

In [12]:
# Class balance of the test labels
tst_df['Match'].value_counts()

No     261
Yes    156
Name: Match, dtype: int64

## Baseline Model Creation

In [13]:
class NotMatchClassifier(BaseEstimator):
    """Baseline classifier that predicts the negative (non-match) class for every sample."""

    def fit(self, X, y=None):
        """Learn nothing; return the estimator itself."""
        return self

    def predict(self, X):
        """Return an all-False boolean array with one entry per element of ``X``."""
        n_samples = len(X)
        return np.zeros(n_samples, dtype=bool)


not_match_clf = NotMatchClassifier()

## Visualize Model

## Evaluate Model

#### Primary error metric (Accuracy)

Training

In [14]:
# Fraction of training rows predicted correctly: baseline first, then the voting ensemble
print((not_match_clf.predict(X) == y).sum() / len(y))
print((voting_clf.predict(X) == y).sum() / len(y))

0.6085131894484412
0.9982014388489209


Testing

In [16]:
## Be careful: the logistic model needs standardized inputs.
## Each estimator is cloned before fitting, so the models loaded from disk are not
## re-trained in place and later cells keep evaluating the persisted voting_clf.

for clf in (not_match_clf, sgd_clf, tree_clf, rnd_clf, voting_clf):
    fitted_clf = clone(clf).fit(X, y)
    y_pred = fitted_clf.predict(X_tst)
    print(clf.__class__.__name__, accuracy_score(y_tst, y_pred))

NotMatchClassifier 0.6258992805755396
SGDClassifier 0.973621103117506
DecisionTreeClassifier 0.9952038369304557
RandomForestClassifier 0.9952038369304557
VotingClassifier 0.9976019184652278


#### Cross-validation

In [17]:
# 5-fold cross-validated accuracy on the training data: baseline first, then the voting ensemble
print(cross_val_score(estimator=not_match_clf, X=X, y=y, scoring="accuracy", cv=5))
print(cross_val_score(estimator=voting_clf, X=X, y=y, scoring="accuracy", cv=5))

[0.54491018 0.63473054 0.63173653 0.5975976  0.63363363]
[0.99401198 0.99700599 0.99401198 0.98798799 0.996997  ]


#### Confusion Matrix - Precision - Recall

In [18]:
# Out-of-fold predictions on the training data (5 folds) and direct predictions on the test set
y_train_pred = cross_val_predict(estimator=voting_clf, X=X, y=y, cv=5)
y_tst_pred = voting_clf.predict(X=X_tst)

Training

In [19]:
## Rows are actual classes, columns are predicted classes (negative first, positive second)
## Layout: [[true neg, false pos], [false neg, true pos]]
confusion_matrix(y_true=y, y_pred=y_train_pred)

array([[1007,    8],
       [   2,  651]], dtype=int64)

In [20]:
# Precision of the out-of-fold training predictions
precision_score(y_true=y, y_pred=y_train_pred)

0.9878603945371776

In [21]:
# Recall of the out-of-fold training predictions
recall_score(y_true=y, y_pred=y_train_pred)

0.996937212863706

In [22]:
# F1 score of the out-of-fold training predictions
f1_score(y_true=y, y_pred=y_train_pred)

0.9923780487804877

Testing

In [23]:
## Rows are actual classes, columns are predicted classes (negative first, positive second)
## Layout: [[true neg, false pos], [false neg, true pos]]
confusion_matrix(y_true=y_tst, y_pred=y_tst_pred)

array([[261,   0],
       [  1, 155]], dtype=int64)

In [24]:
# Precision on the test set
precision_score(y_true=y_tst, y_pred=y_tst_pred)

1.0

In [25]:
# Recall on the test set
recall_score(y_true=y_tst, y_pred=y_tst_pred)

0.9935897435897436

In [26]:
# F1 score on the test set
f1_score(y_true=y_tst, y_pred=y_tst_pred)

0.9967845659163987

#### Plots

In [None]:
# `mdl_obj` was not defined anywhere in this notebook; bind it to the voting ensemble under evaluation.
# NOTE(review): method="predict_proba" needs an estimator that exposes predict_proba
# (for a VotingClassifier that means voting="soft") — confirm how voting_clf was built.
mdl_obj = voting_clf
# Keep only the positive-class column so y_scores is 1-D; the thresholding,
# roc_curve and roc_auc_score cells further down compare/score it directly.
y_scores = cross_val_predict(mdl_obj, X, y, cv=3, method="predict_proba")[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y, y_scores)

In [None]:
# Precision and recall curves over the decision threshold
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [None]:
tgt_precision = 0.9

# `precisions` and `recalls` have one more element than `thresholds`; search only the
# entries that have a matching threshold so a single index is valid for all three arrays.
meets_target = precisions[:-1] >= tgt_precision
if not meets_target.any():
    raise ValueError(f"No threshold reaches a precision of at least {tgt_precision}")
idx_tgt_precision = np.argmax(meets_target)  # first threshold at which the target precision is met

recall_90_precision = recalls[idx_tgt_precision]
threshold_90_precision = thresholds[idx_tgt_precision]

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
# Dotted guides and markers at the selected threshold
plt.plot([threshold_90_precision, threshold_90_precision], [0., max(tgt_precision,recall_90_precision)], "r:")
plt.plot([min(thresholds), threshold_90_precision], [tgt_precision, tgt_precision], "r:")
plt.plot([min(thresholds), threshold_90_precision], [recall_90_precision, recall_90_precision], "r:")
plt.plot([threshold_90_precision], [tgt_precision], "ro")
plt.plot([threshold_90_precision], [recall_90_precision], "ro")
plt.show()

In [None]:
# Precision-vs-recall curve with the target-precision operating point highlighted
plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
# Dotted guides from each axis to the operating point
plt.plot([recall_90_precision, recall_90_precision], [0., tgt_precision], "r:")
plt.plot([0.0, recall_90_precision], [tgt_precision, tgt_precision], "r:")
# Marker at (recall_90_precision, tgt_precision)
plt.plot([recall_90_precision], [tgt_precision], "ro")
plt.show()

In [None]:
# Display the threshold selected for the target precision
threshold_90_precision

In [None]:
# Boolean predictions: True wherever the score reaches the selected threshold
y_train_pred_90 = y_scores >= threshold_90_precision

In [None]:
# Precision of the thresholded predictions
precision_score(y_true=y, y_pred=y_train_pred_90)

In [None]:
# Recall of the thresholded predictions
recall_score(y_true=y, y_pred=y_train_pred_90)

In [None]:
# F1 score of the thresholded predictions
f1_score(y_true=y, y_pred=y_train_pred_90)

In [None]:
# ROC thresholds get their own name so the precision-recall `thresholds` used by the
# earlier plotting cells is not overwritten when cells are re-run.
fpr, tpr, roc_thresholds = roc_curve(y, y_scores)

In [None]:
# ROC curve with the point whose TPR first reaches recall_90_precision highlighted
plt.figure(figsize=(8, 6))                                    
plot_roc_curve(fpr, tpr)
# FPR at the first ROC point whose TPR is >= recall_90_precision
fpr_90 = fpr[np.argmax(tpr >= recall_90_precision)]           
# Dotted guides from each axis to that point, then the marker itself
plt.plot([fpr_90, fpr_90], [0., recall_90_precision], "r:")   
plt.plot([0.0, fpr_90], [recall_90_precision, recall_90_precision], "r:")  
plt.plot([fpr_90], [recall_90_precision], "ro")                                    
plt.show()

In [None]:
# Area under the ROC curve for y_scores
roc_auc_score(y_true=y, y_score=y_scores)

In [None]:
# Fresh random forest for the ROC comparison; RandomForestClassifier is imported
# with the other packages in the import cell at the top of the notebook.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Out-of-fold class probabilities (3 folds)
y_probas_forest = cross_val_predict(forest_clf, X, y, cv=3,
                                    method="predict_proba")

In [None]:
# Score = probability of the positive class (column 1 of the predict_proba output)
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_true=y, y_score=y_scores_forest)

In [None]:
# Forest TPR at the first forest ROC point whose FPR is >= fpr_90
recall_for_forest = tpr_forest[np.argmax(fpr_forest >= fpr_90)]

# Overlay both ROC curves on one figure
plt.figure(figsize=(8, 6))
# NOTE(review): this curve is labelled "SGD", but fpr/tpr come from whichever model produced y_scores — confirm the label
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
# Guides and marker for the first curve at fpr_90
plt.plot([fpr_90, fpr_90], [0., recall_90_precision], "r:")
plt.plot([0.0, fpr_90], [recall_90_precision, recall_90_precision], "r:")
plt.plot([fpr_90], [recall_90_precision], "ro")
# Guide and marker for the forest curve at the same FPR
plt.plot([fpr_90, fpr_90], [0., recall_for_forest], "r:")
plt.plot([fpr_90], [recall_for_forest], "ro")
plt.grid(True)
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
# Area under the ROC curve for the random forest scores
roc_auc_score(y_true=y, y_score=y_scores_forest)

In [None]:
# Out-of-fold class predictions from the random forest (3 folds) and their precision
y_train_pred_forest = cross_val_predict(estimator=forest_clf, X=X, y=y, cv=3)
precision_score(y_true=y, y_pred=y_train_pred_forest)