# Product Matching
## Level 4: Model Training Script

### Notes:
1. Train an Ensemble Hard/Soft Voting Classifier model to classify product matches
2. Optimize trained model via hyperparameter tuning

### References: 

## Notebook Config

In [1]:
# Display settings
## Auto reload modules & inline plots
## %reload_ext (rather than %load_ext) is safe to re-run when the extension is already loaded
%reload_ext autoreload
## Mode 2: reload all imported modules before each cell executes
%autoreload 2
## Render matplotlib figures inside the notebook output
%matplotlib inline

## Package Import & Initialization

In [2]:
# Import packages

import pandas as pd                  #For data manipulation and bgq --> pandas conversion
import numpy as np                   #For scientific computation
import os                            #For work with native operating system and directories
from pathlib import Path             #For working with file paths and directories
import warnings                      #To tweak warning options
import datetime as dt                #For date objects and implemetations
from sklearn.ensemble import VotingClassifier       #For training based on ensemble methods
import joblib                        #For saving objects, more efficient at serializing large numpy arrays than pickle module

In [3]:
## Pandas display: show every column and print floats with 4 decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.4f}'.format)

## Silence all warnings for the rest of the session
## NOTE(review): this also hides any warnings emitted during model fitting/search below
warnings.filterwarnings(action='ignore')

## Function Library

## Custom Transformers

## Set Directories

In [4]:
## Resolve the project folder relative to the current user's home directory
home = str(Path.home())
proj_parts = (
    'Cardinal Health',
    'Enterprise Data Remediation - Documents',
    'General',
    '01_Projects',
    '23_Kinaxis',
)
proj_path = os.path.join(home, *proj_parts)

In [5]:
## Input/output directories.
## Built with os.path.join (as proj_path is) instead of hard-coded '\\' so the
## paths use the separator of the current OS. The trailing '' component makes
## each directory end with a separator, because later cells build file paths
## by plain string concatenation (dir + file name).
main_dir = os.path.join(proj_path, '')
iput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')
oput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')

## Set Output File Names

In [6]:
## Output file name: "L420_mdl_" + today's date (YYYY-MM-DD) + ".pkl"
mdl_oput_file_name = f"L420_mdl_{dt.datetime.today():%Y-%m-%d}.pkl"

## Load Data

In [7]:
## Full feature dataframe; in this notebook it is only inspected (info/head), not used for fitting
raw_df = pd.read_pickle(iput_dir + "L300_feat_df_2022-07-15.pkl")

In [8]:
## Model inputs passed to every fit() call below
X = joblib.load(iput_dir + "L300_prd_df_2022-07-15.pkl")

In [9]:
## Target passed to every fit() call below
y = joblib.load(iput_dir + "L300_tgt_df_2022-07-15.pkl")

## Load Models

In [10]:
## Previously saved base classifiers, one pickle per model, listed in the
## same order as the names they are unpacked into below
mdl_files = (
    "L400_mdl_2022-08-01.pkl",   # sgd_clf
    "L401_mdl_2022-08-01.pkl",   # tree_clf
    "L402_mdl_2022-08-03.pkl",   # rnd_clf
    "L403_mdl_2022-08-03.pkl",   # svm_clf
    "L404_mdl_2022-08-03.pkl",   # ada_clf
    "L405_mdl_2022-08-05.pkl",   # gbt_clf
    "L406_mdl_2022-08-05.pkl",   # xgb_clf
)
sgd_clf, tree_clf, rnd_clf, svm_clf, ada_clf, gbt_clf, xgb_clf = [
    joblib.load(iput_dir + mdl_file) for mdl_file in mdl_files
]

## High-level Data Inspection

In [11]:
## Column names, non-null counts and dtypes of the feature dataframe
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ITEMNUMBER              1668 non-null   object 
 1   Number                  1668 non-null   object 
 2   ITEMDESCRIPTION         1668 non-null   object 
 3   FullDescription         1668 non-null   object 
 4   SUPPLIER_ITEM_NUMBER    1668 non-null   object 
 5   ManufacturerItemNumber  1668 non-null   object 
 6   SUPPLIERNAME            1668 non-null   object 
 7   ManufacturerName        1668 non-null   object 
 8   UNSPCCode               1646 non-null   float64
 9   UNSPSC                  1668 non-null   int64  
 10  Feat_Exact_SuppItemNum  1668 non-null   int64  
 11  Feat_Fuzzy_SuppName     1668 non-null   int64  
 12  Feat_Fuzzy_ItemDesc     1668 non-null   int64  
 13  Feat_Exact_ItemNum      1668 non-null   int64  
 14  Feat_Exact_UNSPSC       1668 non-null   

In [12]:
## Preview the first rows of the feature dataframe
raw_df.head()

Unnamed: 0,ITEMNUMBER,Number,ITEMDESCRIPTION,FullDescription,SUPPLIER_ITEM_NUMBER,ManufacturerItemNumber,SUPPLIERNAME,ManufacturerName,UNSPCCode,UNSPSC,Feat_Exact_SuppItemNum,Feat_Fuzzy_SuppName,Feat_Fuzzy_ItemDesc,Feat_Exact_ItemNum,Feat_Exact_UNSPSC,Match_Confidence,Match,Comments,Unnamed: 17,Feat_Fuzzy_SuppItemNum
0,58305122,305106,"Regular Bevel Needle 25Gx5/8"" (100 count)",NEEDLE HYPO REGULAR BEVEL 30GX 0.5IN 100/PK 10...,305122,305106,BECTON DICKINSON CONSUMER,BECTON DICKINSON,42142523.0,42142523,0,100,77,0,1,02 Med,No,,,0
1,683208,A5000-1,"Lisco Nonsterile Sponge 4""x4""",APPLICATOR WOODEN NO TIP 6IN NONSTERILE 1000EA...,3208,A5000-1,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42141502,0,100,53,0,0,03 Low,No,,,0
2,682733,2733,"Curity Nonsterile Gauze Sponge 4""x4""",GAUZE SPONGE 4X4IN 16PLY 2000/CS,2733-,2733-,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42311512,1,100,55,0,1,01 High,Yes,,,1
3,686132,8043,"CURITY Sterile Gauze Pad, 3""x3"", 12 ply",GAUZE SPONGE CURITY 3X3 4PLY LF STER 2EA/PK 25...,6132,8043,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42311512,0,100,57,0,1,03 Low,No,,,0
4,58309642,309642,Luer-Lok Syringe with Detachable PrecisionGlid...,SYRINGE LUER LOK 10ML 21G 1IN 100/PK 4PK/CS,309642,309642,BECTON DICKINSON CONSUMER,BECTON DICKINSON,42142609.0,42142609,1,100,63,0,1,01 High,Yes,,,1


In [13]:
## Sanity check: dimensions of the model inputs
X.shape

(1668, 6)

In [14]:
## Sanity check: dimensions of the target (should agree with X on the first axis)
y.shape

(1668,)

## Train Model

In [15]:
## (name, estimator) pairs in the form VotingClassifier expects
estimators_lst = [
    ('sgd_clf', sgd_clf),
    ('tree_clf', tree_clf),
    ('rnd_clf', rnd_clf),
    ('svm_clf', svm_clf),
    ('ada_clf', ada_clf),
    ('gbt_clf', gbt_clf),
    ('xgb_clf', xgb_clf),
]
## Parallel views of the same pairs: labels only / estimator objects only
estimators_str = [label for label, _ in estimators_lst]
estimators = [clf for _, clf in estimators_lst]

In [16]:
## Baseline ensemble: majority-rule ('hard') vote over the base classifiers
voting_clf = VotingClassifier(estimators=estimators_lst, voting='hard')

In [17]:
## Fit the hard-voting ensemble on all of X / y
## NOTE(review): no train/test split is visible in this notebook; the same X / y
## are reused for the cross-validated searches below
voting_clf.fit(X, y)

## Visualize Model

In [18]:
## Verify a model was created
## NOTE(review): this calls the pre-loaded tree_clf, not the voting_clf fitted above,
## so it does not exercise the ensemble. Confirm whether voting_clf.predict(X[0:4]) was intended
tree_clf.predict(X[0:4])

array([0, 0, 1, 0])

In [19]:
## Verify a model was created
## NOTE(review): this also inspects tree_clf rather than voting_clf. Presumably kept because
## a hard-voting VotingClassifier does not expose predict_proba; confirm intent
tree_clf.predict_proba(X[0:4])

array([[1.        , 0.        ],
       [1.        , 0.        ],
       [0.00664452, 0.99335548],
       [1.        , 0.        ]])

## Fine-tune Model (e.g. Grid Search)

In [21]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Grid Search

In [22]:
## Grid search space: the only tuned hyperparameter is the voting scheme
gs_param_grid = [{'voting': ['hard', 'soft']}]

In [23]:
## Unfitted ensemble for the grid search; `voting` is left unset because the search supplies it
gs_voting_clf = VotingClassifier(estimators=estimators_lst)

In [24]:
## 5-fold cross-validated grid search scored on F1; train-fold scores are kept as well
grid_search = GridSearchCV(gs_voting_clf, gs_param_grid, cv=5, scoring='f1', return_train_score=True)

In [25]:
## Run the grid search on the full X / y
grid_search.fit(X, y)

In [26]:
## Parameter combination selected by the grid search
grid_search.best_params_

{'voting': 'hard'}

In [27]:
## Display the estimator selected by the grid search
grid_search.best_estimator_

In [29]:
## Keep a handle on the grid search's best estimator
gs_bst_voting_clf = grid_search.best_estimator_

Random Search

In [30]:
## Randomized search space: same two voting schemes as the grid search
rs_param_grid = [{'voting': ['hard', 'soft']}]

In [31]:
## Separate unfitted ensemble intended for the randomized search; `voting` is left unset
rs_voting_clf = VotingClassifier(estimators=estimators_lst)

In [33]:
## 5-fold cross-validated randomized search scored on F1; train-fold scores are kept as well.
## Uses rs_voting_clf, the estimator created for this search (the grid-search
## estimator gs_voting_clf was previously passed here by mistake).
## NOTE(review): rs_param_grid holds only 2 list-valued candidates, so n_iter=10000 is far
## larger than the search space. Consider lowering it or sampling from distributions.
rand_search = RandomizedSearchCV(rs_voting_clf, rs_param_grid, n_iter=10000, cv=5, scoring='f1', return_train_score=True)

In [35]:
## Run the randomized search on the full X / y
rand_search.fit(X, y)

In [36]:
## Parameter combination selected by the randomized search
rand_search.best_params_

{'voting': 'hard'}

In [37]:
## Display the estimator selected by the randomized search
rand_search.best_estimator_

In [38]:
## Keep a handle on the randomized search's best estimator; this is the object saved below
rs_bst_voting_clf = rand_search.best_estimator_

## Save Model to Modeling Environment

In [39]:
## Persist the randomized-search best estimator to the output directory.
## The cell output is the list of file paths written by joblib.dump.
## Only rs_bst_voting_clf is saved here; gs_bst_voting_clf is not written out.
joblib.dump(rs_bst_voting_clf, oput_dir + mdl_oput_file_name)

['C:\\Users\\kehinde.salau\\Cardinal Health\\Enterprise Data Remediation - Documents\\General\\01_Projects\\23_Kinaxis\\Data\\Output\\ML\\L420_mdl_2022-08-05.pkl']