# Product Matching
## Level 4: Model Training Script

### Notes:
1. Train a Random Forest model to classify product matches
2. Optimize trained model via hyperparameter tuning

### References: 

## Notebook Config

In [2]:
# Display settings
## Auto reload modules & inline plots
## - (re)load the autoreload extension, then use mode 2 so imported modules are
##   reloaded before each cell runs (edits to .py files apply without a restart)
## - render matplotlib figures inline, below the cell that produces them
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Package Import & Initialization

In [3]:
# Import packages

import pandas as pd                  #For data manipulation and bgq --> pandas conversion
import numpy as np                   #For scientific computation
import os                            #For work with native operating system and directories
from pathlib import Path             #For working with file paths and directories
import warnings                      #To tweak warning options
import datetime as dt                #For date objects and implemetations
from sklearn.ensemble import RandomForestClassifier       #For training ensemble decision trees on data
import joblib                        #For saving objects, more efficient at serializing large numpy arrays than pickle module

In [4]:
# Notebook-wide display preferences: show every column of wide frames and
# format floats with four decimal places.
pd.set_option(
    'display.max_columns', None,
    'display.float_format', '{:.4f}'.format,
)
# Silence all warnings for the rest of the session.
warnings.simplefilter('ignore')

## Function Library

## Custom Transformers

## Set Directories

In [5]:
# Locate the project folder underneath the current user's home directory.
home = str(Path.home())
proj_path = os.path.join(
    home,
    'Cardinal Health',
    'Enterprise Data Remediation - Documents',
    'General',
    '01_Projects',
    '23_Kinaxis'
)

In [6]:
# Directory prefixes used for reading inputs and writing outputs.
# They are built with os.path.join so the separator matches the host OS; the
# previous hard-coded '\\' only produced valid paths on Windows.
# The final '' argument keeps a trailing separator on every prefix, because the
# cells below form file paths by plain concatenation (e.g. iput_dir + "<file>.pkl").
main_dir = os.path.join(proj_path, '')
iput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')
oput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')

## Set Output File Names

In [7]:
## Output file name: fixed prefix + today's date (YYYY-MM-DD) + pickle extension
mdl_oput_file_name = ''.join(("L402_mdl_", dt.datetime.today().strftime('%Y-%m-%d'), '.pkl'))

## Load Data

In [8]:
# Feature table from the earlier L300 step; used below for inspection and to
# obtain the 'Feat*' column names.
# NOTE: unpickling executes code from the file, so only load pickles from a trusted location.
raw_df = pd.read_pickle(iput_dir + "L300_feat_df_2022-07-15.pkl")

In [9]:
# Feature matrix passed to the classifiers below (presumably the prepared
# 'Feat*' columns of raw_df — TODO confirm against the L300 script).
X = joblib.load(iput_dir + "L300_prd_df_2022-07-15.pkl")

In [10]:
# Target labels passed to fit() together with X.
y = joblib.load(iput_dir + "L300_tgt_df_2022-07-15.pkl")

## High-level Data Inspection

In [11]:
# Column names, non-null counts and dtypes of the feature table.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ITEMNUMBER              1668 non-null   object 
 1   Number                  1668 non-null   object 
 2   ITEMDESCRIPTION         1668 non-null   object 
 3   FullDescription         1668 non-null   object 
 4   SUPPLIER_ITEM_NUMBER    1668 non-null   object 
 5   ManufacturerItemNumber  1668 non-null   object 
 6   SUPPLIERNAME            1668 non-null   object 
 7   ManufacturerName        1668 non-null   object 
 8   UNSPCCode               1646 non-null   float64
 9   UNSPSC                  1668 non-null   int64  
 10  Feat_Exact_SuppItemNum  1668 non-null   int64  
 11  Feat_Fuzzy_SuppName     1668 non-null   int64  
 12  Feat_Fuzzy_ItemDesc     1668 non-null   int64  
 13  Feat_Exact_ItemNum      1668 non-null   int64  
 14  Feat_Exact_UNSPSC       1668 non-null   

In [12]:
# Preview the first rows of the feature table.
raw_df.head()

Unnamed: 0,ITEMNUMBER,Number,ITEMDESCRIPTION,FullDescription,SUPPLIER_ITEM_NUMBER,ManufacturerItemNumber,SUPPLIERNAME,ManufacturerName,UNSPCCode,UNSPSC,Feat_Exact_SuppItemNum,Feat_Fuzzy_SuppName,Feat_Fuzzy_ItemDesc,Feat_Exact_ItemNum,Feat_Exact_UNSPSC,Match_Confidence,Match,Comments,Unnamed: 17,Feat_Fuzzy_SuppItemNum
0,58305122,305106,"Regular Bevel Needle 25Gx5/8"" (100 count)",NEEDLE HYPO REGULAR BEVEL 30GX 0.5IN 100/PK 10...,305122,305106,BECTON DICKINSON CONSUMER,BECTON DICKINSON,42142523.0,42142523,0,100,77,0,1,02 Med,No,,,0
1,683208,A5000-1,"Lisco Nonsterile Sponge 4""x4""",APPLICATOR WOODEN NO TIP 6IN NONSTERILE 1000EA...,3208,A5000-1,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42141502,0,100,53,0,0,03 Low,No,,,0
2,682733,2733,"Curity Nonsterile Gauze Sponge 4""x4""",GAUZE SPONGE 4X4IN 16PLY 2000/CS,2733-,2733-,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42311512,1,100,55,0,1,01 High,Yes,,,1
3,686132,8043,"CURITY Sterile Gauze Pad, 3""x3"", 12 ply",GAUZE SPONGE CURITY 3X3 4PLY LF STER 2EA/PK 25...,6132,8043,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42311512,0,100,57,0,1,03 Low,No,,,0
4,58309642,309642,Luer-Lok Syringe with Detachable PrecisionGlid...,SYRINGE LUER LOK 10ML 21G 1IN 100/PK 4PK/CS,309642,309642,BECTON DICKINSON CONSUMER,BECTON DICKINSON,42142609.0,42142609,1,100,63,0,1,01 High,Yes,,,1


In [13]:
# (rows, features) of the training matrix.
X.shape

(1668, 6)

In [14]:
# Number of target labels; should equal the number of rows in X.
y.shape

(1668,)

## Train Model

In [15]:
# Baseline model: a forest of 500 depth-2 trees with a fixed seed, trained on all
# of X. Out-of-bag scoring is switched on so oob_score_ and
# oob_decision_function_ are available after fitting.
rnd_clf = RandomForestClassifier(
    oob_score=True,
    random_state=42,
    n_jobs=-1,
    max_depth=2,
    n_estimators=500,
)
rnd_clf.fit(X, y)

## Visualize Model

In [16]:
# Print each feature name next to its importance in the fitted baseline forest.
# The names are read from raw_df's 'Feat*' columns while the model was fitted on
# X, so the pairing assumes both hold the features in the same order
# — TODO confirm against the script that produced X.
feat_names = raw_df.filter(regex='^Feat', axis="columns").columns
feat_scores = rnd_clf.feature_importances_

# zip() stops at the shorter input, so a mismatch would silently drop or
# mislabel features; fail loudly instead.
if len(feat_names) != len(feat_scores):
    raise ValueError(
        "Found {} 'Feat*' columns in raw_df but the model has {} feature importances".format(
            len(feat_names), len(feat_scores)
        )
    )

for name, score in zip(feat_names, feat_scores):
    print(name, score)

Feat_Exact_SuppItemNum 0.21174536659332738
Feat_Fuzzy_SuppName 0.10626780505654108
Feat_Fuzzy_ItemDesc 0.06477842455898056
Feat_Exact_ItemNum 0.19505127364201308
Feat_Exact_UNSPSC 0.0068061486246880085
Feat_Fuzzy_SuppItemNum 0.41535098152444977


In [17]:
## Sanity check: predicted class labels for the first four rows of X
rnd_clf.predict(X[0:4])

array([0, 0, 1, 0])

In [18]:
## Sanity check: per-class probabilities for the same four rows (one column per class)
rnd_clf.predict_proba(X[0:4])

array([[0.93967935, 0.06032065],
       [0.9461396 , 0.0538604 ],
       [0.14112746, 0.85887254],
       [0.93587306, 0.06412694]])

In [19]:
# Out-of-bag score of the baseline forest (available because oob_score=True).
rnd_clf.oob_score_

0.9724220623501199

In [20]:
# Out-of-bag class probability estimates, one row per training sample.
rnd_clf.oob_decision_function_

array([[0.93779265, 0.06220735],
       [0.94073516, 0.05926484],
       [0.1359902 , 0.8640098 ],
       ...,
       [0.26237935, 0.73762065],
       [0.93760738, 0.06239262],
       [0.35696678, 0.64303322]])

## Fine-tune Model (e.g. Grid Search)

In [21]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Grid Search

In [60]:
# Hyperparameter candidates for the exhaustive search.
# max_depth runs from 1 up to half the number of columns in X (rounded down).
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same split
# criterion, so one of them may be redundant — confirm before trimming.
gs_param_grid = [
    {
        'n_estimators': [10, 50, 100, 200, 500, 1000],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': list(range(1, X.shape[1] // 2 + 1)),
        # 'max_leaf_nodes': [None] + list(range(1, 111, 10)),
        # 'min_samples_split': [2, 3, 4, 5]
    }
]

In [61]:
# Unfitted base estimator for the grid search; the tuned hyperparameters are
# supplied by gs_param_grid, the rest (seed, parallelism, OOB scoring) are fixed here.
gs_rnd_clf = RandomForestClassifier(n_jobs=-1, random_state=42, oob_score=True)

In [62]:
# Exhaustive search over gs_param_grid with 5-fold cross-validation, ranked by
# F1; training-fold scores are recorded as well.
grid_search = GridSearchCV(gs_rnd_clf, gs_param_grid, cv=5, scoring='f1', return_train_score=True)

In [63]:
# Run the grid search (fits a forest for every parameter combination and fold).
grid_search.fit(X, y)

In [64]:
# Parameter combination with the best mean cross-validated F1.
grid_search.best_params_

{'criterion': 'gini', 'max_depth': 3, 'n_estimators': 500}

In [65]:
# Estimator refit with the best parameters found by the grid search.
grid_search.best_estimator_

In [66]:
# Best random forest found by the grid search, stored under a name consistent
# with the random-search result (rs_bst_rnd_clf).
gs_bst_rnd_clf = grid_search.best_estimator_
# The original name said 'sgd' although the model is a random forest; it is kept
# as an alias so any existing references continue to work.
gs_bst_sgd_clf = gs_bst_rnd_clf

Random Search

In [67]:
# Candidate values sampled by the randomized search.
# max_depth covers 1 .. (number of columns in X) // 2.
rs_param_grid = [
    {
        'n_estimators': [10, 50, 100, 200, 500, 1000],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': [*range(1, X.shape[1] // 2 + 1)],
        # 'max_leaf_nodes': [None] + list(range(1, 111, 10)),
        # 'min_samples_split': [2, 3, 4, 5]
    }
]

In [68]:
# Unfitted base estimator for the randomized search (same fixed settings as the
# grid-search estimator).
rs_rnd_clf = RandomForestClassifier(n_jobs=-1, random_state=42, oob_score=True)

In [69]:
# Randomized search over rs_param_grid with 5-fold cross-validation, ranked by
# F1; training-fold scores are recorded as well.
# random_state is fixed so the sampled candidates (and their order) are
# reproducible across runs, matching the seeded base estimator.
# NOTE(review): rs_param_grid contains only lists (6 x 3 x 3 = 54 combinations
# for the 6-column X of the recorded run), so scikit-learn samples without
# replacement and caps n_iter at the grid size. The warning about this is hidden
# by the notebook-wide warnings filter. With n_iter=10000 this search therefore
# evaluates the same candidates as the grid search — lower n_iter or use
# distributions if a genuinely random search is intended.
rand_search = RandomizedSearchCV(
    rs_rnd_clf,
    rs_param_grid,
    n_iter=10000,
    cv=5,
    scoring='f1',
    return_train_score=True,
    random_state=42,
)

In [71]:
# Run the randomized search on the full data set.
rand_search.fit(X, y)

In [72]:
# Parameter combination with the best mean cross-validated F1.
rand_search.best_params_

{'n_estimators': 500, 'max_depth': 3, 'criterion': 'gini'}

In [None]:
# Estimator refit with the best parameters found by the randomized search.
rand_search.best_estimator_

In [74]:
# Keep the tuned random forest from the randomized search; this is the model
# that gets saved below.
rs_bst_rnd_clf = rand_search.best_estimator_

## Save Model to Modeling Environment

In [75]:
# Persist the randomized-search model to the output directory under the dated
# file name; the grid-search estimator is not saved.
joblib.dump(rs_bst_rnd_clf, oput_dir + mdl_oput_file_name)

['C:\\Users\\kehinde.salau\\Cardinal Health\\Enterprise Data Remediation - Documents\\General\\01_Projects\\23_Kinaxis\\Data\\Output\\ML\\L402_mdl_2022-08-03.pkl']