# Product Matching
## Level 4: Model Training Script

### Notes:
1. Train an XGBoost model to classify product matches
2. Optimize trained model via hyperparameter tuning

### References: 

## Notebook Config

In [1]:
# Display settings
## Auto reload modules & inline plots:
##   - (re)load the IPython autoreload extension and switch it to mode 2
##   - have matplotlib figures rendered inline in the notebook output
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Package Import & Initialization

In [2]:
# Import packages

import pandas as pd                  #For data manipulation and bgq --> pandas conversion
import numpy as np                   #For scientific computation
import os                            #For work with native operating system and directories
from pathlib import Path             #For working with file paths and directories
import warnings                      #To tweak warning options
import datetime as dt                #For date objects and implemetations
import xgboost                       #For training an extreme gradient boosting model on data
from sklearn.model_selection import train_test_split          #For use in model validation for early stopping
from sklearn.metrics import log_loss                #For use in model validation for early stopping
import joblib                        #For saving objects, more efficient at serializing large numpy arrays than pickle module

In [3]:
# Notebook-wide output behaviour:
#   - never truncate the column list when a DataFrame is displayed
#   - show floats with four decimal places
#   - silence every warning for the rest of the session
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:.4f}'.format)
warnings.filterwarnings(action='ignore')

## Function Library

## Custom Transformers

## Set Directories

In [4]:
# Locate the project folder underneath the current user's home directory.
home = str(Path.home())
proj_subdirs = (
    'Cardinal Health',
    'Enterprise Data Remediation - Documents',
    'General',
    '01_Projects',
    '23_Kinaxis',
)
proj_path = os.path.join(home, *proj_subdirs)

In [5]:
# Directory prefixes for reading inputs and writing outputs.
# Built with os.path.join so the separator matches the host OS (the strings are
# unchanged on Windows). The empty last component keeps a trailing separator,
# because later cells append file names with plain string concatenation.
main_dir = os.path.join(proj_path, '')
iput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')
oput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')

## Set Output File Names

In [6]:
## Output file name: "L406_mdl_" + today's date (YYYY-MM-DD) + ".pkl"
run_date = dt.datetime.today().strftime('%Y-%m-%d')
mdl_oput_file_name = f"L406_mdl_{run_date}.pkl"

## Load Data

In [7]:
# Load the pickled feature DataFrame (file name carries a fixed 2022-07-15 date stamp).
raw_df = pd.read_pickle(f"{iput_dir}L300_feat_df_2022-07-15.pkl")

In [8]:
# Load the model inputs X with joblib (file name carries a fixed 2022-07-15 date stamp).
X = joblib.load(f"{iput_dir}L300_prd_df_2022-07-15.pkl")

In [9]:
# Load the target y with joblib (file name carries a fixed 2022-07-15 date stamp).
y = joblib.load(f"{iput_dir}L300_tgt_df_2022-07-15.pkl")

## High-level Data Inspection

In [10]:
# Summarise raw_df: column names, non-null counts and dtypes.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ITEMNUMBER              1668 non-null   object 
 1   Number                  1668 non-null   object 
 2   ITEMDESCRIPTION         1668 non-null   object 
 3   FullDescription         1668 non-null   object 
 4   SUPPLIER_ITEM_NUMBER    1668 non-null   object 
 5   ManufacturerItemNumber  1668 non-null   object 
 6   SUPPLIERNAME            1668 non-null   object 
 7   ManufacturerName        1668 non-null   object 
 8   UNSPCCode               1646 non-null   float64
 9   UNSPSC                  1668 non-null   int64  
 10  Feat_Exact_SuppItemNum  1668 non-null   int64  
 11  Feat_Fuzzy_SuppName     1668 non-null   int64  
 12  Feat_Fuzzy_ItemDesc     1668 non-null   int64  
 13  Feat_Exact_ItemNum      1668 non-null   int64  
 14  Feat_Exact_UNSPSC       1668 non-null   

In [11]:
# Preview the first rows of raw_df.
raw_df.head()

Unnamed: 0,ITEMNUMBER,Number,ITEMDESCRIPTION,FullDescription,SUPPLIER_ITEM_NUMBER,ManufacturerItemNumber,SUPPLIERNAME,ManufacturerName,UNSPCCode,UNSPSC,Feat_Exact_SuppItemNum,Feat_Fuzzy_SuppName,Feat_Fuzzy_ItemDesc,Feat_Exact_ItemNum,Feat_Exact_UNSPSC,Match_Confidence,Match,Comments,Unnamed: 17,Feat_Fuzzy_SuppItemNum
0,58305122,305106,"Regular Bevel Needle 25Gx5/8"" (100 count)",NEEDLE HYPO REGULAR BEVEL 30GX 0.5IN 100/PK 10...,305122,305106,BECTON DICKINSON CONSUMER,BECTON DICKINSON,42142523.0,42142523,0,100,77,0,1,02 Med,No,,,0
1,683208,A5000-1,"Lisco Nonsterile Sponge 4""x4""",APPLICATOR WOODEN NO TIP 6IN NONSTERILE 1000EA...,3208,A5000-1,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42141502,0,100,53,0,0,03 Low,No,,,0
2,682733,2733,"Curity Nonsterile Gauze Sponge 4""x4""",GAUZE SPONGE 4X4IN 16PLY 2000/CS,2733-,2733-,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42311512,1,100,55,0,1,01 High,Yes,,,1
3,686132,8043,"CURITY Sterile Gauze Pad, 3""x3"", 12 ply",GAUZE SPONGE CURITY 3X3 4PLY LF STER 2EA/PK 25...,6132,8043,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42311512,0,100,57,0,1,03 Low,No,,,0
4,58309642,309642,Luer-Lok Syringe with Detachable PrecisionGlid...,SYRINGE LUER LOK 10ML 21G 1IN 100/PK 4PK/CS,309642,309642,BECTON DICKINSON CONSUMER,BECTON DICKINSON,42142609.0,42142609,1,100,63,0,1,01 High,Yes,,,1


In [12]:
# Dimensions of X, the inputs passed to the classifier below.
X.shape

(1668, 6)

In [13]:
# Dimensions of y, the target passed to the classifier below.
y.shape

(1668,)

## Train Model

In [14]:
# Baseline model: XGBoost classifier created with no explicit hyperparameters
# and fit on every row of X and y (no hold-out split is made in this cell).
xgb_clf = xgboost.XGBClassifier()
xgb_clf.fit(X, y)

## Visualize Model

In [15]:
# Print each feature's importance score next to its name.
# Names default to raw_df's "Feat*" columns, but when X itself carries string
# column labels those are used instead: the model was fit on X, so its labels
# are guaranteed to be in the same order as feature_importances_.
feat_names = list(raw_df.filter(regex='^Feat', axis="columns").columns)
x_cols = getattr(X, "columns", None)
if x_cols is not None and all(isinstance(col, str) for col in x_cols):
    feat_names = list(x_cols)

importances = xgb_clf.feature_importances_

# zip() would silently drop entries on a length mismatch and mislabel scores.
if len(feat_names) != len(importances):
    raise ValueError(
        "Found %d feature names but %d importance scores"
        % (len(feat_names), len(importances))
    )

for name, score in zip(feat_names, importances):
    print(name, score)

Feat_Exact_SuppItemNum 0.023579983
Feat_Fuzzy_SuppName 0.0034263292
Feat_Fuzzy_ItemDesc 0.010998548
Feat_Exact_ItemNum 0.03276333
Feat_Exact_UNSPSC 0.008576165
Feat_Fuzzy_SuppItemNum 0.92065567


In [16]:
## Verify a model was created: predict class labels for the first four rows of X
xgb_clf.predict(X[0:4])

array([0, 0, 1, 0])

In [17]:
## Verify a model was created: per-class probability estimates for the same first four rows of X
xgb_clf.predict_proba(X[0:4])

array([[9.9996221e-01, 3.7801838e-05],
       [9.9981195e-01, 1.8807320e-04],
       [2.0815730e-03, 9.9791843e-01],
       [9.9980348e-01, 1.9651926e-04]], dtype=float32)

## Fine-tune Model (e.g. Grid Search)

In [27]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

#### Early Stopping Method #1 (working as intended)

In [19]:
# Hold out 20% of the rows for validation (fixed seed so the split is repeatable).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=49
)

# Fit a default classifier, evaluating on the validation split each round and
# requesting early stopping with a patience of 5 rounds.
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# XGBClassifier constructor rather than on fit() - confirm against the
# installed xgboost version before re-running on a fresh environment.
val_sets = [(X_val, y_val)]
xgb_best_clf = xgboost.XGBClassifier()
xgb_best_clf.fit(X_train, y_train, eval_set=val_sets, early_stopping_rounds=5)

[0]	validation_0-logloss:0.44867
[1]	validation_0-logloss:0.31350
[2]	validation_0-logloss:0.22762
[3]	validation_0-logloss:0.16518
[4]	validation_0-logloss:0.12199
[5]	validation_0-logloss:0.09179
[6]	validation_0-logloss:0.07013
[7]	validation_0-logloss:0.05478
[8]	validation_0-logloss:0.04318
[9]	validation_0-logloss:0.03503
[10]	validation_0-logloss:0.02941
[11]	validation_0-logloss:0.02533
[12]	validation_0-logloss:0.02236
[13]	validation_0-logloss:0.02003
[14]	validation_0-logloss:0.01833
[15]	validation_0-logloss:0.01634
[16]	validation_0-logloss:0.01538
[17]	validation_0-logloss:0.01467
[18]	validation_0-logloss:0.01387
[19]	validation_0-logloss:0.01353
[20]	validation_0-logloss:0.01306
[21]	validation_0-logloss:0.01276
[22]	validation_0-logloss:0.01255
[23]	validation_0-logloss:0.01211
[24]	validation_0-logloss:0.01211
[25]	validation_0-logloss:0.01170
[26]	validation_0-logloss:0.01178
[27]	validation_0-logloss:0.01176
[28]	validation_0-logloss:0.01179
[29]	validation_0-loglos

In [20]:
# Sanity check: class labels from the early-stopped model for the first four rows of X.
xgb_best_clf.predict(X[0:4])

array([0, 0, 1, 0])

#### Grid Search

In [43]:
# Search space for the grid search: 4 tree counts x 3 learning rates.
# Not searched (left disabled):
#   'max_depth': list(range(1, np.floor(X.shape[1]).astype(int) + 1))
gs_param_grid = [
    dict(
        n_estimators=[50, 94, 100, 200],
        learning_rate=[0.1, 0.5, 1],
    )
]

In [44]:
# Base estimator handed to the grid search; max_depth is pinned to 6 here.
gs_xgb_clf = xgboost.XGBClassifier(max_depth=6)

In [45]:
# Exhaustive search over gs_param_grid: 5-fold cross-validation scored by F1,
# with training-fold scores recorded alongside the validation scores.
grid_search = GridSearchCV(
    estimator=gs_xgb_clf,
    param_grid=gs_param_grid,
    scoring='f1',
    cv=5,
    return_train_score=True,
)

In [46]:
# Run the grid search on the full X and y.
grid_search.fit(X, y)

In [47]:
# Parameter combination the grid search selected as best.
grid_search.best_params_

{'learning_rate': 0.1, 'n_estimators': 94}

In [48]:
# Display the estimator the grid search selected as best.
grid_search.best_estimator_

In [49]:
# Keep a handle on the grid-search winner.
# NOTE(review): the name says "gbt" while the other model variables in this
# notebook say "xgb" - confirm nothing depends on the name before renaming.
gs_bst_gbt_clf = grid_search.best_estimator_

#### Random Search

In [50]:
# Search space for the randomized search: 4 tree counts x 3 learning rates.
# Not searched (left disabled):
#   'max_depth': list(range(1, np.floor(X.shape[1]).astype(int) + 1))
rs_search_space = dict(
    n_estimators=[50, 94, 100, 200],
    learning_rate=[0.1, 0.5, 1],
)
rs_param_grid = [rs_search_space]

In [51]:
# Base estimator handed to the randomized search; max_depth is pinned to 6 here.
rs_xgb_clf = xgboost.XGBClassifier(max_depth=6)

In [52]:
# Randomized search over rs_param_grid: 5-fold cross-validation scored by F1,
# with training-fold scores recorded alongside the validation scores.
# random_state is fixed (same seed as the train/validation split) so the order
# in which candidates are drawn - and therefore tie-breaking between equally
# scored candidates - is repeatable from run to run.
# NOTE(review): per scikit-learn's documented behaviour for list-only search
# spaces, candidates are drawn without replacement and capped at the number of
# combinations, so n_iter=10000 acts as an upper bound rather than the actual
# number of fits - confirm against the installed scikit-learn version.
rand_search = RandomizedSearchCV(
    rs_xgb_clf,
    rs_param_grid,
    n_iter=10000,
    cv=5,
    scoring='f1',
    return_train_score=True,
    random_state=49,
)

In [53]:
# Run the randomized search on the full X and y.
rand_search.fit(X, y)

In [54]:
# Parameter combination the randomized search selected as best.
rand_search.best_params_

{'n_estimators': 94, 'learning_rate': 0.1}

In [55]:
# Keep a handle on the randomized-search winner; this is the object saved to disk below.
rs_bst_xgb_clf = rand_search.best_estimator_

## Save Model to Modeling Environment

In [56]:
# Persist the randomized-search winner to the output directory with joblib.
mdl_oput_path = oput_dir + mdl_oput_file_name
joblib.dump(rs_bst_xgb_clf, mdl_oput_path)

['C:\\Users\\kehinde.salau\\Cardinal Health\\Enterprise Data Remediation - Documents\\General\\01_Projects\\23_Kinaxis\\Data\\Output\\ML\\L406_mdl_2022-08-05.pkl']