# Product Matching
## Level 3: Model Feature Creation Script

### Notes:
1. Create all candidate predictor variables 

### References: 

## Notebook Config

In [1]:
# Display settings
## Auto reload modules & inline plots
## (Re)load the autoreload extension and use mode 2, so imported modules are
## re-imported automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
## Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Package Import & Initialization

In [2]:
# Import packages

import pandas as pd                  #For data manipulation and bgq --> pandas conversion
import numpy as np                   #For scientific computation
import os                            #For working with the native operating system and directories
from pathlib import Path             #For working with file paths and directories
import warnings                      #To tweak warning options
import datetime as dt                #For date objects and implementations
import joblib                        #For saving objects, more efficient at serializing large numpy arrays than pickle module
from sklearn.preprocessing import StandardScaler, PolynomialFeatures  #For standardizing/transforming numeric variables

In [3]:
# Notebook-wide display and warning behaviour.
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None
# Render floats with exactly four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

## Function Library

## Custom Transformers

## Set Directories

In [4]:
# Build the project root beneath the current user's home directory, so no
# user-specific absolute path is embedded in the notebook.
home = str(Path.home())
proj_path_parts = (
    'Cardinal Health',
    'Enterprise Data Remediation - Documents',
    'General',
    '01_Projects',
    '23_Kinaxis',
)
# proj_path stays a plain string because later cells concatenate onto it.
proj_path = os.path.join(home, *proj_path_parts)

In [5]:
# Derive the working directories from the project root.
# Joining with a trailing '' appends the platform's path separator, so each
# directory string still ends with a separator (later cells build file paths
# by plain string concatenation) without hard-coding the Windows '\\'.
main_dir = os.path.join(proj_path, '')
iput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')
# Outputs are written to the same folder the inputs are read from.
oput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')

## Set Output File Names

In [18]:
## Output file names
# Compute the date stamp once so all four artefacts carry the same date even
# if the cell runs across midnight. strftime() already returns a str, so no
# extra str() conversion is needed.
run_date = dt.datetime.today().strftime('%Y-%m-%d')
feat_oput_file_name = "L300_feat_df_" + run_date + '.pkl'
prd_oput_file_name  = "L300_prd_df_" + run_date + '.pkl'
tgt_oput_file_name  = "L300_tgt_df_" + run_date + '.pkl'
sclr_oput_file_name = "L300_sclr_obj_" + run_date + '.pkl'

## Load Data

In [7]:
# Load the pickled input frame from the input directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
trn_df_path = iput_dir + "L100_trn_df_2022-07-15.pkl"
raw_df = pd.read_pickle(trn_df_path)

## High-level Data Inspection

In [8]:
# Print a concise summary of the frame: index range, column names,
# non-null counts and dtypes.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ITEMNUMBER              1668 non-null   object 
 1   Number                  1668 non-null   object 
 2   ITEMDESCRIPTION         1668 non-null   object 
 3   FullDescription         1668 non-null   object 
 4   SUPPLIER_ITEM_NUMBER    1668 non-null   object 
 5   ManufacturerItemNumber  1668 non-null   object 
 6   SUPPLIERNAME            1668 non-null   object 
 7   ManufacturerName        1668 non-null   object 
 8   UNSPCCode               1646 non-null   float64
 9   UNSPSC                  1668 non-null   int64  
 10  Feat_1                  1668 non-null   int64  
 11  Feat_2                  1668 non-null   int64  
 12  Feat_3                  1668 non-null   int64  
 13  Feat_4                  1668 non-null   int64  
 14  Feat_5                  1668 non-null   

In [9]:
# Preview the first rows of the raw frame (head() defaults to five rows).
raw_df.head()

Unnamed: 0,ITEMNUMBER,Number,ITEMDESCRIPTION,FullDescription,SUPPLIER_ITEM_NUMBER,ManufacturerItemNumber,SUPPLIERNAME,ManufacturerName,UNSPCCode,UNSPSC,Feat_1,Feat_2,Feat_3,Feat_4,Feat_5,Match_Confidence,Match,Comments,Unnamed: 17,Feat_6
0,58305122,305106,"Regular Bevel Needle 25Gx5/8"" (100 count)",NEEDLE HYPO REGULAR BEVEL 30GX 0.5IN 100/PK 10...,305122,305106,BECTON DICKINSON CONSUMER,BECTON DICKINSON,42142523.0,42142523,0,100,77,0,1,02 Med,No,,,0
1,683208,A5000-1,"Lisco Nonsterile Sponge 4""x4""",APPLICATOR WOODEN NO TIP 6IN NONSTERILE 1000EA...,3208,A5000-1,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42141502,0,100,53,0,0,03 Low,No,,,0
2,682733,2733,"Curity Nonsterile Gauze Sponge 4""x4""",GAUZE SPONGE 4X4IN 16PLY 2000/CS,2733-,2733-,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42311512,1,100,55,0,1,01 High,Yes,,,1
3,686132,8043,"CURITY Sterile Gauze Pad, 3""x3"", 12 ply",GAUZE SPONGE CURITY 3X3 4PLY LF STER 2EA/PK 25...,6132,8043,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0,42311512,0,100,57,0,1,03 Low,No,,,0
4,58309642,309642,Luer-Lok Syringe with Detachable PrecisionGlid...,SYRINGE LUER LOK 10ML 21G 1IN 100/PK 4PK/CS,309642,309642,BECTON DICKINSON CONSUMER,BECTON DICKINSON,42142609.0,42142609,1,100,63,0,1,01 High,Yes,,,1


## Feature Creation

In [10]:
## Reformat certain column variables (e.g. UNSPSC)

In [11]:
## Custom Feature Creation
# Map the generic Feat_N column names to descriptive feature names.
feat_col_names = {
    'Feat_1': 'Feat_Exact_SuppItemNum',
    'Feat_2': 'Feat_Fuzzy_SuppName',
    'Feat_3': 'Feat_Fuzzy_ItemDesc',
    'Feat_4': 'Feat_Exact_ItemNum',
    'Feat_5': 'Feat_Exact_UNSPSC',
    'Feat_6': 'Feat_Fuzzy_SuppItemNum',
}
# rename() returns a new frame, so raw_df itself is left untouched.
raw_feat_df = raw_df.rename(columns=feat_col_names)

In [12]:
# Display the renamed frame (the rendered output is truncated for long frames).
raw_feat_df

Unnamed: 0,ITEMNUMBER,Number,ITEMDESCRIPTION,FullDescription,SUPPLIER_ITEM_NUMBER,ManufacturerItemNumber,SUPPLIERNAME,ManufacturerName,UNSPCCode,UNSPSC,Feat_Exact_SuppItemNum,Feat_Fuzzy_SuppName,Feat_Fuzzy_ItemDesc,Feat_Exact_ItemNum,Feat_Exact_UNSPSC,Match_Confidence,Match,Comments,Unnamed: 17,Feat_Fuzzy_SuppItemNum
0,58305122,305106,"Regular Bevel Needle 25Gx5/8"" (100 count)",NEEDLE HYPO REGULAR BEVEL 30GX 0.5IN 100/PK 10...,305122,305106,BECTON DICKINSON CONSUMER,BECTON DICKINSON,42142523.0000,42142523,0,100,77,0,1,02 Med,No,,,0
1,683208,A5000-1,"Lisco Nonsterile Sponge 4""x4""",APPLICATOR WOODEN NO TIP 6IN NONSTERILE 1000EA...,3208,A5000-1,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0000,42141502,0,100,53,0,0,03 Low,No,,,0
2,682733,2733,"Curity Nonsterile Gauze Sponge 4""x4""",GAUZE SPONGE 4X4IN 16PLY 2000/CS,2733-,2733-,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0000,42311512,1,100,55,0,1,01 High,Yes,,,1
3,686132,8043,"CURITY Sterile Gauze Pad, 3""x3"", 12 ply",GAUZE SPONGE CURITY 3X3 4PLY LF STER 2EA/PK 25...,6132,8043,CARDINAL HEALTH-PR,CARDINAL HEALTH,42311512.0000,42311512,0,100,57,0,1,03 Low,No,,,0
4,58309642,309642,Luer-Lok Syringe with Detachable PrecisionGlid...,SYRINGE LUER LOK 10ML 21G 1IN 100/PK 4PK/CS,309642,309642,BECTON DICKINSON CONSUMER,BECTON DICKINSON,42142609.0000,42142609,1,100,63,0,1,01 High,Yes,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1663,58309588,305270,Luer-Lok Syringe with Detachable PrecisionGlid...,SYRINGE 3ML W/DETACHABLE NEEDLE 1IN 25G 100/PK...,309588,305270,BECTON DICKINSON CONSUMER,BECTON DICKINSON,42142609.0000,42142609,0,100,71,0,1,02 Med,No,,,0
1664,55001203,55001203,AirLife Adult Vinyl Oxygen Mask 7',AirLife Adult Vinyl Oxygen Mask 7,001203,55001203,VYAIRE MEDICAL,RGH ENTERPRISE INC,42271708.0000,42271708,0,31,100,1,1,03 Low,Yes,,,1
1665,GL511,GL511,"Surgilast Tubular Elastic Dressing Retainer, S...",RETAINER DRESSINGTUBULAR ELASTIC,GL-511,GL511,INTEGRA LIFESCIENCES,INTEGRA LIFESCIENCES SALES LLC,42311543.0000,42311543,0,100,67,1,1,03 Low,Yes,,,1
1666,552D72PL65X,CFN66A,Protexis PI Classic Sterile Polyisoprene Powde...,TUBING SURGICAL SUCTION 6MM 6FT STERILE 45/BX=CS,2D72PL65X,N66A,CARDINAL HEALTH - MED,CARDINAL HEALTH,42132205.0000,42293515,0,100,50,0,0,03 Low,No,,,0


In [13]:
## Create Standard Scale copies 


## Predictor Target Split

In [14]:
# Predictors: keep every column whose name starts with "Feat" and expose the
# values as a NumPy array.
feat_only_df = raw_feat_df.filter(regex='^Feat', axis="columns")
X = feat_only_df.values

In [15]:
# Target: binary-encode the Match column (1 where it equals 'Yes', else 0),
# then pull it out as a Series that keeps the frame's index.
match_flag = np.where(raw_feat_df.Match == 'Yes', 1, 0)
y = raw_feat_df.assign(Match=match_flag).Match

## Transformers

In [17]:
# Fit a standard scaler on the predictor matrix; fit() returns the fitted
# estimator itself, so X_scaler ends up bound to the fitted scaler.
# NOTE(review): the scaler is fitted on every row of X - confirm that any
# downstream train/test split accounts for this.
X_scaler = StandardScaler()
X_scaler = X_scaler.fit(X)

## Save Data to Modeling Environment

In [19]:
# Persist the full feature frame as a pickle in the output directory.
feat_oput_path = oput_dir + feat_oput_file_name
raw_feat_df.to_pickle(feat_oput_path)

In [20]:
# Serialize the predictor array; joblib.dump returns the list of file paths
# written, which the cell echoes as its output.
joblib.dump(value=X, filename=oput_dir + prd_oput_file_name)

['C:\\Users\\kehinde.salau\\Cardinal Health\\Enterprise Data Remediation - Documents\\General\\01_Projects\\23_Kinaxis\\Data\\Output\\ML\\L300_prd_df_2022-07-15.pkl']

In [21]:
# Serialize the target Series; joblib.dump returns the list of file paths
# written, which the cell echoes as its output.
joblib.dump(value=y, filename=oput_dir + tgt_oput_file_name)

['C:\\Users\\kehinde.salau\\Cardinal Health\\Enterprise Data Remediation - Documents\\General\\01_Projects\\23_Kinaxis\\Data\\Output\\ML\\L300_tgt_df_2022-07-15.pkl']

In [22]:
# Serialize the fitted scaler object; joblib.dump returns the list of file
# paths written, which the cell echoes as its output.
joblib.dump(value=X_scaler, filename=oput_dir + sclr_oput_file_name)

['C:\\Users\\kehinde.salau\\Cardinal Health\\Enterprise Data Remediation - Documents\\General\\01_Projects\\23_Kinaxis\\Data\\Output\\ML\\L300_sclr_obj_2022-07-15.pkl']