# Product Matching
## Level 4: Model Training Script

### Notes:
1. Train a Support Vector Machine model to classify product matches
2. Optimize trained model via hyperparameter tuning

### References: 

## Notebook Config

In [1]:
# Display settings
## Reload the autoreload extension and switch it to mode 2, so edits to imported
## modules are picked up without restarting the kernel; render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Package Import & Initialization

In [2]:
# Import packages

import pandas as pd                  #For data manipulation and bgq --> pandas conversion
import numpy as np                   #For scientific computation
import os                            #For work with native operating system and directories
from pathlib import Path             #For working with file paths and directories
import warnings                      #To tweak warning options
import datetime as dt                #For date objects and implementations
from sklearn.pipeline import Pipeline       #For linking various data ETL steps together
from sklearn.preprocessing import StandardScaler, PolynomialFeatures  #For standardizing/transforming numeric variables
from sklearn.svm import SVC          #For training support vector machines on data
import joblib                        #For saving objects, more efficient at serializing large numpy arrays than pickle module

In [3]:
# Show every column when a frame is displayed and render floats with four decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.4f}'.format)
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides scikit-learn fit-failure and convergence warnings.
warnings.simplefilter('ignore')

## Function Library

## Custom Transformers

## Set Directories

In [4]:
# Build the project root underneath the current user's home directory.
home = os.fspath(Path.home())
_proj_parts = (
    'Cardinal Health',
    'Enterprise Data Remediation - Documents',
    'General',
    '01_Projects',
    '23_Kinaxis',
)
proj_path = os.path.join(home, *_proj_parts)

In [5]:
# Input and output folders, built with os.path.join instead of hard-coded '\\'
# so the paths are valid on any operating system.
# The empty last component makes each path end with the platform separator, so
# code that appends a file name by plain string concatenation still works.
# On Windows the resulting strings are identical to the previous ones.
main_dir = os.path.join(proj_path, '')
iput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')
oput_dir = os.path.join(main_dir, 'Data', 'Output', 'ML', '')

## Set Output File Names

In [6]:
## Output file name: model pickle stamped with today's date (YYYY-MM-DD)
mdl_oput_file_name = f"L403_mdl_{dt.datetime.today():%Y-%m-%d}.pkl"

## Load Data

In [7]:
# Pickled frame from the input folder; in the cells shown here it is only inspected.
# Unpickling executes code stored in the file, so only load files from a trusted source.
raw_df = pd.read_pickle(f"{iput_dir}L300_feat_df_2022-07-15.pkl")

In [8]:
# Model inputs: this object is later passed through the scaler and used to fit the SVMs.
# joblib.load unpickles the file, so only load files from a trusted source.
X = joblib.load(f"{iput_dir}L300_prd_df_2022-07-15.pkl")

In [9]:
# Target labels: passed as the second argument to every fit() call below.
# joblib.load unpickles the file, so only load files from a trusted source.
y = joblib.load(f"{iput_dir}L300_tgt_df_2022-07-15.pkl")

In [10]:
# Scaler object saved to disk earlier; used below to transform X before training.
# joblib.load unpickles the file, so only load files from a trusted source.
X_Scaler = joblib.load(f"{iput_dir}L300_sclr_obj_2022-07-15.pkl")

## High-level Data Inspection

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
raw_df.info()

In [None]:
# Preview the first rows of the loaded frame.
raw_df.head()

In [None]:
# Dimensions of the model inputs.
X.shape

In [None]:
# Dimensions of the target; the row count should agree with X.shape.
y.shape

## Train Model

In [11]:
# Apply the scaler loaded from disk to X. Only transform() is called, so the
# scaler is not refit here -- presumably it was fitted in an earlier step; TODO confirm.
X_scld = X_Scaler.transform(X)

In [12]:
# Baseline classifier: degree-3 polynomial-kernel SVM with probability estimates enabled.
# random_state seeds the internal data shuffling used for the probability estimates,
# so predict_proba output is reproducible between runs.
svm_clf = SVC(kernel="poly", degree=3, coef0=1, C=5, probability=True, random_state=42)
svm_clf.fit(X_scld, y)

## Visualize Model

In [14]:
## Smoke test: predict on the first four scaled rows to confirm the model is fitted
svm_clf.predict(X_scld[:4])

array([0, 0, 1, 0])

## Fine-tune Model (e.g. Grid Search)

In [15]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Grid Search

In [17]:
# Hyperparameter grid for the exhaustive search.
# coef0 is not searched; it is fixed on the estimator passed to the search.
gs_param_grid = [{
    # Fixed typo: 'lnear' is not a valid SVC kernel, so the linear kernel was never evaluated.
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3],  # polynomial degree; only used by the 'poly' kernel
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],  # kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    'C': [0.1, 1, 10, 100, 1000]  # regularization parameter (larger C = weaker regularization)
}]

In [18]:
# Base estimator for the grid search; kernel, degree, gamma and C come from the grid.
# random_state seeds the shuffling behind the probability estimates, so the refit
# best estimator gives reproducible predict_proba output.
gs_svm_clf = SVC(coef0=1, probability=True, random_state=42)

In [19]:
# Exhaustive search over the grid with 5-fold cross-validation, ranked by F1;
# training-fold scores are recorded alongside the validation scores.
grid_search = GridSearchCV(
    estimator=gs_svm_clf,
    param_grid=gs_param_grid,
    scoring='f1',
    cv=5,
    return_train_score=True,
)

In [20]:
# Run the grid search on the scaled inputs; this is the expensive step of the section.
grid_search.fit(X_scld, y)

In [21]:
# Parameter combination with the best mean cross-validated F1 score.
grid_search.best_params_

{'C': 0.1, 'degree': 2, 'gamma': 1, 'kernel': 'poly'}

In [22]:
# Display the estimator configured with the winning parameters.
grid_search.best_estimator_

In [23]:
# Keep a handle on the best model found by the grid search.
# The estimator is an SVC, so it is exposed under an "svm" name; the original
# "sgd" name is kept as an alias so existing references keep working.
gs_bst_svm_clf = grid_search.best_estimator_
gs_bst_sgd_clf = gs_bst_svm_clf

Random Search

In [31]:
# Candidate values sampled by the randomized search.
# coef0 is not searched; it is fixed on the estimator passed to the search.
rs_param_grid = [{
    # Fixed typo: 'lnear' is not a valid SVC kernel, so the linear kernel was never evaluated.
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3],  # polynomial degree; only used by the 'poly' kernel
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],  # kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    'C': [0.1, 1, 10, 100, 1000]  # regularization parameter (larger C = weaker regularization)
}]

In [32]:
# Base estimator for the randomized search; its refit best estimator is the model
# saved at the end of the notebook.
# random_state seeds the shuffling behind the probability estimates, so the saved
# model's predict_proba output is reproducible between training runs.
rs_svm_clf = SVC(coef0=1, probability=True, random_state=42)

In [33]:
# Randomized search with 5-fold cross-validation, ranked by F1.
# random_state fixes the order in which candidates are drawn, so the run (and the
# winner chosen among tied candidates) is reproducible.
# NOTE(review): every parameter is a finite list (4 * 3 * 5 * 5 = 300 combinations),
# so scikit-learn samples without replacement and caps n_iter at the grid size.
# n_iter=10000 therefore evaluates all 300 candidates, the same set as the grid search.
# It is left unchanged to preserve the search; lower it below 300 for a true random subset.
rand_search = RandomizedSearchCV(
    rs_svm_clf,
    rs_param_grid,
    n_iter=10000,
    cv=5,
    scoring='f1',
    return_train_score=True,
    random_state=42,
)

In [34]:
# Run the randomized search on the scaled inputs.
rand_search.fit(X_scld, y)

In [35]:
# Parameter combination with the best mean cross-validated F1 score.
rand_search.best_params_

{'kernel': 'poly', 'gamma': 1, 'degree': 2, 'C': 0.1}

In [36]:
# Best model found by the randomized search; this is the object saved to disk below.
rs_bst_svm_clf = rand_search.best_estimator_

## Save Model to Modeling Environment

In [37]:
# Persist the randomized-search winner under the dated file name in the output folder.
# The grid-search estimator is not saved.
joblib.dump(rs_bst_svm_clf, f"{oput_dir}{mdl_oput_file_name}")

['C:\\Users\\kehinde.salau\\Cardinal Health\\Enterprise Data Remediation - Documents\\General\\01_Projects\\23_Kinaxis\\Data\\Output\\ML\\L403_mdl_2022-08-03.pkl']