## ONNX CONVERSION

Retrain the model on all available data and convert it to an ONNX file.

##### Timing 
We want to time how long this notebook takes to run. We are interested in both real (wall-clock) time and CPU time.

In [None]:
import time 

# Record the wall-clock and process-CPU clocks together at the start of the
# notebook; the timing summary in the last cell subtracts these values.
start_time, start_cpu_time = time.time(), time.process_time()

#### Set Up

In [None]:
import os

import numpy as np
from numpy.random import choice

import pyspark
from pyspark.sql import SparkSession

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.functions import collect_list, regexp_replace, lower
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.functions import year, month, dayofmonth
from functools import reduce

import datetime 
import pandas as pd
import time
import math

import matplotlib.pyplot as plt
import pickle

In [None]:
#Machine Learning Libraries 
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, classification_report, roc_auc_score, average_precision_score
from sklearn.utils.class_weight import compute_class_weight

In [None]:
#Install ONNX ML Tools
!pip install lightgbm onnxmltools onnx skl2onnx

In [None]:
import onnxmltools
import onnx
from onnxmltools.convert.common.data_types import FloatTensorType

#### Import Prepared Data

In [None]:
# Load the prepared modelling dataset as a Spark DataFrame. No format is passed,
# so Spark's default data source is used.
# NOTE(review): `spark` is not created anywhere in this notebook; presumably the
# session is injected by the notebook environment -- verify.
md = spark.read.load("/anaurosevic/cdn0_cards_affinity/model_data")

In [None]:
# Collect the whole Spark DataFrame onto the driver as a pandas DataFrame.
# NOTE(review): this requires the full dataset to fit in driver memory.
model_data = md.toPandas()

#### Scale Continuous Variables

In [None]:
# Every column name of the prepared dataset, in DataFrame order.
all_vars = [*model_data.columns]

In [None]:
# Columns whose names start with one of these prefixes are treated as
# non-continuous and are excluded from scaling further down.
prefixes = ["device_", "province_", "sess_channel_"]
prefix_tuple = tuple(prefixes)  # str.startswith accepts a tuple, not a list
filtered_list = [col for col in all_vars if col.startswith(prefix_tuple)]

In [None]:
all_vars = list(model_data.columns)

#Manually specify non-cont vars [majority are continuous]
key_vars = ['user_pseudo_id','product_code','postal_code']  # Primary keys
non_cont_vars = key_vars + filtered_list  # Primary keys + non-cont

# Everything not listed above is treated as continuous; sorted so the column
# order is deterministic (set difference alone has no defined order).
cont_vars = sorted(set(all_vars) - set(non_cont_vars))

In [None]:
# Standardise the continuous columns with a StandardScaler fitted on all rows.
# NOTE(review): nothing visible in this notebook persists `scale`, and only the
# LightGBM model is converted to ONNX -- confirm how inference-time inputs are
# meant to be scaled with the same statistics.
scale = StandardScaler()
scaled = scale.fit_transform(model_data[cont_vars])

# Reuse model_data's index: the column-wise pd.concat that follows aligns rows
# by index label, so a fresh 0..n-1 index would misalign rows (and introduce
# NaNs) if model_data ever carried a non-default index.
scaled_df = pd.DataFrame(scaled, columns=cont_vars, index=model_data.index)

In [None]:
# Replace the raw continuous columns with their scaled versions: keep the
# untouched columns first, then append the scaled ones side by side.
unscaled_part = model_data.drop(columns=cont_vars)
model_data_scaled = pd.concat([unscaled_part, scaled_df], axis=1)

In [None]:
# Drop the user identifier column before modelling.
md_final = model_data_scaled.drop(columns=['user_pseudo_id'])

#### Split X & Y

In [None]:
# product_code is the prediction target; every other column is a feature.
target_col = 'product_code'
X = md_final.drop(columns=[target_col])
y = md_final[target_col].astype('category')

In [None]:
#Convert X to float32
# The ONNX input is declared as a FloatTensorType later in this notebook;
# presumably the float32 cast is there to match it.
# Converting to a NumPy array drops the column names, so the model sees
# features by position (the column order of X).
# NOTE(review): assumes every remaining column of X (e.g. postal_code) is
# numeric -- TODO confirm, otherwise the cast below raises.
X_temp = np.array(X)  # Ensure it's a NumPy array
X_final = X_temp.astype(np.float32)  # Convert all values to float32

#### Model Training

In [None]:
# Load the stored hyperparameter table as a Spark DataFrame (presumably written
# by an earlier tuning step -- verify).
best_params = spark.read.load("/anaurosevic/cdn0_cards_affinity/best_params")

In [None]:
# Take the first row of the parameter table as a {column name: value} dict;
# any further rows are ignored.
best_params_dict = best_params.toPandas().iloc[0].to_dict()

In [None]:
#Convert from float to integer
# These two hyperparameters are cast back to int before being handed to
# LightGBM.
for int_param in ('n_estimators', 'num_leaves'):
    best_params_dict[int_param] = int(best_params_dict[int_param])

In [None]:
# Final classifier: the fixed settings below plus the hyperparameters loaded
# into best_params_dict.
# NOTE(review): LightGBM documents `is_unbalance` as used only by the binary and
# multiclassova objectives, so it may have no effect with
# objective='multiclass' -- confirm, and consider class_weight if class
# rebalancing is intended.
# NOTE(review): if best_params_dict repeats a keyword that is also given
# explicitly here, this call raises TypeError (duplicate keyword argument).
best_model = lgb.LGBMClassifier(
    objective = 'multiclass',
    is_unbalance = True,
    random_state = 42, 
    verbosity = 0,
    **best_params_dict
)

In [None]:
# Fit on the full dataset; no train/test split is made in the visible cells,
# in line with the notebook's stated goal of retraining on all data.
best_model.fit(X_final, y)

#### Save as ONNX

In [None]:
# Declare a single float tensor input named 'input': the batch dimension is
# left dynamic (None) and the width is the number of training features.
n_features = X_final.shape[1]
initial_type = [('input', FloatTensorType([None, n_features]))]

# Translate the fitted LightGBM classifier into an ONNX graph.
onnx_model = onnxmltools.convert_lightgbm(best_model, initial_types=initial_type)

# Write the serialized model to disk as raw bytes.
with open("cards_affinity.onnx", "wb") as onnx_file:
    onnx_file.write(onnx_model.SerializeToString())

#### Check ONNX

In [None]:
# Load the ONNX model
# Re-read the file from disk so the check covers what was actually written,
# not the in-memory object.
onnx_model = onnx.load("cards_affinity.onnx")

# Check the model
# check_model raises if the model is invalid, so the message below is printed
# only when the check passes.
# NOTE(review): this validates the model file itself; it does not compare ONNX
# predictions against the LightGBM model.
onnx.checker.check_model(onnx_model)
print("The model is valid!")

#### Upload to Artifactory 

##### Step 1: Define a Python function that runs a shell command from Python

In [None]:
import subprocess 

def run_cmd(args_list):
    """Run a system command and capture its output.

    The command line is echoed to stdout before it is executed.

    Args:
        args_list: The command and its arguments as a list of strings,
            e.g. ``['ls', '-l']``. It is executed directly, without a shell.

    Returns:
        A tuple ``(return_code, stdout, stderr)``; stdout and stderr are the
        captured output as bytes.
    """
    command_line = ' '.join(args_list)
    print('Running system command: {0}'.format(command_line))
    completed = subprocess.run(
        args_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    return completed.returncode, completed.stdout, completed.stderr

##### Step 2: Prep parameters

In [None]:
import os

folder_name = "LATEST"
repo_name = "cards_affinity"
model_name = "cards_affinity.onnx"
file_type = "zip" # Replace with your type of file
# Credentials are read from the environment so that real passwords never have
# to be typed into (and saved with) the notebook. Set ARTIFACTORY_USER and
# ARTIFACTORY_PASSWORD before running; the placeholder is kept as the fallback.
user = os.environ.get("ARTIFACTORY_USER", "XXXXXXX")  # your lan id
pwd = os.environ.get("ARTIFACTORY_PASSWORD", "XXXXXXX")  # your lan id password
artifact_access = "generic-0as0" # Different developers have different permissions. Put a path that you have permission to access to.
# Full target URL of the uploaded model file.
artifactory = f"http:/.../artifactory/{artifact_access}/models/{repo_name}/{folder_name}/{model_name}"

##### Step 3: Upload model to artifactory

In [None]:
# Build the argument list explicitly instead of splitting a formatted string:
# str.split() would break the command apart if the password, URL or file name
# contained whitespace.
# --fail makes curl exit non-zero on HTTP error responses (e.g. 401/403/404);
# without it curl exits 0 and a rejected upload would look successful.
# NOTE(review): run_cmd echoes the full command, including the `-u user:pwd`
# argument, to the cell output -- clear the output before sharing the notebook.
command_upload_to_artifactory = [
    'curl', '--fail',
    '-u', f'{user}:{pwd}',
    '-X', 'PUT', artifactory,
    '-T', model_name,
]
(ret, out, err) = run_cmd(command_upload_to_artifactory)

# Surface a failed upload instead of silently ignoring the exit code.
if ret != 0:
    raise RuntimeError(
        f"Upload to Artifactory failed (curl exit code {ret}): "
        f"{err.decode(errors='replace').strip()}"
    )

--- END PROGRAM ---

In [None]:
#Timing summary
# Snapshot both clocks first so the arithmetic and printing are not timed.
end_time, end_cpu_time = time.time(), time.process_time()

# Elapsed values relative to the start values captured in the first cell.
real_time_elapsed = end_time - start_time
cpu_time_elapsed = end_cpu_time - start_cpu_time

print(f"Real time: {real_time_elapsed:.2f} seconds")
print(f"CPU time: {cpu_time_elapsed:.2f} seconds")