## MODEL TRAINING

Train a LightGBM model and evaluate its performance.

##### Timing 
We want to time how long these programs take to run, in both real (wall-clock) time and CPU time.

In [None]:
import time

#Record the wall-clock and CPU-time baselines in one step
start_time, start_cpu_time = time.time(), time.process_time()

#### Set Up

In [None]:
import os

import numpy as np
from numpy.random import choice

import pyspark
from pyspark.sql import SparkSession

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.functions import collect_list, regexp_replace, lower
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.functions import year, month, dayofmonth
from functools import reduce

import datetime 
import pandas as pd
import time
import math

import matplotlib.pyplot as plt
import pickle

In [None]:
#Machine Learning Libraries 
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, classification_report, roc_auc_score, average_precision_score
from sklearn.utils.class_weight import compute_class_weight

#### Import Prepared Data

In [None]:
#Load the prepared model data as a Spark DataFrame
#NOTE(review): `spark` is not created anywhere in this notebook - presumably the kernel provides the session; confirm
md = spark.read.load("/anaurosevic/cdn0_cards_affinity/model_data")

In [None]:
#Convert the Spark DataFrame to pandas; the scaling and model training below work on the pandas copy
model_data = md.toPandas()

In [None]:
model_data.shape

In [None]:
model_data.memory_usage(deep=True).sum()/1e9

In [None]:
#Row count per product code, most common first
cc_prevalence = (
    model_data.groupby('product_code')
    .size()
    .reset_index(name='counts')
    .sort_values('counts', ascending=False)
)
#Share of all rows (in %) held by each product code, to one decimal place
cc_prevalence['prop'] = (cc_prevalence['counts'] / cc_prevalence['counts'].sum() * 100).round(1)
cc_prevalence #Class imbalance - need to deal with this ~ 

#### Scale Continuous Variables

In [None]:
all_vars = list(model_data.columns)

In [None]:
#Column-name prefixes used below to pick out the non-continuous columns
prefixes = ["device_", "province_", "sess_channel_"]
filtered_list = [
    column for column in all_vars
    if column.startswith(tuple(prefixes))
]

In [None]:
all_vars = list(model_data.columns)

#Non-continuous columns are listed by hand because most columns are continuous:
#the primary keys plus every prefix-matched column
non_cont_vars = ['user_pseudo_id', 'product_code', 'postal_code', *filtered_list]

#Everything else is treated as continuous; sorted for a stable column order
cont_vars = sorted(set(all_vars) - set(non_cont_vars))

In [None]:
#Standardise the continuous columns with scikit-learn's StandardScaler
scale = StandardScaler()
scaled = scale.fit_transform(model_data[cont_vars])

#Carry over model_data's index and the column names so that a later column-wise
#concat with the unscaled columns lines rows up by index instead of misaligning
scaled_df = pd.DataFrame(scaled, columns=cont_vars, index=model_data.index)
scaled_df.head()

In [None]:
#Swap the raw continuous columns for their scaled versions
model_data_scaled = pd.concat(
    [model_data.drop(columns=cont_vars), scaled_df],
    axis=1,
)

In [None]:
model_data_scaled.shape

In [None]:
#The user identifier is a key, not a feature - drop it before modelling
md_final = model_data_scaled.drop(columns='user_pseudo_id')

In [None]:
print(list(md_final.columns))

In [None]:
#Save for Erick
#Random sample of 1,000 rows from the final table; random_state is fixed so the sample is reproducible
sampled_df = md_final.sample(n=1000, random_state=1)

In [None]:
sampled_df.shape

In [None]:
#Save the sample as a CSV in the working directory
#(pandas writes the DataFrame index as the first column by default)
sampled_df.to_csv('sample_data.csv')

#### Split X & Y and Train & Test

In [None]:
#Features: every column except the target and the postal code
X = md_final.drop(columns=['product_code', 'postal_code'])
#Target: product code as a pandas categorical
y = md_final['product_code'].astype('category')

In [None]:
X.memory_usage(deep=True).sum()/1e9

In [None]:
#Split the data into training and test sets (80% / 20%, fixed seed for reproducibility)
#NOTE(review): the split is not stratified; given the class imbalance, confirm whether stratify=y is wanted
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Class Weights for Imbalance

In [None]:
#'balanced' class weights computed from the training labels
classes = np.unique(y_train)  # Unique class labels
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

#Lookup table: class label -> class weight
class_weights_dict = {label: weight for label, weight in zip(classes, class_weights)}

#One weight per training row, in the same order as y_train
sample_weights = np.array([class_weights_dict[card] for card in y_train])

In [None]:
class_weights_dict

#### Define Hyperparameter Tuning Grid

In [None]:
#XGBoost multiclass classifier (the grid search in this notebook uses the LightGBM model instead)
xgb_model = xgb.XGBClassifier(
    objective = 'multi:softprob', #Multiclass objective that outputs one probability per class
    eval_metric = 'mlogloss', #Multiclass log loss
    use_label_encoder = False,
    random_state = 42)

In [None]:
#LightGBM multiclass classifier; this is the estimator handed to the grid search
lgbm_model = lgb.LGBMClassifier(
    objective = 'multiclass',
    random_state = 42, 
   # class_weight = class_weights_dict, #To try to address imbalance 
    #NOTE(review): LightGBM documents is_unbalance for the binary and multiclassova objectives only - confirm it has any effect with 'multiclass'
    is_unbalance = True,
    verbosity = 1)

In [None]:
#Broad params
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1], # step size shrinkage
    "n_estimators": [100, 150, 200], # number of trees
    "num_leaves": [20, 30, 40]
}
#Without offers: {'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 30}
#With offers: {'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 40}

In [None]:
#Double click
# param_grid_dc = {
#     "learning_rate": [0.025, 0.05, 0.075], 
#     "n_estimators": [75, 100, 125], 
#     "num_leaves": [15, 20, 25]
# }

In [None]:
#Exhaustive grid search over param_grid with 5-fold cross-validation, selecting on accuracy
#NOTE(review): accuracy can be dominated by the majority classes on imbalanced data - confirm it is the intended selection metric
grid_search = GridSearchCV(
    estimator = lgbm_model,
    param_grid = param_grid,
    scoring = "accuracy",
    verbose = 2,
    cv = 5,
    n_jobs = 2 #Keep it at 2 for memory reasons :D 
)

#### Perform Grid Search (Find Best Model)

In [None]:
#Wall-clock timestamp (seconds since the epoch) just before the grid search starts
training_start_time = time.time()
print(training_start_time)

#Fit every parameter combination with cross-validation on the training split
grid_search.fit(X_train, y_train) 

#Wall-clock timestamp after the grid search finishes; the next cell reports the difference in minutes
training_end_time = time.time()
print(training_end_time)

In [None]:
print("Model training took", round((training_end_time-training_start_time)/60),"minutes!")

In [None]:
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

In [None]:
results = pd.DataFrame(grid_search.cv_results_) 

#### Save Parameters

In [None]:
best_params_pd = pd.DataFrame.from_dict(best_params, orient='index').T
best_params_pd.head()

In [None]:
best_params_spark = spark.createDataFrame(best_params_pd)

In [None]:
best_params_spark.write.mode("overwrite").parquet("/anaurosevic/cdn0_cards_affinity/best_params")

#### Load Parameters

In [None]:
best_params = spark.read.load("/anaurosevic/cdn0_cards_affinity/best_params")

In [None]:
best_params.show()

In [None]:
best_params_dict = best_params.toPandas().iloc[0].to_dict()

In [None]:
#Convert from float to integer
best_params_dict['n_estimators'] = int(best_params_dict['n_estimators'])
best_params_dict['num_leaves'] = int(best_params_dict['num_leaves'])

In [None]:
best_params_dict

#### Refit Using Best Parameters 

In [None]:
best_model = lgb.LGBMClassifier(
    objective = 'multiclass',
    #class_weight = class_weights_dict, #To try to address imbalance - tested, but performance is worse 
    random_state = 42, 
    verbosity = 0,
    **best_params_dict
)

In [None]:
best_model.fit(X_train, y_train)

In [None]:
#Save the trained model 
with open("lgbm_model.pkl", "wb") as file:
    pickle.dump(best_model, file)

In [None]:
#Open the model 
with open("lgbm_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)

In [None]:
loaded_model

#### Performance

In [None]:
pred_probs = loaded_model.predict_proba(X_test)
print(pred_probs.shape)
#The result is a probability distribution across all of the classes (14 CCs)
pred_probs[1]

In [None]:
#The columns of predict_proba follow the fitted model's classes_ attribute,
#so read the label order from the model instead of re-deriving it from y_train
class_labels = loaded_model.classes_
print(class_labels)

In [None]:
pred_probs_df = pd.DataFrame(pred_probs)
pred_probs_df.head()

In [None]:
#Rename columns 
pred_probs_df.columns = class_labels
pred_probs_df.head()

In [None]:
pred_probs_df['predicted_class'] = pred_probs_df.idxmax(axis=1)

In [None]:
y_comparison = pd.DataFrame()
y_comparison['actual'] = y_test
y_comparison['predicted'] = pred_probs_df['predicted_class'].to_numpy()

#### Function
Let's create a function so that we can easily swap train/test and complete/subset. 

In [None]:
def predictions(X_data, Y_data, model_type):
    """Score ``X_data`` with the loaded LightGBM model and line the results up with ``Y_data``.

    Parameters
    ----------
    X_data
        Feature matrix passed to ``loaded_model.predict_proba``.
    Y_data
        Actual labels, one per row of ``X_data``.
    model_type
        Either ``'complete'`` or ``'subset'``. Both are currently scored with
        the same ``loaded_model``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with columns ``'actual'`` (from ``Y_data``),
        ``'predicted'`` (highest-probability class) and ``'top_3'`` (list of
        the three highest-probability classes, most probable first).

    Raises
    ------
    ValueError
        If ``model_type`` is not ``'complete'`` or ``'subset'``.
    """
    #Fail early with a clear message instead of hitting an undefined variable later
    if model_type not in ('complete', 'subset'):
        raise ValueError(
            f"model_type must be 'complete' or 'subset', got {model_type!r}"
        )

    #Predict class probabilities: one row per sample, one column per class
    pred_probs = np.asarray(loaded_model.predict_proba(X_data))

    #Take the column labels from the fitted model rather than from Y_data:
    #Y_data may be missing classes the model was trained on, which would
    #make the labels and the probability columns disagree
    class_labels = np.asarray(loaded_model.classes_)

    #Rank the classes of every row by descending probability in one vectorised
    #pass; the stable sort keeps the first class on ties, like idxmax does
    ranked = np.argsort(-pred_probs, axis=1, kind='stable')

    #Top predicted card (max probability) and the top 3 predicted cards
    predicted_class = class_labels[ranked[:, 0]]
    top_3_predicted = class_labels[ranked[:, :3]].tolist()

    #Final summary
    y_comparison = pd.DataFrame()
    y_comparison['actual'] = Y_data
    y_comparison['predicted'] = predicted_class
    #Wrap in a Series so each row holds its own list of labels
    y_comparison['top_3'] = pd.Series(top_3_predicted, index=y_comparison.index)

    return y_comparison

In [None]:
y_comparison_test = predictions(X_test, y_test,"complete")
y_comparison_train = predictions(X_train, y_train,"complete") 

In [None]:
y_comparison_test.head()

In [None]:
#Number of test rows per card: actual labels vs. predicted labels
actual_counts = (
    y_comparison_test.groupby('actual').size()
    .reset_index(name='actual_counts')
    .rename(columns={'actual': 'product_code'})
)
predicted_counts = (
    y_comparison_test.groupby('predicted').size()
    .reset_index(name='predicted_counts')
    .rename(columns={'predicted': 'product_code'})
)

#Outer merge so a card that is never predicted (or never observed) stays in the
#table with a count of 0 instead of being dropped by an inner join
count_comparison = actual_counts.merge(predicted_counts, on='product_code', how='outer')
count_cols = ['actual_counts', 'predicted_counts']
count_comparison[count_cols] = count_comparison[count_cols].fillna(0).astype(int)
count_comparison = count_comparison.sort_values(by=['actual_counts'], ascending=False)

#Relative over-/under-prediction in % of the actual count
#(inf or NaN where a card has no actual rows in the test set)
count_comparison['diff'] = (count_comparison['predicted_counts']-count_comparison['actual_counts'])/count_comparison['actual_counts']*100

count_comparison['diff']=count_comparison['diff'].round(0)

count_comparison

#### Evaluate Model Performance

In [None]:
print("Classification Report - Test:")
print(classification_report(y_comparison_test['actual'], y_comparison_test['predicted']))
#Accuracy of 0.54 is not too hot
#MC4 (Westjet) performance is great: 
#i.e., we are really good at predicting when a prospect might choose a Westjet card 

In [None]:
print("Classification Report - Train:")
print(classification_report(y_comparison_train['actual'], y_comparison_train['predicted']))
#Training classification to monitor overfitting!! 

In [None]:
#Top-3 hit rate: how often the actual card is among the three predicted cards
def _top3_hit_pct(comparison):
    """Return the % of rows whose 'actual' label appears in that row's 'top_3' list."""
    hits = comparison.apply(lambda row: row['actual'] in row['top_3'], axis=1)
    return round(hits.sum() / len(comparison) * 100, 2)


print("Correct card is in top 3 in the test set", _top3_hit_pct(y_comparison_test), "% of the time!")
print("Correct card is in top 3 in the train set", _top3_hit_pct(y_comparison_train), "% of the time!")

In [None]:
#Ten highest-scoring features according to the booster's feature_importance()
feature_importance = (
    pd.DataFrame({
        'Score': loaded_model.booster_.feature_importance(),
        'Feature': X_test.columns,
    })
    .sort_values("Score", ascending=False)
    .iloc[:10]
)
feature_importance

--- END PROGRAM ---

In [None]:
#Timing summary: elapsed wall-clock and CPU time since the start of the notebook
end_time, end_cpu_time = time.time(), time.process_time()

real_time_elapsed = end_time - start_time
cpu_time_elapsed = end_cpu_time - start_cpu_time

print(
    f"Real time: {real_time_elapsed:.2f} seconds",
    f"CPU time: {cpu_time_elapsed:.2f} seconds",
    sep="\n",
)