### Train Model

This notebook trains several candidate models, selects the best one based on a metric (MRR), and deploys it automatically.


 - loads the data from a data location and extracts some more features for the prediction
 - trains 
     - a simple classical model
     - a node2vec model
     - an LSTM model on the sequence of items browsed
 
 - running the notebook should automatically store/deploy the better model (based on MRR) at a location/model registry
 
 - the web API is plugged into the model registry and will automatically pick up the deployed model
 

### 0. Setup on Colab (uncomment if running on Google Colab)

In [1]:
# #uncomment and run if working on collab
# from google.colab import drive
# drive.mount('/content/drive')


In [2]:
# #uncomment and run if working on collab
# !rm -rf mlcore
# !cp -r /content/drive/MyDrive/data/ data/
# !mkdir logs
# !mkdir models
# !unzip /content/drive/MyDrive/data/mlcore.zip
#!cd mlcore && pip install -e . && cd .. 

### 0.  Load Env Variables ( Uncomment if not running on Docker)

In [3]:
# # run this if running locally not required if you used the docker script
# #!pip install python-dotenv
# from dotenv import load_dotenv
# load_dotenv(dotenv_path = '../.env')

### 1. Import required packages

In [4]:
import pandas as pd
import seaborn as sns

from datetime import datetime
from mlcore.data_helper import load_data
from mlcore.utils import set_logger
from mlcore.feature_extractor import *

In [5]:
# Build a run id from the notebook start time and use it to name this run's logger.
ts = datetime.now()
nb_run_id = ts.strftime("trng_%m_%d_%Y_%H_%M_%S")
training_logger = set_logger(nb_run_id)

### 2. Load Click Data and extract features
- join with stores data to get merchant_info
- extract time-based features, such as which day/hour a click was made
- extract user-based features, such as the time elapsed between the user's signup and the click

In [6]:
# Load each raw schema into a dataframe, keyed by schema name
# (load order: clicks, users, stores).
data_dict = {
    schema_name: load_data(schema_name, logger=training_logger)
    for schema_name in ('clicks', 'users', 'stores')
}

2022-04-30 06:45:51,612:Loaded schema clicks in dataframe with shape (10000, 8)
2022-04-30 06:45:51,617:Loaded schema users in dataframe with shape (500, 3)
2022-04-30 06:45:51,620:Loaded schema stores in dataframe with shape (100, 2)


In [7]:
# Start the feature table from the raw clicks frame; later cells join stores and users onto it.
transformed_data = data_dict['clicks']

In [8]:
# Attach the store record to every click (click.store_id == store.id). Both frames
# have an `id` column, so the merge yields `id_x` (click id) and `id_y` (store id).
stores_df = data_dict['stores']
transformed_data = transformed_data.merge(
    stores_df,
    left_on='store_id',
    right_on='id',
)

# Adds the time columns (hour_of_day, day_of_week, month_of_year, date) to the frame in place.
extract_time_features(transformed_data)

# Show the first row as a quick look at the resulting columns.
transformed_data.head(1).to_dict()

{'index': {0: 1000},
 'id_x': {0: 7882},
 'user_id': {0: 125},
 'store_id': {0: 30},
 'created_at': {0: Timestamp('2021-09-26 01:17:41')},
 'device': {0: 'desktop'},
 'channel': {0: 'direct'},
 'platform': {0: 'extension'},
 'id_y': {0: 30},
 'merchant_id': {0: 5},
 'hour_of_day': {0: 1},
 'day_of_week': {0: 'Sunday'},
 'month_of_year': {0: 9},
 'date': {0: datetime.date(2021, 9, 26)}}

In [9]:
# Join the users table and derive per-click user features.
# Judging by the displayed row, this adds the user columns (id, signup_datetime,
# lifetime_first_purchase_datetime) plus click_delta_signup and
# click_delta_first_purchase.
# NOTE(review): the delta values look like minutes between the event and the click --
# confirm in mlcore.feature_extractor.

users_df = data_dict['users']
transformed_data = extract_user_features(transformed_data, users_df)
transformed_data.iloc[0].to_dict()

{'index': 1000,
 'id_x': 7882,
 'user_id': 125,
 'store_id': 30,
 'created_at': Timestamp('2021-09-26 01:17:41'),
 'device': 'desktop',
 'channel': 'direct',
 'platform': 'extension',
 'id_y': 30,
 'merchant_id': 5,
 'hour_of_day': 1,
 'day_of_week': 'Sunday',
 'month_of_year': 9,
 'date': datetime.date(2021, 9, 26),
 'id': 125,
 'signup_datetime': '2021-08-25 07:55:34',
 'lifetime_first_purchase_datetime': '2021-09-16 12:06:58',
 'click_delta_signup': 45682.11666666667,
 'click_delta_first_purchase': 13750.716666666667}

In [10]:
# Sanity check: the row count should still equal the number of loaded clicks after both joins.
transformed_data.shape

(10000, 19)

### 3. Train

In [11]:
# def create_store_item_click_matrix(data):
    
#     clicks = data[['user_id','merchant_id']]
#     user_item_table = clicks.groupby(['user_id','merchant_id']).size().reset_index(name='merchant_clicked_times')
#     user_item_matrix = user_item_table.pivot(index = 'user_id', 
#                                              columns='merchant_id', 
#                                              values='merchant_clicked_times').fillna(0)#.unstack(1)

#     user_item_matrix_reset = user_item_matrix.rename_axis(None, axis=1).reset_index()
#     return user_item_matrix_reset

# user_item_preference = create_store_item_click_matrix(transformed_data)

### 3.1  Set up experiment data/features

In [12]:
# Feature groups used by the different model families.
cat_features = ['device', 'platform', 'channel', 'hour_of_day', 'day_of_week']
num_features = ['click_delta_signup']
target = ['merchant_id']
rnn_features = ['user_id', 'created_at']
n2v_features = ['user_id']

# Chronological split: clicks up to the cut-off train the models, later clicks evaluate them.
split_date = '2021-11-07'
train_data = transformed_data[transformed_data.created_at <= split_date]
test_data = transformed_data[transformed_data.created_at > split_date]

# Keep only the modelling columns and replace missing values with 0.
# DataFrame.fillna returns a new frame, so the result must be assigned back;
# calling it without assignment leaves the NaNs in place.
train_data = train_data[cat_features + num_features + rnn_features + target].fillna(0)
test_data = test_data[cat_features + rnn_features + num_features + target].fillna(0)

training_logger.info('train_shape:{} '.format(train_data.shape))
training_logger.info('test_shape:{}  '.format(test_data.shape))



2022-04-30 06:45:51,751:train_shape:(8421, 9) 
2022-04-30 06:45:51,752:test_shape:(1579, 9)  


### 3.2  Feature Transformation/Scaling pipeline set up

In [13]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [14]:
# Preprocessing for the classical (scikit-learn) models: one-hot encode the
# categorical columns (handle_unknown="ignore" for categories not seen during fit)
# and standardise the numeric columns.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("one-hot-encoder", categorical_preprocessor, cat_features),
        ("standard-scaler", numerical_preprocessor, num_features),
    ]
)


### 3.3 Model Training and Evaluation

#### 3.3.1 Load previous models and new models which will be trained and compared

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from mlcore.train_eval_helper import *
from mlcore.modelops import load_model, save_model, read_data


In [16]:
# Optionally load the currently deployed model so it can be compared with the
# newly trained candidates. Set `load_prev_model = True` to enable this.
mldbpath = '../data/mldb.sqlite'
deployed_model_info = None
deployed_model_obj = None
load_prev_model = False
from tensorflow import keras

if load_prev_model:
    try:
        # The registry table `deployed_model` holds the name of the deployed model in its first row.
        deployed_model_info = read_data(mldbpath, 'deployed_model').iloc[0].to_dict()
        if deployed_model_info:
            deployed_model_name = deployed_model_info['final_model_name']
            deployed_model_obj = load_model(deployed_model_name, cur_logger=training_logger)
            if deployed_model_obj['type'] == 'SEQDL':
                # Deep models are stored separately in keras format; attach the loaded
                # network to the metadata dict. Only SEQDL models need this step, so the
                # assignment lives inside the branch (outside it, `actual_obj` would be
                # undefined for every other model type).
                actual_obj = keras.models.load_model('../models/' + deployed_model_name + '.deep_mdl')
                deployed_model_obj['obj'] = actual_obj
    except Exception as exc:
        # Best effort: a missing registry or model must not stop training. Drop any
        # half-loaded model so it is not compared, and log the real cause.
        deployed_model_obj = None
        training_logger.info(
            'Could not load deployed model it may not exist ({!r})'.format(exc))

#### 3.3.2 Set up the classifiers to be trained

In [17]:
# Registry of candidate models. Each entry holds the model object (None until it is
# trained), its parameters, the model family (`type`), an optional preprocessor and
# the feature columns it uses.

# KNN: candidate values per hyper-parameter.
knn_param_grid = {
    'leaf_size': list(range(20, 30)),
    'n_neighbors': list(range(5, 30)),
    'p': [1, 2],
}

# node2vec: fixed settings.
n2v_params = {
    'workers': 1,
    'dimensions': 72,
    'walk_length': 18,
    'num_walks': 100,
    'window': 9,
    'min_count': 1,
    'batch_words': 5,
}

# LSTM: fixed settings.
lstm_params = {
    'epochs': 10,
}

classifiers = {
    'KNN': {
        'obj': KNeighborsClassifier(),
        'param_grid': knn_param_grid,
        'type': 'classical',
        'preprocessor': preprocessor,
        'features': cat_features + num_features,
    },
    'N2V': {
        'obj': None,
        'param_grid': n2v_params,
        'type': 'N2V',
        'preprocessor': None,
        'features': cat_features + num_features + ['user_id'],
        'merchants2vecdict': {},
    },
    'LSTM': {
        'obj': None,
        'param_grid': lstm_params,
        'type': 'SEQDL',
        'preprocessor': None,
        'features': rnn_features + target,
        'user_merchant_hist_path': None,
    },
}

# for clf in classifiers:
#     classifiers[clf]['preprocessor'] = preprocessor



In [18]:
#original_data
# X_train = train_data[cat_features+num_features]
# y_train = train_data[target]
# X_test = test_data[cat_features + num_features]
# y_test = test_data[target]

#X_train = X_train.iloc[0:50000]
#y_train = y_train.iloc[0:50000]

#X_test_small = X_test.iloc[0:15000]
#y_test_small = y_test.iloc[0:15000]




### *** Uncomment the cell below to run on a shortened dataset for a quick demo; leave it commented out to run on the full dataset

In [19]:
# ##shorten_data for demo
# train_data = train_data.iloc[0:5000]
# test_data = test_data.iloc[0:1000]


In [20]:
# from tensorflow.python.client import device_lib
# def get_available_devices():
#     local_device_protos = device_lib.list_local_devices()
#     return [x.name for x in local_device_protos if x.device_type == 'GPU' or x.device_type == 'CPU']
# get_available_devices()

In [21]:
# Metrics tracked for every classifier (each starts at 0).
metrics = {'MRR': 0}

# If a deployed model was loaded, add it as one more candidate so it is
# compared against the newly trained models.
if deployed_model_obj:
    classifiers['deployed_model'] = {
        field: deployed_model_obj[field]
        for field in ('obj', 'param_grid', 'features', 'preprocessor', 'type')
    }
    deployed_entry = classifiers['deployed_model']
    if deployed_entry['type'] == 'N2V':
        # N2V models also carry their merchant embedding lookup.
        deployed_entry['merchants2vecdict'] = deployed_model_obj['merchants2vecdict']

# Initialise every metric to 0 on every classifier entry.
for clf_entry in classifiers.values():
    for metric in metrics:
        clf_entry[metric] = 0

get_df_from_dict(classifiers)

Unnamed: 0,index,obj,param_grid,type,preprocessor,features,MRR,merchants2vecdict,user_merchant_hist_path
0,KNN,KNeighborsClassifier(),"{'leaf_size': [20, 21, 22, 23, 24, 25, 26, 27,...",classical,ColumnTransformer(transformers=[('one-hot-enco...,"[device, platform, channel, hour_of_day, day_o...",0,,
1,N2V,,"{'workers': 1, 'dimensions': 72, 'walk_length'...",N2V,,"[device, platform, channel, hour_of_day, day_o...",0,{},
2,LSTM,,{'epochs': 10},SEQDL,,"[user_id, created_at, merchant_id]",0,,


#### 3.3.3 Train Models

In [22]:
# Train every registered classifier; the trained model is stored back in its dict entry.
use_dask = False

# A local dask client is only created when requested; otherwise None is passed on.
dask_client = None
if use_dask:
    from dask.distributed import Client, progress
    dask_client = Client(
        processes=False,
        threads_per_worker=2,
        n_workers=1,
        memory_limit='3GB',
    )

comparison_result_dict = train_models(
    classifiers, train_data, target, training_logger, dask_client
)

# Tabular view of the registry, one row per classifier.
comparison_result = get_df_from_dict(comparison_result_dict, idxname='Classifier')
comparison_result

2022-04-30 06:45:53,240:Training started for KNN
  return self._fit(X, y)
2022-04-30 06:45:53,259:Training ended for KNN
2022-04-30 06:45:53,260:Training ended for KNN
2022-04-30 06:45:53,260:Training started for N2V


Computing transition probabilities:   0%|          | 0/510 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 100/100 [00:00<00:00, 433.75it/s]
2022-04-30 06:46:11,185:Training ended for N2V
2022-04-30 06:46:11,186:Training ended for N2V
2022-04-30 06:46:11,187:Training started for LSTM
2022-04-30 06:46:11,232:Training ended for LSTM
2022-04-30 06:46:11,421:Saved schema user_merchant_hist_20220430__064611 in dataframe with shape (500, 2) at path ../data/user_merchant_hist_20220430__064611.csv
2022-04-30 06:46:11,421:loaded user_merchant hist  of size:: (500, 2)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 26, 200)           2200      
                                                                 
 lstm (LSTM)                 (None, 26, 100)           120400    
                                                                 
 lstm_1 (LSTM)               (None, 200)               240800    
                                                                 
 dense (Dense)               (None, 11)                2211      
                                                                 
Total params: 365,611
Trainable params: 365,611
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2022-04-30 06:46:24,513:Training ended for LSTM


Unnamed: 0,Classifier,obj,param_grid,type,preprocessor,features,MRR,merchants2vecdict,user_merchant_hist_path,train_tokenizer,max_sequence_len,user_merchant_hist_data_path
0,KNN,KNeighborsClassifier(),"{'leaf_size': [20, 21, 22, 23, 24, 25, 26, 27,...",classical,ColumnTransformer(transformers=[('one-hot-enco...,"[device, platform, channel, hour_of_day, day_o...",0,,,,,
1,N2V,"Word2Vec(vocab=510, vector_size=72, alpha=0.025)","{'workers': 1, 'dimensions': 72, 'walk_length'...",N2V,,"[device, platform, channel, hour_of_day, day_o...",0,"{'merchant_1': [-0.2271407, 0.019114045, 0.443...",,,,
2,LSTM,<keras.engine.sequential.Sequential object at ...,{'epochs': 10},SEQDL,,"[user_id, created_at, merchant_id]",0,,,<keras_preprocessing.text.Tokenizer object at ...,27.0,../data/user_merchant_hist_20220430__064611.csv


#### 3.3.4 Test/Compare Models

In [23]:
# Compute MRR on the test split for every classifier; the score is stored back
# in each classifier's entry (see the MRR column of the displayed table).

comparison_result_dict = test_models(classifiers, test_data, target, training_logger,dask_client)

# Rebuild the comparison table so it includes the computed scores.
comparison_result = get_df_from_dict(comparison_result_dict, idxname='Classifier')
comparison_result

2022-04-30 06:46:24,535:Testing started for KNN
2022-04-30 06:46:25,015:Testing ended for KNN
2022-04-30 06:46:25,017:Testing started for N2V
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df["order_of_preds"] = X_test.user_id.apply(
2022-04-30 06:46:25,321:Testing ended for N2V
2022-04-30 06:46:25,322:Testing started for LSTM
2022-04-30 06:46:25,330:Loaded data file ../data/user_merchant_hist_20220430__064611.csv in dataframe with shape (500, 2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df["y_org"] = y_test[target]


No history found for 0 users dropping them for MRR compute


2022-04-30 06:47:22,975:Testing ended for LSTM


Unnamed: 0,Classifier,obj,param_grid,type,preprocessor,features,MRR,merchants2vecdict,user_merchant_hist_path,train_tokenizer,max_sequence_len,user_merchant_hist_data_path
0,KNN,KNeighborsClassifier(),"{'leaf_size': [20, 21, 22, 23, 24, 25, 26, 27,...",classical,ColumnTransformer(transformers=[('one-hot-enco...,"[device, platform, channel, hour_of_day, day_o...",0.302937,,,,,
1,N2V,"Word2Vec(vocab=510, vector_size=72, alpha=0.025)","{'workers': 1, 'dimensions': 72, 'walk_length'...",N2V,,"[device, platform, channel, hour_of_day, day_o...",0.1,"{'merchant_1': [-0.2271407, 0.019114045, 0.443...",,,,
2,LSTM,<keras.engine.sequential.Sequential object at ...,{'epochs': 10},SEQDL,,"[user_id, created_at, merchant_id]",0.1,,,<keras_preprocessing.text.Tokenizer object at ...,27.0,../data/user_merchant_hist_20220430__064611.csv


In [24]:
# Pick the best model from the comparison table according to the chosen metric.
metric = 'MRR'
best_model_row = get_best_model(comparison_result, metric)
# Registry key of the winning classifier (e.g. 'KNN').
best_model = best_model_row['Classifier']
# NOTE(review): despite the name, this holds the trained model object, not an id.
best_model_id = classifiers[best_model]['obj']
training_logger.info("Best performing model on basic of metric {} is {}".format(metric, best_model))

2022-04-30 06:47:22,996:Best performing model on basic of metric MRR is KNN


In [25]:
# # #ovverride best model
# model_key_to_deploy = 'LSTM'
# best_model_row = comparison_result[comparison_result.Classifier==model_key_to_deploy].iloc[0]




In [26]:
# Save the selected model under a name derived from the classifier key and the run id.
final_trained_model = best_model_row.to_dict()
model_type = final_trained_model['type']

# Name layout: [SEQDL_]<Classifier>_<run id with 'trng' replaced by 'model'>
name_parts = [final_trained_model['Classifier'], nb_run_id.replace('trng', 'model')]
if model_type == 'SEQDL':
    name_parts.insert(0, 'SEQDL')
final_model_name = '_'.join(name_parts)

save_model(final_trained_model, final_model_name, model_type=model_type)

# One-row frame with the saved model's name, used to update the model registry.
deploy_df = pd.DataFrame({'final_model_name': [final_model_name]})
deploy_df


model stored at ../models/KNN_model_04_30_2022_06_45_51.mdl


Unnamed: 0,final_model_name
0,KNN_model_04_30_2022_06_45_51


In [27]:
# final_trained_model

#### 3.3.5 Update Model registry/Deploy Best

In [28]:
# Frames to write to the registry database, keyed by table name.
# Both tables receive the same one-row frame holding the deployed model's name;
# the training report entry is currently disabled.
schema_dict = {
    'deployed_model':deploy_df,
    'hist_deployed_models':deploy_df,
    #'train_report':comparison_result
}


In [29]:
from mlcore.dbhelper import store_data, overwrite_data

# `deployed_model` is written with overwrite_data; every other table
# (e.g. the deployment history) is written with store_data.
for dkey, data_to_be_stored in schema_dict.items():
    write_table = overwrite_data if dkey == 'deployed_model' else store_data
    write_table(data_to_be_stored, mldbpath, dkey)

dep_model_name = deploy_df['final_model_name'].iloc[0]
print('Model {} deployed \n associated reports saved in respective tables with id:{}'.format(dep_model_name,nb_run_id))

Model KNN_model_04_30_2022_06_45_51 deployed 
 associated reports saved in respective tables with id:trng_04_30_2022_06_45_51


In [30]:
# Final log entry marking the end of this training run.
training_logger.info('Traning job with id {} finished'.format(nb_run_id))

2022-04-30 06:47:23,157:Traning job with id trng_04_30_2022_06_45_51 finished
