In [1]:
# https://github.com/alamhanz/clump_project/blob/master/customer_segmentation_1/Clustering%20Process.ipynb
# https://towardsdatascience.com/sql-window-functions-in-python-pandas-data-science-dc7c7a63cbb4
# https://stackoverflow.com/questions/62116416/rolling-sum-over-a-partition-in-python

# 5. Machine Learning Training

## Goal
train and predict on all eligble products using Machine Learning

## Plan
Checklist what will be done on the notebook :

        [ ] Get Data
        [ ] Train Test
        [ ] Model Training
        [ ] Evaluation
         

In [2]:
import pandas as pd
import seaborn as sns
import sys
import numpy as np
import joblib
import matplotlib.pyplot as plt
import random

import sys
sys.path.insert(1,'../src')
import dmdf
from ramal import *
from scipy.stats import uniform,randint

pd.options.mode.chained_assignment = None  # default='warn'

import time
import warnings
warnings.filterwarnings("ignore")

In [3]:
## ML packages
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

In [4]:
import sklearn
print(sklearn.__version__)
print(joblib.__version__)

0.24.2
1.1.0


In [5]:
# !pip install joblib==1.1.0

In [6]:
PATH_DATA = '../data/'
PATH_MODEL = '../artifacts/'

## Get The Data

In [7]:
dfX = pd.read_csv(PATH_DATA+"data_demand_ML_prep.csv")

In [8]:
dfX.shape

(14322, 54)

In [9]:
col_use = []

In [10]:
col_use.sort()

In [11]:
y_target = 'demand'

##  Train Test

In [12]:
dfX_train = dfX[dfX.MonthYear<='2022-01']
dfX_test = dfX[dfX.MonthYear>'2022-01']

In [13]:
dfX_test.shape

(2046, 54)

In [14]:
len(dfX_test)/len(dfX)

0.14285714285714285

In [15]:
X_train = dfX_train[col_use]
y_train = dfX_train[y_target]

X_test = dfX_test[col_use]
y_test = dfX_test[y_target]

In [16]:
## StantardScaler
SS_demand = StandardScaler()
SS_demand.fit(X_train)

X_tr = SS_demand.transform(X_train)
X_te = SS_demand.transform(X_test)

## Modelling

In [17]:
base_reg = LinearRegression(fit_intercept=True)

all_model = {'reg' : base_reg
            ,'lrd' : Ridge(alpha=0.8, fit_intercept=True, copy_X=True, max_iter=3000, tol=0.0001) ##--> L2
            ,'llo' : Lasso(alpha=0.9, fit_intercept=True, copy_X=True, max_iter=3000, tol=0.0001) ##--> L1
            ,'eln' : ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True, max_iter=3000, tol=0.0001)
            ,'dtr' : DecisionTreeRegressor(max_depth=10, min_samples_split=2, max_features=0.8)
            # ,'gbr' : GradientBoostingRegressor(loss='absolute_error', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_leaf=1, max_depth=3, max_features=0.9, alpha=0.9)
            ,'rfr' : RandomForestRegressor(n_estimators=30, max_depth=20, min_samples_split=2, min_weight_fraction_leaf=0.0, 
                                           max_features=0.9, max_samples=0.95,n_jobs=3)
            
            ,'bag_dt' : BaggingRegressor(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0,n_jobs=3)
            ,'bag_lr' : BaggingRegressor(base_estimator=base_reg, n_estimators=10, max_samples=1.0, max_features=1.0,n_jobs=3)
            ,'ada_dt' : AdaBoostRegressor(base_estimator=None, n_estimators=50, learning_rate=1.0, loss='linear') # loss{‘linear’, ‘square’, ‘exponential’}, default=’linear’
            ,'ada_lr' : AdaBoostRegressor(base_estimator=base_reg, n_estimators=50, learning_rate=1.0, loss='linear') # loss{‘linear’, ‘square’, ‘exponential’}, default=’linear’
            }

In [18]:
param_options = {'reg' : dict(fit_intercept=[True, False])
                ,'lrd' : dict(alpha=uniform(loc=0.5,scale=0.48)
                            ,fit_intercept=[True, False]
                            ,max_iter=[200,400,600,1000,2000,3000,5000]
                            ,tol=[0.00005,0.0001,0.0005,0.001,0.002,0.05,0.1,0.2]
                            ) 
                ,'llo' : dict(alpha=uniform(loc=0.5,scale=0.48)
                            ,fit_intercept=[True, False]
                            ,max_iter=[200,400,600,1000,2000,3000,5000]
                            ,tol=[0.00005,0.0001,0.0005,0.001,0.002,0.05,0.1,0.2]
                            ) 
                ,'eln' : dict(alpha=uniform(loc=0.5,scale=0.48)
                            ,l1_ratio=uniform(loc=0.5,scale=0.48)
                            ,fit_intercept=[True, False]
                            ,max_iter=[200,400,600,1000,2000,3000,5000]
                            ,tol=[0.00005,0.0001,0.0005,0.001,0.002,0.05,0.1,0.2]
                            ) 
                 
                ,'dtr' : dict(max_depth=randint(3, 35)
                            ,min_samples_split=randint(2, 15)
                            ,max_features=uniform(loc=0.5,scale=0.48)
                            ) 
                 
                ,'rfr' : dict(n_estimators=randint(10,250)
                              ,max_depth=randint(3, 25)
                              ,min_samples_split=randint(2, 15)
                              ,min_weight_fraction_leaf=uniform(loc=0.2,scale=0.2)
                              ,max_features=uniform(loc=0.9,scale=0.1)
                             )
                 
                ,'gbr' : dict(learning_rate=uniform(loc=0.2,scale=0.15)
                              ,n_estimators=randint(10,350)
                              ,min_samples_leaf=randint(2, 15)
                              ,max_depth=randint(3, 30)
                              ,max_features=uniform(loc=0.9,scale=0.1)
                              ,alpha=uniform(loc=0.95,scale=0.05)
                             )

                ,'bag_dt' : dict(n_estimators=randint(120,500)
                                 ,max_samples=uniform(loc=0.85,scale=0.15)
                                 ,max_features=uniform(loc=0.9,scale=0.1))
                ,'bag_lr' : dict(n_estimators=randint(120,500)
                                 ,max_samples=uniform(loc=0.85,scale=0.15)
                                 ,max_features=uniform(loc=0.9,scale=0.1))
                 
                ,'ada_dt' : dict(n_estimators=randint(150,500)
                                ,learning_rate=uniform(loc=0.7,scale=0.3)
                                ,loss=['linear', 'square', 'exponential'])
                ,'ada_lr' : dict(n_estimators=randint(150,500)
                                ,learning_rate=uniform(loc=0.7,scale=0.3)
                                ,loss=['linear', 'square', 'exponential'])
                } 

In [19]:
## scoring
## https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
best_clf = {}
best_scr = {}
sla_model = {}
for mdl in all_model.keys():
    L1 = time.time()
    clf = all_model[mdl]
    parm = param_options[mdl]
    if 'bag' in mdl or 'ada' in mdl:
        RS = RandomizedSearchCV(clf, parm, n_iter = 50, cv = 4, scoring = 'neg_mean_absolute_error',n_jobs = 5)
    else:
        RS = RandomizedSearchCV(clf, parm, n_iter = 100, cv = 4, scoring = 'neg_mean_absolute_error',n_jobs = 4)
        
    RS.fit(X_tr,y_train)
    
    best_clf[mdl] = RS.best_estimator_
    best_scr[mdl] = RS.best_score_
    diff_time = round((time.time()-L1)/60, 2)
    
    joblib.dump(best_clf[mdl],PATH_MODEL+mdl+'.pkl')
    
    print(mdl, 'is done in ',diff_time, ' minutes')
    sla_model[mdl] = diff_time

reg is done in  0.02  minutes
lrd is done in  0.07  minutes
llo is done in  0.27  minutes
eln is done in  0.38  minutes
dtr is done in  0.35  minutes
rfr is done in  2.37  minutes


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


bag_dt is done in  22.12  minutes
bag_lr is done in  10.66  minutes
ada_dt is done in  7.0  minutes
ada_lr is done in  0.86  minutes


In [20]:
sla_model

{'reg': 0.02,
 'lrd': 0.07,
 'llo': 0.27,
 'eln': 0.38,
 'dtr': 0.35,
 'rfr': 2.37,
 'bag_dt': 22.12,
 'bag_lr': 10.66,
 'ada_dt': 7.0,
 'ada_lr': 0.86}

In [21]:
best_scr

{'reg': -9.442845709202292,
 'lrd': -9.415076520613074,
 'llo': -9.164994967397636,
 'eln': -9.120358682178045,
 'dtr': -9.75751445268725,
 'rfr': -10.602962260167885,
 'bag_dt': -9.794535959771716,
 'bag_lr': -9.34256417179956,
 'ada_dt': -15.10308734304264,
 'ada_lr': -11.853100501859414}

In [22]:
best_clf

{'reg': LinearRegression(),
 'lrd': Ridge(alpha=0.9632671829430663, max_iter=1000),
 'llo': Lasso(alpha=0.5022595740623403, max_iter=5000, tol=5e-05),
 'eln': ElasticNet(alpha=0.5064237029438007, l1_ratio=0.5176527372798125, max_iter=3000,
            tol=0.05),
 'dtr': DecisionTreeRegressor(max_depth=4, max_features=0.9725079349399351,
                       min_samples_split=8),
 'rfr': RandomForestRegressor(max_depth=17, max_features=0.9427935937943268,
                       max_samples=0.95, min_samples_split=6,
                       min_weight_fraction_leaf=0.20132746764485396,
                       n_estimators=62, n_jobs=3),
 'bag_dt': BaggingRegressor(max_features=0.9332405696695212,
                  max_samples=0.8641894952795036, n_estimators=363, n_jobs=3),
 'bag_lr': BaggingRegressor(base_estimator=LinearRegression(),
                  max_features=0.904536370811343, max_samples=0.9612635187524665,
                  n_estimators=158, n_jobs=3),
 'ada_dt': AdaBoostRegres

## Evaluations


In [23]:
all_val = []

for ml in best_clf.keys():
    Y_prob_pred = best_clf[ml].predict(X_te)
    eval_temp = eval_model(y_test,Y_prob_pred)
    eval_temp['model_name'] = ml
    all_val.append(eval_temp)

In [24]:
pd.DataFrame(all_val)[['model_name','MAE','R2']]

Unnamed: 0,model_name,MAE,R2
0,reg,11.455454,0.619078
1,lrd,11.438412,0.619362
2,llo,11.178993,0.636057
3,eln,11.238896,0.629846
4,dtr,11.546309,0.591492
5,rfr,13.735414,0.282431
6,bag_dt,11.897736,0.586301
7,bag_lr,11.439928,0.618966
8,ada_dt,15.71741,0.463325
9,ada_lr,13.042229,0.495981
