In [10]:
# General-purpose libraries: arrays, dataframes and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Seed NumPy's and TensorFlow's global random generators with the same fixed
# value (42) to ensure consistent accuracy scores for every model iteration
from numpy.random import seed
seed(42)
import tensorflow as tf
tf.random.set_seed(42)

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below — confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

### Import pre-processed files from CSV

In [11]:
# %%time

# train1 = pd.read_csv(r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\toCSV\train1.csv")
# test1 = pd.read_csv(r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\toCSV\test1.csv")
# test_val1 = pd.read_csv(r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\toCSV\test_val1.csv")

# train2 = pd.read_csv(r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\toCSV\train2.csv")
# test2 = pd.read_csv(r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\toCSV\test2.csv")
# test_val2 = pd.read_csv(r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\toCSV\test_val2.csv")

# print('Done')

Importing the pre-processed datasets proved to consume far more memory (RAM) and time than recreating them from scratch (as done below).

In [12]:
# Load the three raw CSV files
train = pd.read_csv(
    r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\train_data.csv")

test = pd.read_csv(
    r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\test_data.csv")

test_val = pd.read_csv(
    r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\test_data_hidden.csv")

# test_data.csv is parsed as dd-mm-YYYY; rewrite its dates as YYYY-mm-dd strings
# so that every row shares one format once the frames are stacked.
test['Date'] = pd.to_datetime(test['Date'], format='%d-%m-%Y').dt.strftime('%Y-%m-%d')

# Row counts are needed later to cut the stacked frame back into its three parts
length_train = len(train.index)
length_test = len(test.index)
length_test_val = len(test_val.index)

# Stack train / test / test_val so the encoding below is applied identically to all of them.
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same stacked frame.
combi = pd.concat([train, test, test_val], ignore_index=True)
assert len(combi) == length_train + length_test + length_test_val

# Parse the date strings once and derive every calendar feature from that single result
combi_dates = pd.to_datetime(combi['Date'], format='%Y-%m-%d')

# Creating dummy variables for dates (year is replaced with discrete values 0/1/2)
combi['year'] = combi_dates.dt.year.replace({2013: 0, 2014: 1, 2015: 2})
combi['month'] = combi_dates.dt.month
combi['day'] = combi_dates.dt.day

# Creating a new timedelta column: days elapsed since the earliest date, starting at 1
combi['date_delta'] = ((combi_dates - combi_dates.min()) / np.timedelta64(1, 'D')) + 1

# Finally, we also address the StateHoliday variable by converting it from object/string to discrete values
combi['StateHoliday'] = combi.StateHoliday.replace({'0': 0, 'a': 1, 'b': 2, 'c': 3})

# Categorical columns that are one-hot encoded in both dataset variants
calendar_cols = ['DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'year', 'month', 'day']

# Variant 1 - train a single model for all stores, using Store as a (one-hot encoded) feature.
combi1 = pd.get_dummies(combi, columns=['Store'] + calendar_cols, drop_first=True)
display(combi1.head())

# Variant 2 - train a separate model for each store, so Store stays a plain column used for grouping.
combi2 = pd.get_dummies(combi, columns=calendar_cols, drop_first=True)
display(combi2.head())

# Cut the encoded frames back into train / test / test_val by position (same order as stacked above)
end_train = length_train
end_test = length_train + length_test

train1 = combi1.iloc[:end_train].reset_index(drop=True)
test1 = combi1.iloc[end_train:end_test].reset_index(drop=True)
test_val1 = combi1.iloc[end_test:].reset_index(drop=True)

train2 = combi2.iloc[:end_train].reset_index(drop=True)
test2 = combi2.iloc[end_train:end_test].reset_index(drop=True)
test_val2 = combi2.iloc[end_test:].reset_index(drop=True)

Unnamed: 0,Date,Sales,Customers,date_delta,Store_2,Store_3,Store_4,Store_5,Store_6,Store_7,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,2015-06-30,5735.0,568,911.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2015-06-30,9863.0,877,911.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2015-06-30,13261.0,1072,911.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2015-06-30,13106.0,1488,911.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2015-06-30,6635.0,645,911.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


Unnamed: 0,Store,Date,Sales,Customers,date_delta,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,1,2015-06-30,5735.0,568,911.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,2015-06-30,9863.0,877,911.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,2015-06-30,13261.0,1072,911.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,2015-06-30,13106.0,1488,911.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,2015-06-30,6635.0,645,911.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Modelling - Notes on Accuracy Metrics

In [13]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

#### Between RMSE and MAPE, which metric is more suitable to use to compare different models?

Because of the square in RMSE, error terms with larger magnitudes will be more pronounced (making them even larger). In this dataset, we are not so concerned about the impact of large errors since we observed only one outlier in Sales data.

However, due to this variability in error magnitudes in RMSE (the squared errors), without the benefit of other information such as MAPE/MAE, it is impossible to determine to what extent RMSE reflects the ‘true’ central tendency (average error) and to what extent it represents the variability within the distribution of squared errors. In other words, a small number of large errors when squared can inflate the RMSE value beyond the true measure of average error.

* **Thus, for comparisons across different models, it appears better to use MAPE as the main unit of comparison, since there is an inconsistent functional relationship between RMSE and average error.** However, both RMSE and MAPE will be assessed throughout this project.


* Percentage errors are also more meaningful to compare than absolute values, since percentages are scale-independent and can therefore be used to compare forecasts on different scales and/or models. 


* MAPE is challenged when observed values include zeroes, near-zeroes and extreme values – this means we should be careful in using MAPE if the observed Sales value is zero or near-zero. However, for this study, this appears unlikely to be the case, except when a store is closed (Open = 0, thus Sales = 0).


* Finally, MAPE is also not sensitive to outliers, and our dataset has few-to-no outliers for Sales, so outliers are not a concern when using it in this instance.

As a note, R-Squared/Adjusted R-Squared is better used to evaluate how well the variability of the dependent variable can be explained <u>*by a particular model*</u>, while RMSE/MAPE is better used to compare the performance <u>*between different models*</u>.


In [14]:
# due to long model training runtimes, let's save them using joblib
import joblib

#### Model 1 - Linear Regression - Single Model for All Stores (Store as Feature)

In [15]:
# Preview the first rows of the single-model training and validation frames
display(train1.head(), test_val1.head())

Unnamed: 0,Date,Sales,Customers,date_delta,Store_2,Store_3,Store_4,Store_5,Store_6,Store_7,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,2015-06-30,5735.0,568,911.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2015-06-30,9863.0,877,911.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2015-06-30,13261.0,1072,911.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2015-06-30,13106.0,1488,911.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2015-06-30,6635.0,645,911.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


Unnamed: 0,Date,Sales,Customers,date_delta,Store_2,Store_3,Store_4,Store_5,Store_6,Store_7,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,2015-07-31,5263.0,555,942.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2015-07-31,6064.0,625,942.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2015-07-31,8314.0,821,942.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2015-07-31,13995.0,1498,942.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2015-07-31,4822.0,559,942.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
import os

# Location of the saved Model 1 estimator (the same file the following cell loads)
MODEL_1_PATH = r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\toCSV\Model1.sav"

# Validation target and features.
# Drop Customers as it has a VIF>5, and drop Date as we have already encoded it into dummies.
Y_val1 = test_val1['Sales']
X_val1 = test_val1.drop(['Date','Sales','Customers'],axis=1).values

# This regression takes a significant amount of time, so it is fitted (and saved with joblib)
# only when no saved model exists yet; otherwise the saved file is reused by the next cell.
if not os.path.exists(MODEL_1_PATH):
    Y_train1 = train1['Sales']
    X_train1 = train1.drop(['Date','Sales','Customers'],axis=1).values

    # instantiate
    lr_0 = LinearRegression()

    # fit
    lr_0.fit(X_train1,Y_train1)

    # save
    joblib.dump(lr_0, MODEL_1_PATH)

In [17]:
# Restore the fitted single-model regressor from disk and predict on the validation features
lr_0 = joblib.load(r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\toCSV\Model1.sav")
Y_pred1 = lr_0.predict(X_val1)

# Validation metrics for Model 1.
# NOTE(review): scikit-learn documents both metrics as (y_true, y_pred), but predictions are
# passed first here. The squared error is symmetric; the percentage error is not, so it ends up
# normalised by the predictions rather than the observed Sales — confirm this is intended.
Model_1_RMSE = np.sqrt(mean_squared_error(Y_pred1, Y_val1))
Model_1_MAPE = 100 * mean_absolute_percentage_error(Y_pred1, Y_val1)

for report_line in ('Model 1 RMSE %0.3f' % Model_1_RMSE,
                    'Model 1 MAPE %0.3f%%' % Model_1_MAPE):
    print(report_line)

Model 1 RMSE 1427.783
Model 1 MAPE 25.543%


#### Model 2 - Linear Regression - Separate model for each Store

In [18]:
# Preview the first rows of the per-store training and validation frames
display(train2.head(), test_val2.head())

Unnamed: 0,Store,Date,Sales,Customers,date_delta,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,1,2015-06-30,5735.0,568,911.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,2015-06-30,9863.0,877,911.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,2015-06-30,13261.0,1072,911.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,2015-06-30,13106.0,1488,911.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,2015-06-30,6635.0,645,911.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Unnamed: 0,Store,Date,Sales,Customers,date_delta,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,1,2015-07-31,5263.0,555,942.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,2,2015-07-31,6064.0,625,942.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,3,2015-07-31,8314.0,821,942.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,4,2015-07-31,13995.0,1498,942.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,5,2015-07-31,4822.0,559,942.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [19]:
%%time

Y_pred2 = np.zeros(test_val2.shape[0]) # call an array of zeroes to later fill with 'pred' values using for loop
Y_val2 = test_val2['Sales'] # call Y_val2 here as it remains the same for every instance of the looped regression

# the following grouby statements creates some sort of 3D dataset, so can't be printed
train_bystore = train2.groupby(['Store'])
test_bystore = test_val2.groupby(['Store'])

for i in range(1,1116):
    df1 = train_bystore.get_group(i)
    df2 = test_bystore.get_group(i)
    Y_train2 = df1['Sales']
    X_train2 = df1.drop(['Store','Date','Sales','Customers'],axis=1).values
    X_val2 = df2.drop(['Store','Date','Sales','Customers'],axis=1).values
    model = LinearRegression()
    pred = model.fit(X_train2, Y_train2).predict(X_val2) # for each regression from 1-1115, output a 'pred'
    i = 0
    for j in df2.index:
            Y_pred2[j] = pred[i] # place the 'pred' output into each row of Y_pred2
            i+=1

Model_2_RMSE = np.sqrt(mean_squared_error(Y_pred2,Y_val2))
Model_2_MAPE = (mean_absolute_percentage_error(Y_pred2,Y_val2)*100)

print('Model 2 RMSE %0.3f' %Model_2_RMSE)
print('Model 2 MAPE %0.3f%%' %Model_2_MAPE)

Model 2 RMSE 958.589
Model 2 MAPE 21.670%
Wall time: 9.63 s


In [20]:
# Summary of Model 1 vs Model 2. The bold label now reads "Separate Model For Each Store",
# matching the Model 2 section title (the previous wording said "For All Stores" for both options).
# \033[1m switches the terminal output to bold, \033[0;0m resets it.
print('From our two models, both of the RMSE and MAPE scores suggest that having a ' + '\033[1m' + 'Separate Model For Each Store ' + '\033[0;0m' + 'performs better than having a ' + '\033[1m' +'Single Model For All Stores.\n' + '\n \033[0;0m')
print('Model_1_RMSE: %0.3f' %Model_1_RMSE)
print('Model_1_MAPE: %0.3f%%' %Model_1_MAPE)
print('\n')
print('Model_2_RMSE: %0.3f' %Model_2_RMSE)
print('Model_2_MAPE: %0.3f%%' %Model_2_MAPE)

From our two models, both of the RMSE and MAPE scores suggest that having a [1mSeparate Model For All Stores [0;0mperforms better than having a [1mSingle Model For All Stores.

 [0;0m
Model_1_RMSE: 1427.783
Model_1_MAPE: 25.543%


Model_2_RMSE: 958.589
Model_2_MAPE: 21.670%


#### Average Ensemble of Model1 and Model2

In [21]:
# Element-wise mean of the Model 1 and Model 2 validation predictions
avg_ensemble = np.add(Y_pred1, Y_pred2) / 2

# Score the ensemble against the validation Sales.
# NOTE(review): scikit-learn documents both metrics as (y_true, y_pred), but predictions are
# passed first here; MAPE is therefore normalised by the predictions — confirm this is intended.
avg_ensemble_RMSE = np.sqrt(mean_squared_error(avg_ensemble, Y_val1))
avg_ensemble_MAPE = 100 * mean_absolute_percentage_error(avg_ensemble, Y_val1)

for report_line in ('Avg Ensemble RMSE %0.3f' % avg_ensemble_RMSE,
                    'Avg Ensemble MAPE %0.3f%%' % avg_ensemble_MAPE):
    print(report_line)

print('\n An average ensemble of Model 1 and Model 2 does not improve the performance of RMSE and MAPE over the standalone performance of Model 2')

Avg Ensemble RMSE 1044.469
Avg Ensemble MAPE 22.655%

 An average ensemble of Model 1 and Model 2 does not improve the performance of RMSE and MAPE over the standalone performance of Model 2


#### Model 1: Regularization with Ridge Regression

In [22]:
import os

# Location of the saved Ridge estimator for the single-model dataset
RIDGE_MODEL_1_PATH = r"D:\Simplilearn\07 AI Capstone Project\Project 3 Data\toCSV\RRmodel.sav"

# Targets and features; Date, Sales and Customers are excluded from the feature matrix
Y_train_ridge = train1['Sales']
Y_val_ridge = test_val1['Sales']
X_train_ridge = train1.drop(['Date','Sales','Customers'],axis=1).values
X_val_ridge = test_val1.drop(['Date','Sales','Customers'],axis=1).values

# Reuse the saved model when it exists; otherwise fit Ridge(alpha=10) and save it, so the
# cell also works on a machine where the model file has not been created yet.
if os.path.exists(RIDGE_MODEL_1_PATH):
    # load
    rr = joblib.load(RIDGE_MODEL_1_PATH)
else:
    # instantiate
    rr = Ridge(alpha=10)

    # fit
    rr.fit(X_train_ridge, Y_train_ridge)

    # save
    joblib.dump(rr, RIDGE_MODEL_1_PATH)

# predict
Y_pred_ridge = rr.predict(X_val_ridge)

# evaluate
# NOTE(review): scikit-learn documents both metrics as (y_true, y_pred), but predictions are
# passed first here. MSE is symmetric; MAPE is not, so it is normalised by the predictions —
# confirm this is intended (kept as-is so scores stay comparable with the other cells).
Model_1_ridge_RMSE = np.sqrt(mean_squared_error(Y_pred_ridge,Y_val_ridge))
Model_1_ridge_MAPE = (mean_absolute_percentage_error(Y_pred_ridge,Y_val_ridge)*100)

print('Model 1 Ridge RMSE %0.3f' %Model_1_ridge_RMSE)
print('Model 1 Ridge MAPE %0.3f%%' %Model_1_ridge_MAPE)

Model 1 Ridge RMSE 1431.524
Model 1 Ridge MAPE 25.590%


#### Model 2: Regularization with Ridge Regression

In [23]:
# Buffer of zeroes that is filled with each store's Ridge predictions inside the loop
Y_pred_ridge2 = np.zeros(test_val2.shape[0])
Y_val_ridge2 = test_val2['Sales']

# Group by the scalar key 'Store' (not a one-element list) so the group keys are plain store ids
train_bystore = train2.groupby('Store')
test_bystore = test_val2.groupby('Store')

# Fit one Ridge(alpha=10) per store found in the validation data (no hard-coded store-id range).
# get_group raises KeyError if a validation store has no training rows.
for store_id, df2 in test_bystore:
    df1 = train_bystore.get_group(store_id)
    Y_train_ridge2 = df1['Sales']
    X_train_ridge2 = df1.drop(['Store','Date','Sales','Customers'],axis=1).values
    X_val_ridge2 = df2.drop(['Store','Date','Sales','Customers'],axis=1).values
    model = Ridge(alpha=10)
    pred = model.fit(X_train_ridge2, Y_train_ridge2).predict(X_val_ridge2)
    # test_val2 was built with reset_index(drop=True), so its index labels double as
    # positions in Y_pred_ridge2; write the whole store's predictions in one assignment.
    Y_pred_ridge2[df2.index.to_numpy()] = pred

# NOTE(review): scikit-learn documents both metrics as (y_true, y_pred), but predictions are
# passed first here. MSE is symmetric; MAPE is not, so it is normalised by the predictions —
# confirm this is intended (kept as-is so scores stay comparable with the other cells).
Model_2_ridge_RMSE = np.sqrt(mean_squared_error(Y_pred_ridge2,Y_val_ridge2))
Model_2_ridge_MAPE = (mean_absolute_percentage_error(Y_pred_ridge2,Y_val_ridge2)*100)

print('Model 2 Ridge RMSE %0.3f' %Model_2_ridge_RMSE)
# The second line reports the MAPE value; it was previously labelled "RMSE" by mistake
print('Model 2 Ridge MAPE %0.3f%%' %Model_2_ridge_MAPE)

Model 2 Ridge RMSE 943.773
Model 2 Ridge RMSE 21.442%


#### Model 2: Bagging with DecisionTree Regressor

In [24]:
%%time
from sklearn.ensemble import BaggingRegressor
from sklearn import tree

Y_pred_bag = np.zeros(test_val2.shape[0]) 
Y_val_bag = test_val2['Sales'] 

train_bystore = train2.groupby(['Store'])
test_bystore = test_val2.groupby(['Store'])

for i in range(1,1116):
    df1 = train_bystore.get_group(i)
    df2 = test_bystore.get_group(i)
    Y_train_bag = df1['Sales']
    X_train_bag = df1.drop(['Store','Date','Sales','Customers'],axis=1).values
    X_val_bag = df2.drop(['Store','Date','Sales','Customers'],axis=1).values
    model = BaggingRegressor(tree.DecisionTreeRegressor(random_state=1))
    pred = model.fit(X_train_bag, Y_train_bag).predict(X_val_bag) 
    i = 0
    for j in df2.index:
            Y_pred_bag[j] = pred[i] 
            i+=1

Model_2_bag_RMSE = np.sqrt(mean_squared_error(Y_pred_bag,Y_val_bag))
Model_2_bag_MAPE = (mean_absolute_percentage_error(Y_pred_bag,Y_val_bag)*100)

print('Model 2 Bagging RMSE %0.3f' %Model_2_bag_RMSE)
print('Model 2 Bagging MAPE %0.3f%%' %Model_2_bag_MAPE)

Model 2 Bagging RMSE 1224.309
Model 2 Bagging MAPE 10.282%
Wall time: 57.7 s


In [25]:
from IPython.display import display

# Collect the scores computed in the cells above, one [name, RMSE, MAPE] row per model
Data_Week1 = [
    ['Model_1', Model_1_RMSE, Model_1_MAPE],
    ['Model_2', Model_2_RMSE, Model_2_MAPE],
    ['Avg_Ensemble (0 & 1)', avg_ensemble_RMSE, avg_ensemble_MAPE],
    ['Model_1_Ridge', Model_1_ridge_RMSE, Model_1_ridge_MAPE],
    ['Model_2_Ridge', Model_2_ridge_RMSE, Model_2_ridge_MAPE],
    ['Model_2_Bagging', Model_2_bag_RMSE, Model_2_bag_MAPE],
]
Results_All_Models = pd.DataFrame(Data_Week1, columns=['Model', 'RMSE Score', 'MAPE Score (%)'])

# Turn both score columns into display strings: thousands separator, two decimals,
# and a trailing percent sign for the MAPE column
for score_column, template in (('RMSE Score', '{:,.2f}'), ('MAPE Score (%)', '{:,.2f}%')):
    Results_All_Models[score_column] = Results_All_Models[score_column].map(template.format)
display(Results_All_Models)

# Written interpretation of the table
print("The best RMSE score goes to Model_2_Ridge, though the best MAPE score goes to Model_2_Bagging (with DecisionTree Regressor). \n\nHowever, choosing the 'winner' of the two may prove difficult:\n")
reasons = [
    'Both of these use the same dataset (train2, test_val2, test2), hence it is valid to compare their RMSE scores directly.\n\n',
    'At the same time, MAPE scores for both may be unreliable/unusable since we do have Sales = 0 when store Open = 0.\n\n',
    'Finally, the MAPE score for Model_2_Bagging is nearly half of Model_2_Ridge (despite a comparable RMSE score) indicating a wider range in the predictions, possibly stemming from the bagging process, thus causing any errors to appear smaller in percentage terms.\n\n',
]
# Print the reasons as a numbered list starting at 1; each entry already ends with its own blank line
for number, reason in enumerate(reasons, start=1):
    print(str(number) + '. ' + reason, end='')
# \033[1m switches the terminal output to bold, \033[0;0m resets it
print('\033[1m' + 'By this reasoning, Model_2_Ridge (based on its RMSE alone) is better.' + '\033[0;0m')

Unnamed: 0,Model,RMSE Score,MAPE Score (%)
0,Model_1,1427.78,25.54%
1,Model_2,958.59,21.67%
2,Avg_Ensemble (0 & 1),1044.47,22.65%
3,Model_1_Ridge,1431.52,25.59%
4,Model_2_Ridge,943.77,21.44%
5,Model_2_Bagging,1224.31,10.28%


The best RMSE score goes to Model_2_Ridge, though the best MAPE score goes to Model_2_Bagging (with DecisionTree Regressor). 

However, choosing the 'winner' of the two may prove difficult:

1. Both of these use the same dataset (train2, test_val2, test2), hence it is valid to compare their RMSE scores directly.

2. At the same time, MAPE scores for both may be unreliable/unusable since we do have Sales = 0 when store Open = 0.

3. Finally, the MAPE score for Model_2_Bagging is nearly half of Model_2_Ridge (despite a comparable RMSE score) indicating a wider range in the predictions, possibly stemming from the bagging process, thus causing any errors to appear smaller in percentage terms.

[1mBy this reasoning, Model_2_Ridge (based on its RMSE alone) is better.[0;0m


#### Average Ensemble of Model_2 and Model_2_Ridge

In [26]:
# Given our inconclusive results, lets try a couple other average ensembles.
# This one is the element-wise mean of the per-store linear and per-store Ridge predictions.
avg_ensemble2 = np.add(Y_pred2, Y_pred_ridge2) / 2

# NOTE(review): scikit-learn documents both metrics as (y_true, y_pred), but predictions are
# passed first here; MAPE is therefore normalised by the predictions — confirm this is intended.
avg_ensemble2_RMSE = np.sqrt(mean_squared_error(avg_ensemble2, Y_val1))
avg_ensemble2_MAPE = 100 * mean_absolute_percentage_error(avg_ensemble2, Y_val1)

for report_line in ('Avg Ensemble 2 RMSE %0.3f' % avg_ensemble2_RMSE,
                    'Avg Ensemble 2 MAPE %0.3f%%' % avg_ensemble2_MAPE):
    print(report_line)

Avg Ensemble 2 RMSE 937.908
Avg Ensemble 2 MAPE 21.451%


#### Average Ensemble of Model_2_Ridge and Model_2_Bagging

In [27]:
# Element-wise mean of the per-store bagging and per-store Ridge predictions
avg_ensemble3 = np.add(Y_pred_bag, Y_pred_ridge2) / 2

# NOTE(review): scikit-learn documents both metrics as (y_true, y_pred), but predictions are
# passed first here; MAPE is therefore normalised by the predictions — confirm this is intended.
avg_ensemble3_RMSE = np.sqrt(mean_squared_error(avg_ensemble3, Y_val1))
avg_ensemble3_MAPE = 100 * mean_absolute_percentage_error(avg_ensemble3, Y_val1)

for report_line in ('Avg Ensemble 3 RMSE %0.3f' % avg_ensemble3_RMSE,
                    'Avg Ensemble 3 MAPE %0.3f%%' % avg_ensemble3_MAPE):
    print(report_line)

Avg Ensemble 3 RMSE 981.582
Avg Ensemble 3 MAPE 21.512%


In [28]:
print("The best performing model thus appears to be Avg Ensemble 2: which is an average ensemble of Model_2 and Model_2_Ridge")

# Scores of the two additional ensembles; the numbers in each label are the row
# positions of the combined models in Results_All_Models
temp2 = pd.DataFrame({
    "Model":          ['Avg_Ensemble_2 (1 & 4)',       'Avg_Ensemble_3 (4 & 5)'],
    "RMSE Score":     [avg_ensemble2_RMSE,              avg_ensemble3_RMSE],
    "MAPE Score (%)": [avg_ensemble2_MAPE,              avg_ensemble3_MAPE]})

# Format as display strings, matching the columns already present in Results_All_Models
temp2['RMSE Score'] = temp2['RMSE Score'].map('{:,.2f}'.format)
temp2['MAPE Score (%)'] = temp2['MAPE Score (%)'].map('{:,.2f}%'.format)

# DataFrame.append was removed in pandas 2.0, so the rows are added with pd.concat.
# Rows for these two ensembles left over from an earlier run of this cell are dropped first,
# so re-running the cell does not duplicate them.
rows_to_keep = Results_All_Models[~Results_All_Models['Model'].isin(temp2['Model'])]
Results_All_Models = pd.concat([rows_to_keep, temp2], ignore_index=True)
display(Results_All_Models)

The best performing model thus appears to be Avg Ensemble 2: which is an average ensemble of Model_2 and Model_2_Ridge


Unnamed: 0,Model,RMSE Score,MAPE Score (%)
0,Model_1,1427.78,25.54%
1,Model_2,958.59,21.67%
2,Avg_Ensemble (0 & 1),1044.47,22.65%
3,Model_1_Ridge,1431.52,25.59%
4,Model_2_Ridge,943.77,21.44%
5,Model_2_Bagging,1224.31,10.28%
6,Avg_Ensemble_2 (1 & 4),937.91,21.45%
7,Avg_Ensemble_3 (4 & 5),981.58,21.51%


In [29]:
# Store this variable for use in later segments
Results_All_Models_data = Results_All_Models
%store Results_All_Models_data
del Results_All_Models_data

Stored 'Results_All_Models_data' (DataFrame)
