In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
mbs = pd.read_excel("MapleBSCupHist.xlsx", index_col=[0], parse_dates=[0])

In [4]:
# Peek at the first rows and confirm the type of the loaded object.
# NOTE: a cell only echoes its final expression, so the head() preview on the next
# line is computed but not displayed; only the type is shown.
mbs.head()
type(mbs)

pandas.core.frame.DataFrame

In [5]:
# Pull the raw observations out of the DataFrame as a 2-D NumPy array (one row per period)
mbs_series_value = mbs.to_numpy()
mbs_series_value

array([[  686],
       [   20],
       [ 6830],
       [14213],
       [ 9400],
       [10740],
       [10290],
       [ 8260],
       [19456],
       [ 9462],
       [12899],
       [13439],
       [11509],
       [12878],
       [13967],
       [19111],
       [12164],
       [ 8097],
       [10525],
       [10897],
       [14392],
       [15304],
       [24049],
       [19706],
       [16909],
       [17734],
       [18601],
       [18764],
       [17659],
       [16330],
       [12646],
       [16297],
       [18601],
       [18997],
       [23166],
       [21995],
       [28056],
       [34825],
       [30823],
       [44551],
       [27123],
       [ 8310],
       [11779],
       [13380],
       [22498],
       [32002],
       [32410],
       [30688],
       [12943]], dtype=int64)

### NORMALIZATION TECHNIQUE

In [6]:
# Normalization - use this if
# a) your data is on different scales
# b) the algorithm you are using works better when data is normalized (some do)
# c) you want a standard preprocessing step - as a data scientist, you should try this

# Here's the formula for NORMALIZATION
# normalization = (x-min)/(max-min)

In [7]:
mbs_series_value.min()

20

In [8]:
mbs_series_value.max()

44551

In [9]:
# So when we normalize the data, the first value of 686 is transformed to 0.014955.  Normalized values will always
# lie within the range 0-1.  In the sklearn library, MinMaxScaler does this for you.

(686-20)/(44551-20)

0.01495587343648245

In [18]:
from sklearn.preprocessing import MinMaxScaler

In [19]:
scaler = MinMaxScaler(feature_range=(0,1))

In [20]:
# Learn the min/max from the training window only (the first 40 observations, the same
# slice used for the train split) so that no information from the held-out periods
# leaks into the scaling.  For this series the min (20) and max (44551) both fall
# inside that window, so the fitted range is the same as when fitting on all rows.
scaler = scaler.fit(mbs_series_value[:40])

In [21]:
scaler.data_min_

array([20.])

In [22]:
scaler.data_max_

array([44551.])

In [23]:
mbs_normalize = scaler.transform(mbs_series_value)

In [24]:
mbs_normalize[0]

array([0.01495587])

In [25]:
# Show the first five scaled observations, one per line
for scaled_row in mbs_normalize[:5]:
    print(scaled_row)

[0.01495587]
[0.]
[0.15292717]
[0.31872179]
[0.21063978]


### Notice that the value is the same (0.0149558) whether it is computed with sklearn's MinMaxScaler (fit + transform) or manually.

In [29]:
# With the data NORMALIZED, the usual ARIMA workflow applies:
# a) Split the normalized data into train & test sets
# b) Create the model
# c) Fit the model

# Training window: the first 40 scaled observations
train_mbs = mbs_normalize[:40]
train_mbs

array([[0.01495587],
       [0.        ],
       [0.15292717],
       [0.31872179],
       [0.21063978],
       [0.24073118],
       [0.23062586],
       [0.18503964],
       [0.43645999],
       [0.21203207],
       [0.28921426],
       [0.30134064],
       [0.25800004],
       [0.28874267],
       [0.31319755],
       [0.42871258],
       [0.2727089 ],
       [0.18137926],
       [0.23590308],
       [0.24425681],
       [0.32274146],
       [0.34322158],
       [0.53960163],
       [0.44207406],
       [0.37926388],
       [0.3977903 ],
       [0.41725989],
       [0.42092026],
       [0.39610608],
       [0.36626171],
       [0.28353282],
       [0.36552065],
       [0.41725989],
       [0.42615257],
       [0.51977274],
       [0.49347645],
       [0.62958389],
       [0.78159035],
       [0.69172037],
       [1.        ]])

In [31]:
test_mbs = mbs_normalize[40:49]

In [32]:
test_mbs

array([[0.60863219],
       [0.18616245],
       [0.26406324],
       [0.30001572],
       [0.50477196],
       [0.71819631],
       [0.72735847],
       [0.68868878],
       [0.29020233]])

In [33]:
from statsmodels.tsa.arima_model import ARIMA

In [34]:
# NOTE(review): `statsmodels.tsa.arima_model` is the legacy ARIMA implementation; newer
# statsmodels releases provide `statsmodels.tsa.arima.model.ARIMA` instead and, to my
# knowledge, no longer ship this module - confirm against the installed version.
from statsmodels.tsa.arima_model import ARIMA
# Create the model: ARIMA with order (p=3, d=1, q=2) on the normalized training window
mbs_arimanormalize_model = ARIMA(train_mbs, order=(3,1,2))

In [35]:
# fit the model
mbs_arimanormalize_fit = mbs_arimanormalize_model.fit()



In [36]:
# Check AIC
mbs_arimanormalize_fit.aic

-60.837199010832876

In [37]:
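# WITHOUT normalizing, it was 774.0700534328645!!
# Caution: AIC depends on the units of the series, so AIC values from differently scaled data are not
# directly comparable.  Dividing the data by (max - min) alone shifts AIC by about 2*39*ln(44531) ~ 834.9,
# which accounts for the whole drop from 774.07 to -60.84 -- it does not mean the model fits better.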

In [38]:
# Forecast cups using the fitted ARIMA model, order (3,1,2).  Give us the next 9 months.  Then you can compare to the TEST set
# [0] keeps only the first item returned by forecast(), i.e. the array of point forecasts.
cups_forecast = mbs_arimanormalize_fit.forecast(steps = 9)[0]

In [39]:
cups_forecast

array([0.92080961, 0.97577212, 1.01463415, 0.98914647, 1.08032688,
       1.03686126, 1.10925219, 1.11523407, 1.12137367])

In [43]:
# How does cups_forecast compare to test set?
test_mbs

array([[0.60863219],
       [0.18616245],
       [0.26406324],
       [0.30001572],
       [0.50477196],
       [0.71819631],
       [0.72735847],
       [0.68868878],
       [0.29020233]])

In [44]:
# Root-mean-squared error of the forecast against the held-out window
# (both are still in the 0-1 scaled units at this point)
import numpy as np
from sklearn.metrics import mean_squared_error

np.sqrt(mean_squared_error(y_true=test_mbs, y_pred=cups_forecast))

0.5972749573162686

In [51]:
# To undo the scaling, both the forecast and the test values must first be
# 2-D column vectors, i.e. one value per row.

cups_forecast_reshape = cups_forecast.reshape(-1, 1)
test_mbs_reshape = test_mbs.reshape(-1, 1)

In [52]:
# After reshaping, we can REVERSE it 
cups_forecast_reverse = scaler.inverse_transform(cups_forecast_reshape)
test_mbs_reverse = scaler.inverse_transform(test_mbs_reshape)

In [50]:
cups_forecast_reverse

array([[41024.5726564 ],
       [43472.10829143],
       [45202.67344832],
       [44067.68145882],
       [48128.03625622],
       [46192.46875525],
       [49416.10949221],
       [49682.48822736],
       [49955.89071812]])

In [53]:
test_mbs_reverse

array([[27123.],
       [ 8310.],
       [11779.],
       [13380.],
       [22498.],
       [32002.],
       [32410.],
       [30688.],
       [12943.]])

In [55]:
# Let's calc the error
np.sqrt(mean_squared_error(test_mbs_reverse, cups_forecast_reverse))

22980.615525492125

#### So, this data transformation technique DID NOT work.  The root-mean-squared error (RMSE) is still higher!


## STANDARDIZATION TECHNIQUE

In [57]:
# standardization = (x - mean) / standard deviation

In [64]:
# Your data should be normally distributed.  If it is, this technique should work like a charm
from sklearn.preprocessing import StandardScaler 

In [65]:
# mbs_series_value

In [67]:
# Create the standardizer and learn the mean / variance of the series in one step
# (fit() hands back the fitted scaler itself).
# NOTE(review): this fits on the full series, including the periods that are later
# held out as the test set - consider fitting on the training window only.
std_scaler = StandardScaler().fit(mbs_series_value)

In [68]:
std_scaler.mean_

array([17048.59183673])

In [69]:
std_scaler.var_

array([75238271.30279051])

In [70]:
std_mbs = std_scaler.transform(mbs_series_value)

In [71]:
std_mbs[0]

array([-1.88639524])

In [72]:
# After you have transformed, you can now do
# a) Split standardized data into train & test sets
# b) Create the model
# c) Fit the model

In [75]:
train_stdmbs = std_mbs[0:40]
test_stdmbs = std_mbs[40:49]

In [76]:
# NOTE(review): `statsmodels.tsa.arima_model` is the legacy ARIMA implementation; newer
# statsmodels releases provide `statsmodels.tsa.arima.model.ARIMA` instead - confirm
# which one the installed version supports.
from statsmodels.tsa.arima_model import ARIMA
# Create the model: ARIMA with order (p=3, d=1, q=2) on the standardized training window
mbs_arimaSTD_model = ARIMA(train_stdmbs, order=(3,1,2))

In [77]:
# Fit the model
mbs_arimaSTD_model_fit = mbs_arimaSTD_model.fit()

In [78]:
# Check AIC.  It is numerically higher than for the min-max-normalized series, but AIC depends on the
# units of the data, so values from differently scaled series are not directly comparable.
# Let's continue with the forecast.
mbs_arimaSTD_model_fit.aic

66.75951252821253

In [79]:
mbs_arimaSTD_model_forecast = mbs_arimaSTD_model_fit.forecast(steps = 9)[0]

In [80]:
mbs_arimaSTD_model_forecast

array([2.76419939, 3.04635121, 3.24581355, 3.11508754, 3.58304925,
       3.36004813, 3.73160474, 3.76234238, 3.79391806])

In [82]:
# How does the STD forecast compare to the test?
test_stdmbs

array([[ 1.16144898],
       [-1.00744663],
       [-0.60751579],
       [-0.4229412 ],
       [ 0.62824629],
       [ 1.72393459],
       [ 1.77097171],
       [ 1.57244738],
       [-0.47332165]])

In [83]:
# The forecast and the test set clearly differ.  Map both back to the original
# scale and check the RMSE; first reshape each into a 2-D column vector.

mbs_arimaSTDforecast_reshape = mbs_arimaSTD_model_forecast.reshape(-1, 1)
test_STDmbs_reshape = test_stdmbs.reshape(-1, 1)

In [86]:
# After reshaping, we can REVERSE it 
arimaSTD_forecast_reverse = std_scaler.inverse_transform(mbs_arimaSTDforecast_reshape)
test_STDmbs_reverse = std_scaler.inverse_transform(test_STDmbs_reshape)

In [88]:
test_STDmbs_reverse

array([[27123.],
       [ 8310.],
       [11779.],
       [13380.],
       [22498.],
       [32002.],
       [32410.],
       [30688.],
       [12943.]])

In [87]:
# Let's calc the error
np.sqrt(mean_squared_error(test_STDmbs_reverse, arimaSTD_forecast_reverse))

26597.856028563107