In [1]:
import warnings
import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')
%matplotlib inline

  import pandas.util.testing as tm


In [2]:
# date parser
parser = lambda date: pd.to_datetime(date, format='%d-%m-%y %H:%M')

# load the dataset
df = pd.read_csv(
    'EVANS-EFCO_final_clean_with_2018.csv', parse_dates=[0],
    index_col=[0], date_parser=parser, engine='python'
)

In [3]:
series = df.loc[:, 'ENERGY CONSUMPTION'].values
series

array([ 7. ,  7. ,  7. , ..., 10. ,  9.5,  9. ])

## Check stationarity of the data

In [4]:
# ADF Test to check if our data is stationary or not
result = adfuller(series, autolag='AIC')

In [5]:
print(f'ADF Statistic: {result[0]}\n')
print(f'n_lags: {result[1]}')
print(f'p-value: {result[1]}\n')
for key, value in result[4].items():
    print('Critial Values:')
    print(f'   {key}, {value}')

ADF Statistic: -21.49100685507755

n_lags: 0.0
p-value: 0.0

Critial Values:
   1%, -3.430599102593299
Critial Values:
   5%, -2.8616500960359854
Critial Values:
   10%, -2.5668286008605627


Since P < 0.05 and ADF stat is way lesser than any critical value, our time series is stationary.

## Use grid search to find a suitable model

In [6]:
# Define the p, d and q parameters to take any value between 0 and 2
p = d = q = range(0, 2)

# Generate all different combinations of p, q and q triplets
pdq = list(itertools.product(p, d, q))

In [7]:
print('Examples of parameter combinations for ARIMA...')
print('ARIMA: {}'.format(pdq[1]))
print('ARIMA: {}'.format(pdq[3]))
print('ARIMA: {}'.format(pdq[5]))

Examples of parameter combinations for ARIMA...
ARIMA: (0, 0, 1)
ARIMA: (0, 1, 1)
ARIMA: (1, 0, 1)


In [8]:
warnings.filterwarnings("ignore") # specify to ignore warning messages

for param in pdq:
    mod = sm.tsa.ARIMA(series, order=param)
    results = mod.fit()
    print('ARIMA{} - AIC:{}'.format(param, results.aic))


ARIMA(0, 0, 0) - AIC:260819.4006451146
ARIMA(0, 0, 1) - AIC:237456.74031349208
ARIMA(0, 1, 0) - AIC:203251.30845778226
ARIMA(0, 1, 1) - AIC:201576.3006565532
ARIMA(1, 0, 0) - AIC:202516.70940911974
ARIMA(1, 0, 1) - AIC:201186.29796719062
ARIMA(1, 1, 0) - AIC:201714.7774227533
ARIMA(1, 1, 1) - AIC:201575.68003539517


Since the ***ARIMA(1, 0, 1)*** model has the lowest ***AIC*** score,  it is our chosen model. 

## Fit model and predict

In [9]:
# trainX for model training
# test for testing prediction
train, test = df[:'2017'], df['2018']

In [10]:
model = sm.tsa.ARIMA(train['ENERGY CONSUMPTION'].values, order=(1, 0, 1))
model = model.fit(disp=0)

In [11]:
print(model.summary())

                              ARMA Model Results                              
Dep. Variable:                      y   No. Observations:                17544
Model:                     ARMA(1, 1)   Log Likelihood              -67890.931
Method:                       css-mle   S.D. of innovations             11.597
Date:                Fri, 25 Sep 2020   AIC                         135789.862
Time:                        01:00:08   BIC                         135820.952
Sample:                             0   HQIC                        135800.098
                                                                              
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         61.1793      2.079     29.421      0.000      57.104      65.255
ar.L1.y        0.9688      0.002    491.245      0.000       0.965       0.973
ma.L1.y       -0.2581      0.008    -34.073      0.0

In [12]:
model.save('arima-model.pkl')