# ARIMA

In [10]:
#source: ttps://github.com/tgsmith61591/pmdarima/blob/master/examples/quick_start_example.ipynb
#download: https://github.com/tgsmith61591/pmdarima

## Finding the best p,d,q

In [76]:
import pandas as pd
from matplotlib import pyplot
import os
from matplotlib.pyplot import figure
import numpy as np
import pmdarima as pm
from scipy import stats
import statsmodels.api as sm

print('numpy version: %r' % np.__version__)
print('pmdarima version: %r' % pm.__version__)

numpy version: '1.15.4'
pmdarima version: '1.1.0'


In [77]:
df_train = pd.read_csv(os.path.join('Data','Modeling','Training&Testing','train_subdist.csv'), header=0, parse_dates=[0], index_col=0, squeeze=True)
train = series['DF_0'].values
df_test = pd.read_csv(os.path.join('Data','Modeling','Training&Testing','test_subdist.csv'), header=0, skiprows=0)
df_test = df_test.drop('Unnamed: 0', axis = 1)
df_test = df_test.drop(['bin','bowl','bucket','misc_short','jar','pottedplant','tire','misc_tall'], axis = 1)
df_test_week_addrcode = df_test.iloc[:,[0,1,2,3]]

### Fitting an ARIMA

In [78]:
from pmdarima.arima import ARIMA

fit = ARIMA(order=(1, 1, 1), seasonal_order=(0, 1, 1, 12)).fit(y = train)



In [18]:
fit = ARIMA(order=(1, 1, 1), seasonal_order=None).fit(y = train)

### Finding the optimal model hyper-parameters using auto_arima

In [20]:
# fitting a stepwise model:
from pmdarima.arima import auto_arima

stepwise_fit = auto_arima(train, start_p=1, start_q=1, max_p=3, max_q=3, m=12,
                          start_P=0, seasonal=True, d=1, D=1, trace=True,
                          error_action='ignore',  # don't want to know if an order does not work
                          suppress_warnings=True,  # don't want convergence warnings
                          stepwise=True)  # set to stepwise

stepwise_fit.summary()

Fit ARIMA: order=(1, 1, 1) seasonal_order=(0, 1, 1, 12); AIC=58439.175, BIC=58479.611, Fit time=117.746 seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(0, 1, 0, 12); AIC=86644.983, BIC=86661.157, Fit time=1.370 seconds
Fit ARIMA: order=(1, 1, 0) seasonal_order=(1, 1, 0, 12); AIC=73272.548, BIC=73304.897, Fit time=17.769 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 1, 1, 12); AIC=58448.099, BIC=58480.448, Fit time=129.631 seconds
Fit ARIMA: order=(1, 1, 1) seasonal_order=(1, 1, 1, 12); AIC=58753.938, BIC=58802.461, Fit time=108.357 seconds
Fit ARIMA: order=(1, 1, 1) seasonal_order=(0, 1, 0, 12); AIC=75842.745, BIC=75875.094, Fit time=9.286 seconds
Fit ARIMA: order=(1, 1, 1) seasonal_order=(0, 1, 2, 12); AIC=59158.151, BIC=59206.674, Fit time=369.423 seconds
Fit ARIMA: order=(1, 1, 1) seasonal_order=(1, 1, 2, 12); AIC=58521.390, BIC=58578.001, Fit time=1068.343 seconds
Fit ARIMA: order=(2, 1, 1) seasonal_order=(0, 1, 1, 12); AIC=58504.685, BIC=58553.208, Fit time=139.617 seco

0,1,2,3
Dep. Variable:,y,No. Observations:,24047.0
Model:,"SARIMAX(1, 1, 1)x(0, 1, 1, 12)",Log Likelihood,-29214.588
Date:,"Thu, 10 Jan 2019",AIC,58439.175
Time:,00:57:29,BIC,58479.611
Sample:,0,HQIC,58452.288
,- 24047,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-4.18e-06,4.51e-06,-0.927,0.354,-1.3e-05,4.66e-06
ar.L1,0.0421,0.004,11.097,0.000,0.035,0.050
ma.L1,-0.8171,0.002,-391.646,0.000,-0.821,-0.813
ma.S.L12,-0.9972,0.001,-1329.152,0.000,-0.999,-0.996
sigma2,0.6686,0.002,333.559,0.000,0.665,0.673

0,1,2,3
Ljung-Box (Q):,381.52,Jarque-Bera (JB):,704926.72
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,0.83,Skew:,3.17
Prob(H) (two-sided):,0.0,Kurtosis:,28.76


In [23]:
# fitting a stepwise model:
from pmdarima.arima import auto_arima

stepwise_fit = auto_arima(wineind, start_p=1, start_q=1, max_p=3, max_q=3, m=12,
                          start_P=0, seasonal= False, d=1, D=1, trace=True,
                          error_action='ignore',  # don't want to know if an order does not work
                          suppress_warnings=True,  # don't want convergence warnings
                          stepwise=True)  # set to stepwise

stepwise_fit.summary()

Fit ARIMA: order=(1, 1, 1); AIC=58375.664, BIC=58408.015, Fit time=5.496 seconds
Fit ARIMA: order=(0, 1, 0); AIC=69808.999, BIC=69825.175, Fit time=0.010 seconds
Fit ARIMA: order=(1, 1, 0); AIC=63354.800, BIC=63379.063, Fit time=0.113 seconds
Fit ARIMA: order=(0, 1, 1); AIC=58393.433, BIC=58417.696, Fit time=0.699 seconds
Fit ARIMA: order=(2, 1, 1); AIC=58324.962, BIC=58365.400, Fit time=5.323 seconds
Fit ARIMA: order=(2, 1, 0); AIC=60964.003, BIC=60996.354, Fit time=0.209 seconds
Fit ARIMA: order=(2, 1, 2); AIC=57502.768, BIC=57551.294, Fit time=19.496 seconds
Fit ARIMA: order=(3, 1, 3); AIC=57506.857, BIC=57571.559, Fit time=18.163 seconds
Fit ARIMA: order=(1, 1, 2); AIC=57510.920, BIC=57551.359, Fit time=9.188 seconds
Fit ARIMA: order=(3, 1, 2); AIC=57499.933, BIC=57556.547, Fit time=14.719 seconds
Fit ARIMA: order=(3, 1, 1); AIC=57862.085, BIC=57910.612, Fit time=12.571 seconds
Total fit time: 86.011 seconds


0,1,2,3
Dep. Variable:,D.y,No. Observations:,24046.0
Model:,"ARIMA(3, 1, 2)",Log Likelihood,-28742.966
Method:,css-mle,S.D. of innovations,0.8
Date:,"Thu, 10 Jan 2019",AIC,57499.933
Time:,16:08:38,BIC,57556.547
Sample:,1,HQIC,57518.291
,,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,3.03e-06,2.14e-05,0.141,0.888,-3.9e-05,4.51e-05
ar.L1.D.y,0.8292,0.012,67.942,0.000,0.805,0.853
ar.L2.D.y,0.0242,0.009,2.818,0.005,0.007,0.041
ar.L3.D.y,0.0200,0.007,2.693,0.007,0.005,0.035
ma.L1.D.y,-1.6462,0.010,-158.116,0.000,-1.667,-1.626
ma.L2.D.y,0.6467,0.010,62.289,0.000,0.626,0.667

0,1,2,3,4
,Real,Imaginary,Modulus,Frequency
AR.1,1.1333,-0.0000j,1.1333,-0.0000
AR.2,-1.1726,-6.5367j,6.6410,-0.2783
AR.3,-1.1726,+6.5367j,6.6410,0.2783
MA.1,1.0015,+0.0000j,1.0015,0.0000
MA.2,1.5440,+0.0000j,1.5440,0.0000


#### =============================================================

## Exogeneous Variable

In [79]:
df_train.head()

Unnamed: 0,Week,Year,addrcode,DF_0,DF_wm1,DF_wm2,DF_wm3,RF_wm1,RF_wm2,RF_wm3,...,Age 45 - 60,Age > 60,bin,bowl,bucket,misc_short,jar,pottedplant,tire,misc_tall
0,15,2014,800101,0.0,0.0,0.0,0.0,3.07,0.08,1.75,...,8958,6584,46.919,3.551,120.213,2.029,2.917,164.469,20.796,0.634
1,16,2014,800101,0.0,0.0,0.0,0.0,29.34,3.07,0.08,...,8958,6584,46.919,3.551,120.213,2.029,2.917,164.469,20.796,0.634
2,17,2014,800101,0.0,0.0,0.0,0.0,55.64,29.34,3.07,...,8958,6584,46.919,3.551,120.213,2.029,2.917,164.469,20.796,0.634
3,18,2014,800101,0.228948,0.0,0.0,0.0,12.71,55.64,29.34,...,8958,6584,46.919,3.551,120.213,2.029,2.917,164.469,20.796,0.634
4,19,2014,800101,0.0,0.228948,0.0,0.0,3.06,12.71,55.64,...,8958,6584,46.919,3.551,120.213,2.029,2.917,164.469,20.796,0.634


In [96]:
exogx_train =  df_train.iloc[:,[6,12,18,19,20,21,22,23,24,25,26]].values
exogx_train[1]

array([   0.        ,    0.        ,   33.38464286, 3786.        ,
       1934.        , 3354.        , 7153.        , 5710.        ,
       5753.        , 8958.        , 6584.        ])

In [87]:
df_test.head()

Unnamed: 0,Week,Year,addrcode,DF_0,DF_wm1,DF_wm2,DF_wm3,RF_wm1,RF_wm2,RF_wm3,...,LST_wm5,LST_wm6,Age < 7,Age 7 - 9,Age 10 - 14,Age 15 - 24,Age 25 - 34,Age 35 - 44,Age 45 - 60,Age > 60
0,7,2017,800101,3.434223,1.602637,1.144741,1.373689,0.0,19.79,68.64,...,25.9,26.32,3786,1934,3354,7153,5710,5753,8958,6584
1,8,2017,800101,0.457896,3.434223,1.602637,1.144741,0.0,0.0,19.79,...,26.101786,25.9,3786,1934,3354,7153,5710,5753,8958,6584
2,9,2017,800101,0.686845,0.457896,3.434223,1.602637,14.36,0.0,0.0,...,27.526429,26.101786,3786,1934,3354,7153,5710,5753,8958,6584
3,10,2017,800101,0.686845,0.686845,0.457896,3.434223,0.0,14.36,0.0,...,28.921429,27.526429,3786,1934,3354,7153,5710,5753,8958,6584
4,11,2017,800101,0.228948,0.686845,0.686845,0.457896,8.37,0.0,14.36,...,29.655,28.921429,3786,1934,3354,7153,5710,5753,8958,6584


In [136]:
exogx_test =  df_test.iloc[:,[6,12,18,19,20,21,22,23,24,25,26]].values
exgox_test = np.array(exogx_test)
len(exgox_test)

7848

## Fitting the model

In [83]:
fit0 = sm.tsa.ARIMA(train, (0,0,0),exog = exogx_train).fit()
pred0 = fit.predict(steps = 1, exog = exogx_test)

In [84]:
len(pred0)

24046

In [104]:
fit = sm.tsa.ARIMA(train, (3,1,2),exog = exogx_train).fit()
fit.fittedvalues



array([ 0.02737678,  0.0439902 ,  0.06189429, ..., -0.33926155,
       -0.30402143,  0.62005963])

In [126]:
pred = fit.forecast(steps = 1, exog = exogx_test)

ValueError: new exog needed for each step

In [86]:
len(pred)

24046

In [128]:
model = sm.tsa.statespace.SARIMAX(train, trend='c', order=(1,1,1),seasonal_order=(0,1,1,12),exog= exogx_train)
model_fit = model.fit()



In [144]:
predicted = model_fit.predict(step=1, exog=exogx_test, dynamic=True)

In [142]:
len(predicted)

24047

In [139]:
df_predicted = pd.DataFrame(predicted)
df_compare_addrcode = pd.concat([df_series_week_addrcode, df_predicted], axis = 1)
df_compare_addrcode.columns = [['Week','Year','addrcode','actual','predicted']]
#df_compare_addrcode.to_csv('LGBM_subdist_withCD.csv', encoding = 'utf-8')