# Predictive Analysis With Auto ARIMA 

## Data Collection and Preprocessing

In [27]:
# Imports
import sys
import os
from datetime import datetime
from datetime import date
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
from sklearn.model_selection import train_test_split
    
# To import the main.py file
sys.path.append('../')
from python_files import main

# Getting all the data
confirmed_global, deaths_global, recovered_global, country_cases = main.collect_data()


def _sum_by_country(raw):
    """Aggregate a raw frame per country and index the result by datetime.

    Rows are grouped on the "country" column and summed, then the frame is
    transposed so each country becomes a column and the former column labels
    become the index. That index is parsed with ``pd.to_datetime``
    (presumably one label per reporting date; confirm against
    ``main.collect_data``).
    """
    per_country = raw.groupby("country").sum().T
    # `infer_datetime_format` is deprecated in pandas 2.x (format inference is
    # the default there), so it is no longer passed.
    per_country.index = pd.to_datetime(per_country.index)
    return per_country


rec = _sum_by_country(recovered_global)
death = _sum_by_country(deaths_global)
conf = _sum_by_country(confirmed_global)

In [28]:
# Setting up plotly to work offline and in jupyter notebooks
pyo.init_notebook_mode(connected = True)
# IPython magic that selects matplotlib's inline backend. No direct matplotlib
# call is visible in this notebook, so this is presumably a leftover or kept
# for imported helpers; confirm before removing.
%matplotlib inline

In [248]:
def create_data_frame(Type, country="India"):
    """Build the single-column time series for one case type and one country.

    Parameters
    ----------
    Type : str
        'Death', 'Recovery' or 'Confirmed'; selects the notebook-level frame
        ``death``, ``rec`` or ``conf`` respectively.
    country : str, default "India"
        Column to extract from the selected frame.

    Returns
    -------
    pandas.DataFrame
        One column, "Total", indexed by the selected frame's index (renamed
        to "Date"), with every row whose total equals zero removed.

    Raises
    ------
    ValueError
        If ``Type`` is not one of the three supported labels.
    KeyError
        If ``country`` is not a column of the selected frame.
    """
    # Look the source up lazily so only the requested global needs to exist.
    if Type == 'Death':
        source = death
    elif Type == 'Recovery':
        source = rec
    elif Type == 'Confirmed':
        source = conf
    else:
        raise ValueError(
            f"Unknown Type {Type!r}; expected 'Death', 'Recovery' or 'Confirmed'"
        )

    # Creating a dataframe with Total number of cases everyday in a column
    data = pd.DataFrame(index=source.index, data=source[country].values, columns=["Total"])
    # Name the index "Date" without touching the source frame's own index.
    data = data.rename_axis('Date')
    # Making sure no zeroes exist in dataframe
    return data[data["Total"] != 0]


In [249]:
# Plotting the Data
def plot_data_frame(frame=None):
    """Return a dark-themed plotly line chart of a frame.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to plot. When omitted, the notebook-level ``data`` is used, so
        existing zero-argument calls behave as before.
    """
    if frame is None:
        frame = data
    return px.line(frame, template='plotly_dark')

In [250]:
def find_params(series=None, m=12):
    """Find the best p, d, q parameters for the model via stepwise ``auto_arima``.

    The search is seasonal with first-order regular and seasonal differencing
    fixed (``d=1``, ``D=1``), non-seasonal orders capped at ``max_p=3`` and
    ``max_q=3``, and ``trace=True`` so every candidate fit is printed.

    Parameters
    ----------
    series : array-like or pandas object, optional
        Series to search on. When omitted, the notebook-level ``data`` is
        used. Note that ``data`` also contains the rows that
        ``split_dataframe`` later holds out as the test set; pass the
        training split to keep order selection free of test data.
    m : int, default 12
        Seasonal period handed to ``auto_arima``.
        NOTE(review): the printed index dates are consecutive days, so a
        weekly period (``m=7``) may suit this data better. Confirm before
        changing the default.

    Returns
    -------
    The model object returned by ``auto_arima``.
    """
    if series is None:
        series = data
    return auto_arima(
        series,
        start_p=1, start_q=1,
        max_p=3, max_q=3,
        m=m,
        start_P=0, seasonal=True,
        d=1, D=1,
        trace=True,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
    )

In [251]:
def split_dataframe(frame=None, test_size=0.2):
    """Split a frame into leading train rows and trailing test rows.

    The split is by fraction of rows, not by a fixed date. ``shuffle=False``
    keeps the original row order, so the test set is the tail of the frame.
    The shapes and the first/last index dates of both parts are printed.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to split; its index must hold timestamps (``.date()`` is called
        on the min/max). When omitted, the notebook-level ``data`` is used.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    if frame is None:
        frame = data
    train, test = train_test_split(frame, test_size=test_size, shuffle=False)
    print('Test shape:',test.shape)
    print('Train shape:',train.shape)
    print(f'Min date from train set: {train.index.min().date()}')
    print(f'Max date from train set:{train.index.max().date()}')
    print(f'Min date from test set: {test.index.min().date()}')
    print(f'Max date from test set: {test.index.max().date()}')
    return train,test

In [252]:
def forecast(model=None, train_set=None, test_set=None, frame=None):
    """Fit the model on the training rows and forecast over the test window.

    Every argument is optional and falls back to the notebook-level global of
    the matching role (``stepwise_model``, ``train``, ``test``, ``data``), so
    the existing zero-argument calls keep working.

    Side effects: the model is refitted in place on the training rows, and
    the raw forecast is printed.

    Returns
    -------
    tuple
        ``(figure, predictions)``. ``figure`` is a plotly line chart of
        ``frame`` with the predictions alongside. ``predictions`` is a
        DataFrame with one column, "Prediction", indexed like the test rows.
    """
    model = stepwise_model if model is None else model
    train_set = train if train_set is None else train_set
    test_set = test if test_set is None else test_set
    frame = data if frame is None else frame

    # Fitting the model with train dataset
    model.fit(train_set)
    # Forecasting one step per held-out row
    future_forecast = model.predict(n_periods=len(test_set))
    print(future_forecast)
    # predict() may return a pandas Series carrying its own index (pmdarima
    # 2.x appears to do this for pandas input; confirm for the installed
    # version). Passing such a Series to DataFrame(..., index=...) aligns on
    # labels and yields NaN, so reduce it to a plain array first and let the
    # values be placed positionally on the test index.
    predicted_values = np.asarray(future_forecast)
    future_forecast = pd.DataFrame(predicted_values, index=test_set.index, columns=['Prediction'])
    # Plotting the predicted values and actual values
    figure = px.line(pd.concat([frame, future_forecast], axis=1), template='plotly_dark')
    return figure, future_forecast

In [253]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error between actual and predicted values.

    Both inputs are flattened to 1-D before comparison. Without that, a
    column-shaped (n, 1) input paired with a flat (n,) input would broadcast
    to an (n, n) matrix and silently give a meaningless score.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values; they must hold the same number of
        elements.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``. Zeros in ``y_true``
        are not special-cased, so the result is ``inf`` or ``nan`` in that
        case.

    Raises
    ------
    ValueError
        If the two inputs differ in element count.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {actual.size} and {predicted.size}"
        )
    return np.mean(np.abs((actual - predicted) / actual)) * 100

## Recovery

In [254]:
# Working series for this section: India's recovered totals with zero rows dropped.
# `data` is a notebook-level global that the helper functions read when called
# without arguments.
data=create_data_frame('Recovery')
plot_data_frame()

In [255]:
# Stepwise order search over `data`; each candidate fit is traced in the output.
stepwise_model=find_params()
# AIC of the model the search returned.
print(stepwise_model.aic())


Performing stepwise search to minimize aic
Fit ARIMA(1,1,1)x(0,1,1,12) [intercept=True]; AIC=2711.536, BIC=2726.589, Time=6.785 seconds
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=True]; AIC=2885.420, BIC=2891.441, Time=0.050 seconds
Fit ARIMA(1,1,0)x(1,1,0,12) [intercept=True]; AIC=2781.927, BIC=2793.970, Time=2.502 seconds
Fit ARIMA(0,1,1)x(0,1,1,12) [intercept=True]; AIC=2860.123, BIC=2872.165, Time=0.730 seconds
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=False]; AIC=2933.356, BIC=2936.367, Time=0.069 seconds
Fit ARIMA(1,1,1)x(0,1,0,12) [intercept=True]; AIC=2752.172, BIC=2764.214, Time=0.617 seconds
Fit ARIMA(1,1,1)x(1,1,1,12) [intercept=True]; AIC=2714.017, BIC=2732.081, Time=5.930 seconds
Fit ARIMA(1,1,1)x(0,1,2,12) [intercept=True]; AIC=2703.799, BIC=2721.863, Time=12.020 seconds
Fit ARIMA(1,1,1)x(1,1,2,12) [intercept=True]; AIC=2712.125, BIC=2733.199, Time=12.551 seconds
Fit ARIMA(0,1,1)x(0,1,2,12) [intercept=True]; AIC=2849.042, BIC=2864.095, Time=2.547 seconds
Fit ARIMA(1,1,0)x(0,1,2,

In [256]:
# Ordered split of `data`: the trailing 20% of rows become the test set.
train,test=split_dataframe()
# Refit the selected model on `train` and forecast len(test) steps ahead.
fig,future_forecast=forecast()
fig

Test shape: (33, 1)
Train shape: (130, 1)
Min date from train set: 2020-02-16
Max date from train set:2020-06-24
Min date from test set: 2020-06-25
Max date from test set: 2020-07-27
[283469.52869386 296399.74316934 308938.82599519 321887.93158171
 335550.57454695 349445.73944427 363673.28181384 377994.74004657
 393304.83909492 410005.63763456 428273.47983166 445117.85262168
 462163.27693125 479382.69036746 497677.90057465 514884.49106225
 532854.00603183 552401.90094222 571618.1132025  592963.96154914
 612956.38658745 633712.14675231 657294.8009396  680253.96555792
 703014.02072351 726316.15161047 750598.6933434  773949.8056342
 798107.57671459 823911.46953297 849479.61514045 877216.26730312
 903706.75312029]


In [257]:
# Held-out rows, displayed for comparison with the printed forecast.
test

Unnamed: 0_level_0,Total
Date,Unnamed: 1_level_1
2020-06-25,285637
2020-06-26,295881
2020-06-27,309713
2020-06-28,321723
2020-06-29,334822
2020-06-30,347912
2020-07-01,359860
2020-07-02,379892
2020-07-03,394227
2020-07-04,409083


In [258]:
# Forecast error over the held-out rows, expressed as a percentage.
mean_absolute_percentage_error(test,future_forecast)

0.9467500755859032

In [259]:
# Put the observed totals next to the predictions for side-by-side reading.
# This adds a column to `future_forecast` in place, so the MAPE cell must be
# run before this one (it expects a single prediction column).
future_forecast['Actual']=test.Total
future_forecast

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-06-25,283469.528694,285637
2020-06-26,296399.743169,295881
2020-06-27,308938.825995,309713
2020-06-28,321887.931582,321723
2020-06-29,335550.574547,334822
2020-06-30,349445.739444,347912
2020-07-01,363673.281814,359860
2020-07-02,377994.740047,379892
2020-07-03,393304.839095,394227
2020-07-04,410005.637635,409083


## Deaths

In [260]:
# Working series for this section: India's death totals with zero rows dropped.
# Rebinding the global `data` switches every helper over to this series.
data=create_data_frame('Death')
plot_data_frame()

In [261]:
# Stepwise order search over `data`; each candidate fit is traced in the output.
stepwise_model=find_params()
# AIC of the model the search returned.
print(stepwise_model.aic())

Performing stepwise search to minimize aic
Fit ARIMA(1,1,1)x(0,1,1,12) [intercept=True]; AIC=1664.913, BIC=1679.095, Time=5.149 seconds
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=True]; AIC=1726.815, BIC=1732.488, Time=0.046 seconds
Fit ARIMA(1,1,0)x(1,1,0,12) [intercept=True]; AIC=1703.102, BIC=1714.447, Time=2.734 seconds
Fit ARIMA(0,1,1)x(0,1,1,12) [intercept=True]; AIC=1671.931, BIC=1683.276, Time=1.969 seconds
Near non-invertible roots for order (0, 1, 1)(0, 1, 1, 12); setting score to inf (at least one inverse root too close to the border of the unit circle: 0.999)
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=False]; AIC=1735.593, BIC=1738.430, Time=0.078 seconds
Fit ARIMA(1,1,1)x(0,1,0,12) [intercept=True]; AIC=1729.323, BIC=1740.669, Time=0.713 seconds
Fit ARIMA(1,1,1)x(1,1,1,12) [intercept=True]; AIC=1670.175, BIC=1687.192, Time=4.485 seconds
Near non-invertible roots for order (1, 1, 1)(1, 1, 1, 12); setting score to inf (at least one inverse root too close to the border of the unit ci

In [262]:
# Ordered split of `data`: the trailing 20% of rows become the test set.
train,test=split_dataframe()
# Refit the selected model on `train` and forecast len(test) steps ahead.
fig,future_forecast=forecast()
fig

Test shape: (28, 1)
Train shape: (111, 1)
Min date from train set: 2020-03-11
Max date from train set:2020-06-29
Min date from test set: 2020-06-30
Max date from test set: 2020-07-27
[17285.2460286  17680.15184695 18065.69601256 18484.65307598
 18903.62220413 19344.29034846 19778.33098235 20193.43359197
 20610.36952364 21048.578446   21729.7004983  22172.2106567
 22610.41659111 23051.6905195  23483.98441515 23950.04797565
 24416.45713357 24904.87711908 25386.96109892 25850.37957505
 26315.88614594 26802.90388822 27533.05742965 28024.80724511
 28512.44744761 29003.33758111 29485.41777027 30001.42663572]


In [263]:
# Forecast error over the held-out rows, expressed as a percentage.
mean_absolute_percentage_error(test,future_forecast)

4.347325048452029

In [264]:
# Put the observed totals next to the predictions for side-by-side reading.
# This adds a column to `future_forecast` in place, so the MAPE cell must be
# run before this one (it expects a single prediction column).
future_forecast['Actual']=test.Total
future_forecast

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-06-30,17285.246029,17400
2020-07-01,17680.151847,17834
2020-07-02,18065.696013,18213
2020-07-03,18484.653076,18655
2020-07-04,18903.622204,19268
2020-07-05,19344.290348,19693
2020-07-06,19778.330982,20159
2020-07-07,20193.433592,20642
2020-07-08,20610.369524,21129
2020-07-09,21048.578446,21604


## Confirmed

In [265]:
# Working series for this section: India's confirmed totals with zero rows dropped.
# Rebinding the global `data` switches every helper over to this series.
data=create_data_frame('Confirmed')
plot_data_frame()

In [266]:
# Stepwise order search over `data`; each candidate fit is traced in the output.
stepwise_model=find_params()
# AIC of the model the search returned.
print(stepwise_model.aic())

Performing stepwise search to minimize aic
Fit ARIMA(1,1,1)x(0,1,1,12) [intercept=True]; AIC=2845.149, BIC=2860.739, Time=5.636 seconds
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=True]; AIC=3287.756, BIC=3293.992, Time=0.057 seconds
Fit ARIMA(1,1,0)x(1,1,0,12) [intercept=True]; AIC=2874.129, BIC=2886.601, Time=3.304 seconds
Fit ARIMA(0,1,1)x(0,1,1,12) [intercept=True]; AIC=3081.435, BIC=3093.907, Time=2.320 seconds
Near non-invertible roots for order (0, 1, 1)(0, 1, 1, 12); setting score to inf (at least one inverse root too close to the border of the unit circle: 1.000)
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=False]; AIC=3349.184, BIC=3352.302, Time=0.079 seconds
Fit ARIMA(1,1,1)x(0,1,0,12) [intercept=True]; AIC=2899.786, BIC=2912.258, Time=0.308 seconds
Fit ARIMA(1,1,1)x(1,1,1,12) [intercept=True]; AIC=2840.445, BIC=2859.153, Time=5.662 seconds
Near non-invertible roots for order (1, 1, 1)(1, 1, 1, 12); setting score to inf (at least one inverse root too close to the border of the unit ci

In [267]:
# Ordered split of `data`: the trailing 20% of rows become the test set.
train,test=split_dataframe()
# Refit the selected model on `train` and forecast len(test) steps ahead.
fig,future_forecast=forecast()
fig

Test shape: (36, 1)
Train shape: (144, 1)
Min date from train set: 2020-01-30
Max date from train set:2020-06-21
Min date from test set: 2020-06-22
Max date from test set: 2020-07-27
[ 440532.77418811  455951.10615583  472057.70975625  487678.18998551
  503817.76384097  520624.16633321  537431.70792966  554393.24434399
  571662.72680558  588997.95485178  606071.40800495  624066.39719172
  642410.88559332  661005.56776417  680273.67410731  699283.98964028
  718646.36136557  738247.62517267  758010.40688332  778425.81167444
  799347.79190133  820663.9371005   841846.04502098  863606.85550542
  885784.67648816  908244.16318229  931391.50229279  954295.4788346
  977565.93980763 1001089.72109878 1024789.44852719 1049156.22726723
 1074044.00967157 1099340.38527414 1124517.1518214  1150287.04915339]


In [268]:
# Forecast error over the held-out rows, expressed as a percentage.
mean_absolute_percentage_error(test,future_forecast)

8.708014665374005

In [269]:
# Put the observed totals next to the predictions for side-by-side reading.
# This adds a column to `future_forecast` in place, so the MAPE cell must be
# run before this one (it expects a single prediction column).
future_forecast['Actual']=test.Total
future_forecast

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-06-22,440532.8,440215
2020-06-23,455951.1,456183
2020-06-24,472057.7,473105
2020-06-25,487678.2,490401
2020-06-26,503817.8,508953
2020-06-27,520624.2,528859
2020-06-28,537431.7,548318
2020-06-29,554393.2,566840
2020-06-30,571662.7,585481
2020-07-01,588998.0,604641
