# Predictive Analysis With Auto ARIMA 

## Data Collection and Preprocessing

In [311]:
# Imports
import sys
import ssl
# NOTE(review): this swaps the default HTTPS context factory for the unverified
# one, so certificate verification is turned off for every HTTPS request that
# relies on the default context in this kernel. Presumably a workaround for the
# data download performed by main.collect_data() - confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context
import os
from datetime import datetime
from datetime import date
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
from sklearn.model_selection import train_test_split
from statsmodels.tsa.statespace.sarimax import SARIMAX 
# To import the main.py file
# (makes the parent directory importable so that the python_files package resolves)
sys.path.append('../')
from python_files import main
import warnings
# Suppresses every warning for the rest of the session, library deprecation
# warnings included.
warnings.filterwarnings("ignore")

# Getting all the data
confirmed_global, deaths_global, recovered_global, country_cases = main.collect_data()

# Sum the rows of each table per country, then transpose so that every
# column is a country and every row label is one of the original columns.
rec = recovered_global.groupby("country").sum().T
death = deaths_global.groupby("country").sum().T
conf = confirmed_global.groupby("country").sum().T

# Turn the row labels into a DatetimeIndex. `infer_datetime_format` is not
# passed any more: it only affected parsing speed and is deprecated in
# pandas 2.x, where format inference is the default behaviour.
death.index = pd.to_datetime(death.index)
rec.index = pd.to_datetime(rec.index)
conf.index = pd.to_datetime(conf.index)

In [312]:
# Setting up plotly to work offline and in jupyter notebooks
pyo.init_notebook_mode(connected = True)
# NOTE(review): none of the visible cells import or call matplotlib (every chart
# is built with plotly express), so this magic looks unnecessary - confirm
# before removing.
%matplotlib inline

In [337]:
def create_data_frame(Type, country="India"):
    """Build the daily cumulative series of one country as a one-column frame.

    Parameters
    ----------
    Type : str
        Which notebook-level table to read: 'Death' (``death``),
        'Recovery' (``rec``) or 'Confirmed' (``conf``).
    country : str, optional
        Column of the selected table to extract. Defaults to "India".

    Returns
    -------
    pandas.DataFrame
        A frame with a single "Total" column whose index is named "Date".
        Rows whose total equals zero are removed.

    Raises
    ------
    ValueError
        If ``Type`` is not one of the three supported names.
    """
    if Type == 'Death':
        source = death
    elif Type == 'Recovery':
        source = rec
    elif Type == 'Confirmed':
        source = conf
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError that did not mention the actual problem.
        raise ValueError(
            f"Unknown Type {Type!r}; expected 'Death', 'Recovery' or 'Confirmed'"
        )

    # Creating a dataframe with Total number of cases everyday in a column
    frame = pd.DataFrame(index=source.index, data=source[country].values, columns=["Total"])
    # Name the index "Date"; rename_axis returns a new frame, so the index
    # object of the source table is left untouched.
    frame = frame.rename_axis('Date')
    # Making sure no zeroes exist in dataframe
    frame = frame[(frame != 0).all(axis=1)]
    return frame


In [338]:
# Plotting the Data
def plot_data_frame(frame=None):
    """Return a dark-themed plotly express line chart of a frame.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Data to plot. When omitted, the notebook-level ``data`` variable is
        used, which keeps existing zero-argument calls working.

    Returns
    -------
    The figure object produced by ``px.line``.
    """
    if frame is None:
        frame = data
    return px.line(frame, template='plotly_dark')

In [339]:
def find_params(series=None, m=7):
    """Find the best p, d, q parameters for the model with a stepwise search.

    Runs ``auto_arima`` with seasonal terms enabled. The non-seasonal
    differencing order is left for the search to choose (``d=None``) while
    the seasonal differencing order is fixed at ``D=1``; p and q are searched
    from 1 up to 3.

    Parameters
    ----------
    series : array-like, optional
        Training data. When omitted, the notebook-level ``train`` variable is
        used, which keeps existing zero-argument calls working.
    m : int, optional
        Seasonal period passed to ``auto_arima``. Defaults to 7.

    Returns
    -------
    The object returned by ``auto_arima``.
    """
    if series is None:
        series = train
    return auto_arima(
        series,
        start_p=1, start_q=1,
        max_p=3, max_q=3,
        m=m,
        start_P=0,
        seasonal=True,
        d=None, D=1,
        trace=True,                 # print every candidate that is tried
        error_action='ignore',      # skip candidates that fail to fit
        suppress_warnings=True,
        stepwise=True,
    )

In [340]:
def split_dataframe(frame=None, test_size=14):
    """Split a time-indexed frame chronologically into train and test parts.

    The last ``test_size`` rows become the test set and everything before
    them the training set; rows are never shuffled. A short summary of both
    parts is printed.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to split. When omitted, the notebook-level ``data`` variable is
        used, which keeps existing zero-argument calls working.
    test_size : int, optional
        Number of trailing rows held out for testing. Defaults to 14.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``, both independent copies.

    Raises
    ------
    ValueError
        If ``test_size`` does not leave at least one row in each part.
    """
    if frame is None:
        frame = data
    n_rows = len(frame)
    if not 0 < test_size < n_rows:
        raise ValueError(
            f"test_size={test_size} must leave at least one row in both parts "
            f"of a frame with {n_rows} rows"
        )

    # A positional slice is all an unshuffled split needs; copies keep later
    # column assignments on either part from touching the source frame.
    cut = n_rows - test_size
    train = frame.iloc[:cut].copy()
    test = frame.iloc[cut:].copy()

    print('Test shape:',test.shape)
    print('Train shape:',train.shape)
    print(f'Min date from train set: {train.index.min().date()}')
    print(f'Max date from train set:{train.index.max().date()}')
    print(f'Min date from test set: {test.index.min().date()}')
    print(f'Max date from test set: {test.index.max().date()}')
    return train, test

In [341]:
def Predict(model=None, train_data=None, test_data=None, full_data=None):
    """Fit the model on the training split and forecast the test window.

    Every argument is optional and falls back to the notebook-level variable
    of the corresponding role (``stepwise_model``, ``train``, ``test``,
    ``data``), so existing zero-argument calls keep working.

    Parameters
    ----------
    model : optional
        Object exposing ``fit(y)`` and ``predict(n_periods=...)``.
    train_data : pandas.DataFrame, optional
        Data the model is fitted on.
    test_data : pandas.DataFrame, optional
        Hold-out data; its length sets the forecast horizon and its index
        labels the predictions.
    full_data : pandas.DataFrame, optional
        Complete series drawn alongside the predictions.

    Returns
    -------
    tuple
        ``(figure, pred)`` where ``figure`` is the plotly express line chart
        and ``pred`` is a frame with one "Prediction" column indexed like
        ``test_data``.
    """
    if model is None:
        model = stepwise_model
    if train_data is None:
        train_data = train
    if test_data is None:
        test_data = test
    if full_data is None:
        full_data = data

    # Fitting the model with train dataset
    model.fit(train_data)
    # Forecasting
    raw = model.predict(n_periods=len(test_data))
    # Reduce the result to a flat array before labelling it. If the model
    # hands back an indexed Series (presumably what newer pmdarima releases
    # do for pandas input - confirm), building the frame straight from it
    # would align on that index instead of using the values positionally.
    values = np.asarray(raw).ravel()
    pred = pd.DataFrame(values, index=test_data.index, columns=['Prediction'])
    # Plotting the predicted values and actual values
    figure = px.line(pd.concat([full_data, pred], axis=1), template='plotly_dark')
    return figure, pred

In [342]:
def Future(order, seasonal_order, periods=14, frame=None):
    """Refit SARIMAX on the whole series and plot an out-of-sample forecast.

    Parameters
    ----------
    order : tuple
        Non-seasonal order passed to ``SARIMAX``.
    seasonal_order : tuple
        Seasonal order passed to ``SARIMAX``.
    periods : int, optional
        Number of steps to forecast past the end of the series. Defaults to 14.
    frame : pandas.DataFrame, optional
        Frame with a "Total" column to train on. When omitted, the
        notebook-level ``data`` variable is used, which keeps existing
        two-argument calls working.

    Returns
    -------
    The plotly express line chart of the series together with the forecast,
    which appears under the name "Forecast".

    Raises
    ------
    ValueError
        If ``periods`` is smaller than 1.
    """
    if periods < 1:
        raise ValueError(f"periods must be at least 1, got {periods}")
    if frame is None:
        frame = data

    # Train the model on the full dataset
    model = SARIMAX(frame['Total'],
                    order=order,
                    seasonal_order=seasonal_order)
    result = model.fit()

    # Positions len(frame) .. len(frame) + periods - 1 are the first
    # `periods` steps after the last observation.
    first_step = len(frame)
    last_step = first_step + periods - 1
    forecast = result.predict(start=first_step, end=last_step).rename('Forecast')
    return px.line(pd.concat([frame, forecast], axis=1), template='plotly_dark')



In [343]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are flattened before they are compared, so a single-column
    frame, a column vector and a flat sequence are all treated as the same
    list of values. Without this, pairing an (n, 1) input with an (n,) input
    would broadcast to an (n, n) matrix and silently yield a wrong score.

    Parameters
    ----------
    y_true : array-like
        Observed values. A zero here produces inf or nan in the result, as
        the division is left to NumPy.
    y_pred : array-like
        Predicted values; must hold as many values as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true has {y_true.size} values but y_pred has {y_pred.size}"
        )
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## Recovery

In [344]:
# Recovered-case series. `data` is the notebook-level frame that the helper
# functions read when they are called without arguments, so every later
# cell of this section works on this series.
data=create_data_frame('Recovery')
plot_data_frame()

In [345]:
# Hold out the last 14 rows as the test set, then run the stepwise order
# search on the remaining rows.
train,test=split_dataframe()
stepwise_model=find_params()
# Optional diagnostics for the selected model, left disabled:
# print(stepwise_model.aic())
# print(stepwise_model.summary())

Test shape: (14, 1)
Train shape: (158, 1)
Min date from train set: 2020-02-16
Max date from train set:2020-07-22
Min date from test set: 2020-07-23
Max date from test set: 2020-08-05
Performing stepwise search to minimize aic
Fit ARIMA(1,2,1)x(0,1,1,7) [intercept=True]; AIC=2618.887, BIC=2633.907, Time=3.006 seconds
Fit ARIMA(0,2,0)x(0,1,0,7) [intercept=True]; AIC=2760.660, BIC=2766.668, Time=0.034 seconds
Fit ARIMA(1,2,0)x(1,1,0,7) [intercept=True]; AIC=2663.810, BIC=2675.826, Time=1.117 seconds
Fit ARIMA(0,2,1)x(0,1,1,7) [intercept=True]; AIC=2620.006, BIC=2632.022, Time=1.961 seconds
Fit ARIMA(0,2,0)x(0,1,0,7) [intercept=False]; AIC=2758.743, BIC=2761.747, Time=0.053 seconds
Fit ARIMA(1,2,1)x(0,1,0,7) [intercept=True]; AIC=2664.834, BIC=2676.850, Time=0.407 seconds
Fit ARIMA(1,2,1)x(1,1,1,7) [intercept=True]; AIC=2622.495, BIC=2640.519, Time=3.267 seconds
Fit ARIMA(1,2,1)x(0,1,2,7) [intercept=True]; AIC=2620.616, BIC=2638.640, Time=4.184 seconds
Fit ARIMA(1,2,1)x(1,1,0,7) [intercept

In [346]:
# Keep the non-seasonal and seasonal orders of the selected model; they are
# handed to Future() further down to refit on the full series.
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

In [347]:
# Fit the selected model on the training split, forecast the hold-out window
# and plot the predictions against the full series.
fig,pred=Predict()
fig

In [348]:
# Error of the hold-out forecast for recovered cases, in percent.
mean_absolute_percentage_error(test,pred)

3.906186473351308

In [349]:
# Put the observed hold-out values next to the predictions for inspection.
# NOTE: this adds a column to `pred` in place, so the error cell above should
# not be re-run afterwards without first regenerating `pred`.
pred['Actual']=test.Total
pred

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-23,811352.8,817209
2020-07-24,838389.3,849432
2020-07-25,867525.3,885573
2020-07-26,895961.7,917568
2020-07-27,925208.6,951166
2020-07-28,956218.7,988029
2020-07-29,987927.7,1019735
2020-07-30,1021467.0,1055348
2020-07-31,1052819.0,1094374
2020-08-01,1086401.0,1145629


In [350]:
# Refit on the full recovered-case series with the selected orders and plot
# the forecast beyond the last observed date.
Future(order, seasonal_order)

## Deaths

In [351]:
# Death series. Rebinding `data` switches every helper that reads it to this
# series for the rest of the section.
data=create_data_frame('Death')
plot_data_frame()

In [352]:
# Hold out the last 14 rows, run the stepwise order search on the rest and
# keep the selected orders for the Future() call below.
train,test=split_dataframe()
stepwise_model=find_params()
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

Test shape: (14, 1)
Train shape: (134, 1)
Min date from train set: 2020-03-11
Max date from train set:2020-07-22
Min date from test set: 2020-07-23
Max date from test set: 2020-08-05
Performing stepwise search to minimize aic
Fit ARIMA(1,1,1)x(0,1,1,7) [intercept=True]; AIC=1665.509, BIC=1679.691, Time=3.320 seconds
Fit ARIMA(0,1,0)x(0,1,0,7) [intercept=True]; AIC=1716.132, BIC=1721.804, Time=0.034 seconds
Fit ARIMA(1,1,0)x(1,1,0,7) [intercept=True]; AIC=1691.900, BIC=1703.245, Time=0.839 seconds
Fit ARIMA(0,1,1)x(0,1,1,7) [intercept=True]; AIC=1669.774, BIC=1681.119, Time=0.568 seconds
Near non-invertible roots for order (0, 1, 1)(0, 1, 1, 7); setting score to inf (at least one inverse root too close to the border of the unit circle: 0.999)
Fit ARIMA(0,1,0)x(0,1,0,7) [intercept=False]; AIC=1718.224, BIC=1721.061, Time=0.029 seconds
Fit ARIMA(1,1,1)x(0,1,0,7) [intercept=True]; AIC=1719.761, BIC=1731.106, Time=0.328 seconds
Fit ARIMA(1,1,1)x(1,1,1,7) [intercept=True]; AIC=1671.962, BIC=

In [353]:
# Fit on the training split, forecast the hold-out window and plot the
# predictions against the full death series.
fig,pred=Predict()
fig

In [354]:
# Error of the hold-out forecast for deaths, in percent.
mean_absolute_percentage_error(test,pred)

2.019266704667233

In [355]:
# Put the observed hold-out values next to the predictions for inspection.
# NOTE: this adds a column to `pred` in place, so the error cell above should
# not be re-run afterwards without first regenerating `pred`.
pred['Actual']=test.Total
pred

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-23,30504.145837,30601
2020-07-24,31146.168479,31358
2020-07-25,31779.682624,32060
2020-07-26,32408.138392,32771
2020-07-27,33030.106202,33408
2020-07-28,33754.49783,34193
2020-07-29,34411.833842,34955
2020-07-30,35048.385256,35718
2020-07-31,35688.459746,36511
2020-08-01,36324.166066,37364


In [356]:
# Refit on the full death series with the selected orders and plot the
# forecast beyond the last observed date.
Future(order, seasonal_order)

## Confirmed

In [357]:
# Confirmed-case series. Rebinding `data` switches every helper that reads it
# to this series for the rest of the section.
data=create_data_frame('Confirmed')
plot_data_frame()

In [358]:
# Hold out the last 14 rows, run the stepwise order search on the rest and
# keep the non-seasonal and seasonal orders of the selected model.
train,test=split_dataframe()
stepwise_model=find_params()
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

Test shape: (14, 1)
Train shape: (175, 1)
Min date from train set: 2020-01-30
Max date from train set:2020-07-22
Min date from test set: 2020-07-23
Max date from test set: 2020-08-05
Performing stepwise search to minimize aic
Fit ARIMA(1,2,1)x(0,1,1,7) [intercept=True]; AIC=2670.135, BIC=2685.695, Time=2.527 seconds
Fit ARIMA(0,2,0)x(0,1,0,7) [intercept=True]; AIC=2699.289, BIC=2705.513, Time=0.037 seconds
Fit ARIMA(1,2,0)x(1,1,0,7) [intercept=True]; AIC=2677.218, BIC=2689.666, Time=1.667 seconds
Fit ARIMA(0,2,1)x(0,1,1,7) [intercept=True]; AIC=2668.178, BIC=2680.626, Time=1.501 seconds
Fit ARIMA(0,2,0)x(0,1,0,7) [intercept=False]; AIC=2698.837, BIC=2701.949, Time=0.038 seconds
Fit ARIMA(0,2,1)x(0,1,0,7) [intercept=True]; AIC=2678.319, BIC=2687.655, Time=0.276 seconds
Fit ARIMA(0,2,1)x(1,1,1,7) [intercept=True]; AIC=2669.929, BIC=2685.489, Time=1.074 seconds
Fit ARIMA(0,2,1)x(0,1,2,7) [intercept=True]; AIC=2669.429, BIC=2684.989, Time=3.085 seconds
Fit ARIMA(0,2,1)x(1,1,0,7) [intercept

In [359]:
# Fit on the training split, forecast the hold-out window and plot the
# predictions against the full confirmed-case series.
fig,pred=Predict()
fig

In [360]:
# Error of the hold-out forecast for confirmed cases, in percent.
mean_absolute_percentage_error(test,pred)

0.5149547820637668