# Predictive Analysis With Auto ARIMA

## Data Collection and Preprocessing

In [718]:
# Imports
import sys
import ssl
# SECURITY NOTE(review): this replaces the ssl module's default HTTPS context
# factory with `_create_unverified_context` for the whole session. Going by the
# name, HTTPS certificate checks are skipped from here on — presumably a
# workaround for certificate errors while downloading the data; confirm it is
# still required before sharing or deploying this notebook.
ssl._create_default_https_context = ssl._create_unverified_context
import os
from datetime import datetime
from datetime import date
import pandas as pd
import numpy as np
import plotly
import math
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
from sklearn.model_selection import train_test_split
from statsmodels.tsa.statespace.sarimax import SARIMAX 
# To import the main.py file
# (the parent directory is added to the module search path so that the
# `python_files` package resolves; this assumes the kernel's working directory
# is the notebook's own folder — TODO confirm)
sys.path.append('../')
from python_files import main
import warnings
# NOTE(review): silences every warning for the rest of the session, including
# any convergence or deprecation warnings raised by the modelling calls below.
warnings.filterwarnings("ignore")

# Fetch the raw tables through the project's data-collection helper.
confirmed_global, deaths_global, recovered_global, country_cases = main.collect_data()

# Sum every table per country and transpose it, so that each country becomes a
# column and the original column labels become the row index.
rec, death, conf = (
    table.groupby("country").sum().T
    for table in (recovered_global, deaths_global, confirmed_global)
)

# Turn the row labels (expected to be date strings) into real timestamps.
for _table in (death, rec, conf):
    _table.index = pd.to_datetime(_table.index, infer_datetime_format=True)
del _table

In [719]:
# Setting up plotly to work offline and in jupyter notebooks
# NOTE(review): per plotly's documentation `connected=True` loads plotly.js
# from a CDN instead of embedding it in the notebook — confirm that this matches
# the "offline" intent stated above.
pyo.init_notebook_mode(connected = True)
# NOTE(review): matplotlib is not imported by the import cell and every figure
# in the visible cells is produced with plotly, so this magic looks unnecessary.
%matplotlib inline

In [720]:
def create_data_frame(Type, country="US"):
    """Build the single-column cumulative series for one case type.

    Parameters
    ----------
    Type : str
        One of 'Death', 'Recovery' or 'Confirmed'. Selects which module-level
        table (`death`, `rec` or `conf`) the series is read from.
    country : str, default "US"
        Column of the selected table to extract.

    Returns
    -------
    pandas.DataFrame
        One column named 'Total', indexed by the selected table's index
        (renamed to 'Date'), with every row whose total equals zero removed.

    Raises
    ------
    ValueError
        If `Type` is not one of the three supported names.
    """
    if Type == 'Death':
        source = death
    elif Type == 'Recovery':
        source = rec
    elif Type == 'Confirmed':
        source = conf
    else:
        # An unknown name used to fall through and fail further down with an
        # UnboundLocalError on `data`; fail early with a clear message instead.
        raise ValueError(
            f"Unknown Type {Type!r}; expected 'Death', 'Recovery' or 'Confirmed'"
        )

    # Creating a dataframe with Total number of cases everyday in a column
    data = pd.DataFrame(index=source.index, data=source[country].values, columns=["Total"])
    # Name the index 'Date' on a new frame; the source table's index is left untouched.
    data = data.rename_axis('Date')
    # Making sure no Zeroes exist in dataframe
    data = data[(data != 0).all(axis=1)]
    return data


In [721]:
# Plotting the Data
def plot_data_frame(frame=None):
    """Return a dark-themed plotly line chart of a series frame.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to plot. When omitted, the module-level `data` frame is used,
        which is what the function always did before this argument existed.

    Returns
    -------
    The figure object produced by `px.line`.
    """
    if frame is None:
        frame = data
    return px.line(frame, template = 'plotly_dark')

In [722]:
def find_params(series=None):
    """Find the best p, d, q parameters for the model with `auto_arima`.

    Parameters
    ----------
    series : array-like or pandas.DataFrame, optional
        Training observations to search on. When omitted, the module-level
        `train` frame is used (the original behaviour).

    Returns
    -------
    The model object returned by `auto_arima`. The search is stepwise, tries
    p and q between 0 and 2, leaves d to be chosen by `auto_arima` (d=None),
    and prints every candidate it fits (trace=True).
    """
    if series is None:
        series = train
    # NOTE(review): `seasonal=False` is passed together with seasonal settings
    # (m, start_P/max_P, start_Q/max_Q, D). The recorded traces list every
    # candidate with a (0,0,0,0) seasonal part, so those settings appear to be
    # inert — confirm whether a weekly seasonal search was intended.
    # NOTE(review): pmdarima documents `n_jobs` for the non-stepwise grid
    # search; with stepwise=True it is presumably ignored.
    return auto_arima(series, method='nm', start_p=0, start_q=0,
                      max_p=2, max_q=2, m=7,
                      start_P=0, max_P=0, start_Q=1, max_Q=1, seasonal=False,
                      d=None, D=1, n_jobs=-1, trace=True,
                      error_action='ignore',
                      suppress_warnings=True,
                      stepwise=True)

In [723]:
def split_dataframe(frame=None, test_size=14):
    """Split a series chronologically and print a short summary of both parts.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with a datetime index to split. When omitted, the module-level
        `data` frame is used (the original behaviour).
    test_size : int, default 14
        Passed to `train_test_split` as `test_size`; the recorded runs hold
        out 14 rows.

    Returns
    -------
    tuple
        `(train, test)` as returned by `train_test_split`.
    """
    if frame is None:
        frame = data
    # shuffle=False keeps the rows in their original order; in the recorded
    # runs the test part is the final stretch of dates.
    train, test = train_test_split(frame, test_size=test_size, shuffle=False)
    print('Test shape:',test.shape)
    print('Train shape:',train.shape)
    print(f'Min date from train set: {train.index.min().date()}')
    print(f'Max date from train set:{train.index.max().date()}')
    print(f'Min date from test set: {test.index.min().date()}')
    print(f'Max date from test set: {test.index.max().date()}')
    return train, test

In [724]:
def Predict(model=None, train_set=None, test_set=None, full_set=None):
    """Refit the model on the training part and forecast the hold-out window.

    Parameters
    ----------
    model : optional
        Model exposing `fit` and `predict(n_periods=...)`. Defaults to the
        module-level `stepwise_model`.
    train_set, test_set, full_set : pandas.DataFrame, optional
        Training part, hold-out part and complete series. They default to the
        module-level `train`, `test` and `data` respectively.

    Returns
    -------
    tuple
        `(figure, pred)`: a dark-themed line chart of the complete series next
        to the forecast, and a one-column frame named 'Prediction' indexed
        like the hold-out part.
    """
    model = stepwise_model if model is None else model
    train_set = train if train_set is None else train_set
    test_set = test if test_set is None else test_set
    full_set = data if full_set is None else full_set

    # Fitting the model with train dataset
    model.fit(train_set)
    # Forecasting one value per hold-out row
    forecast = model.predict(n_periods=len(test_set))
    # Depending on the pmdarima version the forecast may come back as a plain
    # array or as a pandas Series carrying its own index (TODO confirm for the
    # installed version). Converting to an array makes the values line up with
    # the hold-out index by position rather than by label, which would
    # otherwise risk an all-NaN column.
    pred = pd.DataFrame(np.asarray(forecast), index=test_set.index, columns=['Prediction'])
    # Plotting the predicted values and actual values
    return px.line(pd.concat([full_set, pred], axis=1), template='plotly_dark'), pred

In [725]:
def Future(order, seasonal_order, frame=None, horizon=14):
    """Fit SARIMAX on the complete series and plot a forecast beyond its end.

    Parameters
    ----------
    order : tuple
        Passed to SARIMAX as `order`.
    seasonal_order : tuple
        Passed to SARIMAX as `seasonal_order`.
    frame : pandas.DataFrame, optional
        Frame holding a 'Total' column. When omitted, the module-level `data`
        frame is used (the original behaviour).
    horizon : int, default 14
        Number of positions requested after the last observation.

    Returns
    -------
    A dark-themed plotly line chart of the observed series together with the
    forecast, which is labelled 'Forecast'.
    """
    if frame is None:
        frame = data
    # Train the model on the full dataset
    model = SARIMAX(frame['Total'],
                    order=order,
                    seasonal_order=seasonal_order)
    result = model.fit()

    # Positions n_obs .. n_obs + horizon - 1 are requested, i.e. the ones
    # directly after the last observed row.
    n_obs = len(frame)
    forecast = result.predict(start=n_obs,
                              end=n_obs + horizon - 1).rename('Forecast')
    return px.line(pd.concat([frame, forecast], axis=1), template='plotly_dark')



In [726]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. Both are flattened to one dimension, so a
        single-column DataFrame, a Series and a plain list can be mixed freely.

    Returns
    -------
    float
        mean(|y_true - y_pred| / |y_true|) * 100. Zeros in `y_true` are not
        treated specially, so they lead to a division by zero.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of values.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    # Without the flattening and this check, a 1-D input combined with a column
    # vector would broadcast to an n-by-n matrix and yield a wrong score
    # instead of an error.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must hold the same number of values, "
            f"got {y_true.size} and {y_pred.size}"
        )
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [727]:
def pred_to_int(pred):
    """Round the 'Prediction' column to whole numbers, with halves going up.

    A value whose fractional part (`value % 1`) is at least 0.5 is rounded up
    with `math.ceil`; any other value is rounded down with `math.floor`.
    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    rounded = [
        math.ceil(value) if value % 1 >= 0.5 else math.floor(value)
        for value in pred['Prediction']
    ]
    pred['Prediction'] = rounded
    return pred


## Recovery

In [728]:
# Make the US recovery series the working `data` frame; the helper functions
# defined above read it as a global.
data=create_data_frame('Recovery')
# Last expression of the cell, so the returned figure is rendered as its output.
plot_data_frame()

In [729]:
# Mark the index as daily. NOTE(review): this sets the attribute on the
# existing index object in place; pandas is expected to reject it if the dates
# are not consecutive days — confirm.
data.index.freq='D'
# Hold out the final rows for evaluation and keep both parts as globals.
train,test=split_dataframe()
# Search the ARIMA orders on the training part only.
stepwise_model=find_params()
# Optional diagnostics for the selected model (left disabled):
# print(stepwise_model.aic())
# print(stepwise_model.summary())

Test shape: (14, 1)
Train shape: (199, 1)
Min date from train set: 2020-02-09
Max date from train set:2020-08-25
Min date from test set: 2020-08-26
Max date from test set: 2020-09-08
Performing stepwise search to minimize aic
Fit ARIMA(0,2,0)x(0,0,0,0) [intercept=True]; AIC=4325.589, BIC=4332.155, Time=0.124 seconds
Fit ARIMA(1,2,0)x(0,0,0,0) [intercept=True]; AIC=4217.160, BIC=4227.010, Time=0.154 seconds
Fit ARIMA(0,2,1)x(0,0,0,0) [intercept=True]; AIC=4182.160, BIC=4192.010, Time=0.174 seconds
Fit ARIMA(0,2,0)x(0,0,0,0) [intercept=False]; AIC=4323.617, BIC=4326.900, Time=0.155 seconds
Fit ARIMA(1,2,1)x(0,0,0,0) [intercept=True]; AIC=4184.423, BIC=4197.556, Time=0.176 seconds
Fit ARIMA(0,2,2)x(0,0,0,0) [intercept=True]; AIC=4183.986, BIC=4197.119, Time=0.165 seconds
Fit ARIMA(1,2,2)x(0,0,0,0) [intercept=True]; AIC=4182.565, BIC=4198.981, Time=0.119 seconds
Total fit time: 1.086 seconds


In [730]:
# Keep the orders selected by the search; they are passed to Future() further
# down to refit the same specification on the complete series.
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

In [731]:
# Refit the selected model on the training part, forecast the hold-out window
# and keep both the figure and the forecast frame.
fig,pred=Predict()
fig

In [732]:
# Round the forecasts to whole counts (pred_to_int also overwrites the column
# on the frame it receives).
pred=pred_to_int(pred)

In [733]:
# Mean absolute percentage error (in percent) of the rounded forecast against
# the held-out actual values.
mean_absolute_percentage_error(test,pred)

1.6027964987853742

In [734]:
# Show forecast and actual values side by side. `assign` returns a new frame,
# so `pred` keeps its single 'Prediction' column and the cells that score it
# give the same result when they are re-run.
pred.assign(Actual=test.Total)

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-08-26,2079973,2084465
2020-08-27,2106397,2101326
2020-08-28,2132971,2118367
2020-08-29,2159696,2140614
2020-08-30,2186571,2153939
2020-08-31,2213596,2184825
2020-09-01,2240772,2202663
2020-09-02,2268098,2231757
2020-09-03,2295574,2266957
2020-09-04,2323201,2283454


In [735]:
# Refit the selected orders on the complete recovery series and plot the
# forecast for the days after the last observation.
Future(order, seasonal_order)

In [736]:
# Inspect the index of the working frame: covered dates, length and the daily
# frequency set earlier.
data.index


DatetimeIndex(['2020-02-09', '2020-02-10', '2020-02-11', '2020-02-12',
               '2020-02-13', '2020-02-14', '2020-02-15', '2020-02-16',
               '2020-02-17', '2020-02-18',
               ...
               '2020-08-30', '2020-08-31', '2020-09-01', '2020-09-02',
               '2020-09-03', '2020-09-04', '2020-09-05', '2020-09-06',
               '2020-09-07', '2020-09-08'],
              dtype='datetime64[ns]', name='Date', length=213, freq='D')

## Deaths

In [737]:
# Switch the working `data` frame to the US death series; the helper functions
# read it as a global.
data=create_data_frame('Death')
# Last expression of the cell, so the returned figure is rendered as its output.
plot_data_frame()

In [738]:
# Mark the index as daily (set in place on the existing index object).
data.index.freq='D'
# Hold out the final rows for evaluation and keep both parts as globals.
train,test=split_dataframe()
# Search the ARIMA orders on the training part only.
stepwise_model=find_params()
# Keep the selected orders for the Future() call further down.
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

Test shape: (14, 1)
Train shape: (179, 1)
Min date from train set: 2020-02-29
Max date from train set:2020-08-25
Min date from test set: 2020-08-26
Max date from test set: 2020-09-08
Performing stepwise search to minimize aic
Fit ARIMA(0,1,0)x(0,0,0,0) [intercept=True]; AIC=2830.777, BIC=2837.140, Time=0.105 seconds
Fit ARIMA(1,1,0)x(0,0,0,0) [intercept=True]; AIC=2541.777, BIC=2551.323, Time=0.138 seconds
Fit ARIMA(0,1,1)x(0,0,0,0) [intercept=True]; AIC=2667.920, BIC=2677.466, Time=0.177 seconds
Fit ARIMA(0,1,0)x(0,0,0,0) [intercept=False]; AIC=3034.497, BIC=3037.679, Time=0.059 seconds
Fit ARIMA(2,1,0)x(0,0,0,0) [intercept=True]; AIC=2539.398, BIC=2552.125, Time=0.127 seconds
Fit ARIMA(2,1,1)x(0,0,0,0) [intercept=True]; AIC=2922.537, BIC=2938.446, Time=0.115 seconds
Fit ARIMA(1,1,1)x(0,0,0,0) [intercept=True]; AIC=2534.896, BIC=2547.623, Time=0.108 seconds
Fit ARIMA(1,1,2)x(0,0,0,0) [intercept=True]; AIC=2524.463, BIC=2540.372, Time=0.245 seconds
Fit ARIMA(0,1,2)x(0,0,0,0) [intercept

In [739]:
# Refit the selected model on the training part, forecast the hold-out window
# and keep both the figure and the forecast frame.
fig,pred=Predict()
fig

In [740]:
# Round the forecasts to whole counts (pred_to_int also overwrites the column
# on the frame it receives).
pred=pred_to_int(pred)

In [741]:
# Mean absolute percentage error (in percent) of the rounded forecast against
# the held-out actual values.
mean_absolute_percentage_error(test,pred)

0.6660364598539591

In [742]:
# Show forecast and actual values side by side. `assign` returns a new frame,
# so `pred` keeps its single 'Prediction' column and the cells that score it
# give the same result when they are re-run.
pred.assign(Actual=test.Total)

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-08-26,179719,179674
2020-08-27,180767,180785
2020-08-28,181811,181756
2020-08-29,182852,182714
2020-08-30,183890,183024
2020-08-31,184924,183597
2020-09-01,185956,184664
2020-09-02,186984,185720
2020-09-03,188010,186790
2020-09-04,189033,187755


In [743]:
# Refit the selected orders on the complete death series and plot the forecast
# for the days after the last observation.
Future(order, seasonal_order)

## Confirmed

In [744]:
# Switch the working `data` frame to the US confirmed-case series; the helper
# functions read it as a global.
data=create_data_frame('Confirmed')
# Last expression of the cell, so the returned figure is rendered as its output.
plot_data_frame()

In [745]:
# Mark the index as daily (set in place on the existing index object).
data.index.freq='D'
# Hold out the final rows for evaluation and keep both parts as globals.
train,test=split_dataframe()
# Search the ARIMA orders on the training part only.
stepwise_model=find_params()
# Keep the selected orders for the Future() call further down.
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

Test shape: (14, 1)
Train shape: (217, 1)
Min date from train set: 2020-01-22
Max date from train set:2020-08-25
Min date from test set: 2020-08-26
Max date from test set: 2020-09-08
Performing stepwise search to minimize aic
Fit ARIMA(0,2,0)x(0,0,0,0) [intercept=True]; AIC=4184.286, BIC=4191.027, Time=0.113 seconds
Fit ARIMA(1,2,0)x(0,0,0,0) [intercept=True]; AIC=4185.671, BIC=4195.783, Time=0.138 seconds
Fit ARIMA(0,2,1)x(0,0,0,0) [intercept=True]; AIC=4185.645, BIC=4195.757, Time=0.163 seconds
Fit ARIMA(0,2,0)x(0,0,0,0) [intercept=False]; AIC=4182.702, BIC=4186.073, Time=0.089 seconds
Fit ARIMA(1,2,1)x(0,0,0,0) [intercept=True]; AIC=4187.668, BIC=4201.151, Time=0.207 seconds
Total fit time: 0.721 seconds


In [746]:
# Refit the selected model on the training part, forecast the hold-out window
# and keep both the figure and the forecast frame.
fig,pred=Predict()
fig

In [747]:
# Round the forecasts to whole counts (pred_to_int also overwrites the column
# on the frame it receives).
pred=pred_to_int(pred)

In [748]:
# Mean absolute percentage error (in percent) of the rounded forecast against
# the held-out actual values.
mean_absolute_percentage_error(test,pred)

0.44591830881805455

In [749]:
# Show forecast and actual values side by side. `assign` returns a new frame,
# so `pred` keeps its single 'Prediction' column and the cells that score it
# give the same result when they are re-run.
pred.assign(Actual=test.Total)

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-08-26,5815884,5821819
2020-08-27,5854058,5867785
2020-08-28,5892232,5913941
2020-08-29,5930406,5961094
2020-08-30,5968580,5996431
2020-08-31,6006754,6030587
2020-09-01,6044928,6073840
2020-09-02,6083102,6113510
2020-09-03,6121276,6150016
2020-09-04,6159450,6200518


In [750]:
# Refit the selected orders on the complete confirmed-case series and plot the
# forecast for the days after the last observation.
Future(order, seasonal_order)