# Predictive Analysis With Auto ARIMA 

## Data Collection and Preprocessing

In [1]:
# Imports
import sys
import os
from datetime import datetime
from datetime import date
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
from sklearn.model_selection import train_test_split
from statsmodels.tsa.statespace.sarimax import SARIMAX 
# To import the main.py file
sys.path.append('../')
from python_files import main
import warnings
warnings.filterwarnings("ignore")

# Getting all the data. `country_cases` is returned by the helper but is not
# used anywhere in the visible part of this notebook.
confirmed_global, deaths_global, recovered_global, country_cases = main.collect_data()

# Aggregate per country and transpose, so the index holds the date labels and
# each column is one country.
rec = recovered_global.groupby("country").sum().T
death = deaths_global.groupby("country").sum().T
conf = confirmed_global.groupby("country").sum().T

# Parse the date labels into a DatetimeIndex. `infer_datetime_format` is
# deprecated in pandas 2.x (format inference is the default behaviour there),
# so it is no longer passed; the parsed values are the same.
death.index = pd.to_datetime(death.index)
rec.index = pd.to_datetime(rec.index)
conf.index = pd.to_datetime(conf.index)

In [2]:
# Setting up plotly to render figures inside Jupyter notebooks.
# NOTE(review): per the plotly docs, connected=True presumably loads plotly.js
# from a CDN, so rendering may need network access - confirm this is intended.
pyo.init_notebook_mode(connected = True)
# NOTE(review): no matplotlib call is visible in this notebook, so this magic
# may be unnecessary - confirm before removing.
%matplotlib inline

In [55]:
def create_data_frame(Type, country="US"):
    """Build the single-column series for one case type and one country.

    Reads the notebook-level frames ``death``, ``rec`` and ``conf`` (date
    index, one column per country).

    Parameters
    ----------
    Type : str
        Which frame to read: ``'Death'``, ``'Recovery'`` or ``'Confirmed'``.
    country : str, default "US"
        Column to extract from the selected frame.

    Returns
    -------
    pandas.DataFrame
        One column named ``"Total"``, index named ``"Date"``, containing only
        the rows whose value is non-zero.

    Raises
    ------
    ValueError
        If ``Type`` is not one of the three supported names.
    """
    if Type == 'Death':
        source = death
    elif Type == 'Recovery':
        source = rec
    elif Type == 'Confirmed':
        source = conf
    else:
        # An unknown Type used to fall through and fail further down with an
        # UnboundLocalError on `data`; fail early with a clear message instead.
        raise ValueError(
            f"Unknown Type {Type!r}; expected 'Death', 'Recovery' or 'Confirmed'"
        )

    # Creating a dataframe with the selected country's values in a column
    data = pd.DataFrame(index=source.index, data=source[country].values, columns=["Total"])
    # Name the index "Date" (same result as copying the index into a "Date"
    # column and then setting that column as the index).
    data = data.rename_axis("Date")
    # Making sure no zeroes exist in dataframe
    data = data[(data != 0).all(axis=1)]
    return data


In [56]:
# Plotting the Data
def plot_data_frame(frame=None):
    """Return a dark-themed plotly line figure of a data frame.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to plot. When omitted, the notebook-level ``data`` frame is
        used, so existing ``plot_data_frame()`` calls keep working.

    Returns
    -------
    The figure object produced by ``px.line``.
    """
    if frame is None:
        # Fall back to the notebook global to preserve the original call style.
        frame = data
    return px.line(frame, template = 'plotly_dark')

In [57]:
def find_params(series=None, m=7):
    """Find the best p, d, q parameters for the model with ``auto_arima``.

    Runs a stepwise seasonal search with p and q starting at 1 and capped at 3,
    seasonal differencing fixed at ``D=1``, ``d`` left to ``auto_arima``
    (``d=None``), and the search trace printed.

    Parameters
    ----------
    series : array-like or pandas object, optional
        Training data. When omitted, the notebook-level ``train`` is used, so
        existing ``find_params()`` calls keep working.
    m : int, default 7
        Seasonal period passed to ``auto_arima``.

    Returns
    -------
    The model object returned by ``auto_arima``.
    """
    if series is None:
        # Fall back to the notebook global to preserve the original call style.
        series = train
    return auto_arima(series, start_p = 1, start_q = 1,
                      max_p = 3, max_q = 3, m = m,
                      start_P = 0, seasonal = True,
                      d = None, D = 1, trace = True,
                      error_action = 'ignore',
                      suppress_warnings = True,
                      stepwise = True)

In [58]:
def split_dataframe(frame=None, test_size=15):
    """Split a time-ordered frame into train and test parts without shuffling.

    The split is done by ``train_test_split`` with ``shuffle=False``, so row
    order is kept and the test part is taken from the end of the frame. The
    shapes and date ranges of both parts are printed.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with a datetime index. When omitted, the notebook-level ``data``
        frame is used, so existing ``split_dataframe()`` calls keep working.
    test_size : int or float, default 15
        Passed through to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    if frame is None:
        # Fall back to the notebook global to preserve the original call style.
        frame = data
    # Hold out the trailing part of the series as the test set.
    train, test = train_test_split(frame, test_size=test_size, shuffle=False)
    print('Test shape:',test.shape)
    print('Train shape:',train.shape)
    print(f'Min date from train set: {train.index.min().date()}')
    print(f'Max date from train set:{train.index.max().date()}')
    print(f'Min date from test set: {test.index.min().date()}')
    print(f'Max date from test set: {test.index.max().date()}')
    return train,test

In [59]:
def Predict(model=None, train_set=None, test_set=None, full_set=None):
    """Fit the model on the training data and forecast over the test period.

    Every argument is optional; an omitted one falls back to the notebook-level
    name the original zero-argument version used (``stepwise_model``,
    ``train``, ``test``, ``data``), so ``Predict()`` keeps working.

    Parameters
    ----------
    model : fitted-search model, optional
        Object exposing ``fit(y)`` and ``predict(n_periods=...)``. It is
        refitted in place on ``train_set``.
    train_set : pandas.DataFrame, optional
        Training data passed to ``model.fit``.
    test_set : pandas.DataFrame, optional
        Held-out data; its length sets the forecast horizon and its index is
        used for the prediction frame.
    full_set : pandas.DataFrame, optional
        Full series plotted next to the prediction.

    Returns
    -------
    tuple
        ``(figure, pred)``: the ``px.line`` figure of ``full_set`` joined with
        the prediction, and ``pred``, a one-column (``"Prediction"``) frame
        indexed like ``test_set``.
    """
    model = stepwise_model if model is None else model
    train_set = train if train_set is None else train_set
    test_set = test if test_set is None else test_set
    full_set = data if full_set is None else full_set

    # Fitting the model with train dataset
    model.fit(train_set)
    # Forecasting one value per row of the test set
    forecast = model.predict(n_periods=len(test_set))
    # Strip any index carried by the forecast so values are aligned with the
    # test dates by position. If the forecast is a pandas Series (newer
    # pmdarima releases may return one), passing it straight to pd.DataFrame
    # with index=... reindexes by label and can yield an all-NaN column.
    pred = pd.DataFrame(np.asarray(forecast), index=test_set.index, columns=['Prediction'])
    # Plotting the predicted values and actual values
    return px.line(pd.concat([full_set, pred], axis=1), template = 'plotly_dark'), pred

In [60]:
def Future(order, seasonal_order, steps=14, frame=None):
    """Refit SARIMAX on the full series and plot an out-of-sample forecast.

    Parameters
    ----------
    order : tuple
        Non-seasonal order passed to ``SARIMAX`` as ``order``.
    seasonal_order : tuple
        Seasonal order passed to ``SARIMAX`` as ``seasonal_order``.
    steps : int, default 14
        Number of periods to forecast after the last observation.
    frame : pandas.DataFrame, optional
        Frame with a ``"Total"`` column. When omitted, the notebook-level
        ``data`` frame is used, so existing two-argument calls keep working.

    Returns
    -------
    The ``px.line`` figure of ``frame`` joined with the ``"Forecast"`` series.

    Raises
    ------
    ValueError
        If ``steps`` is smaller than 1.
    """
    if steps < 1:
        raise ValueError(f"steps must be at least 1, got {steps}")
    if frame is None:
        # Fall back to the notebook global to preserve the original call style.
        frame = data

    # Train the model on the full dataset
    model = SARIMAX(frame['Total'],
                    order = order,
                    seasonal_order = seasonal_order)
    result = model.fit()

    # Positions n_obs .. n_obs + steps - 1 are the first `steps` points after
    # the end of the observed series.
    n_obs = len(frame)
    forecast = result.predict(start = n_obs,
                              end = n_obs + steps - 1).rename('Forecast')
    return px.line(pd.concat([frame, forecast], axis=1), template = 'plotly_dark')



In [61]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Both inputs are converted to flat float arrays before comparison, so a
    1-D sequence, a column vector and a single-column DataFrame are treated
    alike. Without flattening, pairing a shape ``(n,)`` input with a shape
    ``(n, 1)`` input broadcasts to an ``(n, n)`` matrix and silently returns a
    wrong value.

    Parameters
    ----------
    y_true : array-like
        Observed values. A zero entry makes the result ``inf`` or ``nan``
        because the error is divided by ``y_true``.
    y_pred : array-like
        Predicted values; must hold the same number of elements as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {actual.size} and {predicted.size}"
        )
    return np.mean(np.abs((actual - predicted) / actual)) * 100

## Recovery

In [62]:
# Build the recovery series and plot it. `data` is a notebook-level name that
# the plotting, splitting and forecasting helpers read when called without a frame.
data=create_data_frame('Recovery')
plot_data_frame()

In [63]:
# Hold out the test window, then run the auto_arima search on the training part.
train,test=split_dataframe()
stepwise_model=find_params()
# Optional diagnostics for the selected model (uncomment to inspect):
# print(stepwise_model.aic())
# print(stepwise_model.summary())

Test shape: (15, 1)
Train shape: (158, 1)
Min date from train set: 2020-02-09
Max date from train set:2020-07-15
Min date from test set: 2020-07-16
Max date from test set: 2020-07-30
Performing stepwise search to minimize aic
Fit ARIMA(1,1,1)x(0,1,1,7) [intercept=True]; AIC=3234.826, BIC=3249.879, Time=0.677 seconds
Fit ARIMA(0,1,0)x(0,1,0,7) [intercept=True]; AIC=3294.930, BIC=3300.952, Time=0.033 seconds
Fit ARIMA(1,1,0)x(1,1,0,7) [intercept=True]; AIC=3244.160, BIC=3256.202, Time=0.340 seconds
Fit ARIMA(0,1,1)x(0,1,1,7) [intercept=True]; AIC=3232.850, BIC=3244.893, Time=0.582 seconds
Fit ARIMA(0,1,0)x(0,1,0,7) [intercept=False]; AIC=3293.641, BIC=3296.652, Time=0.023 seconds
Fit ARIMA(0,1,1)x(0,1,0,7) [intercept=True]; AIC=3296.914, BIC=3305.946, Time=0.089 seconds
Fit ARIMA(0,1,1)x(1,1,1,7) [intercept=True]; AIC=3233.977, BIC=3249.030, Time=0.463 seconds
Fit ARIMA(0,1,1)x(0,1,2,7) [intercept=True]; AIC=3233.547, BIC=3248.600, Time=0.882 seconds
Fit ARIMA(0,1,1)x(1,1,0,7) [intercept

In [64]:
# Keep the selected non-seasonal and seasonal orders; they are passed to
# Future() for the refit on the full series.
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

In [65]:
# Fit on the training split and forecast over the test window; `fig` shows the
# full series together with the prediction, `pred` holds the predicted values.
fig,pred=Predict()
fig

In [66]:
# Compare explicit columns rather than whole frames, so the score stays correct
# even if `pred` has gained extra columns by the time this cell is re-run.
mean_absolute_percentage_error(test['Total'], pred['Prediction'])

1.754454991036087

In [67]:
# Show predictions next to the actual values without mutating `pred`, so that
# re-running an earlier cell that uses `pred` still sees a single-column frame.
pred.assign(Actual=test.Total)

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-16,1093431.0,1090645
2020-07-17,1110899.0,1107204
2020-07-18,1131764.0,1122720
2020-07-19,1147482.0,1131121
2020-07-20,1166597.0,1160087
2020-07-21,1181906.0,1182018
2020-07-22,1198673.0,1210849
2020-07-23,1217118.0,1233269
2020-07-24,1235480.0,1261624
2020-07-25,1257242.0,1279414


In [68]:
# Refit on the full recovery series with the selected orders and plot the
# forecast that continues past the last observation.
Future(order, seasonal_order)

## Deaths

In [69]:
# Rebind the notebook-level `data` to the deaths series and plot it; every
# following cell in this section works on this frame.
data=create_data_frame('Death')
plot_data_frame()

In [70]:
# Hold out the test window, run the auto_arima search on the training part,
# and keep the selected orders for the refit on the full series.
train,test=split_dataframe()
stepwise_model=find_params()
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

Test shape: (15, 1)
Train shape: (138, 1)
Min date from train set: 2020-02-29
Max date from train set:2020-07-15
Min date from test set: 2020-07-16
Max date from test set: 2020-07-30
Performing stepwise search to minimize aic
Fit ARIMA(1,2,1)x(0,1,1,7) [intercept=True]; AIC=1841.128, BIC=1855.428, Time=1.513 seconds
Fit ARIMA(0,2,0)x(0,1,0,7) [intercept=True]; AIC=1917.559, BIC=1923.278, Time=0.029 seconds
Fit ARIMA(1,2,0)x(1,1,0,7) [intercept=True]; AIC=1866.119, BIC=1877.558, Time=0.852 seconds
Fit ARIMA(0,2,1)x(0,1,1,7) [intercept=True]; AIC=1839.453, BIC=1850.892, Time=0.920 seconds
Fit ARIMA(0,2,0)x(0,1,0,7) [intercept=False]; AIC=1915.560, BIC=1918.420, Time=0.029 seconds
Fit ARIMA(0,2,1)x(0,1,0,7) [intercept=True]; AIC=1870.843, BIC=1879.422, Time=0.223 seconds
Fit ARIMA(0,2,1)x(1,1,1,7) [intercept=True]; AIC=1840.868, BIC=1855.167, Time=2.479 seconds
Fit ARIMA(0,2,1)x(0,1,2,7) [intercept=True]; AIC=1841.214, BIC=1855.514, Time=2.285 seconds
Fit ARIMA(0,2,1)x(1,1,0,7) [intercept

In [71]:
# Fit on the training split and forecast over the test window; `fig` shows the
# full series together with the prediction, `pred` holds the predicted values.
fig,pred=Predict()
fig

In [72]:
# Compare explicit columns rather than whole frames, so the score stays correct
# even if `pred` has gained extra columns by the time this cell is re-run.
mean_absolute_percentage_error(test['Total'], pred['Prediction'])

0.9584152427350744

In [73]:
# Show predictions next to the actual values without mutating `pred`, so that
# re-running an earlier cell that uses `pred` still sees a single-column frame.
pred.assign(Actual=test.Total)

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-16,138442.362766,138358
2020-07-17,139234.670102,139266
2020-07-18,139844.997802,140119
2020-07-19,140234.151305,140534
2020-07-20,140644.166561,141025
2020-07-21,141568.205467,142121
2020-07-22,142459.065199,143316
2020-07-23,143438.942619,144430
2020-07-24,144201.826884,145560
2020-07-25,144785.39265,146465


In [74]:
# Refit on the full deaths series with the selected orders and plot the
# forecast that continues past the last observation.
Future(order, seasonal_order)

## Confirmed

In [75]:
# Rebind the notebook-level `data` to the confirmed-cases series and plot it;
# every following cell in this section works on this frame.
data=create_data_frame('Confirmed')
plot_data_frame()

In [76]:
# Hold out the test window, run the auto_arima search on the training part,
# and keep the selected orders for the refit on the full series.
train,test=split_dataframe()
stepwise_model=find_params()
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

Test shape: (15, 1)
Train shape: (176, 1)
Min date from train set: 2020-01-22
Max date from train set:2020-07-15
Min date from test set: 2020-07-16
Max date from test set: 2020-07-30
Performing stepwise search to minimize aic
Fit ARIMA(1,2,1)x(0,1,1,7) [intercept=True]; AIC=3057.445, BIC=3073.035, Time=0.757 seconds
Fit ARIMA(0,2,0)x(0,1,0,7) [intercept=True]; AIC=3117.929, BIC=3124.165, Time=0.037 seconds
Fit ARIMA(1,2,0)x(1,1,0,7) [intercept=True]; AIC=3062.838, BIC=3075.310, Time=0.331 seconds
Fit ARIMA(0,2,1)x(0,1,1,7) [intercept=True]; AIC=3057.662, BIC=3070.134, Time=0.998 seconds
Fit ARIMA(0,2,0)x(0,1,0,7) [intercept=False]; AIC=3115.991, BIC=3119.109, Time=0.058 seconds
Fit ARIMA(1,2,1)x(0,1,0,7) [intercept=True]; AIC=3081.186, BIC=3093.658, Time=0.103 seconds
Fit ARIMA(1,2,1)x(1,1,1,7) [intercept=True]; AIC=3055.180, BIC=3073.888, Time=3.210 seconds
Fit ARIMA(1,2,1)x(1,1,0,7) [intercept=True]; AIC=3064.782, BIC=3080.372, Time=0.694 seconds
Fit ARIMA(1,2,1)x(2,1,1,7) [intercept

In [77]:
# Fit on the training split and forecast over the test window; `fig` shows the
# full series together with the prediction, `pred` holds the predicted values.
fig,pred=Predict()
fig

In [78]:
# Compare explicit columns rather than whole frames, so the score stays correct
# even if `pred` has gained extra columns by the time this cell is re-run.
mean_absolute_percentage_error(test['Total'], pred['Prediction'])

1.5322890907645064

In [79]:
# Show predictions next to the actual values without mutating `pred`, so that
# re-running an earlier cell that uses `pred` still sees a single-column frame.
pred.assign(Actual=test.Total)

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-16,3570789.0,3576157
2020-07-17,3645668.0,3647715
2020-07-18,3714861.0,3711413
2020-07-19,3783435.0,3773260
2020-07-20,3851391.0,3834677
2020-07-21,3928362.0,3899211
2020-07-22,4005845.0,3970121
2020-07-23,4087746.0,4038816
2020-07-24,4172778.0,4112531
2020-07-25,4252158.0,4178970


In [80]:
# Refit on the full confirmed-cases series with the selected orders and plot
# the forecast that continues past the last observation.
Future(order, seasonal_order)