# Predictive Analysis With Auto ARIMA

## Data Collection and Preprocessing

In [119]:
# Imports
import sys
import ssl
# SECURITY NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, i.e. TLS certificate verification is turned off for HTTPS
# requests that rely on the standard library's default context for the rest
# of the kernel session. Presumably a workaround for certificate errors while
# downloading the data -- confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context
import os
from datetime import datetime
from datetime import date
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
from sklearn.model_selection import train_test_split
from statsmodels.tsa.statespace.sarimax import SARIMAX 
# To import the main.py file
sys.path.append('../')
from python_files import main
import warnings
# Silences every warning for the rest of the session, which includes
# deprecation and model-fitting warnings emitted by the libraries used below.
warnings.filterwarnings("ignore")

# Getting all the data
confirmed_global, deaths_global, recovered_global, country_cases = main.collect_data()


def _daily_totals_by_country(frame):
    """Sum `frame` per country and return it with dates as a DatetimeIndex.

    The frame is grouped on its "country" column, summed and transposed so
    that each column is one country and each row one date label, which is
    then parsed with `pd.to_datetime`.
    """
    totals = frame.groupby("country").sum().T
    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference
    # is the default behaviour), so the plain call is used here.
    totals.index = pd.to_datetime(totals.index)
    return totals


rec = _daily_totals_by_country(recovered_global)
death = _daily_totals_by_country(deaths_global)
conf = _daily_totals_by_country(confirmed_global)

In [120]:
# Setting up plotly to work offline and in jupyter notebooks
# NOTE(review): per the plotly docs, connected = True loads plotly.js from an
# online CDN instead of embedding it, so rendering needs network access.
pyo.init_notebook_mode(connected = True)
# IPython magic selecting the inline matplotlib backend; the cells visible
# here only draw plotly figures, so this may be unnecessary -- TODO confirm.
%matplotlib inline

In [278]:
def create_data_frame(Type, country="Spain"):
    """Build the daily "Total" series of one case type for one country.

    Parameters
    ----------
    Type : str
        One of 'Death', 'Recovery' or 'Confirmed'. Selects which module-level
        frame (`death`, `rec` or `conf`) the series is read from.
    country : str, default "Spain"
        Column of the selected frame to extract.

    Returns
    -------
    pandas.DataFrame
        A single column named "Total", indexed by "Date", with every row
        whose total is zero removed.

    Raises
    ------
    ValueError
        If `Type` is not one of the supported values (previously this fell
        through and failed with an UnboundLocalError).
    """
    supported = ('Death', 'Recovery', 'Confirmed')
    # Validate before touching any frame so a typo fails with a clear message.
    if Type not in supported:
        raise ValueError(f"Unknown Type {Type!r}; expected one of {supported}")

    if Type == 'Death':
        source = death
    elif Type == 'Recovery':
        source = rec
    else:
        source = conf

    # Creating a dataframe with Total number of cases everyday in a column
    data = pd.DataFrame(index=source.index, data=source[country].values, columns=["Total"])
    # rename_axis returns a new object, so the source frame's index keeps its name.
    data = data.rename_axis('Date')
    # Making sure no zeroes exist in dataframe
    return data[data["Total"] != 0]


In [279]:
# Plotting the Data
def plot_data_frame(frame=None):
    """Return a dark-themed plotly line chart of a data frame.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to plot. When omitted, the module-level `data` variable is
        used, which is what existing zero-argument calls rely on.
    """
    if frame is None:
        frame = data
    return px.line(frame, template = 'plotly_dark')

In [280]:
def find_params(series=None):
    """Find the best p, d, q parameters for the model with a stepwise search.

    `auto_arima` is run with a seasonal period of 7 observations (`m = 7`),
    one seasonal difference (`D = 1`), the non-seasonal differencing order
    chosen automatically (`d = None`) and p/q capped at 3. The search trace
    is printed (`trace = True`).

    Parameters
    ----------
    series : array-like or pandas object, optional
        Training data to search on. When omitted, the module-level `train`
        variable is used, which is what existing zero-argument calls rely on.

    Returns
    -------
    The model object returned by `auto_arima`.
    """
    if series is None:
        series = train
    stepwise_model = auto_arima(series, start_p = 1, start_q = 1,
                               max_p = 3, max_q = 3, m = 7,
                               start_P = 0, seasonal = True,
                               d = None, D = 1, trace = True,
                               error_action = 'ignore',
                               suppress_warnings = True,
                               stepwise = True)
    return stepwise_model

In [281]:
def split_dataframe(frame=None, test_size=14):
    """Split a time-indexed frame into train and test parts without shuffling.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to split. When omitted, the module-level `data` variable is
        used, which is what existing zero-argument calls rely on.
    test_size : int or float, default 14
        Passed to `train_test_split`; the default keeps the last 14 rows as
        the test set.

    Returns
    -------
    tuple
        `(train, test)`; the shapes and date ranges of both are printed.
    """
    if frame is None:
        frame = data
    # shuffle=False keeps chronological order, so the test set is the tail.
    train, test = train_test_split(frame, test_size=test_size, shuffle=False)
    print('Test shape:',test.shape)
    print('Train shape:',train.shape)
    print(f'Min date from train set: {train.index.min().date()}')
    print(f'Max date from train set:{train.index.max().date()}')
    print(f'Min date from test set: {test.index.min().date()}')
    print(f'Max date from test set: {test.index.max().date()}')
    return train,test

In [282]:
def Predict():
    """Fit the selected model on the train set and forecast the test window.

    Reads the module-level `stepwise_model`, `train`, `test` and `data`
    variables. `stepwise_model` is refitted in place on `train`.

    Returns
    -------
    tuple
        `(figure, pred)`: a dark-themed plotly line chart of `data` together
        with the forecast, and a DataFrame with one "Prediction" column
        indexed like `test`.
    """
    # Fitting the model with train dataset
    stepwise_model.fit(train)
    # Forecasting
    forecast = stepwise_model.predict(n_periods=len(test))
    # Depending on the pmdarima version, predict() returns either a plain
    # array or a pandas Series carrying its own index. Dropping to a bare
    # array keeps pandas from realigning on that index (which would yield
    # NaN) when the test index is attached.
    pred = pd.DataFrame(np.asarray(forecast), index = test.index, columns=['Prediction'])
    # Plotting the predicted values and actual values
    return px.line(pd.concat([data,pred],axis=1), template = 'plotly_dark'),pred

In [283]:
def Future(order,seasonal_order,n_periods=14):
    """Refit SARIMAX on the full series and plot an out-of-sample forecast.

    Reads the module-level `data` variable and models its "Total" column.

    Parameters
    ----------
    order : tuple
        Non-seasonal order passed to `SARIMAX`.
    seasonal_order : tuple
        Seasonal order passed to `SARIMAX`.
    n_periods : int, default 14
        Number of steps to forecast past the end of `data`.

    Returns
    -------
    A dark-themed plotly line chart of `data` together with the "Forecast"
    series.

    Raises
    ------
    ValueError
        If `n_periods` is smaller than 1.
    """
    if n_periods < 1:
        raise ValueError("n_periods must be at least 1")
    # Train the model on the full dataset
    model = SARIMAX(data['Total'],
                        order = order,
                        seasonal_order = seasonal_order)
    result = model.fit()

    # Positions len(data) .. len(data) + n_periods - 1 are the first
    # n_periods out-of-sample steps.
    first_step = len(data)
    last_step = first_step + n_periods - 1
    forecast = result.predict(start = first_step, end = last_step).rename('Forecast')
    return px.line(pd.concat([data,forecast],axis=1), template = 'plotly_dark')



In [284]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both inputs are flattened to 1-D before comparison, so a Series, a
    single-column DataFrame, a list or an (n, 1) array can be mixed freely.
    Without this, a 1-D input against an (n, 1) input would broadcast to an
    (n, n) matrix and silently produce a wrong score.

    Parameters
    ----------
    y_true : array-like
        Observed values. A zero entry makes the result inf or nan, as
        dictated by NumPy's division rules.
    y_pred : array-like
        Predicted values, with the same number of elements as `y_true`.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {actual.size} and {predicted.size}"
        )
    return np.mean(np.abs((actual - predicted) / actual)) * 100

## Recovery

In [285]:
# Build the recovery series; `data` is the global the helper functions read.
data=create_data_frame('Recovery')
plot_data_frame()

In [286]:
# Hold out the last 14 observations, then search for the seasonal ARIMA
# orders on the training part.
train,test=split_dataframe()
stepwise_model=find_params()
# Uncomment to inspect the selected model:
# print(stepwise_model.aic())
# print(stepwise_model.summary())

Test shape: (14, 1)
Train shape: (161, 1)
Min date from train set: 2020-02-15
Max date from train set:2020-07-24
Min date from test set: 2020-07-25
Max date from test set: 2020-08-07
Performing stepwise search to minimize aic
Fit ARIMA(1,1,1)x(0,1,1,7) [intercept=True]; AIC=2507.388, BIC=2522.540, Time=2.526 seconds
Fit ARIMA(0,1,0)x(0,1,0,7) [intercept=True]; AIC=2590.279, BIC=2596.339, Time=0.032 seconds
Fit ARIMA(1,1,0)x(1,1,0,7) [intercept=True]; AIC=2561.061, BIC=2573.182, Time=0.635 seconds
Fit ARIMA(0,1,1)x(0,1,1,7) [intercept=True]; AIC=2569.299, BIC=2581.420, Time=0.716 seconds
Fit ARIMA(0,1,0)x(0,1,0,7) [intercept=False]; AIC=2588.279, BIC=2591.309, Time=0.029 seconds
Fit ARIMA(1,1,1)x(0,1,0,7) [intercept=True]; AIC=2572.100, BIC=2584.222, Time=0.163 seconds
Fit ARIMA(1,1,1)x(1,1,1,7) [intercept=True]; AIC=2508.001, BIC=2526.184, Time=2.599 seconds
Near non-invertible roots for order (1, 1, 1)(1, 1, 1, 7); setting score to inf (at least one inverse root too close to the borde

In [287]:
# Keep the selected orders so Future() can refit SARIMAX with them.
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

In [288]:
# Fit the selected model on the train set and forecast the test window
fig,pred=Predict()
fig

In [289]:
# MAPE of the hold-out forecast; the value is already in percent.
mean_absolute_percentage_error(test,pred)

0.09728686502593326

In [290]:
# Side-by-side view of forecast and observed values.
# NOTE: this mutates `pred` in place (it now has two columns), so run the
# MAPE cell above before this one.
pred['Actual']=test.Total
pred

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-25,150464.778807,150376
2020-07-26,150344.611546,150376
2020-07-27,150258.32076,150376
2020-07-28,150064.680247,150376
2020-07-29,150305.673857,150376
2020-07-30,150456.119238,150376
2020-07-31,150260.581681,150376
2020-08-01,150353.604178,150376
2020-08-02,150235.83258,150376
2020-08-03,150150.790462,150376


In [291]:
# Refit SARIMAX on the full series with the selected orders and plot the forecast.
Future(order, seasonal_order)

## Deaths

In [292]:
# Build the deaths series; this overwrites the global `data` used by the helpers.
data=create_data_frame('Death')
plot_data_frame()

In [293]:
# Hold out the last 14 observations, search for the seasonal ARIMA orders on
# the training part, and keep the selected orders for the final refit.
train,test=split_dataframe()
stepwise_model=find_params()
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

Test shape: (14, 1)
Train shape: (144, 1)
Min date from train set: 2020-03-03
Max date from train set:2020-07-24
Min date from test set: 2020-07-25
Max date from test set: 2020-08-07
Performing stepwise search to minimize aic
Fit ARIMA(1,1,1)x(0,1,1,7) [intercept=True]; AIC=1908.116, BIC=1922.679, Time=3.590 seconds
Fit ARIMA(0,1,0)x(0,1,0,7) [intercept=True]; AIC=1976.238, BIC=1982.063, Time=0.032 seconds
Fit ARIMA(1,1,0)x(1,1,0,7) [intercept=True]; AIC=1960.902, BIC=1972.552, Time=0.730 seconds
Fit ARIMA(0,1,1)x(0,1,1,7) [intercept=True]; AIC=1956.038, BIC=1967.689, Time=1.237 seconds
Fit ARIMA(0,1,0)x(0,1,0,7) [intercept=False]; AIC=1974.239, BIC=1977.152, Time=0.025 seconds
Fit ARIMA(1,1,1)x(0,1,0,7) [intercept=True]; AIC=1974.204, BIC=1985.854, Time=0.299 seconds
Fit ARIMA(1,1,1)x(1,1,1,7) [intercept=True]; AIC=1910.060, BIC=1927.536, Time=2.700 seconds
Near non-invertible roots for order (1, 1, 1)(1, 1, 1, 7); setting score to inf (at least one inverse root too close to the borde

In [294]:
# Fit the selected model on the train set and forecast the test window
fig,pred=Predict()
fig

In [295]:
# MAPE of the hold-out forecast; the value is already in percent.
mean_absolute_percentage_error(test,pred)

1.7394298803370507

In [296]:
# Side-by-side view of forecast and observed values.
# NOTE: this mutates `pred` in place (it now has two columns), so run the
# MAPE cell above before this one.
pred['Actual']=test.Total
pred

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-25,28400.487786,28432
2020-07-26,28355.672674,28432
2020-07-27,28220.122019,28434
2020-07-28,28171.261645,28436
2020-07-29,28152.073827,28441
2020-07-30,28109.474045,28443
2020-07-31,28132.2664,28445
2020-08-01,28049.161203,28445
2020-08-02,27951.313295,28445
2020-08-03,27761.746323,28472


In [297]:
# Refit SARIMAX on the full series with the selected orders and plot the forecast.
Future(order, seasonal_order)

## Confirmed

In [298]:
# Build the confirmed-cases series; this overwrites the global `data` used by the helpers.
data=create_data_frame('Confirmed')
plot_data_frame()

In [299]:
# Hold out the last 14 observations, search for the seasonal ARIMA orders on
# the training part, and keep the selected orders for the final refit.
train,test=split_dataframe()
stepwise_model=find_params()
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

Test shape: (14, 1)
Train shape: (175, 1)
Min date from train set: 2020-02-01
Max date from train set:2020-07-24
Min date from test set: 2020-07-25
Max date from test set: 2020-08-07
Performing stepwise search to minimize aic
Fit ARIMA(1,1,1)x(0,1,1,7) [intercept=True]; AIC=2963.961, BIC=2979.551, Time=2.442 seconds
Fit ARIMA(0,1,0)x(0,1,0,7) [intercept=True]; AIC=3048.966, BIC=3055.201, Time=0.036 seconds
Fit ARIMA(1,1,0)x(1,1,0,7) [intercept=True]; AIC=3027.837, BIC=3040.309, Time=0.286 seconds
Fit ARIMA(0,1,1)x(0,1,1,7) [intercept=True]; AIC=3036.272, BIC=3048.744, Time=0.355 seconds
Fit ARIMA(0,1,0)x(0,1,0,7) [intercept=False]; AIC=3047.148, BIC=3050.266, Time=0.042 seconds
Fit ARIMA(1,1,1)x(0,1,0,7) [intercept=True]; AIC=3031.228, BIC=3043.700, Time=0.258 seconds
Fit ARIMA(1,1,1)x(1,1,1,7) [intercept=True]; AIC=2973.602, BIC=2992.310, Time=3.252 seconds
Fit ARIMA(1,1,1)x(0,1,2,7) [intercept=True]; AIC=2976.859, BIC=2995.567, Time=4.881 seconds
Fit ARIMA(1,1,1)x(1,1,0,7) [intercept

In [300]:
# Fit the selected model on the train set and forecast the test window
fig,pred=Predict()
fig

In [301]:
# MAPE of the hold-out forecast; the value is already in percent.
mean_absolute_percentage_error(test,pred)

0.9551300402983702

In [302]:
# Side-by-side view of forecast and observed values.
# NOTE: this mutates `pred` in place (it now has two columns), so run the
# MAPE cell above before this one.
pred['Actual']=test.Total
pred

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-25,274311.457764,272421
2020-07-26,276272.655178,272421
2020-07-27,278829.333722,278782
2020-07-28,280966.242574,280610
2020-07-29,283454.539395,282641
2020-07-30,286187.715758,285430
2020-07-31,288421.546498,288522
2020-08-01,290698.276095,288522
2020-08-02,292958.612639,288522
2020-08-03,295648.838844,297054


In [303]:
# Refit SARIMAX on the full series with the selected orders and plot the forecast.
Future(order, seasonal_order)