# Predictive Analysis With Auto ARIMA

## Data Collection and Preprocessing

In [1]:
# Imports
import sys
import ssl
# NOTE(review): this replaces the ssl module's default HTTPS context factory with
# the unverified one, so HTTPS requests that rely on the default context skip
# certificate verification for the rest of the kernel session. Presumably a
# workaround for certificate errors while downloading the data - confirm it is
# still needed, and prefer fixing the local CA bundle over disabling verification.
ssl._create_default_https_context = ssl._create_unverified_context
import os
from datetime import datetime
from datetime import date
import pandas as pd
import numpy as np
import plotly
import math
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
from sklearn.model_selection import train_test_split
from statsmodels.tsa.statespace.sarimax import SARIMAX 
# To import the main.py file
sys.path.append('../')
from python_files import main
import warnings
warnings.filterwarnings("ignore")

# Getting all the data
# collect_data() lives in the project's main.py; its four return values are
# unpacked here and the three "global" frames are reshaped below.
confirmed_global, deaths_global, recovered_global, country_cases = main.collect_data()

# Sum the rows that belong to the same country, then transpose so that each
# country becomes a column and the former columns become the row index.
rec = recovered_global.groupby("country").sum().T
death = deaths_global.groupby("country").sum().T
conf = confirmed_global.groupby("country").sum().T

# Turn the row labels into a DatetimeIndex (assumes every remaining label is a
# date string - TODO confirm against collect_data()).
# `infer_datetime_format=True` is intentionally not passed: it is deprecated
# since pandas 2.0 (format inference is the default there) and before that it
# only influenced parsing speed, not the result.
death.index = pd.to_datetime(death.index)
rec.index = pd.to_datetime(rec.index)
conf.index = pd.to_datetime(conf.index)

In [2]:
# Initialise plotly's notebook mode so figures render inside Jupyter.
# NOTE(review): the earlier wording said "offline", but plotly documents
# connected=True as loading plotly.js from a CDN - confirm which is intended.
pyo.init_notebook_mode(connected = True)
# NOTE(review): matplotlib is not imported anywhere in the visible notebook -
# verify that this magic is still needed.
%matplotlib inline

In [3]:
def create_data_frame(Type, country="US"):
    """Build the single-column daily series that the rest of the notebook models.

    Reads the notebook-level frames ``death``, ``rec`` and ``conf`` (one column
    per country, dates as index).

    Parameters
    ----------
    Type : str
        Which series to use: 'Death', 'Recovery' or 'Confirmed'.
    country : str, default "US"
        Column to take from the selected per-country frame. The default keeps
        the original behaviour, which always used "US". A name that is not a
        column propagates pandas' KeyError.

    Returns
    -------
    pandas.DataFrame
        One column, ``Total``, indexed by ``Date``; rows whose total equals 0
        are dropped.

    Raises
    ------
    ValueError
        If ``Type`` is not one of the supported names. (Previously an unknown
        name fell through every branch and failed later with an
        UnboundLocalError.)
    """
    # Pick the source frame first so an invalid Type fails fast with a clear message.
    if Type == 'Death':
        source = death
    elif Type == 'Recovery':
        source = rec
    elif Type == 'Confirmed':
        source = conf
    else:
        raise ValueError(
            f"Unknown Type {Type!r}; expected 'Death', 'Recovery' or 'Confirmed'"
        )

    # Creating a dataframe with Total number of cases everyday in a column
    data = pd.DataFrame(index=source.index, data=source[country].values, columns=["Total"])
    data['Date'] = data.index
    # Setting Date column as index
    data = data.set_index('Date', drop=True)
    # Making sure no Zeroes exist in dataframe
    data = data[(data != 0).all(axis=1)]
    return data


In [4]:
# Plotting the Data
def plot_data_frame():
    """Return a dark-themed plotly line figure of the notebook-level ``data`` frame."""
    figure = px.line(data, template='plotly_dark')
    return figure

In [5]:
def find_params():
    """Find the best p, d, q parameters for the model.

    Runs pmdarima's ``auto_arima`` stepwise search on the notebook-level
    ``train`` frame and returns the model object it selects. With
    ``trace=True`` every candidate that is tried gets printed.

    NOTE(review): ``seasonal=False`` is passed together with seasonal settings
    (``m``, ``D``, ``start_P``/``max_P``, ``start_Q``/``max_Q``). The recorded
    search output only lists ``(0,0,0)[0]`` seasonal terms, so those settings
    look unused - confirm whether a seasonal search was intended.
    """
    search_options = dict(
        method='nm',
        # Non-seasonal order bounds; d is left for the search to decide.
        start_p=0, start_q=0,
        max_p=2, max_q=2,
        d=None,
        # Seasonal settings (see the note in the docstring).
        seasonal=False, m=7, D=1,
        start_P=0, max_P=0,
        start_Q=1, max_Q=1,
        # Search behaviour and reporting.
        n_jobs=-1,
        trace=True,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
    )
    return auto_arima(train, **search_options)

In [6]:
def split_dataframe():
    """Split the notebook-level ``data`` frame into train and test parts.

    The split is unshuffled with a fixed test size of 14 rows, so the test
    part is the tail of the series. The shapes and the first/last date of each
    part are printed.

    Returns
    -------
    tuple
        ``(train, test)`` as produced by ``train_test_split``.
    """
    train_part, test_part = train_test_split(data, test_size=14, shuffle=False)

    print('Test shape:', test_part.shape)
    print('Train shape:', train_part.shape)

    train_dates = train_part.index
    print(f'Min date from train set: {train_dates.min().date()}')
    print(f'Max date from train set:{train_dates.max().date()}')

    test_dates = test_part.index
    print(f'Min date from test set: {test_dates.min().date()}')
    print(f'Max date from test set: {test_dates.max().date()}')

    return train_part, test_part

In [7]:
def Predict():
    """Fit the selected model on ``train`` and forecast the held-out period.

    Works on the notebook-level ``stepwise_model``, ``train``, ``test`` and
    ``data``; ``stepwise_model`` is refitted in place.

    Returns
    -------
    tuple
        ``(figure, pred)`` where ``figure`` is a plotly line chart of ``data``
        with the forecast next to it and ``pred`` is a DataFrame with one
        column, ``Prediction``, indexed like ``test``.
    """
    # Fitting the model with train dataset
    stepwise_model.fit(train)
    # Forecasting one value per row of the test set
    forecast = stepwise_model.predict(n_periods=len(test))
    # Reduce the forecast to plain values before building the frame: depending
    # on the pmdarima version predict() may hand back a pandas Series, and a
    # Series combined with `columns=` is matched by name, which would produce
    # an all-NaN 'Prediction' column. For a plain array this is a no-op.
    pred = pd.DataFrame(np.asarray(forecast), index=test.index, columns=['Prediction'])
    # Plotting the predicted values and actual values
    return px.line(pd.concat([data, pred], axis=1), template='plotly_dark'), pred

In [8]:
def Future(order, seasonal_order, steps=14):
    """Refit on the full series and plot a forecast beyond its last observation.

    Uses the notebook-level ``data`` frame (column ``Total``).

    Parameters
    ----------
    order : tuple
        Non-seasonal order passed to ``SARIMAX``.
    seasonal_order : tuple
        Seasonal order passed to ``SARIMAX``.
    steps : int, default 14
        Number of periods to forecast past the end of ``data``. The default
        matches the horizon that used to be hard-coded.

    Returns
    -------
    plotly figure
        Line chart of ``data`` together with the ``Forecast`` series.

    Raises
    ------
    ValueError
        If ``steps`` is smaller than 1.
    """
    if steps < 1:
        raise ValueError(f"steps must be at least 1, got {steps!r}")

    # Train the model on the full dataset
    # NOTE(review): the recorded auto_arima output picks models "with
    # intercept" for some series, while no trend argument is passed here, so
    # this refit may not contain that term - confirm this is intended.
    model = SARIMAX(data['Total'],
                    order=order,
                    seasonal_order=seasonal_order)
    result = model.fit()

    # Positions n_obs .. n_obs + steps - 1 are the first `steps` out-of-sample points.
    n_obs = len(data)
    forecast = result.predict(start=n_obs, end=n_obs + steps - 1).rename('Forecast')
    return px.line(pd.concat([data, forecast], axis=1), template='plotly_dark')



In [9]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``.

    Both inputs are converted with ``np.array``; the result is the mean of
    ``|(y_true - y_pred) / y_true|`` over all elements, multiplied by 100.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean() * 100

In [10]:
def pred_to_int(pred):
    """Round the ``Prediction`` column to whole numbers and return the frame.

    A value whose fractional part (``value % 1``) is at least 0.5 is rounded
    up with ``math.ceil``; every other value is rounded down with
    ``math.floor``. The column is overwritten on the frame that was passed in,
    and that same frame is returned.
    """
    rounded = [
        math.ceil(value) if value % 1 >= 0.5 else math.floor(value)
        for value in pred['Prediction']
    ]
    pred['Prediction'] = rounded
    return pred


## Recovery

In [11]:
# Work on the US recovered series. The helper functions read this
# notebook-level `data`, so it has to be (re)bound before they are called.
data=create_data_frame('Recovery')
plot_data_frame()

In [12]:
# Mark the index as daily before splitting and modelling.
data.index.freq='D'
# Hold out the last 14 rows as the test set, then run the stepwise ARIMA
# search; find_params() reads the `train` bound on the previous line.
train,test=split_dataframe()
stepwise_model=find_params()
# Optional diagnostics, left disabled:
# print(stepwise_model.aic())
# print(stepwise_model.summary())

Test shape: (14, 1)
Train shape: (200, 1)
Min date from train set: 2020-02-09
Max date from train set:2020-08-26
Min date from test set: 2020-08-27
Max date from test set: 2020-09-09
Performing stepwise search to minimize aic
 ARIMA(0,2,0)(0,0,0)[0] intercept   : AIC=4346.551, Time=0.14 sec
 ARIMA(1,2,0)(0,0,0)[0] intercept   : AIC=4237.683, Time=0.12 sec
 ARIMA(0,2,1)(0,0,0)[0] intercept   : AIC=4202.637, Time=0.11 sec
 ARIMA(0,2,0)(0,0,0)[0]             : AIC=4344.575, Time=0.04 sec
 ARIMA(1,2,1)(0,0,0)[0] intercept   : AIC=4204.940, Time=0.07 sec
 ARIMA(0,2,2)(0,0,0)[0] intercept   : AIC=4204.448, Time=0.19 sec
 ARIMA(1,2,2)(0,0,0)[0] intercept   : AIC=4203.178, Time=0.17 sec
 ARIMA(0,2,1)(0,0,0)[0]             : AIC=4207.207, Time=0.15 sec

Best model:  ARIMA(0,2,1)(0,0,0)[0] intercept
Total fit time: 1.008 seconds


In [13]:
# Keep the selected orders; they are passed to Future() further down.
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

In [14]:
# Forecast the held-out test period with the selected model and show the
# forecast next to the full series.
fig,pred=Predict()
fig

In [15]:
# Round the forecasts to whole counts (pred_to_int overwrites the column on
# the same frame and returns it).
pred=pred_to_int(pred)

In [16]:
# MAPE (in percent) of the rounded forecast against the held-out values.
# Must run before the 'Actual' column is added to `pred`.
mean_absolute_percentage_error(test,pred)

2.0736353851307694

In [17]:
# Put the observed totals next to the predictions for a side-by-side view.
pred['Actual']=test.Total
pred

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-08-27,2111070,2101326
2020-08-28,2137823,2118367
2020-08-29,2164727,2140614
2020-08-30,2191779,2153939
2020-08-31,2218981,2184825
2020-09-01,2246332,2202663
2020-09-02,2273833,2231757
2020-09-03,2301483,2266957
2020-09-04,2329282,2283454
2020-09-05,2357230,2302187


In [18]:
# Refit on the full recovered series with the selected orders and plot the
# forecast for the 14 days after the last observation.
Future(order, seasonal_order)

In [19]:
# Inspect the index: confirms the name 'Date' and the daily frequency set above.
# NOTE(review): looks like a leftover inspection cell - consider removing it.
data.index


DatetimeIndex(['2020-02-09', '2020-02-10', '2020-02-11', '2020-02-12',
               '2020-02-13', '2020-02-14', '2020-02-15', '2020-02-16',
               '2020-02-17', '2020-02-18',
               ...
               '2020-08-31', '2020-09-01', '2020-09-02', '2020-09-03',
               '2020-09-04', '2020-09-05', '2020-09-06', '2020-09-07',
               '2020-09-08', '2020-09-09'],
              dtype='datetime64[ns]', name='Date', length=214, freq='D')

## Deaths

In [20]:
# Switch the notebook-level `data` to the US deaths series; everything below
# in this section reads it.
data=create_data_frame('Death')
plot_data_frame()

In [21]:
# Same pipeline as the previous section, applied to the deaths series:
# mark the index as daily, hold out the last 14 rows, run the stepwise search.
data.index.freq='D'
train,test=split_dataframe()
stepwise_model=find_params()
# Keep the selected orders for the Future() call at the end of the section.
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

Test shape: (14, 1)
Train shape: (180, 1)
Min date from train set: 2020-02-29
Max date from train set:2020-08-26
Min date from test set: 2020-08-27
Max date from test set: 2020-09-09
Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=2845.762, Time=0.12 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=2555.018, Time=0.11 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=2681.870, Time=0.11 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=3051.556, Time=0.06 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=2552.754, Time=0.11 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=2987.460, Time=0.07 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=2548.501, Time=0.09 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=2538.040, Time=0.15 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept   : AIC=2623.478, Time=0.13 sec
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=2565.035, Time=0.14 sec
 ARIMA(1,1,2)(0,0,0)[0]             : AIC=2536.726, Time=0.10 sec
 ARIMA(0,1,2)(0,0,0)[0]             : AIC=2729.6

In [22]:
# Forecast the held-out test period for deaths and plot it against the full series.
fig,pred=Predict()
fig

In [23]:
# Round the death forecasts to whole counts.
pred=pred_to_int(pred)

In [24]:
# MAPE (in percent) for the deaths forecast; run before 'Actual' is added to `pred`.
mean_absolute_percentage_error(test,pred)

0.4060051190595915

In [25]:
# Show predicted and observed death totals side by side.
pred['Actual']=test.Total
pred

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-08-27,180645,180785
2020-08-28,181615,181756
2020-08-29,182575,182714
2020-08-30,183525,183024
2020-08-31,184466,183597
2020-09-01,185397,184664
2020-09-02,186319,185720
2020-09-03,187232,186790
2020-09-04,188136,187755
2020-09-05,189031,188538


In [26]:
# Refit on the full deaths series and plot the forecast for the next 14 days.
Future(order, seasonal_order)

## Confirmed

In [27]:
# Switch the notebook-level `data` to the US confirmed-cases series for this section.
data=create_data_frame('Confirmed')
plot_data_frame()

In [28]:
# Same pipeline again for confirmed cases: daily index, 14-row hold-out,
# stepwise ARIMA search on the training part.
data.index.freq='D'
train,test=split_dataframe()
stepwise_model=find_params()
# Keep the selected orders for the Future() call at the end of the section.
order=stepwise_model.get_params()['order']
seasonal_order=stepwise_model.get_params()['seasonal_order']

Test shape: (14, 1)
Train shape: (218, 1)
Min date from train set: 2020-01-22
Max date from train set:2020-08-26
Min date from test set: 2020-08-27
Max date from test set: 2020-09-09
Performing stepwise search to minimize aic
 ARIMA(0,2,0)(0,0,0)[0] intercept   : AIC=4204.754, Time=0.17 sec
 ARIMA(1,2,0)(0,0,0)[0] intercept   : AIC=4206.146, Time=0.11 sec
 ARIMA(0,2,1)(0,0,0)[0] intercept   : AIC=4206.126, Time=0.11 sec
 ARIMA(0,2,0)(0,0,0)[0]             : AIC=4203.304, Time=0.06 sec
 ARIMA(1,2,1)(0,0,0)[0] intercept   : AIC=4201.979, Time=0.12 sec
 ARIMA(2,2,1)(0,0,0)[0] intercept   : AIC=4210.746, Time=0.11 sec
 ARIMA(1,2,2)(0,0,0)[0] intercept   : AIC=4208.918, Time=0.12 sec
 ARIMA(0,2,2)(0,0,0)[0] intercept   : AIC=4205.798, Time=0.19 sec
 ARIMA(2,2,0)(0,0,0)[0] intercept   : AIC=4208.143, Time=0.21 sec
 ARIMA(2,2,2)(0,0,0)[0] intercept   : AIC=4218.305, Time=0.22 sec
 ARIMA(1,2,1)(0,0,0)[0]             : AIC=4206.736, Time=0.13 sec

Best model:  ARIMA(1,2,1)(0,0,0)[0] intercept
T

In [29]:
# Forecast the held-out test period for confirmed cases and plot it against the full series.
fig,pred=Predict()
fig

In [30]:
# Round the confirmed-case forecasts to whole counts.
pred=pred_to_int(pred)

In [31]:
# MAPE (in percent) for the confirmed-case forecast; run before 'Actual' is added to `pred`.
mean_absolute_percentage_error(test,pred)

0.4265162799837424

In [32]:
# Show predicted and observed confirmed totals side by side.
pred['Actual']=test.Total
pred

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-08-27,5864780,5867785
2020-08-28,5909123,5913941
2020-08-29,5952550,5961094
2020-08-30,5997150,5996431
2020-08-31,6041023,6030587
2020-09-01,6085896,6073840
2020-09-02,6130200,6113510
2020-09-03,6175361,6150016
2020-09-04,6220082,6200518
2020-09-05,6265543,6244970


In [33]:
# Refit on the full confirmed-cases series and plot the forecast for the next 14 days.
Future(order, seasonal_order)