# Predictive Analysis With Auto ARIMA 

## Data Collection and Preprocessing

In [1]:
# Imports
import sys
import os
from datetime import datetime
from datetime import date
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima

# To import the main.py file
sys.path.append('../')
from python_files import main

# Getting all the data
confirmed_global, deaths_global, recovered_global, country_cases = main.collect_data()
rec = recovered_global.groupby("country").sum().T
death=deaths_global.groupby("country").sum().T
conf=confirmed_global.groupby("country").sum().T
death.index = pd.to_datetime(death.index, infer_datetime_format = True)
rec.index=pd.to_datetime(rec.index, infer_datetime_format=True)
conf.index=pd.to_datetime(conf.index, infer_datetime_format=True)

In [2]:
# Setting up plotly to work offline and in jupyter notebooks
pyo.init_notebook_mode(connected = True)
%matplotlib inline

In [3]:
def create_data_frame(Type):
    if Type=='Death':
        # Creating a dataframe with Total number of cases everyday in a column
        data = pd.DataFrame(index = death.index, data = death["India"].values, columns = ["Total"])
    elif Type=='Recovery':
        # Creating a dataframe with Total number of cases everyday in a column
        data = pd.DataFrame(index = rec.index, data = rec["India"].values, columns = ["Total"])
    elif Type=='Confirmed':
         # Creating a dataframe with Total number of cases everyday in a column
        data = pd.DataFrame(index = conf.index, data = conf["India"].values, columns = ["Total"])
    data['Date'] = data.index
    # Setting Date column as index
    data=data.set_index('Date', drop=True)
    # Making sure no zeroes exist in dataframe
    data = data[(data != 0).all(1)]
    return data


In [4]:
# Plotting the Data
def plot_data_frame():
    return px.line(data, template = 'plotly_dark')

In [5]:
def find_params():
    stepwise_model = auto_arima(data, start_p = 1, start_q = 1,
                               max_p = 3, max_q = 3, m = 12,
                               start_P = 0, seasonal = True,
                               d = 1, D = 1, trace = True,
                               error_action = 'ignore',  
                               suppress_warnings = True, 
                               stepwise = True)
    return stepwise_model
    # Finding the pest p,d,q parametes for the model

In [6]:
def split_dataframe(split_date):
    # Setting a Split date for test and train datasets
    train = data.loc[:split_date]
    test = data.loc[split_date:]
    print('Test shape:',test.shape)
    print('Train shape:',train.shape)
    print(f'Min date from train set: {train.index.min().date()}')
    print(f'Max date from train set:{train.index.max().date()}')
    print(f'Min date from test set: {test.index.min().date()}')
    print(f'Max date from test set: {test.index.max().date()}')
    return train,test

In [7]:
def forecast():
    # Fitting the model with train dataset
    stepwise_model.fit(train)
    # Forecasting
    future_forecast = stepwise_model.predict(n_periods=len(test))
    # This returns an array of predictions:
    print(future_forecast)
    future_forecast = pd.DataFrame(future_forecast,index = test.index,columns=['Prediction'])
    # Plotting the predicted values and actual values
    return px.line(pd.concat([data,future_forecast],axis=1), template = 'plotly_dark'),future_forecast

In [8]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## Recovery

In [9]:
data=create_data_frame('Recovery')
plot_data_frame()

In [10]:
stepwise_model=find_params()
print(stepwise_model.aic())


Performing stepwise search to minimize aic
Fit ARIMA(1,1,1)x(0,1,1,12) [intercept=True]; AIC=2632.421, BIC=2647.339, Time=5.381 seconds
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=True]; AIC=2759.600, BIC=2765.568, Time=0.068 seconds
Fit ARIMA(1,1,0)x(1,1,0,12) [intercept=True]; AIC=2689.742, BIC=2701.677, Time=3.051 seconds
Fit ARIMA(0,1,1)x(0,1,1,12) [intercept=True]; AIC=2746.598, BIC=2758.532, Time=2.371 seconds
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=False]; AIC=2807.421, BIC=2810.405, Time=0.067 seconds
Fit ARIMA(1,1,1)x(0,1,0,12) [intercept=True]; AIC=2662.896, BIC=2674.830, Time=0.603 seconds
Fit ARIMA(1,1,1)x(1,1,1,12) [intercept=True]; AIC=2627.291, BIC=2645.192, Time=7.384 seconds
Fit ARIMA(1,1,1)x(1,1,0,12) [intercept=True]; AIC=2631.161, BIC=2646.079, Time=4.916 seconds
Fit ARIMA(1,1,1)x(2,1,1,12) [intercept=True]; AIC=2635.853, BIC=2656.738, Time=64.470 seconds
Near non-invertible roots for order (1, 1, 1)(2, 1, 1, 12); setting score to inf (at least one inverse root too close 

In [30]:
split_date = pd.Timestamp('2020-05-24')
train,test=split_dataframe(split_date)
fig,future_forecast=forecast()
fig

Test shape: (61, 1)
Train shape: (116, 1)
Min date from train set: 2020-01-30
Max date from train set:2020-05-24
Min date from test set: 2020-05-24
Max date from test set: 2020-07-23
[ 145839.18507865  153123.99529507  160634.59992534  168879.7072002
  177138.42643793  185305.27912658  194256.37809976  202912.79931749
  211838.92735564  221356.77536063  230720.94862054  240345.54278536
  250322.40370556  260379.97532078  270646.78739957  281876.26022332
  292906.69102581  304009.55977777  315809.21395271  327356.67016458
  339149.6117874   351704.00977927  364015.85621155  376573.91889953
  389446.04051063  402437.22629124  415568.47113282  429674.91803152
  443777.92194974  457804.0806261   472819.9247824   487506.98445488
  502594.83322289  518323.01233214  533951.238226    549931.46366308
  566231.7587146   582635.08716406  599252.9140265   616805.90327217
  634347.31481617  651875.86614927  670298.11310347  688452.71022958
  706966.28960451  726167.73852865  745252.98299976  764671

In [31]:
mean_absolute_percentage_error(test,future_forecast)

6.488968368587282

In [32]:
future_forecast['Actual']=test.Total
future_forecast

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-05-24,1.458392e+05,138536
2020-05-25,1.531240e+05,144950
2020-05-26,1.606346e+05,150793
2020-05-27,1.688797e+05,158086
2020-05-28,1.771384e+05,165386
...,...,...
2020-07-19,9.537907e+05,1118206
2020-07-20,9.766644e+05,1155338
2020-07-21,9.994221e+05,1193078
2020-07-22,1.022525e+06,1238798


## Deaths

In [14]:
data=create_data_frame('Death')
plot_data_frame()

In [15]:
stepwise_model=find_params()
print(stepwise_model.aic())

Performing stepwise search to minimize aic
Fit ARIMA(1,1,1)x(0,1,1,12) [intercept=True]; AIC=1618.552, BIC=1632.572, Time=5.048 seconds
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=True]; AIC=1675.139, BIC=1680.747, Time=0.042 seconds
Fit ARIMA(1,1,0)x(1,1,0,12) [intercept=True]; AIC=1652.263, BIC=1663.479, Time=2.483 seconds
Fit ARIMA(0,1,1)x(0,1,1,12) [intercept=True]; AIC=1621.623, BIC=1632.839, Time=2.089 seconds
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=False]; AIC=1682.635, BIC=1685.439, Time=0.042 seconds
Fit ARIMA(1,1,1)x(0,1,0,12) [intercept=True]; AIC=1678.776, BIC=1689.992, Time=0.327 seconds
Fit ARIMA(1,1,1)x(1,1,1,12) [intercept=True]; AIC=1625.239, BIC=1642.063, Time=6.790 seconds
Fit ARIMA(1,1,1)x(0,1,2,12) [intercept=True]; AIC=1621.839, BIC=1638.663, Time=12.990 seconds
Near non-invertible roots for order (1, 1, 1)(0, 1, 2, 12); setting score to inf (at least one inverse root too close to the border of the unit circle: 0.992)
Fit ARIMA(1,1,1)x(1,1,0,12) [intercept=True]; AIC=16

In [51]:
split_date = pd.Timestamp('2020-06-13')
train,test=split_dataframe(split_date)
fig,future_forecast=forecast()
fig

Test shape: (41, 1)
Train shape: (136, 1)
Min date from train set: 2020-01-30
Max date from train set:2020-06-13
Min date from test set: 2020-06-13
Max date from test set: 2020-07-23
[ 332831.62835005  345557.90879496  358226.14882071  370993.45348928
  383843.02069339  396652.72868303  409363.58839572  422928.33172057
  437000.83840025  451239.84247669  466265.99116606  480865.77304531
  496127.40613314  512120.93068884  528259.7402007   544407.72542366
  560853.81642149  577358.65814659  593191.76816796  610093.67379084
  627499.75452031  645257.14067342  663633.4612804   681853.87562981
  700577.99461338  720126.89741013  739796.14459219  759436.23977427
  779490.88029989  799692.57908337  819047.18170303  839597.5048067
  860619.110189    881977.53301707  904039.24643795  926026.57915267
  948483.8832796   971758.01725765  995151.23930894 1018600.56591192
 1042360.91350649]


In [52]:
mean_absolute_percentage_error(test,future_forecast)

6.499972073514283

In [53]:
future_forecast['Actual']=test.Total
future_forecast

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-06-13,332831.6,320922
2020-06-14,345557.9,332424
2020-06-15,358226.1,343091
2020-06-16,370993.5,354065
2020-06-17,383843.0,366946
2020-06-18,396652.7,380532
2020-06-19,409363.6,395048
2020-06-20,422928.3,410451
2020-06-21,437000.8,425282
2020-06-22,451239.8,440215


## Confirmed

In [19]:
data=create_data_frame('Confirmed')
plot_data_frame()

In [20]:
stepwise_model=find_params()
print(stepwise_model.aic())

Performing stepwise search to minimize aic
Fit ARIMA(1,1,1)x(0,1,1,12) [intercept=True]; AIC=2742.680, BIC=2758.149, Time=6.352 seconds
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=True]; AIC=3156.826, BIC=3163.013, Time=0.054 seconds
Fit ARIMA(1,1,0)x(1,1,0,12) [intercept=True]; AIC=2764.618, BIC=2776.993, Time=3.093 seconds
Near non-invertible roots for order (1, 1, 0)(1, 1, 0, 12); setting score to inf (at least one inverse root too close to the border of the unit circle: 1.000)
Fit ARIMA(0,1,1)x(0,1,1,12) [intercept=True]; AIC=2972.041, BIC=2984.416, Time=2.594 seconds
Near non-invertible roots for order (0, 1, 1)(0, 1, 1, 12); setting score to inf (at least one inverse root too close to the border of the unit circle: 1.000)
Fit ARIMA(0,1,0)x(0,1,0,12) [intercept=False]; AIC=3220.036, BIC=3223.130, Time=0.098 seconds
Fit ARIMA(1,1,1)x(0,1,0,12) [intercept=True]; AIC=2792.444, BIC=2804.819, Time=0.378 seconds
Fit ARIMA(1,1,1)x(1,1,1,12) [intercept=True]; AIC=2740.509, BIC=2759.071, Time=7

In [21]:
split_date = pd.Timestamp('2020-06-07')
train,test=split_dataframe(split_date)
fig,future_forecast=forecast()
fig

Test shape: (47, 1)
Train shape: (130, 1)
Min date from train set: 2020-01-30
Max date from train set:2020-06-07
Min date from test set: 2020-06-07
Max date from test set: 2020-07-23
[ 268672.64956865  280779.61140867  292976.98379516  305422.09733053
  318463.80607885  331020.03442079  344145.58922526  358172.14159498
  372126.25366302  386107.31773757  400396.20412541  414809.29530142
  429777.28108998  445525.88582633  461673.96863713  477907.01037157
  495077.99058724  511642.15477837  529013.30818857  547148.5627955
  565421.97794073  583820.41784637  602443.01725173  621109.54956809
  640640.1181606   660797.4705857   681473.96691336  702402.53769818
  724072.69835336  745099.74698927  767021.93051737  789834.32311171
  812834.63492099  835816.34654574  859370.46108334  883192.4809834
  907674.57972092  932998.3996317   958730.56751502  984735.36172191
 1011559.08473014 1037874.28147272 1065018.08464584 1093089.64699684
 1121316.04138615 1149651.08248093 1178437.88550127]


In [22]:
mean_absolute_percentage_error(test,future_forecast)

5.57090250586919

In [23]:
future_forecast['Actual']=test.Total
future_forecast

Unnamed: 0_level_0,Prediction,Actual
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-06-07,268672.6,257486
2020-06-08,280779.6,265928
2020-06-09,292977.0,276146
2020-06-10,305422.1,286605
2020-06-11,318463.8,297535
2020-06-12,331020.0,308993
2020-06-13,344145.6,320922
2020-06-14,358172.1,332424
2020-06-15,372126.3,343091
2020-06-16,386107.3,354065
