In [1]:
import os
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# This is not recommended but I am doing this to suppress warnings from SARIMAX
warnings.simplefilter('ignore')

dataDir = os.path.join('data', 'JHU', 'upto07082020_forPublication')

countryName = 'US'
nDaysMin = 15
k = 1

pdqTupleGlobal = (1, 2, 2)


# confirmedFilename = 'time_series_covid19_confirmed_global.csv'
# deathsFilename = 'time_series_covid19_deaths_global.csv'
# recoveredFilename = 'time_series_covid19_recovered_global.csv'

confirmedFilename = 'https://raw.githubusercontent.com/arkobarman/covid-19_timeSeriesAnalysis/master/data/JHU/upto07082020_forPublication/time_series_covid19_confirmed_global.csv'
deathsFilename = 'https://raw.githubusercontent.com/arkobarman/covid-19_timeSeriesAnalysis/master/data/JHU/upto07082020_forPublication/time_series_covid19_deaths_global.csv'
recoveredFilename = 'https://raw.githubusercontent.com/arkobarman/covid-19_timeSeriesAnalysis/master/data/JHU/upto07082020_forPublication/time_series_covid19_recovered_global.csv'

In [2]:
# Function to get all three frames for a given country
def getCountryCovidFrDict(countryName):
    countryCovidFrDict = {}
    for key in covidFrDict.keys():
        dataFr = covidFrDict[key]
        countryCovidFrDict[key] = dataFr[dataFr['Country/Region'] == countryName]
    return countryCovidFrDict

In [3]:
# split a univariate sequence into samples
def split_sequence(sequence, n_steps, k):
    X, y = list(), list()
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the sequence
        if end_ix + k >= len(sequence):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix:end_ix+k]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [4]:
def meanAbsolutePercentageError(yTrueList, yPredList):
    absErrorList = [np.abs(yTrue - yPred) for yTrue, yPred in zip(yTrueList, yPredList)]
    absPcErrorList = [absError/yTrue for absError, yTrue in zip(absErrorList, yTrueList)]
    MAPE = 100*np.mean(absPcErrorList)
    return MAPE

def meanAbsolutePercentageError_kDay(yTrueListList, yPredListList):
    # Store true and predictions for day 1 in a list, day 2 in a list and so on
    # Keep each list of these lists in a respective dict with key as day #
    yTrueForDayK = {}
    yPredForDayK = {}
    for i in range(len(yTrueListList[0])):
        yTrueForDayK[i] = []
        yPredForDayK[i] = []
    for yTrueList, yPredList in zip(yTrueListList, yPredListList):
        for i in range(len(yTrueList)):
            yTrueForDayK[i].append(yTrueList[i])
            yPredForDayK[i].append(yPredList[i])
            
    # Get MAPE for each day in a list
    MAPEList = []
    for i in yTrueForDayK.keys():
        MAPEList.append(meanAbsolutePercentageError(yTrueForDayK[i], yPredForDayK[i]))
    return np.mean(MAPEList)

def meanForecastError(yTrueList, yPredList):
    forecastErrors = [yTrue - yPred for yTrue, yPred in zip(yTrueList, yPredList)]
    MFE = np.mean(forecastErrors)
    return MFE

def meanAbsoluteError(yTrueList, yPredList):
    absErrorList = [np.abs(yTrue - yPred) for yTrue, yPred in zip(yTrueList, yPredList)]
    return np.mean(absErrorList)

def meanSquaredError(yTrueList, yPredList):
    sqErrorList = [np.square(yTrue - yPred) for yTrue, yPred in zip(yTrueList, yPredList)]
    return np.mean(sqErrorList)

def rootMeanSquaredError(yTrueList, yPredList):
    return np.sqrt(meanSquaredError(yTrueList, yPredList))

def medianSymmetricAccuracy(yTrueList, yPredList):
    '''https://helda.helsinki.fi//bitstream/handle/10138/312261/2017SW001669.pdf?sequence=1'''
    logAccRatioList = [np.abs(np.log(yPred/yTrue)) for yTrue, yPred in zip(yTrueList, yPredList)]
    MdSA = 100*(np.exp(np.median(logAccRatioList))-1)
    return MdSA

def medianSymmetricAccuracy_kDay(yTrueListList, yPredListList):
    # Store true and predictions for day 1 in a list, day 2 in a list and so on
    # Keep each list of these lists in a respective dict with key as day #
    yTrueForDayK = {}
    yPredForDayK = {}
    for i in range(len(yTrueListList[0])):
        yTrueForDayK[i] = []
        yPredForDayK[i] = []
    for yTrueList, yPredList in zip(yTrueListList, yPredListList):
        for i in range(len(yTrueList)):
            yTrueForDayK[i].append(yTrueList[i])
            yPredForDayK[i].append(yPredList[i])
    # Get MdSA for each day in a list
    MdSAList = []
    for i in yTrueForDayK.keys():
        MdSAList.append(medianSymmetricAccuracy(yTrueForDayK[i], yPredForDayK[i]))
    return(np.mean(MdSAList))

In [5]:
# Load all 3 csv files
covidFrDict = {}
# covidFrDict['confirmed'] = pd.read_csv(os.path.join(dataDir, confirmedFilename))
# covidFrDict['deaths'] = pd.read_csv(os.path.join(dataDir, deathsFilename))

covidFrDict['confirmed'] = pd.read_csv(confirmedFilename)
covidFrDict['deaths'] = pd.read_csv(deathsFilename)

# Recovered is back again!
covidFrDict['recovered'] = pd.read_csv(recoveredFilename)

countryCovidFrDict = getCountryCovidFrDict(countryName)

In [6]:
# USCovidFrDict['confirmed']

# Get list of dates
colNamesList = list(countryCovidFrDict['confirmed'])
dateList = [colName for colName in colNamesList if '/20' in colName]
dataList = [countryCovidFrDict['confirmed'][date].iloc[0] for date in dateList]
dataDict = dict(zip(dateList, dataList))

In [7]:
# Get time series for cases > 100 only
daysSince = 100
nCasesGreaterDaysSinceList = []
datesGreaterDaysSinceList = []

for key in dataDict.keys():
    if dataDict[key] > daysSince:
        datesGreaterDaysSinceList.append(key)
        nCasesGreaterDaysSinceList.append(dataDict[key])
nCasesGreaterDaysSinceList[-1]

3054699

In [8]:
# Get time series for cases > 100 only
daysSince = 100
nCasesGreaterDaysSinceList = []
datesGreaterDaysSinceList = []

for key in dataDict.keys():
    if dataDict[key] > daysSince:
        datesGreaterDaysSinceList.append(key)
        nCasesGreaterDaysSinceList.append(dataDict[key])

XList, yList = split_sequence(nCasesGreaterDaysSinceList, nDaysMin, k)

In [9]:
def getPredictions(X, nDaysInFuture=5, invertible=False, pdqTuple = (1, 2, 2)):
    p, d, q = pdqTuple
    predList = []
    Xcopy = X.copy()
    for i in range(nDaysInFuture):
        if invertible:
            model = SARIMAX(Xcopy, order=(p, d, q))

            model_fit = model.fit(disp=False)

            # make prediction
            yhat = model_fit.predict(len(Xcopy), len(Xcopy), typ='levels')
        else:
            model = SARIMAX(Xcopy, order=(p, d, q), enforce_invertibility=False)

            model_fit = model.fit(disp=False)

            # make prediction
            yhat = model_fit.predict(len(Xcopy), len(Xcopy), typ='levels')
        Xcopy = np.append(Xcopy, yhat)
        predList.append(np.around(yhat[0]))
        
    return predList
getPredictions(XList[0], nDaysInFuture=k, pdqTuple=pdqTupleGlobal)

[12058.0]

In [10]:
yPredsList = []
for X, y in zip(XList, yList):
    yPred = getPredictions(X, nDaysInFuture=k, pdqTuple=pdqTupleGlobal)
    yPredsList.append(yPred)
#     print(X, y, yPred)

In [11]:
MAPE = meanAbsolutePercentageError_kDay(yList[-10:], yPredsList[-10:])
print('kMAPE:', MAPE)
MdSA = medianSymmetricAccuracy_kDay(yList[-10:], yPredsList[-10:])
print('kMdSA:', MdSA)

kMAPE: 0.18328873108728028
kMdSA: 0.15495738259889702


In [12]:
# # Evaluate metrics
# MAPEList = []
# MFEList = []
# MAEList = []
# MSEList = []
# RMSEList = []
# MdSAList = []
# for yPredList, yTrueList in zip(yPredsList, yList):
#     MAPEList.append(meanAbsolutePercentageError(yTrueList, yPredList))
#     MFEList.append(meanForecastError(yTrueList, yPredList))
#     MAEList.append(meanAbsoluteError(yTrueList, yPredList))
#     MSEList.append(meanSquaredError(yTrueList, yPredList))
#     RMSEList.append(rootMeanSquaredError(yTrueList, yPredList))
#     MdSAList.append(medianSymmetricAccuracy(yTrueList, yPredList))

In [13]:
# print('Mean of MAPE:', np.mean(MAPEList[2:]))
# print('Mean of MFE:', np.mean(MFEList[2:]))
# print('Mean of MAE:', np.mean(MAEList[2:]))
# print('Mean of MSE:', np.mean(MSEList[2:]))
# print('Mean of RMSE:', np.mean(RMSEList[2:]))
# print('Mean of MdSA:', np.mean(MdSAList[2:]))

In [14]:
# # Test Evaluation metric functions
# print('Mean Absolute Percentage Error:', meanAbsolutePercentageError([1, 2], [0.5, 2.5]))
# print('Mean Forecast Error or Forecast Bias:', meanForecastError([1,2], [0.5,2.5]))
# print('Mean Absolute Error:', meanAbsoluteError([1,2], [0.5,2.5]))
# print('Mean Squared Error:', meanSquaredError([1,2], [0.5,2.5]))
# print('Root Mean Squared Error:', rootMeanSquaredError([1,2], [0.5,2.5]))