In [81]:
import os
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Blanket-silence all Python warnings. This is not recommended in general;
# it is done here only to hide the warnings emitted while fitting SARIMAX.
warnings.simplefilter('ignore')

# Folder holding the JHU time-series CSV files (relative to the working directory).
dataDir = os.path.join('data', 'JHU')

# File names of the three global time series: confirmed, deaths, recovered.
confirmedFilename, deathsFilename, recoveredFilename = (
    'time_series_covid19_confirmed_global.csv',
    'time_series_covid19_deaths_global.csv',
    'time_series_covid19_recovered_global.csv',
)

In [41]:
# Function to get all three frames for a given country
def getCountryCovidFrDict(countryName, frDict=None):
    """Return the rows of every frame that belong to one country.

    Parameters
    ----------
    countryName : str
        Value matched exactly against the 'Country/Region' column.
    frDict : dict, optional
        Mapping of name -> DataFrame, each with a 'Country/Region' column.
        When omitted, the notebook-level ``covidFrDict`` is used, which keeps
        existing one-argument calls working unchanged.

    Returns
    -------
    dict
        Same keys as the input mapping; each value is the subset of rows
        whose 'Country/Region' equals ``countryName`` (possibly empty).
    """
    if frDict is None:
        # Looked up at call time, so the global only has to exist by the time
        # the function is called, not when it is defined.
        frDict = covidFrDict
    return {
        key: dataFr[dataFr['Country/Region'] == countryName]
        for key, dataFr in frDict.items()
    }

In [147]:
def meanAbsolutePercentageError(yTrueList, yPredList):
    """Mean absolute percentage error (MAPE), in percent.

    Computes ``100 * mean(|yTrue - yPred| / |yTrue|)`` over the paired items.

    Parameters
    ----------
    yTrueList : sequence of numbers
        Observed values. A zero entry makes its term undefined (division by zero).
    yPredList : sequence of numbers
        Predicted values, paired positionally with ``yTrueList``.

    Returns
    -------
    float
        MAPE as a percentage.
    """
    # Divide by the magnitude of the true value: dividing by the signed value
    # would turn the "absolute" percentage error negative for negative actuals.
    absPcErrorList = [
        np.abs(yTrue - yPred) / np.abs(yTrue)
        for yTrue, yPred in zip(yTrueList, yPredList)
    ]
    MAPE = 100*np.mean(absPcErrorList)
    return MAPE

def meanForecastError(yTrueList, yPredList):
    """Mean forecast error (forecast bias): the average of ``yTrue - yPred``.

    The sign is kept, so positive and negative errors cancel each other.
    Items are paired positionally via ``zip``.
    """
    residuals = []
    for actual, predicted in zip(yTrueList, yPredList):
        residuals.append(actual - predicted)
    return np.mean(residuals)

def meanAbsoluteError(yTrueList, yPredList):
    """Mean absolute error: the average of ``|yTrue - yPred|`` over paired items."""
    magnitudes = []
    for actual, predicted in zip(yTrueList, yPredList):
        magnitudes.append(np.abs(actual - predicted))
    MAE = np.mean(magnitudes)
    return MAE

def meanSquaredError(yTrueList, yPredList):
    """Mean squared error: the average of ``(yTrue - yPred)**2`` over paired items."""
    squaredResiduals = []
    for actual, predicted in zip(yTrueList, yPredList):
        squaredResiduals.append(np.square(actual - predicted))
    MSE = np.mean(squaredResiduals)
    return MSE

def rootMeanSquaredError(yTrueList, yPredList):
    """Root mean squared error: the square root of ``meanSquaredError``."""
    MSE = meanSquaredError(yTrueList, yPredList)
    RMSE = np.sqrt(MSE)
    return RMSE

def medianSymmetricAccuracy(yTrueList, yPredList):
    '''Median symmetric accuracy (MdSA), in percent.

    Computes ``100 * (exp(median(|ln(yPred / yTrue)|)) - 1)``, so a perfect
    forecast scores 0 and over- and under-prediction by the same factor score
    the same. Every ratio ``yPred / yTrue`` must be strictly positive for the
    logarithm to be defined.

    Reference:
    https://helda.helsinki.fi//bitstream/handle/10138/312261/2017SW001669.pdf?sequence=1
    '''
    logAccRatioList = [np.abs(np.log(yPred/yTrue)) for yTrue, yPred in zip(yTrueList, yPredList)]
    # The "- 1" applies to the exponentiated median, not to the exponent:
    # exp(median) - 1, rather than exp(median - 1).
    MdSA = 100*(np.exp(np.median(logAccRatioList)) - 1)
    return MdSA

In [61]:
# Read the three JHU CSV files into DataFrames, keyed by case type.
# The 'recovered' series is loaded as well ("Recovered is back again!").
covidFrDict = {}
for caseType, csvName in (
    ('confirmed', confirmedFilename),
    ('deaths', deathsFilename),
    ('recovered', recoveredFilename),
):
    covidFrDict[caseType] = pd.read_csv(os.path.join(dataDir, csvName))

In [62]:
# Keep only the rows whose 'Country/Region' is 'US' in each loaded frame.
countryCovidFrDict = getCountryCovidFrDict('US')

In [64]:
# Build {date column name: confirmed count} for the selected country.
confirmedFr = countryCovidFrDict['confirmed']
colNamesList = list(confirmedFr)


def _isDateColumn(colName):
    """Return True for headers made of three numeric parts joined by '/' (e.g. '1/22/20')."""
    parts = str(colName).split('/')
    return len(parts) == 3 and all(part.isdigit() for part in parts)


# Get list of dates.
# A plain "'/20' in colName" test also matches day-of-month 20 (e.g. '1/20/21')
# and misses every other date once the year is no longer 20, so the header
# shape is checked instead.
dateList = [colName for colName in colNamesList if _isDateColumn(colName)]

# NOTE(review): only the first matching row is read; if the selected country
# spans several rows, the remaining rows are ignored. Confirm this is intended.
dataList = [confirmedFr[date].iloc[0] for date in dateList]
dataDict = dict(zip(dateList, dataList))

In [74]:
# Keep only the dates whose case count is above the threshold (cases > 100).
# Despite its name, `daysSince` is compared against case counts, not days.
daysSince = 100
datesGreaterDaysSinceList = [
    date for date, nCases in dataDict.items() if nCases > daysSince
]
nCasesGreaterDaysSinceList = [dataDict[date] for date in datesGreaterDaysSinceList]

# Show the last retained count as the cell output.
nCasesGreaterDaysSinceList[-1]

1528568

In [150]:
# Build expanding-window samples: each X is the series up to (excluding) index
# yStart, and each y is the next nDaysInFuture values starting at yStart.
nDaysMin = 15       # length of the shortest training window
nDaysInFuture = 1   # number of values held out after each window

# Every split point that leaves a full nDaysInFuture-long target after it.
yStartList = list(range(nDaysMin, len(nCasesGreaterDaysSinceList) - nDaysInFuture + 1))

XList = [nCasesGreaterDaysSinceList[:yStart] for yStart in yStartList]
yList = [
    nCasesGreaterDaysSinceList[yStart:yStart + nDaysInFuture]
    for yStart in yStartList
]

In [151]:
def getPredictions(X, nDaysInFuture=5, invertible=False, pdqTuple = (1, 2, 2)):
    """Forecast the next ``nDaysInFuture`` values of ``X`` one step at a time.

    For every step a SARIMAX model of order ``pdqTuple`` is fitted on the
    history seen so far, a single one-step-ahead value is predicted, and that
    value is appended to the history before the next fit.

    Parameters
    ----------
    X : iterable of numbers
        Observed series. It is copied into a new list, so the caller's object
        is never modified and any iterable (list, tuple, array) is accepted.
    nDaysInFuture : int, optional
        Number of steps to forecast.
    invertible : bool, optional
        Passed to SARIMAX as ``enforce_invertibility``.
    pdqTuple : tuple of three ints, optional
        The ``(p, d, q)`` order handed to SARIMAX.

    Returns
    -------
    list
        One rounded prediction (``np.around``) per forecast step.
    """
    p, d, q = pdqTuple
    predList = []
    history = list(X)
    for _ in range(nDaysInFuture):
        # The two original branches differed only in this flag.
        model = SARIMAX(history, order=(p, d, q), enforce_invertibility=bool(invertible))
        model_fit = model.fit(disp=False)

        # Predict the single step right after the current end of the history.
        # NOTE(review): typ='levels' is kept from the original call; it looks
        # like an ARIMA-era argument. Confirm SARIMAX actually uses it.
        nextIdx = len(history)
        yhat = model_fit.predict(nextIdx, nextIdx, typ='levels')

        # Feed the unrounded prediction back in; report the rounded one.
        history.extend(yhat)
        predList.append(np.around(yhat[0]))

    return predList
# Quick check: forecast from the first training window with the default arguments.
getPredictions(XList[0])

[8820.0, 11992.0, 15871.0, 20397.0, 25517.0]

In [152]:
# Forecast every training window and keep the predictions next to the
# held-out values; each (window, target, forecast) triple is printed.
yPredsList = []
for history, target in zip(XList, yList):
    forecast = getPredictions(history, nDaysInFuture=nDaysInFuture)
    print(history, target, forecast)
    yPredsList.append(forecast)

[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421] [7783] [8820.0]
[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783] [13747] [9278.0]
[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783, 13747] [19273] [14143.0]
[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783, 13747, 19273] [25600] [24939.0]
[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783, 13747, 19273, 25600] [33276] [33674.0]
[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783, 13747, 19273, 25600, 33276] [43843] [41809.0]
[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783, 13747, 19273, 25600, 33276, 43843] [53736] [55473.0]
[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783, 13747, 19273, 25600, 33276, 43843, 53736] [65778] [64934.0]
[118, 

[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783, 13747, 19273, 25600, 33276, 43843, 53736, 65778, 83836, 101657, 121465, 140909, 161831, 188172, 213242, 243622, 275367, 308650, 336802, 366317, 397121, 428654, 462780, 496535, 526396, 555313, 580619, 607670, 636350, 667592, 699706, 732197, 758809, 784326, 811865, 840351] [869170] [869292.0]
[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783, 13747, 19273, 25600, 33276, 43843, 53736, 65778, 83836, 101657, 121465, 140909, 161831, 188172, 213242, 243622, 275367, 308650, 336802, 366317, 397121, 428654, 462780, 496535, 526396, 555313, 580619, 607670, 636350, 667592, 699706, 732197, 758809, 784326, 811865, 840351, 869170] [905358] [898014.0]
[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783, 13747, 19273, 25600, 33276, 43843, 53736, 65778, 83836, 101657, 121465, 140909, 161831, 188172, 213242, 243622, 275367, 308650, 336802, 36631

[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783, 13747, 19273, 25600, 33276, 43843, 53736, 65778, 83836, 101657, 121465, 140909, 161831, 188172, 213242, 243622, 275367, 308650, 336802, 366317, 397121, 428654, 462780, 496535, 526396, 555313, 580619, 607670, 636350, 667592, 699706, 732197, 758809, 784326, 811865, 840351, 869170, 905358, 938154, 965785, 988197, 1012582, 1039909, 1069424, 1103461, 1132539, 1158040, 1180375, 1204351, 1229331, 1257023, 1283929, 1309550, 1329260, 1347881, 1369376, 1390406, 1417774] [1442824] [1445675.0]
[118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632, 6421, 7783, 13747, 19273, 25600, 33276, 43843, 53736, 65778, 83836, 101657, 121465, 140909, 161831, 188172, 213242, 243622, 275367, 308650, 336802, 366317, 397121, 428654, 462780, 496535, 526396, 555313, 580619, 607670, 636350, 667592, 699706, 732197, 758809, 784326, 811865, 840351, 869170, 905358, 938154, 965785, 988197, 1012582, 1039909, 10694

In [153]:
# Evaluate metrics: one score per (prediction, target) window, per metric.
_windowPairs = list(zip(yList, yPredsList))

MAPEList = [meanAbsolutePercentageError(yTrue, yPred) for yTrue, yPred in _windowPairs]
MFEList = [meanForecastError(yTrue, yPred) for yTrue, yPred in _windowPairs]
MAEList = [meanAbsoluteError(yTrue, yPred) for yTrue, yPred in _windowPairs]
MSEList = [meanSquaredError(yTrue, yPred) for yTrue, yPred in _windowPairs]
RMSEList = [rootMeanSquaredError(yTrue, yPred) for yTrue, yPred in _windowPairs]

In [154]:
# Average each per-window metric, skipping the first two windows ([2:]).
# NOTE(review): the reason for skipping the first two windows is not stated.
# When each window holds a single target value (nDaysInFuture == 1), the
# per-window RMSE equals the per-window absolute error, so the two means match.
for label, perWindowScores in (
    ('Mean of MAPE:', MAPEList),
    ('Mean of MFE:', MFEList),
    ('Mean of MAE:', MAEList),
    ('Mean of MSE:', MSEList),
    ('Mean of RMSE:', RMSEList),
):
    print(label, np.mean(perWindowScores[2:]))

Mean of MAPE: 1.168012499805733
Mean of MFE: -215.80327868852459
Mean of MAE: 2701.377049180328
Mean of MSE: 11384674.655737706
Mean of RMSE: 2701.377049180328


In [24]:
# # Test Evaluation metric functions
# print('Mean Absolute Percentage Error:', meanAbsolutePercentageError([1, 2], [0.5, 2.5]))
# print('Mean Forecast Error or Forecast Bias:', meanForecastError([1,2], [0.5,2.5]))
# print('Mean Absolute Error:', meanAbsoluteError([1,2], [0.5,2.5]))
# print('Mean Squared Error:', meanSquaredError([1,2], [0.5,2.5]))
# print('Root Mean Squared Error:', rootMeanSquaredError([1,2], [0.5,2.5]))

Mean Absolute Percentage Error: 37.5
Mean Forecast Error or Forecast Bias: 0.0
Mean Absolute Error: 0.5
Mean Squared Error: 0.25
Root Mean Squared Error: 0.5
