In [1]:
import os
import numpy as np
import pandas as pd
import warnings

# --- Experiment configuration ------------------------------------------------

# Country whose rows are selected from the JHU tables.
countryName = 'US'

# Window length (number of consecutive observations per input sample) and the
# number of values carried by each observation.
nDaysMin = 3
nFeatures = 1

# Number of samples held out for validation and for testing.
nValid = 10
nTest = 10

# --- Input files -------------------------------------------------------------

dataDir = os.path.join('data', 'JHU')

confirmedFilename = 'time_series_covid19_confirmed_global.csv'
deathsFilename = 'time_series_covid19_deaths_global.csv'
recoveredFilename = 'time_series_covid19_recovered_global.csv'

# --- Warnings ----------------------------------------------------------------

# Silencing every warning is not recommended; it is done here only to suppress
# the warnings emitted by SARIMAX.
warnings.simplefilter(action='ignore')

In [2]:
# split a univariate sequence into samples
def split_sequence(sequence, n_steps):
    """Cut a univariate sequence into sliding-window samples.

    Sample ``k`` pairs the window ``sequence[k:k + n_steps]`` with the value
    that immediately follows it, ``sequence[k + n_steps]``.

    Parameters
    ----------
    sequence : indexable, sliceable sequence
        The observations, in order.
    n_steps : int
        Number of consecutive observations in each input window.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The stacked windows and their matching next values. Both arrays are
        empty when the sequence is too short to yield a single sample.
    """
    # A start index is usable only while a target value exists after its window.
    n_samples = min(len(sequence), len(sequence) - n_steps)
    windows = [sequence[start:start + n_steps] for start in range(n_samples)]
    targets = [sequence[start + n_steps] for start in range(n_samples)]
    return np.array(windows), np.array(targets)

In [3]:
def meanAbsolutePercentageError(yTrueList, yPredList):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``100 * mean(|yTrue - yPred| / |yTrue|)`` over the pairs formed by
    ``zip(yTrueList, yPredList)``; if the inputs differ in length the extra
    items of the longer one are ignored.

    The absolute error is divided by the *magnitude* of the true value, so a
    negative true value contributes a positive percentage rather than a
    negative one. The metric is undefined when a true value is 0; the result
    is then not finite.

    Parameters
    ----------
    yTrueList : iterable of numbers
        Observed values.
    yPredList : iterable of numbers
        Predicted values, in the same order.

    Returns
    -------
    float
        The MAPE as a percentage (e.g. 10.0 means 10 %).
    """
    absPcErrorList = [np.abs(yTrue - yPred) / np.abs(yTrue)
                      for yTrue, yPred in zip(yTrueList, yPredList)]
    MAPE = 100*np.mean(absPcErrorList)
    return MAPE

def meanForecastError(yTrueList, yPredList):
    """Return the mean forecast error: the average of ``yTrue - yPred``.

    The differences keep their sign, so a positive result means the
    predictions are on average below the true values. Pairs are formed with
    ``zip``.
    """
    signedErrors = []
    for actual, forecast in zip(yTrueList, yPredList):
        signedErrors.append(actual - forecast)
    return np.mean(signedErrors)

def meanAbsoluteError(yTrueList, yPredList):
    """Return the mean of ``|yTrue - yPred|`` over the zipped input pairs."""
    magnitudes = list(map(lambda pair: np.abs(pair[0] - pair[1]),
                          zip(yTrueList, yPredList)))
    return np.mean(magnitudes)

def meanSquaredError(yTrueList, yPredList):
    """Return the mean of ``(yTrue - yPred)**2`` over the zipped input pairs."""
    squaredErrors = []
    for actual, forecast in zip(yTrueList, yPredList):
        residual = actual - forecast
        squaredErrors.append(np.square(residual))
    return np.mean(squaredErrors)

def rootMeanSquaredError(yTrueList, yPredList):
    """Return the square root of ``meanSquaredError`` for the same inputs."""
    mse = meanSquaredError(yTrueList, yPredList)
    return np.sqrt(mse)

def medianSymmetricAccuracy(yTrueList, yPredList):
    """Return the median symmetric accuracy (MdSA), in percent.

    Computes ``100 * (exp(median(|ln(yPred / yTrue)|)) - 1)`` over the pairs
    formed by ``zip(yTrueList, yPredList)``. Because the absolute log ratio is
    used, predicting twice the true value and predicting half of it give the
    same score.

    Reference:
    https://helda.helsinki.fi//bitstream/handle/10138/312261/2017SW001669.pdf?sequence=1
    """
    absLogRatios = []
    for actual, forecast in zip(yTrueList, yPredList):
        absLogRatios.append(np.abs(np.log(forecast/actual)))
    medianLogRatio = np.median(absLogRatios)
    return 100*(np.exp(medianLogRatio)-1)

In [4]:
# Function to get all three frames for a given country
def getCountryCovidFrDict(countryName, frDict=None):
    """Return, for every data frame, only the rows of one country.

    Parameters
    ----------
    countryName : str
        Value compared for equality with the 'Country/Region' column.
    frDict : dict, optional
        Mapping of label -> DataFrame to filter. When omitted, the
        module-level ``covidFrDict`` is used, which must already exist at
        call time.

    Returns
    -------
    dict
        A new dict with the same keys; each value contains the rows of the
        corresponding frame whose 'Country/Region' equals ``countryName``
        (possibly none).
    """
    if frDict is None:
        # Backward-compatible default: fall back to the notebook-wide frames.
        frDict = covidFrDict
    return {key: dataFr[dataFr['Country/Region'] == countryName]
            for key, dataFr in frDict.items()}

In [5]:
# Load all 3 csv files
covidFrDict = {
    'confirmed': pd.read_csv(os.path.join(dataDir, confirmedFilename)),
    'deaths': pd.read_csv(os.path.join(dataDir, deathsFilename)),
    # Recovered is back again!
    'recovered': pd.read_csv(os.path.join(dataDir, recoveredFilename)),
}

countryCovidFrDict = getCountryCovidFrDict(countryName)
confirmedFr = countryCovidFrDict['confirmed']
if confirmedFr.empty:
    raise ValueError('No rows found for country {!r} in {}'.format(countryName, confirmedFilename))

# Get list of dates.
# A column is a date column when its name parses as month/day/2-digit-year.
# (Testing for the substring '/20' would drop most dates after 2020.)
colNamesList = list(confirmedFr)
dateList = [colName for colName in colNamesList
            if pd.notna(pd.to_datetime(colName, format='%m/%d/%y', errors='coerce'))]
# Sum over all rows of the country so that countries reported as several
# rows are totalled; with a single row this is that row's value.
dataList = [confirmedFr[date].sum() for date in dateList]
dataDict = dict(zip(dateList, dataList))

# Get time series for cases > 100 only
# NOTE: despite its name, daysSince is a case-count threshold, not a number of days.
daysSince = 100
nCasesGreaterDaysSinceList = []
datesGreaterDaysSinceList = []

for date, nCases in dataDict.items():
    if nCases > daysSince:
        datesGreaterDaysSinceList.append(date)
        nCasesGreaterDaysSinceList.append(nCases)

XList, yList = split_sequence(nCasesGreaterDaysSinceList, nDaysMin)

# Chronological split: training samples first, then validation, then test.
nTrain = len(XList) - (nValid + nTest)
if nTrain <= 0:
    raise ValueError(
        'Only {} samples available; need more than nValid + nTest = {}'.format(
            len(XList), nValid + nTest))
validEnd = nTrain + nValid

XTrainList = XList[:nTrain]
XValidList = XList[nTrain:validEnd]
XTestList = XList[validEnd:]

yTrain = yList[:nTrain]
yValid = yList[nTrain:validEnd]
yTest = yList[validEnd:]

print('Total size of data points for LSTM:', len(yList))
print('Size of training set:', len(yTrain))
print('Size of validation set:', len(yValid))
print('Size of test set:', len(yTest))

# Convert from list to matrix: add the trailing feature axis, giving
# (samples, window length, nFeatures).
XTrain = XTrainList.reshape((XTrainList.shape[0], XTrainList.shape[1], nFeatures))
XValid = XValidList.reshape((XValidList.shape[0], XValidList.shape[1], nFeatures))
XTest = XTestList.reshape((XTestList.shape[0], XTestList.shape[1], nFeatures))

print(XTrain.shape)

Total size of data points for LSTM: 81
Size of training set: 61
Size of validation set: 10
Size of test set: 10
(61, 3, 1)


# Vanilla LSTM

In [6]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Hyper-parameters of the single-layer ("vanilla") LSTM experiment.
nNeurons = 100
nFeatures = 1
nSeeds = 100         # seeds 0 .. nSeeds-1 are tried
nEpochs = 1000
learningRate = 0.1


def buildVanillaLSTM():
    """Return a new compiled model: one LSTM layer followed by Dense(1).

    The input shape is (nDaysMin, nFeatures); the model is compiled with Adam
    and the mean-squared-error loss.
    """
    net = Sequential()
    net.add(LSTM(nNeurons, activation='relu', input_shape=(nDaysMin, nFeatures)))
    net.add(Dense(1))
    net.compile(optimizer=Adam(learning_rate=learningRate), loss='mse')
    return net


def fitVanillaLSTM(seed):
    """Seed TensorFlow with ``seed`` and train a fresh model on the training split."""
    # Keras keeps global state for every model built; release it so the
    # throw-away models of the seed search do not accumulate in memory.
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed=seed)
    net = buildVanillaLSTM()
    net.fit(XTrain, yTrain, epochs=nEpochs, verbose=0)
    return net


# Seed search: keep the seed with the lowest validation MAPE.
# Start from +inf (not 100) so that a best seed is selected even when every
# model scores a MAPE of 100 or more.
bestValidMAPE = float('inf')
bestSeed = -1
for seed in range(nSeeds):
    model = fitVanillaLSTM(seed)

    yPred = list(model.predict(XValid, verbose=0))
    yPredList = [row[0] for row in yPred]

    MAPE = meanAbsolutePercentageError(yValid, yPredList)
    print(seed, MAPE)
    if MAPE < bestValidMAPE:
        print('Updating best MAPE to {}...'.format(MAPE))
        bestValidMAPE = MAPE
        print('Updating best seed to {}...'.format(seed))
        bestSeed = seed

if bestSeed < 0:
    # Only reachable when no seed gave a finite validation MAPE.
    raise RuntimeError('Seed search failed: no model produced a finite validation MAPE')

# NOTE(review): the test metrics below come from a *retrained* model. It equals
# the validated one only if training is deterministic for a fixed TF seed --
# confirm, or keep the best model from the search instead.
print('Training model with best seed...')
model = fitVanillaLSTM(bestSeed)

yPred = list(model.predict(XTest, verbose=0))
yPredList = [row[0] for row in yPred]

MAPE = meanAbsolutePercentageError(yTest, yPredList)
print('Test MAPE:', MAPE)
MdSA = medianSymmetricAccuracy(yTest, yPredList)
print('Test MdSA:', MdSA)

0 2.087865985147753
Updating best MAPE to 2.087865985147753...
Updating best seed to 0...
1 99.98549541772792
2 0.5963931127753913
Updating best MAPE to 0.5963931127753913...
Updating best seed to 2...
3 4.972414504324383
4 99.99299011341176
5 99.99406960910214
6 1.9009598388567153
7 2.0089585491198902
8 4.503383090280273
9 5.4620877682844595
10 1.6925085153004313
11 4.273439534645695
12 2.0228977371491803
13 1.8896666929489832
14 0.839376575531618
15 1.071065836364381
16 5.4111572752010195
17 2.7787870382620183
18 0.4257191786300121
Updating best MAPE to 0.4257191786300121...
Updating best seed to 18...
19 2.8288221600807284
20 2.1422684066849844
21 1.1580277268061798
22 99.98520565991707
23 0.19314019955399958
Updating best MAPE to 0.19314019955399958...
Updating best seed to 23...
24 14.761905968885337
25 0.50208534535255
26 2.6381898943196935
27 99.98536013788183
28 2.605321407384977
29 0.28404916617000453
30 0.18464760352696896
Updating best MAPE to 0.18464760352696896...
Updating

# Stacked LSTM

In [7]:
# Hyper-parameters of the stacked (two-layer) LSTM experiment.
nNeurons = 50
nFeatures = 1
nSeeds = 100         # seeds 0 .. nSeeds-1 are tried
nEpochs = 1000
learningRate = 0.1


def buildStackedLSTM():
    """Return a new compiled model: two stacked LSTM layers followed by Dense(1).

    The first LSTM returns its full output sequence so the second LSTM can
    consume it. The input shape is (nDaysMin, nFeatures); the model is
    compiled with Adam and the mean-squared-error loss.
    """
    net = Sequential()
    net.add(LSTM(nNeurons, activation='relu', return_sequences=True, input_shape=(nDaysMin, nFeatures)))
    net.add(LSTM(nNeurons, activation='relu'))
    net.add(Dense(1))
    net.compile(optimizer=Adam(learning_rate=learningRate), loss='mse')
    return net


def fitStackedLSTM(seed):
    """Seed TensorFlow with ``seed`` and train a fresh model on the training split."""
    # Keras keeps global state for every model built; release it so the
    # throw-away models of the seed search do not accumulate in memory.
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed=seed)
    net = buildStackedLSTM()
    net.fit(XTrain, yTrain, epochs=nEpochs, verbose=0)
    return net


# Seed search: keep the seed with the lowest validation MAPE.
# Start from +inf (not 100) so that a best seed is selected even when every
# model scores a MAPE of 100 or more.
bestValidMAPE = float('inf')
bestSeed = -1
for seed in range(nSeeds):
    model = fitStackedLSTM(seed)

    yPred = list(model.predict(XValid, verbose=0))
    yPredList = [row[0] for row in yPred]

    MAPE = meanAbsolutePercentageError(yValid, yPredList)
    print(seed, MAPE)
    if MAPE < bestValidMAPE:
        print('Updating best MAPE to {}...'.format(MAPE))
        bestValidMAPE = MAPE
        print('Updating best seed to {}...'.format(seed))
        bestSeed = seed

if bestSeed < 0:
    # Only reachable when no seed gave a finite validation MAPE.
    raise RuntimeError('Seed search failed: no model produced a finite validation MAPE')

# NOTE(review): the test metrics below come from a *retrained* model. It equals
# the validated one only if training is deterministic for a fixed TF seed --
# confirm, or keep the best model from the search instead.
print('Training model with best seed...')
model = fitStackedLSTM(bestSeed)

yPred = list(model.predict(XTest, verbose=0))
yPredList = [row[0] for row in yPred]

MAPE = meanAbsolutePercentageError(yTest, yPredList)
print('Test MAPE:', MAPE)
MdSA = medianSymmetricAccuracy(yTest, yPredList)
print('Test MdSA:', MdSA)

0 0.5745333049973439
Updating best MAPE to 0.5745333049973439...
Updating best seed to 0...
1 2.5552016050933024
2 2.3109116571125776
3 67.71869188504097
4 6.02835872238124
5 67.41853987658068
6 6.654431874096929
7 2.2520955914880645
8 7.023983390395669
9 0.356915382740839
Updating best MAPE to 0.356915382740839...
Updating best seed to 9...
10 1.7052422712876931
11 3.150882992603669
12 5.338710235676633
13 0.5691505848796378
14 2.4485353918107333
15 9.188548178695996
16 1.88770647881831
17 0.41557608264572277
18 67.54372674001175
19 6.292891362001843
20 66.98108017928817
21 0.37297786071070427
22 1.323859840472548
23 99.98524162180091
24 4.560117313425004
25 67.46879353058151
26 2.2844244697657348
27 2.1987187745035572
28 0.7674106132314238
29 67.72799091859352
30 2.117788233207981
31 4.790242210742071
32 2.8137942713057327
33 0.8537301693143479
34 67.32671806730635
35 67.18816691667196
36 2.315054904262495
37 0.5604823997956878
38 67.56904324762607
39 67.66622294880878
40 67.77276256

# Bidirectional LSTM

In [8]:
from tensorflow.keras.layers import Bidirectional

# Hyper-parameters of the bidirectional LSTM experiment.
nNeurons = 50
nFeatures = 1
nSeeds = 100         # seeds 0 .. nSeeds-1 are tried
nEpochs = 1000
learningRate = 0.1


def buildBidirectionalLSTM():
    """Return a new compiled model: a Bidirectional-wrapped LSTM followed by Dense(1).

    The input shape is (nDaysMin, nFeatures); the model is compiled with Adam
    and the mean-squared-error loss.
    """
    net = Sequential()
    net.add(Bidirectional(LSTM(nNeurons, activation='relu'), input_shape=(nDaysMin, nFeatures)))
    net.add(Dense(1))
    net.compile(optimizer=Adam(learning_rate=learningRate), loss='mse')
    return net


def fitBidirectionalLSTM(seed):
    """Seed TensorFlow with ``seed`` and train a fresh model on the training split."""
    # Keras keeps global state for every model built; release it so the
    # throw-away models of the seed search do not accumulate in memory.
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed=seed)
    net = buildBidirectionalLSTM()
    net.fit(XTrain, yTrain, epochs=nEpochs, verbose=0)
    return net


# Seed search: keep the seed with the lowest validation MAPE.
# Start from +inf (not 100) so that a best seed is selected even when every
# model scores a MAPE of 100 or more.
bestValidMAPE = float('inf')
bestSeed = -1
for seed in range(nSeeds):
    model = fitBidirectionalLSTM(seed)

    yPred = list(model.predict(XValid, verbose=0))
    yPredList = [row[0] for row in yPred]

    MAPE = meanAbsolutePercentageError(yValid, yPredList)
    print(seed, MAPE)
    if MAPE < bestValidMAPE:
        print('Updating best MAPE to {}...'.format(MAPE))
        bestValidMAPE = MAPE
        print('Updating best seed to {}...'.format(seed))
        bestSeed = seed

if bestSeed < 0:
    # Only reachable when no seed gave a finite validation MAPE.
    raise RuntimeError('Seed search failed: no model produced a finite validation MAPE')

# NOTE(review): the test metrics below come from a *retrained* model. It equals
# the validated one only if training is deterministic for a fixed TF seed --
# confirm, or keep the best model from the search instead.
print('Training model with best seed...')
model = fitBidirectionalLSTM(bestSeed)

yPred = list(model.predict(XTest, verbose=0))
yPredList = [row[0] for row in yPred]

MAPE = meanAbsolutePercentageError(yTest, yPredList)
print('Test MAPE:', MAPE)
MdSA = medianSymmetricAccuracy(yTest, yPredList)
print('Test MdSA:', MdSA)

0 0.23021782436421634
Updating best MAPE to 0.23021782436421634...
Updating best seed to 0...
1 2.625832391546587
2 2.0970733474760426
3 1.6786897399230092
4 0.46061842494730976
5 3.4873196022019366
6 4.464264766695389
7 1.7649109199292432
8 4.1326624919323836
9 2.234846586880339
10 6.716710546731941
11 2.169980973471276
12 99.99137660186955
13 1.345348179531578
14 4.100808179047807
15 4.482042124310602
16 2.0841590238337666
17 0.2527890391054501
18 2.1547003837181067
19 1.504656492383071
20 4.8406444980894
21 2.0585992272421363
22 1.7604670932823787
23 2.311487940449263
24 3.9283292563058057
25 0.7837239882308417
26 2.7488512609502513
27 0.6429144238502856
28 4.551569793612543
29 1.748000203514963
30 4.365119603384464
31 1.8960899003578724
32 0.35882920261526396
33 0.2924196852830445
34 0.9811873726674876
35 1.0994213621721383
36 0.8204036333774226
37 2.1620804836437024
38 2.25373219736822
39 0.19559088997879603
Updating best MAPE to 0.19559088997879603...
Updating best seed to 39...


# CNN LSTM

In [12]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Flatten
# TimeDistributed is taken from tensorflow.keras.layers like every other layer
# here, instead of from the tensorflow.compat.v1 alias.
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D

# Each window of nDaysMin values is split into nSeq subsequences of nSteps
# values each (here a single subsequence holding the whole window).
nSeq = 1
nSteps = nDaysMin // nSeq
if nSeq * nSteps != nDaysMin:
    raise ValueError('nDaysMin ({}) must be divisible by nSeq ({})'.format(nDaysMin, nSeq))

# Hyper-parameters of the CNN-LSTM experiment.
nNeurons = 50
nFeatures = 1
nFilters = 64
nSeeds = 100         # seeds 0 .. nSeeds-1 are tried
nEpochs = 1000
learningRate = 0.1

# Reshape input to (samples, nSeq, nSteps, nFeatures)
XTrainCNN = XTrainList.reshape((XTrainList.shape[0], nSeq, nSteps, nFeatures))
XValidCNN = XValidList.reshape((XValidList.shape[0], nSeq, nSteps, nFeatures))
XTestCNN = XTestList.reshape((XTestList.shape[0], nSeq, nSteps, nFeatures))


def buildCnnLSTM():
    """Return a new compiled CNN-LSTM model.

    Conv1D, MaxPooling1D and Flatten are each wrapped in TimeDistributed, so
    they are applied to every subsequence; the resulting sequence is fed to an
    LSTM followed by Dense(1). Compiled with Adam and the mean-squared-error
    loss.
    """
    net = Sequential()
    net.add(TimeDistributed(Conv1D(filters=nFilters, kernel_size=1, activation='relu'), input_shape=(None, nSteps, nFeatures)))
    net.add(TimeDistributed(MaxPooling1D(pool_size=2)))
    net.add(TimeDistributed(Flatten()))
    net.add(LSTM(nNeurons, activation='relu'))
    net.add(Dense(1))
    net.compile(optimizer=Adam(learning_rate=learningRate), loss='mse')
    return net


def fitCnnLSTM(seed):
    """Seed TensorFlow with ``seed`` and train a fresh model on the training split."""
    # Keras keeps global state for every model built; release it so the
    # throw-away models of the seed search do not accumulate in memory.
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed=seed)
    net = buildCnnLSTM()
    net.fit(XTrainCNN, yTrain, epochs=nEpochs, verbose=0)
    return net


# Seed search: keep the seed with the lowest validation MAPE.
# Start from +inf (not 100) so that a best seed is selected even when every
# model scores a MAPE of 100 or more.
bestValidMAPE = float('inf')
bestSeed = -1
for seed in range(nSeeds):
    model = fitCnnLSTM(seed)

    yPred = list(model.predict(XValidCNN, verbose=0))
    yPredList = [row[0] for row in yPred]

    MAPE = meanAbsolutePercentageError(yValid, yPredList)
    print(seed, MAPE)
    if MAPE < bestValidMAPE:
        print('Updating best MAPE to {}...'.format(MAPE))
        bestValidMAPE = MAPE
        print('Updating best seed to {}...'.format(seed))
        bestSeed = seed

if bestSeed < 0:
    # Only reachable when no seed gave a finite validation MAPE.
    raise RuntimeError('Seed search failed: no model produced a finite validation MAPE')

# NOTE(review): the test metrics below come from a *retrained* model. It equals
# the validated one only if training is deterministic for a fixed TF seed --
# confirm, or keep the best model from the search instead.
print('Training model with best seed...')
model = fitCnnLSTM(bestSeed)

yPred = list(model.predict(XTestCNN, verbose=0))
yPredList = [row[0] for row in yPred]

MAPE = meanAbsolutePercentageError(yTest, yPredList)
print('Test MAPE:', MAPE)
MdSA = medianSymmetricAccuracy(yTest, yPredList)
print('Test MdSA:', MdSA)

0 1.6965639381262614
Updating best MAPE to 1.6965639381262614...
Updating best seed to 0...
1 2.880660853297173
2 6.802543917376595
3 1.7048536936129233
4 5.34822870711797
5 2.8766189345830853
6 3.537265054935385
7 2.5027957257678044
8 3.5715996972892765
9 0.6487244191915936
Updating best MAPE to 0.6487244191915936...
Updating best seed to 9...
10 0.7089852164248656
11 3.37935656648993
12 1.659604714300452
13 2.7236958175260866
14 2.4208249989656903
15 10.133283057439357
16 4.071340267134036
17 4.805823629413777
18 7.1000262983219375
19 3.1715758196352306
20 2.809473120683937
21 4.247119090537783
22 2.708257640313356
23 2.851562773898192
24 1.7912185980323612
25 0.8518325728989586
26 4.447860420515186
27 2.737499784768355
28 2.9537146156054344
29 2.765516178255032
30 4.519598561717604
31 4.949060793246502
32 2.917367247615214
33 7.745027779838658
34 1.6773865874804765
35 3.0921979741880046
36 4.397896549101059
37 2.131653406963912
38 1.711905260835515
39 3.487081855039657
40 1.49556598

# ConvLSTM

In [14]:
from tensorflow.keras.layers import ConvLSTM2D

# Each window of nDaysMin values is split into nSeq subsequences of nSteps
# values each (here a single subsequence holding the whole window).
# Each input is rows x columns, we have rows=1 and columns=nSteps
nSeq = 1
nSteps = nDaysMin // nSeq
if nSeq * nSteps != nDaysMin:
    raise ValueError('nDaysMin ({}) must be divisible by nSeq ({})'.format(nDaysMin, nSeq))

# Hyper-parameters of the ConvLSTM experiment.
nNeurons = 50        # not used by this model; kept so the name stays defined
nFeatures = 1
nFilters = 64
nSeeds = 100         # seeds 0 .. nSeeds-1 are tried
nEpochs = 1000
learningRate = 0.1

# Reshape input to (samples, nSeq, rows=1, nSteps, nFeatures)
XTrainConv = XTrainList.reshape((XTrainList.shape[0], nSeq, 1, nSteps, nFeatures))
XValidConv = XValidList.reshape((XValidList.shape[0], nSeq, 1, nSteps, nFeatures))
XTestConv = XTestList.reshape((XTestList.shape[0], nSeq, 1, nSteps, nFeatures))


def buildConvLSTM():
    """Return a new compiled model: ConvLSTM2D, Flatten, then Dense(1).

    The input shape is (nSeq, 1, nSteps, nFeatures); the model is compiled
    with Adam and the mean-squared-error loss.
    """
    net = Sequential()
    net.add(ConvLSTM2D(filters=nFilters, kernel_size=(1,2), activation='relu', input_shape=(nSeq, 1, nSteps, nFeatures)))
    net.add(Flatten())
    net.add(Dense(1))
    net.compile(optimizer=Adam(learning_rate=learningRate), loss='mse')
    return net


def fitConvLSTM(seed):
    """Seed TensorFlow with ``seed`` and train a fresh model on the training split."""
    # Keras keeps global state for every model built; release it so the
    # throw-away models of the seed search do not accumulate in memory.
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed=seed)
    net = buildConvLSTM()
    net.fit(XTrainConv, yTrain, epochs=nEpochs, verbose=0)
    return net


# Seed search: keep the seed with the lowest validation MAPE.
# Start from +inf (not 100) so that a best seed is selected even when every
# model scores a MAPE of 100 or more.
bestValidMAPE = float('inf')
bestSeed = -1
for seed in range(nSeeds):
    model = fitConvLSTM(seed)

    yPred = list(model.predict(XValidConv, verbose=0))
    yPredList = [row[0] for row in yPred]

    MAPE = meanAbsolutePercentageError(yValid, yPredList)
    print(seed, MAPE)
    if MAPE < bestValidMAPE:
        print('Updating best MAPE to {}...'.format(MAPE))
        bestValidMAPE = MAPE
        print('Updating best seed to {}...'.format(seed))
        bestSeed = seed

if bestSeed < 0:
    # Only reachable when no seed gave a finite validation MAPE.
    raise RuntimeError('Seed search failed: no model produced a finite validation MAPE')

# NOTE(review): the test metrics below come from a *retrained* model. It equals
# the validated one only if training is deterministic for a fixed TF seed --
# confirm, or keep the best model from the search instead.
print('Training model with best seed...')
model = fitConvLSTM(bestSeed)

yPred = list(model.predict(XTestConv, verbose=0))
yPredList = [row[0] for row in yPred]

MAPE = meanAbsolutePercentageError(yTest, yPredList)
print('Test MAPE:', MAPE)
MdSA = medianSymmetricAccuracy(yTest, yPredList)
print('Test MdSA:', MdSA)

0 1.45661411713753
Updating best MAPE to 1.45661411713753...
Updating best seed to 0...
1 0.18593917203549287
Updating best MAPE to 0.18593917203549287...
Updating best seed to 1...
2 0.27139314113359553
3 1.6523095797374168
4 0.770299875389173
5 0.60128883003599
6 4.1484729412889925
7 1.5573519932730897
8 0.170141643694705
Updating best MAPE to 0.170141643694705...
Updating best seed to 8...
9 0.3600757804351444
10 0.44161135592547884
11 3.2376299955136503
12 0.25767125775207517
13 0.26369756185835563
14 0.3907917945356878
15 0.183170570413176
16 3.7609325161320046
17 0.38396848966605485
18 1.2061216265822259
19 0.49098117583565354
20 2.323665503784255
21 2.6096478409539463
22 0.1732740579273751
23 0.4720361766078039
24 1.8303725038574785
25 0.5722478183537124
26 0.3476795521527752
27 1.2796275578634626
28 0.7420454314637367
29 0.3407762623188517
30 0.7817822812085415
31 0.528898486446352
32 1.368837157463855
33 0.7052203138295444
34 0.20328448586808986
35 0.17197781968706055
36 1.248