In [1]:
import os
import numpy as np
import pandas as pd
import warnings

# Silence every warning for the rest of the session. This is not recommended in
# general; it is done here to suppress warnings from SARIMAX.
# NOTE(review): SARIMAX is not used in the cells visible here — confirm this is still needed.
warnings.simplefilter('ignore')

# Value matched against the 'Country/Region' column when selecting rows.
countryName = 'US'
# Size of the trailing feature axis added when the input windows are reshaped.
nFeatures = 1

# Number of consecutive observations per input window (passed to split_sequence as n_steps).
nDaysMin = 15

# Number of windows held out at the end of the series for validation and for testing.
nValid = 10
nTest = 10

# Directory holding the CSV files, relative to the working directory.
dataDir = os.path.join('data', 'JHU')


# File names of the confirmed / deaths / recovered time-series CSVs inside dataDir.
confirmedFilename = 'time_series_covid19_confirmed_global.csv'
deathsFilename = 'time_series_covid19_deaths_global.csv'
recoveredFilename = 'time_series_covid19_recovered_global.csv'

In [2]:
# split a univariate sequence into samples
def split_sequence(sequence, n_steps):
    """Slice a univariate sequence into sliding windows and next-step targets.

    Every sample pairs ``n_steps`` consecutive values with the single value
    that immediately follows them.

    Args:
        sequence: Sized, indexable and sliceable collection of observations.
        n_steps: Number of consecutive values in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays where ``X[k]`` is
        ``sequence[k:k + n_steps]`` and ``y[k]`` is ``sequence[k + n_steps]``.
    """
    total = len(sequence)
    # A window is usable only while its target index (start + n_steps) is still
    # inside the sequence, and a window never starts past the last element.
    n_windows = min(total, total - n_steps)
    windows = [sequence[start:start + n_steps] for start in range(n_windows)]
    targets = [sequence[start + n_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

In [3]:
def meanAbsolutePercentageError(yTrueList, yPredList):
    """Return the mean absolute percentage error (MAPE), in percent.

    MAPE = 100 * mean(|yTrue - yPred| / |yTrue|) over the paired values.

    Args:
        yTrueList: Iterable of actual values.
        yPredList: Iterable of predicted values, paired positionally with
            ``yTrueList`` (pairing stops at the shorter of the two).

    Returns:
        The MAPE as a float percentage. The result is NaN when any paired
        term is NaN.
    """
    # Divide by the magnitude of the actual value: dividing by the signed value
    # would make the terms for negative actuals negative, letting them cancel
    # positive terms instead of accumulating as errors.
    absPcErrorList = [
        np.abs(yTrue - yPred) / np.abs(yTrue)
        for yTrue, yPred in zip(yTrueList, yPredList)
    ]
    return 100 * np.mean(absPcErrorList)

def meanForecastError(yTrueList, yPredList):
    """Return the mean forecast error: the average of (actual - predicted).

    Values are paired positionally; pairing stops at the shorter input.
    A positive result means the predictions are, on average, too low.
    """
    residuals = []
    for actual, forecast in zip(yTrueList, yPredList):
        residuals.append(actual - forecast)
    return np.mean(residuals)

def meanAbsoluteError(yTrueList, yPredList):
    """Return the mean of |actual - predicted| over positionally paired values.

    Pairing stops at the shorter of the two inputs.
    """
    magnitudes = []
    for actual, forecast in zip(yTrueList, yPredList):
        magnitudes.append(np.abs(actual - forecast))
    meanMagnitude = np.mean(magnitudes)
    return meanMagnitude

def meanSquaredError(yTrueList, yPredList):
    """Return the mean of (actual - predicted)^2 over positionally paired values.

    Pairing stops at the shorter of the two inputs.
    """
    squaredResiduals = []
    for actual, forecast in zip(yTrueList, yPredList):
        squaredResiduals.append(np.square(actual - forecast))
    meanSquare = np.mean(squaredResiduals)
    return meanSquare

def rootMeanSquaredError(yTrueList, yPredList):
    """Return the square root of meanSquaredError for the same paired inputs."""
    meanSquare = meanSquaredError(yTrueList, yPredList)
    return np.sqrt(meanSquare)

def medianSymmetricAccuracy(yTrueList, yPredList):
    """Return the median symmetric accuracy (MdSA), in percent.

    Computes 100 * (exp(median(|ln(yPred / yTrue)|)) - 1) over positionally
    paired values; pairing stops at the shorter of the two inputs.

    Reference:
        https://helda.helsinki.fi//bitstream/handle/10138/312261/2017SW001669.pdf?sequence=1
    """
    absLogRatios = []
    for actual, forecast in zip(yTrueList, yPredList):
        absLogRatios.append(np.abs(np.log(forecast / actual)))
    medianAbsLogRatio = np.median(absLogRatios)
    return 100 * (np.exp(medianAbsLogRatio) - 1)

In [4]:
# Function to get all three frames for a given country
def getCountryCovidFrDict(countryName, frameDict=None):
    """Select one country's rows from each frame in a dictionary of DataFrames.

    Args:
        countryName: Value matched exactly against the 'Country/Region' column.
        frameDict: Mapping of key -> DataFrame, each with a 'Country/Region'
            column. When omitted, the notebook-level ``covidFrDict`` is used,
            which must already be defined at call time.

    Returns:
        A new dict with the same keys, where each value holds only the rows
        whose 'Country/Region' equals ``countryName`` (possibly empty).
    """
    if frameDict is None:
        # Backward-compatible default: fall back to the global set of loaded frames.
        frameDict = covidFrDict
    return {
        key: dataFr[dataFr['Country/Region'] == countryName]
        for key, dataFr in frameDict.items()
    }

In [5]:
# Load all 3 csv files
covidFrDict = {}
covidFrDict['confirmed'] = pd.read_csv(os.path.join(dataDir, confirmedFilename))
covidFrDict['deaths'] = pd.read_csv(os.path.join(dataDir, deathsFilename))

# Recovered is back again!
covidFrDict['recovered'] = pd.read_csv(os.path.join(dataDir, recoveredFilename))

countryCovidFrDict = getCountryCovidFrDict(countryName)
confirmedFr = countryCovidFrDict['confirmed']
if confirmedFr.empty:
    # Fail early with a clear message instead of an opaque indexing error below.
    raise ValueError("No rows found for country '{}' in {}".format(countryName, confirmedFilename))

# Get list of dates: keep every column whose name parses as an m/d/yy date.
# (A substring test such as '/20' would silently drop most columns from other years.)
colNamesList = list(confirmedFr)
parsedColDates = pd.to_datetime(pd.Index(colNamesList), format='%m/%d/%y', errors='coerce')
dateList = [colName for colName, colDate in zip(colNamesList, parsedColDates) if not pd.isna(colDate)]

# Sum over all rows of the country so that countries reported as several
# province/state rows yield a national total; identical to the single row
# when the country has exactly one row.
dataList = [confirmedFr[date].sum() for date in dateList]
dataDict = dict(zip(dateList, dataList))

# Get time series for cases > 100 only
# NOTE: despite its name, daysSince is a case-count threshold, not a number of days.
daysSince = 100
datesGreaterDaysSinceList = [date for date, nCases in dataDict.items() if nCases > daysSince]
nCasesGreaterDaysSinceList = [dataDict[date] for date in datesGreaterDaysSinceList]

# Sliding windows of nDaysMin observations, each paired with the next observation.
XList, yList = split_sequence(nCasesGreaterDaysSinceList, nDaysMin)

# Chronological split: the last nTest windows are the test set, the nValid
# windows before them the validation set, and everything earlier is training.
validStart = len(XList) - (nValid + nTest)
testStart = len(XList) - nTest

XTrainList = XList[0:validStart]
XValidList = XList[validStart:testStart]
XTestList = XList[-nTest:]

yTrain = yList[0:validStart]
yValid = yList[validStart:testStart]
yTest = yList[-nTest:]

print('Total size of data points for LSTM:', len(yList))
print('Size of training set:', len(yTrain))
print('Size of validation set:', len(yValid))
print('Size of test set:', len(yTest))

# Add the trailing feature axis expected by the recurrent layers:
# (samples, time steps) -> (samples, time steps, nFeatures)
XTrain = XTrainList.reshape((XTrainList.shape[0], XTrainList.shape[1], nFeatures))
XValid = XValidList.reshape((XValidList.shape[0], XValidList.shape[1], nFeatures))
XTest = XTestList.reshape((XTestList.shape[0], XTestList.shape[1], nFeatures))

print(XTrain.shape)

Total size of data points for LSTM: 69
Size of training set: 49
Size of validation set: 10
Size of test set: 10
(49, 15, 1)


# Vanilla LSTM

In [6]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

nNeurons = 100
nFeatures = 1


def buildVanillaLSTM():
    """Build and compile a single-layer LSTM regressor.

    Reads nNeurons, nDaysMin and nFeatures from the notebook namespace at call
    time. Returns a compiled Sequential model (Adam, learning rate 0.1, MSE loss).
    """
    model = Sequential()
    model.add(LSTM(nNeurons, activation='relu', input_shape=(nDaysMin, nFeatures)))
    model.add(Dense(1))
    opt = Adam(learning_rate=0.1)
    model.compile(optimizer=opt, loss='mse')
    return model


# Seed search: train one model per seed and keep the seed with the lowest
# validation MAPE. Only seeds scoring below the initial bestValidMAPE qualify.
bestValidMAPE = 100
bestSeed = -1
for seed in range(100):
    # Drop Keras state left behind by the previous candidate so that memory
    # does not grow across the models created in this loop.
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed=seed)

    model = buildVanillaLSTM()

    # fit model
    model.fit(XTrain, yTrain, epochs=1000, verbose=0)

    # Keep the first (only) output column of each prediction row.
    yPred = model.predict(XValid, verbose=0)
    yPredList = [row[0] for row in yPred]

    MAPE = meanAbsolutePercentageError(yValid, yPredList)
    print(seed, MAPE)
    # A NaN MAPE (diverged training) compares False and is never selected.
    if MAPE < bestValidMAPE:
        print('Updating best MAPE to {}...'.format(MAPE))
        bestValidMAPE = MAPE
        print('Updating best seed to {}...'.format(seed))
        bestSeed = seed

if bestSeed < 0:
    # The sentinel was never replaced: do not train with it as if it were a real seed.
    raise RuntimeError('No seed produced a validation MAPE below the initial threshold; nothing to retrain.')

# Retrain from scratch with the selected seed and evaluate on the test set.
print('Training model with best seed...')
tf.keras.backend.clear_session()
tf.random.set_seed(seed=bestSeed)
model = buildVanillaLSTM()

# fit model
model.fit(XTrain, yTrain, epochs=1000, verbose=0)

yPred = model.predict(XTest, verbose=0)
yPredList = [row[0] for row in yPred]

MAPE = meanAbsolutePercentageError(yTest, yPredList)
print('Test MAPE:', MAPE)
MdSA = medianSymmetricAccuracy(yTest, yPredList)
print('Test MdSA:', MdSA)

0 0.24288807222012376
Updating best MAPE to 0.24288807222012376...
Updating best seed to 0...
1 nan
2 7.525165758673897
3 0.39705558697965837
4 nan
5 1.4345988664131382
6 0.18007773512805422
Updating best MAPE to 0.18007773512805422...
Updating best seed to 6...
7 nan
8 4.519540576893765
9 nan
10 2.164541740213353
11 nan
12 1.2417110352594827
13 99.99438149791875
14 nan
15 4.6998367042107745
16 6.082095196219033
17 1.486815654824598
18 nan
19 2.212491162298793
20 4.665923557373218
21 0.8710163368530618
22 0.3550653330999778
23 nan
24 0.5135988475575317
25 nan
26 138.14777362823392
27 nan
28 99.98550195011934
29 nan
30 99.99310022073955
31 0.4698529385999849
32 5.015164182879837
33 nan
34 nan
35 2.9506107803957127
36 nan
37 1.635810836487824
38 9.990553499457803
39 nan
40 0.9511807063035581
41 2.4780043235979683
42 3.665847022206455
43 nan
44 1.8535802807489485
45 0.43689414386724346
46 4.231397033878479
47 0.9843272648921187
48 1.669158657751801
49 1.5294578085756605
50 nan
51 1.607575

# Stacked LSTM

In [None]:
# define model
nNeurons = 50
nFeatures = 1


def buildStackedLSTM():
    """Build and compile a two-layer (stacked) LSTM regressor.

    The first LSTM returns the full sequence so the second LSTM can consume it.
    Reads nNeurons, nDaysMin and nFeatures from the notebook namespace at call
    time. Returns a compiled Sequential model (Adam, learning rate 0.1, MSE loss).
    """
    model = Sequential()
    model.add(LSTM(nNeurons, activation='relu', return_sequences=True, input_shape=(nDaysMin, nFeatures)))
    model.add(LSTM(nNeurons, activation='relu'))
    model.add(Dense(1))
    opt = Adam(learning_rate=0.1)
    model.compile(optimizer=opt, loss='mse')
    return model


# Seed search: train one model per seed and keep the seed with the lowest
# validation MAPE. Only seeds scoring below the initial bestValidMAPE qualify.
bestValidMAPE = 100
bestSeed = -1
for seed in range(100):
    # Drop Keras state left behind by the previous candidate so that memory
    # does not grow across the models created in this loop.
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed=seed)

    model = buildStackedLSTM()

    # fit model
    model.fit(XTrain, yTrain, epochs=1000, verbose=0)

    # Keep the first (only) output column of each prediction row.
    yPred = model.predict(XValid, verbose=0)
    yPredList = [row[0] for row in yPred]

    MAPE = meanAbsolutePercentageError(yValid, yPredList)
    print(seed, MAPE)
    # A NaN MAPE (diverged training) compares False and is never selected.
    if MAPE < bestValidMAPE:
        print('Updating best MAPE to {}...'.format(MAPE))
        bestValidMAPE = MAPE
        print('Updating best seed to {}...'.format(seed))
        bestSeed = seed

if bestSeed < 0:
    # The sentinel was never replaced: do not train with it as if it were a real seed.
    raise RuntimeError('No seed produced a validation MAPE below the initial threshold; nothing to retrain.')

# Retrain from scratch with the selected seed and evaluate on the test set.
print('Training model with best seed...')
tf.keras.backend.clear_session()
tf.random.set_seed(seed=bestSeed)
model = buildStackedLSTM()

# fit model
model.fit(XTrain, yTrain, epochs=1000, verbose=0)

yPred = model.predict(XTest, verbose=0)
yPredList = [row[0] for row in yPred]

MAPE = meanAbsolutePercentageError(yTest, yPredList)
print('Test MAPE:', MAPE)
MdSA = medianSymmetricAccuracy(yTest, yPredList)
print('Test MdSA:', MdSA)

0 99.99913662535803
Updating best MAPE to 99.99913662535803...
Updating best seed to 0...
1 99.98921201684993
Updating best MAPE to 99.98921201684993...
Updating best seed to 1...
2 99.99742283266032
3 99.99350502147554
4 13.405217006545378
Updating best MAPE to 13.405217006545378...
Updating best seed to 4...
5 6.053179235799955
Updating best MAPE to 6.053179235799955...
Updating best seed to 5...
6 99.9902829711498
7 99.99201928776299
8 99.99893628187007
9 nan
10 99.99142061918795
11 nan
12 99.99096524701706
13 0.14014689977471162
Updating best MAPE to 0.14014689977471162...
Updating best seed to 13...
14 28.466741969272906
15 0.760013501078564
16 nan
17 99.99261983555829
18 99.99958024014626
19 3.711855079162997
20 99.9941152100652
21 1.4623770813477464
22 59.271165050826205


# Bidirectional LSTM

In [None]:
from tensorflow.keras.layers import Bidirectional

# define model
nNeurons = 50
nFeatures = 1


def buildBidirectionalLSTM():
    """Build and compile a bidirectional LSTM regressor.

    Reads nNeurons, nDaysMin and nFeatures from the notebook namespace at call
    time. Returns a compiled Sequential model (Adam, learning rate 0.1, MSE loss).
    """
    model = Sequential()
    model.add(Bidirectional(LSTM(nNeurons, activation='relu'), input_shape=(nDaysMin, nFeatures)))
    model.add(Dense(1))
    opt = Adam(learning_rate=0.1)
    model.compile(optimizer=opt, loss='mse')
    return model


# Seed search: train one model per seed and keep the seed with the lowest
# validation MAPE. Only seeds scoring below the initial bestValidMAPE qualify.
bestValidMAPE = 100
bestSeed = -1
for seed in range(100):
    # Drop Keras state left behind by the previous candidate so that memory
    # does not grow across the models created in this loop.
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed=seed)

    model = buildBidirectionalLSTM()

    # fit model
    model.fit(XTrain, yTrain, epochs=1000, verbose=0)

    # Keep the first (only) output column of each prediction row.
    yPred = model.predict(XValid, verbose=0)
    yPredList = [row[0] for row in yPred]

    MAPE = meanAbsolutePercentageError(yValid, yPredList)
    print(seed, MAPE)
    # A NaN MAPE (diverged training) compares False and is never selected.
    if MAPE < bestValidMAPE:
        print('Updating best MAPE to {}...'.format(MAPE))
        bestValidMAPE = MAPE
        print('Updating best seed to {}...'.format(seed))
        bestSeed = seed

if bestSeed < 0:
    # The sentinel was never replaced: do not train with it as if it were a real seed.
    raise RuntimeError('No seed produced a validation MAPE below the initial threshold; nothing to retrain.')

# Retrain from scratch with the selected seed and evaluate on the test set.
print('Training model with best seed...')
tf.keras.backend.clear_session()
tf.random.set_seed(seed=bestSeed)
model = buildBidirectionalLSTM()

# fit model
model.fit(XTrain, yTrain, epochs=1000, verbose=0)

yPred = model.predict(XTest, verbose=0)
yPredList = [row[0] for row in yPred]

MAPE = meanAbsolutePercentageError(yTest, yPredList)
print('Test MAPE:', MAPE)
MdSA = medianSymmetricAccuracy(yTest, yPredList)
print('Test MdSA:', MdSA)

# CNN LSTM

In [None]:
import tensorflow as tf
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Flatten
from tensorflow.compat.v1.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D

# Number of subsequences to break X into (we do 15 = 5x3, 5 subsequences of size 3 each)
nSeq = 5
nSteps = 3

# define model
nNeurons = 50
nFeatures = 1
nFilters = 64

if nSeq * nSteps != nDaysMin:
    # The reshapes below split each window of nDaysMin values into nSeq chunks of nSteps.
    raise ValueError('nSeq * nSteps must equal nDaysMin ({} * {} != {})'.format(nSeq, nSteps, nDaysMin))

# Reshape input: (samples, nDaysMin) -> (samples, nSeq, nSteps, nFeatures)
XTrainCNN = XTrainList.reshape((XTrainList.shape[0], nSeq, nSteps, nFeatures))
XValidCNN = XValidList.reshape((XValidList.shape[0], nSeq, nSteps, nFeatures))
XTestCNN = XTestList.reshape((XTestList.shape[0], nSeq, nSteps, nFeatures))


def buildCNNLSTM():
    """Build and compile a CNN-LSTM regressor.

    A Conv1D / MaxPooling1D / Flatten stack is applied to every subsequence via
    TimeDistributed, and the resulting sequence is fed to an LSTM.
    Reads nFilters, nSteps, nFeatures and nNeurons from the notebook namespace
    at call time. Returns a compiled Sequential model (Adam, learning rate 0.1,
    MSE loss).
    """
    model = Sequential()
    model.add(TimeDistributed(Conv1D(filters=nFilters, kernel_size=1, activation='relu'), input_shape=(None, nSteps, nFeatures)))
    model.add(TimeDistributed(MaxPooling1D(pool_size=2)))
    model.add(TimeDistributed(Flatten()))
    model.add(LSTM(nNeurons, activation='relu'))
    model.add(Dense(1))
    opt = Adam(learning_rate=0.1)
    model.compile(optimizer=opt, loss='mse')
    return model


# Seed search: train one model per seed and keep the seed with the lowest
# validation MAPE. Only seeds scoring below the initial bestValidMAPE qualify.
bestValidMAPE = 100
bestSeed = -1
for seed in range(100):
    # Drop Keras state left behind by the previous candidate so that memory
    # does not grow across the models created in this loop.
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed=seed)

    model = buildCNNLSTM()

    # fit model
    model.fit(XTrainCNN, yTrain, epochs=1000, verbose=0)

    # Keep the first (only) output column of each prediction row.
    yPred = model.predict(XValidCNN, verbose=0)
    yPredList = [row[0] for row in yPred]

    MAPE = meanAbsolutePercentageError(yValid, yPredList)
    print(seed, MAPE)
    # A NaN MAPE (diverged training) compares False and is never selected.
    if MAPE < bestValidMAPE:
        print('Updating best MAPE to {}...'.format(MAPE))
        bestValidMAPE = MAPE
        print('Updating best seed to {}...'.format(seed))
        bestSeed = seed

if bestSeed < 0:
    # The sentinel was never replaced: do not train with it as if it were a real seed.
    raise RuntimeError('No seed produced a validation MAPE below the initial threshold; nothing to retrain.')

# Retrain from scratch with the selected seed and evaluate on the test set.
print('Training model with best seed...')
tf.keras.backend.clear_session()
tf.random.set_seed(seed=bestSeed)
model = buildCNNLSTM()
# fit model
model.fit(XTrainCNN, yTrain, epochs=1000, verbose=0)

yPred = model.predict(XTestCNN, verbose=0)
yPredList = [row[0] for row in yPred]

MAPE = meanAbsolutePercentageError(yTest, yPredList)
print('Test MAPE:', MAPE)
MdSA = medianSymmetricAccuracy(yTest, yPredList)
print('Test MdSA:', MdSA)

# ConvLSTM

In [None]:
from tensorflow.keras.layers import ConvLSTM2D

# Number of subsequences to break X into (we do 15 = 5x3, 5 subsequences of size 3 each)
nSeq = 5
nSteps = 3
# Each input is rows x columns, we have rows=1 and columns=nSteps

# define model
nNeurons = 50  # not used by this architecture; assignment kept so the namespace is unchanged
nFeatures = 1
nFilters = 64

if nSeq * nSteps != nDaysMin:
    # The reshapes below split each window of nDaysMin values into nSeq chunks of nSteps.
    raise ValueError('nSeq * nSteps must equal nDaysMin ({} * {} != {})'.format(nSeq, nSteps, nDaysMin))

# Reshape input: (samples, nDaysMin) -> (samples, nSeq, rows=1, nSteps, nFeatures)
XTrainConv = XTrainList.reshape((XTrainList.shape[0], nSeq, 1, nSteps, nFeatures))
XValidConv = XValidList.reshape((XValidList.shape[0], nSeq, 1, nSteps, nFeatures))
XTestConv = XTestList.reshape((XTestList.shape[0], nSeq, 1, nSteps, nFeatures))


def buildConvLSTM():
    """Build and compile a ConvLSTM2D regressor.

    Reads nFilters, nSeq, nSteps and nFeatures from the notebook namespace at
    call time. Returns a compiled Sequential model (Adam, learning rate 0.1,
    MSE loss).
    """
    model = Sequential()
    # Use the nFilters setting defined above rather than a repeated literal.
    model.add(ConvLSTM2D(filters=nFilters, kernel_size=(1,2), activation='relu', input_shape=(nSeq, 1, nSteps, nFeatures)))
    model.add(Flatten())
    model.add(Dense(1))
    opt = Adam(learning_rate=0.1)
    model.compile(optimizer=opt, loss='mse')
    return model


# Seed search: train one model per seed and keep the seed with the lowest
# validation MAPE. Only seeds scoring below the initial bestValidMAPE qualify.
bestValidMAPE = 100
bestSeed = -1
for seed in range(100):
    # Drop Keras state left behind by the previous candidate so that memory
    # does not grow across the models created in this loop.
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed=seed)

    model = buildConvLSTM()

    # fit model
    model.fit(XTrainConv, yTrain, epochs=1000, verbose=0)

    # Keep the first (only) output column of each prediction row.
    yPred = model.predict(XValidConv, verbose=0)
    yPredList = [row[0] for row in yPred]

    MAPE = meanAbsolutePercentageError(yValid, yPredList)
    print(seed, MAPE)
    # A NaN MAPE (diverged training) compares False and is never selected.
    if MAPE < bestValidMAPE:
        print('Updating best MAPE to {}...'.format(MAPE))
        bestValidMAPE = MAPE
        print('Updating best seed to {}...'.format(seed))
        bestSeed = seed

if bestSeed < 0:
    # The sentinel was never replaced: do not train with it as if it were a real seed.
    raise RuntimeError('No seed produced a validation MAPE below the initial threshold; nothing to retrain.')

# Retrain from scratch with the selected seed and evaluate on the test set.
print('Training model with best seed...')
tf.keras.backend.clear_session()
tf.random.set_seed(seed=bestSeed)
model = buildConvLSTM()
# fit model
model.fit(XTrainConv, yTrain, epochs=1000, verbose=0)

yPred = model.predict(XTestConv, verbose=0)
yPredList = [row[0] for row in yPred]

MAPE = meanAbsolutePercentageError(yTest, yPredList)
print('Test MAPE:', MAPE)
MdSA = medianSymmetricAccuracy(yTest, yPredList)
print('Test MdSA:', MdSA)