# Multivariate LSTM Time Series Forecasting

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
# df = pd.read_csv('Data_cleaned/covid_clean_imputted_aggregated_vif.csv')
df = pd.read_csv('covid_clean_imputted_aggregated_vif_FINAL.csv')

In [3]:
# shoow all columns
pd.set_option('display.max_columns', None)

In [4]:
# replace the row number with the date
df['date'] = pd.to_datetime(df['date'])

In [5]:
df = df[df['date'] >= '2020-08-20']

In [6]:
df = df.set_index('date')

In [7]:
#  sort the index by date in ascending order and then by country in descending order 
df = df.sort_index(ascending=True, axis=0)

# df = df.sort_index(ascending=True, axis=0)

In [8]:
# sort again by state
# df = df.sort_values(by=['state'])

In [9]:
df.head(5)

Unnamed: 0_level_0,critical_staffing_shortage_anticipated_within_week_yes,hospital_onset_covid,inpatient_beds_used_covid,total_adult_patients_hospitalized_confirmed_and_suspected_covid,adult_icu_bed_covid_utilization_utilization,deaths_covid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-08-20,0.163098,6.475746,381.144524,380.922752,0.168298,16.404199
2020-08-21,0.161296,6.014806,375.289144,374.948576,0.16317,4.944459
2020-08-22,0.161633,6.158742,363.373915,357.390075,0.160292,4.493796
2020-08-23,0.161598,6.120531,363.130555,355.414296,0.15877,3.889325
2020-08-24,0.16239,6.43341,361.187255,354.84526,0.158838,3.963904


In [10]:
df.shape

(805, 6)

## KPSS Test

In [12]:
from statsmodels.tsa.stattools import kpss
def kpss_test(series, **kw):    
    statistic, p_value, n_lags, critical_values = kpss(series, **kw)
    # Format Output
    print(f'KPSS Statistic: {statistic}')
    print(f'p-value: {p_value}')
    print(f'num lags: {n_lags}')
    print('Critial Values:')
    for key, value in critical_values.items():
        print(f'   {key} : {value}')
    print(f'Result: The series is {"not " if p_value < 0.05 else ""}stationary')

## Multivariate Time Series Forecasting with LSTMs in Keras

In [34]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"

plot_template = dict(
    layout=go.Layout({
        "font_size": 18,
        "xaxis_title_font_size": 24,
        "yaxis_title_font_size": 24})
)

In [14]:
# plot the covid deaths sorted by month and grouped by state
fig = px.line(df, x=df.index, y='deaths_covid', color='state', title='Covid Deaths by State')
fig.update_layout(plot_template['layout'])
fig.show()

ValueError: Value of 'color' is not the name of a column in 'data_frame'. Expected one of ['critical_staffing_shortage_anticipated_within_week_yes', 'hospital_onset_covid', 'inpatient_beds_used_covid', 'total_adult_patients_hospitalized_confirmed_and_suspected_covid', 'adult_icu_bed_covid_utilization_utilization', 'deaths_covid'] but received: state

In [None]:
fig.update_yaxes(range = [0, 2])
fig.show()

## Create the target variable

In [None]:
# group df by index 
# df_aggregate = df.groupby('date').sum()
# df_aggregate['deaths_covid']

In [None]:
# create a new column with average deaths per day in all states
# df['deaths_avg'] = df.groupby('date')['deaths_covid'].transform('mean')

In [None]:
# df.groupby('date')['deaths_covid']

In [None]:
# create a subset of the data for the state of California
# df_ca = df[df['state'] == 'CA']
# df = df_ca

In [None]:
# df_aggregate.shape

(805, 55)

In [None]:
# df_ca.shape

In [None]:
# df = df_aggregate

In [None]:
#  drop state column
# df = df.drop(['state'], axis=1)

In [11]:
target_sensor = "deaths_covid" # maybe specify a state ?
features = list(df.columns.difference([target_sensor]))

In [12]:
forecast_lead = 7
target = f"{target_sensor}_lead{forecast_lead}"

In [13]:
df[target] = df[target_sensor].shift(-forecast_lead)
df = df.iloc[:-forecast_lead]

## Create a hold-out test set and preprocess the data

In [14]:
# give the date index at 70% of the data
train_size = int(len(df) * 0.7)

In [15]:
# view row at index train_size
df.iloc[train_size]

critical_staffing_shortage_anticipated_within_week_yes               0.171635
hospital_onset_covid                                                13.489747
inpatient_beds_used_covid                                          356.044771
total_adult_patients_hospitalized_confirmed_and_suspected_covid    340.515150
adult_icu_bed_covid_utilization_utilization                          0.097775
deaths_covid                                                         5.076623
deaths_covid_lead7                                                   3.427361
Name: 2022-03-01 00:00:00, dtype: float64

In [16]:
test_start = "2022-01-01"

df_train = df.loc[:test_start].copy()
df_test = df.loc[test_start:].copy()

print("Test set fraction:", len(df_test) / len(df))

Test set fraction: 0.37468671679197996


## Standardize the features and target

In [17]:
target_mean = df_train[target].mean()
target_stdev = df_train[target].std()

for c in df_train.columns:
    mean = df_train[c].mean()
    stdev = df_train[c].std()

    df_train[c] = (df_train[c] - mean) / stdev
    df_test[c] = (df_test[c] - mean) / stdev

## Create datasets that PyTorch DataLoader can work with

In [18]:
import torch
from torch.utils.data import Dataset

class SequenceDataset(Dataset):
    def __init__(self, dataframe, target, features, sequence_length=5):
        self.features = features
        self.target = target
        self.sequence_length = sequence_length
        self.y = torch.tensor(dataframe[target].values).float()
        self.X = torch.tensor(dataframe[features].values).float()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i): 
        if i >= self.sequence_length - 1:
            i_start = i - self.sequence_length + 1
            x = self.X[i_start:(i + 1), :]
        else:
            padding = self.X[0].repeat(self.sequence_length - i - 1, 1)
            x = self.X[0:(i + 1), :]
            x = torch.cat((padding, x), 0)

        return x, self.y[i]

In [19]:
i = 27
sequence_length = 8

train_dataset = SequenceDataset(
    df_train,
    target=target,
    features=features,
    sequence_length=sequence_length
)

X, y = train_dataset[i]
print(X)

tensor([[-0.5967,  0.3265,  0.0324, -0.7287, -0.7090],
        [-0.6399,  0.2897, -0.0971, -0.7395, -0.7347],
        [-0.6640,  0.3295, -0.1223, -0.7299, -0.7720],
        [-0.7074,  0.1931, -0.1166, -0.7568, -0.7986],
        [-0.7186,  0.2667, -0.2004, -0.7771, -0.8108],
        [-0.7542,  0.3694, -0.1559, -0.7933, -0.7885],
        [-0.7179,  0.3465,  0.0233, -0.7658, -0.7641],
        [-0.7537,  0.3056,  0.1269, -0.7737, -0.7792]])


In [20]:
X, y = train_dataset[i + 1]
print(X)

tensor([[-0.6399,  0.2897, -0.0971, -0.7395, -0.7347],
        [-0.6640,  0.3295, -0.1223, -0.7299, -0.7720],
        [-0.7074,  0.1931, -0.1166, -0.7568, -0.7986],
        [-0.7186,  0.2667, -0.2004, -0.7771, -0.8108],
        [-0.7542,  0.3694, -0.1559, -0.7933, -0.7885],
        [-0.7179,  0.3465,  0.0233, -0.7658, -0.7641],
        [-0.7537,  0.3056,  0.1269, -0.7737, -0.7792],
        [-0.7761,  0.2893,  0.0946, -0.8183, -0.8126]])


In [21]:
print(df_train[features].iloc[(i - sequence_length + 1): (i + 1)])

            adult_icu_bed_covid_utilization_utilization  \
date                                                      
2020-09-09                                    -0.596660   
2020-09-10                                    -0.639942   
2020-09-11                                    -0.664013   
2020-09-12                                    -0.707430   
2020-09-13                                    -0.718626   
2020-09-14                                    -0.754167   
2020-09-15                                    -0.717880   
2020-09-16                                    -0.753712   

            critical_staffing_shortage_anticipated_within_week_yes  \
date                                                                 
2020-09-09                                           0.326548        
2020-09-10                                           0.289695        
2020-09-11                                           0.329532        
2020-09-12                                           0.1930

In [22]:
from torch.utils.data import DataLoader
torch.manual_seed(99)

train_loader = DataLoader(train_dataset, batch_size=3, shuffle=True)

X, y = next(iter(train_loader))
print(X.shape)
print(X)

torch.Size([3, 8, 5])
tensor([[[ 1.6157e+00,  2.1436e+00,  2.0318e+00,  1.7617e+00,  1.7673e+00],
         [ 1.6601e+00,  2.5638e+00,  2.1107e+00,  2.2780e+00,  2.2924e+00],
         [ 1.7008e+00,  2.4200e+00,  2.1711e+00,  2.3130e+00,  2.3284e+00],
         [ 1.7043e+00,  2.2870e+00,  2.3510e+00,  1.9022e+00,  1.9174e+00],
         [ 1.6549e+00,  2.2731e+00,  2.1362e+00,  1.8095e+00,  1.8264e+00],
         [ 1.6491e+00,  2.1781e+00,  2.0458e+00,  1.6714e+00,  1.6896e+00],
         [ 1.6879e+00,  2.1897e+00,  2.0348e+00,  1.7341e+00,  1.7537e+00],
         [ 1.7603e+00,  2.2429e+00,  2.0728e+00,  1.8762e+00,  1.8962e+00]],

        [[-6.2797e-01, -5.9954e-01, -4.0384e-01, -6.1695e-01, -6.1867e-01],
         [-6.5598e-01, -7.0519e-01, -2.2282e-01, -6.4756e-01, -6.4826e-01],
         [-6.6721e-01, -6.8999e-01, -4.7124e-01, -6.6501e-01, -6.6371e-01],
         [-6.8319e-01, -4.3166e-01, -3.3701e-01, -5.2096e-01, -5.1844e-01],
         [-7.1071e-01, -4.5147e-01, -3.1696e-01, -5.2595e-01, -5

## Create the datasets and data loaders for real

In [57]:
torch.manual_seed(42)

batch_size = 64
sequence_length = 32

train_dataset = SequenceDataset(
    df_train,
    target=target,
    features=features,
    sequence_length=sequence_length
)
test_dataset = SequenceDataset(
    df_test,
    target=target,
    features=features,
    sequence_length=sequence_length
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

X, y = next(iter(train_loader))

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: torch.Size([64, 32, 5])
Target shape: torch.Size([64])


## The model and learning algorithm

In [100]:
from torch import nn

class ShallowRegressionLSTM(nn.Module):
    def __init__(self, num_sensors, hidden_units):
        super().__init__()
        self.num_sensors = num_sensors  # this is the number of features
        self.hidden_units = hidden_units
        self.num_layers = 1

        self.lstm = nn.LSTM(
            input_size=num_sensors,
            hidden_size=hidden_units,
            batch_first=True,
            num_layers=self.num_layers
        )

        self.linear = nn.Linear(in_features=self.hidden_units, out_features=1)

    def forward(self, x):
        batch_size = x.shape[0]
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_units).requires_grad_()
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_units).requires_grad_()
        
        _, (hn, _) = self.lstm(x, (h0, c0))
        out = self.linear(hn[0]).flatten()  # First dim of Hn is num_layers, which is set to 1 above.

        return out

In [115]:
learning_rate = 1e-5
num_hidden_units = 32

model = ShallowRegressionLSTM(num_sensors=len(features), hidden_units=num_hidden_units)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Training

In [116]:
epoch = 1309
train_loss = []
test_loss = []

In [117]:
def train_model(data_loader, model, loss_function, optimizer, lst):
    num_batches = len(data_loader)
    total_loss = 0
    model.train()
    
    for X, y in data_loader:
        output = model(X)
        loss = loss_function(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / num_batches
    lst.append(avg_loss)
    print(f"Train loss: {avg_loss}")

def test_model(data_loader, model, loss_function,lst):
    num_batches = len(data_loader)
    total_loss = 0

    model.eval()
    with torch.no_grad():
        for X, y in data_loader:
            output = model(X)
            total_loss += loss_function(output, y).item()

    avg_loss = total_loss / num_batches
    lst.append(avg_loss)
    print(f"Test loss: {avg_loss}")


print("Untrained test\n--------")
test_model(test_loader, model, loss_function,test_loss)
print()

Untrained test
--------
Test loss: 1.3861043453216553



In [118]:
for ix_epoch in range(epoch):
    print(f"Epoch {ix_epoch}\n---------")
    train_model(train_loader, model, loss_function, optimizer=optimizer, lst=train_loss)
    test_model(test_loader, model, loss_function, lst=test_loss)
    print()

Epoch 0
---------
Train loss: 1.1014036908745766
Test loss: 1.383664345741272

Epoch 1
---------
Train loss: 1.1044779047369957
Test loss: 1.381239104270935

Epoch 2
---------
Train loss: 1.0975927785038948
Test loss: 1.3788601279258728

Epoch 3
---------
Train loss: 1.0944224074482918
Test loss: 1.3764549970626831

Epoch 4
---------
Train loss: 1.091709941625595
Test loss: 1.3740797281265258

Epoch 5
---------
Train loss: 1.0952034294605255
Test loss: 1.3717106461524964

Epoch 6
---------
Train loss: 1.091972328722477
Test loss: 1.3693118810653686

Epoch 7
---------
Train loss: 1.089638166129589
Test loss: 1.3669347882270813

Epoch 8
---------
Train loss: 1.093012422323227
Test loss: 1.364566707611084

Epoch 9
---------
Train loss: 1.078346110880375
Test loss: 1.3621972799301147

Epoch 10
---------
Train loss: 1.0848573669791222
Test loss: 1.3598775506019591

Epoch 11
---------
Train loss: 1.078510545194149
Test loss: 1.3575246334075928

Epoch 12
---------
Train loss: 1.07448246330022

In [105]:
# plot train_loss and test_loss with epoch in plotly
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(epoch)), y=train_loss, name="train_loss"))
fig.add_trace(go.Scatter(x=list(range(epoch)), y=test_loss, name="test_loss"))
fig.update_layout(title="Train and Test Loss", xaxis_title="Epoch", yaxis_title="Loss")
fig.show()

In [106]:
# lowest test loss
print("Lowest test loss:", min(test_loss))

Lowest test loss: 0.09761203862726689


In [107]:
# min test loss index
print("Min test loss index:", test_loss.index(min(test_loss)))

Min test loss index: 1309


## Evaluation

In [119]:
def predict(data_loader, model):

    output = torch.tensor([])
    model.eval()
    with torch.no_grad():
        for X, _ in data_loader:
            y_star = model(X)
            output = torch.cat((output, y_star), 0)
    
    return output


train_eval_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

ystar_col = "Model forecast"
df_train[ystar_col] = predict(train_eval_loader, model).numpy()
df_test[ystar_col] = predict(test_loader, model).numpy()

df_out = pd.concat((df_train, df_test))[[target, ystar_col]]

for c in df_out.columns:
    df_out[c] = df_out[c] * target_stdev + target_mean

print(df_out)

            deaths_covid_lead7  Model forecast
date                                          
2020-08-20            4.477194        5.159067
2020-08-21            4.222207        5.109308
2020-08-22            4.030190        5.013459
2020-08-23            4.573384        4.921908
2020-08-24            3.911342        4.852153
...                        ...             ...
2022-10-22            2.286635        2.238874
2022-10-23            2.129412        2.228937
2022-10-24            1.994329        2.252217
2022-10-25            1.855308        2.257771
2022-10-26            1.863226        2.267521

[799 rows x 2 columns]


In [120]:
fig = px.line(df_out, labels=dict(created_at="Date", value="Covid Deaths"))
fig.add_vline(x=test_start, line_width=4, line_dash="dash")
fig.add_annotation(xref="paper", x=0.75, yref="paper", y=0.8, text="Test set start", showarrow=False)
fig.update_layout(
    template=plot_template, legend=dict(orientation='h', y=1.02, title_text="")
)
fig.show()

In [121]:
#  plot test loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_test.index, y=df_test[target], name="Actual"))
fig.add_trace(go.Scatter(x=df_test.index, y=df_test[ystar_col], name="Forecast"))
fig.update_layout(
    template=plot_template, legend=dict(orientation='h', y=1.02, title_text="")
)
# location of legend is set to top right
fig.update_layout(legend=dict(x=1, y=1))
fig.show()

In [122]:
#  calculate the mean absolute percentage error (MAPE)
def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

print("MAPE on train set:", mape(df_train[target], df_train[ystar_col]))
print("MAPE on test set:", mape(df_test[target], df_test[ystar_col]))

MAPE on train set: 82.41856802345171
MAPE on test set: 44.32590443897456


In [123]:
# calculate the mean absolute error (MAE)
def mae(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

print("MAE on train set:", mae(df_train[target], df_train[ystar_col]))
print("MAE on test set:", mae(df_test[target], df_test[ystar_col]))

MAE on train set: 0.17212314342415153
MAE on test set: 0.23064866802349007


In [124]:
# calculate the mean squared error (MSE)
def mse(y_true, y_pred):
    return np.mean(np.square(y_true - y_pred))

print("MSE on train set:", mse(df_train[target], df_train[ystar_col]))
print("MSE on test set:", mse(df_test[target], df_test[ystar_col]))

MSE on train set: 0.053585442728239216
MSE on test set: 0.11386798857258046


In [125]:
# calculate the root mean squared error (RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(np.mean(np.square(y_true - y_pred)))

print("RMSE on train set:", rmse(df_train[target], df_train[ystar_col]))
print("RMSE on test set:", rmse(df_test[target], df_test[ystar_col]))

RMSE on train set: 0.23148529700229173
RMSE on test set: 0.3374433116429787
