In [99]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
import seaborn as sns
import keras
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
warnings.filterwarnings("ignore")
import keras
import keras_tuner as kt

In [14]:
data = pd.read_csv('../data/regional_multivariate_data.csv')
enc = pd.read_csv('../data/regional_datasets/multivariate/enc_data_multi.csv').drop(columns = 'Unnamed: 0').set_index('date')
esc = pd.read_csv('../data/regional_datasets/multivariate/esc_data_multi.csv').drop(columns = 'Unnamed: 0').set_index('date')
midatl = pd.read_csv('../data/regional_datasets/multivariate/mid_atlantic_data_multi.csv').drop(columns = 'Unnamed: 0').set_index('date')
mount = pd.read_csv('../data/regional_datasets/multivariate/mountain_data_multi.csv').drop(columns = 'Unnamed: 0').set_index('date')
ne = pd.read_csv('../data/regional_datasets/multivariate/new_england_data_multi.csv').drop(columns = 'Unnamed: 0').set_index('date')
pac = pd.read_csv('../data/regional_datasets/multivariate/pacific_data_multi.csv').drop(columns = 'Unnamed: 0').set_index('date')
satl = pd.read_csv('../data/regional_datasets/multivariate/south_atlantic_data_multi.csv').drop(columns = 'Unnamed: 0').set_index('date')
wnc = pd.read_csv('../data/regional_datasets/multivariate/wnc_data_multi.csv').drop(columns = 'Unnamed: 0').set_index('date')
wsc = pd.read_csv('../data/regional_datasets/multivariate/wsc_data_multi.csv').drop(columns = 'Unnamed: 0').set_index('date')

In [40]:
def process_normalize(df):
    x = df.drop(columns = ['new_confirmed', 'region'] + [col for col in df.columns if col[0].isdigit()])
    lag1df = x.shift(1)
    lag1df.columns = ['lag1_' + str(col) for col in lag1df.columns]
    df = df[['new_confirmed'] + [col for col in df.columns if col[0].isdigit()]].join(lag1df)
    df = df.dropna()
    normalizer = Normalizer()
    norm_df = normalizer.fit_transform(df.drop(columns = 'new_confirmed'))
    res = pd.concat([df.new_confirmed, 
                     pd.DataFrame(norm_df, columns = df.columns.drop('new_confirmed'), index = df.index)], axis = 1)
    return res

In [78]:
def make_series(data, length):
    series=[]
    for i in range(len(data)-length):
        subdf=data[i:i+length]
        series.append( (subdf.index[-1], subdf['new_confirmed'].values[-1], subdf.drop('new_confirmed', axis=1).values))
    return series

In [79]:
series = make_series(process_normalize(enc), 14)
print(series[0][0],series[0][1].shape,series[0][2].shape)
size = round(len(series)*0.2)
train = series[:-size]
test = series[-size:]

2020-03-12 () (1, 51)


In [107]:
series = make_series(process_normalize(enc), 7)
print(series[0][0],series[0][1].shape,series[0][2].shape)
size = round(len(series)*0.15)
train = series[:-size]
test = series[-size:]
val = train[-round(len(train)*0.15):]
train = train[:-round(len(train)*0.15)]
print(train[0][0],train[0][1].shape,train[0][2].shape)
print(val[0][0],val[0][1].shape,val[0][2].shape)

learning_rate = 0.1
batch_size = 32
epochs = 1000
sequence_length = 7
inputs = keras.layers.Input(shape=(7,51))
lstm_out = keras.layers.LSTM(32)(inputs)
outputs = keras.layers.Dense(1)(lstm_out)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss="mae")
model.summary()

x=np.array([x[2].reshape(7,51) for x in train])
y=np.array([x[1] for x in train])
xval=np.array([x[2].reshape(7,51) for x in val])
yval=np.array([x[1] for x in val])
print(x.shape)
print(y.shape)
history = model.fit(
    x=x,y=y,
    epochs=epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_data = (xval, yval)
)



2020-03-18 () (7, 51)
2020-03-18 () (7, 51)
2021-11-22 () (7, 51)
Model: "model_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_22 (InputLayer)       [(None, 7, 51)]           0         
                                                                 
 lstm_25 (LSTM)              (None, 32)                10752     
                                                                 
 dense_23 (Dense)            (None, 1)                 33        
                                                                 
Total params: 10785 (42.13 KB)
Trainable params: 10785 (42.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
(614, 7, 51)
(614,)
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000

In [115]:
def build_model(hp):
    model = keras.Sequential()
    model.add(keras.layers.LSTM(units = hp.Int('units', min_value = 32, max_value = 256, step = 32),
                                return_sequences = True,
                                input_shape = (x.shape[1], x.shape[2])))
    model.add(keras.layers.Dropout(rate=hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.05)))
    model.add(keras.layers.LSTM(units = hp.Int('units', min_value = 32, max_value = 256, step = 32)))
    model.add(keras.layers.Dropout(rate=hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.05)))
    model.add(keras.layers.Dense(1, activation = 'relu'))
    lr = hp.Choice('learning_rate', values = [0.1, 0.01, 0.001])
    opt1 = keras.optimizers.legacy.Adam(learning_rate = lr)
    model.compile(optimizer = opt1, loss='mse')
    return model

In [114]:
tuner = kt.Hyperband(build_model,
                     objective = 'val_loss',
                     max_epochs = 10,
                     hyperband_iterations = 1)

Reloading Tuner from ./untitled_project/tuner0.json


In [116]:
tuner.search(x = x, y = y, epochs = 50, batch_size = 128, validation_data = (xval, yval))

Trial 26 Complete [00h 00m 03s]
val_loss: 4039978240.0

Best val_loss So Far: 3980316416.0
Total elapsed time: 00h 01m 41s


In [118]:
np.sqrt(3980316416.0)

63089.74889789941