# LSTM Machine Learning Model

## Install Required Modules/Libraries

In [None]:
# To install packages to our local area, use: 
%pip install tensorflow==2.5
%pip install numpy==1.19.5
%pip install keras
%pip install pandas
%pip install matplotlib
%pip install sklearn
%pip install datetime

## Imports and LSTM Data

In [None]:
import pandas as pd
from pandas import DataFrame, Series, concat, read_json
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.callbacks import EarlyStopping

import datetime
import math
import numpy as np
from numpy import concatenate

In [None]:
# Load the cleaned, pre-formatted monthly data set from its JSON file into a pandas DataFrame
with open("monthly_data.json") as json_file:
    data = read_json(json_file)

## Data Handling

In [None]:
# This model targets VTI only, so the columns of every other index fund are removed
VTIDF = data.drop(columns=['VGT', 'VIS', 'VHT', 'VFH', 'VCR'])
VTIDF.info()

# Fix the column order explicitly: 'date' first, the target 'VTI' second, then the indicator columns.
# Later cells rely on VTI being the first value column (position 0 once 'date' becomes the index).
col_names = [
    'date',
    'VTI',
    '10-Year Treasury Constant Maturity Rate',
    'Inflation Expectations',
    'Consumer Sentiment & Consumer Confidence',
    'Advance Retail Sales: Retail Trade',
    'Unemployment Rate',
]
VTIDF = VTIDF.reindex(columns=col_names)
cols = col_names[1:]  # every column except 'date'

# Index the frame by date for time series use and store all remaining columns as floats
VTIDF = VTIDF.set_index('date')[cols].astype(float)

In [None]:
# Display the prepared frame as the cell output: indexed by date, with VTI followed by the five indicator columns
VTIDF

In [None]:
# Chronological train/test split, done before scaling: size_mult = 0.95 sends the first 95% of rows to training
size_mult = 0.95
train_size = int(size_mult * len(VTIDF))
training = VTIDF[:train_size].copy()  # first 95% of data points
testing = VTIDF[train_size:].copy()  # remaining 5% of data points

# MinMaxScaler with its defaults maps each column independently onto the range 0 to 1.
# It is fit on the training rows only; the same fitted scaler is then applied to the test rows.
scaler = MinMaxScaler().fit(training)

scaled_training = scaler.transform(training)  # normalized training set
scaled_X_train, scaled_y_train = scaled_training, scaled_training[:, 0]  # inputs: all columns (VTI included); target: VTI column

scaled_testing = scaler.transform(testing)  # normalized testing set
scaled_X_test, scaled_y_test = scaled_testing, scaled_testing[:, 0]  # inputs: all columns (VTI included); target: VTI column

## Time Series for ML Model

In [None]:
# Settings shared by the time series generators and the ML model
num_features = 6  # fixed count of input columns per time step; the target column 'VTI' is one of them
batch_size = 1  # samples per batch produced by the generators
window_length = 1  # how many previous data points each sample looks back on when training/predicting

In [None]:
# Build the ordered (window, target) pairs used to train the model and to evaluate it afterwards.
# Example: x = row inside the input window, o = target row, - = unused row
# gen[0]: |x|o|-|-|-|-|
# gen[1]: |-|x|o|-|-|-|
# `length` is the number of x rows per window; sampling_rate=1 means consecutive rows are used inside a window.
train_generator = TimeseriesGenerator(
    scaled_X_train,
    scaled_y_train,
    length=window_length,
    sampling_rate=1,
    batch_size=batch_size,
)
test_generator = TimeseriesGenerator(
    scaled_X_test,
    scaled_y_test,
    length=window_length,
    sampling_rate=1,
    batch_size=batch_size,
)

## LSTM ML Model

In [None]:
# Model and Nodes
# Uses the Sequential model type from Keras, stacking two LSTM layers in front of a single-output Dense layer.
# With multivariable inputs, more than one node is likely to help training and accuracy, but too many nodes can heavily impact the speed and accuracy of the model
model = Sequential()

model.add(LSTM(128, activation='relu', return_sequences=True, input_shape=(window_length, num_features)))  # First LSTM layer, 128 nodes; returns the whole sequence so the next LSTM layer receives one input per time step
model.add(Dropout(0.2))  # Dropout layer: randomly zeroes 20% of the previous layer's outputs during training to reduce overfitting
model.add(LSTM(32, activation='relu', return_sequences=False))  # Second LSTM layer, 32 nodes; returns only its final output
model.add(Dropout(0.2))  # Dropout layer
model.add(Dense(1))  # Dense layer producing the single predicted (scaled) VTI value per sample

# Stop training once val_loss has not improved for 10 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest val_loss when early stopping triggers,
# instead of keeping the weights of the last epoch trained.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, mode='min', restore_best_weights=True)

# compile the model using the Adam optimizer with mean-squared-error as the loss
model.compile(optimizer='adam', loss='mse')

# print a summary of the layers and parameter counts of the created model
model.summary()

# Train on the TimeseriesGenerators created earlier. Model.fit accepts generators directly; fit_generator is deprecated.
# shuffle=False keeps the batches in chronological order.
# NOTE(review): the test generator doubles as the validation set, so early stopping is tuned on the same data used for the test predictions.
history = model.fit(train_generator, validation_data=test_generator, epochs=100, shuffle=False, callbacks=[early_stopping])


In [None]:
# Optional persistence of the model.
# use this line to save the model to local or specific directory
# model.save('LSTM_ManualSplit_model.h5')

# use this line to load a saved model from local or specific directory
# NOTE(review): this statement is active, so running the cell replaces the `model` trained in the previous cell
# with whatever is stored in 'LSTM_Model.h5'; the predictions below then come from the loaded model, while the
# loss plot still shows `history` from the training run above. Comment this line out to keep the freshly trained model.
# NOTE(review): the file name differs from the one in the commented-out save line above ('LSTM_ManualSplit_model.h5') - confirm which file is intended.
model = keras.models.load_model('LSTM_Model.h5')

In [None]:
# Plot how the loss evolved over the training epochs, for both the training data and the validation data.
# What we want to see is both curves decreasing as the epochs progress.
# note: a validation loss that trends upward over multiple runs means the model is overfitting and its structure should be tweaked
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()

## Model Predictions

In [None]:
# Predict values outside of the training set with the model. The test_generator created earlier supplies the
# input windows; Model.predict accepts the generator directly (predict_generator is deprecated).
test_predictions = model.predict(test_generator)
# Put the predicted (scaled) VTI values in the first column, followed by the scaled indicator columns of the same rows.
# The first window_length rows are skipped because they have no prediction. This restores the column layout the
# scaler was fit on, so the frame can be passed to scaler.inverse_transform later.
df_test_pred = pd.concat([pd.DataFrame(test_predictions), pd.DataFrame(scaled_X_test[:,1:][window_length:])], axis=1)

In [None]:
# Run the model on the training data as well; comparing these predictions with the real values later on helps to
# see whether overfitting occurred. Model.predict accepts the generator directly (predict_generator is deprecated).
train_predictions = model.predict(train_generator)
# Put the predicted (scaled) VTI values in the first column, followed by the scaled indicator columns of the same rows.
# The first window_length rows are skipped because they have no prediction. This restores the column layout the
# scaler was fit on, so the frame can be passed to scaler.inverse_transform later.
df_train_pred = pd.concat([pd.DataFrame(train_predictions), pd.DataFrame(scaled_X_train[:,1:][window_length:])], axis=1)

In [None]:
# Undo the min-max scaling so the predictions are expressed in original/real-world values that can be visualized and compared.
# The test frame is converted first, then the train frame.
rev_trans_test, rev_trans_train = (
    scaler.inverse_transform(scaled_frame) for scaled_frame in (df_test_pred, df_train_pred)
)

In [None]:
# Take an independent copy of the last rows of the original dataframe, one row per test prediction, to attach the predicted values to
df_final_test = VTIDF[-len(test_predictions):].copy()

In [None]:
# Copy the rows of the original dataframe that the train predictions belong to, one row per prediction.
# The generator needs window_length earlier rows for every sample, so the first prediction belongs to row
# window_length, not row 0. Starting the slice at 0 would shift every prediction window_length rows too early.
df_final_train = VTIDF[window_length:window_length + train_predictions.shape[0]].copy()

In [None]:
# Attach the un-scaled VTI predictions (first column of the inverse-transformed array) to the test dataframe,
# then remove the indicator columns so only the actual and the predicted VTI remain
df_final_test['VTI_Pred'] = rev_trans_test[:, 0]
df_final_test.drop(
    columns=[
        '10-Year Treasury Constant Maturity Rate',
        'Inflation Expectations',
        'Consumer Sentiment & Consumer Confidence',
        'Advance Retail Sales: Retail Trade',
        'Unemployment Rate',
    ],
    inplace=True,
)
df_final_test

In [None]:
# Attach the un-scaled VTI predictions (first column of the inverse-transformed array) to the training dataframe,
# then remove the indicator columns so only the actual and the predicted VTI remain
df_final_train['VTI_Pred'] = rev_trans_train[:, 0]
df_final_train.drop(
    columns=[
        '10-Year Treasury Constant Maturity Rate',
        'Inflation Expectations',
        'Consumer Sentiment & Consumer Confidence',
        'Advance Retail Sales: Retail Trade',
        'Unemployment Rate',
    ],
    inplace=True,
)
df_final_train

## Visualizations

In [None]:
# Plot the test set: the actual 'VTI' column against the predicted 'VTI_Pred' column, over the frame's date index
df_final_test.plot()

In [None]:
# Plot the training set: the actual 'VTI' column against the predicted 'VTI_Pred' column, over the frame's date index
df_final_train.plot()

In [None]:
# Draw the complete VTI series together with both prediction sets on one figure.
# Note: every prediction needs one previous data point as input, which is why there is a gap between the two prediction sets
plt.figure(figsize=(16, 9))

plt.title('Model')
plt.xlabel('Date')
plt.ylabel('Price')

plt.plot(VTIDF['VTI'], label='Actual Value')
plt.plot(df_final_train['VTI_Pred'], label='Predicted Training Value')
plt.plot(df_final_test['VTI_Pred'], label='Predicted Test Value')

plt.legend(loc='lower right')

plt.show()