In [120]:
#!pip install hvplot

In [121]:
# Imports to get show started

import numpy as np
import pandas as pd
import hvplot.pandas
from numpy.random import seed
from pathlib import Path

# Import required preprocessing and Keras modules
from sklearn.preprocessing import MinMaxScaler
from tensorflow import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [122]:
# TODO after model experimentation
# Define random seed for reproducibility

# seed(1)
# random.set_seed(2)

In [123]:
# Read in prepared model dataset created in data_prep notebook 

model_df = pd.read_csv(Path('./ModelData/model_dataset.csv'),index_col="Date_Time",infer_datetime_format=True,parse_dates=True)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35513 entries, 2017-01-01 00:00:00 to 2021-01-15 00:00:00
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Close                 35513 non-null  float64
 1   Volume                35513 non-null  float64
 2   US_Holiday            35513 non-null  float64
 3   US_Market_Open        35513 non-null  float64
 4   Trail24hr_CloseRatio  35513 non-null  float64
 5   Trail12Wk_CloseRatio  35513 non-null  float64
 6   Trail52Wk_CloseRatio  35513 non-null  float64
 7   Hr_Return             35513 non-null  float64
 8   Trail24hr_Return      35513 non-null  float64
 9   Trail24hr_Std         35513 non-null  float64
 10  Trail12Wk_Return      35513 non-null  float64
 11  Trail12Wk_Std         35513 non-null  float64
 12  Trail52Wk_Return      35513 non-null  float64
 13  Trail52Wk_Std         35513 non-null  float64
 14  Trail24hr_VolRatio    35513 non-nul

In [124]:
# Last minute pruning of unwanted columns
#  remove Close price, US holiday
column_2drop_list = ['Close',
                     'Volume',
                     'US_Holiday']

# old: remove 12week numbers to reduce model fittin time for initial model evals
#column_2drop_list = ['Close',
#                     'US_Holiday',
#                     'Trail12Wk_CloseRatio',
#                     'Trail12Wk_Return',
#                     'Trail12Wk_Std',
#                     'Trail12Wk_VolRatio',
#                     'Vol_PctDelta']

model_df = model_df.drop(columns=column_2drop_list)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35513 entries, 2017-01-01 00:00:00 to 2021-01-15 00:00:00
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   US_Market_Open        35513 non-null  float64
 1   Trail24hr_CloseRatio  35513 non-null  float64
 2   Trail12Wk_CloseRatio  35513 non-null  float64
 3   Trail52Wk_CloseRatio  35513 non-null  float64
 4   Hr_Return             35513 non-null  float64
 5   Trail24hr_Return      35513 non-null  float64
 6   Trail24hr_Std         35513 non-null  float64
 7   Trail12Wk_Return      35513 non-null  float64
 8   Trail12Wk_Std         35513 non-null  float64
 9   Trail52Wk_Return      35513 non-null  float64
 10  Trail52Wk_Std         35513 non-null  float64
 11  Trail24hr_VolRatio    35513 non-null  float64
 12  Trail12Wk_VolRatio    35513 non-null  float64
 13  Trail52Wk_VolRatio    35513 non-null  float64
 14  Vol_PctDelta          35513 non-nul

In [125]:
def data_window_chopper(df, window_len, feature_col_numlist, target_col_num):
    """
    Function chops up dataframe features (X) defined by column numbers
    in feature_col_numlist and target (y) values defined by t_col_num
    with a rolling window of length window_len.
    """
    X = []
    y = []
    for i in range(len(df) - window_len):
        feature_list = []
        for feature_col_num in feature_col_numlist:
            feature_list.append(df.iloc[i:(i + window_len), feature_col_num])
        X.append(feature_list)
        y.append(df.iloc[(i + window_len), target_col_num])
    return np.array(X).reshape(-1,(len(feature_col_numlist)*window_len)), np.array(y).reshape(-1, 1)

In [126]:
# Create features (X) and target (y) data window sets

window_size = 24 # model dataset is hourly, try full day
feature_col_numlist = list(range(model_df.shape[1]-1))
target_col_num = (model_df.shape[1] - 1) # 0s based column index
X, y = data_window_chopper(model_df, window_size, feature_col_numlist, target_col_num)

print(f"X sample values:\n {X[:2]}")

X sample values:
 [[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   9.97383408e-01  9.95895542e-01  9.95218306e-01  9.93073726e-01
   9.93073726e-01  9.91801344e-01  9.93073726e-01  9.93073726e-01
   9.93073726e-01  9.94828382e-01  9.99989739e-01  1.00000000e+00
   9.98178376e-01  1.00000000e+00  1.00000000e+00  1.00000000e+00
   1.00000000e+00  9.98475212e-01  1.00000000e+00  9.99509215e-01
   1.00000000e+00  1.00000000e+00  9.98674692e-01  9.95047532e-01
   9.89343186e-01  9.87867314e-01  9.87195538e-01  9.85068246e-01
   9.85068246e-01  9.83806121e-01  9.85068246e-01  9.85068246e-01
   9.85068246e-01  9.86808758e-01  9.91928507e-01  9.94585

In [127]:
# Split 70% of the data for training, 30% for testing
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]

In [128]:
# Use the MinMaxScaler to scale data between 0 and 1.
scaler = MinMaxScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# target is already boolean, doesnt need scaling

In [129]:
# Keras LSTM API requires features data as a vertical vector

# reshape training and test data
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

#print (f"X_train sample values:\n{X_train[:2]} \n")
#print (f"X_test sample values:\n{X_test[:5]}")

In [130]:
# LSTM RNN model definition

model = Sequential()
dropout_fraction = 0.4

# Layer 1
model.add(LSTM(
    units=window_size,
    return_sequences=True,
    activation="sigmoid",
    input_shape=(X_train.shape[1], 1))
    )
model.add(Dropout(dropout_fraction))
# Layer 2
model.add(LSTM(units=window_size, return_sequences=True))
model.add(Dropout(dropout_fraction))
# Layer 3
model.add(LSTM(units=window_size, return_sequences=True))
model.add(Dropout(dropout_fraction))
# Output layer
model.add(Dense(1,activation="sigmoid"))

In [131]:
# Model compilation and summary

# the output value is not continuous rather boolean so different loss parameter 
model.compile(loss="binary_crossentropy", optimizer = "adam", metrics=['accuracy'])

model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_29 (LSTM)               (None, 480, 24)           2496      
_________________________________________________________________
dropout_29 (Dropout)         (None, 480, 24)           0         
_________________________________________________________________
lstm_30 (LSTM)               (None, 480, 24)           4704      
_________________________________________________________________
dropout_30 (Dropout)         (None, 480, 24)           0         
_________________________________________________________________
lstm_31 (LSTM)               (None, 480, 24)           4704      
_________________________________________________________________
dropout_31 (Dropout)         (None, 480, 24)           0         
_________________________________________________________________
dense_8 (Dense)              (None, 480, 1)           

In [132]:
# Training time!
model.fit(X_train, y_train, epochs=10, shuffle = False, batch_size=2, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

KeyboardInterrupt: 

In [None]:
# Save the model state for later finish or evaluation
model.save("./Model2")



In [None]:
# Evaluate model performance with test data

print("/n*** All done with training...Model2 saved! ***/n")
print("Please head over to eval_model2 notebook to reload model and run test data for eval and plot generation...")