In [17]:
#!pip install hvplot

In [18]:
# Imports to get show started

import numpy as np
import pandas as pd
import hvplot.pandas
from numpy.random import seed
from pathlib import Path

# Import required preprocessing and Keras modules
from sklearn.preprocessing import MinMaxScaler
from tensorflow import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [19]:
# TODO after model experimentation
# Define random seed for reproducibility

# seed(1)
# random.set_seed(2)

In [20]:
# Read in prepared model dataset created in data_prep notebook 

model_df = pd.read_csv(Path('./ModelData/model_dataset.csv'),index_col="Date_Time",infer_datetime_format=True,parse_dates=True)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35513 entries, 2017-01-01 00:00:00 to 2021-01-15 00:00:00
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Close                 35513 non-null  float64
 1   Volume                35513 non-null  float64
 2   US_Holiday            35513 non-null  float64
 3   US_Market_Open        35513 non-null  float64
 4   Trail24hr_CloseRatio  35513 non-null  float64
 5   Trail12Wk_CloseRatio  35513 non-null  float64
 6   Trail52Wk_CloseRatio  35513 non-null  float64
 7   Hr_Return             35513 non-null  float64
 8   Trail24hr_Return      35513 non-null  float64
 9   Trail24hr_Std         35513 non-null  float64
 10  Trail12Wk_Return      35513 non-null  float64
 11  Trail12Wk_Std         35513 non-null  float64
 12  Trail52Wk_Return      35513 non-null  float64
 13  Trail52Wk_Std         35513 non-null  float64
 14  Trail24hr_VolRatio    35513 non-nul

In [21]:
# Last minute pruning of unwanted columns
#  remove Close price, US holiday
column_2drop_list = ['Close',
                     'Volume',
                     'US_Holiday']

# old: remove 12week numbers to reduce model fittin time for initial model evals
#column_2drop_list = ['Close',
#                     'US_Holiday',
#                     'Trail12Wk_CloseRatio',
#                     'Trail12Wk_Return',
#                     'Trail12Wk_Std',
#                     'Trail12Wk_VolRatio',
#                     'Vol_PctDelta']

model_df = model_df.drop(columns=column_2drop_list)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35513 entries, 2017-01-01 00:00:00 to 2021-01-15 00:00:00
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   US_Market_Open        35513 non-null  float64
 1   Trail24hr_CloseRatio  35513 non-null  float64
 2   Trail12Wk_CloseRatio  35513 non-null  float64
 3   Trail52Wk_CloseRatio  35513 non-null  float64
 4   Hr_Return             35513 non-null  float64
 5   Trail24hr_Return      35513 non-null  float64
 6   Trail24hr_Std         35513 non-null  float64
 7   Trail12Wk_Return      35513 non-null  float64
 8   Trail12Wk_Std         35513 non-null  float64
 9   Trail52Wk_Return      35513 non-null  float64
 10  Trail52Wk_Std         35513 non-null  float64
 11  Trail24hr_VolRatio    35513 non-null  float64
 12  Trail12Wk_VolRatio    35513 non-null  float64
 13  Trail52Wk_VolRatio    35513 non-null  float64
 14  Vol_PctDelta          35513 non-nul

In [22]:
def data_window_chopper(df, window_len, feature_col_numlist, target_col_num):
    """
    Function chops up dataframe features (X) defined by column numbers
    in feature_col_numlist and target (y) values defined by t_col_num
    with a rolling window of length window_len.
    """
    X = []
    y = []
    for i in range(len(df) - window_len):
        feature_list = []
        for feature_col_num in feature_col_numlist:
            feature_list.append(df.iloc[i:(i + window_len), feature_col_num])
        X.append(feature_list)
        y.append(df.iloc[(i + window_len), target_col_num])
    return np.array(X).reshape(-1,(len(feature_col_numlist)*window_len)), np.array(y).reshape(-1, 1)
#    return np.array(X), np.array(y).reshape(-1, 1)

In [23]:
# Create features (X) and target (y) data window sets

window_size = 12 # model dataset is hourly, try half day for starters
feature_col_numlist = list(range(model_df.shape[1]-1))
target_col_num = (model_df.shape[1] - 1) # 0s based column index
X, y = data_window_chopper(model_df, window_size, feature_col_numlist, target_col_num)

print(f"X sample values:\n {X[:2]}")

X sample values:
 [[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   9.97383408e-01  9.95895542e-01  9.95218306e-01  9.93073726e-01
   9.93073726e-01  9.91801344e-01  9.93073726e-01  9.93073726e-01
   9.93073726e-01  9.94828382e-01  9.99989739e-01  1.00000000e+00
   9.89343186e-01  9.87867314e-01  9.87195538e-01  9.85068246e-01
   9.85068246e-01  9.83806121e-01  9.85068246e-01  9.85068246e-01
   9.85068246e-01  9.86808758e-01  9.91928507e-01  9.94585076e-01
   9.89343186e-01  9.87867314e-01  9.87195538e-01  9.85068246e-01
   9.85068246e-01  9.83806121e-01  9.85068246e-01  9.85068246e-01
   9.85068246e-01  9.86808758e-01  9.91928507e-01  9.94585076e-01
  -2.61659227e-03 -1.49176955e-03 -6.80026789e-04 -2.15488354e-03
   0.00000000e+00 -1.28125646e-03  1.28290018e-03  0.00000000e+00
   0.00000000e+00  1.76689399e-03  5.18818785e-03  2.67818

In [24]:
# Split 70% of the data for training, 30% for testing
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]

In [25]:
# Use the MinMaxScaler to scale data between 0 and 1.
scaler = MinMaxScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# target is already boolean, doesnt need scaling
#scaler.fit(y)
#y_train = scaler.transform(y_train)
#y_test = scaler.transform(y_test)

In [26]:
# Keras LSTM API requires features data as a vertical vector

# reshape training and test data
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

print (f"X_train sample values:\n{X_train[:2]} \n")
#print (f"X_test sample values:\n{X_test[:5]}")

X_train sample values:
[[[0.00000000e+00]
  [0.00000000e+00]
  [0.00000000e+00]
  [0.00000000e+00]
  [0.00000000e+00]
  [0.00000000e+00]
  [0.00000000e+00]
  [0.00000000e+00]
  [0.00000000e+00]
  [0.00000000e+00]
  [0.00000000e+00]
  [0.00000000e+00]
  [9.89981801e-01]
  [9.84285178e-01]
  [9.81692232e-01]
  [9.73481237e-01]
  [9.73481237e-01]
  [9.68609642e-01]
  [9.73481237e-01]
  [9.73481237e-01]
  [9.73481237e-01]
  [9.80199324e-01]
  [9.99960713e-01]
  [1.00000000e+00]
  [9.84769243e-01]
  [9.82659922e-01]
  [9.81699817e-01]
  [9.78659484e-01]
  [9.78659484e-01]
  [9.76855651e-01]
  [9.78659484e-01]
  [9.78659484e-01]
  [9.78659484e-01]
  [9.81147029e-01]
  [9.88464193e-01]
  [9.92260972e-01]
  [9.87342852e-01]
  [9.85589952e-01]
  [9.84792080e-01]
  [9.82265486e-01]
  [9.82265486e-01]
  [9.80766454e-01]
  [9.82265486e-01]
  [9.82265486e-01]
  [9.82265486e-01]
  [9.84332699e-01]
  [9.90413450e-01]
  [9.93568670e-01]
  [4.80061731e-01]
  [4.84087267e-01]
  [4.86992347e-01]
  [4.817

In [27]:
# LSTM RNN model definition

model = Sequential()
dropout_fraction = 0.25

# Layer 1
model.add(LSTM(
    units=window_size,
    return_sequences=True,
    input_shape=(X_train.shape[1], 1))
    )
model.add(Dropout(dropout_fraction))
# Layer 2
model.add(LSTM(units=window_size, return_sequences=True))
model.add(Dropout(dropout_fraction))
# Layer 3
model.add(LSTM(units=window_size))
model.add(Dropout(dropout_fraction))
# Output layer
model.add(Dense(1))

In [28]:
# Model compilation and summary

# the output value is not continuous rather boolean so different loss parameter 
model.compile(loss="binary_crossentropy", optimizer = "adam", metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 240, 12)           672       
_________________________________________________________________
dropout_3 (Dropout)          (None, 240, 12)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 240, 12)           1200      
_________________________________________________________________
dropout_4 (Dropout)          (None, 240, 12)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 12)                1200      
_________________________________________________________________
dropout_5 (Dropout)          (None, 12)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

In [29]:
# Training time!
model.fit(X_train, y_train, epochs=5, shuffle = False, batch_size=1, verbose=1)

Epoch 1/5
 1072/24850 [>.............................] - ETA: 45:12 - loss: 3.8993e-06 - accuracy: 1.0000

KeyboardInterrupt: 

In [None]:
# Evaluate model performance with test data

model.evaluate(X_test, y_test)