In [36]:
#!pip install hvplot

In [37]:
# Imports to get show started

import numpy as np
import pandas as pd
import hvplot.pandas
from numpy.random import seed
from pathlib import Path

# Import required preprocessing and Keras modules
from sklearn.preprocessing import MinMaxScaler
from tensorflow import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [38]:
# TODO after model experimentation
# Define random seed for reproducibility

# seed(1)
# random.set_seed(2)

In [39]:
# Read in prepared model dataset created in data_prep notebook 

model_df = pd.read_csv(Path('./ModelData/model_dataset.csv'),index_col="Date_Time",infer_datetime_format=True,parse_dates=True)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 37337 entries, 2016-10-06 13:00:00 to 2021-01-15 00:00:00
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Close                 37337 non-null  float64
 1   Volume                37337 non-null  float64
 2   US_Holiday            37337 non-null  int64  
 3   US_Market_Open        37337 non-null  int64  
 4   Trail24hr_CloseRatio  37337 non-null  float64
 5   Trail12Wk_CloseRatio  37337 non-null  float64
 6   Trail52Wk_CloseRatio  37337 non-null  float64
 7   Hr_Return             37337 non-null  float64
 8   Trail24hr_Return      37337 non-null  float64
 9   Trail24hr_Std         37337 non-null  float64
 10  Trail12Wk_Return      37337 non-null  float64
 11  Trail12Wk_Std         37337 non-null  float64
 12  Trail52Wk_Return      37337 non-null  float64
 13  Trail52Wk_Std         37337 non-null  float64
 14  Trail24hr_VolRatio    37337 non-nul

In [40]:
# Last minute pruning of unwanted columns
#  for now remove volume calculations until data_prep notebook div0 issue resolved
#  remove US holiday, 12week numbers to reduce model fittin time for initial model evals

column_2drop_list = ['Close',
                     'Volume',
                     'US_Holiday',
                     'Trail12Wk_CloseRatio',
                     'Trail12Wk_Return',
                     'Trail12Wk_Std',
                     'Trail12Wk_VolRatio',
                     'Vol_PctDelta']
model_df = model_df.drop(columns=column_2drop_list)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 37337 entries, 2016-10-06 13:00:00 to 2021-01-15 00:00:00
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   US_Market_Open        37337 non-null  int64  
 1   Trail24hr_CloseRatio  37337 non-null  float64
 2   Trail52Wk_CloseRatio  37337 non-null  float64
 3   Hr_Return             37337 non-null  float64
 4   Trail24hr_Return      37337 non-null  float64
 5   Trail24hr_Std         37337 non-null  float64
 6   Trail52Wk_Return      37337 non-null  float64
 7   Trail52Wk_Std         37337 non-null  float64
 8   Trail24hr_VolRatio    37337 non-null  float64
 9   Trail52Wk_VolRatio    37337 non-null  float64
 10  Significant_Drawdown  37337 non-null  int64  
dtypes: float64(9), int64(2)
memory usage: 3.4 MB


In [41]:
def data_window_chopper(df, window_len, feature_col_numlist, target_col_num):
    """
    Function chops up dataframe features (X) defined by column numbers
    in feature_col_numlist and target (y) values defined by t_col_num
    with a rolling window of length window_len.
    """
    X = []
    y = []
    for i in range(len(df) - window_len):
        feature_list = []
        for feature_col_num in feature_col_numlist:
            feature_list.append(df.iloc[i:(i + window_len), feature_col_num])
        X.append(feature_list)
        y.append(df.iloc[(i + window_len), target_col_num])
    return np.array(X).reshape(-1,(len(feature_col_numlist)*window_len)), np.array(y).reshape(-1, 1)
#    return np.array(X), np.array(y).reshape(-1, 1)

In [42]:
# Create features (X) and target (y) data window sets

window_size = 12 # model dataset is hourly, try half day for starters
feature_col_numlist = list(range(model_df.shape[1]-1))
target_col_num = (model_df.shape[1] - 1) # 0s based column index
X, y = data_window_chopper(model_df, window_size, feature_col_numlist, target_col_num)

print(f"X sample values:\n {X[:2]}")

X sample values:
 [[ 1.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   9.98548550e-01  9.99820584e-01  1.00000000e+00  9.99135653e-01
   9.98515933e-01  9.98222382e-01  9.98532242e-01  9.98401774e-01
   9.98059297e-01  9.97749437e-01  9.97749437e-01  9.96738315e-01
   7.76475810e-01  7.77363515e-01  7.77604464e-01  7.76932344e-01
   7.76450447e-01  7.76222180e-01  7.76463129e-01  7.76361676e-01
   7.76095365e-01  7.75854416e-01  7.75854416e-01  7.75068163e-01
   2.28702115e-04  1.14324911e-03  3.09956117e-04 -8.64346521e-04
  -6.20256264e-04 -2.93987946e-04  3.10411868e-04 -1.30659176e-04
  -3.43025155e-04 -3.10462589e-04  0.00000000e+00 -1.01340307e-03
  -1.56155081e-03 -3.04155751e-04  1.36267765e-04  3.64728290e-04
   1.40996849e-03  6.58022709e-04  1.68775319e-03  3.46460063e-04
   3.43490809e-06 -3.07027681e-04 -8.78932451e-04 -1.38608

In [43]:
# Split 70% of the data for training, 30% for testing
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]

In [44]:
# Use the MinMaxScaler to scale data between 0 and 1.
scaler = MinMaxScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# target is already boolean, doesnt need scaling
#scaler.fit(y)
#y_train = scaler.transform(y_train)
#y_test = scaler.transform(y_test)

In [45]:
# Keras LSTM API requires features data as a vertical vector

# reshape training and test data
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

print (f"X_train sample values:\n{X_train[:2]} \n")
#print (f"X_test sample values:\n{X_test[:5]}")

X_train sample values:
[[[1.        ]
  [1.        ]
  [1.        ]
  [1.        ]
  [0.        ]
  [0.        ]
  [0.        ]
  [0.        ]
  [0.        ]
  [0.        ]
  [0.        ]
  [0.        ]
  [0.99682171]
  [0.99960713]
  [1.        ]
  [0.99810731]
  [0.99675028]
  [0.99610748]
  [0.99678599]
  [0.99650031]
  [0.99575037]
  [0.99507186]
  [0.99507186]
  [0.99285777]
  [0.73451927]
  [0.7355736 ]
  [0.73585978]
  [0.7350615 ]
  [0.73448915]
  [0.73421803]
  [0.73450421]
  [0.73438371]
  [0.73406741]
  [0.73378124]
  [0.73378124]
  [0.7328474 ]
  [0.45997363]
  [0.46242816]
  [0.46019171]
  [0.45704002]
  [0.45769513]
  [0.45857079]
  [0.46019293]
  [0.45900915]
  [0.45843918]
  [0.45852658]
  [0.45935982]
  [0.45663997]
  [0.64007373]
  [0.64147583]
  [0.64196694]
  [0.6422217 ]
  [0.64338723]
  [0.64254874]
  [0.64369698]
  [0.64220133]
  [0.64181882]
  [0.64147263]
  [0.64083491]
  [0.6402694 ]
  [0.00765889]
  [0.00824139]
  [0.00827463]
  [0.00809023]
  [0.00701009]
  

In [46]:
# LSTM RNN model definition

model = Sequential()
dropout_fraction = 0.25

# Layer 1
model.add(LSTM(
    units=window_size,
    return_sequences=True,
    input_shape=(X_train.shape[1], 1))
    )
model.add(Dropout(dropout_fraction))
# Layer 2
model.add(LSTM(units=window_size, return_sequences=True))
model.add(Dropout(dropout_fraction))
# Layer 3
model.add(LSTM(units=window_size))
model.add(Dropout(dropout_fraction))
# Output layer
model.add(Dense(1))

In [47]:
# Model compilation and summary

# the output value is not continuous rather boolean so different loss parameter 
model.compile(loss="binary_crossentropy", optimizer = "adam", metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 120, 12)           672       
_________________________________________________________________
dropout_3 (Dropout)          (None, 120, 12)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 120, 12)           1200      
_________________________________________________________________
dropout_4 (Dropout)          (None, 120, 12)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 12)                1200      
_________________________________________________________________
dropout_5 (Dropout)          (None, 12)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

In [48]:
# Training time!
model.fit(X_train, y_train, epochs=5, shuffle = False, batch_size=1, verbose=1)

Epoch 1/5
  886/26127 [>.............................] - ETA: 28:23 - loss: 8.9903e-06 - accuracy: 1.0000

KeyboardInterrupt: 

In [None]:
# Evaluate model performance with test data

model.evaluate(X_test, y_test)