## Set Up

In [0]:
#Initialize TPU
import os
import tensorflow as tf

# The TPU worker address is read from the COLAB_TPU_ADDR environment variable.
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
# NOTE(review): `strategy` is not referenced again in the visible cells - confirm
# whether the model was meant to be built inside `strategy.scope()`.
strategy = tf.distribute.experimental.TPUStrategy(resolver)

In [0]:
#Code to install necessary libraries and perform authorization
# Installs google-drive-ocamlfuse from the alessandro-strada PPA, authorizes it
# with the Colab user's credentials, mounts it at ./drive and changes into the
# project folder.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# NOTE(review): with `2>&1 > /dev/null` stderr is pointed at the original stdout
# before stdout is redirected, so only stdout is silenced - confirm that is intended.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First pass: run headless and keep only the output line containing "URL".
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# The verification code is typed in without being echoed.
vcode = getpass.getpass()
# NOTE(review): the code and the client secret are interpolated into a shell
# command line here; treat this cell's output as sensitive.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

#Mount google drive
!mkdir -p drive 
!google-drive-ocamlfuse drive

#Path
# All relative file paths used afterwards resolve against this folder.
import os
os.chdir('/content/drive/sp500AlgoTrading_Classification')

## Data Exploration
The training and validation data consist of intraday, minute-by-minute data from 09/14/2009 to 4/24/2020.

In [0]:
import pandas as pd
import numpy as np
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from numpy import array

In [0]:
#Load the training/validation data
train_val_csv = 'SPYTrainValData.csv'
df = pd.read_csv(train_val_csv)
df.head()

In [0]:
#Reverse data so oldest date is first
#Order by index, then flip the row order in one step
df = df.sort_index(ascending=True, axis=0).iloc[::-1]
df.head()

## Data Exploration - Interactive Price Graph

In [0]:
#Interactive S&P500 Price Graph

#Quick-zoom buttons shown above the x axis
range_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="YTD", step="year", stepmode="todate"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(step="all"),
]

fig = px.line(df, x='Time', y='Last', title='S&P500 Price Evolution', height=800)
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(buttons=range_buttons),
)
fig.show()

## Data Exploration - Interactive Volume Graph

In [0]:
#Interactive S&P500 Volume Graph

#Quick-zoom buttons shown above the x axis
range_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="YTD", step="year", stepmode="todate"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(step="all"),
]

fig = px.line(df, x='Time', y='Volume', title='S&P500 Volume Evolution', height=800)
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(buttons=range_buttons),
)
fig.show()

## Data Processing - Creating a label
Let the current period be A and the next period be B. If B's change is less than or equal to 0, we assign 0; if it is greater than 0, we assign 1. This integer becomes A's label, so each period's label gives the classification of the next period's price movement. This requires removing the latest period in the dataset, because that period has no classification yet (the next period has not occurred) and is what we attempt to predict.

In [0]:
#Change the "Change" column to binary
#1 where the change is strictly positive, 0 everywhere else
df['Change'] = np.where(df['Change'] > 0, 1, 0)
df.tail()

In [0]:
#Shift the "Change" column up
#Each row now holds the value that belonged to the row after it; the last row becomes NaN
shifted_change = df['Change'].shift(periods=-1)
df['Change'] = shifted_change
df.tail()

In [0]:
#Change name from "Change" to "Target" and drop last row
final_row = df.tail(1).index
df = df.drop(final_row)
df['Target'] = df['Change']
kept_columns = ['Time', 'Open', 'High', 'Low', 'Last', 'Volume', 'Target']
df = df[kept_columns]
df.tail()

In [0]:
#Label frequency
#Pie chart of how many rows carry each label
target = df["Target"]
labels = ("0", "1")
colors = ["red", "green"]
sizes = [sum(target == 0), sum(target == 1)]
plt.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
plt.axis('equal')
plt.show()

## Data Processing - Feature Engineering

Implement three different technical indicators as predictive features. We use one trend-following, one momentum, and one volume indicator in order to create a multi-indicator strategy. These features are created using the Technical Analysis Library (`ta`).
- trend following: bollinger bands
- momentum: relative strength index
- volume: on balance volume

In [0]:
# Install and import the Technical Analysis library used for the indicator features.
# NOTE(review): no version is pinned, so this installs whatever `ta` release is current.
!pip install ta
import ta 

In [0]:
#Initialize Bollinger Bands
#The 20-period window and the 2-deviation width are passed positionally: the
#keyword names for these two arguments differ between `ta` releases (`n`/`ndev`
#in older ones, `window`/`window_dev` in newer ones), and the install above is
#unpinned, so keywords would tie this cell to one release.
indicator_bb = ta.volatility.BollingerBands(df["Last"], 20, 2)

#Add Bollinger Bands features
df['bbMAV'] = indicator_bb.bollinger_mavg()
df['bbHiBand'] = indicator_bb.bollinger_hband()
df['bbLoBand'] = indicator_bb.bollinger_lband()

In [0]:
#Initialize Relative Strength Index
#The 14-period window is passed positionally because its keyword name differs
#between `ta` releases (`n` in older ones, `window` in newer ones) and the
#install above is unpinned.
indicator_rsi = ta.momentum.RSIIndicator(df["Last"], 14, fillna = False)

#Add Relative Strength Index feature
df['RSI'] = indicator_rsi.rsi()

In [0]:
#Initialize On Balance Volume
indicator_obv = ta.volume.OnBalanceVolumeIndicator(
    close=df['Last'],
    volume=df['Volume'],
    fillna=False,
)

#Add On Balance Volume feature
obv_values = indicator_obv.on_balance_volume()
df['OBV'] = obv_values

In [0]:
#Preview the first rows now that the indicator columns have been added
df.head(n=5)

## Data Processing - Cleaning Data
Break the time column apart into 6 separate columns; each column holds a piece of the date or time.

In [0]:
#Parse the "Time" strings once, then copy each calendar/clock component into its own column
df['Date'] = pd.to_datetime(df['Time'], yearfirst = True, format = '%m/%d/%Y %H:%M')
for column, component in (('Year', 'year'), ('Month', 'month'), ('Day', 'day'), ('Hour', 'hour'), ('Minute', 'minute')):
    df[column] = getattr(df['Date'].dt, component)

In [0]:
#Drop "Time" column for further analysis
#Keep the date parts, the price/volume columns, the indicators and the label, in this order
model_columns = [
    'Year', 'Month', 'Day', 'Hour', 'Minute',
    'Open', 'High', 'Low', 'Last', 'Volume',
    'bbMAV', 'bbHiBand', 'bbLoBand', 'RSI', 'OBV',
    'Target',
]
df = df[model_columns]
df.head()

## Data Preparation for Model

In [0]:
#Preview the frame that goes into scaling
df.head(n=5)

In [0]:
#Scale values to between 0 and 1 for easier training
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#The indicators were built with fillna off, so their warm-up rows can hold NaN.
#MinMaxScaler ignores NaN while fitting and leaves it in place when transforming,
#so those rows are dropped here to keep NaN out of the training windows.
df = df.dropna()
#NOTE(review): the scaler is fit on every row (training and validation alike) and
#on the Target column as well; fitting on the training slice only would avoid
#leaking validation statistics.
scaledFeatures =  MinMaxScaler().fit_transform(df)
#Rebuild the frame with the same column names and a fresh 0..n-1 index
df = pd.DataFrame(scaledFeatures, columns = df.columns)

In [0]:
#Look at min and max
#Summary statistics per column of the scaled frame
df.describe()

In [0]:
#Plain numpy array of the scaled frame, one row per period
dataset = df.to_numpy()
print(dataset)
print(dataset.shape[0])

In [0]:
#Split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
    """Cut a 2-D array into overlapping windows of ``n_steps`` consecutive rows.

    Every window contributes one sample: its input is all columns but the last
    for each row of the window, and its output is the last column of the
    window's final row.

    Returns a pair ``(X, y)`` of numpy arrays built from those samples.
    """
    total_rows = len(sequences)
    # A window starting at row `start` covers rows start .. start + n_steps - 1,
    # so only starts whose window stays inside the data are used.
    n_windows = min(total_rows, total_rows - n_steps + 1)
    starts = range(n_windows)
    windows = [sequences[start:start + n_steps, :-1] for start in starts]
    targets = [sequences[start + n_steps - 1, -1] for start in starts]
    return array(windows), array(targets)

In [0]:
#Choose a number of time steps
n_steps = 5
#Split into samples
X, y = split_sequences(sequences=dataset, n_steps=n_steps)

print(X)
print(y)

In [0]:
#Reshape from [samples, timesteps, features] into [samples, batch, timesteps, features]
n_features = 15
batch = 1
n_steps = 5
target_shape = (X.shape[0], batch, n_steps, n_features)
X = X.reshape(target_shape)
print(X.shape)
print(len(X))
print(len(y))

In [0]:
#Creating train and validation sets
#The first lenTrain windows are used for training, everything after them for validation
lenTrain= 1038000
X_train, y_train = X[:lenTrain], y[:lenTrain]
X_val, y_val = X[lenTrain:], y[lenTrain:]

## Model - Building
Academic literature suggests that a combination of LSTMs and CNNs provides the best results for time series classification.

In [0]:
# Packages needed for writing model files in HDF5 format (per the inline note).
!pip install pyyaml h5py  # Required to save models in HDF5 format

In [0]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Conv1D
from keras.layers import TimeDistributed
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [0]:
#CNN + LSTM binary classifier.
#Input is [batch, n_steps, n_features] per sample; the convolution stack is applied
#to each `batch` slice through TimeDistributed, flattened, and fed to the LSTMs.
model = Sequential()
model.add(TimeDistributed(Conv1D(filters = 128, kernel_size = 8, padding='same', activation = 'relu', kernel_initializer='he_uniform'), input_shape=(batch, n_steps, n_features)))
model.add(TimeDistributed(Conv1D(filters = 256, kernel_size = 8, padding='same', activation = 'relu', kernel_initializer='he_uniform')))
model.add(TimeDistributed(Conv1D(filters = 128, kernel_size = 8, padding='same', activation = 'relu', kernel_initializer='he_uniform')))
model.add(TimeDistributed(MaxPooling1D(pool_size = 2, data_format = 'channels_last')))
model.add(TimeDistributed(Flatten()))
model.add(LSTM(64, activation='relu', dropout = .8, return_sequences = True))
model.add(LSTM(32, activation='relu', dropout = .8, return_sequences = True))
model.add(LSTM(16, activation='relu', dropout = .8))
#The model is compiled with binary_crossentropy, which by default expects a
#probability in [0, 1], and predictions are later thresholded at 0.5. A sigmoid
#on the output unit provides that probability; a linear unit would be unbounded.
model.add(Dense(1, activation='sigmoid'))

In [0]:
#model.load_weights('/content/drive/sp500AlgoTrading_Classification/weights/01-0.6953.hd5')

In [0]:
#Write a checkpoint only when the validation loss reaches a new minimum;
#the file name records the epoch number and that loss
checkpoint_path = '/content/drive/sp500AlgoTrading_Classification/weights/{epoch:02d}-{val_loss:.4f}.hd5'
ckpt_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [0]:
#Print the layer-by-layer summary of the compiled model
model.summary()

## Model - Training

In [0]:
#Train for 500 epochs, evaluating on the validation set and running the checkpoint callback
model.fit(
    X_train,
    y_train,
    epochs=500,
    validation_data=(X_val, y_val),
    callbacks=[ckpt_callback],
)

## Model - Testing
The testing data consist of intraday, minute-by-minute data from 4/27/2020 to 4/30/2020.

In [0]:
#Load the held-out test data
test_csv = 'SPYTestData.csv'
dftest = pd.read_csv(test_csv)

In [0]:
#Reordering
#Order by index and flip the rows, then remove the row that ends up first
dftest = dftest.sort_index(ascending=True, axis=0).iloc[::-1]
first_label = dftest.index[0]
dftest = dftest.drop(first_label)
dftest.head()

In [0]:
#Creating Label
#1 where the change is strictly positive, 0 everywhere else
dftest['Change'] = np.where(dftest['Change'] > 0, 1, 0)
#Move every value up one row so a row carries the value of the row after it
dftest['Change'] = dftest['Change'].shift(periods=-1)
#The last row is left without a value after the shift, so it is removed
final_row = dftest.tail(1).index
dftest = dftest.drop(final_row)
dftest['Target'] = dftest['Change']
kept_columns = ['Time', 'Open', 'High', 'Low', 'Last', 'Volume', 'Target']
dftest = dftest[kept_columns]
dftest.head()

In [0]:
#Feature Engineering
#Window arguments are passed positionally below: their keyword names differ
#between `ta` releases (`n`/`ndev` in older ones, `window`/`window_dev` in newer
#ones) and the install earlier in the notebook is unpinned.

#Initialize Bollinger Bands (20-period window, 2 deviations)
indicator_bb = ta.volatility.BollingerBands(dftest["Last"], 20, 2)

#Add Bollinger Bands features
dftest['bbMAV'] = indicator_bb.bollinger_mavg()
dftest['bbHiBand'] = indicator_bb.bollinger_hband()
dftest['bbLoBand'] = indicator_bb.bollinger_lband()

#Initialize Relative Strength Index (14-period window)
indicator_rsi = ta.momentum.RSIIndicator(dftest["Last"], 14, fillna = False)

#Add Relative Strength Index feature
dftest['RSI'] = indicator_rsi.rsi()

#Initialize On Balance Volume
indicator_obv = ta.volume.OnBalanceVolumeIndicator(close = dftest['Last'], volume = dftest['Volume'], fillna = False)

#Add On Balance Volume feature
dftest['OBV'] = indicator_obv.on_balance_volume()

dftest.head()

In [0]:
#Data Processing
#Parse the "Time" strings once, then copy each calendar/clock component into its own column
dftest['Date'] = pd.to_datetime(dftest['Time'], yearfirst = True, format = None)
for column, component in (('Year', 'year'), ('Month', 'month'), ('Day', 'day'), ('Hour', 'hour'), ('Minute', 'minute')):
    dftest[column] = getattr(dftest['Date'].dt, component)
#Keep the same columns, in the same order, as the training frame
model_columns = [
    'Year', 'Month', 'Day', 'Hour', 'Minute',
    'Open', 'High', 'Low', 'Last', 'Volume',
    'bbMAV', 'bbHiBand', 'bbLoBand', 'RSI', 'OBV',
    'Target',
]
dftest = dftest[model_columns]
dftest.head()

In [0]:
#Scale values to between 0 and 1 for easier handling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#The indicators were built with fillna off, so their warm-up rows can hold NaN.
#MinMaxScaler leaves NaN in place when transforming, so those rows are dropped
#here to keep NaN out of the windows passed to the model.
dftest = dftest.dropna()
#NOTE(review): a fresh scaler is fit on the test rows, so the test features are
#mapped with different min/max values than the training features were (columns
#that are constant within the test period collapse to 0). Reusing the scaler
#fitted on the training data would keep the two consistent.
scaledFeatures =  MinMaxScaler().fit_transform(dftest)
#Rebuild the frame with the same column names and a fresh 0..n-1 index
dftest = pd.DataFrame(scaledFeatures, columns = dftest.columns)
dataset = dftest.values
print(dataset)
print(len(dataset))

In [0]:
#Choose a number of time steps
n_steps = 3
#Split into samples
X, y = split_sequences(dataset, n_steps)

print(X)
print(y)

In [0]:
#Reshape from [samples, timesteps] into [samples, batch, timesteps, features]
n_features = 15
batch = 1
n_steps = 3
X = X.reshape((X.shape[0], batch, n_steps, n_features))
print(X.shape)
print(len(X))
print(len(y))

In [0]:
#Predict a 0/1 class for every test window.
#`predict_classes` only exists on older Keras Sequential models; for a single
#output unit it is a 0.5 threshold on `predict`, which is spelled out here so the
#cell runs on any Keras version and still yields (samples, 1) int32 labels.
ytest = (model.predict(X, verbose=1) > 0.5).astype('int32')
print(ytest)

In [0]:
#Precision: tp / (tp + fp) - share of predicted 1s that are truly 1
precision = precision_score(y_true=y, y_pred=ytest)
print('Precision: %f' % (precision,))
#Recall: tp / (tp + fn) - share of true 1s that were predicted as 1
recall = recall_score(y_true=y, y_pred=ytest)
print('Recall: %f' % (recall,))