<a href="https://colab.research.google.com/github/aaronlwan/deep-learning-stock-lows/blob/main/Deep_Learning_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import Modules
!pip install yfinance
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import math
from tensorflow.python.framework import ops
import matplotlib.pyplot as plt

# Data Processing

### Normalize Data

In [None]:
# Normalize Data Within Window (min-max)
def normalize_in_window(data):
  """Min-max scale each window (row) using that row's own ranges.

  Every fifth entry of a row (1-based positions 5, 10, ...) is treated as a
  volume; every other entry is treated as a price.  Prices are scaled with the
  row's price min/max and volumes with the row's volume min/max.

  Returns:
    A NumPy array whose rows are the scaled values followed by two extra
    entries: the row's maximum price and the row's minimum price.
  """
  scaled_rows = []
  for window in data:
    slots = list(enumerate(window, start=1))
    price_vals = [value for pos, value in slots if pos % 5 != 0]
    vol_vals = [value for pos, value in slots if pos % 5 == 0]
    hi_price, lo_price = max(price_vals), min(price_vals)
    hi_vol, lo_vol = max(vol_vals), min(vol_vals)
    scaled = []
    for pos, value in slots:
      if pos % 5 == 0:
        scaled.append((value - lo_vol)/(hi_vol - lo_vol))
      else:
        scaled.append((value - lo_price)/(hi_price - lo_price))
    # The row's price extremes are stored after the scaled values.
    scaled += [hi_price, lo_price]
    scaled_rows.append(scaled)
  return np.array(scaled_rows)

# Normalize Over the Training Timeframe (min-max)
def normalize_all(data, train_window):
  """Min-max scale every row using ranges taken from the first rows only.

  The price and volume ranges are collected from rows 0..train_window-1;
  those ranges are then applied to every row of `data`.  Every fifth entry of
  a row (1-based positions 5, 10, ...) is treated as a volume, all other
  entries as prices.  The price range is printed.

  Args:
    data: Sequence of rows (2-D array or list of lists).
    train_window: Number of leading rows used to compute the ranges.

  Returns:
    NumPy array with the scaled rows.
  """
  ref_prices = []
  ref_volumes = []
  for row_idx in range(train_window):
    for pos, value in enumerate(data[row_idx], start=1):
      if pos % 5 == 0:
        ref_volumes.append(value)
      else:
        ref_prices.append(value)
  hi_price, lo_price = max(ref_prices), min(ref_prices)
  lo_vol, hi_vol = min(ref_volumes), max(ref_volumes)
  print('Max Price:', hi_price)
  print('Min Price:', lo_price)
  price_span = hi_price - lo_price
  vol_span = hi_vol - lo_vol
  scaled_rows = [
    [
      (value - lo_vol)/vol_span if pos % 5 == 0 else (value - lo_price)/price_span
      for pos, value in enumerate(row, start=1)
    ]
    for row in data
  ]
  return np.array(scaled_rows)

# Standardize over the whole Training Timeframe (0 mean, 1std)
def standardize_all(data):
  """Standardize prices and volumes to zero mean and unit standard deviation.

  Every fifth entry of a row (1-based positions 5, 10, ...) is treated as a
  volume, all other entries as prices.  Prices and volumes are standardized
  separately, each with the mean and standard deviation computed over every
  row of `data`.  The price mean and standard deviation are printed.

  Args:
    data: Sequence of rows (2-D array or list of lists).

  Returns:
    NumPy array with the standardized rows.
  """
  prices = []
  volumes = []
  for row in data:
    for pos, value in enumerate(row, start=1):
      if pos % 5 == 0:
        volumes.append(value)
      else:
        prices.append(value)
  prices = np.array(prices)
  volumes = np.array(volumes)
  meanprice = np.average(prices)
  stdprice = np.std(prices)
  # The volume mean must be taken over the volumes themselves; dividing the
  # volume sum by the (larger) number of prices understates it.
  meanvol = np.average(volumes)
  stdvol = np.std(volumes)
  print('Mean Price:', meanprice)
  print('STD Price:', stdprice)
  new_data = []
  for row in data:
    new_row = []
    for pos, value in enumerate(row, start=1):
      if pos % 5 == 0:
        new_row.append((value - meanvol)/stdvol)
      else:
        new_row.append((value - meanprice)/stdprice)
    new_data.append(new_row)
  return np.array(new_data)

def log_normalization(data):
  """Return the element-wise natural logarithm of `data` (computed with np.log)."""
  logged = np.log(data)
  return logged



# Normalize Over the Training Timeframe w/ vwap (min-max)
def normalize_all_vwap(data, train_window):
  """Min-max scale rows that carry a vwap entry after each volume.

  Rows are read in repeating groups of six entries: four prices, one volume,
  one vwap.  The min/max of each kind is collected from rows
  0..train_window-1 and then applied to every row of `data`.  The price range
  is printed.

  Args:
    data: Sequence of rows (2-D array or list of lists).
    train_window: Number of leading rows used to compute the ranges.

  Returns:
    NumPy array with the scaled rows.
  """
  volume_slot = 4
  vwap_slot = 5
  ref_prices, ref_volumes, ref_vwaps = [], [], []
  for row_idx in range(train_window):
    for pos, value in enumerate(data[row_idx]):
      slot = pos % 6
      if slot == volume_slot:
        ref_volumes.append(value)
      elif slot == vwap_slot:
        ref_vwaps.append(value)
      else:
        ref_prices.append(value)
  hi_price, lo_price = max(ref_prices), min(ref_prices)
  lo_vol, hi_vol = min(ref_volumes), max(ref_volumes)
  hi_vwap, lo_vwap = max(ref_vwaps), min(ref_vwaps)
  print('Max Price:', hi_price)
  print('Min Price:', lo_price)

  def rescale(pos, value):
    # Pick the range that matches the entry's slot within its group of six.
    slot = pos % 6
    if slot == volume_slot:
      return (value - lo_vol)/(hi_vol - lo_vol)
    if slot == vwap_slot:
      return (value - lo_vwap)/(hi_vwap - lo_vwap)
    return (value - lo_price)/(hi_price - lo_price)

  scaled_rows = [[rescale(pos, value) for pos, value in enumerate(row)] for row in data]
  return np.array(scaled_rows)


### Mine other features

In [None]:
!pip install ta
from ta.volume import VolumeWeightedAveragePrice
# vwap: volume weighted price average, used by many institutions to determine when to enter a position
def vwap_hour_to_hour(data):
  """Wrap `data` in a DataFrame and add a 'vwap' column.

  `data` is given the column names Timestamp, Open, High, Low, Close, Volume.
  The vwap column comes from ta's VolumeWeightedAveragePrice with window=3 and
  fillna=True.  Returns the resulting DataFrame.
  """
  # The TA library works on pandas Series, so convert the array first.
  column_names = ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']
  frame = pd.DataFrame(data, columns=column_names)
  indicator = VolumeWeightedAveragePrice(
    high=frame['High'],
    low=frame['Low'],
    close=frame["Close"],
    volume=frame['Volume'],
    window=3,
    fillna=True,
  )
  frame['vwap'] = indicator.volume_weighted_average_price()
  return frame

### yfinance Data Processing

In [None]:
# Get Data From Yfinance
def getData(ticker, input_interval, input_candles, output_interval):
  """Download candles with yfinance and build (input window, target low) rows.

  For every output candle whose date also appears in the input series, one row
  is produced: the `input_candles` input candles that precede that date (most
  recent first), each contributing Open, High, Low, Close, Volume, followed by
  the output candle's Low.

  Output candles are skipped when their date is missing from the input series
  or when fewer than `input_candles` input candles precede it.

  Args:
    ticker: Symbol passed to yf.download.
    input_interval: yfinance interval string for the input candles.
    input_candles: Number of input candles per row.
    output_interval: yfinance interval string for the output candles.

  Returns:
    NumPy array with one row per matched output candle.
  """
  # Load Input Data
  # Input Data Columns: Date, Open, High, Low, Close, Volume
  # NOTE(review): positional column access assumes the downloaded frame is
  # ordered Open, High, Low, Close, <one extra column>, Volume - confirm for
  # the installed yfinance version.
  raw_input = yf.download(ticker, interval=input_interval)
  candles = np.array(pd.DataFrame({'Date': raw_input.index, 'Open': raw_input.values[:, 0], 'High': raw_input.values[:, 1],
                         'Low': raw_input.values[:, 2], 'Close': raw_input.values[:, 3],
                         'Volume': raw_input.values[:, 5]}))
  # Output Data Columns: Date, Low
  # Load Output Data
  raw_output = yf.download(ticker, interval=output_interval)
  targets = np.array(pd.DataFrame({'Date': raw_output.index, 'Low': raw_output.values[:, 2]}))

  # Map each input date to its first position so every lookup is O(1).
  first_position = {}
  for position, date in enumerate(candles[:, 0]):
    first_position.setdefault(date, position)

  # Match Input w/ Output
  # Matched Data Columns: Repeat (Open, High, Low, Close, Volume) for each input candle, Output Low
  data = []
  for outdate, out_low in targets:
    input_index = first_position.get(outdate)
    if input_index is None:
      continue
    # Without enough history the negative indices would wrap around and pull
    # in the newest candles, so such rows are skipped.
    if input_index < input_candles:
      continue
    data_row = []
    # Collect the input_candles preceding the outdate
    for i in range(input_candles):
      data_row.extend(candles[input_index - 1 - i][1:])
    data_row.append(out_low)
    data.append(data_row)
  return np.array(data)

In [None]:
# Get Data
# Build the SPY dataset: 15 daily input candles per row, target taken from daily candles.
data = getData("SPY", "1d", 15, "1d")
print(data.shape)
# Min-max scale every row; the ranges come from the first 7000 rows only.
data = normalize_all(data, 7000)

### Intraday Data Processing

In [None]:
# Intraday Process (5min to Predict 1hr)
# Both CSVs are read without a header row; column 0 is used as the timestamp below.
input_data = np.array(pd.read_csv('/content/drive/MyDrive/SPY_qjrt28/5min.csv', header=None))
output_data = np.array(pd.read_csv('/content/drive/MyDrive/SPY_qjrt28/1hr.csv', header=None))
# data: accepted rows; deleted: output indices whose window was rejected.
data = []
deleted = []
# i: first row of the current input window.
i = 0
# Drop the first output row.
output_data = output_data[1:]
# index: current row of output_data.
index = 0
while True:
  try:
    # Select 24 candles of data
    selection = input_data[i: i+24][:, 1:]
    # Element [23] raises IndexError once fewer than 24 rows remain, which ends the loop below.
    last_timestamp = input_data[i: i+24][:, 0][23]
    # Check last candle, if it doesn't match up we just omit from dataset b/c we have a lot of datapoints
    # Characters 14:16 of the timestamp string are compared with 55 - presumably the minutes field; confirm against the CSV format.
    if int(last_timestamp[14:16]) == 55:
      # Row layout: the 24 candles flattened (timestamp column dropped), then column 1 of the matching output row.
      selection = list(selection.flatten())
      selection.append(output_data[index][1])
      data.append(selection)
      i += 12
    else:
      deleted.append(index)
      # Advance by fewer than 12 rows, derived from the parsed value, to realign the window.
      difference = int(last_timestamp[14:16]) + 60 - 55
      i += int(12 - difference/5)
    index += 1
  except:
    # Any exception stops the loop; this is how it terminates at the end of the data.
    # NOTE(review): the bare except also hides unrelated errors (e.g. a malformed timestamp).
    break

data = np.array(data)
print(data.shape)
print(deleted)
# Min-max scale every row; the ranges come from the first 33000 rows only.
data = normalize_all(data, 33000)

In [None]:
# 1hr to predict 1d
from pandas import Timestamp
# Hourly input candles come from a header-less CSV; daily targets are downloaded with yfinance.
input_data = np.array(pd.read_csv('/content/drive/MyDrive/SPY_qjrt28/1hrinput.csv', header=None))
output = yf.download('SPY', interval='1d')
# NOTE(review): column 2 of the downloaded frame is taken as the Low - confirm for the installed yfinance version.
output_data = np.array(pd.DataFrame({'Date': output.index, 'Low': output.values[:, 2]}))
outdates = list(output_data[:, 0])
# Start the targets at 2005-01-05; raises ValueError if that date is not in the download.
start = outdates.index(Timestamp('2005-01-05 00:00:00'))
output_data = output_data[start:]
# data: accepted rows; deleted: output indices whose window was rejected.
data = []
deleted = []
# i: first row of the current input window; index: current row of output_data.
i = 0
index = 0
while True:
  try:
    # Select 24 candles of data
    selection = input_data[i: i+24][:, 1:]
    # Element [23] raises IndexError once fewer than 24 rows remain, which ends the loop below.
    last_timestamp = input_data[i: i+24][:, 0][23]
    # Check last candle, if it doesn't match up we just omit from dataset b/c we have a lot of datapoints
    # Characters 11:13 of the timestamp string are compared with 19 - presumably the hour field; confirm against the CSV format.
    if int(last_timestamp[11:13]) == 19:
      # Row layout: the 24 candles flattened (timestamp column dropped), then the daily Low.
      selection = list(selection.flatten())
      selection.append(output_data[index][1])
      data.append(selection)
      i += 12
    else:
      deleted.append(index)
      # Advance by an amount derived from the parsed value to realign the window.
      difference = int(last_timestamp[11:13]) - 7
      i += int(12 - difference)
    index += 1
  except:
    # Any exception stops the loop; this is how it terminates at the end of the data.
    # NOTE(review): the bare except also hides unrelated errors (e.g. a malformed timestamp).
    break
data = np.array(data)
print(data.shape)
# Element-wise natural log of the whole matrix (inputs and target alike).
data = log_normalization(data)

In [None]:
#1hr to predict 1hr
# One header-less CSV supplies both the inputs and the target.
input_data = np.array(pd.read_csv('/content/drive/MyDrive/SPY_qjrt28/1hrinputandoutput.csv', header=None))
#input_data = np.array(vwap_hour_to_hour(input_data))
# Target is column 3 - presumably the Low, given the Timestamp/Open/High/Low/Close/Volume layout used by vwap_hour_to_hour; confirm.
output_data = input_data[:, 3]
data = []
# NOTE(review): `index` is not used in this cell.
index = 0
# Skip the first 30 targets so each target has 30 earlier candles available.
output_data = output_data[30:]
for i in range(len(output_data)):
  # Window of 30 consecutive candles (timestamp column dropped), flattened, followed by
  # column 3 of the row right after the window.
  row = input_data[i: i+30, 1:]
  row = row.flatten()
  row = list(row)
  row.append(output_data[i])
  data.append(row)
data = np.array(data)
print(data.shape)
# Min-max scale every row; the ranges come from the first 66000 rows only.
data = normalize_all(data, 66000)

### Split Data

In [None]:
# Split Train/Test based on time frame (simulates predicting the future using past data)
# Columns 0-74 are the inputs and column 75 is the target.
# NOTE(review): 75 and 7000 are hard-coded; of the dataset cells above only the daily one (15 candles x 5 values) yields 76 columns - confirm before using another dataset.
x = data[:, [i for i in range(75)]]
y = np.transpose([data[:, 75]])
# First 7000 rows train, the rest test; everything is transposed so that each sample is a column.
X_train, X_test, Y_train, Y_test = np.transpose(x[:7000]), np.transpose(x[7000:]), np.transpose(y[:7000]), np.transpose(y[7000:])

# Single Layer RNN

In [None]:
def format_rnn(X, d):
  """Turn a (features, samples) matrix into one (5, bars) array per sample.

  Each column of X is one window.  Consecutive groups of five values in a
  column form one bar, and every bar becomes one column of that sample's
  matrix.  Trailing values that do not fill a complete group of five are
  dropped.

  Args:
    X: 2-D NumPy array with one window per column.
    d: Unused; the number of bars is derived from the column length.  Kept so
      existing calls keep working.

  Returns:
    float64 NumPy array of shape (samples, 5, bars).
  """
  n_bars = len(X) // 5
  samples = []
  for col in X.T:
    # One reshape per window replaces growing the matrix bar by bar with
    # np.concatenate, which copied the whole sample on every step.
    bars = np.asarray(col[:n_bars * 5], dtype=np.float64).reshape(n_bars, 5)
    samples.append(bars.T)
  return np.array(samples)

In [None]:
# Reshape the (features, samples) matrices into one (5, bars) array per sample.
# NOTE(review): this overwrites X_train/X_test, so it assumes they still hold the split cell's output.
X_train = format_rnn(X_train, 15)
X_test = format_rnn(X_test, 15)
# Targets become 1-D vectors.
Y_train = Y_train.flatten()
Y_test = Y_test.flatten()

In [None]:
from keras.layers.recurrent import SimpleRNN
from keras.layers import Reshape
from keras.initializers import glorot_normal

def single_layer_rnn(input_shape = (5, 15)):
  """Build an uncompiled Keras model made of one SimpleRNN layer.

  The layer has a single unit with ReLU activation and a Glorot-normal kernel
  initializer; its output is the model output.  Eager execution is disabled
  as a side effect.

  Args:
    input_shape: Shape of one sample, passed to keras Input.

  Returns:
    The keras Model.
  """
  # Imported locally: elsewhere in this notebook Input and Model are only
  # imported in the CNN section, which comes after this cell.
  from keras.layers import Input
  from keras.models import Model
  tf.compat.v1.disable_eager_execution()
  # NOTE(review): Keras recurrent layers read input as (timesteps, features),
  # so the default shape gives 5 steps of 15 features - confirm that this is
  # the intended time axis.
  X_input = Input(input_shape)
  X = SimpleRNN(1, activation='relu', kernel_initializer=glorot_normal())(X_input)
  model = Model(inputs = X_input, outputs = X, name='single layer rnn')
  return model

# Build the single-layer RNN, compile it with Adam and mean-absolute-error loss (no extra metrics), and print its layer summary.
model = single_layer_rnn()
model.compile(optimizer='adam', loss='mean_absolute_error')
model.summary()

In [None]:
# Train for 200 epochs in batches of 32; the test split is used as validation data.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs = 200, batch_size = 32)

# Two Layer RNN

In [None]:
from keras.layers.recurrent import SimpleRNN
from keras.layers import Reshape
from keras.initializers import glorot_normal

def two_layer_rnn(input_shape = (5, 15)):
  """Build an uncompiled Keras model made of two stacked SimpleRNN layers.

  The first layer has 4 ReLU units and returns the full sequence; the second
  has a single ReLU unit whose output is the model output.  Both use a
  Glorot-normal kernel initializer.  Eager execution is disabled as a side
  effect.

  Args:
    input_shape: Shape of one sample, passed to keras Input.

  Returns:
    The keras Model.
  """
  # Imported locally: elsewhere in this notebook Input and Model are only
  # imported in the CNN section, which comes after this cell.
  from keras.layers import Input
  from keras.models import Model
  tf.compat.v1.disable_eager_execution()
  # NOTE(review): Keras recurrent layers read input as (timesteps, features),
  # so the default shape gives 5 steps of 15 features - confirm that this is
  # the intended time axis.
  X_input = Input(input_shape)
  X = SimpleRNN(4, activation='relu', return_sequences=True, kernel_initializer=glorot_normal())(X_input)
  X = SimpleRNN(1, activation='relu', kernel_initializer=glorot_normal())(X)
  # The name describes this model rather than repeating the single-layer one.
  model = Model(inputs = X_input, outputs = X, name='two layer rnn')
  return model

# Build the two-layer RNN, compile it with Adam and mean-absolute-error loss (no extra metrics), and print its layer summary.
model = two_layer_rnn()
model.compile(optimizer='adam', loss='mean_absolute_error')
model.summary()

In [None]:
# Train for 200 epochs in batches of 32; the test split is used as validation data.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs = 200, batch_size = 32)

# CNN

In [None]:
from keras import layers
from keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, Conv1D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D, RNN
from keras.callbacks import EarlyStopping
from keras.models import Model, load_model
from keras.initializers import glorot_uniform, glorot_normal

import keras.backend as K
# Conv layers expect the channel axis last, i.e. (rows, cols, channels).
K.set_image_data_format('channels_last')
# Sets the global Keras learning-phase flag to 1 (training mode per the Keras backend docs).
# NOTE(review): the flag stays set for later evaluate/predict calls as well - confirm this is intended.
K.set_learning_phase(1)

In [None]:
# Format X data to fit in CNN
# Format: m sets of 5xdx1 matricies, where d is the number of candles in the window
def format(X, d):
  """Turn a (features, samples) matrix into one (5, bars, 1) array per sample.

  Each column of X is one window.  Consecutive groups of five values in a
  column form one bar, and every bar becomes one column of that sample's
  matrix.  Trailing values that do not fill a complete group of five are
  dropped.  A trailing axis of length 1 is added to each sample.

  Note that this function's name shadows the builtin `format`.

  Args:
    X: 2-D NumPy array with one window per column.
    d: Unused; the number of bars is derived from the column length.  Kept so
      existing calls keep working.

  Returns:
    float64 NumPy array of shape (samples, 5, bars, 1).
  """
  n_bars = len(X) // 5
  samples = []
  for col in X.T:
    # One reshape per window replaces growing the matrix bar by bar with
    # np.concatenate, which copied the whole sample on every step.
    bars = np.asarray(col[:n_bars * 5], dtype=np.float64).reshape(n_bars, 5)
    samples.append(np.expand_dims(bars.T, 2))
  return np.array(samples)

In [None]:
# Reshape the training inputs into (samples, 5, bars, 1) arrays for the CNN.
# NOTE(review): format() expects the 2-D (features, samples) matrix from the split cell; it will not work on inputs
# already reshaped by format_rnn/format, so re-run the split cell first.
X_train = format(X_train, 10)
print(X_train.shape)
# Drop length-1 axes from the targets.
Y_train = np.squeeze(Y_train)

In [None]:
from keras.layers.convolutional import Conv3D
from keras.layers import ConvLSTM2D
from keras.layers import Reshape

def basic_cnn(input_shape = (5, 10, 1)):
  """Build an uncompiled Keras CNN regressor.

  Architecture: two stacked 3x3 'valid' convolutions with 5 filters each,
  Flatten, Dense layers of 128, 64, 32 and 16 ReLU units, and a final
  single-unit ReLU Dense layer.  All kernels use Glorot-normal
  initialization.  Eager execution is disabled as a side effect.

  Args:
    input_shape: Shape of one sample, passed to keras Input.

  Returns:
    The keras Model, named 'BasicCNN'.
  """
  tf.compat.v1.disable_eager_execution()
  X_input = Input(input_shape)
  X = Conv2D(filters=5, kernel_size=(3, 3), padding='valid', kernel_initializer=glorot_normal())(X_input)
  # The second convolution consumes the first one's output; feeding it
  # X_input again would leave the first layer disconnected from the model.
  X = Conv2D(filters=5, kernel_size=(3, 3), padding='valid', kernel_initializer=glorot_normal())(X)
  X = Flatten()(X)
  # Fully connected head, halving the width at each layer.
  for units in (128, 64, 32, 16):
    X = Dense(units, activation='relu', kernel_initializer=glorot_normal())(X)
  X = Dense(1, activation='relu', kernel_initializer=glorot_normal())(X)
  model = Model(inputs = X_input, outputs = X, name='BasicCNN')
  return model

In [None]:
# Reshape the test inputs into (samples, 5, bars, 1) arrays and drop length-1 axes from the targets.
# NOTE(review): format() expects the 2-D (features, samples) matrix from the split cell.
X_test = format(X_test, 10)
Y_test = np.squeeze(Y_test)

In [None]:
# Build the CNN, compile it with Adam and mean-absolute-error loss (no extra metrics), and print its layer summary.
model = basic_cnn()
#es = EarlyStopping(monitor='val_loss', mode='min', patience=50)
model.compile(optimizer='adam', loss='mean_absolute_error')
model.summary()

In [None]:
# Train for 200 epochs in batches of 32; the test split is used as validation data.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs = 200, batch_size = 32)

In [None]:
preds = model.evaluate(X_test, Y_test)
predictions = model.predict(X_test)
# The model is compiled with a loss and no metrics, so evaluate() returns the
# loss alone; np.ravel copes with both a bare scalar and a list of values.
print('Average Error', np.ravel(preds)[0])

# Convert prediction back into actual prediction
# Scaling constants for undoing the normalization.
# NOTE(review): these are hard-coded; they must match the values printed by
# normalize_all / standardize_all for the dataset that was actually used.
maxprice = 450.094
minprice = 52.0695
meanprice = 179.1915384746307
stdprice = 101.32692381699914
mre = 0
pred_range = len(predictions)
for i in range(pred_range):
  # Undo the min-max scaling.  For standardized data use
  # value*stdprice + meanprice instead.
  actual_predict = predictions[i]*(maxprice - minprice) + minprice
  true_price = Y_test[i]*(maxprice - minprice) + minprice
  # Accumulate the relative error on real prices.
  mre += abs(actual_predict - true_price)/(true_price)
print('MRE on Actual Prices:', mre[0]/pred_range)

# CNN-RNN

In [None]:
from keras.layers import Input, Dense, LSTM, Flatten, TimeDistributed, Conv2D, Reshape, SimpleRNN, GRU
from keras import Sequential
from keras.initializers import glorot_normal, glorot_uniform, HeNormal, HeUniform
tf.compat.v1.disable_eager_execution()
# CNN-RNN: two small 'valid' convolutions, reshaped into a (6, 6) sequence for a SimpleRNN, then one output unit.
model = Sequential()
model.add(Conv2D(filters=3, kernel_size=(2, 3), strides=1, padding='valid', kernel_initializer=glorot_normal(), input_shape=(5, 10, 1)))
model.add(Conv2D(filters=2, kernel_size=(2, 3), strides=1, padding='valid', kernel_initializer=glorot_normal()))
# With the (5, 10, 1) input the convolutions yield 3*6*2 = 36 values, which is what Reshape((6, 6)) needs;
# a different input shape would require a different target shape.
model.add(Reshape((6, 6)))
model.add(SimpleRNN(20, activation='relu', kernel_initializer=glorot_normal()))
model.add(Dense(1, activation='relu', kernel_initializer=glorot_normal()))
# Adam optimizer with mean-absolute-error loss; no extra metrics.
model.compile(optimizer='adam', loss='mean_absolute_error')
model.summary()

In [None]:
# Reshape inputs into (samples, 5, bars, 1) arrays and drop length-1 axes from the targets.
# NOTE(review): format() expects the 2-D (features, samples) matrices from the split cell; if X_train/X_test were
# already reshaped by an earlier cell, re-run the split cell before this one.
X_train = format(X_train, 10)
Y_train = np.squeeze(Y_train)
X_test = format(X_test, 10)
Y_test = np.squeeze(Y_test)

In [None]:
# Train for 200 epochs in batches of 32; the test split is used as validation data.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs = 200, batch_size = 32)

In [None]:
# Loss on the test split, plus the raw (still normalized) predictions.
preds = model.evaluate(X_test, Y_test)
predictions = model.predict(X_test)
print('Mean Error', preds)

# Convert prediction back into actual prediction
# mre accumulates the relative error on real prices; overpredict counts predictions above the true value.
mre = 0
overpredict = 0
for i in range(len(predictions)):
  # Hard-coded scaling constants.
  # NOTE(review): they must match the values printed by the normalization function that was actually used;
  # meanprice/stdprice are only needed by the commented-out standardization variant below.
  maxprice = 450.094
  minprice = 52.0695
  meanprice = 179.1915384746307
  stdprice = 101.09464575990093
  # Undo the min-max scaling.
  actual_predict = predictions[i]*(maxprice - minprice) + minprice
  true_price = Y_test[i]*(maxprice - minprice) + minprice
  # Variant for log-normalized data:
  #actual_predict = np.exp(predictions[i])
  #true_price = np.exp(Y_test[i])
  
  # Variant for standardized data:
  #actual_predict = predictions[i]*stdprice + meanprice
  #true_price = Y_test[i]*stdprice + meanprice
  if actual_predict > true_price:
    overpredict += 1
  mre += abs(actual_predict - true_price)/(true_price)
  #print(actual_predict, true_price)
# mre is indexed with [0] because each term is computed from predictions[i], which is indexed again here.
print('MRE on Actual Prices:', mre[0]/len(predictions))
print('Overpredict Rate', overpredict/len(predictions))

In [None]:
# Simulation: buy a share at projected low if price reaches the projected low, then sell at close
output_data = np.array(pd.read_csv('/content/drive/MyDrive/SPY_qjrt28/1hrinputandoutput.csv', header=None))
removed = 0
# Skip the leading rows so output_data[i] lines up with predictions[i].
# NOTE(review): the 1hr dataset cell builds 30-candle windows and drops the
# first 30 targets; confirm that an offset of 15 is right for the dataset the
# model was trained on.
output_data = output_data[15:]
# Keep only the rows after the 66000-row training span.
output_data = output_data[66000:]
#print(len(output_data), len(predictions))
print('For the test interval, the price of SPY changed from', output_data[0][4], 'to', output_data[len(output_data)-1][4])

# Min-max constants used to map predictions back to prices.
# NOTE(review): hard-coded; they must match the values printed by normalize_all.
maxprice = 450.094
minprice = 52.0695
acc_value = 1000
winners = 0
buys = 0

# One float64 price per prediction, so the account value accumulates in
# double precision and stays a plain number even if no trade is made.
predicted_lows = np.ravel(predictions).astype(np.float64)*(maxprice - minprice) + minprice

for i in range(len(predicted_lows)):
  actual_predict = predicted_lows[i]
  # Column 3 is used as the candle's low and column 4 as its close.
  true_low = output_data[i][3]
  true_close = output_data[i][4]
  # 0.9942 is a "correction" constant to account for the model's error
  entry_price = actual_predict*0.9942
  # The order fills only if the candle traded below the entry price.
  if true_low < entry_price:
    acc_value = acc_value - entry_price + true_close
    if entry_price <= true_close:
      winners += 1
    buys += 1
print('This current strategy produces', str((acc_value/1000 - 1)*100)+ "% Profit over the test timeframe")
# Avoid dividing by zero when no trade was triggered.
if buys:
  print('Winrate:', winners/buys)
else:
  print('Winrate: n/a (no trades were triggered)')
print('Total Transactions:', buys)