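Downloads, builds and installs the TA-Lib C library, then installs the Python packages used in this notebook: TA-Lib (technical indicators), wikipedia (used to fetch the list of companies in the Dow Jones Industrial Average) and yfinance (a client for the Yahoo Finance API).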

In [0]:
import urllib.request
print('Beginning file download with urllib2...')
url = 'https://netcologne.dl.sourceforge.net/project/ta-lib/ta-lib/0.4.0/ta-lib-0.4.0-src.tar.gz'
urllib.request.urlretrieve(url, '/content/ta-lib-0.4.0-src.tar.gz')
print("Download complete. Unpacking...")
!tar -xzf ta-lib-0.4.0-src.tar.gz
%cd ./ta-lib
!./configure --prefix=/usr
!make
!sudo make install
%cd /content
!pip install TA-Lib
!pip install yfinance
!pip install wikipedia

Imports the necessary libraries.

In [0]:
import tensorflow as tf
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import sklearn.metrics as metrics
import os
from os import path
import shutil
from datetime import datetime
keras = tf.keras
import yfinance as yf
import wikipedia as wp
import talib
from talib.abstract import *
%load_ext tensorboard

Scrapes the Wikipedia page of the Dow Jones Industrial Average and collects the ticker symbols of the companies that are currently in the index.

In [0]:
# Collect the ticker symbols listed in the 'Symbol' column of a table on the
# Dow Jones Industrial Average Wikipedia page.
exchange_prefix = 'NYSE:'

dow_jones_tickers = []
html = wp.page("Dow_Jones_Industrial_Average").html().encode("UTF-8")
tables = pd.read_html(html)  # parse the page once instead of once per attempt
# Try 2nd table first as most pages contain contents table first; fall back to
# the first table when the page only has one.
df = tables[1] if len(tables) > 1 else tables[0]

for symbol in df['Symbol']:
  # Remove the exchange prefix as a whole. str.lstrip('NYSE:') would instead
  # strip any leading run of the characters N, Y, S, E and ':' and could eat
  # the first letters of the ticker itself.
  if symbol.startswith(exchange_prefix):
    symbol = symbol[len(exchange_prefix):].strip()

  dow_jones_tickers.append(symbol)

print(len(dow_jones_tickers))
print(dow_jones_tickers)

This cell downloads the price data for the companies in the Dow Jones Index and saves it as csv to a path. This only needs to be run once a day after market close to update the prices.

In [0]:
# This cell is intentionally disabled (everything below is commented out).
# When enabled, it iterates over dow_jones_tickers, fetches each ticker's full
# daily history through yfinance, drops the 'Dividends' and 'Stock Splits'
# columns and writes one CSV per ticker, sleeping one second between tickers.
# Uncomment it to refresh the stored price files.
# for ticker in dow_jones_tickers:
#   curr_ticker = yf.Ticker(ticker)
#   ticker_history = curr_ticker.history(period='max', interval = '1d')
#   ticker_history.drop(columns=['Dividends', 'Stock Splits'], inplace=True)
#   ticker_history.to_csv('/content/drive/My Drive/Data Sets/Price Data/%s_price_data.csv' %ticker)
#   time.sleep(1)

# Optional check: load one of the saved files and display it.
# df_display = pd.read_csv('/content/drive/My Drive/Data Sets/Price Data/AAPL_price_data.csv')
# df_display

Generates a dataset of technical indicators for a company based on how many days in the past we want to look when training the algorithm.

In [0]:
def gen_technical_data_single(ticker, days):
  """Compute TA-Lib indicators for one ticker and save them next to its prices.

  Reads '<ticker>_price_data.csv' from the price-data folder, calls every
  function reported by talib.get_function_groups() (except MAVP) with
  timeperiod=days, and adds each result whose length equals the number of
  price rows as a new column. The combined frame is written to
  '<cwd>/Technical Sets/<ticker>/<ticker>_technical_indicators.csv'; any
  existing output directory for the ticker is deleted first.

  Args:
    ticker: Ticker symbol used to build the input and output file names.
    days: Value passed to each indicator as its `timeperiod` argument.

  Returns:
    None. The result is written to disk and progress is printed.
  """
  path_name = os.getcwd() + '/Technical Sets/%s' %ticker
  print(path_name)
  # Always start from an empty output directory for this ticker.
  if path.exists(path_name):
    shutil.rmtree(path_name)
  os.makedirs(path_name)

  generation_start = time.time()

  #Change to own path. Standard Open, High, Low, Close data generated by yfinance.
  #Dividends and stock splits columns dropped.
  ticker_history = pd.read_csv('/content/drive/My Drive/Data Sets/Price Data/%s_price_data.csv' %ticker)
  ticker_history.set_index('Date', inplace=True)

  # Every series is handed over as a plain numpy array. 'low' must come from
  # the 'Low' column (it used to be fed the 'Open' prices by mistake).
  inputs = {
      'open' : ticker_history['Open'].to_numpy(),
      'high' : ticker_history['High'].to_numpy(),
      'low'  : ticker_history['Low'].to_numpy(),
      'close': ticker_history['Close'].to_numpy(),
      'volume': ticker_history['Volume'].astype(float).to_numpy()
  }

  row_count = len(ticker_history)
  count = 0
  for indicators in talib.get_function_groups().values():
    for indicator in indicators:
      # There is a bug which does not allow the creation of the MAVP indicator.
      if indicator == 'MAVP':
        continue

      method = getattr(talib.abstract, indicator)
      output = method(inputs, timeperiod = days)

      # Only results with one value per price row are stored as a column;
      # anything with a different length is skipped.
      if len(output) == row_count:
        ticker_history[indicator] = output
        count += 1

  # Number of indicator values written: one per stored indicator per row.
  # (Previously this was accumulated inside the loop, giving a triangular sum.)
  total_count = count * row_count

  print("Indicators created for", ticker, ": ", count)
  ticker_history.to_csv(path_name +'/%s_technical_indicators.csv' %ticker)

  print("Total number of indicator values created: ", total_count)
  print("Generation of datasets in seconds took: ", (time.time()-generation_start))

Loads a dataset with technical indicators and selects the window of time we want to look at.

In [0]:
def transform_dataset(ticker):
  """Load a ticker's indicator CSV and keep only the rows inside the date window.

  The file '/content/Technical Sets/<ticker>/<ticker>_technical_indicators.csv'
  is read, rows whose 'Date' lies between 01-01-2000 and 01-01-2011 (both
  inclusive) are kept, the index is renumbered from 0 and the 'Date' column
  is removed.

  Args:
    ticker: Ticker symbol used to build the file path.

  Returns:
    A DataFrame with the selected rows and every column except 'Date'.
  """
  path_to = '/content/Technical Sets/'+ ticker + '/%s_technical_indicators.csv' %ticker
  print(path_to)

  window_start = datetime.strptime('01-01-2000', '%d-%m-%Y')
  window_end = datetime.strptime('01-01-2011', '%d-%m-%Y')

  indicators = pd.read_csv(path_to)
  indicators['Date'] = pd.to_datetime(indicators['Date'])
  dates = indicators['Date']
  inside_window = (dates >= window_start) & (dates <= window_end)

  # Renumber the surviving rows from 0 and discard the date column.
  selected = indicators.loc[inside_window].reset_index(drop=True)
  return selected.drop(columns='Date')


After we have selected our time frame we generate a number of sliding window datasets used for training and predictions.

In [0]:
def generate_sets(ds, window_size):
  """Scale every column to [0, 1] and cut the frame into sliding windows.

  Missing values are replaced by 0 on a copy of `ds`, then each column is
  min-max scaled on its own. For every row index `r >= window_size` one
  sample is produced: the `window_size` rows preceding `r` as features and
  the scaled 'Close' value at row `r` as the label.

  Args:
    ds: DataFrame containing a 'Close' column; labels are looked up by row
      label, so the index is expected to run 0..len-1.
    window_size: Number of preceding rows in each sample.

  Returns:
    (data, labels) as numpy arrays; `data` holds one window per sample and
    `labels` the matching scaled 'Close' values.
  """
  frame = ds.fillna(0)

  # Scale each column independently; the scaler is refitted per column.
  scaler = preprocessing.MinMaxScaler()
  for name in frame.columns:
    column = np.array(frame[name]).reshape((-1, 1))
    frame[name] = scaler.fit_transform(column).flatten()
  print("Preprocessing done")

  values = frame.to_numpy()
  windows = []
  targets = []
  for row in range(len(frame)):
    if row < window_size:
      # Not enough history before this row to fill a window.
      continue
    windows.append([values[i] for i in range(row - window_size, row)])
    targets.append(frame.at[row, 'Close'])

  data = np.array(windows)
  labels = np.array(targets)
  print("Data shape:", data.shape,
        "Labels shape:", labels.shape)

  return data, labels

We split the generated sliding windows datasets and labels into training, validation and testing sets. We plot and save the plots of the closing prices of all 3 sets.

In [0]:
def gen_train_val_test_single(days, ticker, save_path):
  """Build the indicator data for a ticker and split it into train/validation/test.

  Runs the indicator generation, date-window selection and sliding-window
  steps, then splits the samples in order (no shuffling): the last 10% become
  the test set and the last 15% of the remainder the validation set. For each
  split the sizes are printed and the labels are plotted, saved under
  `save_path` and shown.

  Args:
    days: Look-back length, used both as indicator time period and window size.
    ticker: Ticker symbol to process.
    save_path: Existing directory that receives train.png, validation.png and
      test.png.

  Returns:
    (features_train, validation_data, features_test, labels_train,
    labels_validation, labels_test, transformed_dataset)
  """
  gen_technical_data_single(ticker, days)
  transformed_dataset = transform_dataset(ticker)
  data, labels = generate_sets(transformed_dataset, days)

  features_train, features_test, labels_train, labels_test = train_test_split(
      data, labels, test_size = 0.10, shuffle = False, random_state = 42)
  features_train, validation_data, labels_train, labels_validation = train_test_split(
      features_train, labels_train, test_size = 0.15, shuffle = False, random_state = 42)

  # (title, features, labels, file name) for each split, reported in this order.
  splits = (
      ("Train", features_train, labels_train, '/train.png'),
      ("Validation", validation_data, labels_validation, '/validation.png'),
      ("Test", features_test, labels_test, '/test.png'),
  )
  for title, split_features, split_labels, file_name in splits:
    print(title)
    print(len(split_features))
    print(len(split_labels))
    plt.plot(split_labels)
    plt.xlabel('Day')
    plt.ylabel('Scaled Price')
    plt.savefig((save_path + file_name))
    plt.show()

  return (features_train, validation_data, features_test,
          labels_train, labels_validation, labels_test, transformed_dataset)

After we have completed training and predictions, we reverse the scaling on the predictions and the testing dataset in order to see the actual values. We also calculate and save the means of both the predictions and the testing labels.

In [0]:
def reverse_scaling(dataset, predictions, l_test, path):
  """Map scaled predictions and test labels back to price units.

  A min-max scaler is fitted on the 'Close' values of `dataset` and its
  inverse transform is applied to `predictions` and `l_test`. The means of
  both de-scaled series are printed and written to '<path>/means.txt'.

  Args:
    dataset: Mapping/DataFrame with a 'Close' entry holding unscaled prices.
    predictions: Scaled predictions; any shape, flattened to one value per row.
    l_test: Scaled test labels; any shape, flattened likewise.
    path: Existing directory that receives means.txt. (Inside this function
      the name shadows the module-level `path` imported from os.)

  Returns:
    (predictions_inversed, test_labels_inversed) as 1-D numpy arrays.
  """
  # generate_sets() replaces missing values with 0 before scaling, so the same
  # replacement is applied here to recover the same min/max.
  close = np.asarray(dataset['Close'], dtype=float)
  close = np.where(np.isnan(close), 0.0, close)

  scaler_2 = preprocessing.MinMaxScaler()
  scaler_2.fit(close.reshape((-1, 1)))

  # Both inputs are forced into a single column so 1-D arrays work as well.
  test_labels_inversed = scaler_2.inverse_transform(
      np.asarray(l_test).reshape((-1, 1))).flatten()
  predictions_inversed = scaler_2.inverse_transform(
      np.asarray(predictions).reshape((-1, 1))).flatten()

  pred_str = "Predictions mean: %f"  %predictions_inversed.mean()
  test_str = "Test labels mean: %f"  %test_labels_inversed.mean()

  filepath = '%s/means.txt' %path
  print(filepath)
  # The context manager closes the file even if a write fails.
  with open(filepath, 'w+') as f:
    f.write(str(pred_str + '\n'))
    f.write(str(test_str))
  print(str(pred_str))
  print(str(test_str))

  return predictions_inversed, test_labels_inversed

We generate the paths and make the folders where the logs of training will be stored.

In [0]:
def path_to_log(base_dir='/content/drive/My Drive/Logs LSTM'):
  """Create and return a timestamped directory for the logs of one run.

  Args:
    base_dir: Directory under which the run folder is created. Defaults to
      the Google Drive location used by this notebook.

  Returns:
    The path '<base_dir>/<dd-mm-YYYY HH:MM:SS>' of the created directory.
  """
  now = datetime.now()
  current_time = now.strftime("%d-%m-%Y %H:%M:%S")
  str_path = '%s/%s' %(base_dir, current_time)
  # exist_ok avoids a FileExistsError when called twice within the same second.
  os.makedirs(str_path, exist_ok=True)
  return str_path

def spec_path(ticker, days, gen_path):
  """Create '<gen_path>/<ticker>/<days>' and return that path.

  Values of `days` below 10 are written with a leading '0' in the folder name.

  Args:
    ticker: Ticker symbol used as the first sub-folder.
    days: Look-back length used as the second sub-folder.
    gen_path: Root directory of the current run.

  Returns:
    The full path of the directory that was created.
  """
  day_label = '0' + str(days) if days < 10 else days
  full_path = gen_path + '/%s/%s' %(ticker, day_label)
  os.makedirs(full_path)
  return full_path

After training and predictions are completed we plot the predictions and testing labels to see how well the algorithm did. We also save the plot.

In [0]:
def plot_results(predictions_descaled, l_test_descaled, save_path):
  """Plot the test prices against the predictions, save the figure and show it.

  Args:
    predictions_descaled: Sequence of predictions to plot.
    l_test_descaled: Sequence of test labels to plot.
    save_path: Existing directory that receives 'comparion_plot.png'.
  """
  fig, ax = plt.subplots(figsize=(15, 10))
  ax.plot(l_test_descaled, label = 'Price')
  ax.plot(predictions_descaled, label = 'Prediction')
  ax.set_xlabel('Day')
  ax.set_ylabel('Price')
  # Anchor the legend's upper-left corner at the axes' top-right corner.
  ax.legend(bbox_to_anchor=(1,1), loc="upper left")
  fig.savefig((save_path + '/comparion_plot.png'))
  plt.show()

After the predictions are complete, we calculate the percentage of days on which the algorithm correctly predicted the direction of the price movement.

In [0]:
def count_correct_directions(predictions_descaled, l_test_descaled, path):
  """Measure how often the predicted day-over-day direction matches the actual one.

  For every day after the first, the movement counts as "up" when the value
  is strictly greater than the previous day's value and as "not up"
  otherwise. The share of days on which prediction and test label agree is
  written, as a percentage, to '<path>/precent_correct_directions.txt'.

  Args:
    predictions_descaled: Indexable sequence of predicted values.
    l_test_descaled: Indexable sequence of actual values, at least as long as
      `predictions_descaled`.
    path: Existing directory that receives the result file.

  Returns:
    The fraction (0.0 to 1.0) of compared days with a matching direction;
    0.0 when fewer than two predictions are given.
  """
  # n values give n - 1 day-over-day movements, so that is the denominator.
  comparisons = len(predictions_descaled) - 1
  count_correct = 0
  for day in range(1, len(predictions_descaled)):
    actual_up = (l_test_descaled[day] - l_test_descaled[day - 1]) > 0
    predicted_up = (predictions_descaled[day] - predictions_descaled[day - 1]) > 0
    if bool(actual_up) == bool(predicted_up):
      count_correct += 1

  # Guard against division by zero for empty or single-value input.
  accuracy_movement = count_correct / comparisons if comparisons > 0 else 0.0

  with open(str(path + '/precent_correct_directions.txt'), 'w+') as f:
    f.write("Percentage of accurately predicted next day directions: %.2f%%"
            % (accuracy_movement * 100))

  return accuracy_movement

We set the parameters and compile the model.

In [0]:
def compile_model(f_train):
  """Build and compile the stacked-LSTM regression model.

  The Keras session is cleared and the TensorFlow seed set to 42 before the
  layers are created. The network is LSTM(149) -> Dropout(0.5) -> LSTM(75)
  -> Dropout(0.5) -> LSTM(35) -> Dense(1), compiled with MSE loss, the Adam
  optimizer and MAE/MSE metrics. The model summary is printed.

  Args:
    f_train: Training features; dimensions 1 and 2 of its shape define the
      input shape of the first layer.

  Returns:
    The compiled keras Sequential model.
  """
  keras.backend.clear_session()
  tf.random.set_seed(42)

  window_shape = (f_train.shape[1], f_train.shape[2])
  layers = keras.layers
  stack = [
      # The first two LSTM layers return the full sequence for the next LSTM.
      layers.LSTM(units=149, return_sequences=True, input_shape = window_shape),
      layers.Dropout(rate = 0.5),
      layers.LSTM(units=75, return_sequences=True),
      layers.Dropout(rate = 0.5),
      # The last LSTM layer returns only its final output.
      layers.LSTM(units=35, return_sequences=False),
      layers.Dense(units = 1),
  ]
  model = keras.models.Sequential(stack)

  model.compile(loss='mse', optimizer = 'adam', metrics=['mae', 'mse'])
  model.summary()
  return model

We loop over look-back windows of between 3 and 15 days and, for each window size, create the datasets, train a model for the company and save the data generated by the model. The loop over all companies in the Dow Jones Index is currently commented out, so only a single ticker (HD) is processed.

In [0]:
# Train one model per look-back length (3 to 15 days) for a single ticker and
# store the logs, plots and metrics of every run under a fresh log directory.
path_to_save = path_to_log()
# The loop over every Dow Jones ticker is currently disabled:
# for ticker in dow_jones_tickers:

#    if ticker == 'DOW' or ticker == 'V':
#       continue

ticker = 'HD'
for days in range(3, 15+1):

    spec_path_to_save = spec_path(ticker, days, path_to_save)
    print("LOG PATH: " + spec_path_to_save)
    f_train, f_val, f_test, l_train, l_val, l_test, dataset = gen_train_val_test_single(days, ticker, spec_path_to_save)

    model = compile_model(f_train)
    model_checkpoint = keras.callbacks.ModelCheckpoint('%s/best_model.h5' %spec_path_to_save, save_best_only=True)
    tensorboard_path = spec_path_to_save + '/tensorboard/'
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = tensorboard_path, histogram_freq=1)

    # The validation split and the checkpoint callback were created but never
    # handed to fit(). save_best_only needs a monitored validation metric, so
    # both are passed here and best_model.h5 is actually written.
    model.fit(f_train, l_train, epochs=50, batch_size = 62,
                        shuffle= False,
                        validation_data = (f_val, l_val),
                        callbacks = [model_checkpoint, tensorboard_callback],
                        verbose = 0)
    # Predictions still come from the model as it stands after the last epoch.
    # To predict with the checkpointed model instead, load it first:
    #model = keras.models.load_model('%s/best_model.h5' %spec_path_to_save)
    predictions = model.predict(f_test)

    predictions_descaled, l_test_descaled = reverse_scaling(dataset, predictions, l_test, spec_path_to_save)
    plot_results(predictions_descaled, l_test_descaled, spec_path_to_save)
    count_correct_directions(predictions_descaled, l_test_descaled, spec_path_to_save)