In [10]:
import math
import os
import random
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import quandl
import tensorflow as tf
import yfinance as yf
from pandas_datareader import data as pdr
from sklearn import preprocessing
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential

yf.pdr_override()
pd.options.mode.chained_assignment = None
%matplotlib qt

In [11]:
# Yahoo Finance symbols for the instruments used in this notebook.
ticker_sp = '^GSPC'
ticker_gold = 'GC=F'
ticker_oil = 'CL=F'
ticker_dax = '^GDAXI'
ticker_nikkei = '^N225'
ticker_ftse = '^FTSE'
ticker_shanghai = '000001.SS'

# Quandl API key. Credentials must not live in the notebook source, so the key
# is read from the QUANDL_API_KEY environment variable. When the variable is
# unset this is None.
# NOTE(review): confirm that quandl.get accepts authtoken=None (anonymous access).
auth_tok = os.environ.get("QUANDL_API_KEY")

# Date range requested from every data provider.
end_date = "2020-09-1"
start_date = "1970-01-02"

In [12]:
def getGOLDData ():
    """Fetch the Quandl "CHRIS/CME_GC1" dataset and return its "Last" column.

    The request covers `start_date`..`end_date` and authenticates with
    `auth_tok`. The returned single-column DataFrame has its column renamed
    to "GOLD Adj Close".
    """
    # Author's note: the dataset only contains prices from 1975 onwards.
    raw = quandl.get(
        "CHRIS/CME_GC1",
        trim_start=start_date,
        trim_end=end_date,
        authtoken=auth_tok,
    )
    return raw.loc[:, ["Last"]].rename(columns={"Last": "GOLD Adj Close"})

def getSPData():
    """Download the `ticker_sp` price history from Yahoo and return its adjusted close.

    The request covers `start_date`..`end_date`. The returned single-column
    DataFrame has its column named "SP500 Adj Close".

    Raises
    ------
    KeyError
        If the downloaded frame has no "Adj Close" column.
    """
    # Author's note: the series contains prices from 1970 onwards.
    prices = pdr.get_data_yahoo(ticker_sp, start=start_date, end=end_date)
    # Select by label rather than by position: a positional slice silently
    # returns a different series whenever the provider changes its column layout.
    return prices[["Adj Close"]].rename(columns={"Adj Close": "SP500 Adj Close"})

def getDAXData():
    """Download the `ticker_dax` price history from Yahoo and return its adjusted close.

    The request covers `start_date`..`end_date`. The returned single-column
    DataFrame has its column named "DAX Adj Close".

    Raises
    ------
    KeyError
        If the downloaded frame has no "Adj Close" column.
    """
    # Author's note: the series contains prices from 1988 onwards.
    prices = pdr.get_data_yahoo(ticker_dax, start=start_date, end=end_date)
    # Select by label rather than by position: a positional slice silently
    # returns a different series whenever the provider changes its column layout.
    return prices[["Adj Close"]].rename(columns={"Adj Close": "DAX Adj Close"})


def getOILData():
    """Fetch the Quandl "CHRIS/CME_CL1" dataset and return its "Last" column.

    The request covers `start_date`..`end_date` and authenticates with
    `auth_tok`. The returned single-column DataFrame has its column renamed
    to "OIL Adj Close".
    """
    # Author's note: the dataset only contains prices from 1984 onwards.
    raw = quandl.get(
        "CHRIS/CME_CL1",
        trim_start=start_date,
        trim_end=end_date,
        authtoken=auth_tok,
    )
    return raw.loc[:, ["Last"]].rename(columns={"Last": "OIL Adj Close"})


def getNIKKEIData():
    """Download the `ticker_nikkei` price history from Yahoo and return its adjusted close.

    The request covers `start_date`..`end_date`. The returned single-column
    DataFrame has its column named "NIKKEI Adj Close".

    Raises
    ------
    KeyError
        If the downloaded frame has no "Adj Close" column.
    """
    # Author's note: the series contains prices from 1970 onwards.
    prices = pdr.get_data_yahoo(ticker_nikkei, start=start_date, end=end_date)
    # Select by label rather than by position: a positional slice silently
    # returns a different series whenever the provider changes its column layout.
    return prices[["Adj Close"]].rename(columns={"Adj Close": "NIKKEI Adj Close"})


def getFTSEData():
    """Download the `ticker_ftse` price history from Yahoo and return its adjusted close.

    The request covers `start_date`..`end_date`. The returned single-column
    DataFrame has its column named "FTSE Adj Close".

    Raises
    ------
    KeyError
        If the downloaded frame has no "Adj Close" column.
    """
    # Author's note: the series contains prices from 1984 onwards.
    prices = pdr.get_data_yahoo(ticker_ftse, start=start_date, end=end_date)
    # Select by label rather than by position: a positional slice silently
    # returns a different series whenever the provider changes its column layout.
    return prices[["Adj Close"]].rename(columns={"Adj Close": "FTSE Adj Close"})

def getSHANGHAIData():
    """Download the `ticker_shanghai` price history from Yahoo and return its adjusted close.

    The request covers `start_date`..`end_date`. The returned single-column
    DataFrame has its column named "SHANGHAI Adj Close".

    Raises
    ------
    KeyError
        If the downloaded frame has no "Adj Close" column.
    """
    # Author's note: the series only contains prices from 1997.
    prices = pdr.get_data_yahoo(ticker_shanghai, start=start_date, end=end_date)
    # Select by label rather than by position: a positional slice silently
    # returns a different series whenever the provider changes its column layout.
    return prices[["Adj Close"]].rename(columns={"Adj Close": "SHANGHAI Adj Close"})

In [13]:
def combineData():
    """Fetch the S&P 500, DAX, FTSE, gold and oil series and join them column-wise.

    The five frames are concatenated along the column axis (aligned on their
    index); any row with a missing value in at least one column is discarded.
    """
    series_frames = [
        getSPData(),
        getDAXData(),
        getFTSEData(),
        getGOLDData(),
        getOILData(),
    ]
    return pd.concat(series_frames, axis=1).dropna()

In [14]:
# Prediction horizon: how many rows (trading days) ahead the label looks.
FUTURE_TO_PREDICT = 1

data = combineData()

# 'Future' holds the S&P 500 value FUTURE_TO_PREDICT rows later; the trailing
# rows that have no future value become NaN and are removed.
data['Future'] = data['SP500 Adj Close'].shift(periods=-FUTURE_TO_PREDICT)
data = data.dropna()

def buy_or_sell (current, future):
    """Return 1 when `future` is strictly greater than `current`, otherwise 0."""
    return 1 if future > current else 0
    
# Label each row: 1 if the future S&P 500 value is above the current one, else 0.
data['Target'] = [
    buy_or_sell(current, future)
    for current, future in zip(data['SP500 Adj Close'], data['Future'])
]

# Split by row order without shuffling: the first 85% of rows are used for
# training and the remaining rows for testing.
NUMBER_OF_DATA_POINTS = data.shape[0]
SIZE_TRAINING = int(0.85 * NUMBER_OF_DATA_POINTS)
SIZE_TESTING = NUMBER_OF_DATA_POINTS - SIZE_TRAINING
print("size of training data: {}".format(SIZE_TRAINING))
print("size of testing data: {}".format(SIZE_TESTING))

data_training = data.iloc[:SIZE_TRAINING]
data_testing = data.iloc[SIZE_TRAINING:]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
size of training data: 6723
size of testing data: 1187


In [15]:
# Show the plain-text representation of the training partition for a visual sanity check.
print(data_training)

            SP500 Adj Close  DAX Adj Close  FTSE Adj Close  GOLD Adj Close  \
Date                                                                         
1987-12-30       247.860001    1005.190002     1759.800049           485.5   
1988-01-04       255.940002     956.489990     1713.900024           480.5   
1988-01-05       258.630005     996.099976     1789.599976           483.2   
1988-01-06       258.890015    1006.010010     1787.099976           485.3   
1988-01-07       261.070007    1014.469971     1787.199951           483.1   
...                     ...            ...             ...             ...   
2015-08-25      1867.609985   10128.120117     6081.299805          1139.7   
2015-08-26      1940.510010    9997.429688     5979.200195          1124.5   
2015-08-27      1987.660034   10315.620117     6192.000000          1123.4   
2015-08-28      1988.869995   10298.530273     6247.899902          1132.8   
2015-09-01      1913.849976   10015.570312     6058.500000      

In [16]:
def process_data(data, sequence_length=15):
    """Turn a labelled price frame into balanced, shuffled sequence arrays.

    Steps:
      1. Drop the "Future" column so the network never sees the future price.
      2. Replace every column except "Target" by its row-over-row percent
         change and discard rows that contain NaN (at least the first row).
      3. Slide a window of `sequence_length` consecutive rows over the frame;
         each window is paired with the "Target" value of its last row.
      4. Down-sample the larger class so buys (1) and sells (0) are equally
         represented, then shuffle the samples.

    The input frame is left untouched, so the function can be called again on
    the same frame. The class counts are printed as a side effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Future" and "Target"; every other column is
        treated as a feature.
    sequence_length : int, default 15
        Number of consecutive rows (days into the past) in each sample.

    Returns
    -------
    x : numpy.ndarray
        Shape (n_samples, sequence_length, n_features); an empty 1-D array
        when no sample could be built.
    y : numpy.ndarray
        Float labels (0.0 or 1.0), aligned with `x`.

    Raises
    ------
    ValueError
        If `sequence_length` is smaller than 1.
    KeyError
        If "Future" or "Target" is missing from `data`.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    # drop() returns a new frame, so the caller's frame is never mutated.
    frame = data.drop(columns="Future")

    feature_columns = [column for column in frame.columns if column != "Target"]
    for column in feature_columns:
        frame[column] = frame[column].pct_change()  # Normalization by percent change
    frame = frame.dropna()

    # Plain arrays make window extraction a cheap slice instead of one
    # DataFrame row lookup per day and window.
    features = frame[feature_columns].to_numpy(dtype=float)
    targets = frame["Target"].to_numpy(dtype=float)

    # range() is empty when the frame has fewer rows than one window.
    sequential_data = [
        [features[start:start + sequence_length], targets[start + sequence_length - 1]]
        for start in range(len(frame) - sequence_length + 1)
    ]

    random.shuffle(sequential_data)

    # Samples whose label is neither 0 nor 1 are ignored.
    buy_sequences = [sample for sample in sequential_data if sample[1] == 1]
    sell_sequences = [sample for sample in sequential_data if sample[1] == 0]

    print("{} buys".format(len(buy_sequences)))
    print("{} sells".format(len(sell_sequences)))

    random.shuffle(buy_sequences)
    random.shuffle(sell_sequences)

    # Balance the classes by truncating the larger one.
    max_size = min(len(buy_sequences), len(sell_sequences))

    print("reduced to {} buys and sells".format(max_size))

    sequential_data = buy_sequences[:max_size] + sell_sequences[:max_size]
    random.shuffle(sequential_data)

    x = np.array([sequence for sequence, _ in sequential_data])
    y = np.array([target for _, target in sequential_data])

    return x,y

In [17]:
# Build the (x, y) sequence arrays for each partition. process_data prints the
# buy/sell counts before and after class balancing for both calls.
x_train, y_train = process_data(data_training)
x_test, y_test   = process_data(data_testing)

3599 buys
3109 sells
reduced to 3109 buys and sells
653 buys
519 sells
reduced to 519 buys and sells


In [31]:
# Width of every LSTM layer and of the hidden dense layer.
LSTM_REPRESENTATION = 16
DENSE_REPRESENTATION = 16

# Three stacked LSTM blocks (each followed by dropout and batch normalization),
# one hidden dense layer, and a two-way softmax output.
model = Sequential([
    # The first two LSTM layers return the full sequence so the next LSTM can consume it.
    LSTM(LSTM_REPRESENTATION, activation='relu', input_shape=x_train.shape[1:], return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),

    LSTM(LSTM_REPRESENTATION, return_sequences=True),
    Dropout(0.1),
    BatchNormalization(),

    # The last LSTM layer returns only its final output.
    LSTM(LSTM_REPRESENTATION),
    Dropout(0.2),
    BatchNormalization(),

    Dense(DENSE_REPRESENTATION, activation='relu'),
    Dropout(0.2),

    Dense(2, activation='softmax'),
])

In [32]:
# `learning_rate` is the supported spelling of the step-size argument; `lr` is
# a deprecated alias.
# NOTE(review): newer TensorFlow releases reject `decay` on the non-legacy
# optimizers - confirm against the installed version before upgrading.
opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay = 1e-6)

# Integer class labels with a two-unit softmax output, hence the sparse
# categorical cross-entropy loss.
model.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = opt,
    metrics = ['accuracy']
)

# Build the log directory with os.path.join: the previous literal contained
# the invalid escape sequence "\L" and a hardcoded Windows path separator.
tensorboard = TensorBoard(
    log_dir=os.path.join(
        "logs",
        "LSTM-{}-DENSE-{}".format(LSTM_REPRESENTATION, DENSE_REPRESENTATION),
    )
)

In [33]:
# Train for 50 epochs in mini-batches of 32 samples. The test arrays are passed
# as validation data and the run is logged through the TensorBoard callback.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=50,
    batch_size=32,
    callbacks=[tensorboard],
    validation_data=(x_test, y_test),
)

Train on 6218 samples, validate on 1038 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [34]:
# Per-epoch accuracy recorded by model.fit for the training and validation data.
accuracy, val_accuracy = (
    history.history[key] for key in ('accuracy', 'val_accuracy')
)

# Draw both curves on the current axes, one labelled line per series.
plt.plot(accuracy, label="Train Accuracy")
plt.plot(val_accuracy, label="Test Accuracy")
plt.legend()
plt.show()