In [1]:
from pandas_datareader import data as pdr 
from datetime import date
import yfinance as yf 
yf.pdr_override()
import pandas as pd
import matplotlib.pyplot as plt 
import math
import quandl
import numpy as np
from sklearn import preprocessing
import random

from tensorflow.keras.models import Seque
%matplotlib qt

  from pandas.util.testing import assert_frame_equal


In [2]:
ticker_sp = '^GSPC'
ticker_gold = 'GC=F'
ticker_oil = 'CL=F'
ticker_dax = '^GDAXI'
ticker_nikkei = '^N225'
ticker_ftse = '^FTSE'
ticker_shanghai = '000001.SS'

auth_tok = "Nv1rJgRR7u88iz_dg7Y6"

end_date = "2020-07-1"
start_date = "1970-01-02"

In [3]:
def getGOLDData ():
    # Contains only price from 1975 onwards
    data = quandl.get("CHRIS/CME_GC1", trim_start = start_date, trim_end = end_date, authtoken=auth_tok)
    data = data[['Last']]
    data.columns = ["GOLD Adj Close"]
    return data

def getSPData():
    # Contains price from 1970 onwards
    data = pdr.get_data_yahoo(ticker_sp, start=start_date, end=end_date)
    data = data[data.columns[4:5]] 
    data.columns = ["SP500 Adj Close"]
    return data

def getDAXData():
    # Contains price from 1988 onwards
    data = pdr.get_data_yahoo(ticker_dax, start=start_date, end=end_date)
    data = data[data.columns[4:5]]
    data.columns = ["DAX Adj Close"]
    return data


def getOILData():
    # Contains only price FROM 1984 onwards
    data = quandl.get("CHRIS/CME_CL1", trim_start = start_date, trim_end = end_date, authtoken=auth_tok)
    data = data[["Last"]]
    data.columns=["OIL Adj Close"]
    return data


def getNIKKEIData():
    # Contains only price from 1970 onwards
    data = pdr.get_data_yahoo(ticker_nikkei, start=start_date, end=end_date)
    data = data[data.columns[4:5]] 
    data.columns = ["NIKKEI Adj Close"]
    return data


def getFTSEData():
    # Contains price from 1984 onwards
    data = pdr.get_data_yahoo(ticker_ftse, start=start_date, end=end_date)
    data = data[data.columns[4:5]] 
    data.columns = ["FTSE Adj Close"]
    return data

def getSHANGHAIData():
    # Contains only price from 1997
    data = pdr.get_data_yahoo(ticker_shanghai, start=start_date, end=end_date)
    data = data[data.columns[4:5]] 
    data.columns = ["SHANGHAI Adj Close"]
    return data

In [4]:
def combineData():
    allData = [getSPData(), getDAXData(), getFTSEData(), getGOLDData(), getOILData()]
    mergedData = pd.concat(allData, axis = 1)
    cleanData = mergedData.dropna()
    return cleanData

In [34]:
data = combineData()

FUTURE_TO_PREDICT = 1

data['Future'] = data['SP500 Adj Close'].shift(-FUTURE_TO_PREDICT)

data.dropna(inplace=True)

def buy_or_sell (current, future):
    if (future > current):
        return 1
    else:
        return 0
    
data['Target'] = list(map(buy_or_sell, data['SP500 Adj Close'], data['Future']))

NUMBER_OF_DATA_POINTS = len(data)
SIZE_TRAINING = int(NUMBER_OF_DATA_POINTS * 0.85)
SIZE_TESTING  = NUMBER_OF_DATA_POINTS - SIZE_TRAINING
print("size of training data: {}".format(SIZE_TRAINING))
print("size of testing data: {}".format(SIZE_TESTING))

data_training = data[:SIZE_TRAINING]
data_testing  = data[SIZE_TRAINING:]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
size of training data: 6677
size of testing data: 1179


In [35]:
print(data_training)

            SP500 Adj Close  DAX Adj Close  FTSE Adj Close  GOLD Adj Close  \
Date                                                                         
1987-12-30       247.860001    1005.190002     1759.800049           485.5   
1988-01-04       255.940002     956.489990     1713.900024           480.5   
1988-01-05       258.630005     996.099976     1789.599976           483.2   
1988-01-06       258.890015    1006.010010     1787.099976           485.3   
1988-01-07       261.070007    1014.469971     1787.199951           483.1   
...                     ...            ...             ...             ...   
2015-06-19      2109.989990   11040.099609     6710.500000          1201.5   
2015-06-22      2122.850098   11460.500000     6825.700195          1183.7   
2015-06-23      2124.199951   11542.540039     6834.899902          1176.2   
2015-06-24      2108.580078   11471.259766     6844.799805          1172.6   
2015-06-25      2102.310059   11473.129883     6807.799805      

In [36]:
def process_data(data):
    data.drop("Future", axis = 1, inplace = True)  # Drop the future column so that the NN doesn't have access to the future
    
    for column in data.columns:  # Normalize the columns
        if column != "Target":   # We only want to normalize the other columns
            data[column] = data[column].pct_change()   # Normalization by percent change
            #data.dropna(inplace = True)
            #data[column] = preprocessing.scale(data[column].values)    # First testing without scaling
    
    data.dropna(inplace = True)

    sequential_data = []
    sequence_length = 15
    
    for day in range (len(data)-sequence_length+1):
        
        sequence = []
        
        for future_day in range (sequence_length):
            sequence.append(data.iloc[day + future_day][:-1])
        
        buy_or_sell = data.iloc[day + sequence_length - 1][-1]
        
        sequential_data.append([sequence, buy_or_sell])
    
    random.shuffle(sequential_data)
    
    buy_sequences  = []
    sell_sequences = []
    
    for sequence, target in sequential_data:
        if target == 1:
            buy_sequences.append([sequence, target])
        elif target == 0:
            sell_sequences.append([sequence, target])
            
    print("{} buys".format(len(buy_sequences)))
    print("{} sells".format(len(sell_sequences)))
    
    random.shuffle(buy_sequences)
    random.shuffle(sell_sequences)
    
    max_size = min(len(buy_sequences), len(sell_sequences))
    
    print("reduced to {} buys and sells".format(max_size))
    
    buy_sequences  = buy_sequences[:max_size]
    sell_sequences = sell_sequences[:max_size]
    
    sequential_data = buy_sequences + sell_sequences
    random.shuffle(sequential_data)
    
    x = []
    y = []
    
    for sequence, target in sequential_data:
        x.append(sequence)
        y.append(target)
        
    x = np.array(x)
    
    return x,y

In [37]:
x_train, y_train = process_data(data_training)
x_test, y_test   = process_data(data_testing)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


3577 buys
3085 sells
reduced to 3085 buys and sells
634 buys
530 sells
reduced to 530 buys and sells


In [38]:
model = Sequential()

model.add(LSTM(128, activation = 'relu', input_shape = (x_train.sphape[1:]), return_sequences = True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences = True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation = 'relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation = 'softmax'))

NameError: name 'Sequential' is not defined

In [None]:
opt = tf.keras.optimizers.Adam(lr=0.001, decay = 1e-6)

model.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = opt,
    metrics = ['accuracy']
)

NAME = "first_trial"

tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

history = model.fit(
    x_train, y_train,
    batch_size = 32,
    epochs = 20,
    validation_data = (x_test, y_test),
    callbacks=[tensorboard]
)