In [1]:
from pandas_datareader import data as pdr 
from datetime import date
import yfinance as yf 
yf.pdr_override()
import pandas as pd
import matplotlib.pyplot as plt 
import math
import quandl
import numpy as np

  from pandas.util.testing import assert_frame_equal


In [2]:
ticker_sp = '^GSPC'
ticker_gold = 'GC=F'
ticker_oil = 'CL=F'
ticker_dax = '^GDAXI'
ticker_nikkei = '^N225'
ticker_ftse = '^FTSE'
ticker_shanghai = '000001.SS'

auth_tok = "Nv1rJgRR7u88iz_dg7Y6"

end_date = "2020-05-1"
start_date = "2000-01-01"

In [3]:
def getGOLDData ():
    # Contains price and volume
    data = quandl.get("CHRIS/CME_GC1", trim_start = start_date, trim_end = end_date, authtoken=auth_tok)
    data = data[['Last', 'Volume']]
    data.columns = ["GOLD Adj Close", "GOLD Volume"]
    return data.dropna()

def getSPData():
    # Contains price and volume
    data = pdr.get_data_yahoo(ticker_sp, start=start_date, end=end_date)
    data = data[data.columns[4:6]] 
    data.columns = ["SP500 Adj Close",  "SP500 Volume"]
    return data

def getDAXData():
    # Contains price and volume
    data = pdr.get_data_yahoo(ticker_dax, start=start_date, end=end_date)
    data = data[data.columns[4:6]]
    data.columns = ["DAX Adj Close",  "DAX Volume"]
    return data


def getOILData():
    # Contains price and volume
    data = quandl.get("CHRIS/CME_CL1", trim_start = start_date, trim_end = end_date, authtoken=auth_tok)
    data = data[["Last", "Volume"]]
    data.columns=["OIL Adj Close", "OIL Volume"]
    return data.dropna()


def getNIKKEIData():
    # Contains only price
    data = pdr.get_data_yahoo(ticker_nikkei, start=start_date, end=end_date)
    data = data[data.columns[4:5]] 
    data.columns = ["NIKKEI Adj Close"]
    return data


def getFTSEData():
    # Contains price and volume
    data = pdr.get_data_yahoo(ticker_ftse, start=start_date, end=end_date)
    data = data[data.columns[4:6]] 
    data.columns = ["FTSE Adj Close",  "FTSE Volume"]
    return data

def getSHANGHAIData():
    # Contains only price
    data = pdr.get_data_yahoo(ticker_shanghai, start=start_date, end=end_date)
    data = data[data.columns[4:5]] 
    data.columns = ["SHANGHAI Adj Close"]
    return data

In [4]:
def checkData (data):
    counter = 0
    for index, row in data.iterrows():
        for item in row:
            if (math.isnan(item)):
                counter += 1
                break
    return counter

def normalizeData(data):
    for column in data:
        maxValue = max(data[column])
        data[column] = data[column] / maxValue

In [5]:
def combineData():
    allData = [getSPData(), getGOLDData(), getDAXData(), getOILData(), getNIKKEIData(), getSHANGHAIData(), getFTSEData()]
    mergedData = pd.concat(allData, axis = 1)
    print("REMOVED: {} DATA POINTS".format(checkData(mergedData)))
    cleanData = mergedData.dropna()
    normalizeData(cleanData)
    return cleanData

In [30]:
data = combineData()

print(data)

day_counter = 0

# Size of prediction
prediction_size = 3

# Number of days in the past
historical_size = 15

# Size of available data.
data_size = len(data) - prediction_size - historical_size

# Size of data alocated to training
training_size = int(0.85*data_size)

# Size of data alocated to testing
testing_size = data_size - training_size

# Number of data elements in a single day
daily_data_size = 12 

x_train = np.zeros((training_size, historical_size, daily_data_size))
y_train = np.zeros((training_size))
x_test  = np.zeros((testing_size, historical_size, daily_data_size))
y_test  = np.zeros((testing_size))

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
REMOVED: 831 DATA POINTS
            SP500 Adj Close  SP500 Volume  GOLD Adj Close  GOLD Volume  \
Date                                                                     
2000-01-04         0.413278      0.088074        0.150209     0.038545   
2000-01-05         0.414072      0.094752        0.149362     0.027149   
2000-01-06         0.414468      0.095346        0.149521     0.031954   
2000-01-07         0.425696      0.106946        0.149786     0.023927   
2000-01-11         0.424836      0.088511        0.150580     0.038427   
...                     ...           ...             ...          ...   
2020-04-23         0.826248      0.502

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [50]:
day_counter = 0

for index, row in data.iterrows():
    
    if (day_counter == data_size):
        print("100% completed")
        break

    if (day_counter < len(x_train)):
        
        
        for i in range (historical_size):
            for j in range (daily_data_size):
                x_train[day_counter][i][j] = data.iloc[day_counter + i][j]
                
        base_value = data.iloc[day_counter + historical_size - 1]['SP500 Adj Close']
        future_values = []
        for i in range (prediction_size):
            future_values += [data.iloc[day_counter + historical_size + i]['SP500 Adj Close']] 
        average_future = sum(future_values) / len(future_values)
        if ((average_future - base_value) > 0):
            delta = 1
        else:
            delta = 0
            
        y_train[day_counter] = delta
        
        
    else:
        
        for i in range (historical_size):
            for j in range (daily_data_size):
                x_test[day_counter - len(x_train)][i][j] = data.iloc[day_counter + i][j]
                
        base_value = data.iloc[day_counter + historical_size - 1]['SP500 Adj Close']
        future_values = []
        for i in range (prediction_size):
            future_values += [data.iloc[day_counter + historical_size + i]['SP500 Adj Close']] 
        average_future = sum(future_values) / len(future_values)
        if ((average_future - base_value) > 0):
            delta = 1
        else:
            delta = 0
            
        y_test[day_counter - len(y_train)] = delta
        
    day_counter += 1
    print("{}% completed".format(int(100 * day_counter/data_size)), end = '\r')

100% completed


In [51]:
print(x_train)

[[[0.66896033 0.27519612 0.64478213 ... 0.51489584 0.9182332  0.16154064]
  [0.67335178 0.33265394 0.63991105 ... 0.51584431 0.91811759 0.24228033]
  [0.67875616 0.33571428 0.63556944 ... 0.51699366 0.91992808 0.22391463]
  ...
  [0.69378204 0.32956653 0.65150633 ... 0.52740577 0.93764765 0.16205741]
  [0.69318253 0.32055659 0.65537142 ... 0.53013586 0.93450179 0.15473031]
  [0.69434608 0.30665062 0.65378303 ... 0.52561489 0.9373395  0.18136051]]

 [[0.67335178 0.33265394 0.63991105 ... 0.51584431 0.91811759 0.24228033]
  [0.67875616 0.33571428 0.63556944 ... 0.51699366 0.91992808 0.22391463]
  [0.67825702 0.31514381 0.62905702 ... 0.51857131 0.91955573 0.20485719]
  ...
  [0.69318253 0.32055659 0.65537142 ... 0.53013586 0.93450179 0.15473031]
  [0.69434608 0.30665062 0.65378303 ... 0.52561489 0.9373395  0.18136051]
  [0.69854553 0.31247452 0.65500079 ... 0.53402748 0.93410372 0.20288735]]

 [[0.67875616 0.33571428 0.63556944 ... 0.51699366 0.91992808 0.22391463]
  [0.67825702 0.315143

In [22]:
print(data.tail(10))

            SP500 Adj Close  SP500 Volume  GOLD Adj Close  GOLD Volume  \
Date                                                                     
2020-04-16         0.826765      0.452155        0.913433     0.000203   
2020-04-17         0.848917      0.505589        0.892942     0.000280   
2020-04-20         0.833737      0.455661        0.900672     0.000133   
2020-04-21         0.808163      0.443063        0.886112     0.000836   
2020-04-22         0.826694      0.440779        0.913538     0.000569   
2020-04-23         0.826248      0.502479        0.922010     0.000605   
2020-04-24         0.837748      0.469132        0.907979     0.000278   
2020-04-27         0.850075      0.453400        0.908932     0.000212   
2020-04-28         0.845618      0.495179        0.905332     0.000132   
2020-04-30         0.860101      0.569395        0.893789     0.000411   

            DAX Adj Close  DAX Volume  OIL Adj Close  OIL Volume  \
Date                                       

In [29]:
print(data_size)
print(training_size)
print(testing_size)

4448
3780
668


In [37]:
print(data.iloc[1]['GOLD Adj Close'])

0.1493619950230317
