In [26]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

### Data Wrangling

In [41]:
# Load the pre-transformed minute-bar dataset; the path is relative to the notebook's directory.
csv_path = "../S&P_transformed_17-20.csv"
data = pd.read_csv(csv_path)

In [42]:
# Drop the first 65 rows (presumably warm-up rows for the derived columns — TODO confirm).
# `.loc[65:]` selects by row label, so on the default integer index produced by
# `read_csv` it returns the same rows as the positional `data[65:]` on the first run,
# but re-running this cell is a no-op instead of discarding another 65 rows each time.
data = data.loc[65:]
data

Unnamed: 0,time,open,high,low,close,volume,close_change_$,close_change_%,volume_change,volume_change_%,sma_5_min,sma_8_min,sma_13_min,sma_5_bar,sma_8_bar,sma_13_bar,price_change_binary
65,7/28/17 10:34,246.260,246.300,246.2000,246.2600,222615,0.0000,0.000000,60335,37.179566,246.60100,246.665625,246.713077,246.691800,246.613802,246.538354,0
66,7/28/17 10:35,246.260,246.290,246.2257,246.2800,136025,0.0200,0.008121,-86590,-38.896750,246.59300,246.660625,246.716538,246.680400,246.604052,246.530662,1
67,7/28/17 10:36,246.290,246.365,246.2870,246.3600,119204,0.0800,0.032483,-16821,-12.366109,246.62500,246.681250,246.734231,246.677200,246.598928,246.526508,1
68,7/28/17 10:37,246.360,246.365,246.3300,246.3300,89515,-0.0300,-0.012177,-29689,-24.906043,246.66700,246.710000,246.748077,246.676000,246.595945,246.522662,0
69,7/28/17 10:38,246.330,246.380,246.3000,246.3150,151720,-0.0150,-0.006089,62205,69.491147,246.70500,246.742500,246.761154,246.674400,246.593945,246.519046,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323104,11/13/20 15:55,357.920,357.960,357.8100,357.8789,492569,-0.0411,-0.011483,102932,26.417409,357.35000,357.480838,357.522054,357.500748,357.480687,357.836252,0
323105,11/13/20 15:56,357.860,357.950,357.8000,357.8200,309294,-0.0589,-0.016458,-183275,-37.207985,357.44534,357.538338,357.544362,357.505148,357.506188,357.845791,0
323106,11/13/20 15:57,357.825,358.210,357.8118,358.2050,479111,0.3850,0.107596,169817,54.904718,357.53134,357.580837,357.564500,357.496948,357.530438,357.861098,1
323107,11/13/20 15:58,358.205,358.350,358.1400,358.3500,718921,0.1450,0.040480,239810,50.053119,357.62734,357.612087,357.589885,357.497348,357.553187,357.877252,1


In [43]:
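# Disabled experiment: reverse the row order of the dataset.
# Left commented out; the rows displayed above are already in ascending time order.
# df = data[::-1].reset_index(drop=True)
# df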

In [44]:
# Keep only the price and volume columns; these five become the model's input features.
feature_columns = ["open", "high", "low", "close", "volume"]
data = data[feature_columns]
data

Unnamed: 0,open,high,low,close,volume
65,246.260,246.300,246.2000,246.2600,222615
66,246.260,246.290,246.2257,246.2800,136025
67,246.290,246.365,246.2870,246.3600,119204
68,246.360,246.365,246.3300,246.3300,89515
69,246.330,246.380,246.3000,246.3150,151720
...,...,...,...,...,...
323104,357.920,357.960,357.8100,357.8789,492569
323105,357.860,357.950,357.8000,357.8200,309294
323106,357.825,358.210,357.8118,358.2050,479111
323107,358.205,358.350,358.1400,358.3500,718921


**Scaling the Data**

In [31]:
# Length of each input window, in rows (390 = 1 trading day).
history_points = 390

# Positional index of the column the model predicts
# (index 3 is "close" in the column selection made earlier in this notebook).
column = 3

In [32]:
# Rescale every column of `data` independently with a min-max scaler.
# Note that `MinMaxScaler` here is the fitted scaler *instance*, not the sklearn class.
# NOTE(review): the scaler is fitted on all rows, including those that later land in
# the test split — consider fitting on the training rows only.
MinMaxScaler = preprocessing.MinMaxScaler()
MinMaxScaler.fit(data)
X = MinMaxScaler.transform(data)

In [33]:
# Slice the scaled data into overlapping windows of `history_points` consecutive rows.
# Window k covers rows [k, k + history_points); its target is the scaled value of
# `column` in the row immediately after the window (the next row, despite the
# "next_day" naming).
n_windows = len(X) - history_points
historical_data_normalised = np.array(
    [X[start:start + history_points].copy() for start in range(n_windows)]
)

# Targets as a column vector of shape (n_windows, 1): one scaled value per window.
next_day_close_values_normalised = X[history_points:, column, np.newaxis].copy()

In [34]:
# Retrieve the real (unscaled) target values: the value of `column` in the row that
# follows each window.
# Positional slicing is required here: `data` keeps its original row labels after the
# earlier row slice, so label-based lookups such as `series[i + history_points]` would
# pick rows that are offset from the windows built positionally from `X`.
next_day_close_values = np.array(data.iloc[history_points:, column])
# Add a trailing axis so the targets form a column vector of shape (n_windows, 1).
unscaled_y = np.expand_dims(next_day_close_values, -1)

In [35]:
# Scaler used to map model outputs back to real values.
# The training targets are taken from the target column of `X`, which was scaled using
# that column's min/max over *all* rows of `data`. Fit this scaler on the same full
# column so that `inverse_transform` exactly undoes that scaling; fitting on the targets
# alone would skip the first `history_points` rows and could yield a different min/max.
y_normaliser = preprocessing.MinMaxScaler()
y_normaliser.fit(data.iloc[:, [column]].to_numpy())

MinMaxScaler(copy=True, feature_range=(0, 1))

In [36]:
# Sanity check: there must be exactly one target per input window.
assert len(historical_data_normalised) == len(next_day_close_values_normalised)

In [37]:
# Fraction of the windows assigned to the TRAINING set; the remainder is the test set.
# (Despite its name, `test_split` = 0.9 means 90% train / 10% test.)
test_split = 0.9
n = int(len(historical_data_normalised) * test_split)

# Split by position without shuffling: the first `n` windows train the model,
# the remaining windows are held out for testing.
X_train, X_test = historical_data_normalised[:n], historical_data_normalised[n:]
y_train, y_test = next_day_close_values_normalised[:n], next_day_close_values_normalised[n:]

# Real (unscaled) targets for the test windows, used to report errors in price units.
unscaled_y_test = unscaled_y[n:]

In [38]:
# Shape of the windowed inputs: (number of windows, rows per window, features per row)
np.shape(historical_data_normalised)

(322654, 390, 5)

In [39]:
import keras
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Dropout, LSTM, Input, Activation, concatenate
from keras import optimizers
import numpy as np
# Seed NumPy and TensorFlow so repeated runs start from the same random state.
np.random.seed(4)
tf.random.set_seed(4)

# Network: LSTM over each (history_points, 5) window -> dropout -> 64-unit dense layer
# with sigmoid activation -> single linear output (the scaled target value).
lstm_input = Input(shape=(history_points, 5), name='lstm_input')
x = LSTM(history_points, name='lstm_0')(lstm_input)
x = Dropout(0.2, name='lstm_dropout_0')(x)
x = Dense(64, name='dense_0')(x)
x = Activation('sigmoid', name='sigmoid_0')(x)
x = Dense(1, name='dense_1')(x)
output = Activation('linear', name='linear_output')(x)
model = Model(inputs=lstm_input, outputs=output)

# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated and is
# rejected by newer Keras releases.
adam = optimizers.Adam(learning_rate=0.0005)

# Mean squared error on the scaled targets.
model.compile(optimizer=adam, loss='mse')

In [40]:
# Train on the training windows; Keras holds out 10% of them (validation_split)
# to report a validation loss after each epoch.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=50,
    shuffle=True,
    validation_split=0.1,
)

# Loss (MSE on scaled values) on the held-out test windows.
evaluation = model.evaluate(X_test, y_test)
print(f"Test Loss:{evaluation}")

Epoch 1/50
 217/8168 [..............................] - ETA: 4:18:37 - loss: 0.0029

KeyboardInterrupt: 

In [None]:
# model.predict returns values in the scaled space; map them back to real values
# with the y scaler fitted earlier.
y_test_predicted = model.predict(X_test)
y_test_predicted = y_normaliser.inverse_transform(y_test_predicted)

# Also predict on the training windows, to see how the model performs on data it has seen.
y_train_predicted = model.predict(X_train)
y_train_predicted = y_normaliser.inverse_transform(y_train_predicted)

# Guard against silent broadcasting: if one array were (m, 1) and the other (m,),
# the subtraction below would produce an (m, m) matrix and a meaningless error value.
assert unscaled_y_test.shape == y_test_predicted.shape

# Mean squared error in real units, then divided by the range of the real test
# values and multiplied by 100.
real_mse = np.mean(np.square(unscaled_y_test - y_test_predicted))
scaled_mse = real_mse / (np.max(unscaled_y_test) - np.min(unscaled_y_test)) * 100
print(f"Scaled MSE: {scaled_mse}")

In [None]:
import matplotlib.pyplot as plt

# Explicit figure/axes so the plot can be titled and labelled.
fig, ax = plt.subplots(figsize=(22, 15))

# Range of test samples to draw. `end = None` keeps the final sample;
# an end of -1 would silently drop the last point from both curves.
start = 0
end = None

real = ax.plot(unscaled_y_test[start:end], label='Real')
pred = ax.plot(y_test_predicted[start:end], label='Predicted')

ax.set_title('Real vs. predicted values on the test set')
ax.set_xlabel('Test sample index')
ax.set_ylabel('Value (unscaled)')
ax.legend()

plt.show()