## Predicting Stock Price Movement with a Feed Forward Neural Network, using Tensorflow

In [4]:
import pandas as pd
import tensorflow as tf
import numpy as np

To validate our model, we split the 186k events of 1/6/17 so that we train on the first 80% of the day and validate on the final 20%. After tuning parameters, we train on the entire 1/6/17 data set and then test on the following trading day's data, which is from 1/9/17.

In [7]:
# Load the processed event files for the training day (01/06) and the test day (01/09).
train = pd.read_csv('../SOXX_01_06_processed.csv')
test = pd.read_csv('../SOXX_01_09_processed.csv')

# Discard the first 1000 and last 1000 events of each day: the open and the close of the
# trading day behave unusually. The index is rebuilt so rows are numbered from 0 again.
train_data = train.iloc[1000:-1000].reset_index(drop=True)
test_data = test.iloc[1000:-1000].reset_index(drop=True)

### Preprocessing: Standardizing Features and Encoding Labels
Use the sklearn preprocessing module to encode the labels and to standard-scale the feature columns for the train, validation, and test sets.

In [8]:
from sklearn import preprocessing

# Depth columns in the order they are fed to the network:
# bid10 ... bid1 followed by ask1 ... ask10.
bid_columns = ['bid' + str(level) for level in range(10, 0, -1)]
ask_columns = ['ask' + str(level) for level in range(1, 11)]

train_depths = train_data[bid_columns + ask_columns]
test_depths = test_data[bid_columns + ask_columns]
depth_names = train_depths.columns

# Fit the scaler on the training day ONLY. Its mean/variance define the feature space the
# network is trained in, so every other data set must be mapped with these same statistics.
# NOTE(review): the statistics include the rows later used for validation (150000 onwards);
# fit on train_depths[:150000] instead if a strictly leak-free validation score is needed.
scaler = preprocessing.StandardScaler()
scaled_depths = scaler.fit_transform(train_depths)
train_x = pd.DataFrame(scaled_depths, columns=depth_names)
# The remaining features are copied as-is from the processed file
# (presumably already normalized when the CSV was produced - confirm upstream).
train_x['normalized_mid_price'] = train_data.norm_mid_price
train_x['mid_price_change'] = train_data.norm_mid_price_change
train_x['normalized_relative_depth'] = train_data.norm_rel_depth

# Apply the training-day statistics to the test day. Calling fit_transform here would
# re-fit the scaler on the test data, so the test features would be standardized
# differently from the features the model was trained on.
scaled_test_depths = scaler.transform(test_depths)
test_x = pd.DataFrame(scaled_test_depths, columns=depth_names)
test_x['normalized_mid_price'] = test_data.norm_mid_price
test_x['mid_price_change'] = test_data.norm_mid_price_change
test_x['normalized_relative_depth'] = test_data.norm_rel_depth

# Chronological split of the training day: first 150000 events for training,
# the remainder for validation.
train_set_x = train_x[:150000]
validation_x = train_x[150000:]

Check that the input columns look good and standardized

In [9]:
# Spot-check ten consecutive rows of the training features.
train_set_x.iloc[10000:10010]

Unnamed: 0,bid10,bid9,bid8,bid7,bid6,bid5,bid4,bid3,bid2,bid1,...,ask4,ask5,ask6,ask7,ask8,ask9,ask10,normalized_mid_price,mid_price_change,normalized_relative_depth
10000,-0.916484,0.469729,-0.763767,0.172598,-0.578536,-0.742413,-1.439773,-0.087217,0.251144,-0.754155,...,-1.813906,-1.779646,-1.305669,-0.946409,-0.96268,-0.949827,-0.346373,-1.427243,1.140265,2.69412
10001,-0.916484,0.469729,-0.763767,0.172598,-0.578536,-0.742413,-1.439773,-0.087217,0.251144,-0.754155,...,-1.813906,-1.779646,-1.3057,-0.946599,-0.617941,-0.94987,-0.924091,-1.427243,1.140265,2.694462
10002,-0.916484,0.469729,-0.763767,0.172598,-0.578536,-0.742413,-1.439773,-0.087217,0.251144,-0.754155,...,-1.813906,-1.779646,-1.3057,-0.946599,-0.617941,-0.94987,-0.924091,-1.427243,1.140265,2.694462
10003,-0.916484,0.469729,-0.763767,0.172598,-0.578536,-0.742413,-1.439773,-0.087217,0.251144,-0.754155,...,-1.813906,-1.779646,-1.3057,-0.946599,-0.617941,-0.94987,-0.924091,-1.427243,1.140265,3.329511
10004,-0.916484,0.469729,-0.763767,0.172598,-0.578536,-0.742413,-1.439773,-0.087217,0.251144,-0.754155,...,-1.243165,-1.779646,-1.3057,-0.946599,-0.617941,-0.94987,-0.924091,-1.427243,1.140265,2.694369
10005,-0.916484,0.469729,-0.763767,0.172598,-0.578536,-0.742413,-1.439773,-0.087217,0.251144,-0.754155,...,-0.862671,-1.779646,-1.3057,-0.946599,-0.617941,-0.94987,-0.924091,-1.427243,1.140265,2.339232
10006,-0.916484,0.469729,-0.763767,0.172598,-0.578536,-0.742413,-1.439773,-0.087217,0.251144,-0.754155,...,-0.862671,-1.779646,-1.3057,-0.946599,-0.617941,-0.94987,-0.924091,-1.427243,1.306359,2.339232
10007,0.987799,-0.475337,0.863938,-0.076316,-0.350305,-1.189503,0.04008,-0.087016,-1.386611,-0.754155,...,-0.862671,-1.779646,-1.3057,-0.946599,-0.617941,-0.94987,-0.924091,-1.427243,1.306359,2.516934
10008,0.987799,-0.475337,0.863938,-0.076316,-0.350305,-1.189503,-0.38271,-0.087016,-1.386611,-0.754155,...,-0.862671,-1.779646,-1.3057,-0.946599,-0.617941,-0.94987,-0.924091,-1.427243,1.472468,2.339333
10009,0.987799,-0.475337,0.863938,-0.076316,-0.350305,-1.189503,-0.38271,-0.087016,-1.386611,-0.754155,...,-0.862671,-1.779646,-1.3057,-0.946599,-0.617941,-0.94987,-0.924091,-1.427243,1.472468,2.339333


### Label Encoding

In [10]:
# Learn the label -> integer mapping from the training day and encode it in one step.
le = preprocessing.LabelEncoder()

train_y = train_data['change_label']
encoded_train_y = le.fit_transform(train_y)

# Same chronological split point as the feature frames (150000).
encoded_train_set_y, encoded_validation_set_y = (
    encoded_train_y[:150000],
    encoded_train_y[150000:],
)

# The test day reuses the mapping learned above; it is not re-fitted.
test_y = test_data['change_label']
encoded_test_y = le.transform(test_y)

## Building the Neural Net

#### Defining Layer Hyperparameters

In [11]:
# Rows = number of training examples, columns = number of input features.
num_examples, n_inputs = train_x.shape

# Widths of the two hidden layers.
n_hidden1 = 13
n_hidden2 = 8

# Size of the output layer (one logit per class;
# assumes three distinct change_label values - TODO confirm).
n_outputs = 3

#### Setting up placeholders and 2 hidden layers

In [12]:
# Graph inputs: a float feature matrix with any number of rows, and integer class labels.
# `shape=(None)` in plain Python is just `None`, i.e. the label shape is left unconstrained.
x_place = tf.placeholder(tf.float32, shape=(None, n_inputs), name="x_place")
y_place = tf.placeholder(tf.int32, shape=None, name="y_place")

# Two fully connected ReLU layers followed by a linear output layer producing the logits.
layer1 = tf.layers.dense(
    x_place,
    n_hidden1,
    activation=tf.nn.relu,
    name='hidden1',
)
layer2 = tf.layers.dense(
    layer1,
    n_hidden2,
    activation=tf.nn.relu,
    name='hidden2',
)
logits = tf.layers.dense(layer2, n_outputs, name='outputs')

# Softmax over the logits gives per-class probabilities.
predictions = tf.nn.softmax(logits, name='predictions')

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.


#### Defining a loss function: softmax cross entropy

In [13]:
with tf.name_scope("loss"):
    # Per-example cross entropy between the integer labels and the raw logits
    # (the op applies the softmax internally).
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits,
        labels=y_place,
    )
    # Mean over the fed examples is the scalar the optimizer minimizes.
    loss = tf.reduce_mean(xentropy, name="loss")
    # Scalar summary so the loss can be written to the event file.
    loss_summary = tf.summary.scalar('log_loss', loss)

#### Define an optimization method: AdamOptimizer with a 0.0001 learning rate

In [14]:
# Step size for Adam.
learning_rate = 0.0001

with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # One run of this op performs a single gradient update of the trainable variables.
    training_op = optimizer.minimize(loss)

#### Define an accuracy metric: did the neural network predict the correct label?

In [15]:
with tf.name_scope("eval"):
    # True for every example whose label has the highest logit (top-1 hit).
    correct = tf.nn.in_top_k(predictions=logits, targets=y_place, k=1)
    # Fraction of hits among the fed examples.
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
    # Scalar summary so the accuracy can be written to the event file.
    accuracy_summary = tf.summary.scalar('accuracy', accuracy)

#### Neural Net Iteration
* Define epoch and batch_size parameters
* Instantiate variables and a saver to save the model

* We want to train sequentially, so within each epoch we keep track of the current start index and advance it by the batch size after every iteration.
* We train on the batch, then move the start index so that the next iteration trains on the next sequential batch.
* Every 50 epochs, write the loss and accuracy summaries and print the average batch accuracy and loss for that epoch.
* Finally save the model after all epochs have completed.

In [None]:
%%time
import datetime

n_epochs = 1000
batch_size = 500
log_every = 50  # report metrics and write summaries once every `log_every` epochs

num_examples = len(train_x)
init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    init.run()
    writer = tf.summary.FileWriter('./graphs/', sess.graph)
    try:
        for epoch in range(n_epochs):
            log_this_epoch = (epoch + 1) % log_every == 0
            epoch_acc = 0
            epoch_loss = 0
            iterations = 0
            current_index = 0
            # Batches are consumed in order (no shuffling). A trailing partial batch
            # (num_examples % batch_size rows) is not used.
            for iteration in range(num_examples // batch_size):
                next_index = current_index + batch_size
                x_batch = train_x[current_index:next_index]
                y_batch = encoded_train_y[current_index:next_index]
                batch_feed = {x_place: x_batch, y_place: y_batch}
                sess.run(training_op, feed_dict=batch_feed)
                current_index = next_index

                # Per-batch metrics are only needed on logging epochs. They are evaluated
                # after the update step, in a single run, instead of on every batch.
                if log_this_epoch:
                    acc_batch, loss_batch = sess.run([accuracy, loss], feed_dict=batch_feed)
                    epoch_acc = epoch_acc + acc_batch
                    epoch_loss = epoch_loss + loss_batch
                    iterations += 1

            if log_this_epoch:
                # Summaries are computed over the whole training set, not one batch.
                summary1, summary2 = sess.run(
                    [loss_summary, accuracy_summary],
                    feed_dict={x_place: train_x, y_place: encoded_train_y},
                )
                writer.add_summary(summary1, epoch + 1)
                writer.add_summary(summary2, epoch + 1)

                print(datetime.datetime.now())
                print(epoch + 1)
                # Averages of the per-batch metrics collected during this epoch.
                print("Epoch acc: " + str(epoch_acc / iterations))
                print("Epoch loss: " + str(epoch_loss / iterations))

        # Persist the trained weights once all epochs have completed.
        save_path = saver.save(sess, "./my_model_final.ckpt")
    finally:
        # Flush and close the event file even if training is interrupted.
        writer.close()

In [None]:
with tf.Session() as sess:
    # Restore the checkpoint written by the training cell. The path must match the one
    # passed to saver.save ("./my_model_final.ckpt"), otherwise a fresh run of the
    # notebook has no checkpoint to load.
    saver.restore(sess, "./my_model_final.ckpt")
    # Raw logits for every test-day example.
    Z = logits.eval(feed_dict={x_place: test_x})
    # Predicted class = index of the largest logit; comparable with encoded_test_y.
    y_pred = np.argmax(Z, axis=1)