In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn
from sklearn.model_selection import KFold
import matplotlib
import matplotlib.pyplot as plt
from dateutil.parser import parse
from datetime import datetime, timedelta
from collections import deque

%matplotlib inline

print("Pandas version:      ", pd.__version__)
print("NumPy version:       ", np.__version__)
print("SciKit Learn version:", sklearn.__version__)
print("TensorFlow version:  ", tf.__version__)
print("MatPlotLib version:  ", matplotlib.__version__)

seed = 8
tf.set_random_seed(seed)
np.random.seed(seed)

Pandas version:       0.22.0
NumPy version:        1.14.2
SciKit Learn version: 0.19.1
TensorFlow version:   1.6.0
MatPlotLib version:   2.2.0


In [2]:
dataframe = pd.read_csv('prices.csv')
dataframe.describe()

Unnamed: 0,open,close,low,high,volume
count,851264.0,851264.0,851264.0,851264.0,851264.0
mean,70.836986,70.857109,70.118414,71.543476,5415113.0
std,83.695876,83.689686,82.877294,84.465504,12494680.0
min,0.85,0.86,0.83,0.88,0.0
25%,33.84,33.849998,33.48,34.189999,1221500.0
50%,52.77,52.799999,52.23,53.310001,2476250.0
75%,79.879997,79.889999,79.110001,80.610001,5222500.0
max,1584.439941,1578.130005,1549.939941,1600.930054,859643400.0


In [3]:
dataframe.tail(10)

Unnamed: 0,date,symbol,open,close,low,high,volume
851254,2016-12-30,XRAY,58.290001,57.73,57.540001,58.360001,949200.0
851255,2016-12-30,XRX,8.72,8.73,8.7,8.8,11250400.0
851256,2016-12-30,XYL,49.98,49.52,49.360001,50.0,646200.0
851257,2016-12-30,YHOO,38.720001,38.669998,38.43,39.0,6431600.0
851258,2016-12-30,YUM,63.93,63.330002,63.16,63.939999,1887100.0
851259,2016-12-30,ZBH,103.309998,103.199997,102.849998,103.93,973800.0
851260,2016-12-30,ZION,43.07,43.040001,42.689999,43.310001,1938100.0
851261,2016-12-30,ZTS,53.639999,53.529999,53.27,53.740002,1701200.0
851262,2016-12-30 00:00:00,AIV,44.73,45.450001,44.41,45.59,1380900.0
851263,2016-12-30 00:00:00,FTV,54.200001,53.630001,53.389999,54.48,705100.0


In [4]:
desired_columns = ['open', 'close']
basic_mlp_data = dataframe[desired_columns]
basic_mlp_data.head()

Unnamed: 0,open,close
0,123.43,125.839996
1,125.239998,119.980003
2,116.379997,114.949997
3,115.480003,116.620003
4,117.010002,114.970001


In [8]:
# Call this in IPython notebooks before any elements are added to
# the default graph otherwise if you rerun cells you can get 
# annoying errors.
tf.reset_default_graph()

# Define the Neural Network topology with the 'net_hidden_sizes'
# and how much we should regularise it and how quickly it should
# learn. Also the type of non linearity we should use.
amount_epochs = 50
learning_rate = 0.001
batch_size = 128
net_hidden_sizes = [128, 64, 8]
l2_strength = 0.01
non_linearity = tf.nn.relu
dropout_amount = 0.7

# The input to the graph - the targets (close) and the inputs
# (open). Also a placeholder to pass a variable dropout rate. 
net_input = tf.placeholder(tf.float32, shape=[None, 1])
net_target = tf.placeholder(tf.float32, shape=[None, 1])
dropout_prob = tf.placeholder(tf.float32)

# L2 regularisation to penalise the weights from growing too
# large. Useful to prevent overfitting.
regulariser = tf.contrib.layers.l2_regularizer(scale=l2_strength)

# Build the network from the list of dimensions. Apply l2 and
# dropout regularisation to the layers.
net = net_input
for size in net_hidden_sizes:
    net = tf.layers.dense(inputs=net, 
                          units=size, 
                          activation=non_linearity, 
                          kernel_regularizer=regulariser)
    net = tf.layers.dropout(inputs=net,
                            rate=dropout_prob)

# The models prediction has a linear output. 
net_output = tf.layers.dense(inputs=net,
                             units=1, 
                             activation=None, 
                             kernel_regularizer=regulariser)    

# The main loss for penalising the network on how well it does.
loss = tf.losses.mean_squared_error(labels=net_target, 
                                    predictions=net_output)

# TensorFlows manner of applying l2 to the loss.
l2_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
l2_loss = tf.contrib.layers.apply_regularization(regulariser, 
                                                 l2_variables)
total_loss = loss + l2_loss

# Train and initialisation TensorFlow operations to be ran
# in the session.
train_op = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
init_op = tf.global_variables_initializer()
print('ffff')

ffff


In [9]:
with tf.Session() as sess:
    
    amount_folds = 5
    k_folds = KFold(n_splits=amount_folds)
    data = basic_mlp_data.as_matrix()
    fold_errors = []
    fold_iteration = 0
    
    # Cross validate the dataset, using K-Fold.
    for train_indices, test_indices in k_folds.split(data):

        # Each new fold, reinitialise the network.
        sess.run(init_op)
        
        # Training phase.
        for epoch in range(amount_epochs):
            
            # Each new epoch, reshuffle the train set.
            random_train_indices = np.random.permutation(train_indices)
            train_set = data[random_train_indices]
            
            # Loop over the train set and optimise the network.
            for begin in range(0, len(train_set), batch_size):
                end = begin + batch_size
                batch_x = train_set[begin:end].T[0].reshape((-1, 1))
                batch_y = train_set[begin:end].T[1].reshape((-1, 1))
                
                sess.run(train_op, feed_dict={
                    net_input: batch_x,
                    net_target: batch_y,
                    dropout_prob: dropout_amount
                })
        
        # Testing phase.
        test_set = data[test_indices]
        
        # Collate the error over the test set.
        all_error = []
        for begin in range(0, len(test_set), batch_size):
            end = begin + batch_size 
            batch_x = train_set[begin:end].T[0].reshape((-1, 1))
            batch_y = train_set[begin:end].T[1].reshape((-1, 1))
            
            error = sess.run(loss, feed_dict={
                net_input: batch_x,
                net_target: batch_y,
                dropout_prob: 1.0
            }) 
            all_error.append(error)
        
        all_error = np.array(all_error).reshape((-1))
        fold_errors.append(all_error)
        
        print("\nFold iteration:  ", fold_iteration,
              "\nError mean:      ", np.mean(all_error),
              "\nError deviation: ", np.std(all_error),
              "\n")
        fold_iteration += 1      
        
    fold_errors = np.array(fold_errors).reshape((amount_folds, -1))
    print('fff')


Fold iteration:   0 
Error mean:       2.7146893 
Error deviation:  3.3762183 


Fold iteration:   1 
Error mean:       3.507859 
Error deviation:  3.1102047 


Fold iteration:   2 
Error mean:       2.104382 
Error deviation:  2.184533 


Fold iteration:   3 
Error mean:       2.0846407 
Error deviation:  1.8512129 


Fold iteration:   4 
Error mean:       1.8507316 
Error deviation:  1.9735212 

fff


In [10]:
print('dgdfgd')

dgdfgd
