# DNN Stock Price Predictor
By: Alfredo Andere Valencia <br>
<br>
Used data from the S&P 500 stock market and a Deep Neural Network (DNN) with features:
<i>
<br> x1-x30    : Opening Price of stock for past 30 days, 
<br> x31-x60   : Highest price of the day of past 30 days, 
<br> x61-x90   : Lowest price of the day for past 30 days, 
<br> x91-x120  : Closing price of past 30 days, 
<br> x121-x150 : Volume of trades of past 30 days.
</i>
<br>
<br> The DNN tries to learn and predict:
<i>
<br> [1, 0] : Stocks will go up in price in the next 24 hours, 
<br> [0, 1]  : stocks price will decrease in price in the next 24 hours <br> </i>
<br>
<b>Dataset:</b> <br>
<i>DJIA 30 Stock Time Series</i> <br>
Historical stock data for DIJA 30 companies (2006-01-01 to 2018-01-01): <br>
https://www.kaggle.com/szrlee/stock-time-series-20050101-to-20171231 <br>
(Date, Open, High, Low, Close, Volume, Name)

<i>S&P 500 stock data </i> <br>
Historical stock data for all current S&P 500 companies (2013 - 2018) <br>
https://www.kaggle.com/camnugent/sandp500 <br>
(date, open, high, low, close, volume, Name)

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import csv
import pickle
import random
import math

  from ._conv import register_converters as _register_converters


In [2]:
# Load in data into a pandas dataframe
stocks_DOJ30 = pd.read_csv('stock-time-series-20050101-to-20171231/all_stocks_2017-01-01_to_2018-01-01.csv')
stocks_500 = pd.read_csv('all_stocks_5yr.csv')

# Capitalize columns to match first dataset
stocks_500 = stocks_500.rename(columns={'open':'Open', 'high':'High', 'low':'Low', 'close':'Close', 'volume':'Volume'})


print(stocks_DOJ30.head()) 
print(stocks_500.head())


         Date    Open    High     Low   Close   Volume Name
0  2017-01-03  178.83  180.00  177.22  178.05  2510055  MMM
1  2017-01-04  178.03  178.90  177.61  178.32  1541985  MMM
2  2017-01-05  178.26  179.14  176.89  177.71  1447848  MMM
3  2017-01-06  177.29  178.60  175.80  178.23  1625049  MMM
4  2017-01-09  178.37  178.38  177.20  177.27  1622625  MMM
         date   Open   High    Low  Close    Volume Name
0  2013-02-08  15.07  15.12  14.63  14.75   8407500  AAL
1  2013-02-11  14.89  15.01  14.26  14.46   8882000  AAL
2  2013-02-12  14.45  14.51  14.10  14.27   8126000  AAL
3  2013-02-13  14.30  14.94  14.25  14.66  10259500  AAL
4  2013-02-14  14.94  14.96  13.16  13.99  31879900  AAL


In [3]:
# Take the mean of each column and subtract it from column, 
# take the range of  each column and subtract it from column
def mean_normalize(dataframe):
    minus_mean = (dataframe - dataframe.mean()) 
    df_range = (dataframe.max() - dataframe.min())
    df_norm = dataframe / df_range
    return df_norm
    
normalized_DOJ30 = mean_normalize(stocks_DOJ30[['Open', 'High', 'Low', 'Close', 'Volume']])
normalized_500 = mean_normalize(stocks_500[['Open', 'High', 'Low', 'Close', 'Volume']])

normalized_DOJ30 = pd.concat([normalized_DOJ30, stocks_DOJ30['Name']], axis=1)
normalized_500 = pd.concat([normalized_500, stocks_500['Name']], axis=1)

all_stocks_normalized = pd.concat([normalized_DOJ30, normalized_500], axis=0, ignore_index=True)

all_stocks_normalized.head()

Unnamed: 0,Open,High,Low,Close,Volume,Name
0,0.15058,0.1505,0.150967,0.151086,0.008033,MMM
1,0.149906,0.149581,0.151299,0.151315,0.004935,MMM
2,0.1501,0.149781,0.150686,0.150797,0.004634,MMM
3,0.149283,0.14933,0.149757,0.151238,0.005201,MMM
4,0.150192,0.149146,0.15095,0.150424,0.005193,MMM


In [1]:
stock_names = []

# Create featuresets out of data
def data_handling(dataframe):
    dataset = np.array([])
    i, j = 0, 30 * 6
    for _ in range(dataframe['Name'].count()):
        try:
            current_name = dataframe.loc[j]['Name']
            stock_names.append(current_name)
            while dataframe.loc[j+1]['Name'] == current_name:
                    temp = dataframe[i:j]
                    feature = (pd.concat([temp['Open'], temp['High'], temp['Low'], temp['Close'], temp['Volume']], axis=0))
                    feature = feature.values.transpose()
                    feature = np.array(feature)
                    label = [1, 0] if (dataframe.loc[j+1]['Open'] - dataframe.loc[j]['Open'] > 0) else [0, 1]
                    feature_label = np.array([[feature, label]])
                    dataset = np.concatenate([dataset, [feature, label]], axis=0)
                    i += 7
                    j += 7

        except Exception as e:
            print(str(e))

        i, j = j + 1, j + (30 * 6)
        
    return dataset


all_stocks_matrix = data_handling(all_stocks_normalized)
print(stock_names) 

NameError: name 'all_stocks_normalized' is not defined

In [8]:
# Organize Data into Features and Labels
X = []
Y = []

i = 0
while i < len(all_stocks_matrix):
    X.append(all_stocks_matrix[i])
    i += 1
    Y.append(all_stocks_matrix[i])
    i += 1

# Remove any NaN in data and replace it by priors day value
for feature in X:
    index = 0
    for i in feature:
        if (math.isnan(i)):
            feature[index] = feature[index-1]
        index += 1

In [11]:
# Shuffle the order of data 

Data = list(zip(X, Y))
random.shuffle(Data)
X, Y = zip(*Data)


print(len(X) == len(Y))                
len(X)

True


610740

In [12]:
# Separate into training and testing set
test_size = 0.2

train_x = X[0:-int(len(X)*test_size)]
train_y = Y[0:-int(len(X)*test_size)]
test_x = X[int(-len(X)*test_size):]
test_y = Y[int(-len(X)*test_size):]

In [13]:
n_nodes_hl1 = 1000
n_nodes_hl2 = 1000
n_nodes_hl3 = 1000
n_classes = 2
batch_size = 5000
epoch_num = 100

In [14]:
x = tf.placeholder('float')
y = tf.placeholder('float')

In [15]:
# DNN with three hidden layers
def model(data):
    hidden_layer_1 = {'weights':tf.Variable(tf.random_normal([len(train_x[1]), n_nodes_hl1])),
                      'biases':tf.Variable(tf.random_normal([n_nodes_hl1]))}
    hidden_layer_2 = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                     'biases':tf.Variable(tf.random_normal([n_nodes_hl2]))}
    hidden_layer_3 = {'weights':tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                     'biases':tf.Variable(tf.random_normal([n_nodes_hl3]))}
    output = {'weights':tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
             'biases':tf.Variable(tf.random_normal(([n_classes])))}
    
    l1 = tf.add(tf.matmul(data, hidden_layer_1['weights']), hidden_layer_1['biases'])
    l1 = tf.nn.relu(l1)

    l2 = tf.add(tf.matmul(l1, hidden_layer_2['weights']), hidden_layer_2['biases'])
    l2 = tf.nn.relu(l2)

    l3 = tf.add(tf.matmul(l2, hidden_layer_3['weights']), hidden_layer_3['biases'])
    l3 = tf.nn.relu(l3)
    
    output = tf.matmul(l3, output['weights']) + output['biases']
    
    return output

# Session of tensorflow to train DNN
def train_net(x):
    prediction = model(x)
    cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( logits=prediction, labels=y) )
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    n_epochs = epoch_num
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(n_epochs):
            epoch_loss = 0
            i = 0
            while i < len(train_x):
                start = i
                end = i + batch_size
                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])
                _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
                epoch_loss += c
                i += batch_size
                
                
            print('epoch:', epoch, 'completed out of:', n_epochs, 'with loss:', epoch_loss)
            
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        
        print('Accuracy:', accuracy.eval({x: test_x, y: test_y}))
        



In [16]:
train_net(x)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

epoch: 0 completed out of: 100 with loss: 105168.39735412598
epoch: 1 completed out of: 100 with loss: 38404.261947631836
epoch: 2 completed out of: 100 with loss: 57151.72896575928
epoch: 3 completed out of: 100 with loss: 30212.614051818848
epoch: 4 completed out of: 100 with loss: 53558.54050445557
epoch: 5 completed out of: 100 with loss: 40603.30177307129
epoch: 6 completed out of: 100 with loss: 40816.25161743164
epoch: 7 completed out of: 100 with loss: 44250.83182525635
epoch: 8 completed out of: 100 with loss: 32992.33726501465
epoch: 9 completed out of: 100 with loss: 56562.81783294678
epoch: 10 completed out of: 100 with loss: 45918.434257507324
epoch: 11 completed out of: 100 with loss: 45847.39234542847
epoch: 12 completed out of: 100 with loss: 30838.07448577881
epoch: 13 completed ou