In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import normalize
import pickle
from sklearn.model_selection import train_test_split
import datetime
import os
import subprocess
import sys

In [2]:
market_train_df = pickle.load(open('Market_train',"rb"))
news_train_df = pickle.load(open('News_train', 'rb'))

In [3]:
market_train_df.head()

Unnamed: 0,time,assetCode,assetName,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe
0,2007-02-01 22:00:00+00:00,A.N,Agilent Technologies Inc,2606900.0,32.19,32.17,0.005938,0.005312,,,-0.00186,0.000622,,,0.034672,1.0
1,2007-02-01 22:00:00+00:00,AAI.N,AirTran Holdings Inc,2051600.0,11.12,11.08,0.004517,-0.007168,,,-0.078708,-0.088066,,,0.027803,0.0
2,2007-02-01 22:00:00+00:00,AAP.N,Advance Auto Parts Inc,1164800.0,37.51,37.99,-0.011594,0.025648,,,0.014332,0.045405,,,0.024433,1.0
3,2007-02-01 22:00:00+00:00,AAPL.O,Apple Inc,23747329.0,84.74,86.23,-0.011548,0.016324,,,-0.048613,-0.037182,,,-0.007425,1.0
4,2007-02-01 22:00:00+00:00,ABB.N,ABB Ltd,1208600.0,18.02,18.01,0.011791,0.025043,,,0.012929,0.020397,,,-0.017994,1.0


In [4]:
news_train_df.columns

Index(['time', 'sourceTimestamp', 'firstCreated', 'sourceId', 'headline',
       'urgency', 'takeSequence', 'provider', 'subjects', 'audiences',
       'bodySize', 'companyCount', 'headlineTag', 'marketCommentary',
       'sentenceCount', 'wordCount', 'assetCodes', 'assetName',
       'firstMentionSentence', 'relevance', 'sentimentClass',
       'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
       'sentimentWordCount', 'noveltyCount12H', 'noveltyCount24H',
       'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D', 'volumeCounts12H',
       'volumeCounts24H', 'volumeCounts3D', 'volumeCounts5D',
       'volumeCounts7D'],
      dtype='object')

In [5]:
def preprocess_data(mkt_df, news_df):
    mkt_df['time'] = pd.to_datetime(mkt_df['time'])
    news_df['time'] = pd.to_datetime(news_df['time'])
    mkt_df['time'] = mkt_df['time'].dt.date
    news_df['time'] = news_df['time'].dt.date
    assetCodes = []
    index = 0
    for x in news_df['assetCodes']:
        x = x.split(',')[0].split("'")[1]
        assetCodes.append(x)
    news_df['assetCode'] = np.asarray(assetCodes)
    irrelevantColumns = ['sourceTimestamp', 'firstCreated', 'sourceId', 
                         'headline', 'provider', 'subjects', 'audiences',
                        'headlineTag', 'marketCommentary', 'assetCodes', 'assetName']
    news_df.drop(irrelevantColumns, axis=1, inplace=True)
    mkt_df.drop(['assetName'], axis=1, inplace=True)
    modifiednews = news_df.groupby(['time','assetCode'], sort=False).aggregate(np.mean).reset_index()
    
    # join news reports to market data, note many assets will have many days without news data
    merged = pd.merge(mkt_df, modifiednews, how='left', on=['time', 'assetCode'], copy=False) 
    merged = merged.fillna(0)
    print('Finished preprocessing data!')
    return merged

In [6]:
def normalizeY(ydf):
    ydf = (ydf + 1) / 2
    return ydf
X = preprocess_data(market_train_df, news_train_df)

Finished preprocessing data!


In [7]:

X = X[X['returnsOpenNextMktres10'] >=-1]
X = X[X['returnsOpenNextMktres10'] <=1]

y = X['returnsOpenNextMktres10']

X.drop(['returnsOpenNextMktres10'], axis=1, inplace=True)
y = normalizeY(y)
assetCodesAndTime = X.iloc[:, :2]
X = X.iloc[:, 2:]

In [8]:
def regularize(df):
    for column in df:
        colmin = np.amin(df[column])
        colmax = np.amax(df[column])
        df[column] = (df[column] - colmin) / (colmax - colmin)
        
    return df

In [9]:
print(X.columns)
X = regularize(X)
X = X.values
y = y.values

Index(['volume', 'close', 'open', 'returnsClosePrevRaw1',
       'returnsOpenPrevRaw1', 'returnsClosePrevMktres1',
       'returnsOpenPrevMktres1', 'returnsClosePrevRaw10',
       'returnsOpenPrevRaw10', 'returnsClosePrevMktres10',
       'returnsOpenPrevMktres10', 'universe', 'urgency', 'takeSequence',
       'bodySize', 'companyCount', 'sentenceCount', 'wordCount',
       'firstMentionSentence', 'relevance', 'sentimentClass',
       'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
       'sentimentWordCount', 'noveltyCount12H', 'noveltyCount24H',
       'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D', 'volumeCounts12H',
       'volumeCounts24H', 'volumeCounts3D', 'volumeCounts5D',
       'volumeCounts7D'],
      dtype='object')


In [10]:

sess = tf.Session()

In [11]:
#Split data
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.1, random_state=42)

In [12]:
# Declare batch size
batch_size = 1000
n_iterations  = 1000
print(X_train.shape)
print(y_test.shape)

(3664982, 35)
(407221,)


In [13]:
# Initialize placeholders
x_data = tf.placeholder(shape=[None, 35], dtype=tf.float32, name='input')
y_target = tf.placeholder(shape=[None,], dtype=tf.float32,name = 'output')

# Create variables for linear regression
A = tf.Variable(tf.random_normal(shape=[35,1]), name='Weight')

b = tf.Variable(tf.random_normal(shape=[1,1]), name = 'bias')

In [14]:
# Declare model operations
model_output = tf.add(tf.matmul(x_data, A), b)

In [15]:
# Declare loss function
# = max(0, abs(target - predicted) + epsilon)
# 1/2 margin width parameter = epsilon
epsilon = tf.constant([1.0])
# Margin term in loss
loss = tf.reduce_mean(tf.maximum(0., tf.subtract(tf.abs(tf.subtract(model_output, y_target)), epsilon)))

In [16]:
# Declare optimizer
my_opt = tf.train.GradientDescentOptimizer(0.01)
train_step = my_opt.minimize(loss)

# Initialize variables
init = tf.global_variables_initializer()
sess.run(init)

In [17]:
# Training loop
train_loss = []
test_loss = []
batch = 50
m = X_train.shape[0]
for i in range(n_iterations):
    rand_index = np.random.choice(X_train.shape[0], size=batch_size)
    rand_x = X_train[rand_index]
    rand_y = y_train[rand_index]
    
    sess.run(train_step, feed_dict={x_data: rand_x, y_target: rand_y})
    
        
    if (i+1)%50==0:
        rand_index_test = np.random.choice(X_test.shape[0], size=90000)
        temp_train_loss = sess.run(loss, feed_dict={x_data: X_train[rand_index_test], y_target: y_train[rand_index_test]})
        train_loss.append(temp_train_loss)
    
        temp_test_loss = sess.run(loss, feed_dict={x_data: X_test[rand_index_test], y_target: y_test[rand_index_test]})
        test_loss.append(temp_test_loss)
        print('-----------')
        print('Iteration: ' + str(i+1))
        #print('A = ' + str(sess.run(A)) + ' b = ' + str(sess.run(b)))
        print('Train Loss = ' + str(temp_train_loss))
        print('Test Loss = ' + str(temp_test_loss))
    
    

-----------
Iteration: 50
Train Loss = 0.033311557
Test Loss = 0.033180736
-----------
Iteration: 100
Train Loss = 0.022180552
Test Loss = 0.021324212
-----------
Iteration: 150
Train Loss = 0.016299218
Test Loss = 0.016759023
-----------
Iteration: 200
Train Loss = 0.010505103
Test Loss = 0.010766816
-----------
Iteration: 250
Train Loss = 0.007928266
Test Loss = 0.008201632
-----------
Iteration: 300
Train Loss = 0.0062301476
Test Loss = 0.0064045936
-----------
Iteration: 350
Train Loss = 0.005015546
Test Loss = 0.0051466
-----------
Iteration: 400
Train Loss = 0.0043119434
Test Loss = 0.004485699
-----------
Iteration: 450
Train Loss = 0.0037568896
Test Loss = 0.003529386
-----------
Iteration: 500
Train Loss = 0.0034898845
Test Loss = 0.0034413063
-----------
Iteration: 550
Train Loss = 0.0030055956
Test Loss = 0.003174889
-----------
Iteration: 600
Train Loss = 0.002545059
Test Loss = 0.0028165493
-----------
Iteration: 650
Train Loss = 0.00245174
Test Loss = 0.0026575981
-------

In [18]:
sess.close()