In [20]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.models import load_model

In [21]:
def loadDataFiles():
    """Load the pickled market and news training objects from the working directory.

    Reads the files ``Market_train`` and ``News_train`` (relative paths) and
    unpickles each one.

    Returns:
        tuple: ``(market_df, news_df)`` -- the two unpickled objects, in that order.

    Raises:
        OSError: if either file cannot be opened.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only use this on files that come from a trusted source.
    # The context managers make sure both file handles are closed even if
    # unpickling raises.
    with open('Market_train', "rb") as market_file:
        market_df = pickle.load(market_file)
    with open("News_train", "rb") as news_file:
        news_df = pickle.load(news_file)
    print('Finished loading datafiles!')
    return market_df, news_df


In [22]:
def preprocess_data(mkt_df, news_df):
    """Join per-day, per-asset averaged news features onto the market rows.

    Steps:
      1. Truncate the ``time`` column of both frames to a calendar date.
      2. Derive a single ``assetCode`` for every news row from its
         ``assetCodes`` string.
      3. Drop the news columns listed in ``irrelevantColumns`` and the market
         ``assetName`` column.
      4. Average the remaining news columns per ``(time, assetCode)``.
      5. Left-join the averages onto the market rows and replace every missing
         value with 0.

    The input frames are NOT modified; the work happens on new frames, so the
    function can be called repeatedly on the same inputs.

    Args:
        mkt_df: market DataFrame; must contain ``time``, ``assetCode`` and
            ``assetName`` columns.
        news_df: news DataFrame; must contain ``time``, ``assetCodes`` and
            every column named in ``irrelevantColumns``.

    Returns:
        pandas.DataFrame: one row per market row, with the averaged news
        columns appended and all NaN values filled with 0.

    Raises:
        IndexError: if an ``assetCodes`` entry contains no single-quoted code
            in its first comma-separated piece.
        KeyError: if a required column is missing.
    """
    irrelevantColumns = ['sourceTimestamp', 'firstCreated', 'sourceId',
                         'headline', 'provider', 'subjects', 'audiences',
                         'headlineTag', 'marketCommentary', 'assetCodes', 'assetName']

    # Keep only the first listed code per news row: the text between the first
    # pair of single quotes in the first comma-separated piece of the string.
    first_codes = [codes.split(',')[0].split("'")[1]
                   for codes in news_df['assetCodes']]

    # drop() without inplace returns new frames, so the caller's inputs stay
    # untouched and re-running the notebook cell keeps working.
    news = news_df.drop(irrelevantColumns, axis=1)
    news['time'] = pd.to_datetime(news['time']).dt.date
    news['assetCode'] = first_codes

    mkt = mkt_df.drop(['assetName'], axis=1)
    mkt['time'] = pd.to_datetime(mkt['time']).dt.date

    # sort=False keeps the groups in order of first appearance.
    modifiednews = news.groupby(['time', 'assetCode'], sort=False).mean().reset_index()

    # join news reports to market data, note many assets will have many days without news data
    merged = pd.merge(mkt, modifiednews, how='left', on=['time', 'assetCode'])
    merged = merged.fillna(0)
    print('Finished preprocessing data!')
    return merged


In [23]:
# Read the two pickled training objects from disk (loadDataFiles prints a status line when done).
market_data, news_data = loadDataFiles()


Finished loading datafiles!


In [24]:
# X is the merged frame returned by preprocess_data: market rows left-joined with averaged news columns.
X = preprocess_data(market_data, news_data)


Finished preprocessing data!


In [25]:
def normalizeY(ydf):
    """Shift and halve the input: returns ``(ydf + 1) / 2``.

    Under this affine map -1 becomes 0 and 1 becomes 1. Works on anything
    that supports ``+`` and ``/`` with numbers (scalars, arrays, Series);
    the argument itself is not modified.
    """
    shifted = ydf + 1
    return shifted / 2


In [26]:
# Keep only rows whose target lies in the closed interval [-1, 1]
# (between() is inclusive on both ends, matching the >= -1 / <= 1 filters).
X = X[X['returnsOpenNextMktres10'].between(-1, 1)]

# Target vector, mapped through normalizeY.
y = normalizeY(X['returnsOpenNextMktres10'])

# Non-inplace drop: X is a filtered selection at this point, and an inplace
# drop on it triggers pandas' SettingWithCopyWarning.
X = X.drop(['returnsOpenNextMktres10'], axis=1)

# The first two columns are kept aside as identifiers; everything else is the
# feature matrix. copy() gives X its own data so later cells that write into
# X column-by-column do not operate on a slice of another frame.
assetCodesAndTime = X.iloc[:, :2]
X = X.iloc[:, 2:].copy()


In [34]:
def getLinearRegressionModel(numfeatures):
    """Build and compile a single-layer linear model in Keras.

    The network is an input of width ``numfeatures`` feeding one Dense unit
    with a linear activation. It is compiled with an SGD optimizer (constructed
    with 0.01 as its first argument), mean-squared-error loss, and MSE as the
    reported metric.

    Args:
        numfeatures: number of input features (width of the input layer).

    Returns:
        The compiled ``keras.Model``.
    """
    feature_input = keras.layers.Input(shape=(numfeatures,))
    output_layer = keras.layers.Dense(1, activation='linear')
    prediction = output_layer(feature_input)

    linear_model = keras.Model(inputs=feature_input, outputs=prediction)
    linear_model.compile(
        optimizer=keras.optimizers.SGD(.01),
        loss='mse',
        metrics=['mse'],
    )
    return linear_model


In [28]:
def regularize(df):
    """Min-max scale every column of ``df`` in place.

    Each column is rewritten as ``(value - min) / (max - min)``. A column
    whose minimum equals its maximum (constant column) is set to 0.0 instead,
    because the plain formula would divide by zero and fill it with NaN.

    Args:
        df: DataFrame of numeric columns. It is modified in place.

    Returns:
        The same ``df`` object, after scaling.
    """
    for column in df:
        colmin = np.amin(df[column])
        colmax = np.amax(df[column])
        span = colmax - colmin
        if span == 0:
            # Constant column: there is no range to scale over.
            df[column] = 0.0
        else:
            df[column] = (df[column] - colmin) / span
    return df

In [29]:
# Min-max scale each column of X; regularize writes into X in place and returns the same frame.
X = regularize(X)

           volume     close      open  returnsClosePrevRaw1  \
0        0.002125  0.020354  0.020254              0.021121   
1        0.001672  0.007002  0.006943              0.021090   
2        0.000949  0.023725  0.023928              0.020744   
3        0.019357  0.053654  0.054375              0.020745   
4        0.000985  0.011375  0.011317              0.021246   
5        0.001351  0.033142  0.033023              0.020989   
6        0.000967  0.014930  0.015180              0.020548   
7        0.000246  0.016552  0.016297              0.021284   
8        0.004640  0.033459  0.033086              0.020940   
9        0.000328  0.033199  0.032833              0.021296   
10       0.007367  0.001964  0.001988              0.020464   
11       0.001121  0.017224  0.017143              0.021080   
12       0.001334  0.015494  0.015224              0.022879   
13       0.000208  0.074864  0.074282              0.021075   
14       0.001061  0.031114  0.031022              0.02

In [30]:
def saveModel(model, model_name):
    """Save ``model`` to ``<model_name>.h5`` by calling ``model.save``.

    Args:
        model: object exposing a ``save(path)`` method.
        model_name: file name without extension; ``'.h5'`` is appended.
    """
    target_path = model_name + '.h5'
    model.save(target_path)

In [31]:
def loadModel(filename):
    """Return the model that ``load_model`` reads from ``filename``.

    Args:
        filename: path handed to ``load_model`` unchanged.
    """
    return load_model(filename)

In [32]:
def modify_dataset(X, degree):
    """Append power features of the existing columns to ``X`` in place.

    For every column ``c`` present when the function is called and every
    power ``p`` from 2 up to ``degree``, a new column named ``c_p`` holding
    ``X[c] ** p`` is added. With ``degree`` of 0 or 1 (or below) ``X`` is
    returned unchanged.

    The set of source columns is captured once up front. Iterating over the
    live column index instead would also raise the freshly added power columns
    on later passes (producing e.g. ``c_2_3``), making the column count grow
    exponentially with ``degree``.

    Args:
        X: DataFrame with string column names. It is modified in place.
        degree: highest power to generate.

    Returns:
        The same ``X`` object, with the new columns appended (all ``_2``
        columns first, then all ``_3`` columns, and so on).
    """
    if degree == 0 or degree == 1:
        return X
    # Snapshot: only the original columns are ever used as bases.
    base_columns = list(X.columns.values)
    for power in range(2, degree + 1):
        for column in base_columns:
            newcol = column + '_' + str(power)
            X[newcol] = X[column] ** power
    return X

In [None]:
# Highest power handed to modify_dataset when it adds power-of-column features.
degree = 3
# NOTE: modify_dataset adds columns to X in place, so re-running this cell
# without rebuilding X first expands the already-expanded frame again.
X = modify_dataset(X, degree)
# The model gets one input per column of the expanded feature frame.
lrmodel = getLinearRegressionModel(len(X.columns.values))
# Fit for 5 epochs with a 30% validation split.
# NOTE(review): with batch_size=1000000 each epoch performs only about
# len(X) / 1e6 gradient updates -- confirm that is intended.
lrmodel.fit(X,y, batch_size=1000000, epochs=5, validation_split=.3)