In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import pickle


In [2]:
def loadDataFiles():
    market_df = pickle.load(open('Market_train',"rb"))
    news_df = pickle.load(open("News_train", "rb"))
    print('Finished loading datafiles!')
    return market_df, news_df


In [3]:
def preprocess_data(mkt_df, news_df):
    mkt_df['time'] = pd.to_datetime(mkt_df['time'])
    news_df['time'] = pd.to_datetime(news_df['time'])
    mkt_df['time'] = mkt_df['time'].dt.date
    news_df['time'] = news_df['time'].dt.date
    assetCodes = []
    index = 0
    for x in news_df['assetCodes']:
        x = x.split(',')[0].split("'")[1]
        assetCodes.append(x)
    news_df['assetCode'] = np.asarray(assetCodes)
    irrelevantColumns = ['sourceTimestamp', 'firstCreated', 'sourceId', 
                         'headline', 'provider', 'subjects', 'audiences',
                        'headlineTag', 'marketCommentary', 'assetCodes', 'assetName']
    news_df.drop(irrelevantColumns, axis=1, inplace=True)
    mkt_df.drop(['assetName'], axis=1, inplace=True)
    modifiednews = news_df.groupby(['time','assetCode'], sort=False).aggregate(np.mean).reset_index()
    
    # join news reports to market data, note many assets will have many days without news data
    merged = pd.merge(mkt_df, modifiednews, how='left', on=['time', 'assetCode'], copy=False) 
    merged = merged.fillna(0)
    print('Finished preprocessing data!')
    return merged


In [4]:
market_data, news_data = loadDataFiles()


Finished loading datafiles!


In [5]:
X = preprocess_data(market_data, news_data)


Finished preprocessing data!


In [6]:
def normalizeY(ydf):
    ydf = (ydf + 1) / 2
    return ydf


In [7]:
X = X[X['returnsOpenNextMktres10'] >= -1]
X = X[X['returnsOpenNextMktres10'] <= 1]

X = pd.DataFrame(Xs, columns=X.columns.values).append(Xz)
y = X['returnsOpenNextMktres10']

X.drop(['returnsOpenNextMktres10'], axis=1, inplace=True)
y = normalizeY(y)
assetCodesAndTime = X.iloc[:, :2]
X = X.iloc[:, 2:]


NameError: name 'Xs' is not defined

In [None]:
def getNNModel(numhiddenlayers=2, nodes=4): # returns NN given hidden layers and nodes
    layers = []
    layers.append(keras.layers.Flatten(input_shape=(35,)))

    for x in range(numhiddenlayers):
        layers.append(keras.layers.Dense(nodes, activation=tf.nn.relu, use_bias=True))

    layers.append(keras.layers.Dense(1, activation=tf.nn.sigmoid))
    model = keras.Sequential(layers)
    sgd = keras.optimizers.SGD(lr=.3)
    model.compile(optimizer=sgd,
              loss='mean_absolute_error',
              metrics=['accuracy'])
    return model


In [None]:
def getLinearRegressionModel():
    model = keras.Sequential()
    model.add(keras.layers.Dense(units=200, input_dim=35))
    model.add(keras.layers.Activation('relu'))
    model.add(keras.layers.Dense(units=45))
    model.add(keras.layers.Activation('relu'))
    model.add(keras.layers.Dense(units=1))
    model.compile(loss='mean_absolute_error',
              optimizer='sgd')
    return model


In [None]:
def regularize(df):
    for column in df:
        colmin = np.amin(df[column])
        colmax = np.amax(df[column])
        df[column] = (df[column] - colmin) / (colmax - colmin)
    return df

In [None]:
X = regularize(X)

In [None]:
def splitDataset(X, y, split):
    print(len(X.index))
    index = int(split*len(y.index))
    y_train, y_test = np.split(y, [index])
    X_train, X_test = X.iloc[:index, :], X.iloc[index:, :]
    return X_train, y_train, X_test, y_test

In [None]:
X_train, y_train, X_test, y_test = splitDataset(X, y, .7)
nnmodel = getNNModel(3, 15)
nnmodel.fit(X_train, y_train, epochs=1, verbose=1, batch_size=100000)
results = nnmodel.evaluate(X_test, y_test)
print(results)

In [None]:
def lossMatrix(layers, nodes):
    lossmatrix = []
    for layer in layers:
        lossforlayer = []
        for node in nodes:
            nnmodel2 = getNNModel(layer, node)
            nnmodel2.fit(X,y, epochs=1, verbose=1, batch_size=1000000)
            loss, acc = nnmodel2.evaluate(X, y)
            lossforlayer.append(loss)
        lossmatrix.append(lossforlayer)
    print(lossmatrix)

In [None]:
lrmodel = getLinearRegressionModel()
lrmodel.fit(X_train,y_train, batch_size=1000000, epochs=1)
lrpredictions = lrmodel.predict(X_test)
loss = lrmodel.evaluate(X_test,y_test)
print(loss)

In [None]:
xplot = list(range(len(y_test)))
plt.plot(xplot, lrpredictions)
plt.show()

In [None]:
plt.plot(xplot, y_test)
plt.show()

In [None]:
newnnmodel = getNNModel(3,15)
hist = keras.callbacks.History()
num_epochs = 10
newnnmodel.fit(X_train, y_train, epochs=num_epochs, batch_size=1000000, callbacks=[hist])

In [None]:
xr = list(range(num_epochs))
plt.plot(xr, hist.history['loss'])
plt.show()

In [None]:
newnnpredictions = newnnmodel.predict(X_test)
ss = list(range(len(X_test)))
plt.plot(ss, newnnpredictions)
plt.show()

In [None]:
plt.plot(ss, y_test)
plt.show()