In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
from IPython.display import HTML
import base64
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import normalize
import statsmodels.tsa.stattools as ts
import matplotlib.pyplot as plt
import pickle


In [4]:
# Load the cached training frames. The context managers close each file
# handle as soon as it has been read instead of leaving it open until the
# interpreter happens to garbage-collect it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so these two files must come from a trusted source.
with open('Market_train', 'rb') as market_file:
    market_train_df = pickle.load(market_file)
with open('News_train', 'rb') as news_file:
    news_train_df = pickle.load(news_file)

In [5]:
def preprocess_data(mkt_train_df, news_train_df):
    """Attach per-day, per-asset averaged news features to the market rows.

    Steps:
      1. Truncate the ``time`` column of both frames to a calendar date.
      2. Give every news row a single ``assetCode``: the first quoted code in
         its ``assetCodes`` string.
      3. Drop the news columns listed in ``irrelevantColumns`` and the market
         ``assetName`` column.
      4. Average the remaining news columns per (time, assetCode).
      5. Left-join the averages onto the market rows and replace every missing
         value with 0.

    Both inputs are left untouched; the work is done on derived frames, so the
    function can be called again on the same inputs.

    Parameters
    ----------
    mkt_train_df : pandas.DataFrame
        Market data with at least ``time`` (datetime-like), ``assetCode`` and
        ``assetName`` columns.
    news_train_df : pandas.DataFrame
        News data with ``time`` (datetime-like), ``assetCodes`` and every
        column named in ``irrelevantColumns``.

    Returns
    -------
    pandas.DataFrame
        One row per market row, with the averaged news columns appended.
    """
    irrelevantColumns = ['sourceTimestamp', 'firstCreated', 'sourceId',
                         'headline', 'provider', 'subjects', 'audiences',
                         'headlineTag', 'marketCommentary', 'assetCodes', 'assetName']

    # Drop from the frame that was passed in (the previous version reached for
    # a global variable here) and build a new frame rather than editing it.
    market = mkt_train_df.drop(['assetName'], axis=1).assign(
        time=mkt_train_df['time'].dt.date)

    # Each entry is split on ',' and then on "'"; the text between the first
    # pair of quotes is kept as the row's asset code.
    firstCodes = [codes.split(',')[0].split("'")[1]
                  for codes in news_train_df['assetCodes']]
    news = news_train_df.drop(irrelevantColumns, axis=1).assign(
        time=news_train_df['time'].dt.date,
        assetCode=firstCodes)

    modifiednews = news.groupby(['time', 'assetCode'], sort=False).mean().reset_index()

    # join news reports to market data, note many assets will have many days without news data
    merged = pd.merge(market, modifiednews, how='left', on=['time', 'assetCode'])
    return merged.fillna(0)

In [6]:
def normalizeY(ydf):
    """Shift the input up by one and halve it, so -1 maps to 0 and 1 maps to 1.

    Works on anything that supports ``+`` and ``/`` with numbers (scalars,
    numpy arrays, pandas objects); the input itself is not modified.
    """
    shifted = ydf + 1
    return shifted / 2

In [7]:
def separateDfByStock(df):
    """Split ``df`` into one DataFrame per distinct ``assetCode``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``assetCode`` column.

    Returns
    -------
    dict
        Maps each asset code to the rows of ``df`` carrying that code. Rows
        keep their original order, index labels and columns. Keys appear in
        order of first occurrence in ``df``.

    The number of distinct assets is printed as a progress indicator.
    """
    # A single groupby pass replaces the row-by-row loop. That loop threw away
    # the result of DataFrame.append, so every per-asset frame stayed empty.
    # sort=False keeps first-appearance key order; observed=True skips unused
    # categories when assetCode is categorical. .copy() gives each entry its
    # own data so later edits do not touch df.
    dfdict = {
        code: rows.copy()
        for code, rows in df.groupby('assetCode', sort=False, observed=True)
    }

    print(len(dfdict))
    return dfdict

In [8]:
def regress_prices(x_data, y_data):
    """Fit ``y = slope * x + intercept`` by ordinary least squares.

    Parameters
    ----------
    x_data, y_data : array-like
        Equal-length numeric sequences (lists, numpy arrays or pandas Series).
        They are paired by position; any pandas index is ignored.

    Returns
    -------
    tuple
        ``(slope, intercept)`` of the fitted line.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    # np.asarray accepts pandas Series, which have no .reshape method, and
    # ravel flattens column vectors to the 1-D shape polyfit expects.
    x = np.asarray(x_data, dtype=float).ravel()
    y = np.asarray(y_data, dtype=float).ravel()
    if x.size == 0 or x.size != y.size:
        raise ValueError(
            "x_data and y_data must be non-empty and of equal length "
            "(got %d and %d)" % (x.size, y.size))

    # A degree-1 polynomial fit is the same least-squares line with intercept.
    # numpy is already imported by this notebook; LinearRegression was not.
    # polyfit returns coefficients highest power first: [slope, intercept].
    r_c, r_i = np.polyfit(x, y, 1)
    return r_c, r_i

In [9]:
def residuals(x_vals, y_vals, coeffs):
    """Return ``y_vals`` minus the line ``coeffs[0] * x_vals + coeffs[1]``.

    ``coeffs`` is indexed, so any sequence whose first two entries are the
    slope and the intercept will do.
    """
    slope = coeffs[0]
    intercept = coeffs[1]
    fitted = slope * x_vals + intercept
    return y_vals - fitted

In [10]:
def isCointegrated(code1, code2, sp_dict, significance=.01):
    """Test whether the opening prices of two assets are cointegrated.

    Each asset's opening prices are regressed on the other's, and an augmented
    Dickey-Fuller test (maxlag 2) is run on the residuals of both regressions.
    The ADF null hypothesis is a unit root, so a small p-value means the
    residuals look stationary. The pair counts as cointegrated only when both
    p-values fall below ``significance``.

    Parameters
    ----------
    code1, code2 : hashable
        Keys into ``sp_dict``.
    sp_dict : dict
        Maps asset code to a DataFrame with an ``open`` column.
    significance : float, optional
        p-value threshold for rejecting the unit-root hypothesis (default .01).

    Returns
    -------
    bool
        True when both residual series are judged stationary. False otherwise,
        including when the two price series are empty or differ in length.
    """
    # Work on plain positional arrays. Rows of different assets carry different
    # index labels, so Series arithmetic would align on those labels and
    # produce NaN residuals.
    opens1 = np.asarray(sp_dict[code1]['open'], dtype=float)
    opens2 = np.asarray(sp_dict[code2]['open'], dtype=float)

    # The frames expose no shared date key here, so series of unequal length
    # cannot be matched observation by observation; treat them as not testable.
    if len(opens1) == 0 or len(opens1) != len(opens2):
        return False

    coeffs1 = regress_prices(opens1, opens2)
    coeffs2 = regress_prices(opens2, opens1)
    resids1 = residuals(opens1, opens2, coeffs1)
    resids2 = residuals(opens2, opens1, coeffs2)

    # adfuller's second positional argument is maxlag; element [1] of its
    # result is the p-value.
    adf1 = ts.adfuller(resids1, 2)
    adf2 = ts.adfuller(resids2, 2)

    # The earlier condition (p >= .01) was inverted: it accepted exactly the
    # pairs whose residuals failed the stationarity test.
    return bool(adf1[1] < significance and adf2[1] < significance)

In [11]:
def find2CointegratedPairs(sp_dict):
    """Return up to two cointegrated asset pairs found in ``sp_dict``.

    Every unordered pair of distinct keys is tested once with
    ``isCointegrated``, in key order, and the search stops as soon as two
    pairs have been collected.

    Parameters
    ----------
    sp_dict : dict
        Maps asset code to that asset's DataFrame.

    Returns
    -------
    list of tuple
        ``(code_a, code_b)`` pairs; fewer than two if the search runs out.
    """
    wanted = 2
    pairs = []
    keys = list(sp_dict.keys())
    for i in range(len(keys) - 1):
        # Start after i so an asset is never paired with itself and (a, b) is
        # not tested again as (b, a).
        for j in range(i + 1, len(keys)):
            if isCointegrated(keys[i], keys[j], sp_dict):
                pairs.append((keys[i], keys[j]))
                if len(pairs) == wanted:
                    return pairs
    return pairs

In [None]:
# Keep only the asset code and opening price of every market row, split the
# rows per asset, then search the per-asset frames for two cointegrated pairs.
stocks_and_prices = market_train_df[['assetCode', 'open']].copy()
sp_dict = separateDfByStock(stocks_and_prices)
pairs = find2CointegratedPairs(sp_dict)

In [None]:
def _pairSideBySide(X, code1, code2):
    """Place the rows of ``code1`` next to the rows of ``code2`` (``*_2`` columns)."""
    first = X[X['assetCode'] == code1].reset_index(drop=True)
    second = X[X['assetCode'] == code2].add_suffix('_2').reset_index(drop=True)
    if 'time' in X.columns:
        # Match observations taken at the same time; rows without a partner in
        # the other asset are dropped by the inner join.
        return first.merge(second, how='inner', left_on='time', right_on='time_2')
    # No time key available: fall back to pairing rows by position.
    return pd.concat([first, second], axis=1)


def joinPairs(X, pair1, pair2):
    """Build one wide frame holding both assets of each pair on the same row.

    For each pair, the rows of its first asset keep the column names of ``X``
    and the rows of its second asset are appended as extra columns with a
    ``_2`` suffix. When ``X`` has a ``time`` column the two assets are matched
    on it, otherwise by row position. The rows for ``pair1`` come first,
    followed by those for ``pair2``, under a fresh 0..n-1 index.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain an ``assetCode`` column.
    pair1, pair2 : tuple
        Two-element ``(first_code, second_code)`` tuples.

    Returns
    -------
    pandas.DataFrame
        Columns are those of ``X`` followed by the same names suffixed ``_2``.
    """
    p1c1, p1c2 = pair1
    p2c1, p2c2 = pair2
    joinedp1df = _pairSideBySide(X, p1c1, p1c2)
    joinedp2df = _pairSideBySide(X, p2c1, p2c2)
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    # ignore_index avoids duplicate labels between the two halves.
    joinedDf = pd.concat([joinedp1df, joinedp2df], ignore_index=True)
    return joinedDf
    

In [None]:
# Build the model inputs from the frames loaded at the top of the notebook
# (market_train_df / news_train_df; the names used here before were never
# defined anywhere in the notebook).
X = preprocess_data(market_train_df, news_train_df)

# Keep only rows whose target lies in [-1, 1]. .copy() makes the filtered
# frame independent so the column assignment below is unambiguous.
X = X[X['returnsOpenNextMktres10'].between(-1, 1)].copy()

# Rescale the target from [-1, 1] to [0, 1], the range a sigmoid output can
# produce. sklearn's normalize is not suitable here: it expects a 2-D array
# and scales vectors to unit norm.
X['returnsOpenNextMktres10'] = normalizeY(X['returnsOpenNextMktres10'])

assert len(pairs) >= 2, "need two cointegrated pairs to build the joined frame"
joinedDf = joinPairs(X, pairs[0], pairs[1])

# Pull both targets out before removing them from the feature frame.
y1 = joinedDf['returnsOpenNextMktres10']
y2 = joinedDf['returnsOpenNextMktres10_2']

# drop returns a new frame, so the result has to be kept, and axis=1 is needed
# to drop columns rather than row labels.
joinedDf = joinedDf.drop(['returnsOpenNextMktres10', 'returnsOpenNextMktres10_2',
                          'time', 'time_2', 'assetCode', 'assetCode_2'], axis=1)
y = pd.DataFrame({'y1': y1, 'y2': y2})

In [None]:
# Small fully connected regressor: three hidden ReLU layers of 15 units and a
# 2-unit sigmoid output, one unit per target column of y.
n_features = joinedDf.shape[1]
layers = [keras.layers.Flatten(input_shape=(n_features,))]
layers.extend(
    keras.layers.Dense(15, activation=tf.nn.relu, use_bias=True)
    for _ in range(3)
)
layers.append(keras.layers.Dense(2, activation=tf.nn.sigmoid))
model = keras.Sequential(layers)

# NOTE(review): newer Keras releases name this argument `learning_rate`; `lr`
# is kept because the TensorFlow version this notebook targets is not visible.
sgd = keras.optimizers.SGD(lr=.3)

# The targets are continuous values, so classification accuracy carries no
# information; report mean squared error next to the MAE loss instead.
model.compile(optimizer=sgd,
              loss='mean_absolute_error',
              metrics=['mean_squared_error'])

# Hand Keras plain float32 arrays so a stray non-numeric column fails loudly
# here rather than inside fit.
# NOTE(review): validation_split holds out the last 30% of rows; if joinedDf
# stacks one pair after the other, those rows come mostly from the second pair
# - confirm this is intended.
history = model.fit(np.asarray(joinedDf, dtype='float32'),
                    np.asarray(y, dtype='float32'),
                    batch_size=1000000, epochs=10, validation_split=.3)