In [1]:
from pathlib import Path
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, Input, Flatten, Layer, Lambda, LSTM
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from datetime import datetime
from IPython.display import Image
from collections import deque
from statsmodels.tsa.arima_model import ARIMA
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools
import copy
import random
import pickle
import time
import os
import io
from importnb import Notebook, reload
with Notebook(): 
    import Utilities

tf.compat.v1.disable_eager_execution()

In [2]:
def preprocess_text(df):
    """ Tokenize """
    conc = pd.DataFrame()
    for i in set(df.index):
        concat_str = ''
        for title in df.loc[i]['title']:
            concat_str += " " + title.lower()
        concat_str = word_tokenize(concat_str)
        conc = conc.append({'date':i, 'title':concat_str},ignore_index=True)
    conc.set_index('date',inplace=True)
    conc = conc.sort_values('date')
    sents = [''.join([word + ' ' for word in title]) for title in conc['title'].values]
    return sents

def eval_preds(preds,targets): 
    """ Calculate the MSE of the ARIMA-predictions 
    and the actual prices """
    return (np.square(preds - targets)).mean(axis=0)


In [15]:
config = {
    'embed_dim': 300,    # Dimensions to use for the word embedding
    'lookback': 3,       # How far back to collect data in the recurrent layer (days)
    'delay': 1,          # How far ahead to predict data (days)
    'batch_size': 10,    # Batch size used in generator
    'p': 1,              # Order of the AR-part of the model
    'd': 1,              # Integrated order
    'q': 1,              # Included moving average terms 
    'train_part' : 0.8,  # Part of data to be used for training
    'val_part' : 0.1,    # Part of data to be used for validation
    'test_part' : 0.1,   # Part of data to be used for testing
    'series': '1 YEAR',   # What series we currently want to predict, '1 YEAR', '3 YEAR' or 'S&P'
    'vocab_size': 15000,  # Include only the 'vocab_size' most common words 
    'start_date': '2006-10-20',
    'end_date': '2013-11-19',
}

In [5]:
# Load data 
news_path = Path(os.getcwd()) / "Datasets/data/financial_headlines_20061020-20131119.pkl"
stock_path = Path(os.getcwd()) / "Datasets/data/stock_data.pkl"
data = pd.DataFrame(pd.read_pickle(news_path))
data.set_index('date',inplace=True)
stock_data = pd.DataFrame(pd.read_pickle(stock_path))
text = preprocess_text(data)

In [6]:
# Tokenize the text data 
tokenizer = Tokenizer(num_words=config['vocab_size'])
tokenizer.fit_on_texts(text)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(text)

In [7]:
# Load dictionary of embeddings for this vocabulary (previously constructed)
subset_embeddings = Path(os.getcwd()) / "Embeddings/GloVe/saved.42B.300d.pkl"
with open(subset_embeddings,'rb') as handle: 
    emb_dict = pickle.load(handle)

In [19]:
# Create embedding matrix with shape (vocab_size, embed_dimension)
embedding_weights = np.zeros((config['vocab_size'], config['embed_dim']))
# Add pre-trained weights from GloVe
for word,index in word_index.items(): 
    if index > config['vocab_size']: 
        break
    temp_emb = emb_dict.get(word)
    if temp_emb is not None: 
        embedding_weights[index - 1] = temp_emb
    else: 
        embedding_weights[index - 1] = np.random.normal(size=config['embed_dim'])

In [20]:
config['max_len'] = max([len(sent) for sent in sequences])
print("Actual maximum length of one day news:",config['max_len'])
config['max_len'] = 600
print("Used maximum length of one day news:",config['max_len'])

# Pad the text data so we get a matrix of shape (n,d)
x = np.zeros((len(sequences), config['max_len']))
for i,sent in enumerate(sequences): 
    if len(sent) > config['max_len']: 
        x[i] = sent[:config['max_len']]
    else: 
        x[i,:len(sent)] = sent

Actual maximum length of one day news: 950
Used maximum length of one day news: 600


In [10]:
y = Utilities.load_financial_labels(config)

Found and loaded previously constructed models.


In [11]:
(x_train,y_train), (x_test, y_test) = Utilities.shuffle_and_partition(x,y, test_part=0.2)
print("Shape of x_train: ",x_train.shape)
print("Shape of y_train: ",y_train.shape)

Shape of x_train:  (2060, 600)
Shape of y_train:  (2060,)


## Model 
This model aims to take the chronology of news over a period into account. 

In [35]:
# Define some help functions for custom layer and hyper parameter optimization

# Create functions for custom 'MergeEmedding'-layer which averages 
# the embeddings over all words after the embedding layer 
def merge_embeddings(x):
    # Sum the embeddings for every word slot. If this is zero, there is no word in this slot
    non_zero = K.sum(K.cast(K.not_equal(K.sum(x,axis=2),0),tf.float32))
    return K.sum(x,axis=1) / non_zero

def merge_output_shape(input_shape):
    return input_shape.pop(2)
    #return input_shape[0], input_shape[1], input_shape[3]
# OR 
# return input_shape.pop(2)


def train_and_format(x,y,bs,epochs, results, layers, nodes,verbose=0):
    name = f"{bs}_{layers}_{nodes}"
    temp_history = model.fit(x, y, batch_size=bs, validation_split=0.2, epochs = epochs,verbose=verbose)
    return results.append(pd.Series([np.mean(temp_history.history['val_loss'][-5:]),
                                                temp_history, model, layers, 
                                                nodes, bs], 
                                                name=name,index=['Mean Val MSE','History','Model','Layers','Nodes','Batch Size']))


In [37]:
 
input_layer = Input(shape=(config['lookback'],config['max_len']), name='input')
embed_layer = Embedding(input_dim=config['vocab_size'],
                output_dim=config['embed_dim'],
                weights=[embedding_weights])(input_layer)
merged_layer = Lambda(merge_embeddings, output_shape=merge_output_shape)(embed_layer)
lstm_layer = LSTM(5,return_sequences=True)(merged_layer)
lstm_layer = LSTM(5,return_sequences=False)(lstm_layer)
output_layer = Dense(1, activation='linear', name='output')(lstm_layer)

model = Model(inputs=[input_layer], outputs=[output_layer])
model.compile(optimizer='rmsprop', loss='mse')

model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 3, 600)]          0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 3, 600, 300)       4500000   
_________________________________________________________________
lambda_10 (Lambda)           (None, 600, 300)          0         
_________________________________________________________________
lstm_19 (LSTM)               (None, 600, 5)            6120      
_________________________________________________________________
lstm_20 (LSTM)               (None, 5)                 220       
_________________________________________________________________
output (Dense)               (None, 1)                 6         
Total params: 4,506,346
Trainable params: 4,506,346
Non-trainable params: 0
_________________________________________________