In [None]:
import copy
import io
import os
import pickle
import random
import time
from collections import deque
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image
from statsmodels.tsa.arima_model import ARIMA
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
%run ARIMA-creator.ipynb

In [None]:
def concat_daily(df):
    """Merge all headlines that share an index label into one string per label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'title' column of strings. Rows carrying the same index
        label (the date) are combined.

    Returns
    -------
    pandas.DataFrame
        One row per distinct index label, sorted by label, with the index
        named 'date' and a single 'title' column. Every headline is prefixed
        with one space, so each combined string starts with a space.
    """
    # Grouping keeps a lone headline intact. A plain `df.loc[label]['title']`
    # lookup yields a bare str when the label occurs once, and iterating that
    # str would split the headline into single characters.
    titles = df.groupby(level=0, sort=True)['title'].agg(
        lambda day_titles: ''.join(' ' + title for title in day_titles)
    )
    conc = titles.to_frame('title')
    conc.index.name = 'date'
    return conc

def pad_data(seq, maxlen):
    """Turn a list of integer sequences into a fixed-width integer matrix.

    Sequences of at most `maxlen` items are copied to the start of their row
    and the remainder of the row stays zero (end padding). Longer sequences
    are reduced to `maxlen` items by drawing positions at random without
    replacement via `np.random.choice`; the drawn positions are sorted so the
    kept items stay in their original order.

    Returns an array of shape (len(seq), maxlen) with dtype int.
    """
    padded = np.zeros((len(seq), maxlen), dtype=int)
    for row, tokens in enumerate(seq):
        n_tokens = len(tokens)
        if n_tokens > maxlen:
            # Too long: keep a random, order-preserving subset.
            keep = np.random.choice(n_tokens, maxlen, replace=False)
            keep.sort()
            padded[row, :] = np.array(tokens)[keep]
            continue
        padded[row, :n_tokens] = tokens
    return padded

def eval_preds(preds, targets):
    """Return the mean squared error between `preds` and `targets`.

    The squared differences are averaged along axis 0, so 2-D inputs give
    one value per column.
    """
    residuals = preds - targets
    squared = np.square(residuals)
    return squared.mean(axis=0)

def load_fasttext_matrix(fname, words):
    """Build an embedding matrix for `words` from a fastText ``.vec`` text file.

    The first line of the file is parsed as two integers: the number of
    vectors `n` and their dimension `d`. Every following line is split on
    single spaces into a token followed by its vector components.

    Parameters
    ----------
    fname : str or path-like
        Path of the ``.vec`` file.
    words : dict
        Maps each word to an integer index. The vector of a word with index
        ``k`` is written to row ``k - 1`` of the matrix.

    Returns
    -------
    embedding_matrix : numpy.ndarray
        Array of shape (len(words), d); rows of words that do not occur in
        the file stay zero.
    not_found_words : list
        Indices of the words without a vector in the file, in the order of
        ``words.values()``.
    """
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        nbr_words = len(words)
        # Use the dimension declared in the file header for the matrix width.
        embedding_matrix = np.zeros((nbr_words, d))
        # A set gives O(1) bookkeeping of the indices that are still missing.
        remaining = set(words.values())
        for ctr, line in enumerate(fin):
            tokens = line.rstrip().split(' ')
            index = words.get(tokens[0])
            # Skip unknown tokens and repeated tokens (first occurrence wins).
            if index is None or index not in remaining:
                continue
            embedding_matrix[index - 1] = np.asarray(tokens[1:], dtype=float)
            remaining.discard(index)
            if not remaining:
                # Everything has been found; no need to read the rest of the file.
                print("Found all words after {} %. ".format(100 * round(ctr / n,3)))
                return embedding_matrix, []
    not_found_words = [index for index in words.values() if index in remaining]
    print("Found {} words. ".format(nbr_words - len(not_found_words)))
    return embedding_matrix, not_found_words

In [None]:
# Hyperparameters of the experiment; they are pickled next to the trained
# model further down so a run can be reproduced.
par = dict(
    embed_dim=300,     # Dimensions to use for the word embedding
    vocab_part=0.6,    # How large part of the total vocabulary to include
    lookback=3,        # How far back to collect data in the recurrent layer (days)
    delay=1,           # How far ahead to predict data (days)
    batch_size=10,     # Batch size used when fitting the networks
    p=1,               # Order of the AR part of the ARIMA model
    d=1,               # Integrated order
    q=1,               # Included moving-average terms
    train_part=0.8,    # Part of data to be used for training
    val_part=0.1,      # Part of data to be used for validation
    test_part=0.1,     # Part of data to be used for testing
    series='1 YEAR',   # Series to predict: '1 YEAR', '3 YEAR' or 'S&P'
)

In [None]:
# Read the pickled headline and stock tables from the working directory.
news_path = Path(os.getcwd()).joinpath("Datasets/data/financial_headlines_20061020-20131119.pkl")
stock_path = Path(os.getcwd()).joinpath("Datasets/data/stock_data.pkl")

stock_data = pd.DataFrame(pd.read_pickle(stock_path))

# Index the headlines by their 'date' column, then merge the headlines of
# each date into one string per date.
data = concat_daily(pd.DataFrame(pd.read_pickle(news_path)).set_index('date'))

In [None]:
# Tokenize the daily headline strings and record the resulting sizes in `par`.

# Length statistics of the daily strings. len() of a str counts characters,
# so these are character counts despite the variable names.
len_words = list(map(len, data['title'].values))
mean_words, std_words = np.mean(len_words), np.std(len_words)

# The statistics-based length is computed and then overridden by a fixed
# value; the original remark next to it was "THIS MAKES IT SLOW!?".
sent_len = int(mean_words + 2 * std_words)
sent_len = 500

# Store the derived settings alongside the hyperparameters
par['input_dim'] = sent_len
par['start_date'] = data.index[0]
par['end_date'] = data.index[-1]

# First pass without a word limit: a cheap way of counting the unique words
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(data['title'])
par['vocab_size'] = len(tokenizer.word_index)

# Second pass limited to the `vocab_part` fraction of that vocabulary
vocab_limit = int(par['vocab_size'] * par['vocab_part'])
tokenizer = Tokenizer(num_words=vocab_limit,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\' ')
tokenizer.fit_on_texts(data['title'])
sequences = tokenizer.texts_to_sequences(data['title'])

word_index = tokenizer.word_index
par['vocab_size'] = len(word_index)

In [None]:
# Load the pre-trained fastText vectors for the words in the tokenizer
# vocabulary; `not_found` receives the indices without a pre-trained vector.
FAST_path = Path(
    os.getcwd() + "/Embeddings/FASTtext/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec"
)
FAST_embeddings, not_found = load_fasttext_matrix(fname=FAST_path, words=word_index)

In [None]:
# Words without a pre-trained fastText vector get a small random vector
# (normal distribution, mean 0, standard deviation 0.05). Row `i - 1` is
# the row that belongs to word index `i`.
for i in not_found:
    FAST_embeddings[i - 1] = np.random.normal(0.0, 0.05, par['embed_dim'])

In [None]:
# Print the words that have no pre-trained vector.
# `rev_word_index` maps an index back to its word.
rev_word_index = {index: word for word, index in word_index.items()}
# A plain loop prints each word without building (and echoing) a list of None.
for i in not_found:
    print(rev_word_index.get(i))


In [None]:
# Pad sequences to the same length sent_len. End-padded.
# If a sequence is longer than sent_len, words are randomly sampled.
text_data = pad_data(sequences, sent_len)

# Restrict the financial data to the date range covered by the news data
stock_data = stock_data[par['start_date'] : par['end_date']]

# Add the dates that are present in the news data but not in the
# financial data, and interpolate the missing values
stock_data = stock_data.reindex(data.index.drop_duplicates())
stock_data = stock_data.interpolate()

# Normalize the financial data to [0, 1]. The per-column extremes are kept in
# `fin_stats` (indexed by column name) so values can be mapped back later.
# The pandas reductions skip NaN, so a value that interpolate() could not
# fill does not turn a whole column into NaN.
fin_stats = pd.DataFrame({'min': stock_data.min(), 'max': stock_data.max()},
                         columns=['min', 'max'])
stock_data = (stock_data - fin_stats['min']) / (fin_stats['max'] - fin_stats['min'])

# Concatenate all data into one dataframe: one padded token row per date in
# 'WORDS', followed by the normalized financial columns.
words_column = np.empty(len(text_data), dtype=object)
for row_idx, text in enumerate(text_data):
    # Filled element by element so every cell holds one 1-D token array.
    words_column[row_idx] = text
data = pd.DataFrame({'WORDS': words_column}, index=stock_data.index.rename('DATE'))
for col in stock_data:
    data[col] = stock_data[col]

In [None]:
# Reuse the ARIMA models of an earlier run when a cache file exists,
# otherwise fit them and write the cache file.
# NOTE(review): the cache file name only depends on par['lookback'] (although
# it is spelled "del"); confirm that models cached under other settings
# (p, d, q, delay) should really be reused.
path = Path(f"./Models/ARIMA/all_mods_del{par['lookback']}.pkl")
if path.exists():
    # Load the models if they already exist
    ARIMA_models = pd.read_pickle(path)
    print("Found and loaded previously constructed models.")
else:
    # Fit ARIMA models to all of the dates in the data
    print("Model not found, fitting models")
    # Create the cache directory up front so the result of the fit
    # is not lost to a missing folder when it is written.
    path.parent.mkdir(parents=True, exist_ok=True)
    ARIMA_models = fit_all_models(par, data)
    ARIMA_models.to_pickle(path)

In [None]:
# Predict with the fitted ARIMA models, passing par['delay'] as the second
# argument. `predict_arima` is not defined in this notebook
# (presumably it comes from the %run of ARIMA-creator.ipynb — confirm).
# Later cells read the result as a frame indexed by date with the columns
# '1 YEAR', '3 YEAR' and 'S&P'.
arima_preds = predict_arima(ARIMA_models, par['delay'])

In [None]:
# Build one window of the `lookback` most recent 'WORDS' rows for every row
# that has enough history. `sequence_dates` records the index label of the
# last row in each window.
sequential_data, sequence_dates = [], []
prev_data = deque(maxlen=par['lookback'])
for i, row in enumerate(data['WORDS']):
    prev_data.append(row)
    if len(prev_data) != par['lookback']:
        # Window not filled yet
        continue
    sequence_dates.append(data.index[i])
    sequential_data.append(np.array(prev_data))
sequential_data = np.asarray(sequential_data)

In [None]:
# Drop the rows whose date is not in arima_preds. These are just rows at the
# beginning where there isn't enough data to make a prediction; how many
# varies with lookback and delay.
missing_dates = data.index[~data.index.isin(arima_preds.index)]
data.drop(index=missing_dates, inplace=True)

# Add the predictions to the data. The values are assigned by position, so
# arima_preds must have the same length and row order as the remaining rows.
pred_columns = {
    '1 YEAR PRED': '1 YEAR',
    '3 YEAR PRED': '3 YEAR',
    'S&P PRED': 'S&P',
}
for pred_col, source_col in pred_columns.items():
    data[pred_col] = arima_preds[source_col].values

In [None]:
# Shift the target data (the actual rates) so that each row holds the rate
# 'delay' rows further ahead. The rows at the end that are left without a
# target are removed by dropna.
for target_col in ('1 YEAR', '3 YEAR', 'S&P'):
    data[target_col] = data[target_col].shift(-par['delay'])
data.dropna(inplace=True)

In [None]:
# Keep only the windows whose date still has a row in `data`, since rows
# were dropped from `data` after the windows were built.
# `sequence_dates` itself is not filtered here.
# NOTE(review): the loop variable `date` stays bound at notebook scope after
# this loop; check whether later cells depend on that before renaming it.
del_rows = []
for i,date in enumerate(sequence_dates): 
    if date not in data.index: del_rows.append(i)
# Positions of all windows except the ones collected in del_rows
keep_rows = np.setdiff1d(np.arange(len(sequential_data)),del_rows)
sequential_data = sequential_data[keep_rows]
        

In [None]:
# Divide the data into training, validation and test segments.
# The split is by position: the first rows are used for training, the
# following rows for validation and the last rows for testing.

# Extract the series selected in par['series'] as plain arrays
words = np.array(list(data['WORDS'].values))
arima_preds = np.array(list(data[par['series'] + ' PRED'].values))
targets = np.array(list(data[par['series']].values))

training_samples = round(par['train_part'] * len(data))
validation_samples = round(par['val_part'] * len(data))
test_samples = round(par['test_part'] * len(data))

# The training and validation segments stop 'delay' rows early.
train_slice = slice(None, training_samples - par['delay'])
val_slice = slice(training_samples,
                  training_samples + validation_samples - par['delay'])
# x[-0:] would select the whole array, so an empty test share is handled
# explicitly and gives empty test arrays.
test_slice = slice(-test_samples, None) if test_samples > 0 else slice(0, 0)

train_words = words[train_slice]
train_seq = sequential_data[train_slice]
train_arima = arima_preds[train_slice]
train_targets = targets[train_slice]

val_words = words[val_slice]
val_seq = sequential_data[val_slice]
val_arima = arima_preds[val_slice]
val_targets = targets[val_slice]

test_words = words[test_slice]
test_seq = sequential_data[test_slice]
test_arima = arima_preds[test_slice]
test_targets = targets[test_slice]

In [None]:
# Swap the last two axes of sequential_data; the stated target layout is
# (samples, sent_len, lookback).
# NOTE(review): train_seq, val_seq and test_seq are sliced from
# sequential_data in an earlier cell, so they keep their axis order; only the
# name `sequential_data` refers to the transposed array. Running this cell a
# second time swaps the axes back.
sequential_data = np.transpose(sequential_data, (0,2,1))
# Display the resulting shape
sequential_data.shape

In [None]:
# Shuffle the training data: one permutation (drawn with Python's `random`
# module) is applied to all four training arrays so they stay aligned.
train_indices = np.arange(len(train_targets))
random.shuffle(train_indices)
train_words, train_seq, train_arima, train_targets = (
    split[train_indices]
    for split in (train_words, train_seq, train_arima, train_targets)
)

# The validation and test index arrays are left in ascending order, so
# these two segments keep their original row order.
val_indices = np.arange(len(val_targets))
val_words, val_seq, val_arima, val_targets = (
    split[val_indices]
    for split in (val_words, val_seq, val_arima, val_targets)
)

test_indices = np.arange(len(test_targets))
test_words, test_seq, test_arima, test_targets = (
    split[test_indices]
    for split in (test_words, test_seq, test_arima, test_targets)
)

In [None]:
# Create a TensorBoard callback that logs to a time-stamped folder
# below ./logs/fit.
# NOTE(review): the fit calls in this notebook do not pass this callback.
run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = Path("./logs/fit/" + run_stamp)
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

In [None]:
# Show the shapes of the two kinds of training text input
for split_array in (train_words, train_seq):
    print(split_array.shape)

In [None]:
# Choose the embedding matrix that initializes the Embedding layers of the
# models defined below. `model_embeddings` is another name for the
# FAST_embeddings array, not a copy of it.
model_embeddings = FAST_embeddings

In [None]:
# Single-day model: the padded token ids of one day go through an embedding
# and a bidirectional LSTM, and the result is combined with the ARIMA prediction.

# Text input; the shape is written as a 1-tuple like the other Input shapes
text_inputs = layers.Input(shape=(par['input_dim'],), name='Text_Input')

# Layer for word embedding, initialized with the pre-trained matrix.
# NOTE(review): load_fasttext_matrix stores the vector of word index k in
# row k - 1, while an Embedding layer looks up row k for token id k — verify
# the offset between token ids and matrix rows.
embedded_layer = layers.Embedding(input_dim=par['vocab_size'],
                                  output_dim=par['embed_dim'],
                                  input_length=par['input_dim'],
                                  weights=[model_embeddings],
                                  name='Embedding_Layer')(text_inputs)

# LSTM layer over the embedding layer, reduced to one value per sample
lstm_out = layers.Bidirectional(layers.LSTM(10))(embedded_layer)
dropout = layers.Dropout(0.2)(lstm_out)
dense = layers.Dense(1)(dropout)

# Input from an ARIMA model independently fitted to the data
ARIMA_input = layers.Input(shape=(1,), name='ARIMA_input')

# Merge the ARIMA input with the output of the text branch
hidden = layers.concatenate([dense, ARIMA_input])

# Dense layer on the merged values
hidden = layers.Dense(2, activation='linear', name='Dense_1')(hidden)

# Main output of the model
main_output = layers.Dense(1, activation='linear', name='Main_Output')(hidden)

model_word = Model(inputs=[text_inputs, ARIMA_input], outputs=[main_output])

model_word.compile(optimizer='rmsprop',
                   loss='mse')

model_word.summary()


In [None]:
# Or load the latest model (stored in model_path)
#model_path = Path(os.getcwd()) / "Models/NeuralNetworks/model_del_1_t20Mar12-17h11m/m_word.h5"
#model_word = keras.models.load_model(model_path)

In [None]:
# Draw the layer graph of the single-day model with Keras' plot_model.
plot_model(model_word)

In [None]:
# Train the single-day model. `eps` is the number of epochs and `h` keeps
# the training history for the loss plot.
eps = 10
h = model_word.fit(
    x={'Text_Input': train_words, 'ARIMA_input': train_arima},
    y={'Main_Output': train_targets},
    validation_data=(
        {'Text_Input': val_words, 'ARIMA_input': val_arima},
        {'Main_Output': val_targets},
    ),
    epochs=eps,
    batch_size=par['batch_size'],
)

In [None]:
# Plot training and validation loss per epoch of the single-day model
# on a logarithmic y axis.
plt.plot(h.history['loss'], color="blue", label="Loss")
plt.plot(h.history['val_loss'], color="red", label="Val_Loss")
plt.legend()
plt.yscale('log')
plt.ylabel('Loss')
plt.xlabel('Epochs')
# Roughly ten ticks; the step is kept at 1 or more because range() rejects
# a zero step, which eps // 10 would give for fewer than ten epochs.
plt.xticks(range(0, eps, max(1, eps // 10)))
plt.show()

In [None]:
# Save the single-day model and the hyperparameters in a time-stamped folder.
# datetime.now() is used because the folder name contains hour and minute;
# %b is the portable spelling of the abbreviated month name.
date_str = datetime.now().strftime("%y%b%d-%Hh%Mm")
dir_path = Path(f"./Models/NeuralNetworks/model_del_{par['delay']}_t{date_str}")
# Also creates missing parent folders and accepts an existing folder
dir_path.mkdir(parents=True, exist_ok=True)
model_path = dir_path / "m_word.h5"
model_word.save(model_path)
config_path = dir_path / "config.pkl"
# The with-block closes the file even if pickling fails
with open(config_path, 'wb') as config_file:
    pickle.dump(par, config_file)

In [None]:
# Time-distributed model: every sample holds `lookback` days of padded token
# ids; each day is embedded and flattened, and an LSTM runs over the days.

# Text input: one row of token ids per day in the window
text_inputs = layers.Input(shape=(par['lookback'], par['input_dim']), name='Text_Input')

# Word embedding applied to each day separately
embedded_layer = layers.TimeDistributed(layers.Embedding(input_dim=par['vocab_size'],
                                                         output_dim=par['embed_dim'],
                                                         input_length=par['input_dim'],
                                                         weights=[model_embeddings],
                                                         name='Embedding_Layer'))(text_inputs)

# Flatten the embedded words of each day into one vector per day.
# Flatten is taken from `layers`, like every other layer in this notebook.
embedded_layer = layers.TimeDistributed(layers.Flatten())(embedded_layer)

# LSTM layer over the days of the window.
# NOTE(review): the input_shape passed here does not describe the flattened
# per-day vectors produced above — confirm whether it has any effect.
lstm_out = layers.Bidirectional(layers.LSTM(5, input_shape=(None,par['lookback'],par['embed_dim'])))(embedded_layer)
dropout = layers.Dropout(0.2)(lstm_out)
dense = layers.Dense(1)(dropout)

# Input from an ARIMA model independently fitted to the data
ARIMA_input = layers.Input(shape=(1,), name='ARIMA_input')

# Merge the ARIMA input with the output of the text branch
hidden = layers.concatenate([dense, ARIMA_input])

# Dense layer on the merged values
hidden = layers.Dense(2, activation='linear', name='Dense_1')(hidden)

# Main output of the model
main_output = layers.Dense(1, activation='linear', name='Main_Output')(hidden)

model_p = Model(inputs=[text_inputs, ARIMA_input], outputs=[main_output])

model_p.compile(optimizer='rmsprop',
                loss='mse')

model_p.summary()

plot_model(model_p)

In [None]:
# Train the time-distributed model on the windowed text input. `h_p` keeps
# the training history for the loss plot.
eps = 10
h_p = model_p.fit(
    x={'Text_Input': train_seq, 'ARIMA_input': train_arima},
    y={'Main_Output': train_targets},
    validation_data=(
        {'Text_Input': val_seq, 'ARIMA_input': val_arima},
        {'Main_Output': val_targets},
    ),
    epochs=eps,
    batch_size=par['batch_size'],
)


In [None]:
# Plot training and validation loss per epoch of the time-distributed model
# on a logarithmic y axis.
plt.plot(h_p.history['loss'], color="blue", label="Loss")
plt.plot(h_p.history['val_loss'], color="red", label="Val_Loss")
plt.legend()
plt.yscale('log')
plt.ylabel('Loss')
plt.xlabel('Epochs')
# Roughly ten ticks; the step is kept at 1 or more because range() rejects
# a zero step, which eps // 10 would give for fewer than ten epochs.
plt.xticks(range(0, eps, max(1, eps // 10)))
plt.show()

In [None]:
# Pass-through model for the ARIMA prediction: a single Dense unit with
# weight 1 and bias 0, frozen, so the output equals the input. Convenient
# for evaluating the ARIMA baseline the same way as the networks.
inputs = layers.Input(shape=(1,), name='ARIMA_Input')
output = layers.Dense(1, activation='linear', name='Output')(inputs)
model_dummy_arima = Model(inputs=[inputs], outputs=[output])

identity_layer = model_dummy_arima.layers[-1]
identity_layer.set_weights([np.array([[1]]), np.array([0.0])])
identity_layer.trainable = False

model_dummy_arima.compile(optimizer='rmsprop', loss='mse')

# model_dummy_arima.summary()

In [None]:
# Evaluate both networks on every data segment and compare them to the
# plain ARIMA predictions.

def _evaluate_split(header, word_label, seq, words, arima, targets):
    """Evaluate the three models on one data segment and print the results.

    `header` and `word_label` are printed as given. Each percentage is the
    difference between the network's MSE and the ARIMA MSE, divided by the
    network's MSE.

    Returns the tuple (mse_seq, mse_word, mse_arima).
    """
    mse_seq = model_p.evaluate({'Text_Input': seq, 'ARIMA_input': arima},
                               {'Main_Output': targets}, verbose=0)
    mse_word = model_word.evaluate({'Text_Input': words, 'ARIMA_input': arima},
                                   {'Main_Output': targets}, verbose=0)
    mse_arima = model_dummy_arima.evaluate(arima, targets, verbose=0)
    print(header)
    print(word_label, mse_word)
    print("NLP MSE SEQUENCE: ", mse_seq)
    print("ARIMA MSE: ", mse_arima)
    print("Percentage MSE word vs arima: {0:+} %".format(round(100 * (mse_word - mse_arima) / mse_word,3)))
    print("Percentage MSE seq vs arima: {0:+} %".format(round(100 * (mse_seq - mse_arima) / mse_seq,3)))
    return mse_seq, mse_word, mse_arima


train_mse_nlp_seq, train_mse_nlp_word, train_mse_arima = _evaluate_split(
    " ------ TRAIN RESULTS ------ ", "NLP MSE WORDS: ",
    train_seq, train_words, train_arima, train_targets)

val_mse_nlp_seq, val_mse_nlp_word, val_mse_arima = _evaluate_split(
    " ------ VAL RESULTS ------ ", "NLP MSE WORD: ",
    val_seq, val_words, val_arima, val_targets)

test_mse_nlp_seq, test_mse_nlp_word, test_mse_arima = _evaluate_split(
    " ------ TEST RESULTS ------ ", "NLP MSE WORD: ",
    test_seq, test_words, test_arima, test_targets)


In [None]:
# Display the shape of the first weight array of model_p's last layer
# (presumably the kernel of the 'Main_Output' Dense layer — confirm).
model_p.layers[-1].get_weights()[0].shape