#Initial Downloads, imports and set-up

In [0]:
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, Bidirectional, TimeDistributed, Dropout
from keras.models import Model
from keras.utils import to_categorical
from keras.optimizers import RMSprop, Adam, Adagrad, Nadam, Adadelta, Adamax
from keras.backend import eval
import matplotlib.pyplot as plt
import keras.regularizers
from keras.regularizers import l2

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle, resample

from scipy.stats import mode

import sys
import csv

csv.field_size_limit(sys.maxsize)

In [0]:
# Mount Google Drive at /content/drive (Colab only); the csv data, GloVe
# vectors and saved models used below are all read from paths under it.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
def load_data(textfile):
  """ Method to load in the pre-created csv files which will be used to create
      the data

  Parameters
  ----------
  textfile : str
      Name of the text csv file to load from the project csv data directory

  Returns
  -------
  tuple of pandas.DataFrame
      (df, sent_fin): the text data frame and the sentiment/finance data
      frame, each with its 'Unnamed: 0' column removed
  """
  # Raise the csv field size limit BEFORE reading: the python parser engine
  # goes through the csv module, so the limit must be in place first.
  csv.field_size_limit(sys.maxsize)

  # file paths (both files live in the same directory)
  data_dir = '/content/drive/My Drive/Colab Notebooks/MSc Project 2019 Aaron Dougherty csvData/'
  path = data_dir + textfile
  path2 = data_dir + 'sent_fin.csv'

  # load primary csv file containing text data
  df = pd.read_csv(path, encoding='ISO-8859-1', engine='python', error_bad_lines=False)
  # csv containing sentiment and finance data, including classification labels
  sent_fin = pd.read_csv(path2, encoding='ISO-8859-1', engine='python', error_bad_lines=False)
  # remove the index column that was written out when the csv files were saved
  df.drop('Unnamed: 0', axis=1, inplace=True)
  sent_fin.drop('Unnamed: 0', axis=1, inplace=True)
  return df, sent_fin

#Data Preparation

In [0]:
def prepare_data(text_df, sent_fin):
  """ Method to join the text data and the sentiment/finance data into a
      single data frame

  Parameters
  ----------
  text_df : pandas.DataFrame
      Primary data frame containing all text data
  sent_fin: pandas.DataFrame
      Secondary data frame containing all financial and sentiment data and labels

  Returns
  -------
  pandas.DataFrame
      Inner join of the renamed text frame with sent_fin on 'Date'
  """
  # capitalise the text columns so 'Date' matches the join key in sent_fin
  column_names = {"date": 'Date', 'title': 'Title', 'content': 'Content'}
  renamed = text_df.rename(columns=column_names)
  # keep only rows whose date is present in both frames
  return renamed.merge(sent_fin, how='inner', on='Date')

def removeNANs(df):
  """ Method to remove rows with no text

  A row is removed when its 'Title' or its 'Content' value is not a string
  (e.g. the NaN pandas uses for a missing csv field). The rows are dropped
  from `df` in place and the same object is returned.

  Parameters
  ----------
  df : pandas.DataFrame
      Data frame to remove rows from; must have 'Title' and 'Content' columns

  Returns
  -------
  pandas.DataFrame
      The same `df` object, with the text-less rows removed
  """
  # Collect the labels first and drop once: dropping while iterating raised
  # KeyError when both columns of a row were missing (second drop of a label
  # that was already gone).
  rows_to_drop = [
      label
      for label, title, content in zip(df.index, df['Title'], df['Content'])
      if not (isinstance(title, str) and isinstance(content, str))
  ]
  df.drop(rows_to_drop, inplace=True)
  return df

In [0]:
# Prepare the final data set: load both csv files, join them on 'Date' and
# drop every row whose Title or Content is not a string
df, sent_fin = load_data('title_contentSTOP.csv')
df = removeNANs(prepare_data(df, sent_fin))

In [0]:
# Check for class imbalance: print the number of rows with Direction == 1
# followed by the number with Direction == 0
# NOTE(review): the minority/majority names assume class 1 is the rarer one
minority_class = df[df.Direction == 1]
majority_class = df[df.Direction == 0]
print(len(minority_class), len(majority_class))

#Training/Development/Testing Split

In [0]:
def shuffle_df(df):
  """ Method to randomly reorder the rows of a data frame while keeping the
      index labels in their original order

  Parameters
  ----------
  df : pandas.DataFrame
      Data frame to shuffle

  Returns
  -------
  pandas.DataFrame
      Shuffled data frame carrying the index of the input frame
  """
  original_labels = df.index   # remember the index before shuffling
  shuffled = shuffle(df)       # ScikitLearn's shuffle method
  shuffled.index = original_labels  # put the original labels back in order
  return shuffled

In [0]:
# Shuffle rows in the final data frame prior to splitting the data into
# training, development and testing sets (the split below is made with
# shuffle=False, so this is the only randomisation of row order)
df = shuffle_df(df)

In [0]:
# Split data into X and y variables
# X: every column except the last one (presumably the last column is the
# 'Direction' label - verify against sent_fin.csv)
X = df.iloc[:, 0:-1]
# y: the label, with 'Date' kept alongside it for now
y = df[['Date', 'Direction']]

In [0]:
# Split data into training and testing sets: the last 10% of the (already
# shuffled) rows become the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    shuffle=False)
# Remove the unneeded 'Date' column from y_test. Rebinding instead of dropping
# in place avoids modifying a frame derived from `y` (SettingWithCopyWarning).
y_test = y_test.drop('Date', axis=1)

In [0]:
# check shapes of the training and test sets (X and y row counts must match)
print(X_train.shape, y_train.shape, '\n', X_test.shape, y_test.shape, '\n')

# Swap labels (do not run unless validating an existing model)

In [0]:
# Swap the test set labels (0 becomes 1, any other value is decremented, so
# 1 becomes 0) to validate a model's performance against inverted labels.
# Running this cell twice restores the original 0/1 labels.
y_test['Direction'] = y_test['Direction'].apply(lambda x: x+1 if x == 0 else x-1)

#Dealing with class imbalance:

In [0]:
def recombine(X, y):
  """ Method to recombine the X and y

  Parameters
  ----------
  X : pandas.DataFrame
      Data frame containing X variables
  y : pandas.DataFrame
      Data frame containing y variables; must have a 'Direction' column

  Returns
  -------
  pandas.DataFrame
      New data frame holding the columns of X plus y's 'Direction' column
      (aligned on the index); X itself is not modified
  """
  # Use the arguments rather than the notebook globals X_train / y_train, and
  # build a new frame so the caller's X does not silently gain a column.
  return X.assign(Direction=y['Direction'])

def address_class_imbalance(df, minority_classification, majority_classification,
                            minority_target_samples, majority_target_samples):
  """ Method to remove class imbalance within a data set for binary classification

  Parameters
  ----------
  df : pandas.DataFrame
      Data frame to rebalance; must have a 'Direction' column
  minority_classification : int
      the minority class
  majority_classification : int
      the majority class
  minority_target_samples: int
      number of samples to upsample to
  majority_target_samples: int
      number of samples to downsample to

  Returns
  -------
  pandas.DataFrame
      The upsampled minority rows followed by the downsampled majority rows,
      with a fresh 0..n-1 index
  """
  # Split data frame in 2, 1 for each class
  minority_class = df[df.Direction == minority_classification]
  majority_class = df[df.Direction == majority_classification]

  # down sample majority and up sample minority the desired amount using helper
  # methods
  downsampled_majority = downsample_majority(majority_class, majority_target_samples)
  upsampled_minority = upsample_minority(minority_class, minority_target_samples)

  # combine newly sampled data frames; pd.concat replaces DataFrame.append,
  # which newer pandas releases no longer provide
  return pd.concat([upsampled_minority, downsampled_majority], ignore_index=True)

# Upsample helper method
def upsample_minority(minority_class, target_samples):
  """ Method to up sample the minority class by drawing rows with replacement

  Parameters
  ----------
  minority_class : pandas.DataFrame
      rows belonging to the minority class
  target_samples: int
      number of samples to upsample to

  Returns
  -------
  The resampled rows, as returned by ScikitLearn's resample
  """
  upsampled = resample(minority_class, replace=True, n_samples=target_samples)
  return upsampled

# Downsample helper method
def downsample_majority(majority_class, target_samples):
  """ Method to down sample the majority class by drawing rows without
      replacement

  Parameters
  ----------
  majority_class : pandas.DataFrame
      rows belonging to the majority class
  target_samples: int
      number of samples to downsample to

  Returns
  -------
  The resampled rows, as returned by ScikitLearn's resample
  """
  downsampled = resample(majority_class, replace=False, n_samples=target_samples)
  return downsampled

def find_y_mid_point(df):
  """ Method to find the mid-point between two class counts

  Parameters
  ----------
  df : pandas.DataFrame
      data frame with a 'Direction' column holding exactly two classes

  Returns
  -------
  int
      the value half way between the two class counts, truncated to an int
  """
  # value_counts lists the most frequent class first, so the two numbers are
  # (larger count, smaller count) whichever label each belongs to
  larger_count, smaller_count = df.Direction.value_counts().tolist()
  half_gap = (larger_count - smaller_count) / 2
  return int(smaller_count + half_gap)

In [0]:
# Rebalance classes using the mid point (up and down sample classes equally)
train = recombine(X_train, y_train) # recombine training sets
target_sample = find_y_mid_point(train) # determine target sample value
# up sample class 1 and down sample class 0 to the same size to remove imbalance
train = address_class_imbalance(train, 1, 0, target_sample, target_sample)
train = shuffle_df(train) # shuffle the new training set
# Split training set back into X and y
X_train = train.iloc[:, 0:-1]
# Select only the label column (no 'Date'); taking a copy avoids the in-place
# drop on a slice of `train`, which raised SettingWithCopyWarning.
y_train = train[['Direction']].copy()

In [0]:
# Recheck class imbalance on the rebalanced training set: prints the number
# of rows with Direction == 1 followed by the number with Direction == 0
minority_class = train[train.Direction == 1]
majority_class = train[train.Direction == 0]
print(len(minority_class), len(majority_class))

#Model Preparation

In [0]:
# define documents:
def create_xtext(x_df, col):
  """ Method to format data for the neural network

  Parameters
  ----------
  x_df : pandas.DataFrame
      X values
  col:  str
      name of the column to extract

  Returns
  -------
  list
      the values of column `col`, one entry per row, in row order
  """
  # Read the column positionally. The previous per-row `.loc[index]` lookup
  # returned a whole Series for any repeated index label, and looped row by
  # row for no benefit.
  return x_df[col].tolist()

# create vocabulary/tokeniser
def create_vocab(docs):
  """ Method to build the vocabulary/dictionary from a set of documents

  Parameters
  ----------
  docs : list
      list of strings

  Returns
  -------
  Tokenizer
      keras tokeniser fitted on `docs`
  """
  vocab = Tokenizer()        # fresh keras tokeniser
  vocab.fit_on_texts(docs)   # build the word dictionary from the documents
  return vocab

# tokenise text
def tokenize(text_arr, vocab=None):
  """ Method to tokenise text

  Parameters
  ----------
  text_arr : list
      list of strings to be tokenised
  vocab : tokenizer, optional
      fitted tokenizer to use; when omitted the notebook-level `tokenizer`
      variable is used, so existing calls keep working

  Returns
  -------
  list
      result of the tokenizer's texts_to_sequences for `text_arr`
  """
  if vocab is None:
    vocab = tokenizer # fall back to the global created by create_vocab
  return vocab.texts_to_sequences(text_arr)

# find max number of words in an array of tokens
def find_max_length(array):
  """ Method to find the largest number of tokens held by any entry of a list

  Parameters
  ----------
  array : list
      2D list containing tokenised text

  Returns
  -------
  int
      length of the longest entry, or 0 when `array` is empty
  """
  return max((len(tokens) for tokens in array), default=0)

# pad sequences to be the same length
def pad_text(tokens, maxlen):
  """ Method to pad every token sequence to the same length

  Parameters
  ----------
  tokens : list
      list of tokenised text
  maxlen: int
      length every sequence is padded to

  Returns
  -------
  The result of keras' pad_sequences, with padding added after each sequence
  """
  padded = pad_sequences(tokens, maxlen=maxlen, padding='post')
  return padded

# load the whole embedding into memory/create embedding index
def load_embeddings(path):
  """ Method to create an embedding index so that each word in the embeddings
      file is matched with its embedding vector

  Each non-empty line of the file is expected to hold a word followed by the
  whitespace-separated components of its vector.

  Parameters
  ----------
  path : str
      file path for where the word embeddings are stored

  Returns
  -------
  dict
      maps each word to a float32 numpy array
  """
  embeddings_index = dict() # create empty dictionary
  # `with` closes the file even if parsing fails; the encoding is explicit so
  # the result does not depend on the platform default
  with open(path, encoding='utf-8') as embeddings:
    for line in embeddings: # create embeddings dictionary
      values = line.split() # split string on each line read in from file path
      if not values: # skip blank lines instead of failing on values[0]
        continue
      word = values[0] # define first string as word
      # assign rest as vector and change type
      embeddings_index[word] = np.asarray(values[1:], dtype='float32')
  # confirm loading of vectors
  print('Loaded %s word vectors.' % len(embeddings_index))
  return embeddings_index

# create a weight matrix for words
def create_embeddings(embeddings, vocab, vocab_size, vec_dim=300):
  """ Method to create an embedding matrix for the embedding layer of the
      neural network

  Parameters
  ----------
  embeddings : dictionary
      Word embeddings dictionary mapping a word to its vector
  vocab : tokenizer
      tokenizer object containing the vocabulary (its `word_index` is used)
  vocab_size : int
      size of vocabulary (number of rows of the matrix)
  vec_dim : int, optional
      dimension of each embedding vector (number of columns), 300 by default

  Returns
  -------
  numpy.ndarray
      (vocab_size, vec_dim) matrix; row i holds the vector of the word with
      index i, and stays all zeros when the word has no embedding
  """
  # initialise embeddings matrix with all 0s
  embedding_matrix = np.zeros((vocab_size, vec_dim))
  # build matrix from the vocabulary that was passed in (previously the global
  # `tokenizer` was read and the `vocab` argument ignored)
  for word, i in vocab.word_index.items():
    embedding_vector = embeddings.get(word) # find vector for each word in vocab
    if embedding_vector is not None: # check word had a corresponding vector
      embedding_matrix[i] = embedding_vector # add to matrix
  return embedding_matrix

In [0]:
# transform the 'Title' column of each set into a plain list of strings
train_titles = create_xtext(X_train, 'Title')

test_titles = create_xtext(X_test, 'Title')

# combine all titles (training and test) into one list of documents; this
# list is what the vocabulary is later built from
all_docs = train_titles + test_titles

# prepare y variables (turn the 'Direction' labels into lists)
train_labels = y_train['Direction'].tolist()
test_labels = y_test['Direction'].tolist()

In [0]:
# prepare sentiment vector inputs: columns 3..9 of X as a numpy array
# (`.values` replaces `.as_matrix()`, which newer pandas no longer provides)
train_sent_vecs = X_train.iloc[:, 3:10].values
test_sent_vecs = X_test.iloc[:, 3:10].values

In [0]:
# create tokeniser/vocabulary from all titles and calculate vocab size
tokenizer = create_vocab(all_docs)
# +1 presumably because word indices start at 1 and index 0 is left for
# padding - confirm against the keras Tokenizer documentation
vocab_size = len(tokenizer.word_index) + 1

In [0]:
# tokenise all title text as sequences of integers (uses the global
# `tokenizer` created above)
tokenized_titles_train = tokenize(train_titles)
tokenized_titles_test = tokenize(test_titles)

In [0]:
# calculate the longest title in tokens; measured on the training titles
# only, and later used as the padded length for both sets
MAX_T_LEN = find_max_length(tokenized_titles_train)

In [0]:
# pad the tokenised titles of both sets to MAX_T_LEN so that every input
# sequence has the same length
padded_titles_train = pad_text(tokenized_titles_train, MAX_T_LEN)
padded_titles_test = pad_text(tokenized_titles_test, MAX_T_LEN)

In [0]:
# load in the GloVe word embeddings (300-dimensional file) as a dictionary
# mapping each word to its vector
path = '/content/drive/My Drive/Colab Notebooks/MSc Project 2019 Aaron Dougherty GloVe/glove.6B.300d.txt'
word_embeddings = load_embeddings(path)

In [0]:
# create the word embedding matrix (one row per vocabulary index) that is
# used as the initial weights of the models' embedding layer
embedding_matrix = create_embeddings(word_embeddings, tokenizer, vocab_size)

In [0]:
# convert y variables into one-hot vector encodings (2 columns, one per
# class), the target format used with the 2-unit model output and the
# binary crossentropy loss
y_train__binary_matrix = to_categorical(y_train['Direction'], num_classes = 2)
y_test__binary_matrix = to_categorical(y_test['Direction'], num_classes = 2)

#Model

In [0]:
def create_model(vec_dim, num_classes,input_shape_title):
  """ Method to create a deep neural network model

  The embedding layer reads the notebook-level `vocab_size` and
  `embedding_matrix` variables, which must exist before this is called.

  Parameters
  ----------
  vec_dim : int
      Word embedding vector dimensions (also the number of units per LSTM)
  num_classes : int
      number of classes (dimensions of final layer output)
  input_shape_title : int
      length of each padded title sequence

  Returns
  -------
  Model
      uncompiled keras model mapping 'titles_input' to class probabilities
  """
  # Inputs:
  titles_input = Input(shape=(input_shape_title,), name='titles_input')
  # embedding layer, initialised from the pre-trained matrix and fine-tuned
  embed_titles = Embedding(vocab_size, vec_dim,
                           weights=[embedding_matrix],
                           input_length=input_shape_title,
                           trainable=True)(titles_input)
  activation = 'tanh'
  # three stacked LSTMs; only the last one collapses the sequence
  lstm_titles = LSTM(vec_dim, activation=activation,
                     input_shape=(input_shape_title, vec_dim),
                     return_sequences=True)(embed_titles)
  lstm_titles = LSTM(vec_dim, activation=activation, return_sequences=True)(lstm_titles)
  lstm_titles = LSTM(vec_dim, activation=activation)(lstm_titles)
  # Output/fully connected layer
  out = Dense(200, activation=activation)(lstm_titles)
  # softmax rather than tanh: the model is trained with a crossentropy loss on
  # one-hot targets, which needs outputs that are probabilities in [0, 1]
  out = Dense(num_classes, activation='softmax')(out)
  # Model initialisation
  model = Model(inputs=[titles_input], outputs=[out])
  return model

def create_bidirectional_model(vec_dim, num_classes,input_shape_title):
  """ Method to create a deep neural network model with bidirectional LSTMs

  The embedding layer reads the notebook-level `vocab_size` and
  `embedding_matrix` variables, which must exist before this is called.

  Parameters
  ----------
  vec_dim : int
      Word embedding vector dimensions (also the number of units per LSTM)
  num_classes : int
      number of classes (dimensions of final layer output)
  input_shape_title : int
      length of each padded title sequence

  Returns
  -------
  Model
      uncompiled keras model mapping 'titles_input' to class probabilities
  """
  # Inputs:
  titles_input = Input(shape=(input_shape_title,), name='titles_input')
  # embedding layer, initialised from the pre-trained matrix and fine-tuned
  embed_titles = Embedding(vocab_size, vec_dim,
                           weights=[embedding_matrix],
                           input_length=input_shape_title,
                           trainable=True)(titles_input)
  activation = 'tanh' # change to relu?
  # two bidirectional LSTMs (dropout on the first only) ...
  lstm_titles = Bidirectional(LSTM(vec_dim, activation=activation,
                                   input_shape=(input_shape_title, vec_dim),
                                   return_sequences=True, dropout=0.2,
                                   recurrent_dropout=0.1))(embed_titles)
  lstm_titles = Bidirectional(LSTM(vec_dim, activation=activation, return_sequences=True))(lstm_titles)
  # ... followed by a unidirectional LSTM that collapses the sequence
  lstm_titles = LSTM(vec_dim, activation=activation)(lstm_titles)
  # Output/Fully connected layer
  out = Dense(250, activation='tanh')(lstm_titles)
  out = Dense(200, activation='tanh')(out)
  # softmax rather than tanh: the model is trained with a crossentropy loss on
  # one-hot targets, which needs outputs that are probabilities in [0, 1]
  out = Dense(num_classes, activation='softmax')(out)
  # Model initialisation
  model = Model(inputs=[titles_input], outputs=[out])
  return model

def create_sentiment_model(vec_dim, num_classes,input_shape_title, input_shape_sentiment):
  """ Method to create a deep neural network model with dual input (sentiment vector)

  The embedding layer reads the notebook-level `vocab_size` and
  `embedding_matrix` variables, which must exist before this is called.

  Parameters
  ----------
  vec_dim : int
      Word embedding vector dimensions (also the number of units per LSTM)
  num_classes : int
      number of classes (dimensions of final layer output)
  input_shape_title : int
      length of each padded title sequence
  input_shape_sentiment : int
      number of features in each sentiment vector

  Returns
  -------
  Model
      uncompiled keras model taking ['titles_input', 'sentiment_features']
      and returning class probabilities
  """
  # Inputs:
  titles_input = Input(shape=(input_shape_title,), name='titles_input')
  sentiment_input = Input(shape=(input_shape_sentiment,),
                          name='sentiment_features')
  # embedding layer, initialised from the pre-trained matrix and fine-tuned
  embed_titles = Embedding(vocab_size, vec_dim,
                           weights=[embedding_matrix],
                           input_length=input_shape_title,
                           trainable=True)(titles_input)
  activation = 'relu'
  # text branch: three stacked bidirectional LSTMs, the last one collapsing
  # the sequence
  lstm_titles = Bidirectional(LSTM(vec_dim, activation=activation,
                                   input_shape=(input_shape_title, vec_dim),
                                   return_sequences=True))(embed_titles)
  lstm_titles = Bidirectional(LSTM(vec_dim, activation=activation, return_sequences=True))(lstm_titles)
  lstm_titles = Bidirectional(LSTM(vec_dim, activation=activation))(lstm_titles)
  # Sentiment branch: three dense layers over the sentiment vector
  sentiment = Dense(600, activation=activation)(sentiment_input)
  sentiment = Dense(450, activation=activation)(sentiment)
  sentiment = Dense(300, activation=activation)(sentiment)
  # Combine inputs
  concat = concatenate([lstm_titles, sentiment])
  # Output/Fully connected layer
  out = Dense(500, activation='tanh')(concat)
  out = Dense(200, activation='tanh')(out)
  # softmax rather than tanh: the model is trained with a crossentropy loss on
  # one-hot targets, which needs outputs that are probabilities in [0, 1]
  out = Dense(num_classes, activation='softmax')(out)
  # Model initialisation
  model = Model(inputs=[titles_input, sentiment_input], outputs=[out])
  return model

In [0]:
# model hyper-parameters
vec_dim = 300                  # embedding dimension (300-d GloVe file loaded above)
num_classes = 2                # binary Direction label, one-hot encoded
input_shape_title = MAX_T_LEN  # padded title length
input_shape_sentiment = 7      # only used by create_sentiment_model (not called here)

# instantiate model
model = create_bidirectional_model(vec_dim, num_classes, input_shape_title)
# compile the model: Adam with a small learning rate, gradient-norm clipping
# and learning-rate decay
opt = Adam(lr=0.0001, clipnorm=1., decay=1e-6, amsgrad=True)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [0]:
# summarize the model (summary() prints the table itself and returns None,
# so wrapping it in print() only added a stray "None" line)
model.summary()

In [0]:
# fit the model on the padded training titles; 15% of the training data is
# held out by keras as a validation set, and the per-epoch metrics are kept
# in `history` for plotting
history = model.fit([padded_titles_train], y_train__binary_matrix, batch_size = 512, epochs = 5, verbose=1, validation_split=0.15)

In [0]:
import matplotlib.pyplot as plt
print(history.history.keys())
# summarize history for accuracy
# The metric is logged as 'acc' by older keras releases and as 'accuracy' by
# newer ones, so use whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

##Save  and load model

In [0]:
!pip install h5py pyyaml

In [0]:
# Alternative (disabled): save only the weights
# model.save_weights('/content/drive/My Drive/Colab Notebooks/Models/')
# Save the entire model to a HDF5 file on the mounted drive
model.save('/content/drive/My Drive/Colab Notebooks/Models/model1.h5')

In [0]:
# Load a previously saved model, replacing the one trained above.
# NOTE(review): this loads model8_25epochs.h5, not the model1.h5 file written
# by the save cell - confirm this is the intended file.
model = keras.models.load_model('/content/drive/My Drive/Colab Notebooks/Models/model8_25epochs.h5')
# print the learning rate stored with the loaded model's optimizer
print(keras.backend.eval(model.optimizer.lr))

##Evaluate Predictions

In [0]:
# predict on the padded test titles; each row of y_pred holds one score per
# class
y_pred = model.predict([padded_titles_test], verbose=1)
# predicted class = index of the highest score in each row
y_pred_bool = np.argmax(y_pred, axis=1)

In [0]:
# Pair every per-title prediction and true label with the title's date.
# Fresh frames are built from the raw values so that y_test itself is not
# modified and everything lines up by position.
test_dates = X_test['Date'].values
y_pred_df = pd.DataFrame({'Direction': y_pred_bool, 'Date': test_dates})
y_test_df = pd.DataFrame({'Direction': y_test['Direction'].values, 'Date': test_dates})
# Majority vote per date: the most common Direction among that date's titles.
# Series.mode() returns the modes in ascending order, so ties resolve to the
# smallest label; it also avoids indexing scipy's mode result with [0][0],
# which fails when scipy returns a scalar.
y_pred_modes = y_pred_df.groupby('Date')['Direction'].apply(lambda x: x.mode().iloc[0]).reset_index()
y_test_modes = y_test_df.groupby('Date')['Direction'].apply(lambda x: x.mode().iloc[0]).reset_index()
# both frames are ordered by Date, so the two lists are aligned date by date
y_pred_majority = y_pred_modes['Direction'].tolist()
y_test_majority = y_test_modes['Direction'].tolist()

In [0]:
# evaluate the per-date majority-vote predictions against the per-date
# majority-vote true labels
print(classification_report(y_test_majority, y_pred_majority))
print(confusion_matrix(y_test_majority, y_pred_majority))