# Import Libraries

In [None]:
# Import all libraries 
import datetime as dt
import os
import re
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydot
import tensorflow as tf
from dateutil import parser
from gensim.models import Word2Vec
from sklearn import linear_model, preprocessing
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import plot_model

In [None]:
# Load the preprocessed dataset and preview its first five rows.
master_df = pd.read_csv(filepath_or_buffer='processed_data.csv')
master_df.head(n=5)

# 1. Regression

## Linear Regression

In [None]:
# One-hot encode the `topic` column so it can be used as a regression input.
dummy_topics = pd.get_dummies(master_df['topic'])

# Feature matrix: three numeric columns taken from `master_df`, plus the topic dummies.
X_lr = pd.concat(
    [master_df.loc[:, ['compound', 'favorites', 'retweets']], dummy_topics],
    axis=1,
)
# Target: the `60mins_price_diff_perc` column multiplied by 100.
y_lr = master_df.loc[:, '60mins_price_diff_perc']*100

In [2]:
# Removing correlated features
# For every pair of columns whose absolute correlation exceeds 0.8, the
# later column of the pair is marked and then dropped from X_lr.

correlation_matrix = X_lr.corr()
correlated_features = {
    correlation_matrix.columns[i]
    for i in range(len(correlation_matrix.columns))
    for j in range(i)
    if abs(correlation_matrix.iloc[i, j]) > 0.8
}

X_lr.drop(labels=correlated_features, axis=1, inplace=True)

In [None]:
# Train test split
# `train_test_split` is imported directly from sklearn.model_selection.
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X_lr, y_lr, test_size=0.33, random_state=2020
)

# NOTE: no SMOTE resampling is applied here. SMOTE rebalances discrete class
# labels, whereas y_lr is a continuous percentage, so oversampling does not
# apply to this regression target.

In [None]:
# Normalize all values
# The scalers are fitted on the training split only and then reused on the
# test split, so both splits share one scale and no test-set statistics are
# used during preprocessing.
x_scaler_lr = preprocessing.StandardScaler().fit(X_train_lr)
y_scaler_lr = preprocessing.StandardScaler().fit(np.asarray(y_train_lr).reshape(-1, 1))

X_train_lr = x_scaler_lr.transform(X_train_lr)
X_test_lr = x_scaler_lr.transform(X_test_lr)
y_train_lr = y_scaler_lr.transform(np.asarray(y_train_lr).reshape(-1, 1)).ravel()
y_test_lr = y_scaler_lr.transform(np.asarray(y_test_lr).reshape(-1, 1)).ravel()

# Linear regression
regr = linear_model.LinearRegression()
regr.fit(X_train_lr, y_train_lr)

y_pred_lr = regr.predict(X_test_lr)
print('Linear metrics for stock price 60 mins after tweet')
print('Linear score:', regr.score(X_test_lr, y_test_lr))
print('Linear MSE', metrics.mean_squared_error(y_test_lr, y_pred_lr))
print(regr.coef_, '\n')

print('-'*64)

# Regularisation strengths: 1e-5, 1e-4, ..., 1e-1.
num_C = 5
C = [pow(10, i - 5) for i in range(num_C)]
print('Ridge and Lasso metrics for stock price 60 mins after tweet')
for alpha in C:
    # Lasso Regression
    lasso = linear_model.Lasso(alpha=alpha)
    lasso.fit(X_train_lr, y_train_lr)
    # Each model is scored on its own predictions.
    y_pred_lasso = lasso.predict(X_test_lr)
    print('Alpha = ', alpha)
    print('Lasso score:', lasso.score(X_test_lr, y_test_lr))
    print('Lasso MSE', metrics.mean_squared_error(y_test_lr, y_pred_lasso))
    print('Lasso coefs:', lasso.coef_, '\n')

    # Ridge regression
    ridge = linear_model.Ridge(alpha=alpha)
    ridge.fit(X_train_lr, y_train_lr)
    y_pred_ridge = ridge.predict(X_test_lr)
    print('Ridge score:', ridge.score(X_test_lr, y_test_lr))
    print('Ridge MSE', metrics.mean_squared_error(y_test_lr, y_pred_ridge))
    print('Ridge coefs:', ridge.coef_, '\n')
    print('-'*64)

## LSTM Regression

### Model A: Word Vectors as Input

Method 1: Word2Vec

In [None]:
# Model A (Word2Vec): the `lstm_text` column is the input, and the
# `60mins_price_diff_perc` column multiplied by 100 is the target.
X_lstm_reg_a_w2v = master_df['lstm_text']
y_lstm_reg_a_w2v = 100 * master_df['60mins_price_diff_perc']

In [None]:
# 67/33 train/test split with a fixed seed so the split is reproducible.
(
    X_train_lstm_reg_a_w2v,
    X_test_lstm_reg_a_w2v,
    y_train_lstm_reg_a_w2v,
    y_test_lstm_reg_a_w2v,
) = train_test_split(
    X_lstm_reg_a_w2v,
    y_lstm_reg_a_w2v,
    test_size=0.33,
    random_state=2020,
)

In [None]:
# Add words into corpus list
# Each training sentence is split on whitespace into a list of tokens.

lstm_reg_a_corpus_list = [sentence.split() for sentence in X_train_lstm_reg_a_w2v]

In [None]:
# Train Word2Vec
# Train 100-dimensional vectors on the training corpus, keeping every word
# (min_count=1), then take the learned embedding matrix.

lstm_reg_a_w2v_model = Word2Vec(sentences=lstm_reg_a_corpus_list, size=100, min_count=1)
lstm_reg_a_w2v_weights = lstm_reg_a_w2v_model.wv.vectors

In [None]:
# Find length of longest sentence (for padding later on)
# Token counts are taken from `lstm_reg_a_corpus_list`, the tokenised
# training corpus built earlier.

lstm_reg_a_w2v_num_words = [len(tokens) for tokens in lstm_reg_a_corpus_list]
lstm_reg_a_w2v_longest_sentence_len = max(lstm_reg_a_w2v_num_words)

In [None]:
# Pad sentences that are shorter than length of longest sentence in training data

def word2vec_sentence_to_indices_padded(sentences, longest_sentence_len, word2vec_model):
    """Convert sentences to padded sequences of Word2Vec vocabulary indices.

    Args:
        sentences: iterable of whitespace-separated strings.
        longest_sentence_len: `maxlen` handed to `pad_sequences`.
        word2vec_model: trained model whose `wv.vocab` maps words to entries
            carrying an `.index` attribute.

    Returns:
        The result of `pad_sequences` with padding='pre'. Words that are not
        in the model vocabulary are left out of their sequence.
    """
    index_sequences = [
        [
            word2vec_model.wv.vocab[token].index
            for token in text.split()
            if token in word2vec_model.wv.vocab
        ]
        for text in sentences
    ]
    return keras.preprocessing.sequence.pad_sequences(
        index_sequences,
        maxlen=longest_sentence_len,
        padding='pre',
    )

In [None]:
# Index and pad both splits with the same model and the same target length.
X_train_lstm_reg_a_w2v_padded = word2vec_sentence_to_indices_padded(
    sentences=X_train_lstm_reg_a_w2v,
    longest_sentence_len=lstm_reg_a_w2v_longest_sentence_len,
    word2vec_model=lstm_reg_a_w2v_model,
)
X_test_lstm_reg_a_w2v_padded = word2vec_sentence_to_indices_padded(
    sentences=X_test_lstm_reg_a_w2v,
    longest_sentence_len=lstm_reg_a_w2v_longest_sentence_len,
    word2vec_model=lstm_reg_a_w2v_model,
)

In [3]:
def create_lstm_reg_a_w2v(pretrained_weights, longest_sentence_len):
    """Build the Model A regressor on frozen pretrained word embeddings.

    Args:
        pretrained_weights: 2-D embedding matrix; its shape gives the
            vocabulary size and the embedding width.
        longest_sentence_len: passed as `shape` to the int32 input layer.

    Returns:
        An uncompiled `tf.keras.Sequential` model: embedding, two LSTM(4)
        layers with dropout, a tanh Dense(4), and a single linear output unit.
    """
    n_tokens, n_dims = pretrained_weights.shape
    layer_stack = (
        layers.Input(shape=longest_sentence_len, dtype='int32'),
        # Embedding weights are loaded from the matrix and kept frozen.
        layers.Embedding(
            input_dim=n_tokens,
            output_dim=n_dims,
            weights=[pretrained_weights],
            trainable=False,
        ),
        layers.LSTM(4, return_sequences=True, name='LSTM1'),
        layers.Dropout(0.2, name='Dropout1'),
        layers.LSTM(4, return_sequences=False, name='LSTM2'),
        layers.Dropout(0.2, name='Dropout2'),
        layers.Dense(4, name='Dense', activation='tanh'),
        layers.Dropout(0.1),
        layers.Dense(1, activation='linear'),
    )
    model = tf.keras.Sequential()
    for layer in layer_stack:
        model.add(layer)
    return model

In [None]:
# Build the model, then compile it with Adam and MSE loss, tracking MAE.
lstm_reg_a_w2v_ann = create_lstm_reg_a_w2v(
    pretrained_weights=lstm_reg_a_w2v_weights,
    longest_sentence_len=lstm_reg_a_w2v_longest_sentence_len,
)
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
lstm_reg_a_w2v_ann.compile(
    loss='mean_squared_error',
    optimizer=opt,
    metrics=['mae'],
)
lstm_reg_a_w2v_ann.summary()

In [None]:
# Render the architecture diagram top-to-bottom and write it to a PNG file.
plot_model(lstm_reg_a_w2v_ann, to_file="lstm_reg_a_w2v_ann.png", rankdir="TB", show_layer_names=True, show_shapes=False)

In [None]:
# Timestamp the checkpoint file name so separate runs write separate files.
# `dt` is the `datetime` module, so the class is reached as `dt.datetime`.
now = dt.datetime.now()
dt_string = now.strftime("%d%m%Y %H%Mh")

lstm_reg_a_w2v_checkpoint_filepath = f'./lstm_reg_a_w2v/lstm_reg_a_w2v_{dt_string}.h5'
# Create the checkpoint directory up front so saving cannot fail on a missing folder.
os.makedirs(os.path.dirname(lstm_reg_a_w2v_checkpoint_filepath), exist_ok=True)

# Keep only the weights of the epoch with the lowest validation loss.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=lstm_reg_a_w2v_checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    verbose = 1,
    save_best_only=True)

lstm_reg_a_w2v_history = lstm_reg_a_w2v_ann.fit(
    X_train_lstm_reg_a_w2v_padded,
    y_train_lstm_reg_a_w2v,
    validation_split=0.33,
    epochs=50,
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Rebuild the same architecture, load the best checkpointed weights into it,
# and compile it so it can be evaluated.
lstm_reg_a_w2v_ann_loaded = create_lstm_reg_a_w2v(
    pretrained_weights=lstm_reg_a_w2v_weights,
    longest_sentence_len=lstm_reg_a_w2v_longest_sentence_len,
)
lstm_reg_a_w2v_ann_loaded.load_weights(lstm_reg_a_w2v_checkpoint_filepath)
lstm_reg_a_w2v_ann_loaded.compile(
    loss='mean_squared_error',
    optimizer=opt,
    metrics=['mae'],
)

In [None]:
# `evaluate` returns the loss (MSE) followed by the compiled metric (MAE),
# so `dev_acc` holds a mean absolute error, not an accuracy.
dev_loss, dev_acc = lstm_reg_a_w2v_ann_loaded.evaluate(X_test_lstm_reg_a_w2v_padded, y_test_lstm_reg_a_w2v, verbose=1)

# Predict once per split and reuse the result for every metric.
lstm_reg_a_w2v_train_pred = lstm_reg_a_w2v_ann_loaded.predict(X_train_lstm_reg_a_w2v_padded)
lstm_reg_a_w2v_test_pred = lstm_reg_a_w2v_ann_loaded.predict(X_test_lstm_reg_a_w2v_padded)

# The printed values are square roots of the MSE, so they are labelled RMSE.
print(f"Training RMSE: {np.sqrt(metrics.mean_squared_error(y_train_lstm_reg_a_w2v, lstm_reg_a_w2v_train_pred))}")
print(f"Test RMSE: {np.sqrt(metrics.mean_squared_error(y_test_lstm_reg_a_w2v, lstm_reg_a_w2v_test_pred))}")
print(f"Test R^2: {metrics.r2_score(y_test_lstm_reg_a_w2v, lstm_reg_a_w2v_test_pred)}")
# Baseline: always predict a change of zero.
print(f"Baseline RMSE: {np.sqrt(metrics.mean_squared_error(y_test_lstm_reg_a_w2v, 0*y_test_lstm_reg_a_w2v))}")

In [None]:
# summarize history for loss
# `val_loss` is measured on the validation_split held out during fit, not on
# the test set, so the second curve is labelled 'validation'.
plt.plot(lstm_reg_a_w2v_history.history['loss'])
plt.plot(lstm_reg_a_w2v_history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

Method 2: GloVe

In [None]:
# Model A (GloVe): the `lstm_text` column is the input, and the
# `60mins_price_diff_perc` column multiplied by 100 is the target.
X_lstm_reg_a_glove = master_df['lstm_text']
y_lstm_reg_a_glove = 100 * master_df['60mins_price_diff_perc']

In [None]:
# 67/33 train/test split with a fixed seed so the split is reproducible.
(
    X_train_lstm_reg_a_glove,
    X_test_lstm_reg_a_glove,
    y_train_lstm_reg_a_glove,
    y_test_lstm_reg_a_glove,
) = train_test_split(
    X_lstm_reg_a_glove,
    y_lstm_reg_a_glove,
    test_size=0.33,
    random_state=2020,
)

In [None]:
# Read the GloVe vectors file into three structures:
#   embeddings_index  : word -> float32 vector
#   glove_vocab       : words in file order
#   glove_vocab_index : word -> line number in the file
embeddings_index = {}
glove_vocab = []
glove_vocab_index = {}
count = 0
# `with` closes the file handle even if a line fails to parse.
with open('glove/glove.twitter.27B.50d.txt', encoding='utf8') as f:
    for line in f:
        # Each line holds the word followed by its vector components.
        values = line.split()
        word = values[0]
        glove_vocab.append(word)
        glove_vocab_index[word] = count
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        count += 1

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Fit a tokenizer on the training text only. `num_words=None` keeps the
# whole vocabulary (`nb_words` is the legacy spelling of this argument).
tokenizer = keras.preprocessing.text.Tokenizer(num_words=None)
tokenizer.fit_on_texts(X_train_lstm_reg_a_glove)
sequences = tokenizer.texts_to_sequences(X_train_lstm_reg_a_glove)
# word -> integer id as assigned by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
def glove_sentence_to_indices_padded(sentences, longest_sentence_len):
    """Convert sentences to padded index sequences for the GloVe embedding layer.

    Reads the module-level `glove_vocab_index` (words that have a GloVe
    vector) and `word_index` (the tokenizer's word -> id mapping).

    A word is kept only when it has a GloVe vector and a tokenizer id. The
    emitted value is the tokenizer id, because the embedding matrix given to
    the embedding layer is indexed by `word_index`.

    Args:
        sentences: iterable of whitespace-separated strings. An entry
            without a `.split` method (e.g. a float NaN) yields an empty
            sequence.
        longest_sentence_len: `maxlen` handed to `pad_sequences`.

    Returns:
        The result of `pad_sequences` with padding='post', with exactly one
        row per input sentence.
    """
    result = []
    for sentence in sentences:
        try:
            sentence_splitted = sentence.split()
        except AttributeError:
            # Emit an empty row instead of skipping it, so the output stays
            # aligned row-for-row with the targets.
            sentence_splitted = []

        # Dictionary membership is O(1); the vocabulary list would be scanned
        # linearly for every word.
        indices = [
            word_index[word]
            for word in sentence_splitted
            if word in glove_vocab_index and word in word_index
        ]
        result.append(indices)
    return keras.preprocessing.sequence.pad_sequences(result, maxlen=longest_sentence_len, padding='post')

In [None]:
# Index and pad both splits to the longest training sentence length.
X_train_lstm_reg_a_glove_padded = glove_sentence_to_indices_padded(
    sentences=X_train_lstm_reg_a_glove,
    longest_sentence_len=lstm_reg_a_w2v_longest_sentence_len,
)
X_test_lstm_reg_a_glove_padded = glove_sentence_to_indices_padded(
    sentences=X_test_lstm_reg_a_glove,
    longest_sentence_len=lstm_reg_a_w2v_longest_sentence_len,
)

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer id is i. Rows for words without a GloVe vector stay all-zero,
# and those words are recorded in `skipped_words`.
skipped_words = []
embedding_matrix = np.zeros((len(word_index) + 1, 50))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        skipped_words.append(word)
        continue
    embedding_matrix[i] = embedding_vector
# Number of tokenizer words that had no GloVe vector.
count = len(skipped_words)

vocab_size_glove, embedding_size_glove = embedding_matrix.shape

# Frozen embedding layer initialised from the matrix above.
embedding_layer_glove = Embedding(
    len(word_index) + 1,
    50,
    trainable=False,
    input_length=lstm_reg_a_w2v_longest_sentence_len,
    weights=[embedding_matrix],
)

In [None]:
def create_lstm_reg_a_glove(longest_sentence_len):
    """Build the Model A regressor on the shared GloVe embedding layer.

    Uses the module-level `embedding_layer_glove` as the embedding stage.

    Args:
        longest_sentence_len: passed as `shape` to the int32 input layer.

    Returns:
        An uncompiled `tf.keras.Sequential` model: embedding, two LSTM(4)
        layers with dropout, a tanh Dense(4), and a single linear output unit.
    """
    layer_stack = (
        layers.Input(shape=longest_sentence_len, dtype='int32'),
        embedding_layer_glove,
        layers.LSTM(4, return_sequences=True, name='LSTM1'),
        layers.Dropout(0.2, name='Dropout1'),
        layers.LSTM(4, return_sequences=False, name='LSTM2'),
        layers.Dropout(0.2, name='Dropout2'),
        layers.Dense(4, name='Dense', activation='tanh'),
        layers.Dropout(0.1),
        layers.Dense(1, activation='linear'),
    )
    model = tf.keras.Sequential()
    for layer in layer_stack:
        model.add(layer)
    return model

In [None]:
# Build the model, then compile it with Adam and MSE loss, tracking MAE.
lstm_reg_a_glove_ann = create_lstm_reg_a_glove(
    longest_sentence_len=lstm_reg_a_w2v_longest_sentence_len,
)
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
lstm_reg_a_glove_ann.compile(
    loss='mean_squared_error',
    optimizer=opt,
    metrics=['mae'],
)
lstm_reg_a_glove_ann.summary()

In [None]:
# Render the architecture diagram top-to-bottom and write it to a PNG file.
plot_model(lstm_reg_a_glove_ann, to_file="lstm_reg_a_glove_ann.png", rankdir="TB", show_layer_names=True, show_shapes=False)

In [None]:
# Timestamp the checkpoint file name so separate runs write separate files.
# `dt` is the `datetime` module, so the class is reached as `dt.datetime`.
now = dt.datetime.now()
dt_string = now.strftime("%d%m%Y %H%Mh")

lstm_reg_a_glove_checkpoint_filepath = f'./lstm_reg_a_glove/lstm_reg_a_glove_{dt_string}.h5'
# Create the checkpoint directory up front so saving cannot fail on a missing folder.
os.makedirs(os.path.dirname(lstm_reg_a_glove_checkpoint_filepath), exist_ok=True)

# Keep only the weights of the epoch with the lowest validation loss.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=lstm_reg_a_glove_checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    verbose = 1,
    save_best_only=True)

lstm_reg_a_glove_history = lstm_reg_a_glove_ann.fit(
    X_train_lstm_reg_a_glove_padded,
    y_train_lstm_reg_a_glove,
    validation_split=0.33,
    epochs=50,
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Rebuild the same architecture, load the best checkpointed weights into it,
# and compile it so it can be evaluated.
lstm_reg_a_glove_ann_loaded = create_lstm_reg_a_glove(
    longest_sentence_len=lstm_reg_a_w2v_longest_sentence_len,
)
lstm_reg_a_glove_ann_loaded.load_weights(lstm_reg_a_glove_checkpoint_filepath)
lstm_reg_a_glove_ann_loaded.compile(
    loss='mean_squared_error',
    optimizer=opt,
    metrics=['mae'],
)

In [None]:
# `evaluate` returns the loss (MSE) followed by the compiled metric (MAE),
# so `dev_acc` holds a mean absolute error, not an accuracy.
dev_loss, dev_acc = lstm_reg_a_glove_ann_loaded.evaluate(X_test_lstm_reg_a_glove_padded, y_test_lstm_reg_a_glove, verbose=1)

# Predict once per split and reuse the result for every metric.
lstm_reg_a_glove_train_pred = lstm_reg_a_glove_ann_loaded.predict(X_train_lstm_reg_a_glove_padded)
lstm_reg_a_glove_test_pred = lstm_reg_a_glove_ann_loaded.predict(X_test_lstm_reg_a_glove_padded)

# The printed values are square roots of the MSE, so they are labelled RMSE.
print(f"Training RMSE: {np.sqrt(metrics.mean_squared_error(y_train_lstm_reg_a_glove, lstm_reg_a_glove_train_pred))}")
print(f"Test RMSE: {np.sqrt(metrics.mean_squared_error(y_test_lstm_reg_a_glove, lstm_reg_a_glove_test_pred))}")
print(f"Test R^2: {metrics.r2_score(y_test_lstm_reg_a_glove, lstm_reg_a_glove_test_pred)}")
# Baseline: always predict a change of zero.
print(f"Baseline RMSE: {np.sqrt(metrics.mean_squared_error(y_test_lstm_reg_a_glove, 0*y_test_lstm_reg_a_glove))}")

In [None]:
# summarize history for loss
# `val_loss` is measured on the validation_split held out during fit, not on
# the test set, so the second curve is labelled 'validation'.
plt.plot(lstm_reg_a_glove_history.history['loss'])
plt.plot(lstm_reg_a_glove_history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

### Model B: Word Vectors + 30min Price History as Inputs

Method 1: Word2Vec

In [None]:
# Model B inputs: the `lstm_text` column plus the `prev_30_min_prices` column.
X_lstm_reg_b_w2v = master_df.loc[:, ['lstm_text', 'prev_30_min_prices']]
# Target: the `60mins_price_diff_perc` column multiplied by 100.
y_lstm_reg_b_w2v = master_df.loc[:, '60mins_price_diff_perc']*100

In [None]:
# 67/33 train/test split with a fixed seed so the split is reproducible.
(
    X_train_lstm_reg_b_w2v,
    X_test_lstm_reg_b_w2v,
    y_train_lstm_reg_b_w2v,
    y_test_lstm_reg_b_w2v,
) = train_test_split(
    X_lstm_reg_b_w2v,
    y_lstm_reg_b_w2v,
    test_size=0.33,
    random_state=2020,
)

def parse_price_history(price_history):
    """Parse a string such as '[1.0, 2.5, 3]' into a list of floats.

    Leading '[' and trailing ']' characters are stripped, spaces are removed,
    and the remainder is split on commas; each piece is converted with
    `float`.
    """
    without_brackets = price_history.strip('[').strip(']')
    pieces = without_brackets.replace(' ', '').split(',')
    return list(map(float, pieces))

# Parse the price-history strings (second column) of each split, stack them
# into a 2-D array, and append a trailing axis of length 1.
X_train_price_history = np.expand_dims(
    np.stack(X_train_lstm_reg_b_w2v.iloc[:, 1].apply(parse_price_history)),
    axis=-1,
)

X_test_price_history = np.expand_dims(
    np.stack(X_test_lstm_reg_b_w2v.iloc[:, 1].apply(parse_price_history)),
    axis=-1,
)

# Word2Vec embeddings are the same as the one used in Model A
# Each split becomes a two-element list: [padded word indices, price history].
X_train_lstm_reg_b_w2v = [
    X_train_lstm_reg_a_w2v_padded,
    np.array(X_train_price_history),
]
X_test_lstm_reg_b_w2v = [
    X_test_lstm_reg_a_w2v_padded,
    X_test_price_history,
]

In [None]:
def create_lstm_reg_b_w2v(pretrained_weights, longest_sentence_len, price_history_shape):
    """Build the two-input Model B regressor (word indices + price history).

    Args:
        pretrained_weights: 2-D embedding matrix; its shape gives the
            vocabulary size and the embedding width.
        longest_sentence_len: `shape` of the int32 sentence-index input.
        price_history_shape: `shape` of the float32 price-history input.

    Returns:
        An uncompiled `keras.models.Model` with inputs
        [sentence_index_input, price_history_input] and one linear output.
    """
    n_tokens, n_dims = pretrained_weights.shape

    # Text branch: frozen embedding followed by two LSTM(4) layers with dropout.
    text_in = layers.Input(shape=longest_sentence_len, dtype='int32', name='sentence_index_input')
    text_branch = layers.Embedding(
        input_dim=n_tokens,
        output_dim=n_dims,
        weights=[pretrained_weights],
        trainable=False,
    )(text_in)
    text_branch = layers.LSTM(4, return_sequences=True, name='model1_LSTM1')(text_branch)
    text_branch = layers.Dropout(0.25, name='model1_dropout1')(text_branch)
    text_branch = layers.LSTM(4, return_sequences=False, name='model1_LSTM2')(text_branch)
    text_branch = layers.Dropout(0.25, name='model1_dropout2')(text_branch)

    # Price branch: two LSTM(4) layers with dropout on the raw price sequence.
    price_in = layers.Input(shape=price_history_shape, dtype='float32', name='price_history_input')
    price_branch = layers.LSTM(4, return_sequences=True, name='model2_LSTM1')(price_in)
    price_branch = layers.Dropout(0.25, name='model2_dropout1')(price_branch)
    price_branch = layers.LSTM(4, return_sequences=False, name='model2_LSTM2')(price_branch)
    price_branch = layers.Dropout(0.25, name='model2_dropout2')(price_branch)

    # Head: concatenate both branches and regress to a single value.
    head = layers.concatenate([text_branch, price_branch])
    head = layers.Dense(4, name='Dense', activation='tanh')(head)
    head = layers.Dropout(0.1)(head)
    head = layers.Dense(1, activation='linear')(head)

    return keras.models.Model(inputs=[text_in, price_in], outputs=head)

In [None]:
# Build the two-input model. The builder needs the price-history input shape
# as its third argument; (30, 1) is the same shape used when the model is
# rebuilt for loading the checkpoint.
lstm_reg_b_w2v_ann = create_lstm_reg_b_w2v(lstm_reg_a_w2v_weights, lstm_reg_a_w2v_longest_sentence_len, (30,1,))
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
lstm_reg_b_w2v_ann.compile(optimizer=opt, loss='mean_squared_error', metrics=['mae'])
lstm_reg_b_w2v_ann.summary()

In [None]:
# Render the architecture diagram top-to-bottom and write it to a PNG file.
plot_model(lstm_reg_b_w2v_ann, to_file="lstm_reg_b_w2v_ann.png", rankdir="TB", show_layer_names=True, show_shapes=False)

In [None]:
# Timestamp the checkpoint file name so separate runs write separate files.
# `dt` is the `datetime` module, so the class is reached as `dt.datetime`.
now = dt.datetime.now()
dt_string = now.strftime("%d%m%Y %H%Mh")

lstm_reg_b_w2v_checkpoint_filepath = f'./lstm_reg_b_w2v/lstm_reg_b_w2v_{dt_string}.h5'
# Create the checkpoint directory up front so saving cannot fail on a missing folder.
os.makedirs(os.path.dirname(lstm_reg_b_w2v_checkpoint_filepath), exist_ok=True)

# Keep only the weights of the epoch with the lowest validation loss.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=lstm_reg_b_w2v_checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    verbose = 1,
    save_best_only=True)

# The model has two inputs, so it is fitted on the
# [padded word indices, price history] list, not on the text input alone.
lstm_reg_b_w2v_history = lstm_reg_b_w2v_ann.fit(
    X_train_lstm_reg_b_w2v,
    y_train_lstm_reg_b_w2v,
    validation_split=0.33,
    epochs=50,
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Rebuild the same architecture, load the best checkpointed weights into it,
# and compile it so it can be evaluated.
lstm_reg_b_w2v_ann_loaded = create_lstm_reg_b_w2v(
    pretrained_weights=lstm_reg_a_w2v_weights,
    longest_sentence_len=lstm_reg_a_w2v_longest_sentence_len,
    price_history_shape=(30,1,),
)
lstm_reg_b_w2v_ann_loaded.load_weights(lstm_reg_b_w2v_checkpoint_filepath)
lstm_reg_b_w2v_ann_loaded.compile(
    loss='mean_squared_error',
    optimizer=opt,
    metrics=['mae'],
)

In [None]:
# The model takes two inputs, so every call receives the
# [padded word indices, price history] list for the relevant split.
# `evaluate` returns the loss (MSE) followed by the compiled metric (MAE),
# so `dev_acc` holds a mean absolute error, not an accuracy.
dev_loss, dev_acc = lstm_reg_b_w2v_ann_loaded.evaluate(X_test_lstm_reg_b_w2v, y_test_lstm_reg_b_w2v, verbose=1)

# Predict once per split and reuse the result for every metric.
lstm_reg_b_w2v_train_pred = lstm_reg_b_w2v_ann_loaded.predict(X_train_lstm_reg_b_w2v)
lstm_reg_b_w2v_test_pred = lstm_reg_b_w2v_ann_loaded.predict(X_test_lstm_reg_b_w2v)

# The printed values are square roots of the MSE, so they are labelled RMSE.
print(f"Training RMSE: {np.sqrt(metrics.mean_squared_error(y_train_lstm_reg_b_w2v, lstm_reg_b_w2v_train_pred))}")
print(f"Test RMSE: {np.sqrt(metrics.mean_squared_error(y_test_lstm_reg_b_w2v, lstm_reg_b_w2v_test_pred))}")
print(f"Test R^2: {metrics.r2_score(y_test_lstm_reg_b_w2v, lstm_reg_b_w2v_test_pred)}")
# Baseline: always predict a change of zero.
print(f"Baseline RMSE: {np.sqrt(metrics.mean_squared_error(y_test_lstm_reg_b_w2v, 0*y_test_lstm_reg_b_w2v))}")

In [None]:
# summarize history for loss
# `val_loss` is measured on the validation_split held out during fit, not on
# the test set, so the second curve is labelled 'validation'.
plt.plot(lstm_reg_b_w2v_history.history['loss'])
plt.plot(lstm_reg_b_w2v_history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

Method 2: GloVe

In [None]:
# Glove embeddings are the same as the one used in Model A
# Each split becomes a two-element list: [padded word indices, price history].
X_train_lstm_reg_b_glove = [
    X_train_lstm_reg_a_glove_padded,
    np.array(X_train_price_history),
]
X_test_lstm_reg_b_glove = [
    X_test_lstm_reg_a_glove_padded,
    X_test_price_history,
]

In [None]:
def create_lstm_reg_b_glove(longest_sentence_len, price_history_shape):
    """Build the two-input Model B regressor on the shared GloVe embedding layer.

    Uses the module-level `embedding_layer_glove` as the embedding stage.

    Args:
        longest_sentence_len: `shape` of the int32 sentence-index input.
        price_history_shape: `shape` of the float32 price-history input.

    Returns:
        An uncompiled `keras.models.Model` with inputs
        [sentence_index_input, price_history_input] and one linear output.
    """
    # Text branch: shared GloVe embedding followed by two LSTM(4) layers with dropout.
    text_in = layers.Input(shape=longest_sentence_len, dtype='int32', name='sentence_index_input')
    text_branch = embedding_layer_glove(text_in)
    text_branch = layers.LSTM(4, return_sequences=True, name='model1_LSTM1')(text_branch)
    text_branch = layers.Dropout(0.25, name='model1_dropout1')(text_branch)
    text_branch = layers.LSTM(4, return_sequences=False, name='model1_LSTM2')(text_branch)
    text_branch = layers.Dropout(0.25, name='model1_dropout2')(text_branch)

    # Price branch: two LSTM(4) layers with dropout on the raw price sequence.
    price_in = layers.Input(shape=price_history_shape, dtype='float32', name='price_history_input')
    price_branch = layers.LSTM(4, return_sequences=True, name='model2_LSTM1')(price_in)
    price_branch = layers.Dropout(0.25, name='model2_dropout1')(price_branch)
    price_branch = layers.LSTM(4, return_sequences=False, name='model2_LSTM2')(price_branch)
    price_branch = layers.Dropout(0.25, name='model2_dropout2')(price_branch)

    # Head: concatenate both branches and regress to a single value.
    head = layers.concatenate([text_branch, price_branch])
    head = layers.Dense(4, name='Dense', activation='tanh')(head)
    head = layers.Dropout(0.1)(head)
    head = layers.Dense(1, activation='linear')(head)

    return keras.models.Model(inputs=[text_in, price_in], outputs=head)

In [None]:
# The GloVe inputs were padded to `lstm_reg_a_w2v_longest_sentence_len`, so
# the model's sentence input must use that same length.
lstm_reg_b_glove_ann = create_lstm_reg_b_glove(lstm_reg_a_w2v_longest_sentence_len, (30,1,))
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
lstm_reg_b_glove_ann.compile(optimizer=opt, loss='mean_squared_error', metrics=['mae'])
lstm_reg_b_glove_ann.summary()

In [None]:
# Render the architecture diagram top-to-bottom and write it to a PNG file.
plot_model(lstm_reg_b_glove_ann, to_file="lstm_reg_b_glove_ann.png", rankdir="TB", show_layer_names=True, show_shapes=False)

In [None]:
# Timestamp the checkpoint file name so separate runs write separate files.
# `dt` is the `datetime` module, so the class is reached as `dt.datetime`.
now = dt.datetime.now()
dt_string = now.strftime("%d%m%Y %H%Mh")

lstm_reg_b_glove_checkpoint_filepath = f'./lstm_reg_b_glove/lstm_reg_b_glove_{dt_string}.h5'
# Create the checkpoint directory up front so saving cannot fail on a missing folder.
os.makedirs(os.path.dirname(lstm_reg_b_glove_checkpoint_filepath), exist_ok=True)

# Keep only the weights of the epoch with the lowest validation loss.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=lstm_reg_b_glove_checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    verbose = 1,
    save_best_only=True)

# The model has two inputs, so it is fitted on the
# [padded word indices, price history] list. No separate split is made for
# Model B / GloVe; its rows are those of the Model A GloVe split, so that
# split's targets are used.
lstm_reg_b_glove_history = lstm_reg_b_glove_ann.fit(
    X_train_lstm_reg_b_glove,
    y_train_lstm_reg_a_glove,
    validation_split=0.33,
    epochs=50,
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Rebuild the architecture with the sentence length the GloVe inputs were
# padded to, load the best checkpointed weights, and compile for evaluation.
lstm_reg_b_glove_ann_loaded = create_lstm_reg_b_glove(lstm_reg_a_w2v_longest_sentence_len, (30,1,))
lstm_reg_b_glove_ann_loaded.load_weights(lstm_reg_b_glove_checkpoint_filepath)
lstm_reg_b_glove_ann_loaded.compile(optimizer=opt, loss='mean_squared_error', metrics=['mae'])

In [None]:
# The model takes two inputs, so every call receives the
# [padded word indices, price history] list for the relevant split. The
# targets come from the Model A GloVe split, which covers the same rows.
# `evaluate` returns the loss (MSE) followed by the compiled metric (MAE),
# so `dev_acc` holds a mean absolute error, not an accuracy.
dev_loss, dev_acc = lstm_reg_b_glove_ann_loaded.evaluate(X_test_lstm_reg_b_glove, y_test_lstm_reg_a_glove, verbose=1)

# Predict once per split and reuse the result for every metric.
lstm_reg_b_glove_train_pred = lstm_reg_b_glove_ann_loaded.predict(X_train_lstm_reg_b_glove)
lstm_reg_b_glove_test_pred = lstm_reg_b_glove_ann_loaded.predict(X_test_lstm_reg_b_glove)

# The printed values are square roots of the MSE, so they are labelled RMSE.
print(f"Training RMSE: {np.sqrt(metrics.mean_squared_error(y_train_lstm_reg_a_glove, lstm_reg_b_glove_train_pred))}")
print(f"Test RMSE: {np.sqrt(metrics.mean_squared_error(y_test_lstm_reg_a_glove, lstm_reg_b_glove_test_pred))}")
print(f"Test R^2: {metrics.r2_score(y_test_lstm_reg_a_glove, lstm_reg_b_glove_test_pred)}")
# Baseline: always predict a change of zero.
print(f"Baseline RMSE: {np.sqrt(metrics.mean_squared_error(y_test_lstm_reg_a_glove, 0*y_test_lstm_reg_a_glove))}")

In [None]:
# summarize history for loss
# `val_loss` is measured on the validation_split held out during fit, not on
# the test set, so the second curve is labelled 'validation'.
plt.plot(lstm_reg_b_glove_history.history['loss'])
plt.plot(lstm_reg_b_glove_history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

# 2. Classification

## Naive Bayes

Method 1: Text-Based

Method 2: Feature-Based

## Logistic Regression

Method 1: Text-Based

Method 2: Feature-Based

## Random Forest

Method 1: Text-Based

Method 2: Feature-Based

## Support Vector Machine (SVM)

Method 1: Text-Based

Method 2: Feature-Based

## Artificial Neural Network (ANN)