In [5]:
import datetime as dt
import os
import re
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from dateutil import parser
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

In [6]:
# Load the dataset from a CSV expected in the working directory. Judging by the
# file name it joins tweets with 5-minute stock price moves (TODO confirm provenance).
df = pd.read_csv('tweets_stocks_combined_5mins.csv')

In [7]:
# Model input is the cleaned tweet text; the regression target is the
# '5mins_price_diff_perc' column.
X = df['cleaned_text']
y = df['5mins_price_diff_perc']

In [8]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable.
# NOTE(review): this is a shuffled split -- if the rows are time-ordered, confirm that
# mixing past and future observations is acceptable for this analysis.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [9]:
# Tokenise every training tweet into a list of words for Word2Vec.
# Use whitespace-agnostic `split()` -- the same tokenisation that
# `sentence_to_indices_padded` applies at lookup time. `split(' ')` would emit
# empty-string tokens for repeated spaces (and keep tabs/newlines glued to words),
# which pollutes the vocabulary and inflates the longest-sentence length with
# tokens that are never looked up.
corpus_list = [text.split() for text in X_train]

In [10]:
# Fit 100-dimensional word embeddings on the training tokens only.
# min_count=1 keeps every token in the vocabulary, including words seen once.
word2vec_model = Word2Vec(
    sentences=corpus_list,
    size=100,
    min_count=1,
)
print(word2vec_model)

Word2Vec(vocab=9195, size=100, alpha=0.025)


In [11]:
# Token count of each training sentence; the maximum is later used as the
# fixed length that every sequence is padded to.
num_words = list(map(len, corpus_list))
longest_sentence_len = max(num_words)

In [12]:
# Peek at the training texts (pandas abbreviates the Series display).
X_train

1702    kim jong un of north korea proclaims “unwaveri...
2164    prime minister trudeau is being so indignant, ...
1281    “trump gets no credit for what he’s done in th...
1866    ....china, which is for the first time doing p...
306     two dozen nfl players continue to kneel during...
                              ...                        
1638    best economic numbers in decades. if the democ...
1095    "is it legal for a sitting president to be ""w...
1130    the fake news media (failing @nytimes, @cnn, @...
1294    as i predicted all along, obamacare has been s...
860     karen handel for congress. she will fight for ...
Name: cleaned_text, Length: 1788, dtype: object

In [13]:
def sentence_to_indices_padded(sentences, longest_sentence_len, model=None):
    """Convert sentences into fixed-length rows of Word2Vec vocabulary indices.

    Each sentence is split on whitespace and every word that exists in the
    Word2Vec vocabulary is replaced by its row index in the embedding matrix.
    Words that are not in the vocabulary are dropped. The resulting index lists
    are padded at the end to ``longest_sentence_len``.

    Parameters
    ----------
    sentences : iterable of str
        Raw (already cleaned) sentences.
    longest_sentence_len : int
        Length of every output row, passed to ``pad_sequences`` as ``maxlen``.
    model : gensim Word2Vec, optional
        Model whose vocabulary supplies the indices. Defaults to the
        notebook-level ``word2vec_model`` so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per sentence and ``longest_sentence_len``
        columns.

    Notes
    -----
    ``pad_sequences`` is called with its default fill value and truncation
    side. NOTE(review): the default fill value is 0, which is also the index of
    a real vocabulary word, so padding is indistinguishable from that word
    downstream. Fixing this needs a coordinated change to the embedding matrix
    (reserve row 0), so it is only flagged here.
    """
    keyed_vectors = (word2vec_model if model is None else model).wv

    # gensim >= 4 exposes a plain word -> index dict as `key_to_index`;
    # gensim 3.x only has `vocab`, whose entries carry the index as `.index`.
    word_to_index = getattr(keyed_vectors, 'key_to_index', None)
    if word_to_index is None:
        word_to_index = {word: entry.index for word, entry in keyed_vectors.vocab.items()}

    result = [
        [word_to_index[word] for word in sentence.split() if word in word_to_index]
        for sentence in sentences
    ]
    return keras.preprocessing.sequence.pad_sequences(result, maxlen=longest_sentence_len, padding='post')

In [14]:
# Encode both splits with the same target length (derived from the training corpus).
X_train_padded = sentence_to_indices_padded(
    sentences=X_train,
    longest_sentence_len=longest_sentence_len,
)
X_test_padded = sentence_to_indices_padded(
    sentences=X_test,
    longest_sentence_len=longest_sentence_len,
)

In [15]:
# Embedding matrix learned by Word2Vec; reused below as the frozen weights of
# the Keras Embedding layer.
pretrained_weights = word2vec_model.wv.vectors

In [16]:
# Rows = vocabulary entries, columns = embedding dimensions.
vocab_size, embedding_size = pretrained_weights.shape

In [17]:
# Sanity check: should match the vocab count printed by Word2Vec above.
vocab_size

9195

In [18]:
# Sanity check: should match the `size` passed to Word2Vec.
embedding_size

100

In [1]:
def create_LSTM_model(pretrained_weights, longest_sentence_len):
    """Build a stacked-LSTM regressor on top of frozen pretrained embeddings.

    Architecture: integer token indices -> frozen Embedding (initialised from
    ``pretrained_weights``) -> LSTM(4, full sequence) -> Dropout(0.25)
    -> LSTM(4, last state) -> Dropout(0.25) -> Dense(4, sigmoid)
    -> Dropout(0.1) -> Dense(1, linear).

    Parameters
    ----------
    pretrained_weights : numpy.ndarray
        2-D embedding matrix; its shape supplies the vocabulary size and the
        embedding dimension.
    longest_sentence_len : int
        Number of token indices per input row.

    Returns
    -------
    keras.Sequential
        The un-compiled model.
    """
    vocab_size, embedding_size = pretrained_weights.shape

    model = keras.Sequential()
    # `shape` must be a tuple; a bare int is rejected by current Keras releases.
    model.add(layers.Input(shape=(longest_sentence_len,), dtype='int32'))
    # Seed the embedding through an initializer instead of the legacy `weights=`
    # kwarg: same starting values, and supported by both old and new Keras.
    model.add(layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_size,
        embeddings_initializer=keras.initializers.Constant(pretrained_weights),
        trainable=False,
    ))
    model.add(layers.LSTM(4, return_sequences=True, name='LSTM1'))
    model.add(layers.Dropout(0.25, name='Dropout1'))
    model.add(layers.LSTM(4, return_sequences=False, name='LSTM2'))
    model.add(layers.Dropout(0.25, name='Dropout2'))
    model.add(layers.Dense(4, name='Dense', activation='sigmoid'))
    model.add(layers.Dropout(0.1))
    # Single linear unit: the model predicts one real-valued target.
    model.add(layers.Dense(1, activation='linear'))

    return model

In [31]:
# Build the network from the Word2Vec weights, then print its layer-by-layer summary.
model = create_LSTM_model(
    pretrained_weights=pretrained_weights,
    longest_sentence_len=longest_sentence_len,
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 158, 100)          919500    
_________________________________________________________________
LSTM1 (LSTM)                 (None, 158, 4)            1680      
_________________________________________________________________
Dropout1 (Dropout)           (None, 158, 4)            0         
_________________________________________________________________
LSTM2 (LSTM)                 (None, 4)                 144       
_________________________________________________________________
Dropout2 (Dropout)           (None, 4)                 0         
_________________________________________________________________
Dense (Dense)                (None, 4)                 20        
_________________________________________________________________
dropout_1 (Dropout)          (None, 4)                

In [32]:
# Regression setup: optimise mean squared error with Adam and report mean
# absolute error alongside it.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mae'],
)

In [None]:
# Timestamp the checkpoint file so separate runs do not overwrite each other
# (`datetime` is imported in the notebook's import cell).
now = datetime.now()
dt_string = now.strftime("%d%m%Y %Hh%Mm")

checkpoint_filepath = f'./model_a_checkpoint/{dt_string}.h5'
# Create the target folder up front so the first checkpoint write cannot fail
# on a fresh checkout where the directory does not exist yet.
os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)

# Save only the weights, and only when validation loss reaches a new minimum.
# NOTE(review): newer Keras releases expect a `.weights.h5` suffix when
# save_weights_only=True -- confirm against the installed version before upgrading.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1,
    save_best_only=True)

# Keep the returned History so the loss curves can be inspected after training;
# the last 20% of the training rows are used for validation.
history = model.fit(X_train_padded, y_train, validation_split=0.2, epochs=50, callbacks=[model_checkpoint_callback])

Train on 1430 samples, validate on 358 samples
Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.00001, saving model to ./model_a_checkpoint/08112020 15h58m.h5
Epoch 2/50
Epoch 00002: val_loss did not improve from 0.00001
Epoch 3/50
Epoch 00003: val_loss did not improve from 0.00001
Epoch 4/50
Epoch 00004: val_loss did not improve from 0.00001
Epoch 5/50
Epoch 00005: val_loss did not improve from 0.00001
Epoch 6/50
Epoch 00006: val_loss improved from 0.00001 to 0.00000, saving model to ./model_a_checkpoint/08112020 15h58m.h5
Epoch 7/50
Epoch 00007: val_loss did not improve from 0.00000
Epoch 8/50
Epoch 00008: val_loss did not improve from 0.00000
Epoch 9/50
Epoch 00009: val_loss improved from 0.00000 to 0.00000, saving model to ./model_a_checkpoint/08112020 15h58m.h5
Epoch 10/50
Epoch 00010: val_loss did not improve from 0.00000
Epoch 11/50
Epoch 00011: val_loss did not improve from 0.00000
Epoch 12/50
Epoch 00012: val_loss did not improve from 0.00000
Epoch 13/50
Epoch 00013: va