In [166]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from datetime import timedelta
from dateutil import parser
import re
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import preprocessor as p

In [167]:
# Load the source table; the path is relative to the notebook's working directory.
# Presumably tweets joined with 5-minute stock price moves, judging by the file name — confirm.
df = pd.read_csv('tweets_stocks_combined_5mins.csv')

In [168]:
# Model input is the tweet text column; the regression target is the
# '5mins_price_diff_perc' column.
X = df['cleaned_text']
y = df['5mins_price_diff_perc']

In [171]:
# Tokenise every tweet on single spaces, giving one list of tokens per tweet.
corpus_list = [tweet.split(' ') for tweet in X]

In [172]:
# Train 100-dimensional Word2Vec embeddings on the tokenised tweets.
# min_count=1 keeps every token of the corpus in the vocabulary.
# seed + workers=1 make training repeatable across kernel restarts; gensim
# additionally requires a fixed PYTHONHASHSEED for bit-exact reproducibility.
word2vec_model = Word2Vec(corpus_list, min_count=1, size=100, seed=42, workers=1)
print(word2vec_model)

Word2Vec(vocab=10608, size=100, alpha=0.025)


In [183]:
def sentence_to_indices_padded(X, longest_sentence_len):
    """
    Converts sentences into fixed-length rows of Word2Vec vocabulary indices.

    Arguments:
    X -- iterable of sentence strings; each one is tokenised with str.split().
    longest_sentence_len -- length every row is padded (or truncated) to.

    Returns:
    Array returned by pad_sequences: one row per sentence, zero-padded at the end.

    NOTE(review): 0 is the padding value, but it also appears to be a regular
    index in the Word2Vec vocabulary, so padding cannot be told apart from that
    word -- confirm whether indices should be shifted by one.
    NOTE(review): the Word2Vec corpus is tokenised with split(' ') while this
    function uses split(); a token missing from the vocabulary makes the lookup fail.
    """
    # Imported here because no visible cell brings pad_sequences into scope,
    # which made this function raise NameError on a fresh kernel.
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    # Relies on the notebook-level `word2vec_model` being trained already.
    result = [
        [word2vec_model.wv.vocab[token].index for token in sentence.split()]
        for sentence in X
    ]
    return pad_sequences(result, maxlen=longest_sentence_len, padding='post')

In [None]:


# Token count of every tweet; the maximum is used as the padded sequence length.
num_words = list(map(len, corpus_list))
longest_sentence_len = max(num_words)

In [169]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [170]:
# Peek at the training texts; the index keeps the original (shuffled) row labels.
X_train

1702    Kim Jong Un of North Korea proclaims “unwaveri...
2164    Prime Minister Trudeau is being so indignant, ...
1281    “Trump gets no credit for what he’s done in th...
1866    ....China, which is for the first time doing p...
306     Two dozen NFL players continue to kneel during...
                              ...                        
1638    Best economic numbers in decades. If the Democ...
1095    "Is it legal for a sitting President to be ""w...
1130    The FAKE NEWS media (failing @nytimes, @CNN, @...
1294    As I predicted all along, Obamacare has been s...
860     KAREN HANDEL FOR CONGRESS. She will fight for ...
Name: cleaned_text, Length: 1788, dtype: object

In [173]:
# Embedding matrix learned by Word2Vec (2-D array, unpacked as vocab x dimension below).
# Presumably row k is the vector of the word whose vocabulary index is k — confirm.
pretrained_weights = word2vec_model.wv.vectors

In [174]:
# Rows give the vocabulary size, columns the embedding dimensionality.
vocab_size, embedding_size = pretrained_weights.shape

In [175]:
# Sanity check: should match the vocab count printed for the Word2Vec model.
vocab_size

10608

In [176]:
# Sanity check: should match the `size` passed to Word2Vec.
embedding_size

100

In [178]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [179]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a non-trainable Keras Embedding() layer loaded with pre-trained word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation (1-D numpy arrays).
    word_to_index -- dictionary mapping from words to their indices in the vocabulary.

    Returns:
    embedding_layer -- pretrained layer Keras instance

    Raises:
    ValueError -- if word_to_vec_map is empty, so no embedding dimension can be derived.
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map must contain at least one word vector")

    # One extra row on top of the vocabulary size (Keras embedding requirement).
    # Presumably the indices run 1..len(word_to_index) so row 0 stays all-zero -- not checked here.
    vocab_len = len(word_to_index) + 1

    # Read the dimensionality from an arbitrary entry rather than from a
    # hard-coded word, which failed for vocabularies that lack that word.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Row "index" of the matrix holds the vector of the "index"th vocabulary word;
    # rows that are never assigned remain zero.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, index in word_to_index.items():
        vector = word_to_vec_map[word]
        # Vectors whose length differs from emb_dim are skipped (row left at zero).
        if vector.shape[0] == emb_dim:
            emb_matrix[index, :] = vector

    # Frozen embedding layer with matching input/output sizes.
    embedding_layer = keras.layers.Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=False)

    # The layer has to be built before its weights can be set.
    embedding_layer.build((None,))

    # Load the pre-trained matrix into the layer.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [180]:
def create_LSTM_model(pretrained_weights, longest_sentence_len):
    """
    Builds the (uncompiled) stacked-LSTM regression model on top of pre-trained embeddings.

    Arguments:
    pretrained_weights -- 2-D array of shape (vocab_size, embedding_size) used to initialise the Embedding layer.
    longest_sentence_len -- number of token indices in each (padded) input sequence.

    Returns:
    model -- keras.Sequential: Embedding -> LSTM(4) -> Dropout -> LSTM(4) -> Dropout
             -> Dense(4, sigmoid) -> Dropout -> Dense(1, linear).
    """
    vocab_size, embedding_size = pretrained_weights.shape

    model = keras.Sequential()
    # Input expects a shape tuple; a bare int is only tolerated by some Keras versions.
    model.add(layers.Input(shape=(longest_sentence_len,), dtype='int32'))
    # Start from the pre-trained vectors via the initializer rather than the
    # legacy `weights=` kwarg; the layer stays trainable, as before.
    model.add(layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_size,
        embeddings_initializer=keras.initializers.Constant(pretrained_weights),
    ))
    # First LSTM emits the full sequence so the second LSTM can consume it.
    model.add(layers.LSTM(4, return_sequences=True, name='LSTM1'))
    model.add(layers.Dropout(0.25, name='Dropout1'))
    # Second LSTM keeps only its final state.
    model.add(layers.LSTM(4, return_sequences=False, name='LSTM2'))
    model.add(layers.Dropout(0.25, name='Dropout2'))
    model.add(layers.Dense(4, name='Dense', activation='sigmoid'))
    model.add(layers.Dropout(0.1))
    # Single linear unit: one real-valued prediction per input sequence.
    model.add(layers.Dense(1, activation='linear'))

    return model

In [181]:
# Instantiate the network for the Word2Vec matrix and the padded sequence length.
model = create_LSTM_model(pretrained_weights, longest_sentence_len)

# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 158, 100)          1060800   
_________________________________________________________________
LSTM1 (LSTM)                 (None, 158, 4)            1680      
_________________________________________________________________
Dropout1 (Dropout)           (None, 158, 4)            0         
_________________________________________________________________
LSTM2 (LSTM)                 (None, 4)                 144       
_________________________________________________________________
Dropout2 (Dropout)           (None, 4)                 0         
_________________________________________________________________
Dense (Dense)                (None, 4)                 20        
_________________________________________________________________
dropout_3 (Dropout)          (None, 4)                

In [184]:
# Show the padded index matrix for the training tweets (one row per tweet).
print(sentence_to_indices_padded(X_train, longest_sentence_len))

[[  339   469   470 ...     0     0     0]
 [  269   286  3898 ...     0     0     0]
 [  823   634    65 ...     0     0     0]
 ...
 [   22   309   442 ...     0     0     0]
 [  258    14  3291 ...     0     0     0]
 [10601 10602  1126 ...     0     0     0]]


In [160]:
# NOTE(review): `encoded_docs` and `pad_sequences` are not defined or imported in
# any visible cell, so this cell looks like a leftover that depends on stale
# kernel state — confirm whether it is still needed.
# NOTE(review): `max_length` is assigned but not used in this cell.
max_length = vocab_size
# Pad each encoded document with trailing zeros up to longest_sentence_len.
padded_docs = pad_sequences(encoded_docs, maxlen=longest_sentence_len, padding='post')
print(padded_docs[0])

[ 4320 10473  5364  9016  8178  5010  1475 10107  2947  1371 10527  7902
  3667  8546  9452 10024   441  4320  5783  5534  4592  8519  8258  6776
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


In [122]:
# Label-based lookup: returns the tweet whose index label is 0, not the first
# row of the shuffled training split (use .iloc[0] for positional access).
X_train[0]

'Thank you Rand! '