In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from datetime import timedelta
from dateutil import parser
import re
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [10]:
# Load the combined tweets / stock-price table from a CSV in the working directory.
# NOTE(review): the filename suggests rows pair a tweet with a 5-minute price move — confirm schema.
df = pd.read_csv('tweets_stocks_combined_5mins.csv')

In [11]:
# Model input is the cleaned tweet text; the regression target is the
# '5mins_price_diff_perc' column.
X = df['cleaned_text']
y = df['5mins_price_diff_perc']

In [12]:
# Hold out 33% of the rows as a test set; random_state is pinned so the split is repeatable.
# NOTE(review): rows are split at random — if they are time-ordered, a chronological split
# may be more appropriate; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
# Tokenise every training tweet into a list of words for Word2Vec.
# The no-argument split() is used so that tokenisation matches the lookup in
# sentence_to_indices_padded(): split(' ') emits empty-string tokens for repeated
# spaces and does not break on tabs/newlines, which would create vocabulary
# entries that the later lookup can never match.
corpus_list = [tweet.split() for tweet in X_train]

In [19]:
# Train word embeddings on the tokenised training tweets only (corpus_list comes from X_train).
# min_count=1 is passed so that even tokens seen once are kept; size=100 requests 100-d vectors.
# NOTE(review): `size` is the gensim 3.x keyword — confirm the pinned gensim version.
word2vec_model = Word2Vec(corpus_list, min_count=1, size=100)
print(word2vec_model)

Word2Vec(vocab=9195, size=100, alpha=0.025)


In [23]:
# Token count of every training sentence; the largest one becomes the padded sequence length.
num_words = list(map(len, corpus_list))
longest_sentence_len = max(num_words)

In [20]:
# Display the training texts for a quick visual check.
X_train

1702    kim jong un of north korea proclaims “unwaveri...
2164    prime minister trudeau is being so indignant, ...
1281    “trump gets no credit for what he’s done in th...
1866    ....china, which is for the first time doing p...
306     two dozen nfl players continue to kneel during...
                              ...                        
1638    best economic numbers in decades. if the democ...
1095    "is it legal for a sitting president to be ""w...
1130    the fake news media (failing @nytimes, @cnn, @...
1294    as i predicted all along, obamacare has been s...
860     karen handel for congress. she will fight for ...
Name: cleaned_text, Length: 1788, dtype: object

In [50]:
def sentence_to_indices_padded(sentences, longest_sentence_len, model=None):
    """
    Converts sentences into fixed-length sequences of Word2Vec vocabulary indices.

    Arguments:
    sentences -- iterable of strings; each one is tokenised with str.split()
    longest_sentence_len -- length passed to pad_sequences as `maxlen`
    model -- optional trained Word2Vec model supplying the vocabulary;
             defaults to the notebook-level `word2vec_model`

    Returns:
    the array produced by keras' pad_sequences (one row per sentence, padded at the end)

    Words that are not in the vocabulary are silently dropped.
    NOTE(review): pad_sequences is called without an explicit pad value; if it pads with 0
    and 0 is also a real vocabulary index, padding is indistinguishable from that word —
    confirm and shift indices / add a padding row if so.
    """
    if model is None:
        # Fall back to the model trained earlier in the notebook (original behaviour).
        model = word2vec_model

    # Resolve the vocabulary once instead of on every token.
    vocab = model.wv.vocab

    result = []
    for sentence in sentences:
        indices = [vocab[word].index for word in sentence.split() if word in vocab]
        result.append(indices)
    return keras.preprocessing.sequence.pad_sequences(result, maxlen=longest_sentence_len, padding='post')

In [51]:
# Encode both splits as index sequences using the same target length
# (derived from the training corpus), so train and test inputs share one shape.
X_train_padded = sentence_to_indices_padded(X_train, longest_sentence_len)
X_test_padded = sentence_to_indices_padded(X_test, longest_sentence_len)

In [56]:
# Word-vector matrix of the trained Word2Vec model; used below to initialise the embedding layer.
pretrained_weights = word2vec_model.wv.vectors

In [57]:
# Unpack the matrix shape: number of vocabulary entries and vector dimensionality.
vocab_size, embedding_size = pretrained_weights.shape

In [58]:
# Show the number of rows in the embedding matrix.
vocab_size

9195

In [59]:
# Show the dimensionality of each word vector.
embedding_size

100

In [60]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a frozen Keras Embedding() layer initialised from pre-trained word vectors.

    Arguments:
    word_to_vec_map -- mapping from words to their 1-D vector representation
    word_to_index -- dictionary mapping words to their integer row index in the embedding matrix

    Returns:
    embedding_layer -- non-trainable Keras Embedding layer holding the pre-trained vectors

    Raises:
    ValueError -- if word_to_index is empty, so the embedding dimension cannot be inferred
    """
    if not word_to_index:
        raise ValueError("word_to_index is empty; cannot infer the embedding dimension")

    # One row more than the number of words, as in the original implementation.
    vocab_len = len(word_to_index) + 1

    # Infer the vector dimensionality from a word that is guaranteed to be in the
    # vocabulary instead of a hard-coded probe word that may be absent.
    probe_word = next(iter(word_to_index))
    emb_dim = word_to_vec_map[probe_word].shape[0]

    # Rows without a matching vector stay all-zero.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, index in word_to_index.items():
        vector = word_to_vec_map[word]
        # Ignore vectors whose length disagrees with emb_dim.
        if vector.shape[0] == emb_dim:
            emb_matrix[index, :] = vector

    # trainable=False keeps the pre-trained vectors fixed during training.
    embedding_layer = keras.layers.Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=False)

    # The layer must be built before its weights can be assigned.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [61]:
def create_LSTM_model(pretrained_weights, longest_sentence_len):
    """
    Builds the (uncompiled) regression network: embedding -> two stacked LSTMs -> dense head.

    Arguments:
    pretrained_weights -- 2-D array of shape (vocab_size, embedding_size) used to
                          initialise the embedding layer
    longest_sentence_len -- number of token indices in each input sequence
                            (an int; an already-formed shape tuple/list is also accepted)

    Returns:
    model -- keras.Sequential instance ending in a single linear output unit
    """
    vocab_size, embedding_size = pretrained_weights.shape

    # Keras documents `shape` as a tuple that excludes the batch dimension;
    # passing a bare int is not portable across Keras/TensorFlow versions.
    if isinstance(longest_sentence_len, (tuple, list)):
        input_shape = tuple(longest_sentence_len)
    else:
        input_shape = (int(longest_sentence_len),)

    model = keras.Sequential()
    model.add(layers.Input(shape=input_shape, dtype='int32'))
    # Embedding starts from the pre-trained vectors; `trainable` is not set here,
    # so the layer keeps the Keras default for it.
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights]))
    # First LSTM returns the full sequence so the second LSTM can consume it.
    model.add(layers.LSTM(4, return_sequences=True, name='LSTM1'))
    model.add(layers.Dropout(0.25, name='Dropout1'))
    # Second LSTM returns only its final output.
    model.add(layers.LSTM(4, return_sequences=False, name='LSTM2'))
    model.add(layers.Dropout(0.25, name='Dropout2'))
    model.add(layers.Dense(4, name='Dense', activation='sigmoid'))
    model.add(layers.Dropout(0.1))
    # Single linear unit: the model predicts one real-valued target.
    model.add(layers.Dense(1, activation='linear'))

    return model

In [62]:
# Build the network from the Word2Vec vectors and the padded sequence length.
model = create_LSTM_model(pretrained_weights, longest_sentence_len)

# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 158, 100)          919500    
_________________________________________________________________
LSTM1 (LSTM)                 (None, 158, 4)            1680      
_________________________________________________________________
Dropout1 (Dropout)           (None, 158, 4)            0         
_________________________________________________________________
LSTM2 (LSTM)                 (None, 4)                 144       
_________________________________________________________________
Dropout2 (Dropout)           (None, 4)                 0         
_________________________________________________________________
Dense (Dense)                (None, 4)                 20        
_________________________________________________________________
dropout_1 (Dropout)          (None, 4)                

In [63]:
# Regression setup: optimise mean squared error with Adam and additionally report mean absolute error.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

In [None]:
# Train on the padded training sequences for 50 epochs.
# NOTE(review): no validation data is passed, so over-fitting cannot be monitored during training.
model.fit(X_train_padded, y_train, epochs=50)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50