In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from datetime import timedelta
from dateutil import parser
import re
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding
import os
from sklearn import metrics

  from ._conv import register_converters as _register_converters
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
df = pd.read_csv('tweets_stocks_combined_final.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2246 entries, 0 to 2245
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        2246 non-null   float64
 1   text                      2199 non-null   object 
 2   favorites                 2246 non-null   int64  
 3   retweets                  2246 non-null   int64  
 4   date                      2246 non-null   object 
 5   tweet_datetime            2246 non-null   object 
 6   date_part                 2246 non-null   object 
 7   time_part                 2246 non-null   object 
 8   hour                      2246 non-null   int64  
 9   year                      2246 non-null   int64  
 10  month                     2246 non-null   int64  
 11  datetime_60mins_after     2246 non-null   object 
 12  price_60mins_after        2246 non-null   float64
 13  datetime_20mins_before_x  2246 non-null   object 
 14  datetime

In [243]:
model_a_df = df.dropna(subset=['text'])
vals = np.round(model_a_df["60mins_price_diff_abs"].values,5)

In [20]:
x_train, x_test, y_train, y_test = train_test_split(model_a_df['text'], model_a_df['60mins_price_diff_abs'], test_size=0.33, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [242]:
benchmark_1 = np.percentile(y_train,40)
benchmark_2 = np.percentile(y_train,60)
print(benchmark_1)
print(benchmark_2)

-0.0035844155844756594
0.0181071428572295


In [274]:
y_train = y_train.apply(lambda x: 2 if x > benchmark_2 else 0 if x<benchmark_1 else 1)
y_test = y_test.apply(lambda x: 2 if x > benchmark_2 else 0 if x<benchmark_1 else 1)

# Model A (only word vectors)

In [280]:
corpus = []
for i in x_train:
    corpus.append(i.split())

In [281]:
model_a_word2vec_model = Word2Vec(corpus, min_count=1, size=100)
model_a_pretrained_weights = model_a_word2vec_model.wv.vectors

In [249]:
dimension = 100
embeddings_index = {}
f = open(f'glove/glove.twitter.27B.{dimension}d.txt', encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [250]:
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1

#text to integers
sequences = tokenizer.texts_to_sequences(x_train)
longest_sentence_len = 30
x_train_padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=longest_sentence_len, padding='post')

print('Found %s unique tokens.' % len(word_index))

Found 4823 unique tokens.


In [251]:
unique_words = set(word_index.keys())
def prepare_test_x_glove(x):
    global unique_words
    global word_index
    global longest_sentence_len
    
    result = []
    for tweet in x:
        indices = []
        for word in tweet.split():
            if word in unique_words:
                indices.append(word_index[word])
            else:
                indices.append(0)
            
        result.append(indices)
    return keras.preprocessing.sequence.pad_sequences(result, maxlen=longest_sentence_len, padding='post')

x_test_padded = prepare_test_x_glove(x_test)

In [252]:
print(np.mean([len(sequence) for sequence in sequences]))
x_test_padded.shape

27.19755600814664


(726, 30)

In [263]:
embedding_matrix = np.zeros((vocab_size, dimension))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer_glove = Embedding(vocab_size,
                            100,
                            weights=[embedding_matrix],
                            input_length=longest_sentence_len,
                            trainable=False)

# Model A - GloVe

In [264]:
def create_model_classification():
    global embedding_layer_glove
    model = keras.Sequential()
    model.add(layers.Input(shape=longest_sentence_len, dtype='int32'))
    model.add(embedding_layer_glove)
    model.add(layers.LSTM(64, return_sequences=False))
    model.add(layers.Dropout(0.25,name='Dropout1'))
    model.add(layers.Flatten())
    model.add(layers.Dense(3, activation='sigmoid'))
    return model

In [265]:
classification_model = create_model_classification()
opt = keras.optimizers.Adam(learning_rate=0.001)
classification_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
classification_model.summary()

Model: "sequential_74"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_35 (Embedding)     (None, 30, 100)           482400    
_________________________________________________________________
lstm_88 (LSTM)               (None, 64)                42240     
_________________________________________________________________
Dropout1 (Dropout)           (None, 64)                0         
_________________________________________________________________
flatten_67 (Flatten)         (None, 64)                0         
_________________________________________________________________
dense_71 (Dense)             (None, 3)                 195       
Total params: 524,835
Trainable params: 42,435
Non-trainable params: 482,400
_________________________________________________________________


In [266]:
x_train_padded.shape

(1473, 30)

1264    0.390000
1098    0.007679
721     0.006667
18      0.060000
958    -0.007375
          ...   
1680    0.007143
1114   -0.020000
1149    0.010536
1319    0.003750
877    -0.031667
Name: 60mins_price_diff_abs, Length: 1473, dtype: float64

In [275]:
from datetime import datetime
from keras.utils import to_categorical

now = datetime.now()
dt_string = now.strftime("%d%m%Y %H%Mh")

checkpoint_filepath = f'./model_a_checkpoint/classification {dt_string}.h5'
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    verbose = 1,
    save_best_only=True) 

classification_model.fit(x_train_padded, to_categorical(y_train), validation_split=0.2, epochs=15,verbose=1, callbacks=[model_checkpoint_callback])

Train on 1178 samples, validate on 295 samples
Epoch 1/15
Epoch 00001: val_loss improved from inf to 1.05218, saving model to ./model_a_checkpoint/classification 19112020 1648h.h5
Epoch 2/15
Epoch 00002: val_loss improved from 1.05218 to 1.04590, saving model to ./model_a_checkpoint/classification 19112020 1648h.h5
Epoch 3/15
Epoch 00003: val_loss did not improve from 1.04590
Epoch 4/15
Epoch 00004: val_loss improved from 1.04590 to 1.04447, saving model to ./model_a_checkpoint/classification 19112020 1648h.h5
Epoch 5/15
Epoch 00005: val_loss did not improve from 1.04447
Epoch 6/15
Epoch 00006: val_loss did not improve from 1.04447
Epoch 7/15
Epoch 00007: val_loss did not improve from 1.04447
Epoch 8/15
Epoch 00008: val_loss did not improve from 1.04447
Epoch 9/15
Epoch 00009: val_loss did not improve from 1.04447
Epoch 10/15
Epoch 00010: val_loss did not improve from 1.04447
Epoch 11/15
Epoch 00011: val_loss did not improve from 1.04447
Epoch 12/15
Epoch 00012: val_loss did not improv

<tensorflow.python.keras.callbacks.History at 0x15e6797b8>

In [276]:
classification_model = create_model_classification()
classification_model.load_weights(checkpoint_filepath)
classification_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
y_predict = classification_model.predict_classes(x_test_padded)
print(metrics.classification_report(y_test, y_predict, labels=[0,1,2]))

              precision    recall  f1-score   support

           0       0.40      0.31      0.35       283
           1       0.17      0.01      0.03       138
           2       0.43      0.69      0.53       305

    accuracy                           0.42       726
   macro avg       0.33      0.34      0.30       726
weighted avg       0.37      0.42      0.36       726

[[ 89   4 190]
 [ 44   2  92]
 [ 88   6 211]]


In [278]:
classification_model.save('./models/LSTM_glove_v2')
# model = keras.models.load_model('path/to/location')

INFO:tensorflow:Assets written to: ./models/LSTM_glove_v2/assets


## Word 2 Vec

In [291]:
word2vec_model = Word2Vec(corpus, min_count=1, size=100)
model_a_pretrained_weights = model_a_word2vec_model.wv.vectors

In [292]:
def word2vec_sentence_to_indices_padded(sentences):
    global word2vec_model
    result = []
    for sentence in sentences:
        indices = []
        sentence_splitted = sentence.split()
        for word in sentence_splitted:
            if word in word2vec_model.wv.vocab:
                indices.append(word2vec_model.wv.vocab[word].index)
        result.append(indices)
    return keras.preprocessing.sequence.pad_sequences(result, maxlen=30, padding='pre')

In [293]:
x_train_padded_word2vec = word2vec_sentence_to_indices_padded(x_train)
x_test_padded_word2vec = word2vec_sentence_to_indices_padded(x_test)

In [294]:
def create_model_word2vec():
    global model_a_pretrained_weights
    vocab_size, embedding_size = model_a_pretrained_weights.shape
    model = keras.Sequential()
    model.add(layers.Input(shape=30, dtype='int32'))
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[model_a_pretrained_weights], trainable=False))  
    model.add(layers.LSTM(32, return_sequences=False))
    model.add(layers.Flatten())
    model.add(layers.Dropout(0.25,name='Dropout1'))
    model.add(layers.Dense(3, activation='sigmoid'))
    return model

In [295]:
vocab_size, embedding_size = model_a_pretrained_weights.shape
print(vocab_size)
print(embedding_size)


4821
100


In [296]:
classification_model = create_model_word2vec()
opt = keras.optimizers.Adam(learning_rate=0.001)
classification_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
classification_model.summary()

Model: "sequential_78"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_36 (Embedding)     (None, 30, 100)           482100    
_________________________________________________________________
lstm_92 (LSTM)               (None, 32)                17024     
_________________________________________________________________
flatten_71 (Flatten)         (None, 32)                0         
_________________________________________________________________
Dropout1 (Dropout)           (None, 32)                0         
_________________________________________________________________
dense_75 (Dense)             (None, 3)                 99        
Total params: 499,223
Trainable params: 17,123
Non-trainable params: 482,100
_________________________________________________________________


In [297]:
from datetime import datetime
from keras.utils import to_categorical

now = datetime.now()
dt_string = now.strftime("%d%m%Y %H%Mh")

checkpoint_filepath = f'./model_a_checkpoint/classification {dt_string}.h5'
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    verbose = 1,
    save_best_only=True) 

classification_model.fit(x_train_padded_word2vec, to_categorical(y_train), validation_split=0.2, epochs=15,verbose=1, callbacks=[model_checkpoint_callback])

Train on 1178 samples, validate on 295 samples
Epoch 1/15
Epoch 00001: val_loss improved from inf to 1.03186, saving model to ./model_a_checkpoint/classification 19112020 1659h.h5
Epoch 2/15
Epoch 00002: val_loss improved from 1.03186 to 1.02680, saving model to ./model_a_checkpoint/classification 19112020 1659h.h5
Epoch 3/15
Epoch 00003: val_loss did not improve from 1.02680
Epoch 4/15
Epoch 00004: val_loss did not improve from 1.02680
Epoch 5/15
Epoch 00005: val_loss did not improve from 1.02680
Epoch 6/15
Epoch 00006: val_loss did not improve from 1.02680
Epoch 7/15
Epoch 00007: val_loss did not improve from 1.02680
Epoch 8/15
Epoch 00008: val_loss did not improve from 1.02680
Epoch 9/15
Epoch 00009: val_loss did not improve from 1.02680
Epoch 10/15
Epoch 00010: val_loss improved from 1.02680 to 1.02673, saving model to ./model_a_checkpoint/classification 19112020 1659h.h5
Epoch 11/15
Epoch 00011: val_loss did not improve from 1.02673
Epoch 12/15
Epoch 00012: val_loss did not improv

<tensorflow.python.keras.callbacks.History at 0x149111208>

In [298]:
classification_model = create_model_word2vec()
classification_model.load_weights(checkpoint_filepath)
classification_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
y_predict = classification_model.predict_classes(x_test_padded_word2vec)
print(metrics.classification_report(y_test, y_predict, labels=[0,1,2]))

              precision    recall  f1-score   support

           0       0.33      0.20      0.25       283
           1       0.00      0.00      0.00       138
           2       0.43      0.77      0.55       305

    accuracy                           0.40       726
   macro avg       0.25      0.33      0.27       726
weighted avg       0.31      0.40      0.33       726



  _warn_prf(average, modifier, msg_start, len(result))


In [299]:
classification_model.save('./models/LSTM_word2vec_v2')
# model = keras.models.load_model('path/to/location')

INFO:tensorflow:Assets written to: ./models/LSTM_word2vec_v2/assets
