<a href="https://colab.research.google.com/github/asanth7/SP500-news-prediction/blob/main/FinalStockPrediction_Incremented_Labels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import Dense, LSTM, Flatten, Bidirectional, Dropout
from keras.layers.normalization.batch_normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras import regularizers
from gc import callbacks
import numpy as np
from torch.utils.data import Dataset
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import itertools
import csv
import nltk
import string
from copy import deepcopy


# Download the NLTK 'stopwords' corpus; it must be present before
# stopwords.words() is called below.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop-word list, referenced by clean() when filtering tokens.
STOPWORDS = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def load_data(file_path):
    """Load per-day headlines and integer labels from a CSV file.

    Every non-empty row is read as ``label, headline_1, headline_2, ...``:
    the first column is converted to ``int`` and the remaining columns are
    kept as that row's list of headline strings.

    :param file_path: path of the CSV file to read
    :return: tuple ``(headlines_per_day, labels)`` -- a list holding one list
        of headline strings per row, and the matching list of ``int`` labels.
        Both lists are empty for an empty file. When ``file_path`` is None the
        string ``"No filepath found"`` is returned instead (legacy behavior
        kept for backward compatibility).
    :raises ValueError: if a row's first column cannot be converted to ``int``
        (for example a header row)
    """
    if file_path is None:
        return "No filepath found"

    labels = []
    headlines_per_day = []
    # newline="" is what the csv module expects so that line breaks embedded
    # in quoted fields are parsed correctly; the context manager closes the file.
    with open(file_path, "r", newline="") as file:
        for row in csv.reader(file, delimiter=','):
            if not row:
                # csv.reader yields [] for blank lines; they carry no label.
                continue
            # csv.reader always yields str fields, so the label needs an
            # explicit int conversion and no bytes decoding is required.
            labels.append(int(row[0]))
            headlines_per_day.append(row[1:])

    return headlines_per_day, labels

In [None]:
# Load the labeled headline CSV from the Colab filesystem.
file_path = r'/content/S&P500_RedditNews_labeled_data_INCREMENTED_LABELS.csv'
headlines, labels = load_data(file_path)
# First hold out 20% of the rows as the test set, then carve 25% of the
# remaining rows out as the validation set (i.e. 60/20/20 overall).
# Fixed random_state values keep both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(headlines, labels, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2)

In [None]:
def clean(headline, stop_words=None):
    """Tokenize a headline: lowercase, strip punctuation, drop stop words.

    :param headline: raw headline string
    :param stop_words: optional iterable of lowercase stop words to remove;
        defaults to the module-level ``STOPWORDS`` list
    :return: list of lowercase tokens with ASCII punctuation removed and stop
        words filtered out (empty list for an empty or all-stop-word headline)
    """
    if stop_words is None:
        stop_words = STOPWORDS
    # Set membership is O(1); a list would be scanned once per token.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)

    # Lowercase BEFORE filtering: the stop words are compared case-sensitively,
    # so capitalized forms such as "The" would otherwise slip through.
    nopunct = headline.lower().translate(str.maketrans('', '', string.punctuation))
    # split() without an argument splits on any whitespace run, so repeated
    # spaces, tabs and newlines never produce empty-string tokens.
    return [word for word in nopunct.split() if word not in stop_words]

In [None]:
# Flatten each split from days -> headlines and tokenize every headline,
# giving one token list per headline (day boundaries are not kept).
X_train_token = list(map(clean, itertools.chain.from_iterable(X_train)))
X_val_token = list(map(clean, itertools.chain.from_iterable(X_val)))

In [None]:
# Length of the longest tokenized headline across the training and validation
# splits (0 when both are empty); later used as the fixed sequence length.
max_len = max(
    (len(tokens) for split in (X_train_token, X_val_token) for tokens in split),
    default=0,
)

In [None]:
# Notebook cell output: show the computed maximum headline length in tokens.
max_len

In [None]:
def buildVocabulary(X_train_token, X_val_token):
    """Build word frequencies and a word -> index mapping from tokenized text.

    :param X_train_token: list of token lists from the training split
    :param X_val_token: list of token lists from the validation split
    :return: tuple ``(word_count, word2ind, vocab_size)`` where
        ``word_count`` maps each distinct word to its number of occurrences,
        ``word2ind`` maps each distinct word to a 1-based index (0 is left
        free for padding / unknown words), and ``vocab_size`` is the number of
        distinct words
    """
    from collections import Counter

    # Count every word in a single pass instead of calling list.count() once
    # per unique word, which is quadratic in the corpus size.
    word_count = dict(Counter(
        word.strip('\n')
        for split in (X_train_token, X_val_token)
        for phrase in split
        for word in phrase
    ))
    print(len(word_count))
    print('created word_count')

    # Sort before numbering: set iteration order for strings changes between
    # interpreter runs, which would make the indices irreproducible.
    word2ind = {word: i for i, word in enumerate(sorted(word_count), start=1)}
    vocab_size = len(word2ind)
    print('created word2ind')

    return word_count, word2ind, vocab_size

In [None]:
# Build the vocabulary from the training and validation tokens only.
word_count, word2ind, vocab_size = buildVocabulary(X_train_token, X_val_token)

In [None]:
def vectorize(tokens, max_len, word2ind):
    '''
    Convert a token sequence into a fixed-length vector of vocabulary indices.

    Tokens missing from ``word2ind`` are encoded as 0, unused trailing
    positions stay 0, and tokens beyond ``max_len`` are dropped.

    :param tokens: iterable of word tokens for one headline
    :param max_len: length of the returned vector
    :param word2ind: dict mapping a word to its integer index
    :return: 1D numpy array (length = max_len, float dtype); if an argument is
        None, an explanatory string is returned instead (legacy behavior)
    '''

    if tokens is None:
        return "No tokens found"
    if max_len is None:
        return "No max_len provided"
    if word2ind is None:
        return "No word2ind found"

    sentence = np.zeros(max_len)
    # range(max_len) comes first in zip so iteration stops at the vector's end:
    # sequences longer than max_len are truncated instead of raising IndexError.
    for position, token in zip(range(max_len), tokens):
        sentence[position] = word2ind.get(token, 0)

    return sentence


In [None]:
# Encode every tokenized headline as a length-max_len index vector and collect
# the vectors of each split into one numpy array (one row per headline).
X_train_array = np.array([vectorize(tokens, max_len, word2ind) for tokens in X_train_token])
X_val_array = np.array([vectorize(tokens, max_len, word2ind) for tokens in X_val_token])
print('train and val arrays done')
# Sanity check: the last axis must equal the fixed sequence length.
assert X_train_array.shape[-1] == max_len

In [None]:
# Notebook cell output: show the shape of the encoded training array.
X_train_array.shape

In [None]:
def convert2onehot(labels, num_classes, num_repeats=25):
    """Repeat each label and one-hot encode the result.

    Every label is repeated ``num_repeats`` times consecutively, so that a
    per-day label lines up with that day's individual headline rows, and each
    repeated label becomes a one-hot row.

    :param labels: sequence of integer class labels
    :param num_classes: width of each one-hot row
    :param num_repeats: how many consecutive rows each label produces
        (defaults to 25, the previously hard-coded value)
    :return: integer numpy array of shape
        ``(len(labels) * num_repeats, num_classes)``; the string
        ``"No labels found"`` when ``labels`` is None (legacy behavior)
    :raises IndexError: if a label is outside the range of ``num_classes`` or
        the labels are not integers
    """

    if labels is None:
        return "No labels found"

    labels = np.repeat(np.asarray(labels), num_repeats)
    print(len(labels))

    onehot = np.zeros((labels.size, num_classes), dtype=int)
    if labels.size:
        # Set one cell per row in a single vectorized assignment instead of
        # deep-copying a template list for every label.
        onehot[np.arange(labels.size), labels] = 1
    return onehot

In [None]:
# One-hot encode the labels into 10 classes. convert2onehot repeats every
# label so one label row exists per headline row.
# NOTE(review): this relies on each day contributing exactly as many headlines
# as convert2onehot repeats -- confirm against the CSV.
y_train_onehot = convert2onehot(y_train, 10)
y_val_onehot = convert2onehot(y_val, 10)
print('onehots done')
# Sanity check: one column per class.
assert y_train_onehot.shape[1] == 10

In [None]:
# Debug output: the row counts of the inputs and the one-hot labels should
# match before training.
print(X_train_array.shape)
print(y_train_onehot.shape)

# Dump both arrays for a visual check.
print(X_train_array)
print('\n')
print(y_train_onehot)

In [None]:
# KERAS MODEL

num_classes = 10

# Stacked-LSTM classifier over embedded headline tokens.
# The embedding has vocab_size + 1 rows because index 0 is used for padding
# and out-of-vocabulary tokens, while real words are numbered from 1.
# The Sequential model derives each layer's input shape from the layer before
# it, so the LSTM needs no input_shape of its own (the previous value,
# (num_samples, max_len), did not describe the (max_len, 48) embedded input).
stock_model = keras.Sequential([
    Embedding(vocab_size + 1, 48, input_length=max_len),
    LSTM(units=64, return_sequences=True),
    LSTM(units=32),
    BatchNormalization(),
    Dropout(0.35),
    Dense(units=16, kernel_regularizer=regularizers.l2(0.001), activation='relu'),
    BatchNormalization(),
    Dropout(0.35),
    Dense(units=12, activation='relu'),
    BatchNormalization(),
    Dropout(0.35),
    Dense(units=num_classes, kernel_regularizer=regularizers.l2(0.001), activation='softmax')
])

stock_model.summary()

# Stop on the validation loss: the training loss usually keeps falling while
# the model overfits, so monitoring it rarely triggers a useful early stop.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# checkpoint_filepath = r'C:\Users\AravSanthanam\Polygence_2022\mdcheckpoints'
# model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=checkpoint_filepath,
#     save_weights_only=True,
#     monitor='val_accuracy',
#     mode='max',
#     save_best_only=True)

stock_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = stock_model.fit(
    X_train_array,
    y_train_onehot,
    batch_size=48,
    epochs=15,
    validation_data=(X_val_array, y_val_onehot),
    shuffle=True,
    callbacks=[callback],
    verbose=1,
)

#stock_model.load_weights(checkpoint_filepath)

In [None]:
# Plot per-epoch accuracy on the training and validation data from the fit history.
for metric_name in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_name])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()