In [1]:
import json
import os
import sys

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

# Make the sibling `src` folder (relative to the current working directory)
# importable; the guard keeps re-runs of this cell from appending it twice.
src_dir = os.path.join(os.getcwd(), '..', 'src')
if sys.path.count(src_dir) == 0:
    sys.path.append(src_dir)

from model_utils import build_simple_RNN_model

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aaron\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aaron\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the preprocessed tweets CSV into a DataFrame
csv_file_path = '../data/processed/preprocessed_tweets.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [3]:
# Features are the tweet texts, labels are the sentiments
X, y = df['tweet'], df['sentiment']

# Hold out 20% of the rows as a test set; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Data preparation for NN models

In [4]:
# Learn the word index from the training tweets only (num_words caps the
# vocabulary used when encoding), so nothing from the test split leaks in.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

# Encode both splits as lists of integer word indices
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [5]:
# Pad every sequence (at the end) up to the longest training sequence.
# Tweets are character-limited, so this length will not be too large.
max_length = max(map(len, X_train_seq))

X_train_padded = pad_sequences(
    X_train_seq,
    padding='post',
    maxlen=max_length,
)
X_test_padded = pad_sequences(
    X_test_seq,
    padding='post',
    maxlen=max_length,
)

In [6]:
# Serialize the fitted tokenizer to a JSON string and write it to disk
json_file_path = '../data/processed/tokenizer.json'
tokenizer_json = tokenizer.to_json()
with open(json_file_path, mode="w", encoding="utf-8") as tokenizer_file:
    tokenizer_file.write(tokenizer_json)

### Building, training, and evaluating neural network models

In [7]:
# Seed the Python, NumPy and backend RNGs *before* the model is created so that
# weight initialisation and batch shuffling are repeatable across
# Restart & Run All, as far as the backend allows. 42 mirrors the random_state
# already used for the train/test split.
set_random_seed(42)

# Build the simple RNN; input_length is the padded sequence length computed
# from the training sequences.
rnn_model = build_simple_RNN_model(input_length=max_length, learning_rate=0.0005)

In [8]:
# Fit the RNN for 5 epochs in mini-batches of 32, holding out 20% of the
# training data for per-epoch validation.
# NOTE(review): ReduceLROnPlateau is imported at the top of the notebook but no
# callbacks are passed here — confirm whether that was intended.
history_rnn = rnn_model.fit(
    X_train_padded,
    y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m31994/31994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 5ms/step - accuracy: 0.7477 - loss: 0.5133 - val_accuracy: 0.7948 - val_loss: 0.4469
Epoch 2/5
[1m31994/31994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 5ms/step - accuracy: 0.7997 - loss: 0.4359 - val_accuracy: 0.8001 - val_loss: 0.4320
Epoch 3/5
[1m31994/31994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 5ms/step - accuracy: 0.8032 - loss: 0.4314 - val_accuracy: 0.7874 - val_loss: 0.4373
Epoch 4/5
[1m31994/31994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 5ms/step - accuracy: 0.8096 - loss: 0.4180 - val_accuracy: 0.8055 - val_loss: 0.4314
Epoch 5/5
[1m31994/31994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 5ms/step - accuracy: 0.8114 - loss: 0.4149 - val_accuracy: 0.7906 - val_loss: 0.4500


In [9]:
# Score the trained RNN on the held-out test split; evaluate() yields the loss
# followed by the accuracy, and only the accuracy is reported.
rnn_loss, rnn_acc = rnn_model.evaluate(
    X_test_padded,
    y_test,
)
print(f'RNN Model Accuracy: {rnn_acc}')

[1m9998/9998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1ms/step - accuracy: 0.7899 - loss: 0.4511
RNN Model Accuracy: 0.7893228530883789
