In [5]:
import pandas as pd
from os import walk

In [170]:
import pandas as pd
import numpy as np
import os
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split

stop_words = stopwords.words('english')
porter = PorterStemmer()

# Data Pre-Processing

In [124]:
def clean_text_for_files(file_path):
    all_texts = []
    max_tokens = 0
    for filename in os.listdir(file_path):
        if filename.endswith(".txt"):
            with open(os.path.join(file_path, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                words = re.split(r'\W+',text)
                lower_words = [word.lower() for word in words ]
                re_stop_words = [word for word in lower_words if not word in stop_words]
                stem_words = [porter.stem(word) for word in re_stop_words]
                # if len(stem_words) > max_tokens:
                #     max_tokens = len(stem_words)
                all_texts.append(" ".join(stem_words))
    return all_texts #, max_tokens
    

In [125]:
pos_train = clean_text_for_files('data/train/pos')
# print(max_tokens)
neg_train = clean_text_for_files('data/train/neg')
# print(max_tokens)

In [130]:
pos_labels_train = [1] * len(pos_train)
neg_labels_train = [0] * len(neg_train)

all_text_train = pos_train + neg_train 
all_text_labels_train = np.array(pos_labels_train +neg_labels_train)

In [127]:
pos_test = clean_text_for_files('data/test/pos')
# print(max_tokens)
neg_test = clean_text_for_files('data/test/neg')
# print(max_tokens)

In [129]:
pos_labels_test = [1] * len(pos_test)
neg_labels_test = [0] * len(neg_test)

all_text_test = pos_test + neg_test
all_labels_test = np.array(pos_labels_test + neg_labels_test)

In [150]:
max_words = 100_000
max_len = 100

tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(all_text_train)
sequences = tokenizer.texts_to_sequences(all_text_train)

padd_sequences = pad_sequences(sequences, maxlen = max_len)

word_index = tokenizer.word_index

vocabulary = set(word_index.keys())

X_train, X_val, y_train, y_val = train_test_split(padd_sequences, all_text_labels_train, test_size = 0.2, random_state=42)

sequences_test = tokenizer.texts_to_sequences(all_text_test)

padd_sequences_test = pad_sequences(sequences_test, maxlen = max_len)

# Base Model

In [168]:
vocab_size = min(max_words, len(tokenizer.word_index) + 1)
embedding_dim = 100

model = Sequential()

model.add(Embedding(input_dim = vocab_size, output_dim = embedding_dim, input_length = max_len))
model.add(SimpleRNN(units=64, return_sequences=False))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
#model.add(Dense(1, activation='relu'))

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

history = model.fit(np.array(X_train), np.array(y_train), epochs=10, batch_size=100, validation_data=(np.array(X_val), np.array(y_val)))

Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.6322 - loss: 0.6003 - val_accuracy: 0.8644 - val_loss: 0.3181
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.9293 - loss: 0.2010 - val_accuracy: 0.8634 - val_loss: 0.3621
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 27ms/step - accuracy: 0.9784 - loss: 0.0711 - val_accuracy: 0.8406 - val_loss: 0.5051
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.9921 - loss: 0.0255 - val_accuracy: 0.8470 - val_loss: 0.6293
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.9989 - loss: 0.0051 - val_accuracy: 0.8540 - val_loss: 0.7870
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.9987 - loss: 0.0049 - val_accuracy: 0.8226 - val_loss: 0.8032
Epoch 7/10
[1m200/200

In [169]:
loss, accuracy = model.evaluate(padd_sequences_test, all_labels_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8147 - loss: 0.9050


# Ablation Studies

In [153]:
# Vocabulary size (based on tokenizer) and embedding dimension
vocab_size = min(max_words, len(tokenizer.word_index) + 1)  # Cap the vocabulary size
embedding_dim = 50  # Dimension of embedding vector

# Build the RNN model
model_1 = Sequential()

# Embedding layer
model_1.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))

# Simple RNN layer
model_1.add(SimpleRNN(units=100, return_sequences=False))

# Fully connected layer
model_1.add(Dense(64, activation='relu'))

# Output layer for binary classification
model_1.add(Dense(1, activation='sigmoid'))

# Compile the model
model_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model_1.fit(np.array(X_train), np.array(y_train), epochs=10, batch_size=100, validation_data=(np.array(X_val), np.array(y_val)))


Epoch 1/10




[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.5327 - loss: 0.6804 - val_accuracy: 0.8152 - val_loss: 0.4279
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.8425 - loss: 0.3764 - val_accuracy: 0.8276 - val_loss: 0.4090
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.8896 - loss: 0.2786 - val_accuracy: 0.8240 - val_loss: 0.5019
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.9262 - loss: 0.1991 - val_accuracy: 0.8140 - val_loss: 0.4590
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.9553 - loss: 0.1240 - val_accuracy: 0.8154 - val_loss: 0.6332
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.9799 - loss: 0.0609 - val_accuracy: 0.8056 - val_loss: 0.6781
Epoch 7/10
[1m625/625[0m [32m

In [154]:
loss, accuracy = model_1.evaluate(padd_sequences_test, all_labels_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7721 - loss: 1.0914


In [173]:
embedding_dim = 150

model_2 = Sequential()

# Embedding layer
model_2.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))

# Simple RNN layer
model_2.add(SimpleRNN(units=100, return_sequences=False))

# Fully connected layer
model_2.add(Dense(64, activation='relu'))

# Output layer for binary classification
model_2.add(Dense(1, activation='sigmoid'))

# Compile the model
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model_2.fit(np.array(X_train), np.array(y_train), epochs=10, batch_size=100, validation_data=(np.array(X_val), np.array(y_val)))

Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 55ms/step - accuracy: 0.5567 - loss: 0.6742 - val_accuracy: 0.5514 - val_loss: 0.9086
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 54ms/step - accuracy: 0.6807 - loss: 0.5922 - val_accuracy: 0.6926 - val_loss: 0.6960
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 49ms/step - accuracy: 0.8612 - loss: 0.3354 - val_accuracy: 0.8056 - val_loss: 0.4643
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 48ms/step - accuracy: 0.9446 - loss: 0.1575 - val_accuracy: 0.7958 - val_loss: 0.5416
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 57ms/step - accuracy: 0.9672 - loss: 0.0978 - val_accuracy: 0.8052 - val_loss: 0.6851
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 54ms/step - accuracy: 0.9825 - loss: 0.0577 - val_accuracy: 0.7944 - val_loss: 0.7830
Epoch 7/10
[1m2

In [176]:
loss, accuracy = model_2.evaluate(padd_sequences_test, all_labels_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7743 - loss: 1.3375


In [181]:
vocab_size = min(max_words, len(tokenizer.word_index) + 1)
embedding_dim = 150

model_3 = Sequential()

model_3.add(Embedding(input_dim = vocab_size, output_dim = embedding_dim, input_length = max_len))
model_3.add(SimpleRNN(units=100, return_sequences=False))
model_3.add(Dense(100, activation='relu'))
model_3.add(Dense(1, activation='sigmoid'))
#model.add(Dense(1, activation='relu'))

model_3.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

history = model_3.fit(np.array(X_train), np.array(y_train), epochs=10, batch_size=100, validation_data=(np.array(X_val), np.array(y_val)))

Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 49ms/step - accuracy: 0.6529 - loss: 0.5858 - val_accuracy: 0.8378 - val_loss: 0.3878
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 48ms/step - accuracy: 0.9139 - loss: 0.2229 - val_accuracy: 0.8488 - val_loss: 0.3733
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 52ms/step - accuracy: 0.9824 - loss: 0.0526 - val_accuracy: 0.8432 - val_loss: 0.5446
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 55ms/step - accuracy: 0.9960 - loss: 0.0141 - val_accuracy: 0.8012 - val_loss: 0.6857
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 57ms/step - accuracy: 0.9955 - loss: 0.0157 - val_accuracy: 0.8334 - val_loss: 0.8947
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 55ms/step - accuracy: 0.9947 - loss: 0.0170 - val_accuracy: 0.8248 - val_loss: 0.7374
Epoch 7/10
[1m2

In [182]:
loss, accuracy = model_3.evaluate(padd_sequences_test, all_labels_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8187 - loss: 1.0183
