In [2]:
import numpy as np
import nltk
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the Rotten Tomatoes dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
valid_dataset = dataset['validation']
test_dataset = dataset['test']

train_text = train_dataset.to_pandas()['text']

max_text_len = 0
for text in train_text:
    max_text_len = max(max_text_len, len(text))

print(max_text_len) #length of longest train sentence
len(train_text) #number of sentences

267


8530

In [4]:
with open('embedding_matrix.pkl', 'rb') as f:
    embedding_matrix = pickle.load(f).astype(np.float32)
    padding = [0 for i in range(50)]
    embedding_matrix = np.insert(embedding_matrix, 0, padding, 0)
    print(type(embedding_matrix))

with open('vocab_word_to_index.pkl', 'rb') as f:
    vocab_word_to_index = pickle.load(f)
    print(vocab_word_to_index) #word index

# Convert to torch tensors
embedding_matrix = tf.convert_to_tensor(embedding_matrix) #adopting the glove embeddings 
vocab_size, embedding_dim = embedding_matrix.shape

<class 'numpy.ndarray'>


In [5]:
import numpy as np
import tensorflow as tf
import nltk

class SentimentDataset:
    def __init__(self, dataset, word_to_index, max_len=max_text_len):
        self.dataset = dataset
        self.word_to_index = word_to_index
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]['text']
        label = self.dataset[idx]['label']
        
        # Tokenization and word-to-index conversion
        text = text.lower()
        word_list = nltk.tokenize.word_tokenize(text)
        word_list = [word.strip("'\"") for word in word_list]  # Remove extra punctuation
        indices = [self.word_to_index.get(word, self.word_to_index.get('<UNK>')) + 1 for word in word_list]
        indices = indices[:self.max_len] + [0] * (self.max_len - len(indices))  # Padding

        return np.array(indices), np.array(label)

    def preprocess_data(self):
        texts = []
        labels = []
        for i in range(len(self.dataset)):
            features, label = self.__getitem__(i)
            texts.append(features)
            labels.append(label)
        return np.array(texts), np.array(labels)

# Create a TensorFlow dataset
def create_tf_dataset(texts, labels, batch_size=32, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(texts))  # Shuffle the dataset
    
    dataset = dataset.batch(batch_size)  # Batch the dataset
    return dataset

train_texts, train_labels = SentimentDataset(train_dataset, vocab_word_to_index, max_text_len).preprocess_data()
valid_texts, valid_labels = SentimentDataset(valid_dataset, vocab_word_to_index, max_text_len).preprocess_data()
test_texts, test_labels = SentimentDataset(test_dataset, vocab_word_to_index, max_text_len).preprocess_data()

# Create TensorFlow datasets
train_tf_dataset = create_tf_dataset(train_texts, train_labels, batch_size=32)
valid_tf_dataset = create_tf_dataset(valid_texts, valid_labels, batch_size=32, shuffle = False)
test_tf_dataset = create_tf_dataset(test_texts, test_labels, batch_size=32, shuffle = False)
'''for step, (features, labels) in enumerate(train_tf_dataset):
    print(f"Batch {step + 1}")
    print("Features (inputs):", features.numpy())  # Convert tensor to NumPy for easier reading
    print("Labels (outputs):", labels.numpy())  # Convert tensor to NumPy for easier reading
    print("\n")
    break'''

'for step, (features, labels) in enumerate(train_tf_dataset):\n    print(f"Batch {step + 1}")\n    print("Features (inputs):", features.numpy())  # Convert tensor to NumPy for easier reading\n    print("Labels (outputs):", labels.numpy())  # Convert tensor to NumPy for easier reading\n    print("\n")\n    break'

In [6]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
import random
# Build model
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

def build_model(nb_words, embedding_matrix, max_len):
    model = Sequential(name="Simple_RNN")

    # Embedding layer with pre-trained embeddings
    if embedding_matrix is not None:
        model.add(Embedding(nb_words, embedding_matrix.shape[1], weights=[embedding_matrix], trainable=False))  # Freeze the embedding layer
    else:
        model.add(Embedding(nb_words, 50, trainable=False))
    
    model.add(SimpleRNN(128,#activation= 'relu', 
                        return_sequences=False))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=Adam(learning_rate=0.0001),loss="binary_crossentropy", metrics=['accuracy'])
    return model

nb_words = len(vocab_word_to_index) + 1  # Vocabulary size
model = build_model(nb_words, embedding_matrix, max_text_len)

#early_stopping = EarlyStopping(monitor='accuracy', mode='max', patience=5, restore_best_weights=True)
#reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=3, min_lr=0.00001)
train = model.fit(train_tf_dataset, epochs=30, batch_size=32, validation_data=valid_tf_dataset)
                  #callbacks=[early_stopping])


Epoch 1/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 50ms/step - accuracy: 0.4937 - loss: 0.6942 - val_accuracy: 0.5000 - val_loss: 0.6937
Epoch 2/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 71ms/step - accuracy: 0.5187 - loss: 0.6931 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 3/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 67ms/step - accuracy: 0.4887 - loss: 0.6937 - val_accuracy: 0.5000 - val_loss: 0.6943
Epoch 4/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 57ms/step - accuracy: 0.4966 - loss: 0.6942 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 5/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 66ms/step - accuracy: 0.5005 - loss: 0.6936 - val_accuracy: 0.5000 - val_loss: 0.6933
Epoch 6/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 60ms/step - accuracy: 0.5038 - loss: 0.6936 - val_accuracy: 0.5188 - val_loss: 0.6925
Epoch 7/50
[1m2

KeyboardInterrupt: 

In [7]:
print("Simple_RNN Score---> ", model.evaluate(test_tf_dataset))

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.4746 - loss: 0.6971
Simple_RNN Score--->  [0.6974078416824341, 0.4727954864501953]


In [119]:
# Get predictions on the test dataset
predictions = model.predict(test_tf_dataset)

# Convert probabilities to binary predictions (0 or 1) using a threshold of 0.5
predicted_labels = (predictions > 0.5).astype(int).flatten()

# Get true labels from the test dataset
true_labels = np.concatenate([y.numpy() for x, y in test_tf_dataset], axis=0)

from sklearn.metrics import confusion_matrix

# Generate confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)

print("Confusion Matrix:")
print(conf_matrix)

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step
Confusion Matrix:
[[226 307]
 [229 304]]


2024-10-24 10:50:55.973759: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
