In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import joblib 

# Load and Preprocess the Dataset

In [2]:
# Absolute local path of the review CSV; adjust it when running elsewhere.
DATA_PATH = r"D:\Projects\Sentiment_analysis\movie.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Show the loaded frame as the cell output to sanity-check its 'text' and 'label' columns.
df

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [4]:
# Inputs are the review strings, targets are the sentiment labels.
X, y = df['text'], df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Every review is padded or cut to this many tokens.
max_length = 100

# The vocabulary is fitted on the training texts only, so the test split does
# not influence it; unknown words are represented by the "<OOV>" token.
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Id sequences -> fixed-width arrays; padding and truncation both happen at
# the end of a sequence ('post').
X_train_padded, X_test_padded = (
    pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')
    for seq in (X_train_seq, X_test_seq)
)

# One slot per word in word_index plus one extra for index 0.
# NOTE(review): word_index presumably lists every word seen during fitting,
# not just the num_words most frequent ones, so this can exceed 20000 — confirm.
vocab_size = len(tokenizer.word_index) + 1

# Load Pre-Trained Embeddings

In [None]:
embedding_dim = 100  # Dimension of embeddings

# Parse the GloVe text file: the first whitespace-separated field of each line
# is the token, the remaining fields are its vector components.
embedding_index = {}
with open(r"D:\Projects\Sentiment_analysis\glove.6B\glove.6B.100d.txt", 'r', encoding='utf-8') as file:  # Use appropriate file path
    for line in file:
        values = line.split()
        embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Row i receives the pre-trained vector of the word whose tokenizer id is i;
# rows for words that have no pre-trained vector are left as zeros.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in embedding_index:
        embedding_matrix[i] = embedding_index[word]


# Model Architectures

In [8]:
def build_model(model_type='LSTM'):
    """Build and compile a binary classifier on top of frozen pre-trained embeddings.

    Layer stack: Embedding (initialised from ``embedding_matrix``, not
    trainable) -> one recurrent layer selected by ``model_type`` ->
    Dropout(0.5) -> Dense(64, relu) -> Dense(1, sigmoid).

    Uses the notebook-level names ``vocab_size``, ``embedding_dim``,
    ``embedding_matrix`` and ``max_length``, which must be defined before
    this function is called.

    Args:
        model_type: Which recurrent layer to use: 'LSTM', 'GRU' or
            'Bidirectional' (a bidirectional LSTM). Each has 128 units.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    supported_types = ('LSTM', 'GRU', 'Bidirectional')
    # Fail fast: without this check an unrecognised name would fall through
    # every branch below and yield a model with no recurrent layer at all.
    if model_type not in supported_types:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_types}"
        )

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
                        input_length=max_length, trainable=False))
    # return_sequences=False: only the final recurrent output is passed on.
    if model_type == 'LSTM':
        model.add(LSTM(128, return_sequences=False))
    elif model_type == 'GRU':
        model.add(GRU(128, return_sequences=False))
    else:  # 'Bidirectional'
        model.add(Bidirectional(LSTM(128, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    # A single sigmoid unit pairs with the binary cross-entropy loss below.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# Train and Save Models

In [11]:
models = ['LSTM', 'GRU', 'Bidirectional']
saved_models = {}  # architecture name -> trained model, kept in memory for later cells

for model_type in models:
    print(f"Training {model_type} model...")
    model = build_model(model_type)
    model.fit(
        X_train_padded,
        y_train,
        validation_split=0.2,
        epochs=10,
        batch_size=64,
        # A fresh callback per model: stop once val_loss has not improved for
        # 3 epochs and roll back to the best weights seen.
        callbacks=[
            EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
        ],
    )
    model.save(f'{model_type}_model.h5')  # Save model
    saved_models[model_type] = model


Training LSTM model...




Epoch 1/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 122ms/step - accuracy: 0.5406 - loss: 0.6915 - val_accuracy: 0.5605 - val_loss: 0.6867
Epoch 2/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 121ms/step - accuracy: 0.6241 - loss: 0.6369 - val_accuracy: 0.7742 - val_loss: 0.4743
Epoch 3/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 121ms/step - accuracy: 0.7814 - loss: 0.4660 - val_accuracy: 0.7884 - val_loss: 0.4405
Epoch 4/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 121ms/step - accuracy: 0.7956 - loss: 0.4373 - val_accuracy: 0.7905 - val_loss: 0.4306
Epoch 5/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 121ms/step - accuracy: 0.8062 - loss: 0.4223 - val_accuracy: 0.8053 - val_loss: 0.4212
Epoch 6/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 121ms/step - accuracy: 0.8111 - loss: 0.4102 - val_accuracy: 0.8008 - val_loss: 0.4206
Epoch 7/10



Training GRU model...
Epoch 1/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 119ms/step - accuracy: 0.5843 - loss: 0.6601 - val_accuracy: 0.7683 - val_loss: 0.4903
Epoch 2/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 117ms/step - accuracy: 0.7869 - loss: 0.4625 - val_accuracy: 0.8025 - val_loss: 0.4242
Epoch 3/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 118ms/step - accuracy: 0.8054 - loss: 0.4178 - val_accuracy: 0.8105 - val_loss: 0.4070
Epoch 4/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 118ms/step - accuracy: 0.8209 - loss: 0.3918 - val_accuracy: 0.8152 - val_loss: 0.4027
Epoch 5/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 118ms/step - accuracy: 0.8323 - loss: 0.3707 - val_accuracy: 0.8241 - val_loss: 0.3852
Epoch 6/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 118ms/step - accuracy: 0.8418 - loss: 0.3494 - val_accuracy: 0.8150 - val_l



Training Bidirectional model...
Epoch 1/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 211ms/step - accuracy: 0.6197 - loss: 0.6457 - val_accuracy: 0.7423 - val_loss: 0.5168
Epoch 2/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 186ms/step - accuracy: 0.7559 - loss: 0.5068 - val_accuracy: 0.7497 - val_loss: 0.5012
Epoch 3/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 185ms/step - accuracy: 0.7888 - loss: 0.4501 - val_accuracy: 0.8005 - val_loss: 0.4228
Epoch 4/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 194ms/step - accuracy: 0.8109 - loss: 0.4115 - val_accuracy: 0.8022 - val_loss: 0.4213
Epoch 5/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 187ms/step - accuracy: 0.8224 - loss: 0.3906 - val_accuracy: 0.8169 - val_loss: 0.3988
Epoch 6/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 193ms/step - accuracy: 0.8364 - loss: 0.3622 - val_accuracy: 0.81



# Evaluate Models

In [12]:
# Score each trained model on the held-out test split and report its metrics.
for model_type in saved_models:
    model = saved_models[model_type]
    # evaluate() yields the loss followed by the compiled metric (accuracy).
    loss, accuracy = model.evaluate(X_test_padded, y_test)
    print(f"{model_type} Model - Loss: {loss}, Accuracy: {accuracy}")


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 34ms/step - accuracy: 0.8174 - loss: 0.3903
LSTM Model - Loss: 0.3956148326396942, Accuracy: 0.8176249861717224
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 27ms/step - accuracy: 0.8279 - loss: 0.3792
GRU Model - Loss: 0.384158730506897, Accuracy: 0.8255000114440918
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.8156 - loss: 0.4015
Bidirectional Model - Loss: 0.39977532625198364, Accuracy: 0.8153749704360962


In [None]:
# Persist the fitted tokenizer next to the saved models, presumably so the same
# word -> id mapping can be reused when preprocessing new text later.
# `joblib` is already imported in the notebook's first cell, so the imports
# are not repeated here.
# NOTE: the file is a pickle; only load it from a trusted source.
joblib.dump(tokenizer, "tokenizer.pkl")
print("Tokenizer saved!")


Tokenizer saved!
