<a href="https://colab.research.google.com/github/ahmed-sala/NLP-Assignment/blob/main/20210064_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import time
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional


**Without any RNN**

In [8]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
data = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
data['cleaned'] = data['label'].astype(str).apply(preprocess_text)
texts = data['cleaned']
labels = data['sentiment']

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)


In [10]:
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(texts)



In [11]:

X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

In [12]:
start_time = time.time()
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_lr, y_train_lr)
lr_training_time = time.time() - start_time



In [13]:
y_pred_lr = logistic_model.predict(X_test_lr)
accuracy_lr = accuracy_score(y_test_lr, y_pred_lr)
print("Logistic Regression Results:")
print(f"Training Time: {lr_training_time:.2f} seconds")
print(f"Accuracy: {accuracy_lr:.2f}")
print("Classification Report:")
print(classification_report(y_test_lr, y_pred_lr, target_names=label_encoder.classes_))

Logistic Regression Results:
Training Time: 0.17 seconds
Accuracy: 0.87
Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.86      0.86      4961
    positive       0.86      0.88      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



**With RNN (LSTM)**

In [14]:
max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X_seq = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')


In [15]:
X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    X_seq, y, test_size=0.2, random_state=42
)
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    Bidirectional(LSTM(128)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

start_time = time.time()
history = model.fit(
    X_train_rnn, y_train_rnn,
    epochs=5,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)
rnn_training_time = time.time() - start_time


Epoch 1/5




[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 27ms/step - accuracy: 0.7444 - loss: 0.4805 - val_accuracy: 0.8432 - val_loss: 0.3515
Epoch 2/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 24ms/step - accuracy: 0.9143 - loss: 0.2274 - val_accuracy: 0.8825 - val_loss: 0.2916
Epoch 3/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 23ms/step - accuracy: 0.9388 - loss: 0.1693 - val_accuracy: 0.8892 - val_loss: 0.3174
Epoch 4/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 26ms/step - accuracy: 0.9578 - loss: 0.1227 - val_accuracy: 0.8813 - val_loss: 0.3832
Epoch 5/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 23ms/step - accuracy: 0.9731 - loss: 0.0827 - val_accuracy: 0.8715 - val_loss: 0.4603


In [17]:

loss, accuracy_rnn = model.evaluate(X_test_rnn, y_test_rnn, verbose=0)
print("\nRNN Model Results:")
print(f"Training Time: {rnn_training_time:.2f} seconds")
print(f"Test Accuracy: {accuracy_rnn * 100:.2f}%")
y_pred_rnn_prob = model.predict(X_test_rnn)
y_pred_rnn = (y_pred_rnn_prob > 0.5).astype("int32")

print("\nRNN Classification Report:")
print(classification_report(y_test_rnn, y_pred_rnn, target_names=label_encoder.classes_))


RNN Model Results:
Training Time: 92.39 seconds
Test Accuracy: 86.54%
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step

RNN Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.90      0.87      4961
    positive       0.89      0.83      0.86      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

