In [5]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [6]:

# Load dataset
# A CSV file with 'review' and 'sentiment' columns
data = pd.read_csv('/content/IMDB Dataset.csv')

In [7]:
# Data Preprocessing
data['review'] = data['review'].str.replace('[^\w\s]', '').str.lower()  # Remove punctuation and convert to lowercase

In [8]:
# Convert labels to numerical values
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])  # 0 for negative, 1 for positive


In [9]:

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=42)


In [10]:


# Tokenization: Convert text into sequences of integers
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)



In [11]:
# Pad sequences to ensure equal length inputs
max_length = 100  # Maximum length of review (can be adjusted)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')


In [12]:
# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model.add(LSTM(units=128, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))



In [13]:

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [14]:
# Train the model
history = model.fit(X_train_padded, y_train, epochs=5, batch_size=64, validation_data=(X_test_padded, y_test))


Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 286ms/step - accuracy: 0.6699 - loss: 0.5820 - val_accuracy: 0.7662 - val_loss: 0.4899
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 292ms/step - accuracy: 0.8196 - loss: 0.4099 - val_accuracy: 0.8341 - val_loss: 0.3809
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 292ms/step - accuracy: 0.8725 - loss: 0.3148 - val_accuracy: 0.8408 - val_loss: 0.3774
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 298ms/step - accuracy: 0.8901 - loss: 0.2729 - val_accuracy: 0.8160 - val_loss: 0.4017
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 286ms/step - accuracy: 0.9033 - loss: 0.2437 - val_accuracy: 0.8312 - val_loss: 0.3991


In [15]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 68ms/step - accuracy: 0.8287 - loss: 0.3917
Accuracy: 83.12%


In [16]:
# Make predictions
y_pred = (model.predict(X_test_padded) > 0.5).astype("int32")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 69ms/step


In [17]:
# Evaluate with precision, recall, and F1-score
from sklearn.metrics import precision_score, recall_score, f1_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


In [18]:
# Print evaluation metrics
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Precision: 0.82
Recall: 0.85
F1 Score: 0.83
