In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt


In [90]:
# Load the dataset
data = pd.read_csv("./IMDB Dataset.csv")
# data = pd.read_csv("https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/data")

print(data.head())  

# Preprocessing the text data (convert 'sentiment' to binary)
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})  # Convert labels to binary (1, 0)
print(data.head())  # Confirm 'sentiment' is mapped to 0 and 1

# Define reviews and labels from the DataFrame
reviews = data['review'].values
labels = data['sentiment'].values

# Split the dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)

# Tokenize the text data
vocab_size = 5000
max_len = 300
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)

# Convert text to sequences and pad them
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)
x_train_padded = pad_sequences(x_train_seq, maxlen=max_len, padding='post')
x_test_padded = pad_sequences(x_test_seq, maxlen=max_len, padding='post')

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1


In [91]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\LEGION\.cache\kagglehub\datasets\lakshmi25npathi\imdb-dataset-of-50k-movie-reviews\versions\1


In [92]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),  # Use max_len here
    SimpleRNN(64, return_sequences=False),  # RNN layer
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer (binary classification)
])

max_len = 300 
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
model.summary()



In [93]:
model.fit(x_train_padded, y_train, epochs=5, batch_size=32, validation_data=(x_test_padded, y_test))


Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 47ms/step - accuracy: 0.5053 - loss: 0.6938 - val_accuracy: 0.5057 - val_loss: 0.6943
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 47ms/step - accuracy: 0.5218 - loss: 0.6886 - val_accuracy: 0.5310 - val_loss: 0.6809
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 46ms/step - accuracy: 0.5572 - loss: 0.6536 - val_accuracy: 0.5399 - val_loss: 0.6723
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 46ms/step - accuracy: 0.5725 - loss: 0.6311 - val_accuracy: 0.5439 - val_loss: 0.6839
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 46ms/step - accuracy: 0.5857 - loss: 0.6109 - val_accuracy: 0.5363 - val_loss: 0.6942


<keras.src.callbacks.history.History at 0x1702bd0bed0>

In [94]:
oss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.5366 - loss: 0.6952
Test Accuracy: 0.5363


In [95]:
max_length = 300  

def predict_sentiment(review):
    sequence = tokenizer.texts_to_sequences([review])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post')  # Now max_length is defined
    prediction = model.predict(padded)[0][0]

    if prediction > 0.5:
        return "Positive "
    else:
        return "Negative "

# Test again
sample_review = "The movie was absolutely fantastic! I loved it."
print(predict_sentiment(sample_review))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
Negative 
