In [None]:
import pandas as pd

In [None]:
import kagglehub

# Download the latest version of the IMDB 50k movie-review dataset.
# The returned value is the local directory that holds the dataset files.
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?dataset_version_number=1...


100%|██████████| 25.7M/25.7M [00:00<00:00, 76.2MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


In [None]:
# Read the reviews CSV from the downloaded dataset directory and preview it.
data = pd.read_csv(f"{path}/IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# (rows, columns) of the loaded DataFrame.
data.shape

(50000, 2)

In [None]:
# Inspect the last 10 rows to confirm the file was read through to the end.
data.tail(10)

Unnamed: 0,review,sentiment
49990,"Lame, lame, lame!!! A 90-minute cringe-fest th...",negative
49991,"Les Visiteurs, the first movie about the medie...",negative
49992,John Garfield plays a Marine who is blinded by...,positive
49993,Robert Colomb has two full-time jobs. He's kno...,negative
49994,This is your typical junk comedy.<br /><br />T...,negative
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


**Checking if data is balanced or not**

In [None]:
# Count the rows per sentiment label to see whether the classes are balanced.
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


**Checking if the data has null values**

In [None]:
# Number of missing values in each column.
data.isna().sum()

Unnamed: 0,0
review,0
sentiment,0


**Preprocessing the data, removing html tags and unnecessary digits and characters**

In [None]:
import re
from bs4 import BeautifulSoup

def preprocess_text(text):
    """Strip HTML markup and non-alphanumeric characters from a review.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML such as ``<br />``.

    Returns
    -------
    str
        The text with tags removed, every character other than ASCII
        letters, digits and whitespace dropped, and runs of whitespace
        collapsed to single spaces.
    """
    # Join text fragments with a space. The default empty separator glues
    # words together when a tag is the only thing between them, e.g.
    # "comedy.<br /><br />This" would end up as "comedyThis".
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tidy the extra whitespace introduced by the separator.
    return re.sub(r'\s+', ' ', text).strip()

# Clean every review with preprocess_text, then preview the result.
data['review'] = data['review'].map(preprocess_text)

data.head()


  text=BeautifulSoup(text,'html.parser').get_text()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


Converting a categorical column (like "sentiment") into numbers is crucial because most machine learning models require numerical input.

In [None]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
# NOTE(review): the recorded output shows a warning on the old
# `data.replace(..., inplace=True)` line, presumably pandas' FutureWarning
# about implicit downcasting in `replace` - confirm. Mapping the column and
# casting explicitly does not depend on that downcasting.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run, and the
# cast fails loudly if an unexpected label produced a NaN.
data['sentiment'] = (
    data['sentiment']
    .map({'positive': 1, 'negative': 0, 1: 1, 0: 0})
    .astype('int64')
)

  data.replace({'sentiment':{'positive':1,'negative':0}}, inplace = True)


In [None]:
# Sanity-check the label encoding, then preview a few rows instead of
# rendering the whole 50,000-row frame.
assert data['sentiment'].isin([0, 1]).all(), "sentiment must be encoded as 0/1"
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production The filming tech...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically theres a family where a little boy J...,0
4,Petter Matteis Love in the Time of Money is a ...,1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,Bad plot bad dialogue bad acting idiotic direc...,0
49997,I am a Catholic taught in parochial elementary...,0
49998,Im going to have to disagree with the previous...,0


Splitting the data into train and test data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
train_data, test_data = train_test_split(data, test_size=0.20, random_state=42)

Checking the data split

In [None]:
# Show the (rows, columns) of each split, one per line.
print(train_data.shape, test_data.shape, sep="\n")

(40000, 2)
(10000, 2)


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training reviews only, so the test split does
# not influence it. num_words=5000 matches the Embedding input_dim used later.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

In [None]:
# Turn each split's reviews into integer sequences and pad/truncate them to
# 200 tokens so every sample has the same length.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split["review"]), maxlen=200)
    for split in (train_data, test_data)
)

# **Creating the Y column for train and test**

In [None]:
# Target labels for each split, taken from the encoded sentiment column.
y_train, y_test = train_data['sentiment'], test_data['sentiment']

# **Building the LSTM model**

In [None]:
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU




In [None]:
# Sentiment classifier: token ids -> embeddings -> LSTM -> sigmoid probability.
# input_dim=5000 matches the tokenizer's num_words and shape=(200,) matches the
# padding length used for X_train / X_test.
# The Embedding layer no longer receives `input_length`: the sequence length is
# already fixed by the Input layer, and the argument is deprecated in Keras 3.
model = Sequential()
model.add(keras.Input(shape=(200,), name='input_layer'))
model.add(Embedding(input_dim=5000, output_dim=128, name='embedding_layer'))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, name='lstm_layer'))
model.add(Dense(1, activation='sigmoid', name='output_layer'))



# **Printing model summary**

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" to the output.
model.summary()

None


# **Compiling the model**

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# reported alongside the loss during training and evaluation.
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

# **Training the model**

In [None]:
# Train the model and keep the returned History object, so the per-epoch
# metrics can be inspected later instead of leaking its repr as cell output.
# NOTE(review): the recorded output starts at ~0.95 training accuracy in
# epoch 1, which suggests this cell was run on an already-trained model.
# Re-run the model-building and compile cells first for a fresh training run.
history = model.fit(
    X_train,              # Training data
    y_train,              # Training labels
    epochs=5,             # Number of epochs
    batch_size=64,        # Batch size
    validation_split=0.2, # 20% of the training data used for validation
)


Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 521ms/step - accuracy: 0.9549 - loss: 0.1208 - val_accuracy: 0.8755 - val_loss: 0.3932
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 530ms/step - accuracy: 0.9638 - loss: 0.1026 - val_accuracy: 0.8742 - val_loss: 0.4400
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 486ms/step - accuracy: 0.9650 - loss: 0.0987 - val_accuracy: 0.8721 - val_loss: 0.4570
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 472ms/step - accuracy: 0.9715 - loss: 0.0828 - val_accuracy: 0.8756 - val_loss: 0.4820
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 477ms/step - accuracy: 0.9761 - loss: 0.0704 - val_accuracy: 0.8674 - val_loss: 0.5026


<keras.src.callbacks.history.History at 0x7de441764be0>

# **Evaluating the model**

In [None]:
# Score the trained model on the held-out test split and report both metrics.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 111ms/step - accuracy: 0.8632 - loss: 0.4895
Test Loss: 0.4957248568534851
Test Accuracy: 0.866100013256073


# **Checking the model**

In [None]:
def predict_sentiment(review):
    """Classify a single review as "Positive" or "Negative".

    The review goes through the same cleaning (preprocess_text), tokenizing
    and padding steps that were applied to the training data, so the model
    sees input in the form it was trained on.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        "Positive" when the model's output is above 0.5, otherwise "Negative".
    """
    # Clean the text exactly as the training reviews were cleaned.
    cleaned = preprocess_text(review)
    # Tokenize and pad the review before sending it to the model.
    sequence = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=200)
    # predict returns one row per input with a single sigmoid output.
    prediction = model.predict(sequence)[0][0]
    return "Positive" if prediction > 0.5 else "Negative"


In [None]:
# Smoke-test predict_sentiment on a short review that uses a negation.
example_review = "The movie was not that good"

sentiment = predict_sentiment(example_review)

print(f"The sentiment of this review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
The sentiment of this review is: Negative
