<a href="https://colab.research.google.com/github/bantee-sharma/IMDB-Review-Sentiment-LSTM-GRU/blob/main/GRU_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
import re
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.layers import SimpleRNN,LSTM,GRU,Dropout,Embedding,Dense
from keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping
from keras.regularizers import l2

In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?dataset_version_number=1...


100%|██████████| 25.7M/25.7M [00:00<00:00, 103MB/s] 

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


In [31]:
# Assuming the dataset is downloaded and stored in the 'path' variable
dataset_file = path

# List the files in the directory to check the dataset files
files = os.listdir(dataset_file)
print(files)

# Load the CSV file
dataset = os.path.join(dataset_file,'IMDB Dataset.csv')
df = pd.read_csv(dataset)

['IMDB Dataset.csv']


In [32]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [34]:
df.shape

(50000, 2)

In [35]:
df.sentiment.value_counts().reset_index()

Unnamed: 0,sentiment,count
0,positive,25000
1,negative,25000


In [36]:
# Clean the text (remove HTML tags, punctuation, and lowercase)
def clean_sent(text):
  text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
  text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
  text = text.lower() # Convert to lowercase
  return text

In [37]:
# Apply cleaning
df['review'] = df['review'].apply(clean_sent)

In [38]:
df.head(2)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive


In [39]:
# Convert sentiments to binary labels
df['sentiment'] = df['sentiment'].map({'positive':1,'negative':0})

In [40]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [41]:
#tokenize text
tokenizer = Tokenizer(num_words=10000,oov_token = "<OOV>")
tokenizer.fit_on_texts(df['review'])


In [45]:
seq = tokenizer.texts_to_sequences(df['review'])

In [60]:
max([len(i) for i in seq])

2441

In [61]:
#padding sequence
padded_seq = pad_sequences(seq,maxlen = 200,padding='post')

In [62]:
padded_seq

array([[   1,    9, 2580, ...,  122, 3940,  499],
       [   4,  382,  115, ...,    0,    0,    0],
       [  10,  193,   11, ...,    0,    0,    0],
       ...,
       [  18,   57,   21, ...,    1,    3, 5866],
       [ 499,   15,    1, ...,   68,  704,   42],
       [  55,   28, 5642, ...,    0,    0,    0]], dtype=int32)

In [63]:
#slpit the data
from sklearn.model_selection import train_test_split

xtrain,xtest,ytrain,ytest = train_test_split(padded_seq,df['sentiment'],test_size = 0.20,random_state=42)

In [64]:
xtrain.shape,ytrain.shape

((40000, 200), (40000,))

In [65]:
xtest.shape,ytest.shape

((10000, 200), (10000,))

In [102]:
# Build the model
# Build the model
model = Sequential()
model.add(Embedding(input_dim = 10000,output_dim=100,input_shape=(200,)))
model.add(GRU(128,kernel_regularizer = l2,return_sequences=False)) #keras.kerael_regularizer corrected to kernel_regularizer
model.add(Dropout(0.4))
model.add(Dense(1,activation='sigmoid'))

  super().__init__(**kwargs)


In [103]:
model.summary()

In [104]:
from keras.optimizers import Adam,RMSprop,R

In [110]:
#compile model
model.compile(optimizer = RMSprop(learning_rate = 1e-3),loss = 'binary_crossentropy',metrics = ['acc'])

In [111]:
#define early stopping
early_stopping  = EarlyStopping(
         monitor = 'val_loss',
         patience = 3,
         restore_best_weights = True
)

In [112]:
#train model with dropout and l2
history = model.fit(xtrain,ytrain,epochs=10,batch_size=64,validation_data=(xtest,ytest))

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - acc: 0.6025 - loss: 0.7348 - val_acc: 0.7892 - val_loss: 0.5049
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - acc: 0.7064 - loss: 0.6114 - val_acc: 0.8355 - val_loss: 0.3941
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - acc: 0.8643 - loss: 0.3546 - val_acc: 0.8889 - val_loss: 0.2940
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - acc: 0.8918 - loss: 0.2946 - val_acc: 0.8932 - val_loss: 0.2746
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - acc: 0.9051 - loss: 0.2577 - val_acc: 0.8996 - val_loss: 0.2640
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - acc: 0.9141 - loss: 0.2373 - val_acc: 0.8764 - val_loss: 0.3074
Epoch 7/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1

In [123]:
# Evaluate the model train
loss, accuracy = model.evaluate(xtrain, ytrain)
print(f"train Loss: {loss:.4f}, train Accuracy: {accuracy * 100:.2f}%")

[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - acc: 0.9755 - loss: 0.0795
train Loss: 0.0803, train Accuracy: 97.60%


In [124]:
# Evaluate the model test
loss, accuracy = model.evaluate(xtest, ytest)
print(f"test Loss: {loss:.4f}, test Accuracy: {accuracy * 100:.2f}%")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - acc: 0.8610 - loss: 0.6442
test Loss: 0.6439, test Accuracy: 86.17%
