# **`Importing`** ***`Libraries`*** ✈

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string
import nltk
from nltk.corpus import stopwords
import statistics
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,SimpleRNN,GRU,Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

# **Importing** **Data** ⏭

In [2]:
df = pd.read_csv('IMDB Dataset.csv',nrows=25000)

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# **Text** **Preprocessing** ✅


In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
def remove_html_tags(text):
  clean_text = re.sub(r'<.*?>', '', text)
  return clean_text

df['review'] = df['review'].apply(remove_html_tags)
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
import string
punc = string.punctuation
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
def remove_punc(text):
  clean_text = "".join([char for char in text if char not in punc])
  return clean_text

df['review'] = df['review'].apply(remove_punc)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [9]:
df['review'] =  df['review'].str.lower()

In [10]:
def stp_words(text):
  clean_text = [word for word in text.split() if word not in stopwords.words('english')]
  return clean_text

df['review'] = df['review'].apply(stp_words)

In [12]:
token = Tokenizer()
token.fit_on_texts(df['review'])

In [13]:
len(token.word_index)

144060

In [14]:
seq = token.texts_to_sequences(df['review'])

In [15]:
average = statistics.mean

In [16]:
average([len(x) for x in seq])

119.99108

In [17]:
padding = pad_sequences(seq,maxlen=220,padding='pre')

In [18]:
padding.shape

(25000, 220)

In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

In [20]:
x= padding
y = df['sentiment']

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.175, random_state=42)

In [22]:
x_train.shape , x_test.shape , y_train.shape , y_test.shape

((20625, 220), (4375, 220), (20625,), (4375,))

In [23]:
y.value_counts()

sentiment
0    12526
1    12474
Name: count, dtype: int64

In [24]:
vocab_size = len(token.word_index) + 1
print(f"Vocabulary size in x_train: {vocab_size}")

Vocabulary size in x_train: 144061


# **Model** **Building** ⌛

In [28]:
m1 = Sequential()
m1.add(Embedding(67142,100,input_length=220))
m1.add(SimpleRNN(150,return_sequences=True))
m1.add(SimpleRNN(50,return_sequences=True))
m1.add(SimpleRNN(25))
m1.add(Dense(1,activation='sigmoid'))

m1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 220, 100)          6714200   
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 220, 150)          37650     
                                                                 
 simple_rnn_4 (SimpleRNN)    (None, 220, 50)           10050     
                                                                 
 simple_rnn_5 (SimpleRNN)    (None, 25)                1900      
                                                                 
 dense_1 (Dense)             (None, 1)                 26        
                                                                 
Total params: 6763826 (25.80 MB)
Trainable params: 6763826 (25.80 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
m1.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
m1.fit(x_train,y_train,epochs=5,batch_size=500,validation_data=(x_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d6b6b79c9a0>

In [30]:
m2 = Sequential()
m2.add(Embedding(144771,output_dim=250,input_length=220))
m2.add(Bidirectional(LSTM(150,return_sequences=True)))
m2.add(Bidirectional(GRU(100,return_sequences=True)))
m2.add(Dropout(0.2))
m2.add(Bidirectional(LSTM(50,return_sequences=True)))
m2.add(Bidirectional(GRU(30,return_sequences=True)))
m2.add(Dropout(0.2))
m2.add(LSTM(10))
m2.add(Dense(1,activation='sigmoid'))

m2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 220, 250)          36192750  
                                                                 
 bidirectional (Bidirection  (None, 220, 300)          481200    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 220, 200)          241200    
 onal)                                                           
                                                                 
 dropout (Dropout)           (None, 220, 200)          0         
                                                                 
 bidirectional_2 (Bidirecti  (None, 220, 100)          100400    
 onal)                                                           
                                                      

In [31]:
m2.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
m2.fit(x,y,epochs=7,batch_size=500)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x7d6ae4f667d0>

In [42]:
m3 = Sequential()
m3.add(Embedding(144771,output_dim=250,input_length=220))
m3.add(Bidirectional(LSTM(170,return_sequences=True)))
m3.add(Bidirectional(GRU(130,return_sequences=True)))
m3.add(Dropout(0.2))
m3.add(Bidirectional(LSTM(70,return_sequences=True)))
m3.add(Bidirectional(GRU(40,return_sequences=True)))
m3.add(Dropout(0.2))
m3.add(Bidirectional(GRU(20,return_sequences=True)))
m3.add(LSTM(20))
m3.add(Dense(1,activation='sigmoid'))

m3.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 220, 250)          36192750  
                                                                 
 bidirectional_24 (Bidirect  (None, 220, 340)          572560    
 ional)                                                          
                                                                 
 bidirectional_25 (Bidirect  (None, 220, 260)          368160    
 ional)                                                          
                                                                 
 dropout_10 (Dropout)        (None, 220, 260)          0         
                                                                 
 bidirectional_26 (Bidirect  (None, 220, 140)          185360    
 ional)                                                          
                                                      

In [43]:
m3.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
m3.fit(x_train,y_train,epochs=15,validation_data=(x_test,y_test))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7d6ae5a246a0>

**m3 model delivered an accuracy of 99.90 after 15 epochs.**

Others metrics after 15 epochs are:


*   loss : 0.38
*   validation accuracy : 84.73
*   validation loss : 86.81



# **Evaluation** ⚓

In [48]:
text = ['''One of the most brain dead idiotic movies with the most crazy and psychotic characters as protagonists of 2024.
What a waste of time and energy. There are 0 characters in this movie that are remotely normal''' ]
seq = token.texts_to_sequences(text)
padding = pad_sequences(seq,maxlen=220,padding='pre')
pred = mode.predict(padding)
if pred < 0.2:
    print('flop movie')
elif 0.2 <= pred < 0.55:
    print('average movie')
elif 0.55 <= pred < 0.8:
    print('good movie')
else:
    print('Blockboster movie')

flop movie


In [50]:
text = ['''Laapata Ladies is a sort of film that is very rare these days. A satire that is an eye-opener for the audience and needs everyone's attention. With unknown cast, limited budget and great script, it's a movie for all age-groups, every gender and is bound to go a long way.

This Kiran Rao's tragicomedy of two brides in rural India who accidentally get swapped during a train journey while returning from their wedding. Their misadventures after this strange event will make you laugh and the same time think about the social taboos a women has to go through in our society.

Ravi Kishan playing the cop investigating this case is a delight to watch. Very subtly and hilariously he gives movie the much needed comic side that perfectly balances the emotional cause for which the movie was made. Probably one of his best performance till date.
''']

In [51]:
seq = token.texts_to_sequences(text)
padding = pad_sequences(seq,maxlen=220,padding='pre')
pred = mode.predict(padding)
if pred < 0.2:
    print('flop movie')
elif 0.2 <= pred < 0.55:
    print('average movie')
elif 0.55 <= pred < 0.8:
    print('good movie')
else:
    print('Blockboster movie')

Blockboster movie
