<a href="https://colab.research.google.com/github/abhhiixxhek/Sentiment-Analysis-of-Movie-Reviews/blob/main/Sentiment_Analysis_of_Movie_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation

from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re
import tensorflow as tf
import pandas as pd

In [4]:
data = pd.read_csv(r"/content/IMDB_Dataset.csv")


data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# from sklearn.preprocessing import LabelEncoder
# d1 = data
# le = LabelEncoder()
# d1['sentiment'] = le.fit_transform(d1['sentiment'])

# X = d1.drop('sentiment', axis=1)
# y = d1['sentiment']

In [10]:
import nltk
nltk.download('stopwords')
english_stops = set(nltk.corpus.stopwords.words('english'))
def load_dataset():
    df = pd.read_csv('IMDB_Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


## Spit Dataset

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
17792    [first, i, like, make, correction, another, re...
23307    [this, four, hour, miniseries, production, two...
34651    [the, directing, behind, film, fantastic, come...
30160    [i, really, disappointed, film, the, first, wa...
3199     [dramatic, license, hate, though, necessary, r...
                               ...                        
43326    [there, movies, films, movies, often, merely, ...
48547    [arthur, miller, always, known, one, america, ...
27690    [the, main, reason, watching, picture, savor, ...
10186    [but, even, caricatures, need, plausible, plot...
26764    [legendary, movie, producer, walt, disney, bro...
Name: review, Length: 40000, dtype: object 

6206     [one, favorite, movies, ever, along, casablanc...
43186    [this, must, see, anybody, loves, thriller, sp...
18210    [i, love, horror, films, i, think, work, way, ...
7276     [this, movie, worse, heaven, gate, plan, outer...
19748    [this, probably, greatest, war, film, certainl...
 

In [12]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

## Tokenize and Pad/Truncate Reviews

In [13]:
print(x_train)
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

17792    [first, i, like, make, correction, another, re...
23307    [this, four, hour, miniseries, production, two...
34651    [the, directing, behind, film, fantastic, come...
30160    [i, really, disappointed, film, the, first, wa...
3199     [dramatic, license, hate, though, necessary, r...
                               ...                        
43326    [there, movies, films, movies, often, merely, ...
48547    [arthur, miller, always, known, one, america, ...
27690    [the, main, reason, watching, picture, savor, ...
10186    [but, even, caricatures, need, plausible, plot...
26764    [legendary, movie, producer, walt, disney, bro...
Name: review, Length: 40000, dtype: object
Encoded X Train
 [[   23     1     6 ...   257  1439 57000]
 [    9   569   421 ...   721     4  3721]
 [    2   891   423 ...     0     0     0]
 ...
 [    2   188   195 ... 12114 13439  4171]
 [   29    11  5905 ...     0     0     0]
 [ 2606     3  1184 ...   494  1593 16873]] 

Encoded X Test
 [[    5  

# Build Architecture/Model

**Embedding Layer:** in simple terms, it creates word vectors of each word in the word_index and group words that are related or have similar meaning by analyzing other words around them.

**LSTM Layer:** to make a decision to keep or throw away data by considering the current input, previous output, and previous memory. There are some important components in LSTM.

- **Forget Gate**, decides information is to be kept or thrown away
- Input Gate, updates cell state by passing previous output and current input into sigmoid activation function
- **Cell State**, calculate new cell state, it is multiplied by forget vector (drop value if multiplied by a near 0), add it with the output from input gate to update the cell state value.
- **Ouput Gate**, decides the next hidden state and used for predictions

**Dense Layer:** compute the input with the weight matrix and bias (optional), and using an activation function. I use Sigmoid activation function for this work because the output is only 0 or 1.

The optimizer is Adam and the loss function is Binary Crossentropy because again the output is only 0 and 1, which is a binary number.

## Model 1

In [14]:
from tensorflow.keras.utils import plot_model
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(964, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
plot_model(model, to_file='model1LSTM.png')
print(model.summary())
checkpoint1 = ModelCheckpoint(
    'LSTM1_2.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2955360   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 964)               62660     
                                                                 
 dense_1 (Dense)             (None, 1)                 965       
                                                                 
Total params: 3043817 (11.61 MB)
Trainable params: 3043817 (11.61 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [15]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint1])

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.74775, saving model to LSTM1_2.h5
Epoch 2/5
  1/313 [..............................] - ETA: 4s - loss: 0.1846 - accuracy: 0.9375

  saving_api.save_model(


Epoch 2: accuracy improved from 0.74775 to 0.92738, saving model to LSTM1_2.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.92738 to 0.96475, saving model to LSTM1_2.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.96475 to 0.98037, saving model to LSTM1_2.h5
Epoch 5/5
Epoch 5: accuracy improved from 0.98037 to 0.98740, saving model to LSTM1_2.h5


<keras.src.callbacks.History at 0x78e26abd1ba0>

In [16]:
y_pred = (model.predict(x_test, batch_size = 128)> 0.5).astype("int32")

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 8697
Wrong Prediction: 1303
Accuracy: 86.97


## Model 2

In [17]:
from tensorflow.keras.utils import plot_model
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model2 = Sequential()
model2.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model2.add(LSTM(LSTM_OUT))
model2.add(Dense(64, activation='relu'))
model2.add(Dense(1, activation='softmax'))
model2.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
plot_model(model2, to_file='model2LSTM.png')
print(model2.summary())
checkpoint2 = ModelCheckpoint(
    'LSTM2_2.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 130, 32)           2955360   
                                                                 
 lstm_1 (LSTM)               (None, 64)                24832     
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2984417 (11.38 MB)
Trainable params: 2984417 (11.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [18]:
model2.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint2])

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.49990, saving model to LSTM2_2.h5
Epoch 2/5
Epoch 2: accuracy did not improve from 0.49990
Epoch 3/5
Epoch 3: accuracy did not improve from 0.49990
Epoch 4/5
Epoch 4: accuracy did not improve from 0.49990
Epoch 5/5
Epoch 5: accuracy did not improve from 0.49990


<keras.src.callbacks.History at 0x78e26abebe80>

In [19]:
y_pred = model2.predict(x_test, batch_size=128)
y_pred_classes = (y_pred > 0.0).astype("int32")

correct_predictions = np.sum(y_pred_classes.flatten() == y_test)
total_predictions = len(y_pred_classes)

accuracy = correct_predictions / total_predictions * 100

print('Correct Predictions: {}'.format(correct_predictions))
print('Wrong Predictions: {}'.format(total_predictions - correct_predictions))
print('Accuracy: {:.2f}%'.format(accuracy))

Correct Predictions: 5004
Wrong Predictions: 4996
Accuracy: 50.04%


## Model 3

In [20]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Activation

EMBED_DIM = 32
model3 = Sequential()
model3.add(Embedding(total_words, EMBED_DIM, input_length=max_length))
model3.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model3.add(GlobalMaxPooling1D())
model3.add(Dense(32, activation='relu'))
model3.add(Dense(2, activation='softmax'))
model3.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
plot_model(model3, to_file='model3LSTM.png')
print(model3.summary())
checkpoint3 = ModelCheckpoint(
    'LSTM3_2.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 130, 32)           2955360   
                                                                 
 conv1d (Conv1D)             (None, 126, 64)           10304     
                                                                 
 global_max_pooling1d (Glob  (None, 64)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 2)                 66        
                                                                 
Total params: 2967810 (11.32 MB)
Trainable params: 2967810 (11.32 MB)
Non-trainable params: 0 (0.00 Byte)
______________

In [21]:
model3.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint3])

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.77127, saving model to LSTM3_2.h5
Epoch 2/5
Epoch 2: accuracy improved from 0.77127 to 0.91815, saving model to LSTM3_2.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.91815 to 0.97728, saving model to LSTM3_2.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.97728 to 0.99725, saving model to LSTM3_2.h5
Epoch 5/5
Epoch 5: accuracy improved from 0.99725 to 0.99977, saving model to LSTM3_2.h5


<keras.src.callbacks.History at 0x78e24f324ee0>

In [22]:
y_pred = np.argmax(model3.predict(x_test, batch_size=128), axis=1)

correct = np.sum(y_pred == y_test)
total = len(y_test)

print('Correct Predictions:', correct)
print('Wrong Predictions:', total - correct)
print('Accuracy:', correct / total * 100)

Correct Predictions: 8716
Wrong Predictions: 1284
Accuracy: 87.16000000000001


## Model4

In [23]:
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

model4 = Sequential()
model4.add(Embedding(total_words, EMBED_DIM, input_length=max_length))
model4.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model4.add(GlobalMaxPooling1D())
model4.add(Dense(64, activation='relu'))
model4.add(Dense(1, activation='sigmoid'))

model4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
checkpoint4 = ModelCheckpoint(
    'LSTM4_2.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)
model4.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint4])

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.78392, saving model to LSTM4_2.h5
Epoch 2/5
Epoch 2: accuracy improved from 0.78392 to 0.92390, saving model to LSTM4_2.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.92390 to 0.97948, saving model to LSTM4_2.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.97948 to 0.99733, saving model to LSTM4_2.h5
Epoch 5/5
Epoch 5: accuracy improved from 0.99733 to 0.99985, saving model to LSTM4_2.h5


<keras.src.callbacks.History at 0x78e24f252590>

In [24]:
y_pred = (model4.predict(x_test, batch_size = 128)> 0.5).astype("int32")

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 8714
Wrong Prediction: 1286
Accuracy: 87.14


## Model 5 BiLSTM

In [25]:
from tensorflow.keras.layers import Bidirectional, LSTM, Reshape

# Define the model
model5 = Sequential()
model5.add(Embedding(total_words, EMBED_DIM, input_length=max_length))
model5.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model5.add(GlobalMaxPooling1D())
model5.add(Reshape((1, 64)))  # Reshape the tensor to (batch_size, timesteps, input_dim)
model5.add(Bidirectional(LSTM(LSTM_OUT)))
model5.add(Dense(32, activation='relu'))
model5.add(Dense(2, activation='softmax'))

# Compile the model
model5.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
checkpoint5 = ModelCheckpoint(
    'LSTM5_2.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [26]:
# Fit the model
model5.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint5])

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.77660, saving model to LSTM5_2.h5
Epoch 2/5
Epoch 2: accuracy improved from 0.77660 to 0.93322, saving model to LSTM5_2.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.93322 to 0.98980, saving model to LSTM5_2.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.98980 to 0.99910, saving model to LSTM5_2.h5
Epoch 5/5
Epoch 5: accuracy improved from 0.99910 to 0.99998, saving model to LSTM5_2.h5


<keras.src.callbacks.History at 0x78e24f218af0>

In [27]:
y_pred = np.argmax(model5.predict(x_test, batch_size=128), axis=1)

correct = np.sum(y_pred == y_test)
total = len(y_test)

print('Correct Predictions:', correct)
print('Wrong Predictions:', total - correct)
print('Accuracy:', correct / total * 100)

Correct Predictions: 8741
Wrong Predictions: 1259
Accuracy: 87.41


# Load Saved Model

Load saved model and use it to predict a movie review statement's sentiment (positive or negative)

In [47]:
loaded_model = load_model('LSTM2_2.h5')

In [55]:
review ="Captivating performances and a gripping storyline make 'The Silence of the Lambs' a timeless classic. With masterful direction and intense suspense, it's a must-watch thriller that keeps you on the edge."

In [56]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  Captivating performances and a gripping storyline make The Silence of the Lambs a timeless classic With masterful direction and intense suspense its a mustwatch thriller that keeps you on the edge
Filtered:  ['captivating performances gripping storyline make the silence lambs timeless classic with masterful direction intense suspense mustwatch thriller keeps edge']


In [57]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[3857  275 3012  682   25    2 2982 8028 3653  270  411 4260  367 1448
   674  605  855 1182    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]]


In [58]:
result = loaded_model.predict(tokenize_words)
print(result)

[[1.]]


In [60]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

positive
