# Sentiment Analysis on IMDB Review Data USing LSTM

In [1]:
!pip install kaggle



Importing the dependencies

In [2]:
import os
import json

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential  #Neural network
from tensorflow.keras.layers import Dense,Embedding,LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


Data Collection KAggle api

In [3]:
kaggle_dict=json.load(open('kaggle.json'))

In [4]:
# setup kaggle creds as env variables
os.environ["KAGGLE_USERNAME"]=kaggle_dict['username']
os.environ["KAGGLE_KEY"]=kaggle_dict['key']

In [5]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews


Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
!ls

'IMDB Dataset.csv'			 kaggle.json
 imdb-dataset-of-50k-movie-reviews.zip	 sample_data


In [7]:
#unzip dataset
with ZipFile('imdb-dataset-of-50k-movie-reviews.zip','r') as zip_ref:
  zip_ref.extractall()

In [8]:
!ls

'IMDB Dataset.csv'			 kaggle.json
 imdb-dataset-of-50k-movie-reviews.zip	 sample_data


Loading Data

In [9]:
data=pd.read_csv('IMDB Dataset.csv')

In [10]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
data.shape

(50000, 2)

In [12]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [13]:
data.replace({'sentiment':{'positive':1,'negative':0}},inplace=True)

  data.replace({'sentiment':{'positive':1,'negative':0}},inplace=True)


In [14]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [15]:
#split data into training and testing data
train_data,test_data=train_test_split(data, test_size=0.2,random_state=42)

In [16]:
train_data.shape

(40000, 2)

In [17]:
test_data.shape

(10000, 2)

Data Preprocessing

In [18]:
#Tokenize text data
tokenizer = Tokenizer(num_words=5000) #Used to convert words or phrases to numbers
tokenizer.fit_on_texts(train_data['review']) #For no leakage
x_train = pad_sequences(tokenizer.texts_to_sequences(train_data['review']),maxlen=200) #to make sure all the input data are of the same length
x_test = pad_sequences(tokenizer.texts_to_sequences(test_data['review']),maxlen=200)

In [19]:
x_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]], dtype=int32)

In [20]:
x_test

array([[   0,    0,    0, ...,  995,  719,  155],
       [  12,  162,   59, ...,  380,    7,    7],
       [   0,    0,    0, ...,   50, 1088,   96],
       ...,
       [   0,    0,    0, ...,  125,  200, 3241],
       [   0,    0,    0, ..., 1066,    1, 2305],
       [   0,    0,    0, ...,    1,  332,   27]], dtype=int32)

In [21]:
y_train=train_data['sentiment']
y_test=test_data['sentiment']

In [22]:
y_train

Unnamed: 0,sentiment
39087,0
30893,0
45278,1
16398,0
13653,0
...,...
11284,1
44732,1
38158,0
860,1


**LSTM Model- Long Short Term Memory**
- Useful for sequential data

In [23]:
#build a model
model=Sequential()
model.add(Embedding(input_dim=5000,output_dim=128,input_length=200))
model.add(LSTM(128,dropout=0.2,recurrent_dropout=0.2)) #dropout is to prevent overfitting
model.add(Dense(1,activation='sigmoid'))



In [24]:
model.build(input_shape=(None,200))

In [25]:
model.summary()

In [26]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

TRaining model

In [27]:
model.fit(x_train,y_train,epochs=5,batch_size=64, validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 420ms/step - accuracy: 0.6984 - loss: 0.5610 - val_accuracy: 0.8378 - val_loss: 0.3782
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 417ms/step - accuracy: 0.8603 - loss: 0.3442 - val_accuracy: 0.7559 - val_loss: 0.4925
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 418ms/step - accuracy: 0.8484 - loss: 0.3549 - val_accuracy: 0.8661 - val_loss: 0.3222
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 420ms/step - accuracy: 0.8993 - loss: 0.2538 - val_accuracy: 0.8680 - val_loss: 0.3194
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 420ms/step - accuracy: 0.9149 - loss: 0.2217 - val_accuracy: 0.8755 - val_loss: 0.3225


<keras.src.callbacks.history.History at 0x7e11930a0b90>

Model Evaluation

In [28]:
loss, accuracy=model.evaluate(x_test,y_test)
print(f"Test Loss: {loss}")
print(f"Test accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 119ms/step - accuracy: 0.8783 - loss: 0.3184
Test Loss: 0.3131282329559326
Test accuracy: 0.8816999793052673


Building a predictive System

In [43]:
def predict_sentiment(review):
  #tokenize and pad the data
  sequence=tokenizer.texts_to_sequences([review])
  padded_sequence=pad_sequences(sequence,maxlen=200)
  prediction=model.predict(padded_sequence)
  sentiment="Positive" if prediction[0][0]>0.5 else 'Negative'
  return sentiment

In [44]:
#example usage
new_review="This movie is fantastic. i loved it"
sentiment=predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
The sentiment of the review is: Positive


In [45]:
#example usage
new_review="This movie was not that good"
sentiment=predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step
The sentiment of the review is: Negative
