In [3]:
!pip install kaggle
import os
import json
import pandas as pd
import numpy as np



**DATA COLLECTION**

In [4]:
from zipfile import ZipFile
kaggle_dictionary=json.load(open('/content/kaggle (1).json'))
os.environ['kaggle_username']=kaggle_dictionary['username']
os.environ['kaggle_key']=kaggle_dictionary['key']
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
with ZipFile('imdb-dataset-of-50k-movie-reviews.zip','r') as zip_ref:
  zip_ref.extractall()

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 97% 25.0M/25.7M [00:01<00:00, 32.2MB/s]
100% 25.7M/25.7M [00:01<00:00, 20.8MB/s]


**DATASET PREPROCESSING**

In [5]:
import re
import nltk
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
data=pd.read_csv('/content/IMDB Dataset.csv')
def preprocess_text(text):
    # Lowercasing the text
    text = text.lower()

    # Remove HTML tags, URLs, and special characters
    text = re.sub(r'<.*?>', '', text)       # Remove HTML tags
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove special characters and numbers

    # Tokenization
    tokens = word_tokenize(text)

    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a single string
    return ' '.join(tokens)
data['cleaned_review'] = data['review'].apply(preprocess_text)

print(data[['review', 'cleaned_review']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one reviewer mentioned watching oz episode you...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically there family little boy jake think t...  
4  petter matteis love time money visually stunni...  


**IMPORTING LIBRARIES TO TRAIN LSTM MODEL**

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

**ENCODING THE LABELS**

In [8]:
data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)
data.head()

  data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...


**SPLITING AND TOKENIZING THE TEXT**

In [9]:
x=data['cleaned_review']
y=data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=data['sentiment'])
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=100, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=100, padding='post', truncating='post')




**BUILDING THE LSTM MODEL**

In [13]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




**TRAINING THE MODEL**

In [14]:
model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 263ms/step - accuracy: 0.6159 - loss: 0.6436 - val_accuracy: 0.7060 - val_loss: 0.6059
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 260ms/step - accuracy: 0.6817 - loss: 0.5944 - val_accuracy: 0.7749 - val_loss: 0.5167
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 261ms/step - accuracy: 0.7199 - loss: 0.5624 - val_accuracy: 0.6549 - val_loss: 0.6170
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 260ms/step - accuracy: 0.7972 - loss: 0.4582 - val_accuracy: 0.8253 - val_loss: 0.4029
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 257ms/step - accuracy: 0.8288 - loss: 0.3961 - val_accuracy: 0.8550 - val_loss: 0.3403


<keras.src.callbacks.history.History at 0x7ce40395a320>

**EVALUATING THE MODEL**

In [15]:
y_pred = (model.predict(X_test_pad) > 0.5).astype("int32")
print("LSTM Model Performance:\n")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 64ms/step
LSTM Model Performance:

              precision    recall  f1-score   support

    Negative       0.89      0.81      0.85      5000
    Positive       0.83      0.90      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



**PREDICTING SENTIMENT**

In [16]:
def predict_sentiment(review):
  # tokenize and pad the review
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=100)
  prediction = model.predict(padded_sequence)
  sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
  return sentiment

In [17]:
new_review = "what a great movie it was bad acting zero chemistry garbage script"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
The sentiment of the review is: negative


In [18]:
new_review = "Best movie with an excellent VFx and actors have shown their best. Iron man love you 3000 times"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
The sentiment of the review is: positive
