In [1]:
import pandas as pd
import numpy as np
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import sys
import keras

In [2]:
# Location of the raw IMDB reviews CSV (absolute path; adjust for your environment).
path = "/content/IMDB Dataset.csv"

# Read the CSV into a DataFrame; the bare name on the last line renders it as cell output.
data = pd.read_csv(filepath_or_buffer=path)
data


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
data.shape

(50000, 2)

In [4]:
# Count how many reviews carry each sentiment label.
data.loc[:, 'sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [5]:
# Tally missing values per column.
data.isna().sum()

Unnamed: 0,0
review,0
sentiment,0


In [7]:
import re
from bs4 import BeautifulSoup

def clean_review(text):
  """Strip HTML markup and punctuation from a raw review string.

  Args:
    text: Raw review text, possibly containing HTML tags such as ``<br />``.

  Returns:
    The review with tags removed, apostrophes dropped (so "there's" becomes
    "theres"), every other non-alphanumeric character replaced by a space,
    and runs of whitespace collapsed to single spaces.
  """
  # Use a space as the tag separator so words that are only separated by
  # markup ("great.<br /><br />But") are not glued together once tags are gone.
  text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
  # Drop straight and curly apostrophes outright so contractions stay one token.
  text = re.sub(r"['’]", '', text)
  # Replace the remaining punctuation with a space instead of nothing, so
  # "good,bad" stays two words rather than becoming "goodbad".
  text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
  # Collapse the extra whitespace introduced by the substitutions above.
  return re.sub(r'\s+', ' ', text).strip()

# Run the cleaner over every review and store the result back in the same column.
data['review'] = data['review'].map(clean_review)
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,Bad plot bad dialogue bad acting idiotic direc...,negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,Im going to have to disagree with the previous...,negative


In [8]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# The explicit cast keeps the column numeric on every pandas version; relying on
# replace() to downcast an object column to int is deprecated since pandas 2.2.
# The cast also fails loudly if an unexpected label is left in the column.
data = data.replace({'sentiment': {'positive': 1, 'negative': 0}}).astype({'sentiment': 'int64'})

# Persist the cleaned, encoded frame so the preprocessing can be reused later.
data.to_csv('preprocessed_data.csv', index = False)
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production The filming tech...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically theres a family where a little boy J...,0
4,Petter Matteis Love in the Time of Money is a ...,1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,Bad plot bad dialogue bad acting idiotic direc...,0
49997,I am a Catholic taught in parochial elementary...,0
49998,Im going to have to disagree with the previous...,0


In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Display the (rows, columns) dimensions of the training split.
train_data.shape

(40000, 2)

In [11]:
# Display the (rows, columns) dimensions of the test split.
test_data.shape

(10000, 2)

Tokenizing and padding the reviews

In [14]:
# Fit the vocabulary on the training reviews only (num_words caps the word
# indices that are kept when texts are converted to sequences).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['review'])

# Convert each review to a sequence of word indices, then pad/truncate to 200 entries.
X_train = pad_sequences(
    sequences=tokenizer.texts_to_sequences(train_data['review']),
    maxlen=200,
)
X_test = pad_sequences(
    sequences=tokenizer.texts_to_sequences(test_data['review']),
    maxlen=200,
)

In [15]:
# Targets are the sentiment column of each split.
y_train, y_test = train_data['sentiment'], test_data['sentiment']

In [16]:
# Sentiment classifier: token ids -> 128-d embeddings -> LSTM -> sigmoid probability.
# The sequence length is declared once via keras.Input; Embedding's `input_length`
# argument is deprecated in Keras 3 and was redundant here, so it is dropped.
# `keras` is already imported in the notebook's first cell.
model = Sequential([
    keras.Input(shape=(200,)),
    Embedding(input_dim=5000, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

In [17]:
# Print the model architecture. The stray empty tuple previously passed here
# landed in the `line_length` positional argument and has been removed.
model.summary()

In [18]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [19]:
# Train for 5 epochs in batches of 64, holding out 20% of the training data for validation.
model.fit(
    x=X_train,
    y=y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 450ms/step - accuracy: 0.7168 - loss: 0.5448 - val_accuracy: 0.8440 - val_loss: 0.3704
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 448ms/step - accuracy: 0.8374 - loss: 0.3842 - val_accuracy: 0.8465 - val_loss: 0.3668
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 444ms/step - accuracy: 0.8165 - loss: 0.4151 - val_accuracy: 0.5153 - val_loss: 0.7439
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 445ms/step - accuracy: 0.7376 - loss: 0.5012 - val_accuracy: 0.8518 - val_loss: 0.3742
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 445ms/step - accuracy: 0.8661 - loss: 0.3345 - val_accuracy: 0.8671 - val_loss: 0.3442


<keras.src.callbacks.history.History at 0x7e51a11a02e0>

In [21]:
# Score the trained model on the test split; unpack the returned loss and accuracy.
lossG, accuracyG = model.evaluate(x=X_test, y=y_test)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 102ms/step - accuracy: 0.8699 - loss: 0.3330


In [22]:
# Display the test-set loss returned by model.evaluate.
lossG

0.3338600993156433

In [23]:
# Display the test-set accuracy returned by model.evaluate.
accuracyG

0.8718000054359436

In [35]:
def predict_sentiment(text):
  """Classify a raw review string into one of three verdict labels.

  The text is first passed through ``clean_review`` so that inference sees the
  same preprocessing as the training data; without it, tokens containing
  apostrophes or HTML would not match the fitted vocabulary.

  Args:
    text: Raw review text.

  Returns:
    'W movie' when the predicted probability is above 0.75,
    'mid ass' when it is in (0.5, 0.75], and
    'worst ew movie....time' otherwise.
  """
  cleaned = clean_review(text)
  # Same tokenizer and padding length as used for the training sequences.
  sequence = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=200)
  predictionG = model.predict(sequence)
  # The model emits one sigmoid value per input row; we passed a single row.
  score = predictionG[0][0]

  if score > 0.75:
    return 'W movie'
  if score > 0.5:
    return 'mid ass'
  return 'worst ew movie....time'

In [45]:
# A deliberately lukewarm review used to exercise predict_sentiment.
example_review = "This movie falls squarely into the realm of mediocrity, delivering an experience that's neither particularly memorable nor completely unwatchable. The story has moments of promise but ultimately feels underdeveloped, with predictable plot points and a lack of emotional depth. While the performances are competent, they don’t elevate the material enough to leave a lasting impression. The cinematography and score are serviceable but unremarkable, failing to bring any unique flair to the film. It’s the kind of movie that’s fine to pass the time but unlikely to inspire a rewatch or much discussion afterward."
# Classify the review and report the returned label.
sentimentG = predict_sentiment(example_review)
print(f"the sentiment of the review is {sentimentG}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
the sentiment of the review is worst ew movie....time
