In [37]:
import os

import numpy as np
import pandas as pd


In [38]:
# Google Colab alternative (dataset on a mounted Drive):
# data = pd.read_csv("/content/drive/MyDrive/Datasets/IMDB Dataset.csv")

In [39]:
# Load the IMDB reviews CSV. A forward-slash relative path resolves on
# Windows, Linux and macOS alike; the previous "\\" separator only worked
# on Windows.
data = pd.read_csv("trainingData/IMDB Dataset.csv")

In [40]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [41]:
data.shape

(50000, 2)

In [42]:
type(data)

pandas.core.frame.DataFrame

In [43]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [44]:
data["sentiment"].value_counts()


sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [45]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# Rebinding `data` (instead of inplace=True) keeps the cell re-runnable, and
# the explicit astype pins the label column to int64 rather than relying on
# how the installed pandas infers the dtype after `replace`. It also fails
# loudly if any label other than "positive"/"negative" is present.
data = data.replace({"sentiment": {"positive": 1, "negative": 0}}).astype(
    {"sentiment": "int64"}
)

In [46]:
data.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [47]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0
49999,No one expects the Star Trek movies to be high...,0


In [48]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [49]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [50]:
train_data.shape


(40000, 2)

In [51]:
test_data.shape

(10000, 2)

In [52]:
# Build the vocabulary from the training reviews only, so nothing from the
# test split influences the word index. num_words=5000 matches the
# Embedding layer's input_dim used later in the notebook.
tokenizer = Tokenizer(num_words=5000)
# fit_on_texts updates the tokenizer in place and returns None, so there is
# nothing worth assigning here.
tokenizer.fit_on_texts(train_data["review"])

In [53]:
# Step 1: turn every review into a list of word indices using the fitted
# tokenizer.
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# Step 2: force every sequence to exactly 200 tokens so the model receives a
# rectangular array (shorter reviews show up with leading zeros).
x_train = pad_sequences(train_sequences, maxlen=200)
x_test = pad_sequences(test_sequences, maxlen=200)

In [54]:
x_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]])

In [55]:
x_test

array([[   0,    0,    0, ...,  995,  719,  155],
       [  12,  162,   59, ...,  380,    7,    7],
       [   0,    0,    0, ...,   50, 1088,   96],
       ...,
       [   0,    0,    0, ...,  125,  200, 3241],
       [   0,    0,    0, ..., 1066,    1, 2305],
       [   0,    0,    0, ...,    1,  332,   27]])

In [56]:
# Targets for each split: the encoded sentiment column (1 = positive,
# 0 = negative), row-aligned with x_train / x_test.
y_train, y_test = train_data["sentiment"], test_data["sentiment"]

In [57]:
# LSTM MODEL BUILDING

In [58]:
# Import Input from the same `tensorflow.keras` namespace as the other
# layers; mixing standalone `keras` with `tensorflow.keras` objects can break
# on installations where the two are different packages.
from tensorflow.keras.layers import Input

model = Sequential()
# One padded review per sample: 200 token ids (matches maxlen=200 above).
model.add(Input(shape=(200,)))
# input_dim=5000 matches Tokenizer(num_words=5000); each id -> 128-d vector.
model.add(Embedding(input_dim=5000, output_dim=128))
# Single LSTM layer with dropout on both inputs and recurrent state.
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# Sigmoid unit outputs a score in (0, 1) for the positive class.
model.add(Dense(1, activation="sigmoid"))



In [59]:
model.summary()

In [60]:
# Single sigmoid output with 0/1 labels -> binary cross-entropy loss;
# accuracy is tracked as the reported metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)



In [61]:
# Train for 5 epochs in mini-batches of 64, with 20% of the training rows
# set aside for per-epoch validation.
model.fit(
    x=x_train,
    y=y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 307ms/step - accuracy: 0.7119 - loss: 0.5510 - val_accuracy: 0.8521 - val_loss: 0.3484
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 307ms/step - accuracy: 0.8466 - loss: 0.3637 - val_accuracy: 0.8146 - val_loss: 0.4071
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 299ms/step - accuracy: 0.8498 - loss: 0.3536 - val_accuracy: 0.8146 - val_loss: 0.4035
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 303ms/step - accuracy: 0.8737 - loss: 0.3054 - val_accuracy: 0.8583 - val_loss: 0.3497
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 299ms/step - accuracy: 0.8986 - loss: 0.2532 - val_accuracy: 0.8730 - val_loss: 0.3132


<keras.src.callbacks.history.History at 0x1fa164370e0>

In [62]:
# Make sure the output directory exists so a fresh checkout can run this
# cell without a manual mkdir.
os.makedirs("finalResult", exist_ok=True)
# Forward slash keeps the path portable; the previous "\\" separator only
# resolved to a sub-directory on Windows.
model.save("finalResult/model.h5")



In [63]:
import joblib

# Persist the fitted tokenizer next to the model so inference can reuse the
# exact same word index. Forward slash keeps the path portable (the previous
# "\\" separator only worked on Windows).
# NOTE: the .pkl is a pickle -- only load it back from a trusted location.
joblib.dump(tokenizer, "finalResult/tokenizer.pkl")

['finalResult\\tokenizer.pkl']

In [64]:
# Score the trained model on the held-out test split; the two returned
# values are the loss and the accuracy metric configured at compile time.
loss, accuracy = model.evaluate(x=x_test, y=y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 61ms/step - accuracy: 0.8821 - loss: 0.2933


In [65]:
print(loss)

0.2911872863769531


In [66]:
print(accuracy)

0.8831999897956848


In [67]:
# Building Predictive System

In [68]:
def predictive_system(reveiw, threshold=0.5):
    """Classify one review text as ``"positive"`` or ``"negative"``.

    Uses the notebook-level ``tokenizer``, ``pad_sequences`` and ``model``
    objects, so those cells must have been run first.

    Parameters
    ----------
    reveiw : str
        Raw review text. The (misspelled) parameter name is kept unchanged so
        existing keyword calls keep working.
    threshold : float, optional
        Decision cutoff applied to the model's sigmoid score. Scores strictly
        greater than ``threshold`` are labelled positive. Defaults to 0.5,
        the previously hard-coded value.

    Returns
    -------
    str
        ``"positive"`` or ``"negative"``.
    """
    # Wrap the text in a list: the tokenizer encodes a batch of texts.
    sequences = tokenizer.texts_to_sequences([reveiw])
    # Same fixed length (200) that was used to build x_train / x_test.
    padded_sequence = pad_sequences(sequences, maxlen=200)
    # verbose=0 silences the per-call progress bar that otherwise clutters
    # the output of every single prediction.
    prediction = model.predict(padded_sequence, verbose=0)
    # prediction holds one row with one score: take the scalar at [0][0].
    sentiment = "positive" if prediction[0][0] > threshold else "negative"
    return sentiment

In [69]:
predictive_system("This movie was so fantastic and everyone should watch this")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 314ms/step


'positive'

In [70]:
predictive_system("A thrilling adventure with stunning visual")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


'positive'