**Preprocessing**

In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!unzip '/content/drive/MyDrive/dataset/IMDB Dataset.csv.zip' -d '/content/drive/MyDrive/df'

Archive:  /content/drive/MyDrive/dataset/IMDB Dataset.csv.zip
replace /content/drive/MyDrive/df/IMDB Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no


In [5]:
dataset = pd.read_csv('/content/drive/MyDrive/df/IMDB Dataset.csv')

# Clean the review texts
dataset['cleaned_review'] = dataset['review'].apply(clean_text)

# Encode labels: 'positive' -> 1, 'negative' -> 0
dataset['label'] = dataset['sentiment'].map({'positive': 1, 'negative': 0})

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
dataset.head(5000)

Unnamed: 0,review,sentiment,cleaned_review,label
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...,1
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,1
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,1
...,...,...,...,...
4995,An interesting slasher film with multiple susp...,negative,interesting slasher film multiple suspectsincl...,0
4996,i watched this series when it first came out i...,positive,watched series first came 70si 14 years old wa...,1
4997,Once again Jet Li brings his charismatic prese...,positive,jet li brings charismatic presence movie scree...,1
4998,"I rented this movie, after hearing Chris Gore ...",negative,rented movie hearing chris gore saying somethi...,0


**Tokenizer**

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters
vocab_size = 10000  # Adjust based on your dataset
max_length = 100    # Adjust based on your dataset

# Initialize tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(dataset['cleaned_review'])

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(dataset['cleaned_review'])

# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

**Neural Network**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# Define the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length),
    GRU(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model architecture
model.summary()

In [13]:
from sklearn.model_selection import train_test_split

# Features and labels
X = padded_sequences
y = dataset['label'].values

# Split the dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
history = model.fit(X, y, epochs=5,  verbose=1, validation_split=0.2)

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 9ms/step - accuracy: 0.5846 - loss: 0.6408 - val_accuracy: 0.8677 - val_loss: 0.3199
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.8973 - loss: 0.2680 - val_accuracy: 0.8735 - val_loss: 0.2985
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.9361 - loss: 0.1818 - val_accuracy: 0.8675 - val_loss: 0.3244
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 9ms/step - accuracy: 0.9563 - loss: 0.1273 - val_accuracy: 0.8676 - val_loss: 0.3874
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.9734 - loss: 0.0791 - val_accuracy: 0.8612 - val_loss: 0.4639


In [22]:
# Evaluate the model
loss, accuracy = model.evaluate(X, y)
print(f'Test Accuracy: {accuracy * 100:.2f}%')
print(f'Test Loss: {loss:.4f}')

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9823 - loss: 0.0707
Test Accuracy: 95.84%
Test Loss: 0.1350


In [15]:
for text in ["Storyline is rubbish but movie direction is best", "really bad", "It is good ", "Absolutely brilliant"]:
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    pad = pad_sequences(seq, padding='post')
    pred = model.predict(pad)[0][0]
    label = "positive" if pred > 0.5 else "negative"
    print(f"{text} → {label} ({pred:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Storyline is rubbish but movie direction is best → negative (0.06)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
really bad → negative (0.14)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
It is good  → positive (0.66)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Absolutely brilliant → positive (0.96)


# **Testing**

In [None]:
test_sentences = ["I was really looking forward to this amazing cast and hoping they would bring a fantastic performance and story... Honestly shocking! Why would they sign up for this utterly boring movie. There was no saving grace anywhere in those 2+ hrs.", "It was  good but not much",'What a film story']
test_seq = tokenizer.texts_to_sequences(test_sentences)
test_pad = pad_sequences(test_seq,padding='post')
predictions = model.predict(test_pad)

# Show results with positive/negative
for sentence, pred in zip(test_sentences, predictions):
    sentiment = "positive" if pred > 0.5 else "negative"
    print(f"{sentence} -> {sentiment} ({pred[0]:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 324ms/step
I was really looking forward to this amazing cast and hoping they would bring a fantastic performance and story... Honestly shocking! Why would they sign up for this utterly boring movie. There was no saving grace anywhere in those 2+ hrs. -> negative (0.01)
It was  good but not much -> negative (0.48)
What a film story -> negative (0.46)


**model saving**

In [16]:
model.save('/content/drive/MyDrive/dataset/IMDB_Dataset.keras')

In [17]:
import pickle

# Save tokenizer
with open('/content/drive/MyDrive/dataset/imdb.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [18]:
import tensorflow as tf
print(tf.__version__)


2.18.0
