In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split




In [2]:
# Only the first N_REVIEWS rows of the CSV are used in this notebook.
N_REVIEWS = 8000

# `nrows` makes pandas stop parsing after N_REVIEWS rows instead of loading the
# entire file and discarding the rest; the resulting rows match `.head(N_REVIEWS)`.
df = pd.read_csv('IMDB-Dataset.csv', nrows=N_REVIEWS)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Label Encoding: 'positive' -> 1, 'negative' -> 0.
SENTIMENT_TO_LABEL = {'positive': 1, 'negative': 0}

# Values that are not keys of the mapping (e.g. labels already encoded by a
# previous run of this cell) pass through unchanged, so re-running the cell
# does not turn the column into NaN the way a plain dict `.map` would.
df['sentiment'] = df['sentiment'].map(lambda value: SENTIMENT_TO_LABEL.get(value, value))

# Fail loudly on anything that is not a known label (including missing values)
# rather than letting it flow into training.
unexpected_labels = set(df['sentiment'].unique()) - set(SENTIMENT_TO_LABEL.values())
if unexpected_labels:
    raise ValueError(f"Unexpected sentiment labels: {unexpected_labels}")


In [4]:
# Pull the model inputs (raw review text) and the targets (encoded sentiment)
# out of the frame as arrays, in that order.
texts, labels = (df[column].values for column in ('review', 'sentiment'))

In [5]:
# Fixed seed so the 80/20 split -- and every metric computed from it -- is the
# same on each Restart & Run All.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

## Text Tokenization Parameters

In [6]:
# Hyperparameters shared by the tokenizer, the padding step and the embedding layer.
embedding_dim = 16    # length of the vector learned for each word
max_length = 200      # reviews are cut off after 200 words
vocab_size = 10_000   # only the top 10k words are considered


In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Build the word index from the training reviews only. Words that fall outside
# the vocabulary are encoded with the '<OOV>' placeholder token.
tokenizer = Tokenizer(
    oov_token='<OOV>',
    num_words=vocab_size,
)
tokenizer.fit_on_texts(X_train)


In [8]:
# Padding/truncation settings applied identically to both splits: short reviews
# are padded at the end and long ones are cut at the end, so every row ends up
# with exactly max_length entries.
pad_options = dict(maxlen=max_length, padding='post', truncating='post')

# Encode every review as a list of integer word ids
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Bring all sequences to a common length
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)



In [9]:
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense
from keras.utils import set_random_seed

# Seed the Python, NumPy and backend random generators so weight initialisation
# (and therefore the reported metrics) is repeatable on a fresh kernel.
set_random_seed(42)

# Model: look up a vector for every token id, average the vectors of a review
# into a single vector, then classify it with a small dense head.
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# Keras 3, and the sequence length is already supplied via `model.build` below.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid'),  # Binary classification: one sigmoid score per review
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.build(input_shape=(None, max_length))
model.summary()



In [10]:
# Train for 20 epochs in mini-batches of 128 samples.
# NOTE: the test split doubles as the validation set here, so the per-epoch
# val_* metrics are measured on the same data the final evaluation uses.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=128,
    epochs=20,
    validation_data=(X_test_pad, y_test),
)

Epoch 1/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5421 - loss: 0.6918 - val_accuracy: 0.6538 - val_loss: 0.6837
Epoch 2/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6769 - loss: 0.6778 - val_accuracy: 0.7050 - val_loss: 0.6513
Epoch 3/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7560 - loss: 0.6340 - val_accuracy: 0.7450 - val_loss: 0.5922
Epoch 4/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7824 - loss: 0.5600 - val_accuracy: 0.7844 - val_loss: 0.5256
Epoch 5/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8474 - loss: 0.4700 - val_accuracy: 0.8119 - val_loss: 0.4665
Epoch 6/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8694 - loss: 0.4001 - val_accuracy: 0.8244 - val_loss: 0.4202
Epoch 7/20
[1m50/50[0m [32m━━━━━━━━━━

In [11]:
# Evaluate the model on the test split. The model was compiled with a single
# metric, so the result unpacks into the loss followed by the accuracy.
test_metrics = model.evaluate(X_test_pad, y_test)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy}")


[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8633 - loss: 0.3607
Test Accuracy: 0.859375


In [13]:
# Example prediction: run one raw review through the same tokenize -> pad
# steps used for the training data, then score it with the model.
sample_review = ["This movie was fantastic! Absolutely loved it."]
sample_seq = tokenizer.texts_to_sequences(sample_review)
sample_pad = pad_sequences(
    sample_seq,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
pred = model.predict(sample_pad)

# One row per input review, one sigmoid output per row; scores above 0.5 are
# reported as positive.
score = pred[0][0]
if score > 0.5:
    print(f"Positive, Score: {score}")
else:
    print(f"Negative, Score: {score}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Positive, Score: 0.6963098049163818
