In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, Bidirectional, LSTM, Dense, Dropout
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# 1. Load data
df = pd.read_csv('/kaggle/input/imdb-dataset/IMDB Dataset.csv')

In [12]:
# 2. Preprocess
max_vocab = 10000
max_length = 200

tokenizer = Tokenizer(num_words=max_vocab, oov_token='<OOV>')
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])
padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

label_map = {'positive': 1, 'negative': 0}
labels = df['sentiment'].map(label_map).values

# 3. Split
X_train, X_test, y_train, y_test = train_test_split(
    padded, labels, test_size=0.2, random_state=42
)

# 4. Build model with Bi-LSTM
model = Sequential([
    Embedding(input_dim=max_vocab, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print(model.summary())

# 5. Set up callbacks
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    ),
    ModelCheckpoint(
        'best_imdb_model.h5',
        monitor='val_loss',
        save_best_only=True
    )
]



None


In [13]:
# 6. Train
history = model.fit(
    X_train,
    y_train,
    epochs=12,
    batch_size=320,
    validation_split=0.2,
    callbacks=callbacks
)


Epoch 1/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 1s/step - accuracy: 0.5772 - loss: 0.6546 - val_accuracy: 0.7741 - val_loss: 0.4828
Epoch 2/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 1s/step - accuracy: 0.8308 - loss: 0.4163 - val_accuracy: 0.8284 - val_loss: 0.4075
Epoch 3/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 1s/step - accuracy: 0.8496 - loss: 0.3794 - val_accuracy: 0.8382 - val_loss: 0.3916
Epoch 4/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 1s/step - accuracy: 0.8747 - loss: 0.3197 - val_accuracy: 0.8509 - val_loss: 0.4053
Epoch 5/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 1s/step - accuracy: 0.8679 - loss: 0.3306 - val_accuracy: 0.8440 - val_loss: 0.3851
Epoch 6/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 1s/step - accuracy: 0.8935 - loss: 0.2805 - val_accuracy: 0.8478 - val_loss: 0.3894
Epoch 7/12
[1m100/100

In [14]:
# 7. Evaluate
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f'Test accuracy: {acc:.4f}')

Test accuracy: 0.8542


In [15]:
# 7. Save Trained Model
model.save('sentiment_rnn_model.h5')

In [16]:
# 8. Save Tokenizer
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [19]:
import tensorflow as tf
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the trained model and tokenizer
model = tf.keras.models.load_model('sentiment_rnn_model.h5')
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

max_length = 200

def predict_review_sentiment(review_text):
    seq = tokenizer.texts_to_sequences([review_text])
    padded_seq = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')
    prediction = model.predict(padded_seq)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    print(f"Prediction: {sentiment} (confidence: {prediction:.2f})")
    return sentiment, prediction

# Example usage:
example_review = "I really enjoyed this movie! The story was compelling and the acting was great."
predict_review_sentiment(example_review)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
Prediction: Positive (confidence: 0.97)


('Positive', 0.9706955)