Importing the Dependencies

In [1]:
import os
import json

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Data Collection - Kaggle API

In [3]:
kaggle_dictionary = json.load(open('kaggle.json'))

In [4]:
# setup kaggle credentials as environment variables
os.environ['KAGGLE_USERNAME'] = kaggle_dictionary['username']
os.environ['KAGGLE_KEY'] = kaggle_dictionary['key']

In [5]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
  0% 0.00/25.7M [00:00<?, ?B/s]
100% 25.7M/25.7M [00:00<00:00, 1.34GB/s]


In [6]:
!ls

imdb-dataset-of-50k-movie-reviews.zip  kaggle.json  sample_data


In [7]:
# unzip the dataset file
with ZipFile('imdb-dataset-of-50k-movie-reviews.zip', 'r') as zip_ref:
  zip_ref.extractall()

In [8]:
!ls # to see the extracted file

'IMDB Dataset.csv'			 kaggle.json
 imdb-dataset-of-50k-movie-reviews.zip	 sample_data


Loading the Dataset

In [9]:
data = pd.read_csv('IMDB Dataset.csv')
df_sampled = data.sample(n=5000, random_state=42).reset_index(drop=True)

In [10]:

df_sampled.shape

(5000, 2)

In [11]:
df_sampled.head()

Unnamed: 0,review,sentiment
0,I really liked this Summerslam due to the look...,positive
1,Not many television shows appeal to quite as m...,positive
2,The film quickly gets to a major chase scene w...,negative
3,Jane Austen would definitely approve of this o...,positive
4,Expectations were somewhat high for me when I ...,negative


In [12]:
df_sampled.tail() # for printing the last five rows

Unnamed: 0,review,sentiment
4995,One of eastwood's best movies after he had sep...,positive
4996,My blurred childhood memories have kept the ec...,negative
4997,I love Zombie-Movies and I love amateur-produc...,negative
4998,Chan is in New York and he gets involved with ...,positive
4999,My wife and I both thought this film a watered...,negative


In [13]:
df_sampled['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,2519
negative,2481


In [14]:
df_sampled.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace = True)

  df_sampled.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace = True)


In [15]:
df_sampled.head()

Unnamed: 0,review,sentiment
0,I really liked this Summerslam due to the look...,1
1,Not many television shows appeal to quite as m...,1
2,The film quickly gets to a major chase scene w...,0
3,Jane Austen would definitely approve of this o...,1
4,Expectations were somewhat high for me when I ...,0


In [16]:
# split data into training data and test data
train_data, test_data = train_test_split(df_sampled, test_size = 0.2, random_state = 42)

In [17]:
print(train_data.shape)
print(test_data.shape)

(4000, 2)
(1000, 2)


Data Pre-Processing

In [18]:
y_train = train_data['sentiment']
y_test = test_data['sentiment']

LSTM - Long Short Term Memory

In [19]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert.to(device)
bert.eval()

batch_size = 128
max_len = 200
texts = list(train_data['review'])
X_embed = []

for i in tqdm(range(0, len(texts), batch_size), desc="Embedding Batches"):
    batch_texts = texts[i:i+batch_size]
    inputs = tokenizer(batch_texts, return_tensors="pt", padding='max_length', truncation=True, max_length=max_len)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = bert(**inputs)
    batch_embeddings = outputs.last_hidden_state.cpu().numpy()  # shape: (batch_size, max_len, 768)
    X_embed.extend(batch_embeddings)

X_embed = np.array(X_embed)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Embedding Batches: 100%|██████████| 32/32 [01:06<00:00,  2.09s/it]


In [20]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense

model = Sequential()
model.add(LSTM(128, input_shape=(200, 768), dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

  super().__init__(**kwargs)


In [21]:
# trainig the model
model.fit(X_embed, y_train, epochs = 5, batch_size = 64, validation_split = 0.2)

Epoch 1/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 339ms/step - accuracy: 0.5967 - loss: 0.6626 - val_accuracy: 0.7713 - val_loss: 0.4900
Epoch 2/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 319ms/step - accuracy: 0.7966 - loss: 0.4439 - val_accuracy: 0.7850 - val_loss: 0.4613
Epoch 3/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 299ms/step - accuracy: 0.8253 - loss: 0.3957 - val_accuracy: 0.7400 - val_loss: 0.5376
Epoch 4/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 317ms/step - accuracy: 0.8331 - loss: 0.3958 - val_accuracy: 0.7837 - val_loss: 0.4722
Epoch 5/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 300ms/step - accuracy: 0.8496 - loss: 0.3453 - val_accuracy: 0.8425 - val_loss: 0.3999


<keras.src.callbacks.history.History at 0x7acf80e0d090>

Model Evaluation

In [22]:
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tqdm import tqdm

# Load model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = TFBertModel.from_pretrained("bert-base-uncased")
bert.trainable = False  # freeze BERT

# Ensure you use GPU if available
device = tf.device("/GPU:0" if tf.config.list_physical_devices("GPU") else "/CPU:0")

max_len = 200
test_texts = list(test_data['review'])
batch_size = 32

# Your test_texts is the list of sentences
x_test = []

with device:
    for i in tqdm(range(0, len(test_texts), batch_size), desc="Generating x_test"):
        batch_texts = test_texts[i:i + batch_size]

        # Tokenize batch
        inputs = tokenizer(
            batch_texts,
            padding='max_length',
            truncation=True,
            max_length=max_len,
            return_tensors='tf'
        )

        # Run BERT and extract embeddings
        outputs = bert(inputs['input_ids'], attention_mask=inputs['attention_mask'], training=False)
        last_hidden_state = outputs.last_hidden_state.numpy()  # (batch_size, 200, 768)

        x_test.append(last_hidden_state)

# Concatenate all batches to form final x_test
x_test = np.concatenate(x_test, axis=0)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [23]:
print("x_test shape:", x_test.shape)
print("x_test dtype:", type(x_test))
print("y_test shape:", y_test.shape)
print("y_test dtype:", type(y_test))

x_test shape: (1000, 200, 768)
x_test dtype: <class 'numpy.ndarray'>
y_test shape: (1000,)
y_test dtype: <class 'pandas.core.series.Series'>


In [24]:
print(type(x_test))         # Should be <class 'numpy.ndarray'>
print(np.array(x_test).shape)  # Should be (num_samples, 200, 768)


<class 'numpy.ndarray'>
(1000, 200, 768)


In [25]:
loss, accuracy = model.evaluate(x_test, y_test, batch_size=32)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {accuracy:.4f}")


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 146ms/step - accuracy: 0.8177 - loss: 0.4112
Test loss: 0.3896
Test accuracy: 0.8320


Building Predictive System

In [26]:
import numpy as np
import tensorflow as tf

def predict_sentiment(review):
    # Tokenize and encode the review using BERT tokenizer
    encoded = tokenizer(review,
                        return_tensors='tf',
                        padding='max_length',
                        truncation=True,
                        max_length=200)

    # Ensure tensors are placed on CPU
    with tf.device('/CPU:0'):
        outputs = bert_model(encoded['input_ids'], attention_mask=encoded['attention_mask'])
        embedding = outputs.last_hidden_state  # shape: (1, 200, 768)

        # Predict sentiment using the LSTM model
        prediction = model.predict(embedding)

    sentiment = 'positive' if prediction[0][0] > 0.5 else 'negative'
    return sentiment


In [None]:
# example usage

new_review = "The story was really inspiring for budding sportsmen who belong to middle class families."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")