# IMDb Setimient Analysis

## Introduction

In this notebook, I will be analyzing the IMDb dataset. The dataset contains movies reviews and their binary classification (positive or negative), and I will be using the dataset to train a model to predict the sentiment of a review. The dataset contains 50,000 reviews, and I will be using 25,000 reviews for training and 25,000 for testing.

## Installing the Required Libraries

In [1]:
!pip install torchtext==0.17.0 portalocker==2.8.2 torch==2.2.0



## Data Preparation

In [2]:
import torch
import torch.nn as nn
import os

In [3]:
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split


train_dataset = IMDB(split="train")
test_dataset = IMDB(split="test")

test_dataset = list(test_dataset)

torch.manual_seed(1)
train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])

In [4]:
import re
from collections import Counter, OrderedDict

token_counts = Counter()


def tokenizer(text):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text.lower())
    text = re.sub("[\W]+", " ", text.lower()) + " ".join(emoticons).replace("-", "")
    tokenized = text.split()
    return tokenized


for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)


print("Vocab-size:", len(token_counts))

Vocab-size: 69023


In [5]:
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)

vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

# print([vocab[token] for token in ['this', 'is', 'an', 'example']])

In [6]:
if not torch.cuda.is_available():
    print("Warning: this code may be very slow on CPU")

In [7]:
import torchtext

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

from torchtext import __version__ as torchtext_version
from pkg_resources import parse_version

if parse_version(torchtext.__version__) > parse_version("0.10"):
    label_pipeline = lambda x: (
        1.0 if x == 2 else 0.0
    )  # 1 ~ negative, 2 ~ positive review
else:
    label_pipeline = lambda x: 1.0 if x == "pos" else 0.0


def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [9]:
from torch.utils.data import DataLoader

batch_size = 32

train_dl = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)
valid_dl = DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)
test_dl = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)

In [10]:
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=0)

text_encoded_input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 0]])
print(embedding(text_encoded_input))

tensor([[[ 0.1187, -0.5282,  0.7039],
         [-0.8321, -0.4651,  0.3234],
         [-0.3531,  0.9124,  0.3710],
         [-0.3757,  0.7046, -0.7106]],

        [[-0.3531,  0.9124,  0.3710],
         [-0.1976,  0.5566,  0.0946],
         [-0.8321, -0.4651,  0.3234],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


## Building an RNN model for the sentiment analysis - Nonbidirectional

In [12]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(
            out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True
        )
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out


vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)

In [13]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)
    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)


def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item() * label_batch.size(0)
    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

In [14]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

torch.manual_seed(1)
if not os.path.exists(os.path.join("models", "imdb-rnn.pth")):
    for epoch in range(num_epochs):
        acc_train, loss_train = train(train_dl)
        acc_valid, loss_valid = evaluate(valid_dl)
        print(f"Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}")

Epoch 0 accuracy: 0.5724 val_accuracy: 0.6306
Epoch 1 accuracy: 0.6867 val_accuracy: 0.7486
Epoch 2 accuracy: 0.6998 val_accuracy: 0.7626
Epoch 3 accuracy: 0.7881 val_accuracy: 0.7780
Epoch 4 accuracy: 0.8448 val_accuracy: 0.8280
Epoch 5 accuracy: 0.8841 val_accuracy: 0.8114
Epoch 6 accuracy: 0.9048 val_accuracy: 0.8058
Epoch 7 accuracy: 0.9275 val_accuracy: 0.8562
Epoch 8 accuracy: 0.9427 val_accuracy: 0.8544
Epoch 9 accuracy: 0.9574 val_accuracy: 0.8606


In [15]:
if not os.path.exists("models"):
    os.makedirs("models")

if not os.path.exists(os.path.join("models", "imdb-rnn.pth")):
    torch.save(model.state_dict(), os.path.join("models", "imdb-rnn.pth"))

if os.path.exists(os.path.join("models, imdb-rnn.pth")):
    model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
    model.load_state_dict(torch.load(os.path.join("models", "imdb-rnn.pth")))
    model.eval()
    model = model.to(device)

In [16]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}') 

test_accuracy: 0.8580


## Building an RNN model for the sentiment analysis - Bidirectional

In [17]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim, 
                                      padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        _, (hidden, cell) = self.rnn(out)
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
    
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)

In [18]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

num_epochs = 10

torch.manual_seed(1)
if not os.path.exists(os.path.join("models", "imdb-rnn-bidirectional.pth")):
    for epoch in range(num_epochs):
        acc_train, loss_train = train(train_dl)
        acc_valid, loss_valid = evaluate(valid_dl)
        print(f"Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}")

Epoch 0 accuracy: 0.6521 val_accuracy: 0.7584
Epoch 1 accuracy: 0.7675 val_accuracy: 0.6946
Epoch 2 accuracy: 0.8300 val_accuracy: 0.8268
Epoch 3 accuracy: 0.8903 val_accuracy: 0.8388
Epoch 4 accuracy: 0.8902 val_accuracy: 0.8380
Epoch 5 accuracy: 0.9265 val_accuracy: 0.8434
Epoch 6 accuracy: 0.9526 val_accuracy: 0.8510
Epoch 7 accuracy: 0.9718 val_accuracy: 0.8736
Epoch 8 accuracy: 0.9857 val_accuracy: 0.8692
Epoch 9 accuracy: 0.9904 val_accuracy: 0.8632


In [19]:
if not os.path.exists("models"):
    os.makedirs("models")
    
if not os.path.exists(os.path.join("models", "imdb-rnn-bidirectional.pth")):
    torch.save(model.state_dict(), os.path.join("models", "imdb-rnn-bidirectional.pth"))

if os.path.exists(os.path.join("models, imdb-rnn.pth")):
    model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
    model.load_state_dict(
        torch.load(os.path.join("models", "imdb-rnn-bidirectional.pth"))
    )
    model.eval()
    model = model.to(device)

In [21]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}') 

test_accuracy: 0.8467
