<a href="https://colab.research.google.com/github/adnaen/machine-learning-notes/blob/main/deep_learning/5_rnn/imdb_movie_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import re
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/IMDB Dataset.csv", nrows=5000)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
negative,2532
positive,2468


In [6]:
positive_df = df[df["sentiment"] == "positive"].sample(2000)
negative_df = df[df["sentiment"] == "negative"].sample(2000)

df = pd.concat([positive_df, negative_df], ignore_index=True)
df.shape

(4000, 2)

we know that, nn.RNN can't understand text. so we need to encode the text into VOCCAB index, lets create custome voccab storage

In [7]:
class Vocab:
    def __init__(self, VOCAB_SIZE: int = 10000) -> None:
        self.vocab: dict[str, int] = {"PAD": 0}
        self._current_value: int = 1
        self.VOCAB_SIZE = VOCAB_SIZE

    def add(self, sentence: list[str]) -> bool:

        if not isinstance(sentence, list):
            sentence = sentence.split()

        for token in sentence:
            if self._current_value + 1 >= self.VOCAB_SIZE:
                return False

            if self.is_exists(token):
                continue

            self.vocab[token] = self._current_value
            self._current_value += 1

        return True

    def is_exists(self, token: str) -> bool:
        return True if (
            token in self.vocab.keys()
        ) else (
            False
        )

    def __len__(self) -> int:
        return len(self.vocab.keys())

    def __getitem__(self, idx: str) -> int:
        if not self.is_exists(idx):
            return 0
        return self.vocab[idx]

    def __repr__(self) -> str:
        return str(self.vocab.items())


In [8]:
def clean_text(text: str) -> str:
    text = text.lower()
    return " ".join(re.findall("[a-z]+", text))

In [9]:
df["cleaned_review"] = df["review"].apply(clean_text)
df = df.drop(columns=["review"])
df.head()

Unnamed: 0,sentiment,cleaned_review
0,positive,the movie was actually a romantic drama based ...
1,positive,okay first to anne rice book fans br br sure l...
2,positive,well here s another terrific example of awkwar...
3,positive,i m a year old college professor i went with m...
4,positive,sitting typing nothing is the latest what if f...


In [10]:
vocab = Vocab()
for sentence in df["cleaned_review"]:
    vocab.add(sentence)

In [12]:
len(vocab) # our own vocab!

9999

In [None]:
def to_vocab_idx(text: str) -> torch.Tensor:
    if not isinstance(text, list):
        text = text.split()

    return torch.tensor([vocab[token] for token in text])

In [None]:
df["vocab_idx"] = df["cleaned_review"].apply(to_vocab_idx)
df.head()

Unnamed: 0,sentiment,cleaned_review,vocab_idx
0,positive,a vow to cherish is a wonderful movie it s bas...,"[tensor(1), tensor(2), tensor(3), tensor(4), t..."
1,positive,i haven t seen this movie in about years but i...,"[tensor(93), tensor(94), tensor(56), tensor(95..."
2,positive,you the living br br mordant i ve never writte...,"[tensor(27), tensor(14), tensor(151), tensor(5..."
3,positive,this is a typical sandra bullock movie in whic...,"[tensor(92), tensor(5), tensor(1), tensor(306)..."
4,positive,one of the few comedic twilight zones that s a...,"[tensor(85), tensor(13), tensor(14), tensor(35..."


In [None]:
df["y"] = df["sentiment"].map({"positive" : 1, "negative": 0})
df = df.drop(columns=["sentiment"])
df.head()

Unnamed: 0,cleaned_review,vocab_idx,y
0,a vow to cherish is a wonderful movie it s bas...,"[tensor(1), tensor(2), tensor(3), tensor(4), t...",1
1,i haven t seen this movie in about years but i...,"[tensor(93), tensor(94), tensor(56), tensor(95...",1
2,you the living br br mordant i ve never writte...,"[tensor(27), tensor(14), tensor(151), tensor(5...",1
3,this is a typical sandra bullock movie in whic...,"[tensor(92), tensor(5), tensor(1), tensor(306)...",1
4,one of the few comedic twilight zones that s a...,"[tensor(85), tensor(13), tensor(14), tensor(35...",1


In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    df["vocab_idx"].values,
    torch.tensor(df["y"].values),
    test_size=0.2,
    random_state=2323,
    shuffle=True,
)

(x_train.shape, y_train.shape), (x_test.shape, y_test.shape)

(((3200,), torch.Size([3200])), ((800,), torch.Size([800])))

In [None]:
new_x_train = pad_sequence(x_train, padding_value=0).T
new_x_test = pad_sequence(x_test, padding_value=0).T

In [None]:
# prepare data with dataset and dataloader
class IMDBDataset(Dataset):
    def __init__(self, x: torch.Tensor, y: torch.Tensor) -> None:
        self.x = x
        self.y = y

    def __len__(self) -> int:
        return len(self.x)

    def __getitem__(self, idx: int) -> tuple:
        return self.x[idx], self.y[idx]

In [None]:
imdb_ds = IMDBDataset(x=new_x_train, y=y_train)
dataloader = DataLoader(dataset=imdb_ds, shuffle=True, batch_size=300)

In [None]:
len(dataloader) # now data is ready with 11 batches

11

In [None]:
class IMDBMoviewReviewRNN(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_size: int) -> None:
        super().__init__()

        self.embedding = torch.nn.Embedding(
            num_embeddings=10000,
            embedding_dim=embedding_dim
            )
        self.rnn = torch.nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            batch_first=True
            )

        self.linear = torch.nn.Linear(hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        embeddings = self.embedding(x)
        op, hidden = self.rnn(embeddings)
        hidden = hidden.squeeze(0)
        logits = self.linear(hidden)
        return logits

In [None]:
model = IMDBMoviewReviewRNN(embedding_dim=3, hidden_size=5)

epochs: int = 200
criterion = torch.nn.BCEWithLogitsLoss()
optim = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
model.train()
for ep in range(epochs):
    for batch_x, batch_y in dataloader:
        y_pred = model(batch_x)
        loss = criterion(y_pred, batch_y.reshape(-1, 1).float())
        optim.zero_grad()
        loss.backward()
        optim.step()
    if ep % 100 == 0:
        print(f"epoch: {ep}, loss : {loss.item()}")

epoch: 0, loss : 0.6901195049285889
epoch: 100, loss : 0.6933199167251587


In [None]:
def model_inference(review: str) -> str:
    vocab_idx = to_vocab_idx(text=review)
    model.eval()
    with torch.no_grad():
        pred = model(vocab_idx)

    return "Positive Review" if (
        torch.sigmoid(pred) >= 0.5
    ) else (
        "Negative Review"
    )


In [None]:
model_inference("wonderful")

'Positive Review'

In [None]:
model_inference("i didn't like the movie")

'Negative Review'