<a href="https://colab.research.google.com/github/adnaen/machine-learning-notes/blob/main/imdb_dataset_loading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import re
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/IMDB Dataset.csv", nrows=5000)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
negative,2532
positive,2468


In [4]:
positive_df = df[df["sentiment"] == "positive"].sample(2000)
negative_df = df[df["sentiment"] == "negative"].sample(2000)

df = pd.concat([positive_df, negative_df], ignore_index=True)
df.shape

(4000, 2)

we know that, nn.RNN can't understand text. so we need to encode the text into VOCCAB index, lets create custome voccab storage

In [5]:
class Vocab:
    def __init__(self, VOCAB_SIZE: int = 10000) -> None:
        self.vocab: dict[str, int] = {"PAD": 0}
        self._current_value: int = 1
        self.VOCAB_SIZE = VOCAB_SIZE

    def add(self, sentence: list[str]) -> bool:

        if not isinstance(sentence, list):
            sentence = sentence.split()

        for token in sentence:
            if self._current_value + 1 >= self.VOCAB_SIZE:
                return False

            if self.is_exists(token):
                continue

            self.vocab[token] = self._current_value
            self._current_value += 1

        return True

    def is_exists(self, token: str) -> bool:
        return True if (
            token in self.vocab.keys()
        ) else (
            False
        )

    def __len__(self) -> int:
        return len(self.vocab.keys())

    def __getitem__(self, idx: str) -> int:
        if not self.is_exists(idx):
            return 0
        return self.vocab[idx]

    def __repr__(self) -> str:
        return str(self.vocab.items())

In [6]:
def clean_text(text: str) -> str:
    text = text.lower()
    return " ".join(re.findall("[a-z]+", text))

In [7]:
df["cleaned_review"] = df["review"].apply(clean_text)
df = df.drop(columns=["review"])
df.head()

Unnamed: 0,sentiment,cleaned_review
0,positive,this may just be the most nostalgic journey ba...
1,positive,what seemed at first just another introverted ...
2,positive,my wife is a mental health therapist and we wa...
3,positive,okay that was just brilliant i wish that the r...
4,positive,this is a really fun breezy light hearted roma...


In [8]:
vocab = Vocab()
for sentence in df["cleaned_review"]:
    vocab.add(sentence)

In [9]:
vocab # our own vocab!



In [10]:
def to_vocab_idx(text: str) -> torch.Tensor:
    if not isinstance(text, list):
        text = text.split()

    return torch.tensor([vocab[token] for token in text])

In [11]:
df["vocab_idx"] = df["cleaned_review"].apply(to_vocab_idx)
df.head()

Unnamed: 0,sentiment,cleaned_review,vocab_idx
0,positive,this may just be the most nostalgic journey ba...,"[tensor(1), tensor(2), tensor(3), tensor(4), t..."
1,positive,what seemed at first just another introverted ...,"[tensor(67), tensor(138), tensor(139), tensor(..."
2,positive,my wife is a mental health therapist and we wa...,"[tensor(231), tensor(256), tensor(232), tensor..."
3,positive,okay that was just brilliant i wish that the r...,"[tensor(297), tensor(101), tensor(243), tensor..."
4,positive,this is a really fun breezy light hearted roma...,"[tensor(1), tensor(232), tensor(19), tensor(11..."


In [13]:
df["y"] = df["sentiment"].map({"positive" : 1, "negative": 0})
df = df.drop(columns=["sentiment"])
df.head()

Unnamed: 0,cleaned_review,vocab_idx,y
0,this may just be the most nostalgic journey ba...,"[tensor(1), tensor(2), tensor(3), tensor(4), t...",1
1,what seemed at first just another introverted ...,"[tensor(67), tensor(138), tensor(139), tensor(...",1
2,my wife is a mental health therapist and we wa...,"[tensor(231), tensor(256), tensor(232), tensor...",1
3,okay that was just brilliant i wish that the r...,"[tensor(297), tensor(101), tensor(243), tensor...",1
4,this is a really fun breezy light hearted roma...,"[tensor(1), tensor(232), tensor(19), tensor(11...",1


In [21]:
x_train, x_test, y_train, y_test = train_test_split(
    df["vocab_idx"].values,
    torch.tensor(df["y"].values),
    test_size=0.2,
    random_state=2323,
    shuffle=True,
)

(x_train.shape, y_train.shape), (x_test.shape, y_test.shape)

(((3200,), torch.Size([3200])), ((800,), torch.Size([800])))

In [40]:
new_x_train = pad_sequence(x_train, padding_value=0).T
new_x_test = pad_sequence(x_test, padding_value=0).T

In [41]:
# prepare data with dataset and dataloader
class IMDBDataset(Dataset):
    def __init__(self, x: torch.Tensor, y: torch.Tensor) -> None:
        self.x = x
        self.y = y

    def __len__(self) -> int:
        return len(self.x)

    def __getitem__(self, idx: int) -> tuple:
        return self.x[idx], self.y[idx]

In [42]:
imdb_ds = IMDBDataset(x=new_x_train, y=y_train)
dataloader = DataLoader(dataset=imdb_ds, shuffle=True, batch_size=300)

In [46]:
len(dataloader)

# now data is ready with 11 batches

11

In [49]:
class IMDBMoviewReviewRNN(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_size: int) -> None:
        super().__init__()

        self.embedding = torch.nn.Embedding(
            num_embeddings=10000,
            embedding_dim=embedding_dim
            )
        self.rnn = torch.nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            )

        self.linear = torch.nn.Linear(hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        embeddings = self.embedding(x)
        op, hidden = self.rnn(embeddings)
        logits = self.linear(hidden)
        return logits

In [51]:
model = IMDBMoviewReviewRNN(embedding_dim=3, hidden_size=5)

epochs: int = 200
criterion = torch.nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters())

model.train()
for ep in range(epochs):
    for batch_x, batch_y in dataloader:
        y_pred = model(batch_x)
        cre

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)