In [3]:
import torch
from torch import nn

In [4]:
class MyRnn(nn.Module):
  def __init__(self):
    super().__init__()
    self.e = nn.Embedding(5000, 256)
    self.rnn = nn.GRU(256, 512, 4)   # We can use RNN or GRU or LSTM
    self.linear = nn.Linear(512, 2)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    # x = l x B
    x_e = self.e(x)
    # x_e = l x B x 256
    _, h =self.rnn(x_e)
    last_h = h[-1]

    y = self.linear(last_h)
    y_s = self.sigmoid(y)

    return y_s


net = MyRnn()

x = torch.tensor([[0, 1, 2, 500, 45], [455, 89, 94, 322, 1000]]).T

y = net(x)
y.shape

torch.Size([2, 2])

In [24]:
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

class IMDBDataSet(Dataset):
    def __init__(self, file_path: str, tokenizer_name: str = "bert-base-uncased"):
        self.sentences = []
        self.labels = []
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            for line in lines[1:]:  # Skip header
                split_line = line.strip().split(',')
                text = split_line[0]
                label = split_line[1]
                self.sentences.append(text)
                self.labels.append(label)

    def __len__(self):
        return len(self.sentences)

    def convert_label(self, label: str) -> torch.tensor:
        return torch.tensor([1, 0] if label == "positive" else [0, 1], dtype=torch.float)

    def __getitem__(self, idx):
        input_ids = self.tokenizer(
            self.sentences[idx],
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )["input_ids"].squeeze(0)

        input_ids[input_ids >= 5000] = 0

        target = self.convert_label(self.labels[idx])
        return input_ids, target



def pad_collate(batch):
    input_batch = [item[0] for item in batch]
    target_batch = [item[1] for item in batch]

    input_batch = torch.nn.utils.rnn.pad_sequence(input_batch, batch_first=True, padding_value=0)
    target_batch = torch.stack(target_batch)

    return input_batch, target_batch


file_path = "/content/drive/MyDrive/Colab Notebooks/IMDB-Dataset.csv"
imdb_data = IMDBDataSet(file_path)
loader = DataLoader(imdb_data, batch_size=128, shuffle=True, collate_fn=pad_collate)

for input_batch, target_batch in loader:
    print("Input Batch Shape:", input_batch.shape)
    print("Target Batch Shape:", target_batch.shape)
    break


Input Batch Shape: torch.Size([128, 128])
Target Batch Shape: torch.Size([128, 2])


In [27]:
from torch.optim import Adam

net = MyRnn()
opt = Adam(net.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()


for epoch in range(1000):
  sum_loss = 0
  for i, batch in enumerate(loader):

    inputs = batch[0]
    targets = batch[1]

    y = net(inputs)

    loss_value = loss_func(y, targets)

    loss_value.backward()

    opt.step()

    sum_loss += loss_value.item()
    print(f'Loss: {sum_loss / (i + 1)}')

Loss: 0.6953949332237244
Loss: 0.5772836953401566
Loss: 0.4978826542695363


KeyboardInterrupt: 