In [3]:
import torch
from torch import nn

In [4]:
class MyRnn(nn.Module):
  def __init__(self):
    super().__init__()
    self.e = nn.Embedding(5000, 256)
    self.rnn = nn.GRU(256, 512, 4)   # We can use RNN or GRU or LSTM
    self.linear = nn.Linear(512, 2)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    # x = l x B
    x_e = self.e(x)
    # x_e = l x B x 256
    _, h =self.rnn(x_e)
    last_h = h[-1]

    y = self.linear(last_h)
    y_s = self.sigmoid(y)

    return y_s


net = MyRnn()

x = torch.tensor([[0, 1, 2, 500, 45], [455, 89, 94, 322, 1000]]).T

y = net(x)
y.shape

torch.Size([2, 2])

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter

class IMDBDataSet(Dataset):

  def __init__(self):
    self.sentences = []
    self.labels = []
    self.word = set()
    self.word_to_index = {}
    self.index_to_word = {}

    with open('/content/drive/MyDrive/Colab Notebooks/IMDB-Dataset.csv', 'r', encoding='utf-8') as f:
      lines = f.readlines()
      for line in lines[1:]:
        split_line = line.strip().split(',')
        text = split_line[0]
        label = split_line[1]
        self.sentences.append(text)
        self.labels.append(label)

        # Accumulate words and build vocabulary (after preprocessing)
        words = text.lower().split()
        for w in words:
            self.word.add(w)

    # create the vocab
    self.word = ["<unk>", "<pad>"] + list(self.word)
    for i, word in enumerate(self.word):
      self.word_to_index[word] = i
      self.index_to_word[i] = word


  def __len__(self):
    return len(self.sentences)


  def tokenizer(self, sentence: str) -> torch.tensor:
    tokens = sentence.lower().split()
    indices = [self.word_to_index.get(token, 0) for token in tokens]
    return torch.tensor(indices)

  def convert_label(self, label: str) -> torch.tensor:
    if label == 'positive':
      return torch.tensor([1, 0])

    else:
      return torch.tensor([0, 1])

  def __getitem__(self, idx):
    input_tensor = self.tokenizer(self.sentences[idx])
    target_tensor = self.convert_label(self.labels[idx])

    return input_tensor, target_tensor

def pad_collate(batch):
  input_batch = [item[0] for item in batch]
  target_batch = [item[1] for item in batch]

  input_batch = _pad_sequences(input_batch)
  target_batch = torch.stack(target_batch)

  return input_batch, target_batch

def _pad_sequences(sequences, pad_value=1):
    max_len = max(len(seq) for seq in sequences)
    padded_seqs = []
    for seq in sequences:
      padding_len = max_len - len(seq)
      padded_seq = torch.cat([seq, torch.full((padding_len,), pad_value, dtype=torch.long)])
      padded_seqs.append(padded_seq)

    return torch.stack(padded_seqs)

imdb_data = IMDBDataSet()
loader = DataLoader(imdb_data, batch_size=32, shuffle=True, collate_fn=pad_collate)

for input_batch, target_batch in loader:
    print("Input Batch Shape:", input_batch.shape)
    print("Target Batch Shape:", target_batch.shape)
    break

Input Batch Shape: torch.Size([32, 171])
Target Batch Shape: torch.Size([32, 2])


In [19]:
from torch.optim import Adam

net = MyRnn()
opt = Adam(net.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()


for epoch in range(1000):
  for i, batch in enumerate(loader):

    inputs = batch[0]
    targets = batch[1]

    y = net(inputs)

    loss_value = loss_func(y, targets)

    loss_value.backward()

    opt.step()

IndexError: index out of range in self