<a href="https://colab.research.google.com/github/ajdillhoff/CSE6363/blob/main/imdb-rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torchtext
import spacy
import torch
import torch.nn as nn
import torchtext.transforms as T
import torch.optim as optim
import pytorch_lightning as pl
import torch.nn.functional as F

# !pip install torchtext==0.12.0
# !pip install torchdata
# !pip install pytorch-lightning

print(torchtext.__version__)

0.12.0


In [77]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc


class RNN(pl.LightningModule):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 train_datapipe, val_datapipe, test_datapipe):
        super().__init__()
        
        # Required since our input vector represents each word as an index into
        # the vocabulary.
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # Creates an RNN using tanh by default.
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

        # LightningModule attributes
        self.lr = 1e-3
        self.batch_size = 32
        self.loss_fn = nn.BCEWithLogitsLoss()

        # Datasets
        self.train_datapipe = train_datapipe
        self.val_datapipe = val_datapipe
        self.test_datapipe = test_datapipe

    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)

        return self.fc(hidden.squeeze(0))

    def training_step(self, batch, batch_idx):
        input = torchtext.functional.to_tensor(batch["token_ids"], padding_value=1).cuda()
        target = torch.tensor(batch["target"], dtype=torch.float).cuda()
        output = self(input)
        loss = self.loss_fn(output.squeeze(), target)
        
        self.log("train_loss", loss)

        return loss
    
    def validation_step(self, batch, batch_idx):
        print("DEBUG")
        input = torchtext.functional.to_tensor(batch["token_ids"], padding_value=1).cuda()
        target = torch.tensor(batch["target"], dtype=torch.float).cuda()
        output = self(input).squeeze()
        loss = self.loss_fn(output, target)
        acc = binary_accuracy(output, target)
        
        self.log("val_loss", loss)
        self.log("val_acc", acc)

    def train_dataloader(self):
        loader = torch.utils.data.DataLoader(self.train_datapipe,
                                             batch_size=None,
                                             num_workers=8,
                                             shuffle=True)

        return loader

    def val_dataloader(self):
        loader = torch.utils.data.DataLoader(self.val_datapipe,
                                             batch_size=None,
                                             num_workers=8,
                                             shuffle=False)

        return loader

    def test_dataloader(self):
        loader = torch.utils.data.DataLoader(self.test_datapipe,
                                             batch_size=None,
                                             num_workers=8,
                                             shuffle=False)

        return loader

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=self.lr)

Our model works with numerical input. So, we'll need to convert each word into a corresponding one-hot vector based on the vocabulary of our dataset.

Luckily, `torchtext` makes this simple by providing `build_vocab_from_iterator`. All we need to do is supply the `datapipe` iterator and it builds a vocabulary for us.

In [60]:
tokenizer = torchtext.data.utils.get_tokenizer("spacy", language="en_core_web_sm")
max_tokens = 10000

def make_vocabulary():
    train_dataset = torchtext.datasets.IMDB(split="train")
    train_datapipe = train_dataset.map(lambda x: tokenizer(x[1]))
    v = torchtext.vocab.build_vocab_from_iterator(train_datapipe, specials=["<unk>"], max_tokens=max_tokens)
    v.set_default_index(0)

    return v

In [61]:
v = make_vocabulary()

To finish preparing the data, the labels `pos` and `neg` should be converted to numeric values as well. This can be done with `LabelToIndex`.

With the transforms in place, we can pass the `datapipe` to a PyTorch `DataLoader` object for use during training.

In [75]:
text_transform = T.Sequential(
    T.VocabTransform(v),
)
label_transform = T.LabelToIndex(label_names=["neg", "pos"])

train_dataset, test_dataset = torchtext.datasets.IMDB()

train_datapipe = train_dataset.map(lambda x: (text_transform(tokenizer(x[1])), label_transform(x[0])))

train_size = len(list(train_dataset))
train_idxs = torch.randperm(train_size)
val_start = train_size - int(train_size * 0.1)

def split_fn(x):
    return x['index'] >= val_start

train_datapipe, val_datapipe = train_datapipe.add_index().demux(num_instances=2, classifier_fn=split_fn)
train_datapipe = train_datapipe.batch(32)
train_datapipe = train_datapipe.rows2columnar(["token_ids", "target"])
val_datapipe = val_datapipe.batch(32)
val_datapipe = val_datapipe.rows2columnar(["token_ids", "target"])

test_datapipe = test_dataset.map(lambda x: (text_transform(tokenizer(x[1])), label_transform(x[0])))
test_datapipe = test_datapipe.batch(32)
test_datapipe = test_datapipe.rows2columnar(["token_ids", "target"])

Create our model

In [78]:
model = RNN(len(v), 100, 256, 1, train_datapipe, val_datapipe, test_datapipe)

In [79]:
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    mode="min"
)

trainer = pl.Trainer(accelerator="gpu", callbacks=[checkpoint_callback], max_epochs=5)
trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type              | Params
------------------------------------------------
0 | embedding | Embedding         | 1.0 M 
1 | rnn       | RNN               | 91.6 K
2 | fc        | Linear            | 257   
3 | loss_fn   | BCEWithLogitsLoss | 0     
------------------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.368     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

NotImplementedError: Caught NotImplementedError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 39, in fetch
    data = next(self.dataset_iter)
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torch/utils/data/_typing.py", line 366, in wrap_generator
    response = gen.send(None)
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torchdata/datapipes/iter/util/rows2columnar.py", line 53, in __iter__
    for batch in self.source_datapipe:
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torch/utils/data/_typing.py", line 366, in wrap_generator
    response = gen.send(None)
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torch/utils/data/datapipes/iter/grouping.py", line 90, in __iter__
    for x in self.datapipe:
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torch/utils/data/_typing.py", line 356, in __next__
    return next(self.iterator)
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torch/utils/data/datapipes/iter/combining.py", line 190, in get_generator_by_instance
    yield from self.main_datapipe.get_next_element_by_instance(self.instance_id)
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torch/utils/data/datapipes/iter/combining.py", line 301, in get_next_element_by_instance
    yield self._find_next(instance_id)
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torch/utils/data/datapipes/iter/combining.py", line 275, in _find_next
    value = next(self._datapipe_iterator)
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torch/utils/data/_typing.py", line 366, in wrap_generator
    response = gen.send(None)
  File "/home/alex/anaconda3/envs/cse6363/lib/python3.7/site-packages/torchdata/datapipes/iter/util/indexadder.py", line 68, in __iter__
    raise NotImplementedError("We only support adding index to row or batch in dict type")
NotImplementedError: We only support adding index to row or batch in dict type
