<a href="https://colab.research.google.com/github/ajdillhoff/CSE6363/blob/main/imdb-rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import torchtext
import spacy
import torch
import torch.nn as nn
import torchtext.transforms as T
import torch.optim as optim
import pytorch_lightning as pl
import torch.nn.functional as F

# Colab setup: uncomment and run these on a fresh runtime before the imports.
# !pip install torchtext==0.12.0
# !pip install torchdata
# !pip install pytorch-lightning

# Show the installed torchtext version so it can be checked against the
# 0.12.0 pin in the install command above.
print(torchtext.__version__)

0.12.0


In [134]:
class RNN(pl.LightningModule):
    """Recurrent binary classifier packaged as a LightningModule.

    Token indices are embedded, fed through an ``nn.RNN`` and the final hidden
    state is projected by a linear layer to ``output_dim`` logits. The module
    also holds the train/test datapipes and serves them through the
    ``train_dataloader`` / ``test_dataloader`` hooks.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of logits produced per sequence.
        train_datapipe: Iterable yielding one ready-made batch per item; each
            batch is indexed with the keys ``"token_ids"`` and ``"target"``.
        test_datapipe: Same layout as ``train_datapipe``, used for testing.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 train_datapipe, test_datapipe):
        super().__init__()

        # Required since our input vector represents each word as an index into
        # the vocabulary.
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # Creates an RNN using tanh by default.
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

        # LightningModule attributes
        self.lr = 1e-3
        self.batch_size = 32
        self.loss_fn = nn.BCEWithLogitsLoss()

        # Datasets
        self.train_datapipe = train_datapipe
        self.test_datapipe = test_datapipe

    def forward(self, text):
        """Return logits for a batch of token-index sequences.

        Args:
            text: Integer tensor of token indices, batch dimension first
                (the RNN is built with ``batch_first=True``).

        Returns:
            The linear projection of the RNN's final hidden state.
        """
        embedded = self.embedding(text)
        # Only the final hidden state is used; per-step outputs are discarded.
        _, hidden = self.rnn(embedded)

        # `hidden` carries a leading layers*directions axis, which is 1 for
        # this single-layer, unidirectional RNN; drop it before projecting.
        return self.fc(hidden.squeeze(0))

    def training_step(self, batch, batch_idx):
        """Compute the BCE-with-logits loss for one pre-batched sample.

        Args:
            batch: Mapping with ``"token_ids"`` (ragged lists of token ids)
                and ``"target"`` (one label per sequence).
            batch_idx: Index of the batch (unused).

        Returns:
            The scalar training loss.
        """
        # Pad the ragged id lists into one rectangular tensor.
        # NOTE(review): padding_value=1 must be an index reserved for padding
        # in the vocabulary — confirm against the vocabulary's specials.
        token_ids = torchtext.functional.to_tensor(batch["token_ids"], padding_value=1)
        # The batch arrives as plain Python lists, so the tensors built here
        # are placed on the module's device explicitly.
        token_ids = token_ids.to(self.device)
        target = torch.tensor(batch["target"], dtype=torch.float, device=self.device)

        # Call this module itself rather than a notebook-global `model`, so the
        # step works no matter what the instance is named.
        logits = self(token_ids)
        # Squeeze only the trailing logit axis; a bare squeeze() would also
        # drop the batch axis when a batch holds a single sample.
        loss = self.loss_fn(logits.squeeze(-1), target)

        return loss

    def train_dataloader(self):
        """Wrap the training datapipe; batch_size=None disables re-batching."""
        loader = torch.utils.data.DataLoader(self.train_datapipe, batch_size=None)

        return loader

    # Validation is not wired up: no validation datapipe is passed to __init__.
    # def val_dataloader(self):
    #     loader = torch.utils.data.DataLoader(self.val_datapipe, batch_size=self.batch_size)

    #     return loader

    def test_dataloader(self):
        """Wrap the test datapipe; batch_size=None disables re-batching."""
        loader = torch.utils.data.DataLoader(self.test_datapipe, batch_size=None)

        return loader

    def configure_optimizers(self):
        """Optimize all parameters with Adam at learning rate ``self.lr``."""
        return optim.Adam(self.parameters(), lr=self.lr)


def binary_accuracy(preds, y):
    """
    Fraction of correct binary predictions in a batch.

    `preds` holds raw logits: each one is passed through a sigmoid and rounded
    to 0 or 1 before being compared with `y`. The result is a ratio (8 correct
    out of 10 gives 0.8), not a count.
    """
    hard_labels = torch.sigmoid(preds).round()
    # Boolean matches are cast to float so they can be summed and divided.
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

Our model works with numerical input. So, we'll need to convert each word into a corresponding integer index based on the vocabulary of our dataset; the model's embedding layer then maps each index to a dense vector.

Luckily, `torchtext` makes this simple by providing `build_vocab_from_iterator`. All we need to do is supply the `datapipe` iterator and it builds a vocabulary for us.

In [137]:
# Upper bound on the vocabulary size, handed to the vocabulary builder.
max_tokens = 2500
# spaCy-backed tokenizer using the small English pipeline.
tokenizer = torchtext.data.utils.get_tokenizer(
    "spacy",
    language="en_core_web_sm",
)

def make_vocabulary():
    """Build the vocabulary from the tokenized IMDB training split.

    Returns:
        A torchtext vocabulary capped at ``max_tokens`` entries. Index 0 is
        ``"<unk>"`` (also the default index for out-of-vocabulary tokens) and
        index 1 is ``"<pad>"``.
    """
    train_datapipe = torchtext.datasets.IMDB(split="train")
    # Each sample is a (label, text) pair; only the text feeds the vocabulary.
    # Map over the local datapipe: the global `train_dataset` is not defined
    # until a later cell, so referencing it fails on a fresh kernel.
    token_stream = train_datapipe.map(lambda x: tokenizer(x[1]))
    # Reserve index 1 for padding: the model's training_step pads batches with
    # padding_value=1, which would otherwise collide with a real token.
    v = torchtext.vocab.build_vocab_from_iterator(
        token_stream,
        specials=["<unk>", "<pad>"],
        max_tokens=max_tokens,
    )
    v.set_default_index(v["<unk>"])

    return v

In [138]:
# Build the vocabulary once; `v` is reused below by the text transform and to
# size the model's embedding table.
v = make_vocabulary()



To finish preparing the data, the labels `pos` and `neg` should be converted to numeric values as well. This can be done with `LabelToIndex`.

With the transforms in place, we can pass the `datapipe` to a PyTorch `DataLoader` object for use during training.

In [143]:
def _to_batched_columns(datapipe, batch_size=32):
    """Encode each (label, text) sample, batch it, and key the batch by column.

    Every sample becomes ``(token ids, label index)``; samples are grouped into
    batches of ``batch_size`` and each batch is converted to a mapping with the
    keys ``"token_ids"`` and ``"target"``.
    """
    encoded = datapipe.map(
        lambda x: (text_transform(tokenizer(x[1])), label_transform(x[0]))
    )
    return encoded.batch(batch_size).rows2columnar(["token_ids", "target"])


# Tokens -> vocabulary indices.
text_transform = T.Sequential(T.VocabTransform(v))
# String labels -> integer indices.
label_transform = T.LabelToIndex(label_names=["pos", "neg"])

train_dataset, test_dataset = torchtext.datasets.IMDB()

# The train and test splits go through the identical pipeline.
train_datapipe = _to_batched_columns(train_dataset)
test_datapipe = _to_batched_columns(test_dataset)

AttributeError: ignored

Create our model

In [140]:
# Embedding table sized to the vocabulary, 100-d embeddings, 256 hidden units
# and a single output logit.
model = RNN(
    input_dim=len(v),
    embedding_dim=100,
    hidden_dim=256,
    output_dim=1,
    train_datapipe=train_datapipe,
    test_datapipe=test_datapipe,
)

In [141]:
# Trainer with default settings. No dataloaders are passed to fit(), so the
# data comes from the dataloader hooks defined on the model.
trainer = pl.Trainer()
trainer.fit(model=model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /content/lightning_logs

  | Name      | Type              | Params
------------------------------------------------
0 | embedding | Embedding         | 250 K 
1 | rnn       | RNN               | 91.6 K
2 | fc        | Linear            | 257   
3 | loss_fn   | BCEWithLogitsLoss | 0     
------------------------------------------------
341 K     Trainable params
0         Non-trainable params
341 K     Total params
1.368     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
