# To review Chapters 15 of Raschka book  and submit one jupyter notebook as a tutorial to implement and train RNNs to predict the sentiment of IMDb movie reviews

# Preparing the movie review data

In [1]:
# install torchtext
!pip install torch torchvision torchaudio
!pip install torchtext
!pip install portalocker

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Using cached torch-2.0.0-cp38-cp38-manylinux1_x86_64.whl (619.9 MB)
Collecting torchvision
  Using cached torchvision-0.15.1-cp38-cp38-manylinux1_x86_64.whl (33.8 MB)
Collecting torchaudio
  Using cached torchaudio-2.0.1-cp38-cp38-manylinux1_x86_64.whl (4.4 MB)
Collecting nvidia-cufft-cu11==10.9.0.58; platform_system == "Linux" and platform_machine == "x86_64"
  Using cached nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)
Collecting networkx
  Using cached networkx-3.1-py3-none-any.whl (2.1 MB)
Collecting nvidia-cuda-cupti-cu11==11.7.101; platform_system == "Linux" and platform_machine == "x86_64"
  Using cached nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
Collecting nvidia-curand-cu11==10.2.10.91; platform_system == "Linux" and platform_machine == "x86_64"
  Using cached nvidia_curand_cu11-10.2.10.91-py3-none-manylinux1_x86_64.whl (54.6 MB)

Installing collected packages: torchdata, torchtext
Successfully installed torchdata-0.6.0 torchtext-0.15.1
You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting portalocker
  Using cached portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.7.0
You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# verify installation
import torch
import torchtext
import portalocker

print(torch.__version__)
print(torchtext.__version__)
print(portalocker.__version__)

2.0.0+cu117
0.15.1+cpu
2.7.0


In [3]:
# import movie review data and split dataset
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split

# Step 1: load and create the datasets

train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')

torch.manual_seed(1)
train_dataset, valid_dataset = random_split(
    list(train_dataset), [20000, 5000])

Each set has 25000 samples, and each sample of the dataset has two elements.
- Sentiment label representing the target label to predict
- Movie review text

However, before feeding the data to an RNN (Recurrant Neural Network), several preprocessing steps are needed.
- Split training dataset into separate training and validation partitions
- Identify the unique words in the training dataset
- Map each unique word to a unique integer and encode the review text into encoded integers
- Divide dataset into mini-batches as input to the model

20000 examples are randomly chosen for training, and 5000 for validation.

Now to identify the unique tokens in the training dataset by using the Counter class from the collections package

In [4]:
## Step 2: find unique tokens (words)
import re
from collections import Counter, OrderedDict

token_counts = Counter()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized


for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)
 
    
print('Vocab-size:', len(token_counts))

Vocab-size: 69023


After verifying the number of unique tokens, each token can now be mapped to a unique integer.

In [5]:
## Step 3: encoding each unique token into integers
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)

vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 457]


From the demonstration above, the encoder works. Now the text_pipeline function can be defined to transform each text in the dataset accordingly and the label_pipeline function can convert each label to 1 or 0.

In [8]:
## Step 3-A: define the functions for transformation
import torch.nn as nn
device = torch.device("cuda:0")
# device = 'cpu'

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x == 'pos' else 0.


## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), 
                                      dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [9]:
# Take a small batch

from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)

tensor([[   35,  1739,     7,   449,   721,     6,   301,     4,   787,     9,
             4,    18,    44,     2,  1705,  2460,   186,    25,     7,    24,
           100,  1874,  1739,    25,     7, 34415,  3568,  1103,  7517,   787,
             5,     2,  4991, 12401,    36,     7,   148,   111,   939,     6,
         11598,     2,   172,   135,    62,    25,  3199,  1602,     3,   928,
          1500,     9,     6,  4601,     2,   155,    36,    14,   274,     4,
         42945,     9,  4991,     3,    14, 10296,    34,  3568,     8,    51,
           148,    30,     2,    58,    16,    11,  1893,   125,     6,   420,
          1214,    27, 14542,   940,    11,     7,    29,   951,    18,    17,
         15994,   459,    34,  2480, 15211,  3713,     2,   840,  3200,     9,
          3568,    13,   107,     9,   175,    94,    25,    51, 10297,  1796,
            27,   712,    16,     2,   220,    17,     4,    54,   722,   238,
           395,     2,   787,    32,    27,  5236,  

The sequences currently have different lengths. Although RNNs can handle sequences of different lengths, all the sequences in a mini-batch must have the same length in order to store them efficiently in a tensor.

PyTorch provides an efficient method, pad_sequence(), which will automatically pad the consecutive elements into a batch with placeholder values (0) so that all sequences within a batch will have the same shape.

To illustrate how padding words, take the first batch and print the sizes of the individual elements before combining into mini-batches.

As seen, the number of columns in the first batch is 218, which resulted from combining the first four examples into a single batch and using the maximum size of these examples. This means that the other three examples (165, 86, 145) are padded as much as necessary to match this size.

Now to divide all three datasets into data loaders with a batch size of 32.

In [10]:
## Step 4: batching the datasets

batch_size = 32  

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

# Embedding layers for sentence encoding
Although one-hot encoding can be used to convert indices of unique words to vectors, the amount of unique words (10^4 - 10^5) may result in the model suffering from the curse of dimensionality.

Therefore, a better approach is to map each oword to a vector of a fixed size with real-valued elements with finite-sized vectors to represent an infinite number of real numbers.

The advantages of embedding are as follows:
- Reduction in the dimensionality of the feature space
- Extraction of salient features

In [11]:
# create an embedding layer and apply to a batch of two samples
embedding = nn.Embedding(num_embeddings=10, 
                         embedding_dim=3, 
                         padding_idx=0)
 
# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[-0.4651, -0.3203,  2.2408],
         [ 0.3824, -0.3446, -0.3531],
         [-0.0251, -0.5973, -0.2959],
         [ 0.8356,  0.4025, -0.6924]],

        [[-0.0251, -0.5973, -0.2959],
         [ 0.9124, -0.4643,  0.3046],
         [ 0.3824, -0.3446, -0.3531],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


# Building an RNN model 
For the recurrent layers of the RNN, the following representations can be used:
- RNN: a regular RNN layer
- LSTM: a long short-term memory RNN, which is useful for capturing the long-term dependencies
- GRU: a recurrent layer with a gated recurrent unit

In [12]:
# create an RNN model with two recurrent layers of type RNN
# add a non-recurrent fully connected layer as output layer
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, 
                          hidden_size, 
                          num_layers=2, 
                          batch_first=True)
        #self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        #self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :]
        out = self.fc(out)
        return out

model = RNN(64, 32) 

print(model) 
 
model(torch.randn(5, 3, 64)) 

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


tensor([[-0.0113],
        [ 0.1886],
        [-0.0572],
        [-0.3209],
        [-0.0493]], grad_fn=<AddmmBackward0>)

# Building an RNN model for the sentiment analysis task
Now, an RNN model for sentiment analysis, starting with an embedding layer producing word embeddings of feature size 20 will be created. Then, a recurrent layer of type LSTM will be added. Finally, a fully connected layer as a hidden layer and another fully connected layer as an output layer will be added. A single class-membership probability value via the logistic sigmoid activation function will be returned.

In [13]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim, 
                                      padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
         
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)

In [14]:
# develop the train function to train model for one epoch
# return accuracy and loss

def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

# develop the evaluate function to measure model's performance
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [15]:
# loss function: binary cross-entropy
# optimizer: adam
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# train model for 10 epochs and display training and validation performances
num_epochs = 10 

torch.manual_seed(1)
 
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.9987 val_accuracy: 1.0000
Epoch 1 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 2 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 3 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 4 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 5 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 6 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 7 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 8 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 9 accuracy: 1.0000 val_accuracy: 1.0000


### More on the bidirectional RNN

In [19]:
# train the model for one epoch and return classification accuracy and loss
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim, 
                                      padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        _, (hidden, cell) = self.rnn(out)
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
    
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(69025, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)