**Topic :** RNN for Sequence Classification <br>
**Author:** Arun Prakash A

Import necessary libraries

In [None]:
!pip install torchdata==0.6.0 # to be compatible with torch 2.0
!pip install portalocker==2.0.0

Collecting torchdata==0.6.0
  Downloading torchdata-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting torch==2.0.0 (from torchdata==0.6.0)
  Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.0->torchdata==0.6.0)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.0->torchdata==0.6.0)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os

#torch specific
import torch
import torch.nn as nn
import torch.nn.functional as F

#Data loader
from torch.utils.data import Dataset,DataLoader


#text lib
import torchtext

#fetch data
from torchtext.datasets import AG_NEWS

# tokenizer
from torchtext.data.utils import get_tokenizer

#build vocabulary
from torchtext.vocab import vocab
from torchtext.vocab import build_vocab_from_iterator

# get input_ids (numericalization)
from torchtext.transforms import VocabTransform

# get embeddings
from torch.nn import Embedding

# get rnn model and layers
from torch.nn import RNN, Linear, Sigmoid, Softmax

# optimizer
import torch.optim as optim

# utils
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# Loading data

In [None]:
# Fetch (or reuse) the AG_NEWS training split under ./data.
data_root = './data'
os.makedirs(data_root, exist_ok=True)
train_samples = AG_NEWS(root=data_root, split='train')

# Count the samples by iterating once, then peek at the first (label, text) pair.
num_train_samples = sum(1 for _ in train_samples)
print('Number of training samples: ', num_train_samples)
first_sample = next(iter(train_samples))
print('A sample: \n', first_sample)

Number of training samples:  120000
A sample: 
 (3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")


# Tokenization

In [None]:
# Word-level tokenizer built from torchtext's "basic_english" rules.
tokenizer = get_tokenizer(tokenizer="basic_english",language='en')

In [None]:
# Two toy sentences to show what the tokenizer produces.
text = [
  'This is called tokenization!',
  'this is not the best approach by the way',
]
token_list = list(map(tokenizer, text))
print(token_list)

[['this', 'is', 'called', 'tokenization', '!'], ['this', 'is', 'not', 'the', 'best', 'approach', 'by', 'the', 'way']]


In [None]:
# token iterator
def yield_tokens(corpus):
  '''
  Lazily tokenize a corpus of (label, sentence) pairs.

  Yields one list of tokens per sentence; the label is ignored.
  '''
  for _, sentence in corpus:
    yield tokenizer(sentence)

# Build Vocabulary

In [None]:
# Keep only tokens that occur at least 100 times, plus the two special tokens.
special_tokens = ['<pad>', '<unk>']
v = build_vocab_from_iterator(
  yield_tokens(train_samples),
  min_freq=100,
  specials=special_tokens,
)
# Every out-of-vocabulary (OOV) token is mapped to the index of '<unk>'.
unk_index = v['<unk>']
v.set_default_index(unk_index)

In [None]:
# Look up the vocabulary indices of two words.
print(v['deep'],v['learning'])

2162 4700


# Numericalization

In [None]:
vocab_transform = VocabTransform(v)

# Numericalize only the first training sample as a demonstration.
sample = next(iter(train_samples))
input_ids = vocab_transform(tokenizer(sample[1])) # sample[0] is the label, sample[1] the text
print(input_ids)

[432, 426, 2, 1606, 1, 114, 67, 3, 849, 14, 28, 15, 28, 16, 1, 4, 432, 375, 17, 10, 1, 7, 1, 4, 43, 4010, 784, 326, 2]


In [None]:
def get_input_ids(sample):
  '''
  Convert one (label, sentence) sample into a 1-D LongTensor of vocabulary indices.
  '''
  sentence = sample[1] # index 0 holds the label
  ids = vocab_transform(tokenizer(sentence))
  return torch.LongTensor(ids)

# Embeddings

In [None]:
# Small demo embedding table: one 6-dimensional vector per vocabulary entry,
# with index 0 declared as the padding index.
embedding = Embedding(num_embeddings = len(v),embedding_dim=6,padding_idx=0)

In [None]:
# Show the ids of the first training sample and the embedding vector looked up for each id.
sample = next(iter(train_samples))
input_ids = get_input_ids(sample)
print(input_ids)
print(embedding(input_ids))

tensor([ 432,  426,    2, 1606,    1,  114,   67,    3,  849,   14,   28,   15,
          28,   16,    1,    4,  432,  375,   17,   10,    1,    7,    1,    4,
          43, 4010,  784,  326,    2])
tensor([[ 2.3562, -0.5289, -0.7235,  0.1099,  1.5542, -0.0472],
        [-0.2429,  0.1881, -0.0528,  1.3146, -0.3414, -0.6835],
        [-0.3804, -0.3066,  1.0414,  0.5904, -1.4058, -0.5009],
        [ 1.1506, -1.3453, -1.3722,  0.1924, -0.6143,  2.3126],
        [ 1.2393, -1.5741, -0.1816, -0.0509, -0.4694, -0.8691],
        [ 0.0959, -0.2027, -1.1928,  0.7694,  0.1288,  0.6164],
        [-0.2400, -0.5430, -1.2325, -0.1354,  0.1389,  0.2343],
        [-0.4732, -0.4493,  1.0912,  0.2197,  0.4803,  0.2490],
        [-0.7445,  1.0203, -0.3015,  1.5908,  0.1232,  0.6188],
        [-0.5888,  2.6468,  0.1932,  0.8338, -0.3670,  0.1345],
        [ 0.5190,  1.4543, -1.9487, -0.4160,  0.3626, -0.0891],
        [ 0.2368,  0.1440,  0.5376,  0.1592, -0.7456,  1.1223],
        [ 0.5190,  1.4543, -1.948

# DataLoading

In [None]:
# Print the sequence length of the first few samples to show that lengths vary.
# A fixed count replaces the interactive prompt so the notebook can run unattended
# (e.g. Restart Kernel -> Run All).
num_preview_samples = 2
for idx, sample in enumerate(train_samples):
  if idx >= num_preview_samples:
    break
  input_ids = get_input_ids(sample)
  print(input_ids.shape)

torch.Size([29])
Continue?y
torch.Size([42])
Continue?n


* The sequence length differs from sample to sample.
* However, stacking sequences into a single batch tensor requires them all to have the same length.
* So we pad every sequence to the length of the longest sequence in its batch.

In [None]:
# Two sentences of different length, padded to a common length with index 0.
examples = [
  (1,'padding is necessary'),
  (4,'you know the reason right?'),
]
batch_input_ids = list(map(get_input_ids, examples))
padded_input_ids = pad_sequence(batch_input_ids, padding_value=0.0, batch_first=True)

In [None]:
# The shorter sequence is right-padded with 0 up to the length of the longer one.
print(padded_input_ids)

tensor([[   1,   22, 4425,    0,    0,    0],
        [ 166, 1200,    3, 2257,  480,   81]])


Define collate function to be passed to a DataLoader

 * The output of the function is a tuple containing `(label tensor, padded_seq, lengths of unpadded sequences)`
 * We will see later why the length information is required

In [None]:
def collate_function(batch_samples):
  '''
  Collate a list of raw samples into the tensors of one mini-batch.

  Input  : batch_samples - list of (label, sentence) pairs, labels starting from 1
  return : (labels, padded_input_ids, lengths)
           labels           - 1-D LongTensor of labels shifted to start from 0
           padded_input_ids - LongTensor (batch, longest sequence), right-padded with 0
           lengths          - list with the unpadded length of every sequence
  '''

  # numericalize every sentence once; the same tensors also provide the lengths below
  batch_input_ids = [get_input_ids(sample) for sample in batch_samples]

  # padded_seq: 0 is the padding index used by the embedding layers
  padded_input_ids = pad_sequence(batch_input_ids,batch_first=True,padding_value=0)

  # label tensor
  # -1 is added to make class num starting from 0, required for one-hot encoding
  labels = torch.tensor([sample[0]-1 for sample in batch_samples],dtype=torch.long)

  # lengths of unpadded seq, read from the id tensors so no sentence is tokenized twice
  lengths = [len(input_ids) for input_ids in batch_input_ids]

  return (labels,padded_input_ids,lengths)

In [None]:
# Run the collate function on a tiny hand-written batch to inspect its three outputs.
demo_batch = [
  (1,'this is great'),
  (2,'why is this taking such a long time?'),
]
label,sample,lengths = collate_function(demo_batch)
print('label tensor: \n ',label)
print('Padded sequence: \n',sample)
print('Actual lengths: ', lengths)

label tensor: 
  tensor([0, 1])
Padded sequence: 
 tensor([[  53,   22,  811,    0,    0,    0,    0,    0,    0],
        [1165,   22,   53,  608,  560,    6,  443,  102,   81]])
Actual lengths:  [3, 9]


# Create RNN model

* During the forward pass, the embedding layer takes in `padded_input_ids` and returns an embedding vector for every index, including the padding index (0)
* However, while instantiating the embedding layer, we tell it (via `padding_idx=0`) that the **0**s in `padded_input_ids` are just padding: the embedding vector at that index receives no gradient updates during training, and no error is raised

* Moreover, we use `pack_padded_sequence` in the forward method to avoid unnecessary computation for padded tokens. Please refer to the documentation to know how.

 * Some good discussions at : https://stackoverflow.com/questions/51030782/why-do-we-pack-the-sequences-in-pytorch

In [None]:
class RNNClassifier(torch.nn.Module):
    """Sequence classifier: embedding -> RNN -> linear layer on the final hidden state.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector (input size of the RNN).
        hidden_dim: size of the RNN hidden state.
        num_class: number of output logits per sequence.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super().__init__()
        # index 0 is the padding index of the embedding table
        self.embedding = Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = Linear(hidden_dim, num_class)

    def forward(self, x, lengths):
        """Return the class logits for a padded batch.

        Args:
            x: padded input ids, batch first.
            lengths: unpadded length of every sequence in the batch.

        Returns:
            The linear layer applied to the RNN's final hidden state. That state
            keeps its leading dimension, so callers index the classes on the
            last axis of a 3-D tensor.
        """
        embedded = self.embedding(x)
        # pack so the RNN skips the padded time steps of the shorter sequences
        packed = pack_padded_sequence(
            embedded,
            lengths=lengths,
            batch_first=True,
            enforce_sorted=False,
        )
        # the RNN returns (packed outputs for all time steps, final hidden state h_T)
        _, h_T = self.rnn(packed)
        return self.fc(h_T)  # logits

# Train the model

In [None]:
batch_size = 32
# collate_function turns each list of raw samples into (labels, padded ids, lengths).
dataloader = DataLoader(
  train_samples,
  batch_size=batch_size,
  shuffle=True,
  collate_fn=collate_function,
)

In [None]:
# Model hyperparameters
vocab_size = len(v)  # one embedding row per vocabulary entry
embedding_dim = 300
hidden_dim = 60
num_classes = 4

model = RNNClassifier(
  vocab_size=vocab_size,
  embed_dim=embedding_dim,
  hidden_dim=hidden_dim,
  num_class=num_classes,
)

In [None]:
# Move the model parameters to the selected device.
model = model.to(device)

In [None]:
# Smoke test: push the first mini-batch through the untrained model.
first_batch = next(iter(dataloader))
y, x, lengths = first_batch
print(y)
pred = model(x.to(device), lengths)
print('Logits: ',pred.squeeze())

tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2])
Logits:  tensor([[-0.5244, -0.1885,  0.4447,  0.0809],
        [-0.2737, -0.0047,  0.3406, -0.0866],
        [-0.6832,  0.0593,  0.4331,  0.0663],
        [-0.3094, -0.0596,  0.3702,  0.0261],
        [-0.6537, -0.0932,  0.1200, -0.0056],
        [-0.4887, -0.4386, -0.8866,  0.1748],
        [-0.2316,  0.1557,  0.3573, -0.1122],
        [-0.3872,  0.0068,  0.3042,  0.0156],
        [-0.7005,  0.1307,  0.4297,  0.0455],
        [-0.5245, -0.1886,  0.4448,  0.0808],
        [-0.6746, -0.0570,  0.4859,  0.0015],
        [-0.3526, -0.2338,  0.1421, -0.0042],
        [-0.6015,  0.3427,  0.1239, -0.1163],
        [-0.5070, -0.1952,  0.3351,  0.2003],
        [-0.5477,  0.0875,  0.2805,  0.0417],
        [-0.3720, -0.0570,  0.4535,  0.0223],
        [-0.5821, -0.0288,  0.4952,  0.0796],
        [-0.5940, -0.0900,  0.3068, -0.2462],
        [-0.3725, -0.1550,  0.1942,  0.1390],
    

In [None]:
# Cross-entropy on the raw logits as the training loss; SGD with momentum as the optimizer.
Loss = F.cross_entropy
optimizer = optim.SGD(
  model.parameters(),
  lr=0.001,
  momentum=0.9,
)

In [None]:
from torch.nn.functional import one_hot

log_every = 100  # report the averaged metrics every `log_every` mini-batches

for epoch in range(1):  # loop over the dataset multiple times

    running_loss = 0.0
    running_acc = 0.0
    for i, data in enumerate(dataloader, 0):
        # each batch produced by the collate function is (labels, padded input ids, lengths)
        labels, samples, lengths = data
        # one-hot float targets; .float() converts the existing tensor instead of
        # re-wrapping it in torch.tensor(), which raises a UserWarning
        labels_ohe = one_hot(labels, num_classes=num_classes).float().to(device)
        samples = samples.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        # the model returns logits with a leading dimension of size 1; drop only that
        # dimension so a batch holding a single sample keeps its batch dimension
        outputs = model(samples, lengths).squeeze(0)

        loss = Loss(outputs, labels_ohe)
        loss.backward()
        optimizer.step()

        # Loss
        running_loss += loss.item()

        # Accuracy: fraction of correct predictions in this batch, computed from the
        # actual batch size so a smaller final batch is not under-counted
        predictions = torch.argmax(outputs.detach().to('cpu'), dim=1)
        running_acc += (predictions == labels).float().mean().item()

        if i % log_every == log_every - 1:    # print every `log_every` mini-batches
            print('[%d, %5d] loss: %.3f Accuracy:%.3f' %
                  (epoch + 1, i + 1, running_loss / log_every, running_acc / log_every))
            running_loss = 0.0
            running_acc = 0.0


  labels_ohe = torch.tensor(one_hot(labels,num_classes=4),dtype=torch.float32)


[1,   100] loss: 1.108 Accuracy:0.533
[1,   200] loss: 0.929 Accuracy:0.598
[1,   300] loss: 0.890 Accuracy:0.625
[1,   400] loss: 1.191 Accuracy:0.510
[1,   500] loss: 0.934 Accuracy:0.606
[1,   600] loss: 0.839 Accuracy:0.660
[1,   700] loss: 0.791 Accuracy:0.682
[1,   800] loss: 0.744 Accuracy:0.712
[1,   900] loss: 0.798 Accuracy:0.678
[1,  1000] loss: 0.776 Accuracy:0.700
[1,  1100] loss: 0.749 Accuracy:0.693
[1,  1200] loss: 0.715 Accuracy:0.701
[1,  1300] loss: 0.730 Accuracy:0.712
[1,  1400] loss: 0.713 Accuracy:0.710
[1,  1500] loss: 0.683 Accuracy:0.733
[1,  1600] loss: 0.696 Accuracy:0.721
[1,  1700] loss: 0.689 Accuracy:0.730
[1,  1800] loss: 0.716 Accuracy:0.720
[1,  1900] loss: 0.744 Accuracy:0.709
[1,  2000] loss: 0.700 Accuracy:0.729
[1,  2100] loss: 0.760 Accuracy:0.712
[1,  2200] loss: 0.658 Accuracy:0.751
[1,  2300] loss: 0.741 Accuracy:0.711
[1,  2400] loss: 0.662 Accuracy:0.752
[1,  2500] loss: 0.760 Accuracy:0.716
[1,  2600] loss: 0.695 Accuracy:0.736
[1,  2700] l

# inference (fun)

In [None]:
# Sentence to classify; `text` was a list of sentences in the tokenization demo
# and is rebound to a single string here.
text = "all the focus is now on the biggest T20 league in the world "

In [None]:
def get_input_ids_inf(text):
  '''
  Numericalize a raw sentence for inference.

  Returns a LongTensor of shape (1, number of tokens): a batch holding one sequence.
  '''
  ids = torch.LongTensor(vocab_transform(tokenizer(text)))
  return ids.unsqueeze(0)

In [None]:
# Expect (1, number of tokens): one sentence with a leading batch dimension.
get_input_ids_inf(text).shape

torch.Size([1, 13])

In [None]:
# Inference runs on the CPU.
model_inference = model.to('cpu')
inference_ids = get_input_ids_inf(text)
inference_lengths = [len(tokenizer(text))]
with torch.inference_mode(True):
  logits = model_inference(inference_ids, inference_lengths)
  # the class scores are on the last of the three axes of the logits
  print(F.softmax(logits, dim=2))

tensor([[[0.0418, 0.6990, 0.0538, 0.2055]]])
