In [44]:
'''
Aryaman Pandya 
Sequential Machine Learning 
Building a Vanilla RNN 
Model and trainer implementation 
Following https://github.com/rasbt/deeplearning-models/blob/master/pytorch_ipynb/rnn/rnn_bi_multilayer_lstm_own_csv_agnews.ipynb
implementation minus the memory unit for now 
'''
import torch 
import pandas as pd
import numpy as np
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import plotly
from torchtext.datasets import AG_NEWS
from torch import nn
from torch.utils.data import Dataset, DataLoader

#Class definition of Vanilla RNN 
class VanillaRNN(nn.Module): 
    
    def __init__(self, vocab_size, embed_size, hidden_size, output_len, num_layers) -> None:
        super(VanillaRNN, self).__init__()
        
        self.encoder = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.hidden_size = hidden_size 
        self.output_len = output_len 
        
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers, dropout=0.5,
                                batch_first=True, bidirectional=True) #graph module to compute next hidden state 
        
        self.hidden2label = nn.Linear(2*hidden_size, 2)
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropoutLayer = nn.Dropout(p=0.5)

    def forward(self, x):
        embedded = self.encoder(x)
        output, hidden = self.rnn(embedded)  # Pass the initial hidden state 'h' to the RNN
        
        
        # Flatten the output tensor to match the linear layer input size
        output = output.contiguous().view(-1, 2 * self.hidden_size)
        
        # Apply dropout to the concatenated hidden state
        hidden = self.dropoutLayer(hidden)
        
        # Linear layer and softmax
        label_space = self.hidden2label(hidden)
        output_probs = self.softmax(label_space)
        
        return output_probs

    def init_h(self):
        return torch.zeros(1, self.hidden_size)

In [45]:
def yield_tokens(data_iter, tokenizer):
    for _, text in data_iter:
        yield tokenizer(text)

In [46]:
train_iter = (AG_NEWS(split="train"))
tokenizer = get_tokenizer("basic_english")
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

VOCAB_SIZE = 5000
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"], max_tokens=VOCAB_SIZE)
vocab.set_default_index(vocab["<unk>"])
#train_loader = DataLoader(train_iter, batch_size = 8, shuffle = True, collate_fn = collate_batch)

In [47]:
vocab_size = len(vocab)

In [48]:
print(vocab_size)

5000


In [49]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

In [50]:
vocab.lookup_tokens([4999])

['wreckage']

In [51]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    label_list, text_list, lengths = [], [], []
    
    # Sort the batch in the descending order
    batch.sort(key=lambda x: len(x[1]), reverse=True)
    
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
        
    label_list = torch.tensor(label_list, dtype=torch.int64)
    lengths = torch.tensor(lengths, dtype=torch.int64)
    
    # Pad sequences
    text_list = pad_sequence(text_list, batch_first=True)
    
    return label_list.to(device), text_list.to(device), lengths

In [52]:
train_loader = DataLoader(train_iter, batch_size = 8, shuffle = True, collate_fn = collate_batch)

In [53]:
batch = next(iter(train_loader))

# Inspect the shape of the input data
input_data = batch[1]  # Assuming the input data is the first element of the batch
input_shape = input_data.shape[0]

In [54]:
input_shape

8

In [55]:
print(batch[0])
print(batch[1])
batch[2]

tensor([3, 0, 2, 2, 3, 1, 1, 0])
tensor([[1700,    3, 4593, 1729,  155,    4, 1608,  812, 1700,    8, 4593, 1053,
            5,  194,  155,    4, 1608, 3948,  812, 4049,    2,  214,    4,    0,
          285, 1061,    4,  919,   11,   20,  201,  206,  366,  299,    1,   41,
         1465, 1542,    3, 1177, 1429, 1547,    3, 1539,    3, 1543, 2715, 1540,
           41,  164,    0,  163,   41, 1532],
        [   0,   16, 1933,    0, 1610,  351,    3, 1407,   15,    5,   98,    6,
            0,   21,  607,   30, 4156,  727,    4,  203,   22,  541, 2024, 2820,
           15,  664,    4,    2, 1731,    1,    0,    0,    0, 1923,   10,   40,
            0,    6,  100,    8,   22, 2396,    1,    1,    1,    0,    0,    0,
            0,    0,    0,    0,    0,    0],
        [   0,    0,  153, 1101,   10, 3560,  124, 4529,   13,   27,   14,   15,
         1146,  992,   41,  164,    1,    1,    1,   41,  163, 4749,  486,   11,
         3560, 1962,  747,    0, 1049,   30,    0,  782,  106,  3

tensor([54, 45, 46, 39, 30, 31, 29, 25])

In [56]:
LEARNING_RATE = 1e-3
BATCH_SIZE = 128
NUM_EPOCHS = 50
DROPOUT = 0.5
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
BIDIRECTIONAL = True
HIDDEN_DIM = 256
NUM_LAYERS = 2
OUTPUT_DIM = 4

In [57]:
model = VanillaRNN(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, NUM_LAYERS)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [58]:
def train(model, train_loader, loss_function, optim, epochs):
    losses = [] #group losses for loss visualization 
    
    for epoch in range(epochs):
        print("Epoch %d / %d" % (epoch+1, epochs))
        print("-"*10)
        model.train()
    
        for i, batch_data in enumerate(train_loader):
            
            (y, x, x_size) = batch_data
            print("Labels: {}, data: {}, x_size.cpu(): {}".format(batch_data[0], x.shape,x_size.cpu()))
            h_s = model.init_h() #initialize hidden state 
            x_packed = nn.utils.rnn.pack_padded_sequence(x, x_size.cpu(), batch_first=True)

            logits = model(x, x_size)
            loss = loss_function(logits, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss)

            if(i % 10):
              print("Step: {}/{}, current Epoch loss: {:.4f}".format(i, len(train_loader), loss))  

In [59]:
train(model, train_loader, torch.nn.functional.cross_entropy, optimizer, NUM_EPOCHS)

Epoch 1 / 50
----------
tensor([[ 853, 2106,    0,   59,    0,    3,  891,  402, 2729,  351,    3, 1147,
           13,   31,   14,   53,   11,  204, 1236,    3,    2, 1908,    0,    8,
            0,    3, 2613,    5,    0,    0,    1,  853, 2106, 4818,  435,    8,
            0,    4,  290,   32,    0,    1,    5,   85,   16,    9, 1039,    6,
         2896,    7, 3260,    0,   38,   66,    2, 3070,   55,  220,    5,    0,
            3,    0, 2434,    1, 2106,   35, 1226,    4, 1608,   59,    2,    0,
            8,  109,    5, 2729,  707,   10,  237, 2879,    3,    8,    0,  313,
          276,    0,    0,    0, 2034, 1323,    3,  389,    4,    2,    0,    6,
            2, 1908,    1],
        [ 552, 1761,   51,    1,    9,    1,    4, 1096, 1443, 4143,  381,   13,
           31,   14,   31,   15,   30,  476,    0,  820,   18,    0,  208, 4584,
           20, 1443, 4143, 1025,   26,   55,   49,   33, 4916, 1208,    4,  123,
            0,    3,    0,   49, 1707,    2,  689,    8, 



RuntimeError: `lengths` array must be sorted in decreasing order when `enforce_sorted` is True. You can pass `enforce_sorted=False` to pack_padded_sequence and/or pack_sequence to sidestep this requirement if you do not need ONNX exportability.