In [173]:
'''
Aryaman Pandya 
Sequential Machine Learning 
Building a Vanilla RNN 
Model and trainer implementation 
Following https://github.com/rasbt/deeplearning-models/blob/master/pytorch_ipynb/rnn/rnn_bi_multilayer_lstm_own_csv_agnews.ipynb
implementation minus the memory unit for now 
'''
import torch 
import pandas as pd
import numpy as np
from torch.utils.data import random_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import plotly
from torchtext.datasets import AG_NEWS
from torch import nn
from torch.utils.data import Dataset, DataLoader

#Class definition of Vanilla RNN 
class VanillaRNN(nn.Module): 
    
    def __init__(self, vocab_size, embed_size, hidden_size, output_len, num_layers) -> None:
        super(VanillaRNN, self).__init__()
        
        self.encoder = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.hidden_size = hidden_size 
        self.output_len = output_len 
        
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers,
                                batch_first=True, bidirectional=True)
        
        self.hidden2label = nn.Linear(2*hidden_size, 4)
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropoutLayer = nn.Dropout(p=0.5)

    def forward(self, x, x_len):
        embedded = self.encoder(x)
        x_packed = nn.utils.rnn.pack_padded_sequence(embedded, x_len, batch_first=True, enforce_sorted=False)
        output, hidden = self.rnn(x_packed)  # Pass the initial hidden state 'h' to the RNN
        
        
        hidden = self.dropoutLayer(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        
        # Linear layer and softmax
        label_space = self.hidden2label(hidden)
        
        return label_space

    def init_h(self):
        return torch.zeros(1, self.hidden_size)

In [174]:
train_iter = AG_NEWS(split='train')

# Convert to list to enable random splitting
train_dataset = list(train_iter)

#80-20 train-val split 
train_size = int(len(train_dataset) * 0.8)  
val_size = len(train_dataset) - train_size  
train_data, val_data = random_split(train_dataset, [train_size, val_size])

tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

VOCAB_SIZE = 5000

# Build vocab based on the train_data
train_data_iter = (text for _, text in train_data)
vocab = build_vocab_from_iterator(yield_tokens(train_data_iter), specials=["<unk>"], max_tokens=VOCAB_SIZE)
vocab.set_default_index(vocab["<unk>"])

In [175]:
vocab_size = len(vocab)

In [176]:
print(vocab_size)

5000


Building the vocabulary builds a dictionary of the most frequently observed words. This dictionary however, is pretty meaningless- it doesn't encode any semantic information about the words and is a simple string to integer mapping for further processing. In our nn model, the encoder (nn.Embedding) takes these integers and maps them to a higher dimensional space in which semantics and meaning is embedded. For example synonyms would be close to one another in vector space. nn.Embedding learns a look-up table that takes in indices of words and returns the corresponding embedding vectors. 

In [177]:
vocab(['word', 'probably', 'unknown', 'gibberish'])

[2167, 1693, 4524, 0]

In [178]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

In [179]:
vocab.lookup_tokens([4999])

['viktor']

In [180]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    label_list, text_list, lengths = [], [], []
    
    # Sort the batch in the descending order
    batch.sort(key=lambda x: len(x[1]), reverse=True)
    
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
        
    label_list = torch.tensor(label_list, dtype=torch.int64)
    lengths = torch.tensor(lengths, dtype=torch.int64)
    
    # Pad sequences
    text_list = pad_sequence(text_list, batch_first=True)
    
    return label_list.to(device), text_list.to(device), lengths

In [181]:
train_loader = DataLoader(train_data, batch_size = 8, shuffle = True, collate_fn = collate_batch)
val_loader = DataLoader(val_data, batch_size = 8, shuffle = False, collate_fn = collate_batch)

In [182]:
batch = next(iter(train_loader))

# Inspect the shape of the input data
input_data = batch[1]  # Assuming the input data is the first element of the batch
input_shape = input_data.shape[0]

In [183]:
input_shape

8

In [184]:
print(batch[0])
print(batch[1])
batch[2]

tensor([3, 0, 0, 3, 1, 1, 1, 1], device='cuda:0')
tensor([[4118,  942,   10, 1394,    4,  388,   22, 2841,  425, 2252,   23,   73,
           13,   27,   14,   15,    5,  131,  328,   11,  213, 1576,   54, 4118,
           41,    5,  254,  252,    1,   27,    1,  229,    1, 2447,    1, 2438,
           80,    0,    1,  360, 2442, 2437, 2441,    0,    1, 1091,  253,   10,
           57,   26,   77,   88,   16,    9,   41,    5,  254,  252,    1,   27,
            1,  229,    1, 2447,    1, 2438,   80,    0,    1,  360, 2442, 2437,
         2441, 2650,    1, 1091,  253, 1394,  213,    0,   33,  388,   22, 2841,
          425, 2252,  181,    3,    0,    2,  247,   11, 3863,  388,    6,    5,
          381, 4118,  792,    1],
        [   0, 1011, 3210,    0,   76,    0,  394, 1470,    0,    0,    0,    3,
          298,    3, 2969,  110,    4,   30,    0,  242,  297,    3,  176,   34,
           48,   85,  865,   30,    0,   10,    2,    0,    4,    0,   24,    2,
          242,  528,    3

tensor([100,  46,  39,  35,  37,  38,  31,  34])

In [185]:
LEARNING_RATE = 1e-3
BATCH_SIZE = 128
NUM_EPOCHS = 50
DROPOUT = 0.5
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
BIDIRECTIONAL = True
HIDDEN_DIM = 256
NUM_LAYERS = 2
OUTPUT_DIM = 4

In [186]:
model = VanillaRNN(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, NUM_LAYERS)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [189]:
def train(model, train_loader, val_loader, loss_function, optim, epochs, device):
    losses = [] #group losses for loss visualization 
    running_loss = 0.0
    for epoch in range(epochs):
        model.train()
        print("Epoch %d / %d" % (epoch+1, epochs))
        print("-"*10)
    
        for i, batch_data in enumerate(train_loader):
            
            model.train()
            (y, x, x_size) = batch_data
            #print("Labels: {}, data: {}, x_size.cpu(): {}".format(batch_data[0], x.shape,x_size.cpu()))

            logits = model(x, x_size.cpu())
            #print("Target size: {}, pred_size: {}".format(y.size(), logits.size()))
            loss = loss_function(logits, y)
            optim.zero_grad()
            loss.backward()
            optim.step()
            
            running_loss += loss.item()
            losses.append(loss)

            if (i+1) % 1000 == 0:
                print("Step: {}, average training loss over last 2000 steps: {:.4f}".format(i+1, running_loss/1000))
                running_loss = 0.0
            
            model.eval()
            val_loss = 0.0
        
        with torch.no_grad():
            for i, batch_data in enumerate(val_loader):
                (y, x, x_size) = batch_data
                y, x, x_size = y.to(device), x.to(device), x_size.to(device)
                
                logits = model(x, x_size.cpu())
                loss = loss_function(logits, y)
                
                val_loss += loss.item()
        
        print("Epoch: {}, validation loss: {:.4f}".format(epoch+1, val_loss/len(val_loader)))

In [190]:
train(model, train_loader, val_loader, torch.nn.functional.cross_entropy, optimizer, NUM_EPOCHS, DEVICE)

Epoch 1 / 50
----------
Step: 1000, average training loss over last 2000 steps: 1.2775
Step: 2000, average training loss over last 2000 steps: 1.1867
Step: 3000, average training loss over last 2000 steps: 1.1150
Step: 4000, average training loss over last 2000 steps: 1.0906
Step: 5000, average training loss over last 2000 steps: 1.0556
Step: 6000, average training loss over last 2000 steps: 1.0363
Step: 7000, average training loss over last 2000 steps: 0.9761
Step: 8000, average training loss over last 2000 steps: 0.8901
Step: 9000, average training loss over last 2000 steps: 0.8625
