In [11]:
'''
Aryaman Pandya 
Sequential Machine Learning 
Building a Vanilla RNN 
Model and trainer implementation 
Following https://github.com/rasbt/deeplearning-models/blob/master/pytorch_ipynb/rnn/rnn_bi_multilayer_lstm_own_csv_agnews.ipynb
implementation minus the memory unit for now 
'''
import torch 
from torch.utils.data import random_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import plotly
from torchtext.datasets import AG_NEWS
from torch import nn
from torch.utils.data import DataLoader

#Class definition of Vanilla RNN 
class VanillaRNN(nn.Module): 
    
    def __init__(self, vocab_size, embed_size, hidden_size, output_len, num_layers) -> None:
        super(VanillaRNN, self).__init__()
        
        self.encoder = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.hidden_size = hidden_size 
        self.output_len = output_len 
        
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers,
                                batch_first=True, bidirectional=True)
        
        self.hidden2label = nn.Linear(2*hidden_size, 4)
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropoutLayer = nn.Dropout(p=0.5)

    def forward(self, x, x_len):
        embedded = self.encoder(x)
        x_packed = nn.utils.rnn.pack_padded_sequence(embedded, x_len, batch_first=True, enforce_sorted=False)
        output, hidden = self.rnn(x_packed)  # Pass the initial hidden state 'h' to the RNN
        print(hidden.shape)
        
        hidden = self.dropoutLayer(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        
        # Linear layer and softmax
        label_space = self.hidden2label(hidden)
        
        return label_space

In [12]:
train_iter = AG_NEWS(split='train')

# Convert to list to enable random splitting
train_dataset = list(train_iter)

#80-20 train-val split 
train_size = int(len(train_dataset) * 0.8)  
val_size = len(train_dataset) - train_size  
train_data, val_data = random_split(train_dataset, [train_size, val_size])

tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

VOCAB_SIZE = 5000

# Build vocab based on the train_data
train_data_iter = (text for _, text in train_data)
vocab = build_vocab_from_iterator(yield_tokens(train_data_iter), specials=["<unk>"], max_tokens=VOCAB_SIZE)
vocab.set_default_index(vocab["<unk>"])

In [13]:
vocab_size = len(vocab)

In [14]:
print(vocab_size)

5000


Building the vocabulary builds a dictionary of the most frequently observed words. This dictionary however, is pretty meaningless- it doesn't encode any semantic information about the words and is a simple string to integer mapping for further processing. In our nn model, the encoder (nn.Embedding) takes these integers and maps them to a higher dimensional space in which semantics and meaning is embedded. For example synonyms would be close to one another in vector space. nn.Embedding learns a look-up table that takes in indices of words and returns the corresponding embedding vectors. 

In [15]:
vocab(['word', 'probably', 'unknown', 'gibberish'])

[2272, 1627, 4631, 0]

In [16]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

In [17]:
vocab.lookup_tokens([4999])

['alabama']

In [18]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    label_list, text_list, lengths = [], [], []
    
    # Sort the batch in the descending order
    batch.sort(key=lambda x: len(x[1]), reverse=True)
    
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
        
    label_list = torch.tensor(label_list, dtype=torch.int64)
    lengths = torch.tensor(lengths, dtype=torch.int64)
    
    # Pad sequences
    text_list = pad_sequence(text_list, batch_first=True)
    
    return label_list.to(device), text_list.to(device), lengths

In [19]:
train_loader = DataLoader(train_data, batch_size = 8, shuffle = True, collate_fn = collate_batch)
val_loader = DataLoader(val_data, batch_size = 8, shuffle = False, collate_fn = collate_batch)

In [20]:
batch = next(iter(train_loader))

# Inspect the shape of the input data
input_data = batch[1]  # Assuming the input data is the first element of the batch
input_shape = input_data.shape[0]

In [21]:
input_shape

8

In [22]:
print(batch[0])
print(batch[1])

tensor([1, 2, 2, 0, 2, 2, 2, 1], device='cuda:0')
tensor([[ 337,   12,    9, 3831,    0,   42, 4623, 3831,    0,    3,    2,    0,
          177,  337,  113,    6,    2,  722,  347,  175,    3,   38,    0,  926,
           34,  199, 2729,    1,    2,  175,  441,   25,   30,  356,  532,    4,
          416,  136,    2,  288,  177,  337,  113,    7,    2,  381,    1,    0,
            0,    0],
        [1673, 1196,    4,    0,  685,   13,   27,   14,   27,   15,  402,   16,
            9, 1673, 1116,  435,  635,    1,  262,    0,    5,    0, 1930,  685,
           10,   56,    3, 4144,   30,    0, 1216,    3,   34,   59,   91,    0,
          460,  963,    5,  726,    3,    0,   39,    2,  133,  729,   11,  481,
          220,    1],
        [1065,    3,   59,    0,  150,  439,   23,   73,   13,   27,   14,   15,
          150,   77, 3551,  439,   10,   60,   19,  359, 3126,   17,  939,   91,
           11,   59,    8,    0,   95,    0,  749,  664,    3,  232,  348,  188,
          435, 

In [23]:
LEARNING_RATE = 1e-3
BATCH_SIZE = 128
NUM_EPOCHS = 50
DROPOUT = 0.5
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
BIDIRECTIONAL = True
HIDDEN_DIM = 256
NUM_LAYERS = 2
OUTPUT_DIM = 4

In [24]:
model = VanillaRNN(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, NUM_LAYERS)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
def train(model, train_loader, val_loader, loss_function, optim, epochs, device):
    losses = [] #group losses for loss visualization 
    running_loss = 0.0
    for epoch in range(epochs):
        model.train()
        print("Epoch %d / %d" % (epoch+1, epochs))
        print("-"*10)
    
        for i, batch_data in enumerate(train_loader):
            
            model.train()
            (y, x, x_size) = batch_data
            #print("Labels: {}, data: {}, x_size.cpu(): {}".format(batch_data[0], x.shape,x_size.cpu()))

            logits = model(x, x_size.cpu())
            #print("Target size: {}, pred_size: {}".format(y.size(), logits.size()))
            loss = loss_function(logits, y)
            optim.zero_grad()
            loss.backward()
            optim.step()
            
            running_loss += loss.item()
            losses.append(loss)

            if (i+1) % 1000 == 0:
                print("Step: {}, average training loss over last 2000 steps: {:.4f}".format(i+1, running_loss/1000))
                running_loss = 0.0
            
            model.eval()
            val_loss = 0.0
        
        with torch.no_grad():
            for i, batch_data in enumerate(val_loader):
                (y, x, x_size) = batch_data
                y, x, x_size = y.to(device), x.to(device), x_size.to(device)
                
                logits = model(x, x_size.cpu())
                loss = loss_function(logits, y)
                
                val_loss += loss.item()
        
        print("Epoch: {}, validation loss: {:.4f}".format(epoch+1, val_loss/len(val_loader)))

In [26]:
train(model, train_loader, val_loader, torch.nn.functional.cross_entropy, optimizer, NUM_EPOCHS, DEVICE)

Epoch 1 / 50
----------
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8, 256])
torch.Size([4, 8

KeyboardInterrupt: 