In [127]:
'''
Aryaman Pandya 
Sequential Machine Learning 
Building a Vanilla RNN 
Model and trainer implementation 
Following https://github.com/rasbt/deeplearning-models/blob/master/pytorch_ipynb/rnn/rnn_bi_multilayer_lstm_own_csv_agnews.ipynb
implementation minus the memory unit for now 
'''
import torch 
import pandas as pd
import numpy as np
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import plotly
from torchtext.datasets import AG_NEWS
from torch import nn
from torch.utils.data import Dataset, DataLoader

#Class definition of Vanilla RNN 
class VanillaRNN(nn.Module):
    """Bidirectional, multi-layer vanilla (Elman) RNN over token embeddings.

    Pipeline: embedding -> packed bidirectional ``nn.RNN`` -> dropout ->
    linear projection -> log-softmax. The projection is applied to every
    time step of every sequence.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Dimensionality of each token embedding.
        hidden_size: Hidden units per direction of the RNN.
        output_len: Number of output classes produced by the final layer.
        num_layers: Number of stacked RNN layers.
        dropout: Dropout probability used between stacked RNN layers and on
            the RNN output before the linear layer. Defaults to 0.5.
        padding_idx: Embedding index treated as padding (its vector is
            initialised to zeros and receives no gradient). Defaults to 0;
            callers should make sure this index really is a padding token
            in their vocabulary.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_len, num_layers,
                 dropout=0.5, padding_idx=0) -> None:
        super().__init__()

        self.encoder = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)
        self.hidden_size = hidden_size
        self.output_len = output_len

        self.rnn = nn.RNN(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            # Inter-layer dropout is a no-op (and triggers a warning) when
            # there is only a single layer, so disable it in that case.
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
            bidirectional=True,
        )

        # Forward and backward directions are concatenated -> 2 * hidden_size.
        # Project to ``output_len`` classes (previously hard-coded to 2).
        self.hidden2label = nn.Linear(2 * hidden_size, output_len)
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropoutLayer = nn.Dropout(dropout)

    def forward(self, x, text_len):
        """Run the network on a padded batch of token ids.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)`` holding token ids,
                padded on the right.
            text_len: True length of each sequence in ``x`` (tensor or list of
                ints, one entry per batch element, each >= 1). Does not need
                to be sorted.

        Returns:
            A tuple ``(output, hidden)`` where ``output`` has shape
            ``(batch * seq_len, output_len)`` and contains per-time-step
            log-probabilities, and ``hidden`` is the final hidden state
            returned by ``nn.RNN`` with shape
            ``(num_layers * 2, batch, hidden_size)``.
        """
        embedded = self.encoder(x)

        # pack_padded_sequence requires the lengths to live on the CPU.
        lengths = text_len.cpu() if torch.is_tensor(text_len) else text_len

        # The RNN is batch_first, so the packing must be told the same;
        # enforce_sorted=False lets callers pass batches in any order.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        packed_output, hidden = self.rnn(packed_embedded)

        # The RNN returns a PackedSequence for packed input; unpack it back
        # to a dense (batch, seq_len, 2 * hidden_size) tensor before reshaping.
        output, _ = nn.utils.rnn.pad_packed_sequence(
            packed_output, batch_first=True, total_length=x.size(1)
        )

        # Flatten the output tensor to match the linear layer input size
        output = output.contiguous().view(-1, 2 * self.hidden_size)

        # Apply dropout to the flattened output
        output = self.dropoutLayer(output)

        # Pass the output through the linear layer
        output = self.hidden2label(output)

        # Log-softmax over the class dimension (log-probabilities)
        output = self.softmax(output)

        return output, hidden

    def init_h(self):
        """Return a zero tensor of shape ``(1, hidden_size)``.

        Not used by ``forward`` (``nn.RNN`` defaults to a zero initial state).
        """
        return torch.zeros(1, self.hidden_size)

In [128]:
def yield_tokens(data_iter, tokenizer):
    """Lazily yield the tokenized text of each ``(label, text)`` sample.

    Args:
        data_iter: Iterable of 2-item samples; the first item is ignored.
        tokenizer: Callable applied to the second item of each sample.

    Yields:
        Whatever ``tokenizer`` returns for each sample's text.
    """
    for _label, raw_text in data_iter:
        tokens = tokenizer(raw_text)
        yield tokens

In [129]:
# Training split of AG_NEWS; consumed below as (label, text) samples.
train_iter = AG_NEWS(split="train")
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Yield the token list for each (label, text) sample in ``data_iter``.

    Uses the module-level ``tokenizer``. This one-argument version shadows
    the two-argument ``yield_tokens`` defined in an earlier cell.
    """
    for _label, raw_text in data_iter:
        yield tokenizer(raw_text)


VOCAB_SIZE = 5000
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<unk>"],
    max_tokens=VOCAB_SIZE,
)
# Tokens missing from the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])
#train_loader = DataLoader(train_iter, batch_size = 8, shuffle = True, collate_fn = collate_batch)

In [130]:
# Number of entries actually present in the vocabulary (capped by max_tokens when it was built).
vocab_size = len(vocab)

In [131]:
# Sanity check: the vocabulary should hold at most VOCAB_SIZE entries.
print(vocab_size)

5000


In [132]:
def text_pipeline(x):
    """Tokenize the raw string ``x`` and map each token to its vocabulary index."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a 1-based label ``x`` into a 0-based integer class id."""
    return int(x) - 1

In [133]:
# Sanity check: tokens missing from the vocabulary fall back to the default ("<unk>") index.
vocab(['testing', 'absolutegibberish', ',.,.'])

[1770, 0, 0]

In [134]:
def collate_batch(batch):
    """Collate ``(label, text)`` samples into flat tensors plus offsets.

    The texts are NOT padded: every sample's token ids are concatenated into
    one 1-D tensor, and ``offsets`` records where each sample starts in it.

    Args:
        batch: Iterable of ``(label, text)`` pairs.

    Returns:
        Tuple ``(labels, text, offsets)`` of int64 tensors, all moved to CUDA
        when available and to the CPU otherwise.
    """
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    labels = []
    encoded_texts = []
    # Leading 0 followed by each sample's length; the final length is dropped
    # below so the cumulative sum gives the start position of every sample.
    lengths_with_zero = [0]

    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        encoded_texts.append(token_ids)
        lengths_with_zero.append(token_ids.size(0))

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    start_offsets = torch.tensor(lengths_with_zero[:-1]).cumsum(dim=0)
    flat_text = torch.cat(encoded_texts)

    return (
        label_tensor.to(target_device),
        flat_text.to(target_device),
        start_offsets.to(target_device),
    )

In [135]:
# Shuffled loader over the training split; batches are assembled by collate_batch.
train_loader = DataLoader(
    train_iter,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_batch,
)

In [136]:
# Pull a single batch from the loader to inspect what collate_batch produces.
batch = next(iter(train_loader))

# collate_batch returns (labels, text, offsets), so index 1 is the text tensor.
input_data = batch[1]  # second element of the batch: the concatenated token ids
# Despite the name, this is the size of the first dimension (an int), not a shape tuple.
input_shape = input_data.shape[0]

In [137]:
# Number of token ids in the sampled batch (all samples concatenated by collate_batch).
input_shape

297

In [138]:
# Show the concatenated token ids of the sampled batch; out-of-vocabulary tokens appear as the default index.
print(batch[1])

tensor([   0, 3760,  806,  363,  252,  510,   16,    9,  433,   16,    9,    8,
         509,   16,    9, 3760,  806,    0,   11,   40,    0,    0, 3121,    1,
           0,    0,  474,  304, 2238,   41, 3174,    0, 3159,    5,    0,    0,
         674,   17,  287,    0, 3913, 1807,   80,  428,  672, 2015,   98,    5,
          16, 1450,   11,    2,  128,  488,   16,  281,  428,  212,    2, 2159,
          34,  394,    6, 3988,   24,   19,    5,   98,    6, 2572, 4754, 3789,
          32,    0,  145,    1,    0, 1587, 1793,    7,  127,  426,  351,    3,
        1147,   15,    0, 1709, 1587, 2218,   59,    7,    2,  127,  426,    6,
           2,  313, 1082,   57,  115,    3, 1117,    0,    3, 3878,    3, 3878,
           4,    0,    0,    6,    2, 3552, 1816,    1, 1587,    3,   75,  192,
        4757,    2,  353,   48,   97,    8, 4030, 2078, 1880, 3953,    2, 4350,
           7, 3644,    3,   35,    0,   24, 1390, 3389,    8,    5, 3578,    6,
           0, 4940,    1,    1,    1,   

In [139]:
# --- Training hyperparameters ---
LEARNING_RATE = 1e-3
BATCH_SIZE = 128  # NOTE(review): the train_loader built earlier uses batch_size=8, not this constant
NUM_EPOCHS = 50
DROPOUT = 0.5  # NOTE(review): not passed to the VanillaRNN constructor call below
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # first GPU when available, else CPU

# --- Model architecture ---
EMBEDDING_DIM = 128
BIDIRECTIONAL = True  # NOTE(review): not passed to the VanillaRNN constructor call below
HIDDEN_DIM = 256
NUM_LAYERS = 2
OUTPUT_DIM = 4  # presumably the number of AG_NEWS classes — TODO confirm

In [140]:
# Instantiate the network from the configuration constants defined above.
model = VanillaRNN(
    vocab_size=vocab_size,
    embed_size=EMBEDDING_DIM,
    hidden_size=HIDDEN_DIM,
    output_len=OUTPUT_DIM,
    num_layers=NUM_LAYERS,
)