# Full RNN Classifier in Pytorch

## a. Process Dataset
1. Read Dataset
2. Tokenize
3. Build Vocab
4. Numericalize
5. Apply Transforms
6. Batching

## b. Define Model

## c.Train Model

In [1]:
import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
import torchtext.transforms as T
from torchtext.data.functional import to_map_style_dataset
import time

#### Build Iterator

In [2]:
#If reading directly from a file
# FILE_PATH = 'data/deu.txt'
# data_pipe = dp.iter.IterableWrapper([FILE_PATH])
# data_pipe = dp.iter.FileOpener(data_pipe, mode='rb')
# data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)

In [3]:
data_pipe=to_map_style_dataset(torchtext.datasets.AG_NEWS(split=('train')))

In [4]:
for la in data_pipe:
    print(la)
    break

(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")


#### Tokenize - from a sentence to a list of tokens

In [5]:
## lets Tokenize
import spacy
eng = spacy.load("en_core_web_sm")

In [6]:
def getTokens(text):
    return [token.text for token in eng.tokenizer(text)]
getTokens("Hi how are you")

['Hi', 'how', 'are', 'you']

#### Build Vocab
build_vocab_from_iterator needs an iterator that yields a list of tokens, so we need to build that iterator

In [7]:
def tokeniterator(data_pipe):
    for label,text in data_pipe:
        yield getTokens(text)

In [8]:
vocab=build_vocab_from_iterator(
    tokeniterator(data_pipe),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
vocab.set_default_index(vocab['<unk>'])

In [9]:
vocab(['here', 'is', 'an', 'example'])

[540, 27, 37, 6113]

In [10]:
len(vocab)

62544

#### Numericalize -  using vocab
So far our data_pipe is just an iterator that yields sentences, not even a list of tokens
We will now transform the data_pipe to yield a list of indices of the tokesn

In [11]:
def getIndices(sample):
    """
    Create transforms based on given vocabulary. The returned transform is applied to sequence
    of tokens.
    """
    text_tranform = T.Sequential(
        ## converts the sentences to indices based on given vocabulary
        T.VocabTransform(vocab=vocab),
        ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is
        # 1 as seen in previous section
        T.AddToken(1, begin=True),
        ## Add <eos> at beginning of each sentence. 2 because the index for <eos> in vocabulary is
        # 2 as seen in previous section
        T.AddToken(2, begin=False)
    )
    label,text=sample
    tokenized_text=getTokens(text)
    transformed_indices=text_tranform(tokenized_text)
    return transformed_indices,label

In [12]:
# data_pipe = data_pipe.map(getIndices)

In [13]:
# for sample in data_pipe:
#     print(sample)
#     break

## Important - Our sequences are of Variable Length and we are padding them using T.ToTensor(0)

#### Make Batches

In [14]:
def collate_fn(batch):
    indices_and_labels=[getIndices(sample) for sample in batch]
    tensors,targets=zip(*indices_and_labels)
    ## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
    # padding. Here, `0` is passed to the constructor to specify the index of the `<pad>` token in the
    # vocabulary.
    tensors=T.ToTensor(0)(list(tensors))
    # ADJUST LABELS FROM 1,2,3,4 to 0,1,2,3
    targets=[x-1 for x in targets]
    targets=T.ToTensor(0)(list(targets))
    return tensors,targets

In [15]:
BATCH_SIZE = 64  # batch size for training
train_iter=DataLoader(data_pipe,batch_size=BATCH_SIZE,collate_fn=collate_fn,shuffle=True,drop_last=True)

In [16]:
i=iter(train_iter)

In [17]:
next(i)

(tensor([[    1,    41, 10425,  ...,     0,     0,     0],
         [    1,  1548,  2782,  ...,   278,    58,     2],
         [    1,  3917,    78,  ...,     0,     0,     0],
         ...,
         [    1,  2570,  3728,  ...,     0,     0,     0],
         [    1,  1640,     7,  ...,     0,     0,     0],
         [    1,  8962,   259,  ...,     0,     0,     0]]),
 tensor([0, 0, 1, 1, 3, 3, 2, 0, 0, 0, 3, 0, 2, 2, 1, 2, 0, 3, 3, 3, 1, 0, 2, 1,
         2, 3, 0, 0, 2, 2, 1, 1, 0, 3, 0, 0, 3, 2, 1, 0, 0, 1, 1, 1, 1, 3, 2, 1,
         2, 0, 1, 2, 1, 2, 3, 1, 3, 0, 2, 3, 0, 0, 2, 2]))

# Define Model - Figure out initializing hidden state and general RNN architecture

In [18]:
import torch.nn as nn
import torch

In [19]:
class ProperRNN(nn.Module):
    def __init__(self,vocab_size,embed_dim,hidden_size,num_classes):
        super(ProperRNN,self).__init__()
        self.hidden_size=hidden_size
        self.embed=nn.Embedding(vocab_size,embed_dim)
        self.rnn=nn.RNN(embed_dim,hidden_size,batch_first=True)
        self.fc=nn.Linear(hidden_size,num_classes)

    def forward(self,x):
        x=self.embed(x)
        out,_=self.rnn(x,torch.randn(1*1,x.shape[0],self.hidden_size))
        out=out[:,-1,:]
        out=self.fc(out)
        return out
        

In [20]:
vocab_size=len(vocab)
embed_dim=300
hidden_size=10
num_classes=4

In [21]:
prnn=ProperRNN(vocab_size,embed_dim,hidden_size,num_classes)

In [22]:
o=prnn(next(i)[0])

In [23]:
o.shape

torch.Size([64, 4])

# Train the model

In [24]:
# Hyperparameters
EPOCHS = 10  # epoch
LR = 5  # learning rate



criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(prnn.parameters(), lr=LR)

In [25]:
prnn.train()
epochs=5
correct_preds, total_count = 0, 0
log_interval = 500
start_time = time.time()
for epoch in range(1,epochs+1):
    for idx,(tensors,targets) in enumerate(train_iter):
        optimizer.zero_grad()
        predicted_labels=prnn(tensors)
        loss=criterion(predicted_labels,targets)
        loss.backward()
        optimizer.step()
        correct_preds +=(predicted_labels.argmax(1)==targets).sum().item()
        total_count += targets.size(0)
        if idx%log_interval==0 and idx>0:
            elapsed=time.time() - start_time
            print(
            "| epoch {:3d} | {:5d}/{:5d} batches "
                    "| accuracy {:8.3f}".format(
                        ##### Weird - in order for len(train_iter) to work data_pipe must be converted using to_map_style_dataset
                        #### after which you cannot do data_pipe.map(func) even more weird
                        #### to circumvent the transform will need to go into the collate_fn
                        #### so all transform inside collate_fn and then len(dataloader) will also work as long as data_pipe was converted
                        #### to dataset using to_map_style_dataset
                        epoch, idx, len(train_iter), correct_preds / total_count
                    )
                )
            correct_preds, total_count = 0, 0
            start_time = time.time()

| epoch   1 |   500/ 1875 batches | accuracy    0.249
| epoch   1 |  1000/ 1875 batches | accuracy    0.252
| epoch   1 |  1500/ 1875 batches | accuracy    0.252
| epoch   2 |   500/ 1875 batches | accuracy    0.251
| epoch   2 |  1000/ 1875 batches | accuracy    0.248
| epoch   2 |  1500/ 1875 batches | accuracy    0.251
| epoch   3 |   500/ 1875 batches | accuracy    0.248
| epoch   3 |  1000/ 1875 batches | accuracy    0.247
| epoch   3 |  1500/ 1875 batches | accuracy    0.249
| epoch   4 |   500/ 1875 batches | accuracy    0.250
| epoch   4 |  1000/ 1875 batches | accuracy    0.252
| epoch   4 |  1500/ 1875 batches | accuracy    0.247
| epoch   5 |   500/ 1875 batches | accuracy    0.252
| epoch   5 |  1000/ 1875 batches | accuracy    0.251
| epoch   5 |  1500/ 1875 batches | accuracy    0.250
