# Full RNN Classifier in Pytorch

## a. Process Dataset
1. Read Dataset
2. Tokenize
3. Build Vocab
4. Numericalize
5. Apply Transforms
6. Batching

## b. Define Model

## c.Train Model

In [1]:
import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
import torchtext.transforms as T
from torchtext.data.functional import to_map_style_dataset
import time
import torch
import torch.nn as nn

In [2]:
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
else:
    mps_device = torch.device("mps")

#### Build Iterator

In [3]:
#If reading directly from a file
# FILE_PATH = 'data/deu.txt'
# data_pipe = dp.iter.IterableWrapper([FILE_PATH])
# data_pipe = dp.iter.FileOpener(data_pipe, mode='rb')
# data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)

In [4]:
data_pipe=to_map_style_dataset(torchtext.datasets.AG_NEWS(split=('train')))

In [5]:
for la in data_pipe:
    print(la)
    break

(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")


#### Tokenize - from a sentence to a list of tokens

In [6]:
## lets Tokenize
import spacy
eng = spacy.load("en_core_web_sm")

In [7]:
def getTokens(text):
    return [token.text for token in eng.tokenizer(text)]
getTokens("Hi how are you")

['Hi', 'how', 'are', 'you']

#### Build Vocab
build_vocab_from_iterator needs an iterator that yields a list of tokens, so we need to build that iterator

In [8]:
def tokeniterator(data_pipe):
    for label,text in data_pipe:
        yield getTokens(text)

In [9]:
vocab=build_vocab_from_iterator(
    tokeniterator(data_pipe),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
vocab.set_default_index(vocab['<unk>'])

In [10]:
vocab(['here', 'is', 'an', 'example'])

[540, 27, 37, 6113]

In [11]:
len(vocab)

62544

#### Numericalize -  using vocab
So far our data_pipe is just an iterator that yields sentences, not even a list of tokens
We will now transform the data_pipe to yield a list of indices of the tokesn

In [12]:
def getIndices(sample):
    """
    Create transforms based on given vocabulary. The returned transform is applied to sequence
    of tokens.
    """
    text_tranform = T.Sequential(
        ## converts the sentences to indices based on given vocabulary
        T.VocabTransform(vocab=vocab),
        ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is
        # 1 as seen in previous section
        T.AddToken(1, begin=True),
        ## Add <eos> at beginning of each sentence. 2 because the index for <eos> in vocabulary is
        # 2 as seen in previous section
        T.AddToken(2, begin=False)
    )
    label,text=sample
    tokenized_text=getTokens(text)
    transformed_indices=text_tranform(tokenized_text)
    return transformed_indices,label

In [13]:
# data_pipe = data_pipe.map(getIndices)

In [14]:
# for sample in data_pipe:
#     print(sample)
#     break

## Important - Our sequences are of Variable Length and we are padding them using T.ToTensor(0)

#### Make Batches

In [13]:
def collate_fn(batch):
    indices_and_labels=[getIndices(sample) for sample in batch]
    tensors,targets=zip(*indices_and_labels)
    ## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
    # padding. Here, `0` is passed to the constructor to specify the index of the `<pad>` token in the
    # vocabulary.
    tensor_lengths=torch.tensor([len(t) for t in tensors])
    tensors=T.ToTensor(0)(list(tensors))
    # ADJUST LABELS FROM 1,2,3,4 to 0,1,2,3
    targets=[x-1 for x in targets]
    targets=T.ToTensor(0)(list(targets))
    return tensors,tensor_lengths,targets

In [16]:
inputs,lengths,targets=collate_fn(
    [
        [1,"i am sports op"],
        [2,"i am news"]
    ]
)
inputs,lengths,targets

(tensor([[    1,  7963,  3194,  1682, 17642,     2],
         [    1,  7963,  3194,   256,     2,     0]]),
 tensor([6, 5]),
 tensor([0, 1]))

In [34]:
hidden_size=10
rn=nn.LSTM(1,hidden_size,batch_first=True)

In [35]:
inputs=torch.tensor(inputs,dtype=torch.float32)

  inputs=torch.tensor(inputs,dtype=torch.float32)


In [36]:
inputs=inputs.reshape(inputs.shape[0],inputs.shape[1],1)

In [37]:
pps=torch.nn.utils.rnn.pack_padded_sequence(inputs,lengths,batch_first=True)

In [21]:
# pps.data

In [39]:
oo=rn(pps,(torch.randn(1*1,inputs.shape[0],hidden_size),torch.randn(1*1,inputs.shape[0],hidden_size)))

In [40]:
ooo,(hhh,_)=oo

In [41]:
hhh.reshape(2,10).shape

torch.Size([2, 10])

In [42]:
torch.nn.utils.rnn.pad_packed_sequence(ooo,batch_first=True)[0][:,-1,:].shape

torch.Size([2, 10])

In [43]:
fcc=nn.Linear(10,4)

In [44]:
nn.utils.rnn.unpack_sequence(ooo)

[tensor([[-0.0827, -0.1664,  0.0024, -0.3924,  0.1788,  0.1572,  0.2666, -0.0468,
          -0.2979,  0.1421],
         [-0.2055,  0.0000, -0.0000, -0.8609,  0.0000, -0.5236,  0.0000,  0.0000,
          -0.0000,  0.3793],
         [-0.2055,  0.0000, -0.0000, -0.8609,  0.0000, -0.9188,  0.0000,  0.0000,
          -0.0000,  0.3793],
         [-0.2055,  0.0000, -0.0000, -0.8609,  0.0000, -0.9886,  0.0000,  0.0000,
          -0.0000,  0.3793],
         [-0.2055,  0.0000, -0.0000, -0.8609,  0.0000, -0.9985,  0.0000,  0.0000,
          -0.0000,  0.3793],
         [ 0.0139,  0.3212, -0.5118, -0.4108,  0.2276, -0.4654,  0.1440,  0.2449,
          -0.0964,  0.2409]], grad_fn=<IndexBackward0>),
 tensor([[ 2.6629e-01,  9.1640e-03, -9.3617e-02,  1.5350e-01, -9.0192e-02,
           7.4383e-02,  2.2065e-01, -1.6623e-02,  1.1016e-01,  2.3466e-01],
         [ 4.4213e-01,  0.0000e+00, -0.0000e+00,  2.5275e-01,  0.0000e+00,
          -6.6914e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,  3.9761e-01],
    

In [45]:
fcc(nn.utils.rnn.unpack_sequence(ooo)[0])

tensor([[ 0.2745, -0.4486, -0.1028,  0.0007],
        [ 0.5427, -0.5159, -0.2167,  0.0884],
        [ 0.6368, -0.5869, -0.3198, -0.0343],
        [ 0.6534, -0.5995, -0.3380, -0.0560],
        [ 0.6558, -0.6012, -0.3406, -0.0590],
        [ 0.6005, -0.3504, -0.2885, -0.0825]], grad_fn=<AddmmBackward0>)

In [46]:
torch.nn.utils.rnn.unpack_sequence(ooo)

[tensor([[-0.0827, -0.1664,  0.0024, -0.3924,  0.1788,  0.1572,  0.2666, -0.0468,
          -0.2979,  0.1421],
         [-0.2055,  0.0000, -0.0000, -0.8609,  0.0000, -0.5236,  0.0000,  0.0000,
          -0.0000,  0.3793],
         [-0.2055,  0.0000, -0.0000, -0.8609,  0.0000, -0.9188,  0.0000,  0.0000,
          -0.0000,  0.3793],
         [-0.2055,  0.0000, -0.0000, -0.8609,  0.0000, -0.9886,  0.0000,  0.0000,
          -0.0000,  0.3793],
         [-0.2055,  0.0000, -0.0000, -0.8609,  0.0000, -0.9985,  0.0000,  0.0000,
          -0.0000,  0.3793],
         [ 0.0139,  0.3212, -0.5118, -0.4108,  0.2276, -0.4654,  0.1440,  0.2449,
          -0.0964,  0.2409]], grad_fn=<IndexBackward0>),
 tensor([[ 2.6629e-01,  9.1640e-03, -9.3617e-02,  1.5350e-01, -9.0192e-02,
           7.4383e-02,  2.2065e-01, -1.6623e-02,  1.1016e-01,  2.3466e-01],
         [ 4.4213e-01,  0.0000e+00, -0.0000e+00,  2.5275e-01,  0.0000e+00,
          -6.6914e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,  3.9761e-01],
    

# Define Model - Figure out initializing hidden state and general RNN architecture

# Train the model

# Important Observations - RNN Model
## 1. Train Accuracy jumped from unmoving 25% to about 55% after feeding a packed sequence to the RNN.
## It took some work to make use of the packed sequence that is the output of the RNN. So decided to 
## use the hidden layer instead after reshaping, then feed to the linear layer. The accuracy jump
## shows promise and does make sense, we are now skipping the padded indices during the forward and backward pass
## which were probably adding a lot of noise causing the accuracy to be at 25%

## Next step will be to try LSTM and see if the long and short memory helps achieve better accuracy scores

# 2. LSTM has WON! 96.2 % Accuracy!

In [14]:
BATCH_SIZE = 64  # batch size for training
train_iter=DataLoader(data_pipe,batch_size=BATCH_SIZE,collate_fn=collate_fn,shuffle=True,drop_last=True)


In [31]:
class ProperRNN(nn.Module):
    def __init__(self,vocab_size,embed_dim,hidden_size,num_classes):
        super(ProperRNN,self).__init__()
        self.hidden_size=hidden_size
        self.embedding=nn.Embedding(vocab_size,embed_dim)
        self.rnn=nn.RNN(embed_dim,hidden_size,batch_first=True)
        self.fc=nn.Linear(hidden_size,num_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.rnn.weight_ih_l0.data.uniform_(-initrange, initrange)
        self.rnn.bias_ih_l0.data.zero_()
        self.fc.bias.data.zero_()
    
    def forward(self,text_tensors,text_lengths):
        text_tensors=self.embedding(text_tensors)
        pps=torch.nn.utils.rnn.pack_padded_sequence(text_tensors,text_lengths,batch_first=True,enforce_sorted=False)
        out,hhh=self.rnn(pps,torch.randn(1*1,text_tensors.shape[0],self.hidden_size))
        # out
        # out=out[:,-1,:]
        hhh=hhh.reshape(len(text_lengths),self.hidden_size)
        out=self.fc(hhh)
        return out

vocab_size=len(vocab)
embed_dim=300
hidden_size=10
num_classes=4
prnn=ProperRNN(vocab_size,embed_dim,hidden_size,num_classes)


# Hyperparameters
EPOCHS = 10  # epoch
LR = 5  # learning rate

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(prnn.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

##### TRAIN #####
prnn.train()
correct_preds, total_count = 0, 0
log_interval = 500
start_time = time.time()
for epoch in range(1,EPOCHS+1):
    for idx,(tensors,tensor_lengths,targets) in enumerate(train_iter):
        optimizer.zero_grad()
        predicted_labels=prnn(tensors,tensor_lengths)
        loss=criterion(predicted_labels,targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(prnn.parameters(), 0.1)
        optimizer.step()
        correct_preds +=(predicted_labels.argmax(1)==targets).sum().item()
        total_count += targets.size(0)
        if idx%log_interval==0 and idx>0:
            elapsed=time.time() - start_time
            print(
            "| epoch {:3d} | {:5d}/{:5d} batches "
                    "| accuracy {:8.3f}".format(
                        ##### Weird - in order for len(train_iter) to work data_pipe must be converted using to_map_style_dataset
                        #### after which you cannot do data_pipe.map(func) even more weird
                        #### to circumvent the transform will need to go into the collate_fn
                        #### so all transform inside collate_fn and then len(dataloader) will also work as long as data_pipe was converted
                        #### to dataset using to_map_style_dataset
                        epoch, idx, len(train_iter), correct_preds / total_count
                    )
                )
            correct_preds, total_count = 0, 0
            start_time = time.time()


| epoch   1 |   500/ 1875 batches | accuracy    0.253
| epoch   1 |  1000/ 1875 batches | accuracy    0.266
| epoch   1 |  1500/ 1875 batches | accuracy    0.323
| epoch   2 |   500/ 1875 batches | accuracy    0.389
| epoch   2 |  1000/ 1875 batches | accuracy    0.431
| epoch   2 |  1500/ 1875 batches | accuracy    0.464
| epoch   3 |   500/ 1875 batches | accuracy    0.485
| epoch   3 |  1000/ 1875 batches | accuracy    0.495
| epoch   3 |  1500/ 1875 batches | accuracy    0.503
| epoch   4 |   500/ 1875 batches | accuracy    0.508
| epoch   4 |  1000/ 1875 batches | accuracy    0.517
| epoch   4 |  1500/ 1875 batches | accuracy    0.524
| epoch   5 |   500/ 1875 batches | accuracy    0.525
| epoch   5 |  1000/ 1875 batches | accuracy    0.533
| epoch   5 |  1500/ 1875 batches | accuracy    0.533
| epoch   6 |   500/ 1875 batches | accuracy    0.542
| epoch   6 |  1000/ 1875 batches | accuracy    0.542
| epoch   6 |  1500/ 1875 batches | accuracy    0.549
| epoch   7 |   500/ 1875 ba

In [16]:
class ProperLSTM(nn.Module):
    def __init__(self,vocab_size,embed_dim,hidden_size,num_classes):
        super(ProperLSTM,self).__init__()
        self.hidden_size=hidden_size
        self.embedding=nn.Embedding(vocab_size,embed_dim)
        self.rnn=nn.LSTM(embed_dim,hidden_size,batch_first=True)
        self.fc=nn.Linear(hidden_size,num_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.rnn.weight_ih_l0.data.uniform_(-initrange, initrange)
        self.rnn.bias_ih_l0.data.zero_()
        self.fc.bias.data.zero_()
    
    def forward(self,text_tensors,text_lengths):
        text_tensors=self.embedding(text_tensors)
        pps=torch.nn.utils.rnn.pack_padded_sequence(text_tensors,text_lengths,batch_first=True,enforce_sorted=False)
        out,(hhh,_)=self.rnn(pps,(torch.randn(1*1,text_tensors.shape[0],self.hidden_size),torch.randn(1*1,text_tensors.shape[0],self.hidden_size)))
        # out
        # out=out[:,-1,:]
        hhh=hhh.reshape(len(text_lengths),self.hidden_size)
        out=self.fc(hhh)
        return out

vocab_size=len(vocab)
embed_dim=300
hidden_size=10
num_classes=4
plstm=ProperLSTM(vocab_size,embed_dim,hidden_size,num_classes)


# Hyperparameters
EPOCHS = 10  # epoch
LR = 5  # learning rate

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(plstm.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

##### TRAIN #####
plstm.train()
correct_preds, total_count = 0, 0
log_interval = 500
start_time = time.time()
for epoch in range(1,EPOCHS+1):
    for idx,(tensors,tensor_lengths,targets) in enumerate(train_iter):
        optimizer.zero_grad()
        predicted_labels=plstm(tensors,tensor_lengths)
        loss=criterion(predicted_labels,targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(plstm.parameters(), 0.1)
        optimizer.step()
        correct_preds +=(predicted_labels.argmax(1)==targets).sum().item()
        total_count += targets.size(0)
        if idx%log_interval==0 and idx>0:
            elapsed=time.time() - start_time
            print(
            "| epoch {:3d} | {:5d}/{:5d} batches "
                    "| accuracy {:8.3f}".format(
                        ##### Weird - in order for len(train_iter) to work data_pipe must be converted using to_map_style_dataset
                        #### after which you cannot do data_pipe.map(func) even more weird
                        #### to circumvent the transform will need to go into the collate_fn
                        #### so all transform inside collate_fn and then len(dataloader) will also work as long as data_pipe was converted
                        #### to dataset using to_map_style_dataset
                        epoch, idx, len(train_iter), correct_preds / total_count
                    )
                )
            correct_preds, total_count = 0, 0
            start_time = time.time()


| epoch   1 |   500/ 1875 batches | accuracy    0.493
| epoch   1 |  1000/ 1875 batches | accuracy    0.787
| epoch   1 |  1500/ 1875 batches | accuracy    0.846
| epoch   2 |   500/ 1875 batches | accuracy    0.870
| epoch   2 |  1000/ 1875 batches | accuracy    0.881
| epoch   2 |  1500/ 1875 batches | accuracy    0.886
| epoch   3 |   500/ 1875 batches | accuracy    0.895
| epoch   3 |  1000/ 1875 batches | accuracy    0.906
| epoch   3 |  1500/ 1875 batches | accuracy    0.901
| epoch   4 |   500/ 1875 batches | accuracy    0.914
| epoch   4 |  1000/ 1875 batches | accuracy    0.917
| epoch   4 |  1500/ 1875 batches | accuracy    0.916
| epoch   5 |   500/ 1875 batches | accuracy    0.926
| epoch   5 |  1000/ 1875 batches | accuracy    0.928
| epoch   5 |  1500/ 1875 batches | accuracy    0.927
| epoch   6 |   500/ 1875 batches | accuracy    0.935
| epoch   6 |  1000/ 1875 batches | accuracy    0.936
| epoch   6 |  1500/ 1875 batches | accuracy    0.936
| epoch   7 |   500/ 1875 ba

# Important Observations - Embedding Bag Model
## 1. Train Accuracy jumped from unmoving 25% to about 78% after 10 epochs when applied the learning rate changes
```
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
```
insert clip gradient norm between backward and step
```
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
```
## 2. Train Accuracy jumped from 78% to 96% on passing in the padding_idx to Embedding Bag.
Obviously when the embedding bag sums/means the weights of the embeddings
we want it to ignore the weights of the padding index
padding is just a convenience feature for pytorch training
```
self.embedding=nn.EmbeddingBag(vocab_size,embed_dim,padding_idx=0)
```

In [33]:
class EmbeddingBagClassifier(nn.Module):
    def __init__(self,vocab_size,embed_dim,num_classes):
        super(EmbeddingBagClassifier,self).__init__()
        self.embedding=nn.EmbeddingBag(vocab_size,embed_dim,padding_idx=0)
        self.fc=nn.Linear(embed_dim,num_classes)
        self.init_weights()


    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self,x):
        x=self.embedding(x)
        out=self.fc(x)
        return out

vocab_size=len(vocab)
embed_dim=300
num_classes=4
embedbagmodel=EmbeddingBagClassifier(vocab_size,embed_dim,num_classes)

# Hyperparameters
EPOCHS = 10  # epoch
LR = 5  # learning rate

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(embedbagmodel.parameters(), lr=LR, momentum=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

#### TRAIN#####
embedbagmodel.train()
epochs=10
correct_preds, total_count = 0, 0
log_interval = 500
start_time = time.time()
for epoch in range(1,epochs+1):
    for idx,(tensors,tensor_lengths,targets) in enumerate(train_iter):
        tensors,targets=tensors,targets
        optimizer.zero_grad()
        predicted_labels=embedbagmodel(tensors)
        loss=criterion(predicted_labels,targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(embedbagmodel.parameters(), 0.1)
        optimizer.step()
        correct_preds +=(predicted_labels.argmax(1)==targets).sum().item()
        total_count += targets.size(0)
        if idx%log_interval==0 and idx>0:
            elapsed=time.time() - start_time
            print(
            "| epoch {:3d} | {:5d}/{:5d} batches "
                    "| accuracy {:8.3f}".format(
                        ##### Weird - in order for len(train_iter) to work data_pipe must be converted using to_map_style_dataset
                        #### after which you cannot do data_pipe.map(func) even more weird
                        #### to circumvent the transform will need to go into the collate_fn
                        #### so all transform inside collate_fn and then len(dataloader) will also work as long as data_pipe was converted
                        #### to dataset using to_map_style_dataset
                        epoch, idx, len(train_iter), correct_preds / total_count
                    )
                )
            correct_preds, total_count = 0, 0
            start_time = time.time()

| epoch   1 |   500/ 1875 batches | accuracy    0.715
| epoch   1 |  1000/ 1875 batches | accuracy    0.854
| epoch   1 |  1500/ 1875 batches | accuracy    0.877
| epoch   2 |   500/ 1875 batches | accuracy    0.895
| epoch   2 |  1000/ 1875 batches | accuracy    0.902
| epoch   2 |  1500/ 1875 batches | accuracy    0.904
| epoch   3 |   500/ 1875 batches | accuracy    0.915
| epoch   3 |  1000/ 1875 batches | accuracy    0.920
| epoch   3 |  1500/ 1875 batches | accuracy    0.919
| epoch   4 |   500/ 1875 batches | accuracy    0.926
| epoch   4 |  1000/ 1875 batches | accuracy    0.929
| epoch   4 |  1500/ 1875 batches | accuracy    0.929
| epoch   5 |   500/ 1875 batches | accuracy    0.936
| epoch   5 |  1000/ 1875 batches | accuracy    0.935
| epoch   5 |  1500/ 1875 batches | accuracy    0.935
| epoch   6 |   500/ 1875 batches | accuracy    0.939
| epoch   6 |  1000/ 1875 batches | accuracy    0.942
| epoch   6 |  1500/ 1875 batches | accuracy    0.941
| epoch   7 |   500/ 1875 ba