### in this notebook we'll play with RNNs to get an understanding of how they work

note: at the point of creating this notebook, my experience with RNN is close to none

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

Disregarding overfitting, etc., we'll create a model that tries to predict the next letter/sequence of letters in the alphabet

### prepare the dataset for testing tokenization

In [7]:
dataset = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

tokenize the dataset. (change the letters to numbers)

In [8]:
dataset_tokenized = [i for i in np.arange(len(dataset))]

In [9]:
dataset_tokenized

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25]

In [10]:
dataset_tokenized = torch.FloatTensor(dataset_tokenized)

In [11]:
dataset_tokenized

tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
        14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.])

In [12]:
dataset_tokenized.dtype

torch.float32

now one hot the tokenized tensor

In [13]:
F.one_hot(dataset_tokenized)

RuntimeError: one_hot is only applicable to index tensor.

hmm.. i think the tokenized tensor should be of datatype integer 

In [14]:
dataset_tokenized = [i for i in np.arange(len(dataset))]
dataset_tokenized = torch.IntTensor(dataset_tokenized)
dataset_tokenized

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25], dtype=torch.int32)

In [29]:
F.one_hot(dataset_tokenized)

RuntimeError: one_hot is only applicable to index tensor.

huh. that didnt work either

further research showed that the tokenized input tensor for .one_hot should be an int64 tensor: https://github.com/ray-project/ray/issues/11401#issuecomment-721700627

In [32]:
dataset_onehot = F.one_hot(dataset_tokenized.long())
dataset_onehot

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 0, 0, 0, 

okay. that worked. .long() converts the tensor to int64

In [34]:
dataset_onehot.dtype

torch.int64

#### we'll create a vocab dict which we will use later

In [18]:
vocab_dict = {dataset[i]:i for i in np.arange(len(dataset))}

In [19]:
vocab_dict

{'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25}

### test an rnn cell 

In [None]:
rnntest = nn.RNN(input_size=4, hidden_size=12, num_layers=3)

In [71]:
rnntest

RNN(4, 12, num_layers=3)

create a sample input

In [72]:
sampleinp = torch.FloatTensor((1,2,3,4))
sampleinp

tensor([1., 2., 3., 4.])

In [73]:
sampleinp.shape

torch.Size([4])

In [74]:
sampleinp.unsqueeze_(0).unsqueeze_(0)

tensor([[[1., 2., 3., 4.]]])

In [80]:
sampleinp.shape

torch.Size([1, 1, 4])

read about input shape: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

In [75]:
sampleout = rnntest(sampleinp)

In [76]:
sampleout

(tensor([[[ 0.8106, -0.1224, -0.0200, -0.2534, -0.5853,  0.6180, -0.1380,
           -0.1661, -0.1332, -0.3216,  0.5315,  0.1052]]],
        grad_fn=<StackBackward0>),
 tensor([[[-0.9076,  0.3143,  0.3579,  0.8390,  0.9330,  0.9481,  0.4786,
            0.0115, -0.7938, -0.8034, -0.1920, -0.2838]],
 
         [[-0.3131,  0.4416, -0.6035, -0.2584, -0.5068, -0.4455, -0.0284,
           -0.3195, -0.6628,  0.6726, -0.0493,  0.8301]],
 
         [[ 0.8106, -0.1224, -0.0200, -0.2534, -0.5853,  0.6180, -0.1380,
           -0.1661, -0.1332, -0.3216,  0.5315,  0.1052]]],
        grad_fn=<StackBackward0>))

you can see above that sampleout[0] has the output of the final layer, and sampleoutput[1] stores the outputs of all the layers in the stacked recurrent network. sampleout[0] and the last row of sampleout[1] are the same.

In [77]:
len(sampleout)

2

In [78]:
sampleout[0].shape

torch.Size([1, 1, 12])

In [79]:
sampleout[1].shape

torch.Size([3, 1, 12])

### run a few random tests 

#### what if the input shape does not match 

In [88]:
sampleinp1 = torch.FloatTensor((1,2,3,4)).unsqueeze_(0).unsqueeze_(0)
sampleinp1

tensor([[[1., 2., 3., 4.]]])

In [89]:
rnntest1 = nn.RNN(input_size=3, hidden_size=12, num_layers=3)

In [90]:
rnntest1(sampleinp1)

RuntimeError: input.size(-1) must be equal to input_size. Expected 3, got 4

### Understand the output of the RNN, and how the weights are multiplied with the inputs and hidden states

look at the equation here: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

In [80]:
sampleinp2 = torch.FloatTensor((1,0,0,0)).unsqueeze_(0).unsqueeze_(0)
rnntest2 = nn.RNN(input_size=4, hidden_size=3, num_layers=2, batch_first=True)
h0 = torch.zeros(2, 1, 3)
sampleout2 = rnntest2(sampleinp2, h0)
sampleout2

(tensor([[[0.1575, 0.8601, 0.1959]]], grad_fn=<TransposeBackward1>),
 tensor([[[ 0.7231, -0.7849,  0.3994]],
 
         [[ 0.1575,  0.8601,  0.1959]]], grad_fn=<StackBackward0>))

In [183]:
h0

tensor([[[0., 0., 0.]],

        [[0., 0., 0.]]])

below cell shows all the weights and biases in both the layers. the first 4 tensors belong to the first layer in the stacked rnn, and the last 4 tensors belong to the top layer in the stacked rnn.

lets look at the first four tensors, ie, the first layer in the stacked rnn. cross check with the equation provided in the pytorch rnn documentation - https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

the 1st tensor has the weights for the 4 inputs over 3 time steps (hidden size = time step) - Wih <br>
the 3rd tensor has the biases for the inputs for 3 time steps - bih

the 2st tensor has the weights for the 3 hidden states. on testing i see that this is always a square matrix. - Whh<br>
the 4th tensor has the biases for the hidden states for 3 time steps - bhh

xt is the input for that time step <br>
h(t-1) is the output of the previous time step without activation

In [184]:
rnntest2.all_weights

[[Parameter containing:
  tensor([[-0.4173,  0.4174,  0.2059, -0.4597],
          [-0.0723, -0.5551,  0.4747, -0.4508],
          [-0.1360,  0.3744,  0.3179,  0.1486]], requires_grad=True),
  Parameter containing:
  tensor([[ 0.2320, -0.0877, -0.5017],
          [-0.3981, -0.3421, -0.0617],
          [-0.0402,  0.1713,  0.5218]], requires_grad=True),
  Parameter containing:
  tensor([ 0.1127,  0.2711, -0.1279], requires_grad=True),
  Parameter containing:
  tensor([ 0.2040, -0.0273, -0.0363], requires_grad=True)],
 [Parameter containing:
  tensor([[ 0.4758,  0.1579, -0.4541],
          [-0.1924, -0.3847, -0.1160],
          [-0.1088, -0.2506, -0.5486]], requires_grad=True),
  Parameter containing:
  tensor([[-0.0106,  0.2868,  0.2903],
          [-0.1309, -0.0867, -0.0035],
          [ 0.0546,  0.0871, -0.2004]], requires_grad=True),
  Parameter containing:
  tensor([-0.1665,  0.5583, -0.1764], requires_grad=True),
  Parameter containing:
  tensor([ 0.1633, -0.1199, -0.2487], requires_

let's look at the first layer outputs separately

the weights for the 4 inputs over 3 time steps (hidden size = time step) - Wih:

In [185]:
rnntest2.weight_ih_l0

Parameter containing:
tensor([[-0.4173,  0.4174,  0.2059, -0.4597],
        [-0.0723, -0.5551,  0.4747, -0.4508],
        [-0.1360,  0.3744,  0.3179,  0.1486]], requires_grad=True)

the biases for the inputs for 3 time steps - bih:

In [186]:
rnntest2.bias_ih_l0

Parameter containing:
tensor([ 0.1127,  0.2711, -0.1279], requires_grad=True)

the weights for the 3 hidden states. on testing i see that this is always a square matrix. - Whh:

In [187]:
rnntest2.weight_hh_l0

Parameter containing:
tensor([[ 0.2320, -0.0877, -0.5017],
        [-0.3981, -0.3421, -0.0617],
        [-0.0402,  0.1713,  0.5218]], requires_grad=True)

the biases for the hidden states for 3 time steps - bhh:

In [188]:
rnntest2.bias_hh_l0

Parameter containing:
tensor([ 0.2040, -0.0273, -0.0363], requires_grad=True)

output:

In [189]:
sampleout2

(tensor([[[ 0.1078,  0.4022, -0.2884]]], grad_fn=<TransposeBackward1>),
 tensor([[[-0.1002,  0.1698, -0.2915]],
 
         [[ 0.1078,  0.4022, -0.2884]]], grad_fn=<StackBackward0>))

analyse the weights for the **first time step in the first layer**<br>
using the equation tanh(Wihxt + bih + Whhh(t-1) + bhh), <br>
Wih = [-0.4173,  0.4174,  0.2059, -0.4597] <br>
xt = [1,0,0,0] <br>
bih = [0.1127] <br>
Whh = [ 0.2320, -0.0877, -0.5017] <br>
h(t-1) = [0, 0, 0] <br>
bhh = [0.2040]

manually test the formula:

In [190]:
np.tanh(np.matmul([-0.4173,  0.4174,  0.2059, -0.4597], [1,0,0,0]) + 0.1127 \
        + np.matmul([ 0.2320, -0.0877, -0.5017],[0, 0, 0]) + 0.2040)

-0.10026199880626045

check the output of the first time step in the first layer 

In [191]:
sampleout2[1]

tensor([[[-0.1002,  0.1698, -0.2915]],

        [[ 0.1078,  0.4022, -0.2884]]], grad_fn=<StackBackward0>)

**They match! So we can confirm that our understanding of the weights and biases tensors is correct**