In [None]:
import torch
import torchtext
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence
from torch import nn

from torch import Tensor, dot, matmul

import torch.nn.functional as F

# **Basic Example**

In [None]:
# One batch holding a single sequence of three scalar time steps -> shape (1, 3).
seq = torch.FloatTensor([[3, 4, 5]])
print(seq, seq.shape, sep="\n")

tensor([[3., 4., 5.]])
torch.Size([1, 3])


In [None]:
# A minimal single-layer RNN: one input feature, one hidden unit, no bias terms.
# batch_first=True makes the layer expect its input laid out as
# (batch size, sequence length, number of input features).
rnn = nn.RNN(
    input_size=1,
    hidden_size=1,
    num_layers=1,
    batch_first=True,
    bias=False,
)

In [None]:
# The RNN needs an explicit feature axis: (batch, seq_len) -> (batch, seq_len, 1).
# The guard keeps the cell idempotent: re-running it does not keep appending
# axes to an already-reshaped `seq`.
if seq.dim() == 2:
    seq = seq.unsqueeze(2)
print(seq.shape)
print(seq)


torch.Size([1, 3, 1])
tensor([[[3.],
         [4.],
         [5.]]])


With the correct input format, we can now pass the input to the RNN layer. The RNN layer returns two outputs:

1. All hidden states associated with a sequence, for all sequences in the batch

2. Just the very last hidden state for a sequence, for all sequences in the batch

In [None]:
# out_all: the hidden state at every time step; out_last: only the final hidden state.
out_all, out_last = rnn(seq)

In [None]:
# Report the shapes of both tensors returned by the RNN call.
print(
    f"Out all shape : {out_all.shape}",
    f"Out last shape : {out_last.shape}",
    sep="\n",
)

Out all shape : torch.Size([1, 3, 1])
Out last shape : torch.Size([1, 1, 1])


There are two ways to access the weights of the RNN layer:

1. Accessing individual parameters by name: `weight_hh_l0`, `weight_ih_l0`, and so on.

2. Using the `state_dict()` method to access all weights at once

In [None]:
# Hidden-to-hidden weight of layer 0, accessed directly by its attribute name.
rnn.weight_hh_l0

Parameter containing:
tensor([[-0.8218]], requires_grad=True)

In [None]:
# Input-to-hidden weight of layer 0, accessed directly by its attribute name.
rnn.weight_ih_l0

Parameter containing:
tensor([[-0.8769]], requires_grad=True)

In [None]:
# All parameters of the layer at once, keyed by name.
rnn.state_dict()

OrderedDict([('weight_ih_l0', tensor([[-0.8769]])),
             ('weight_hh_l0', tensor([[-0.8218]]))])

Computing the output
RNN layers essentially take in a sequence and compute outputs for each time point in the input sequence. The weights that are used for computation remain the same for all time points.

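The basic equation governing the computation is: $h_t = \tanh(W_{ih} x_t + b_{ih} + W_{hh} h_{t-1} + b_{hh})$

where $h_t$ represents the hidden state at time $t$, $x_t$ is the input at time $t$, and $h_{t-1}$ is the hidden state at the previous time step.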

In [None]:
# Hidden states computed by the RNN layer for every time step; the manual
# computations that follow should reproduce these values.
out_all

tensor([[[-0.9897],
         [-0.9909],
         [-0.9984]]], grad_fn=<TransposeBackward1>)

Hidden State 1

Note: Since this is the very first state (time = 1) and we don't have a hidden state preceding it, we assume it to be zero. Therefore, h0 is taken to be 0.

In [None]:
# Layer-0 weights (this layer was created with bias=False, so there are no biases).
wih = rnn.weight_ih_l0
whh = rnn.weight_hh_l0

x = seq[0][0]  # input at the first time step of the first sequence

# Hidden state for time = 1. No state precedes it, so h0 = 0; the recurrent
# term whh * 0 is kept only to mirror the RNN equation.
h1 = torch.tanh(x * wih + whh * 0)
h1

tensor([[-0.9897]], grad_fn=<TanhBackward0>)

Hidden State 2

In [None]:
x = seq[0][1]  # input at the second time step of the first sequence

# Hidden state for time = 2: the previous state h1 feeds the recurrent term.
h2 = torch.tanh(x * wih + whh * h1)
h2

tensor([[-0.9909]], grad_fn=<TanhBackward0>)

Hidden State 3

In [None]:
x = seq[0][2]  # input at the third and last time step of the first sequence

# Hidden state for time = 3: the previous state h2 feeds the recurrent term.
h3 = torch.tanh(x * wih + whh * h2)
h3

tensor([[-0.9984]], grad_fn=<TanhBackward0>)

We can observe that:

1. The RNN does a very basic computation repeatedly on all features of the given sequence

2. The output at a particular time stamp depends on the output at the previous time stamp

## Adding more features

We increase the complexity of the RNN computation by increasing the number of features at each sequence time stamp. Previously, each time stamp was represented by a single value. Now, we expand that to be represented by a feature vector

In [None]:
# Four time steps, each described by a feature vector of length 3.
time_steps = [[1, 1, 1], [1, 2, 1], [2, 3, 1], [1, 3, 1]]
seq = torch.Tensor(time_steps)
print(seq.shape, seq, sep="\n")

# Prepend the batch axis: (seq_len, features) -> (batch=1, seq_len, features).
seq = seq.unsqueeze(0)
print(seq.shape, seq, sep="\n")

torch.Size([4, 3])
tensor([[1., 1., 1.],
        [1., 2., 1.],
        [2., 3., 1.],
        [1., 3., 1.]])
torch.Size([1, 4, 3])
tensor([[[1., 1., 1.],
         [1., 2., 1.],
         [2., 3., 1.],
         [1., 3., 1.]]])


The seq variable represents a sequence of length 4, where each element (time-stamp) is represented by a feature vector of length 3.

We next define an RNN layer where we set input_size to 3. This time, we also set bias to True, so that a bias term is included in our calculations.

In [None]:
# Single-layer RNN taking 3 input features per time step, with one hidden unit
# and bias terms enabled.
rnn = nn.RNN(
    input_size=3,
    hidden_size=1,
    num_layers=1,
    batch_first=True,
    bias=True,
)

In [None]:
# Run the sequence through the layer and report the shapes of both results.
out_all, out_last = rnn(seq)
print(
    f"Out all shape : {out_all.shape}",
    f"Out last shape : {out_last.shape}",
    sep="\n",
)

Out all shape : torch.Size([1, 4, 1])
Out last shape : torch.Size([1, 1, 1])


## Computing outputs

In [None]:
# Hidden states from the layer for all 4 time steps; the manual computation
# that follows should reproduce them.
out_all

tensor([[[0.1408],
         [0.8286],
         [0.9486],
         [0.9908]]], grad_fn=<TransposeBackward1>)

Hidden State 1

A minor modification compared to the previous code is that we now use the dot product to multiply x with wih and h(t-1) with whh.

In [None]:
# Drop the leading hidden_size=1 axis so the weights are plain vectors usable with dot().
wih = rnn.weight_ih_l0.squeeze(0)
whh = rnn.weight_hh_l0.squeeze(0)

bih = rnn.bias_ih_l0
bhh = rnn.bias_hh_l0

x = seq[0][0]  # feature vector at the first time step of the first sequence

# Hidden state for time = 1. No state precedes it, so h0 is a zero vector and
# the recurrent term contributes nothing.
h1 = torch.tanh(dot(x, wih) + bih + dot(whh, torch.zeros(1)) + bhh)
h1

tensor([0.1408], grad_fn=<TanhBackward0>)

Hidden State 2

In [None]:
x = seq[0][1]  # feature vector at the second time step of the first sequence

# Hidden state for time = 2: the previous state h1 feeds the recurrent term.
h2 = torch.tanh(dot(x, wih) + bih + dot(h1, whh) + bhh)
h2

tensor([0.8286], grad_fn=<TanhBackward0>)

Computing all states

We automate the manual computation of hidden states to verify our computation matches with the RNN layer output

In [None]:
# Replay the recurrence over every time step and collect the hidden states so
# they can be compared with the layer's out_all.
output = []

h_previous = torch.zeros(1)  # h0: no state precedes the first time step

for i in range(seq.shape[1]):
    x = seq[0][i]
    # The update rule is the same at every time step (at the first one
    # h_previous is simply zero), so no special case is needed.
    h_current = torch.tanh(dot(x, wih) + bih + dot(h_previous, whh) + bhh)
    h_previous = h_current
    output.append(h_current.detach().numpy())


In [None]:
# Manually computed hidden states (as NumPy arrays), to compare with out_all.
output

[array([0.14078327], dtype=float32),
 array([0.82855445], dtype=float32),
 array([0.94859374], dtype=float32),
 array([0.9908491], dtype=float32)]

## **Increasing Hidden Size**

Hidden size is the number of features in the RNN's hidden state. So if you increase the hidden size, each hidden state output becomes a larger feature vector.

Till now, we had hidden_size parameter fixed at 1. We increase this value and see how it affects the RNN computation

In [None]:
# Same 3-feature input as before, but every hidden state now has 2 features.
rnn = nn.RNN(
    input_size=3,
    hidden_size=2,
    num_layers=1,
    batch_first=True,
    bias=True,
)

In [None]:
# Run the sequence through the layer and report the shapes of both results.
out_all, out_last = rnn(seq)
print(
    f"Out all shape : {out_all.shape}",
    f"Out last shape : {out_last.shape}",
    sep="\n",
)

Out all shape : torch.Size([1, 4, 2])
Out last shape : torch.Size([1, 1, 2])


We can see from the output shape that the size of the hidden states has increased to 2, corresponding to the increase in the hidden_size parameter to 2

In [None]:
# Inspect the parameters; their shapes reflect hidden_size=2.
rnn.state_dict()

OrderedDict([('weight_ih_l0', tensor([[ 0.5886, -0.0840,  0.1099],
                      [-0.1125,  0.0505, -0.1764]])),
             ('weight_hh_l0', tensor([[0.1692, 0.4006],
                      [0.4108, 0.7006]])),
             ('bias_ih_l0', tensor([0.5332, 0.4121])),
             ('bias_hh_l0', tensor([-0.5042, -0.1715]))])

Similarly, the RNN layer weight shapes have also changed in response to the new hidden_size parameter value

## Computing outputs

On increasing the hidden_size parameter to 2, we are essentially increasing the size of the hidden states computed for each time-stamp. This allows the hidden states to be more expressive and store more information.

In [None]:
# Hidden states from the layer: one row per time step, hidden_size=2 values per row.
out_all

tensor([[[0.5673, 0.0021],
         [0.5759, 0.2795],
         [0.8547, 0.3994],
         [0.6527, 0.6255]]], grad_fn=<TransposeBackward1>)

Hidden State 1

In [None]:
# The weights are now full matrices, so matmul against their transposes is used.
wih = rnn.weight_ih_l0
whh = rnn.weight_hh_l0

bih = rnn.bias_ih_l0
bhh = rnn.bias_hh_l0

x = seq[0][0]  # feature vector at the first time step of the first sequence

# Hidden state for time = 1; the preceding state is a zero vector of shape [1, 2].
h1 = torch.tanh(matmul(x, wih.T) + bih + matmul(torch.zeros([1, 2]), whh.T) + bhh)
h1

tensor([[0.5673, 0.0021]], grad_fn=<TanhBackward0>)

Computing for all states

In [None]:
# Replay the recurrence over every time step to compare with the layer's out_all.
output = []

h_previous = torch.zeros([1, 2])  # h0; hidden_size is 2, so every hidden state has shape [1, 2]

for i in range(seq.shape[1]):
    x = seq[0][i]
    h_current = torch.tanh(matmul(x, wih.T) + bih + matmul(h_previous, whh.T) + bhh)
    h_previous = h_current
    output.append(h_current)

In [None]:
# Manually computed hidden states, to compare with out_all.
output

[tensor([[0.5673, 0.0021]], grad_fn=<TanhBackward0>),
 tensor([[0.5759, 0.2795]], grad_fn=<TanhBackward0>),
 tensor([[0.8547, 0.3994]], grad_fn=<TanhBackward0>),
 tensor([[0.6527, 0.6255]], grad_fn=<TanhBackward0>)]

## Building a Bi-Directional RNN

In [None]:
# Bidirectional variant: the sequence is processed both forwards and backwards.
rnn = nn.RNN(
    input_size=3,
    hidden_size=2,
    num_layers=1,
    batch_first=True,
    bias=True,
    bidirectional=True,
)

In [None]:
# Run the sequence through the layer and report the shapes of both results.
out_all, out_last = rnn(seq)
print(
    f"Out all shape : {out_all.shape}",
    f"Out last shape : {out_last.shape}",
    sep="\n",
)

Out all shape : torch.Size([1, 4, 4])
Out last shape : torch.Size([2, 1, 2])


In [None]:
# Each time step now has 4 values: 2 from the forward run followed by 2 from the backward run.
out_all

tensor([[[ 0.6201, -0.5690, -0.8503,  0.3732],
         [ 0.7594, -0.9050, -0.9095,  0.6461],
         [ 0.4136, -0.9911, -0.9002,  0.9371],
         [ 0.6942, -0.9752, -0.9584,  0.9163]]], grad_fn=<TransposeBackward1>)

In [None]:
# One final state per direction: entry 0 equals the forward half of the last row of
# out_all, entry 1 equals the backward half of the first row of out_all.
out_last

tensor([[[ 0.6942, -0.9752]],

        [[-0.8503,  0.3732]]], grad_fn=<StackBackward0>)

In [None]:
# The bidirectional layer has a second parameter set, suffixed `_reverse`.
rnn.state_dict()

OrderedDict([('weight_ih_l0', tensor([[-0.6735,  0.2324,  0.5187],
                      [-0.5421, -0.4331,  0.1564]])),
             ('weight_hh_l0', tensor([[ 0.5985,  0.5866],
                      [-0.0808,  0.6500]])),
             ('bias_ih_l0', tensor([ 0.5015, -0.3184])),
             ('bias_hh_l0', tensor([0.1460, 0.4912])),
             ('weight_ih_l0_reverse', tensor([[ 0.4697, -0.3869, -0.4899],
                      [ 0.4505,  0.4845, -0.3718]])),
             ('weight_hh_l0_reverse', tensor([[ 0.3995,  0.3990],
                      [-0.0386, -0.3703]])),
             ('bias_ih_l0_reverse', tensor([-0.3801, -0.4178])),
             ('bias_hh_l0_reverse', tensor([-0.3645,  0.4508]))])

## Computing outputs - Forward Direction

For a bidirectional RNN layer with a hidden layer size of 2 and an input sequence of length 4, we get an output of size 4x4.

In the output, each row essentially captures the hidden state corresponding to a given time-stamp. In the previous example, each time stamp was represented by a vector of length 2 (because hidden_size = 2). Now, since it's bidirectional, each hidden state is represented by a vector of length 4 (2 + 2).

For each timestamp, the first 2 values correspond to the forward run of the RNN and the last 2 values correspond to the backward run of the RNN.

## Hidden State 1 - Forward Direction

In [None]:
# Forward-direction parameters.
wih = rnn.weight_ih_l0
whh = rnn.weight_hh_l0

bih = rnn.bias_ih_l0
bhh = rnn.bias_hh_l0

# Reverse-direction parameters: same names with a trailing '_'.
wih_ = rnn.weight_ih_l0_reverse
whh_ = rnn.weight_hh_l0_reverse

bih_ = rnn.bias_ih_l0_reverse
bhh_ = rnn.bias_hh_l0_reverse

x = seq[0][0]  # feature vector at the first time step of the first sequence

# Forward hidden state for time = 1; the preceding state is a zero vector of shape [1, 2].
h1 = torch.tanh(matmul(x, wih.T) + bih + matmul(torch.zeros([1, 2]), whh.T) + bhh)
h1

tensor([[ 0.6201, -0.5690]], grad_fn=<TanhBackward0>)

Computing all states - Forward Direction

In [None]:
# Replay the forward-direction recurrence over every time step.
output = []

h_previous = torch.zeros([1, 2])  # h0; hidden_size is 2, so every hidden state has shape [1, 2]

for i in range(seq.shape[1]):
    x = seq[0][i]
    h_current = torch.tanh(matmul(x, wih.T) + bih + matmul(h_previous, whh.T) + bhh)
    h_previous = h_current
    output.append(h_current)


output

[tensor([[ 0.6201, -0.5690]], grad_fn=<TanhBackward0>),
 tensor([[ 0.7594, -0.9050]], grad_fn=<TanhBackward0>),
 tensor([[ 0.4136, -0.9911]], grad_fn=<TanhBackward0>),
 tensor([[ 0.6942, -0.9752]], grad_fn=<TanhBackward0>)]

At this stage, we can compare the computed hidden states with the RNN layer output out_all. We can observe that the computed states match the first 2 elements of each of the RNN layer outputs.

In [None]:
# Forward-direction half of the layer output: the first 2 features of every time step.
out_all[:,:,:2]

tensor([[[ 0.6201, -0.5690],
         [ 0.7594, -0.9050],
         [ 0.4136, -0.9911],
         [ 0.6942, -0.9752]]], grad_fn=<SliceBackward0>)

## Computing Outputs - Backward Direction

Hidden State 1 - Backward direction

In [None]:
x = seq[0][-1]  # the backward run starts from the very last element of the sequence

# Backward hidden state for time = 4: the first step of the backward run, so the
# preceding state is a zero vector.
h4_ = torch.tanh(matmul(x, wih_.T) + bih_ + matmul(torch.zeros([1, 2]), whh_.T) + bhh_)
h4_

tensor([[-0.9584,  0.9163]], grad_fn=<TanhBackward0>)

Hidden State 2 - Backward direction

In [None]:
x = seq[0][-2]  # second-to-last element of the sequence

# Backward hidden state for time = 3: driven by h4_, the state computed one step earlier.
h3_ = torch.tanh(matmul(x, wih_.T) + bih_ + matmul(h4_, whh_.T) + bhh_)
h3_

tensor([[-0.9002,  0.9371]], grad_fn=<TanhBackward0>)

Hidden State 3 - Backward direction

In [None]:
x = seq[0][-3]  # third-to-last element of the sequence

# Backward hidden state for time = 2: driven by h3_, the state computed one step earlier.
h2_ = torch.tanh(matmul(x, wih_.T) + bih_ + matmul(h3_, whh_.T) + bhh_)
h2_

tensor([[-0.9095,  0.6461]], grad_fn=<TanhBackward0>)

Hidden State 4 - Backward direction

In [None]:
x = seq[0][-4]  # first element of the sequence, reached last by the backward run

# Backward hidden state for time = 1: driven by h2_, the state computed one step earlier.
h1_ = torch.tanh(matmul(x, wih_.T) + bih_ + matmul(h2_, whh_.T) + bhh_)
h1_

tensor([[-0.8503,  0.3732]], grad_fn=<TanhBackward0>)

In [None]:
# Backward-run states listed in time order (time 1..4), although they were
# computed starting from time 4.
output_ = [h1_,h2_,h3_,h4_]
output_

[tensor([[-0.8503,  0.3732]], grad_fn=<TanhBackward0>),
 tensor([[-0.9095,  0.6461]], grad_fn=<TanhBackward0>),
 tensor([[-0.9002,  0.9371]], grad_fn=<TanhBackward0>),
 tensor([[-0.9584,  0.9163]], grad_fn=<TanhBackward0>)]

In [None]:
# Backward-direction half of the layer output: the last 2 features of every time step.
out_all[:,:,2:]

tensor([[[-0.8503,  0.3732],
         [-0.9095,  0.6461],
         [-0.9002,  0.9371],
         [-0.9584,  0.9163]]], grad_fn=<SliceBackward0>)

The final RNN layer output is the concatenation of the hidden states from both the forward and backward runs. After concatenating them, we can compare our manually computed results with the RNN layer output.

In [None]:
# For each time step, join the forward and backward hidden states along the
# feature axis. Zipping the two lists avoids hard-coding the sequence length.
fullOutput = [torch.cat((fwd, bwd), 1) for fwd, bwd in zip(output, output_)]
fullOutput

[tensor([[ 0.6201, -0.5690, -0.8503,  0.3732]], grad_fn=<CatBackward0>),
 tensor([[ 0.7594, -0.9050, -0.9095,  0.6461]], grad_fn=<CatBackward0>),
 tensor([[ 0.4136, -0.9911, -0.9002,  0.9371]], grad_fn=<CatBackward0>),
 tensor([[ 0.6942, -0.9752, -0.9584,  0.9163]], grad_fn=<CatBackward0>)]

In [None]:
# Layer output, for comparison with the manually assembled fullOutput.
out_all

tensor([[[ 0.6201, -0.5690, -0.8503,  0.3732],
         [ 0.7594, -0.9050, -0.9095,  0.6461],
         [ 0.4136, -0.9911, -0.9002,  0.9371],
         [ 0.6942, -0.9752, -0.9584,  0.9163]]], grad_fn=<TransposeBackward1>)


# Stacked RNNs

With Stacked RNNs, we explore the num_layers parameter of the RNN module. Stacked RNNs can be thought of as individual RNN modules stacked together, with the output of one module acting as the input to the next RNN module.



In [None]:
# Two stacked unidirectional RNN layers, each with a hidden state of size 3.
rnn = nn.RNN(
    input_size=3,
    hidden_size=3,
    num_layers=2,
    batch_first=True,
    bias=True,
    bidirectional=False,
)

In [None]:
# Run the sequence through the stacked layers and report the shapes of both results.
out_all, out_last = rnn(seq)
print(
    f"Out all shape : {out_all.shape}",
    f"Out last shape : {out_last.shape}",
    sep="\n",
)

Out all shape : torch.Size([1, 4, 3])
Out last shape : torch.Size([2, 1, 3])


In [None]:
# Hidden states for every time step; the manual layer-2 computation further
# down reproduces these values.
out_all

tensor([[[ 0.7045, -0.6622,  0.5480],
         [ 0.6306, -0.5797,  0.3353],
         [ 0.7372, -0.7367,  0.2066],
         [ 0.7282, -0.7595,  0.2329]]], grad_fn=<TransposeBackward1>)

In [None]:
# One final hidden state per stacked layer (2 entries for num_layers=2).
out_last

tensor([[[ 0.7242, -0.8361,  0.7038]],

        [[ 0.7282, -0.7595,  0.2329]]], grad_fn=<StackBackward0>)

In [None]:
# Each stacked layer has its own parameter set: suffix _l0 for the first layer, _l1 for the second.
rnn.state_dict()

OrderedDict([('weight_ih_l0', tensor([[-0.0067,  0.5169, -0.4285],
                      [ 0.4447, -0.5474,  0.3089],
                      [-0.1507,  0.2443,  0.1886]])),
             ('weight_hh_l0', tensor([[-0.1682, -0.2060, -0.0155],
                      [-0.2151, -0.5301, -0.4871],
                      [-0.4436, -0.2902, -0.5667]])),
             ('bias_ih_l0', tensor([-0.0591, -0.4937,  0.2946])),
             ('bias_hh_l0', tensor([-0.1003,  0.4496,  0.4387])),
             ('weight_ih_l1', tensor([[ 0.2131,  0.1153,  0.4454],
                      [-0.3610,  0.0018, -0.3609],
                      [-0.4575,  0.0097, -0.0236]])),
             ('weight_hh_l1', tensor([[ 0.0889, -0.0294, -0.3185],
                      [ 0.1371,  0.1975,  0.5522],
                      [ 0.3690,  0.3262, -0.1692]])),
             ('bias_ih_l1', tensor([ 0.1758, -0.5126,  0.3801])),
             ('bias_hh_l1', tensor([ 0.3563, -0.0351,  0.2165]))])

## Computing Outputs - Layer 1

In [None]:
# Extracting the weights for RNN layer 1 (the parameters suffixed _l0)
wih_10 = rnn.weight_ih_l0
whh_10 = rnn.weight_hh_l0

bih_10 = rnn.bias_ih_l0
bhh_10 = rnn.bias_hh_l0

output_1 = []

h_previous = torch.zeros([1, 3])  # h0; hidden_size is 3, so every hidden state has shape [1, 3]

# Layer 1 consumes the raw input sequence, one time step at a time.
for i in range(seq.shape[1]):
    x = seq[0][i]
    h_current = torch.tanh(matmul(x, wih_10.T) + bih_10 + matmul(h_previous, whh_10.T) + bhh_10)
    h_previous = h_current
    output_1.append(h_current)

output_1

[tensor([[-0.0775,  0.1607,  0.7680]], grad_fn=<TanhBackward0>),
 tensor([[ 0.3862, -0.6794,  0.6708]], grad_fn=<TanhBackward0>),
 tensor([[ 0.7674, -0.4912,  0.7612]], grad_fn=<TanhBackward0>),
 tensor([[ 0.7242, -0.8361,  0.7038]], grad_fn=<TanhBackward0>)]

## Computing Outputs - Layer 2

In [None]:
# Extracting the weights for RNN layer 2 (the parameters suffixed _l1)
wih_11 = rnn.weight_ih_l1
whh_11 = rnn.weight_hh_l1

bih_11 = rnn.bias_ih_l1
bhh_11 = rnn.bias_hh_l1

output_2 = []

h_previous = torch.zeros([1, 3])  # h0; hidden_size is 3, so every hidden state has shape [1, 3]

# Layer 2 consumes layer 1's hidden states (output_1) rather than the raw
# sequence, so it iterates over them directly.
for h_below in output_1:
    h_current = torch.tanh(matmul(h_below, wih_11.T) + bih_11 + matmul(h_previous, whh_11.T) + bhh_11)
    h_previous = h_current
    output_2.append(h_current)

output_2

[tensor([[ 0.7045, -0.6622,  0.5480]], grad_fn=<TanhBackward0>),
 tensor([[ 0.6306, -0.5797,  0.3353]], grad_fn=<TanhBackward0>),
 tensor([[ 0.7372, -0.7367,  0.2066]], grad_fn=<TanhBackward0>),
 tensor([[ 0.7282, -0.7595,  0.2329]], grad_fn=<TanhBackward0>)]

In [None]:
# Layer output; it should match the manually computed layer-2 states (output_2).
out_all

tensor([[[ 0.7045, -0.6622,  0.5480],
         [ 0.6306, -0.5797,  0.3353],
         [ 0.7372, -0.7367,  0.2066],
         [ 0.7282, -0.7595,  0.2329]]], grad_fn=<TransposeBackward1>)

In [None]:
# Final hidden state of each layer: entry 0 should match output_1[-1], entry 1 output_2[-1].
out_last

tensor([[[ 0.7242, -0.8361,  0.7038]],

        [[ 0.7282, -0.7595,  0.2329]]], grad_fn=<StackBackward0>)