# A Simple RNN from Scratch
---

*COSCI 223 - Machine Learning 3*

*Prepared by Sebastian C. Ibañez*

<a href="https://colab.research.google.com/github/aim-msds/msds2023-ml3/blob/main/notebooks/rnn/01-simple-rnn.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" style="float: left;"></a><br>

## Building A Simple RNN from Scratch with PyTorch
---

In this section, we'll use PyTorch's primitive operations to set up a simple RNN.

In [1]:
import torch

In [2]:
# Build a toy 1-D sequence holding the values 1.0 through seq_len
seq_len = 5  # number of timesteps in the sequence

data = torch.arange(1, seq_len + 1, dtype=torch.float32)

# Show the shape first, then the values
print(data.shape, data, sep='\n')

torch.Size([5])
tensor([1., 2., 3., 4., 5.])


Let's reshape the data into a more general form.

In [3]:
# Add explicit batch and feature axes so the sequence is laid out as
# (batch, timesteps, feature)
batch_size = 1
input_size = 1  # features per timestep

x = data.reshape(batch_size, seq_len, input_size)

# Show the shape first, then the values
print(x.shape, x, sep='\n')

torch.Size([1, 5, 1])
tensor([[[1.],
         [2.],
         [3.],
         [4.],
         [5.]]])


Next, let's initialize the hidden state.

In [4]:
# Initialize hidden state
hidden_size = 4
# There is one hidden vector per sequence in the batch, so the leading
# dimension is batch_size (not input_size; the two only coincide here
# because both equal 1).
h0 = torch.zeros((batch_size, hidden_size)) # shape=(batch_size, hidden size)

print(h0.shape)
print(h0)

torch.Size([1, 4])
tensor([[0., 0., 0., 0.]])


Now let's initialize the weights of the RNN,

$$h_t = \tanh(x_t W_{xh} + h_{t-1} W_{hh} + b_h)$$

In [5]:
# Draw reproducible random weights; the bias starts at zero
torch.manual_seed(0)
W_xh = torch.randn(input_size, hidden_size)   # input-to-hidden,  shape=(input_size, hidden size)
W_hh = torch.randn(hidden_size, hidden_size)  # hidden-to-hidden, shape=(hidden size, hidden size)
b_h = torch.zeros(hidden_size)                # hidden bias,      shape=(hidden size)

# Show each parameter's shape followed by its values
for weight in (W_xh, W_hh, b_h):
    print(weight.shape)
    print(weight)

torch.Size([1, 4])
tensor([[ 1.5410, -0.2934, -2.1788,  0.5684]])
torch.Size([4, 4])
tensor([[-0.3561,  0.4372,  0.4913, -0.2041],
        [-0.0885,  0.5239, -0.6659,  0.8504],
        [-1.0438, -1.3453,  0.7854,  0.9928],
        [-0.1932, -0.3090,  0.5026, -0.8594]])
torch.Size([4])
tensor([0., 0., 0., 0.])


Now for the forward pass!

In [6]:
# Forward pass (w/ tanh activation): apply the recurrence once per timestep,
# feeding each new hidden state back in as the previous state of the next step
h = h0
print(f'h0 = {h}')
for t, x_t in enumerate(x.unbind(dim=1)):  # x_t has shape (batch, feature)
    pre_activation = x_t @ W_xh + h @ W_hh + b_h
    h = torch.tanh(pre_activation)
    print(f'h{t+1} = {h}')

h0 = tensor([[0., 0., 0., 0.]])
h1 = tensor([[ 0.9123, -0.2853, -0.9747,  0.5142]])
h2 = tensor([[ 0.9988,  0.6723, -0.9996, -0.6054]])
h3 = tensor([[ 1.0000,  0.8938, -1.0000,  0.9218]])
h4 = tensor([[ 1.0000,  0.6597, -1.0000,  0.7797]])
h5 = tensor([[ 1.0000,  0.3970, -1.0000,  0.9115]])


Here's a more efficient implementation of the forward pass using only 1 matrix multiplication.

In [7]:
# Forward pass (more efficient)
# Stacking W_xh on top of W_hh lets each step do a single matmul on the
# concatenation [x_t, h]. The stacked matrix does not depend on t, so it is
# built once outside the loop instead of on every timestep.
W_cat = torch.cat((W_xh, W_hh), dim=0)  # shape=(input_size + hidden size, hidden size)

h = h0
print(f'h0 = {h}')
for t in range(seq_len):
    xh = torch.cat((x[:, t, :], h), dim=1)  # shape=(batch, input_size + hidden size)
    h = torch.tanh(xh@W_cat + b_h)
    print(f'h{t+1} = {h}')

h0 = tensor([[0., 0., 0., 0.]])
h1 = tensor([[ 0.9123, -0.2853, -0.9747,  0.5142]])
h2 = tensor([[ 0.9988,  0.6723, -0.9996, -0.6054]])
h3 = tensor([[ 1.0000,  0.8938, -1.0000,  0.9218]])
h4 = tensor([[ 1.0000,  0.6597, -1.0000,  0.7797]])
h5 = tensor([[ 1.0000,  0.3970, -1.0000,  0.9115]])


Finally, we can verify that our model specification is correct using PyTorch's built-in [RNN class](https://pytorch.org/docs/stable/generated/torch.nn.RNN.html).

Note that PyTorch uses a slightly different notation for their RNN equation:

$$h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})$$

In [8]:
import torch.nn as nn

# Re-seed so the module's default initialization is reproducible
torch.manual_seed(0)

# Build the RNN; batch_first=True makes it accept (batch, timestep, feature)
# input instead of the default (timestep, batch, feature)
rnn = nn.RNN(input_size, hidden_size, batch_first=True)

# Inspect the freshly initialized parameters as (name, tensor) pairs
for name, param in rnn.named_parameters():
    print((name, param))

('weight_ih_l0', Parameter containing:
tensor([[-0.0037],
        [ 0.2682],
        [-0.4115],
        [-0.3680]], requires_grad=True))
('weight_hh_l0', Parameter containing:
tensor([[-0.1926,  0.1341, -0.0099,  0.3964],
        [-0.0444,  0.1323, -0.1511, -0.0983],
        [-0.4777, -0.3311, -0.2061,  0.0185],
        [ 0.1977,  0.3000, -0.3390, -0.2177]], requires_grad=True))
('bias_ih_l0', Parameter containing:
tensor([ 0.1816,  0.4152, -0.1029,  0.3742], requires_grad=True))
('bias_hh_l0', Parameter containing:
tensor([-0.0806,  0.0529,  0.4527, -0.4638], requires_grad=True))


In [9]:
# Overwrite parameters to our random init
# Values are copied in place rather than rebinding fresh nn.Parameter objects:
# the module keeps its own contiguous parameters, and they do not share
# storage with W_xh / W_hh (nn.Parameter(W.T) would wrap a view of W).
# The transposes are needed because PyTorch's equation uses W_ih^T and W_hh^T.
with torch.no_grad():
    rnn.weight_ih_l0.copy_(W_xh.T)
    rnn.weight_hh_l0.copy_(W_hh.T)
    # Both PyTorch biases are zeroed so their sum equals our all-zero b_h.
    rnn.bias_hh_l0.zero_()
    rnn.bias_ih_l0.zero_()

In [10]:
# Forward pass
# Only the input is passed to the module; no initial hidden state is given.
output, h_n = rnn(x) # RNN returns a tuple of (all hidden states, last hidden state)
# The bare expression below makes the notebook display the hidden state of
# every timestep.
output

tensor([[[ 0.9123, -0.2853, -0.9747,  0.5142],
         [ 0.9988,  0.6723, -0.9996, -0.6054],
         [ 1.0000,  0.8938, -1.0000,  0.9218],
         [ 1.0000,  0.6597, -1.0000,  0.7797],
         [ 1.0000,  0.3970, -1.0000,  0.9115]]], grad_fn=<TransposeBackward1>)

In [11]:
# Display the last hidden state returned by the RNN call
h_n

tensor([[[ 1.0000,  0.3970, -1.0000,  0.9115]]], grad_fn=<StackBackward0>)