<a href="https://colab.research.google.com/github/arthurziegler/pytorch-deep-learning-course/blob/main/Notebooks/PyTorch_Understanding_RNN_Shapes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Things you should automatically know and have memorized
# N = number of samples
# T = sequence length
# D = number of input features
# M = number of hidden units
# K = number of output units

In [None]:
# Make some data: one random sequence of shape (N, T, D)
N, T, D = 1, 10, 3  # samples, sequence length, input features
M, K = 5, 2         # hidden units, output units
X = np.random.randn(N, T, D)

In [None]:
# Make an RNN
class SimpleRNN(nn.Module):
  """Single-layer tanh RNN followed by a linear layer applied at every time step.

  Args:
    n_inputs: number of input features per time step (D).
    n_hidden: number of hidden units (M).
    n_outputs: number of output units (K).
  """

  def __init__(self, n_inputs, n_hidden, n_outputs):
    super(SimpleRNN, self).__init__()
    self.D = n_inputs
    self.M = n_hidden
    self.K = n_outputs
    # batch_first=True -> inputs and outputs are laid out as (N, T, features)
    self.rnn = nn.RNN(
        input_size=self.D,
        hidden_size=self.M,
        nonlinearity='tanh',
        batch_first=True)
    self.fc = nn.Linear(self.M, self.K)

  def forward(self, X):
    """Map a batch of sequences of shape (N, T, D) to outputs of shape (N, T, K)."""
    # initial hidden state, shape (num_layers=1, N, M); created on the input's
    # device and with the input's dtype so the model keeps working after
    # .to(device) or .double()
    h0 = torch.zeros(1, X.size(0), self.M, device=X.device, dtype=X.dtype)

    # out holds the hidden state at every time step: (N, T, M)
    out, _ = self.rnn(X, h0)

    # apply the output layer to every time step -> (N, T, K)
    # (to keep only the final step h(T), use self.fc(out[:, -1, :]) instead)
    out = self.fc(out)
    return out

In [None]:
# Instantiate the model: D input features, M hidden units, K output units
model = SimpleRNN(n_inputs=D, n_hidden=M, n_outputs=K)

In [None]:
# Run the forward pass: X is float64 NumPy data, so wrap it as a tensor
# and cast to float32 before feeding the model
inputs = torch.from_numpy(X).float()
out = model(inputs)
out

tensor([[[ 0.3412,  0.3875],
         [ 0.2429,  0.3475],
         [ 0.2228, -0.0777],
         [ 0.1691,  0.1503],
         [-0.0489,  0.5948],
         [ 0.2913,  0.2771],
         [-0.0660,  0.7480],
         [ 0.4096,  0.2019],
         [ 0.3737,  0.1150],
         [ 0.2932, -0.0769]]], grad_fn=<AddBackward0>)

In [None]:
# (N, T, K): the linear layer was applied at every time step
out.shape

torch.Size([1, 10, 2])

In [None]:
# Save the PyTorch result as a NumPy array (detached from the autograd graph)
# so it can be compared with the manual NumPy computation below
Yhats_torch = out.detach().numpy()

In [None]:
# Grab the RNN parameters by name rather than by position in parameters()
W_xh, W_hh = model.rnn.weight_ih_l0, model.rnn.weight_hh_l0
b_xh, b_hh = model.rnn.bias_ih_l0, model.rnn.bias_hh_l0

In [None]:
# input-to-hidden weights are stored as (M, D)
W_xh.shape

torch.Size([5, 3])

In [None]:
# still a torch Parameter at this point (note requires_grad=True in the repr)
W_xh

Parameter containing:
tensor([[ 0.1991, -0.1659,  0.3862],
        [-0.1038,  0.1998,  0.2228],
        [ 0.2377,  0.2534, -0.4395],
        [-0.2058,  0.4410, -0.0693],
        [ 0.3054,  0.3011,  0.0590]], requires_grad=True)

In [None]:
# Convert to NumPy by reading the weight straight from the layer, so this cell
# can be re-run safely (re-assigning W_xh from itself fails the second time,
# because by then W_xh is already a NumPy array). detach() replaces legacy .data.
W_xh = model.rnn.weight_ih_l0.detach().numpy()
W_xh

array([[ 0.19913203, -0.16591424,  0.38615942],
       [-0.10381901,  0.19977957,  0.22281706],
       [ 0.23766339,  0.25335354, -0.4395464 ],
       [-0.20575394,  0.4410168 , -0.06931382],
       [ 0.30540317,  0.3010537 ,  0.05902731]], dtype=float32)

In [None]:
# Same conversion for the remaining RNN parameters; reading from the layer
# (instead of re-assigning each name from itself) keeps this cell re-runnable
b_xh = model.rnn.bias_ih_l0.detach().numpy()
W_hh = model.rnn.weight_hh_l0.detach().numpy()
b_hh = model.rnn.bias_hh_l0.detach().numpy()

In [None]:
# Did we do it right?
# Expect W_xh: (M, D), b_xh: (M,), W_hh: (M, M), b_hh: (M,)
W_xh.shape, b_xh.shape, W_hh.shape, b_hh.shape

((5, 3), (5,), (5, 5), (5,))

In [None]:
# Now get the FC (output) layer weight and bias, addressed by name
Wo, bo = model.fc.weight, model.fc.bias

In [None]:
# Convert the FC parameters to NumPy, reading from the layer so the cell is
# safe to re-run; expect Wo: (K, M) and bo: (K,)
Wo = model.fc.weight.detach().numpy()
bo = model.fc.bias.detach().numpy()
Wo.shape, bo.shape

((2, 5), (2,))

In [None]:
# Re-implement the forward pass in NumPy to see if we can replicate the output
x = X[0]                  # the one and only sample, shape (T, D)
h_last = np.zeros(M)      # h(0): initial hidden state
Yhats = np.zeros((T, K))  # one K-vector of outputs per time step

for t, x_t in enumerate(x):
  # h(t) = tanh(W_xh x(t) + b_xh + W_hh h(t-1) + b_hh)
  h = np.tanh(x_t.dot(W_xh.T) + b_xh + h_last.dot(W_hh.T) + b_hh)

  # the output layer is applied at every time step, matching the model
  Yhats[t] = h.dot(Wo.T) + bo

  # important: carry h forward as the previous state for the next step
  h_last = h

# print the outputs for all time steps
print(Yhats)

[[ 0.34121065  0.38745031]
 [ 0.24290176  0.34748834]
 [ 0.22276316 -0.0776559 ]
 [ 0.16914628  0.15032023]
 [-0.04893893  0.59482221]
 [ 0.29125264  0.27710128]
 [-0.06601527  0.74795336]
 [ 0.40959356  0.20187585]
 [ 0.37368858  0.11500061]
 [ 0.29322981 -0.07689164]]


In [None]:
# Check: the manual NumPy outputs should match the PyTorch outputs
# (within np.allclose's default tolerances)
np.allclose(Yhats, Yhats_torch)

True

In [None]:
# Bonus exercise: calculate the output for multiple samples at once (N > 1)