In [None]:
%load_ext autoreload
%autoreload 2

import time
import numpy as np
from torch import nn
import torch
from d2l import torch as d2l

import mytorch
from mytorch import nn as mynn
from mytorch.nn.activation import Tanh

from numpy.linalg import norm as norm

In [None]:
# Problem size. The data below is laid out as
# (seq_length, batch_size, num_inputs), which is the sequence-first layout
# nn.RNN uses when batch_first is left at its default.
batch_size = 128
num_inputs = 2
num_hiddens = 20
num_layers = 1
seq_length = 5

# Seed both RNG streams so the weights, the inputs and every difference
# printed further down are identical on each Restart & Run All.
seed = 0
torch.manual_seed(seed)
np.random.seed(seed)

# Reference implementation: a bias-free PyTorch RNN.
torch_rnn = nn.RNN(num_inputs, num_hiddens, num_layers, bias=False)
# Not referenced by the cells visible here; kept for interactive inspection.
all_weights = torch_rnn.all_weights[0]
# PyTorch stores weight_ih_l0 as (hidden_size, input_size) and weight_hh_l0
# as (hidden_size, hidden_size); transpose them before handing them over.
# .copy() gives my_rnn its own storage: without it the arrays are views of
# the torch parameters, so an in-place update to either model would silently
# change the other.
Wxh = torch_rnn.weight_ih_l0.detach().numpy().T.copy()
Whh = torch_rnn.weight_hh_l0.detach().numpy().T.copy()

# Implementation under test, loaded with exactly the same weights.
# NOTE(review): torch_rnn is built with bias=False -- confirm mynn.RNN has no
# bias term (or that it is zero), otherwise the comparison is not like-for-like.
my_rnn = mynn.RNN(num_inputs, num_hiddens)
my_rnn.Wxh = Wxh
my_rnn.Whh = Whh

# Random inputs and regression targets shared by both implementations.
# The NumPy arrays stay float64 while the torch copies are float32, so the
# reported differences may include float32 rounding noise rather than being
# exactly zero.
X = np.random.randn(seq_length, batch_size, num_inputs)
Y = np.random.randn(seq_length, batch_size, num_hiddens)
Xt = torch.tensor(X, dtype=torch.float32)
Yt = torch.tensor(Y, dtype=torch.float32)

# Compare `forward()`

In [None]:
# Push the same input sequence through the reference RNN and the RNN under
# test. Each call yields a pair that is unpacked as (outputs, final state).
torch_out, torch_state = torch_rnn(Xt)
my_out, my_state = my_rnn.forward(X)

# Report how far apart the two implementations are. With no `ord`/`axis`
# argument, `norm` flattens the array and returns its 2-norm, so a value
# near zero means the results agree element-wise.
forward_checks = (
    ('Difference in outputs:', my_out, torch_out),
    ('Difference in states:', my_state, torch_state),
)
for label, mine, reference in forward_checks:
    print(label, norm(mine - reference.data.numpy()))

# Compare `backward()`

In [None]:
# --- PyTorch reference gradients ---------------------------------------
# The optimizer is only used to clear stale gradients; no step() is taken
# in this cell, so the learning-rate and momentum settings never come into
# play here.
optimizer = torch.optim.SGD(torch_rnn.parameters(), lr=0.1, momentum=0.0)
optimizer.zero_grad()
torch_loss_fn = nn.MSELoss()
torch_loss = torch_loss_fn(torch_out, Yt)
# retain_graph=True keeps the graph built by the forward cell alive, so this
# cell can be executed again without re-running the forward pass.
torch_loss.backward(retain_graph=True)
# Transpose the gradients the same way the weights were transposed when they
# were copied into my_rnn.
torch_dLdWxh, torch_dLdWhh = (
    weight.grad.data.numpy().T
    for weight in (torch_rnn.weight_ih_l0, torch_rnn.weight_hh_l0)
)

# --- mytorch gradients ---------------------------------------------------
# forward() is called before backward() on the loss object, and the result
# of the loss backward() is then fed into the RNN's backward().
my_mse_fn = mynn.MSELoss()
my_mse = my_mse_fn.forward(my_out, Y)
dLdO = my_mse_fn.backward()
my_rnn.backward(dLdO)
my_dLdWxh, my_dLdWhh = my_rnn.dLdWxh, my_rnn.dLdWhh

# --- comparison ----------------------------------------------------------
# A norm close to zero means both implementations computed the same gradient.
backward_checks = (
    ('Difference in dLdWxh:', my_dLdWxh, torch_dLdWxh),
    ('Difference in dLdWhh:', my_dLdWhh, torch_dLdWhh),
)
for label, mine, reference in backward_checks:
    print(label, norm(mine - reference))