In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import torch
from torch import nn
from d2l import torch as d2l

import mytorch
from mytorch import nn as mynn
from models import MLP0, MLP1

In [None]:
# set up synthetic data
# Seed both RNGs so that Restart Kernel -> Run All reproduces the same data
# (numpy) and the same nn.Linear initialisation further down (torch).
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

N = 10           # number of samples
num_inputs = 7   # input features per sample
num_outputs = 2  # regression targets per sample

# numpy/our versions
W = np.random.rand(num_inputs, num_outputs)  # ground-truth weights, (num_inputs, num_outputs)
b = np.random.rand(num_outputs)              # ground-truth bias, (num_outputs,)
X = np.random.randn(N, num_inputs)           # inputs, (N, num_inputs)
# Linear model plus Gaussian noise; `b` broadcasts over the N rows.
Y = X @ W + b + 0.5 * np.random.randn(N, num_outputs)

# converted torch versions (independent float32 copies of the float64 arrays)
Xt = torch.tensor(X, dtype=torch.float32)
Wt = torch.tensor(W, dtype=torch.float32)
bt = torch.tensor(b, dtype=torch.float32)
Yt = torch.tensor(Y, dtype=torch.float32)

# MLP0

## Test `forward()`

In [None]:
# initialize model and fix weights to true values
# The model gets its own copies so that a later optimizer step (should it
# update arrays in place) cannot silently alter the ground-truth W and b.
mlp0 = MLP0(num_inputs, num_outputs)
mlp0.layers[0].W = W.copy()
mlp0.layers[0].b = b.copy()

# initialize torch model, loss, optimizer
# nn.Linear stores its weight as (out_features, in_features), hence the transpose.
# clone() gives each Parameter its own storage: nn.Parameter(t) shares memory
# with t, and torch's SGD updates parameters in place, which would otherwise
# overwrite Wt / bt and make this cell non-idempotent on re-run.
net = nn.Sequential(nn.Linear(num_inputs, num_outputs))
net[0].weight = nn.Parameter(Wt.T.clone())
net[0].bias = nn.Parameter(bt.clone())
optimizer = torch.optim.SGD(net.parameters(), lr=1, momentum=0.0)

# forward pass through both implementations on the same inputs
my_out = mlp0.forward(X)
torch_out = net(Xt)

# detach() yields a graph-free view for printing / numpy conversion
print('MyTorch:\n', my_out, '\n')
print('PyTorch:\n', torch_out.detach(), '\n')
print('Difference:', np.linalg.norm(my_out - torch_out.detach().numpy()))

## Test `backward()`

In [None]:
# --- MyTorch: MSE loss, then backpropagate through the single linear layer ---
my_mse_fn = mynn.MSELoss()
my_mse = my_mse_fn.forward(my_out, Y)
dLdZ = my_mse_fn.backward()
mlp0.backward(dLdZ)
my_dLdW, my_dLdb = mlp0.layers[0].dLdW, mlp0.layers[0].dLdb

# --- PyTorch: same loss on the same forward output ---
torch_loss_fn = nn.MSELoss()
optimizer.zero_grad()
torch_loss = torch_loss_fn(torch_out, Yt)
# retain_graph=True keeps the graph alive so backward() can be called again
# on this loss in the optimization-step cell.
torch_loss.backward(retain_graph=True)
torch_dLdW, torch_dLdb = (p.grad.data for p in (net[0].weight, net[0].bias))

# Show the gradients side by side; the torch weight gradient is transposed so
# both weight gradients are printed in the same orientation.
for label, grad in (
    ('MyTorch dLdW:\n', my_dLdW),
    ('PyTorch dLdW:\n', torch_dLdW.T),
    ('MyTorch dLdb:\n', my_dLdb),
    ('PyTorch dLdb:\n', torch_dLdb),
):
    print(label, grad, '\n')

# Norm of the element-wise gap between the two implementations.
print('Difference in dLdW:', np.linalg.norm(my_dLdW.T - torch_dLdW.numpy()))
print('Difference in dLdb:', np.linalg.norm(my_dLdb.flatten() - torch_dLdb.numpy()))

## Test a single optimization step

In [None]:
# my SGD step (same learning rate, lr=1, as the torch optimizer for this model)
my_optimizer = mytorch.optim.SGD(mlp0, lr=1)
my_optimizer.step()
my_Wk, my_bk = mlp0.layers[0].W, mlp0.layers[0].b

# torch SGD step: clear the gradients, recompute them from the retained
# graph, then apply one update
optimizer.zero_grad()
torch_loss.backward(retain_graph=True)
optimizer.step()
torch_Wk, torch_bk = net[0].weight.data, net[0].bias.data

# Updated parameters side by side (torch weight transposed to match orientation).
for label, param in (
    ('MyTorch Wk:\n', my_Wk),
    ('PyTorch Wk:\n', torch_Wk.T),
    ('MyTorch bk:\n', my_bk),
):
    print(label, param, '\n')
print('PyTorch bk:\n', torch_bk)

# Norm of the gap between the two implementations after one step.
print('Difference in Wk:', np.linalg.norm(my_Wk.T - torch_Wk.numpy()))
print('Difference in bk:', np.linalg.norm(my_bk.flatten() - torch_bk.numpy()))

# MLP1

## Test `forward()`

In [None]:
num_hiddens = 3  # width of the single hidden layer

# initialize torch model, loss, optimizer
net = nn.Sequential(nn.Linear(num_inputs, num_hiddens),
                    nn.ReLU(),
                    nn.Linear(num_hiddens, num_outputs),
                    nn.ReLU())
optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.0)

# initialize my network using torch W, b for each layer
# Tensor.numpy() returns an array that SHARES memory with the tensor, so take
# explicit copies: otherwise an in-place update on either side would leak into
# the other model and make the later weight comparison meaningless.
# Weights are transposed from torch's (out, in) layout to (in, out).
W0 = net[0].weight.detach().numpy().T.copy()
b0 = net[0].bias.detach().numpy().copy()
W1 = net[2].weight.detach().numpy().T.copy()
b1 = net[2].bias.detach().numpy().copy()

mlp1 = MLP1(num_inputs, num_outputs, num_hiddens)
mlp1.layers[0].W = W0
mlp1.layers[0].b = b0
mlp1.layers[1].W = W1
mlp1.layers[1].b = b1

# forward pass through both implementations on the same inputs
my_out = mlp1.forward(X)
torch_out = net(Xt)

# detach() yields a graph-free view for printing / numpy conversion
print('MyTorch:\n', my_out, '\n')
print('PyTorch:\n', torch_out.detach(), '\n')
print('Difference:', np.linalg.norm(my_out - torch_out.detach().numpy()))

## Test `backward()`

In [None]:
# --- MyTorch: MSE loss, then backpropagate through both layers ---
my_mse_fn = mynn.MSELoss()
my_mse = my_mse_fn.forward(my_out, Y)
dLdZ = my_mse_fn.backward()
mlp1.backward(dLdZ)
my_dLdW0 = mlp1.layers[0].dLdW
my_dLdb0 = mlp1.layers[0].dLdb
my_dLdW1 = mlp1.layers[1].dLdW
my_dLdb1 = mlp1.layers[1].dLdb

# --- PyTorch: same loss on the same forward output ---
optimizer.zero_grad()
torch_loss_fn = nn.MSELoss()
torch_loss = torch_loss_fn(torch_out, Yt)
# retain_graph=True keeps the graph alive so backward() can be called again
# on this loss in the optimization-step cell.
torch_loss.backward(retain_graph=True)
torch_dLdW0 = net[0].weight.grad.detach()
torch_dLdb0 = net[0].bias.grad.detach()
torch_dLdW1 = net[2].weight.grad.detach()
torch_dLdb1 = net[2].bias.grad.detach()

# torch keeps weight gradients in (out_features, in_features) layout, whereas
# the mytorch layers were given (in, out) weights, so transpose the torch
# gradients before subtracting (same convention as the weight comparison in
# the optimization-step cell). Without the transpose the shapes do not match.
print('Difference in dLdW0:', np.linalg.norm(my_dLdW0 - torch_dLdW0.numpy().T))
print('Difference in dLdb0:', np.linalg.norm(my_dLdb0.flatten() - torch_dLdb0.numpy()))
print('Difference in dLdW1:', np.linalg.norm(my_dLdW1 - torch_dLdW1.numpy().T))
print('Difference in dLdb1:', np.linalg.norm(my_dLdb1.flatten() - torch_dLdb1.numpy()))

## Test a single optimization step

In [None]:
# my SGD step
# Both optimizers must use the same learning rate or the updated weights
# cannot agree; read it from the torch optimizer instead of hard-coding it.
lr = optimizer.param_groups[0]['lr']
my_optimizer = mytorch.optim.SGD(mlp1, lr=lr)
my_optimizer.step()
my_Wk0 = mlp1.layers[0].W
my_bk0 = mlp1.layers[0].b
my_Wk1 = mlp1.layers[1].W
my_bk1 = mlp1.layers[1].b

# torch SGD step: clear the gradients, recompute them from the retained
# graph, then apply one update
optimizer.zero_grad()
torch_loss.backward(retain_graph=True)
optimizer.step()
torch_Wk0 = net[0].weight.detach()
torch_bk0 = net[0].bias.detach()
torch_Wk1 = net[2].weight.detach()
torch_bk1 = net[2].bias.detach()

# torch weights are (out, in); transpose to the (in, out) layout used by mlp1
print('Difference in Wk0:', np.linalg.norm(my_Wk0 - torch_Wk0.numpy().T))
print('Difference in bk0:', np.linalg.norm(my_bk0.flatten() - torch_bk0.numpy()))
print('Difference in Wk1:', np.linalg.norm(my_Wk1 - torch_Wk1.numpy().T))
print('Difference in bk1:', np.linalg.norm(my_bk1.flatten() - torch_bk1.numpy()))