In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
from mytorch import nn as mynn
from mytorch.optim import SGD
from torch import nn
import torch
from d2l import torch as d2l

# MSE Testing

In [None]:
# Set up a synthetic linear-regression problem: Y = X W + b + Gaussian noise.
# The global NumPy RNG is seeded first so that "Restart Kernel -> Run All"
# reproduces the same data (and therefore the same printed losses/gradients).
SEED = 0
np.random.seed(SEED)

N = 10           # number of samples
num_inputs = 7   # input features per sample
num_outputs = 3  # regression targets per sample

# numpy/our versions
W = np.random.rand(num_inputs, num_outputs)  # true weights, (num_inputs, num_outputs)
b = np.random.rand(num_outputs, 1)           # true bias as a column, (num_outputs, 1)
X = np.random.randn(N, num_inputs)           # inputs, (N, num_inputs)
noise = 0.5 * np.random.randn(N, num_outputs)
# b.T has shape (1, num_outputs) and broadcasts over the N rows of X @ W
Y = X @ W + b.T + noise

# converted torch versions (float32 copies of the float64 numpy arrays)
Xt = torch.tensor(X, dtype=torch.float32)
Wt = torch.tensor(W, dtype=torch.float32)
bt = torch.tensor(b, dtype=torch.float32)
Yt = torch.tensor(Y, dtype=torch.float32)

In [None]:
# Our Linear layer, with its parameters overwritten by the known W and b
my_net = mynn.Linear(num_inputs, num_outputs)
my_net.W, my_net.b = W, b

# Reference torch layer carrying the same parameters.
# torch's nn.Linear keeps its weight as (out_features, in_features) and its
# bias as a 1-D vector, hence the transpose and the squeeze of the column.
net = nn.Linear(num_inputs, num_outputs)
net.weight = nn.Parameter(Wt.t())
net.bias = nn.Parameter(bt.squeeze(1))

# reference predictions, plus an optimizer over the torch parameters
torch_out = net(Xt)
optimizer = torch.optim.SGD(params=net.parameters(), lr=0.1, momentum=0.0)

## Test `forward()`

In [None]:
# Build both MSE losses: torch's reference and our implementation
torch_mse_fn = nn.MSELoss()
my_mse_fn = mynn.MSELoss()

# Evaluate both on the same predictions; ours receives them as a numpy array
torch_mse = torch_mse_fn(torch_out, Yt)
torch_out_np = torch_out.detach().numpy()
my_mse = my_mse_fn.forward(torch_out_np, Y)

# Show the two loss values side by side for comparison
print('Torch MSE:', torch_mse.data)
print('My MSE:', my_mse, '\n')

## Test `backward()`

In [None]:
# MSE: reference gradients from torch autograd
optimizer.zero_grad()  # clear any gradients left from a previous run of this cell
torch_out = net(Xt)
torch_mse = torch_mse_fn(torch_out, Yt)
torch_mse.backward(retain_graph=True)
torch_dLdW = net.weight.grad.data
torch_dLdb = net.bias.grad.data

# MSE: gradients from our implementation.
# my_mse_fn.backward() takes no arguments, so it presumably uses values cached
# by the forward() call in the forward-test cell -- run that cell first.
dLdZ = my_mse_fn.backward()
my_net.forward(X)
my_net.backward(dLdZ)
my_dLdW, my_dLdb = my_net.dLdW, my_net.dLdb

# Print each gradient; torch's weight gradient is transposed to line up with ours
for label, grad in (
    ('MyTorch dLdW:\n', my_dLdW),
    ('PyTorch dLdW:\n', torch_dLdW.T),
    ('MyTorch dLdb:\n', my_dLdb),
    ('PyTorch dLdb:\n', torch_dLdb),
):
    print(label, grad, '\n')

# Norm of the element-wise gaps between our gradients and torch's
dLdW_gap = my_dLdW.T - torch_dLdW.numpy()
dLdb_gap = my_dLdb.flatten() - torch_dLdb.numpy()
print('Difference in dLdW:', np.linalg.norm(dLdW_gap))
print('Difference in dLdb:', np.linalg.norm(dLdb_gap))

# CE Testing

In [None]:
# Set up synthetic classification data: random inputs with one-hot targets.
N = 10           # number of samples
num_inputs = 7   # input features per sample
num_outputs = 3  # number of classes

# numpy/our versions
W = np.random.rand(num_inputs, num_outputs)
b = np.random.rand(num_outputs, 1)
# Regenerate the inputs here so they always match this cell's N / num_inputs
# rather than silently reusing X left over from the MSE section.
X = np.random.randn(N, num_inputs)
# One-hot targets: draw a random class per sample, then pick that row of the identity
class_ids = np.random.choice(num_outputs, size=N)
Y = np.eye(num_outputs)[class_ids]

# converted torch versions (float32 copies of the float64 numpy arrays)
Xt = torch.tensor(X).float()
Wt = torch.tensor(W).float()
bt = torch.tensor(b).float()
Yt = torch.tensor(Y).float()

In [None]:
# Our Linear layer, with its parameters overwritten by the known W and b
my_net = mynn.Linear(num_inputs, num_outputs)
my_net.W, my_net.b = W, b

# Reference torch layer carrying the same parameters.
# torch's nn.Linear keeps its weight as (out_features, in_features) and its
# bias as a 1-D vector, hence the transpose and the squeeze of the column.
net = nn.Linear(num_inputs, num_outputs)
net.weight = nn.Parameter(Wt.t())
net.bias = nn.Parameter(bt.squeeze(1))

# reference logits, plus an optimizer over the torch parameters
torch_out = net(Xt)
optimizer = torch.optim.SGD(params=net.parameters(), lr=0.1, momentum=0.0)

## Test `forward()`

In [None]:
# Build both cross-entropy losses: torch's reference and our implementation
torch_ce_fn = nn.CrossEntropyLoss()
my_ce_fn = mynn.CrossEntropyLoss()

# Evaluate both on the same logits; ours receives them as a numpy array
torch_ce = torch_ce_fn(torch_out, Yt)
torch_out_np = torch_out.detach().numpy()
my_ce = my_ce_fn.forward(torch_out_np, Y)

# Show the two loss values side by side for comparison
print('Torch CE:', torch_ce.data)
print('My CE:', my_ce, '\n')

## Test `backward()`

In [None]:
# CE: reference gradients from torch autograd
optimizer.zero_grad()  # clear any gradients left from a previous run of this cell
torch_out = net(Xt)
torch_ce = torch_ce_fn(torch_out, Yt)
torch_ce.backward(retain_graph=True)
torch_dLdW = net.weight.grad.data
torch_dLdb = net.bias.grad.data

# CE: gradients from our implementation.
# my_ce_fn.backward() takes no arguments, so it presumably uses values cached
# by the forward() call in the forward-test cell -- run that cell first.
dLdZ = my_ce_fn.backward()
my_net.forward(X)
my_net.backward(dLdZ)
my_dLdW, my_dLdb = my_net.dLdW, my_net.dLdb

# Print each gradient; torch's weight gradient is transposed to line up with ours
for label, grad in (
    ('MyTorch dLdW:\n', my_dLdW),
    ('PyTorch dLdW:\n', torch_dLdW.T),
    ('MyTorch dLdb:\n', my_dLdb),
    ('PyTorch dLdb:\n', torch_dLdb),
):
    print(label, grad, '\n')

# Norm of the element-wise gaps between our gradients and torch's
dLdW_gap = my_dLdW.T - torch_dLdW.numpy()
dLdb_gap = my_dLdb.flatten() - torch_dLdb.numpy()
print('Difference in dLdW:', np.linalg.norm(dLdW_gap))
print('Difference in dLdb:', np.linalg.norm(dLdb_gap))