In [2]:
import torch
# utility function we will use later when comparing manual gradients to PyTorch gradients
def cmp(variable_name: str, derivative_tensor: torch.Tensor, tensor: torch.Tensor) -> bool:
  """Compare a hand-derived gradient with the gradient autograd stored on ``tensor``.

  Prints one report line containing the exact-equality result, the
  ``torch.allclose`` result and the largest absolute element-wise difference.

  Args:
    variable_name: Label printed at the start of the report line.
    derivative_tensor: The manually computed gradient.
    tensor: Tensor whose ``.grad`` attribute holds the autograd gradient.

  Returns:
    The ``torch.allclose`` result (approximate match), not the exact one.

  Raises:
    ValueError: If ``tensor.grad`` is ``None``, i.e. no gradient has been
      stored on ``tensor``.
  """
  autograd_grad = tensor.grad
  if autograd_grad is None:
    # Without this guard the comparison below fails with an unrelated-looking
    # TypeError raised from inside torch.all.
    raise ValueError(
      f'{variable_name}: tensor.grad is None; run backward() first and call '
      'retain_grad() on non-leaf tensors before comparing'
    )
  ex = torch.all(derivative_tensor == autograd_grad).item()
  app = torch.allclose(derivative_tensor, autograd_grad)
  maxdiff = (derivative_tensor - autograd_grad).abs().max().item()
  print(f'{variable_name:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')
  return app

In [3]:
# Configuration for a tiny input -> hidden -> output ReLU network that is
# trained so that its output approaches `target_number` for every sample.
sample_size = 10
input_neurons = 1
hidden_neurons = 2
output_features = 1
iterations = 100
learning_rate = .1
target_number = 2
g = torch.Generator().manual_seed(2147483647) # for reproducibility (not consumed in this cell: all initial values are constants)

# Inputs are consecutive numbers starting at -5, laid out as (sample_size, -1).
# X is the result of .view() on the tensor that requires grad, so
# X.retain_grad() is called in the loop to have a .grad stored on it.
X = torch.arange(-5, sample_size * input_neurons -5, dtype=torch.float32, requires_grad=True).view(sample_size, -1)
W1 = torch.ones(input_neurons, hidden_neurons)
W2 = torch.ones(hidden_neurons, output_features)
b1 = torch.zeros(hidden_neurons, dtype=torch.float32)
b2 = torch.zeros(output_features, dtype=torch.float32)
parameters = [W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True

for i in range(iterations):
    # ---- forward pass ----
    # first layer
    h1 = X @ W1
    hb1 = h1 + b1
    a1 = hb1.relu()

    # second layer
    h2 = a1 @ W2
    hb2 = h2 + b2
    a2 = hb2.relu()
    losses = (target_number - a2).abs()
    total_loss = losses.mean()

    # ---- autograd backward pass ----
    for p in parameters:
        p.grad = None
    # Keep .grad on the intermediates so they can be compared with the manual values.
    for t in [total_loss, losses, h1, hb1, a1, h2, hb2, a2,  W2, b2, X]:
        t.retain_grad()
    total_loss.backward()

    # ---- manual backward pass (chain rule, output -> input) ----
    d_total_loss = torch.tensor(1)
    # mean() divides by the number of elements; numel() equals sample_size
    # while output_features == 1 and stays correct if that changes.
    d_losses = torch.ones_like(losses) * d_total_loss / losses.numel()
    d_a2 = -torch.sign(target_number-a2) * d_losses
    # ReLU passes the gradient only where its input was positive.
    d_hb2 = d_a2.clone()
    d_hb2[hb2<=0] = 0
    d_b2 = d_hb2.sum(0)
    d_h2 = d_hb2
    d_W2 = a1.T @ d_h2
    d_a1 = d_h2 @ W2.T
    # Same mask rule as layer 2: hb1 <= 0 exactly where a1 = relu(hb1) <= 0.
    d_hb1 = d_a1.clone()
    d_hb1[hb1<=0] = 0
    d_b1 = d_hb1.sum(0)
    d_h1 = d_hb1
    d_W1 = X.T @ d_h1
    d_X = d_h1 @ W1.T  # only needed by the disabled d_X check below

    # Same order as `parameters` so the two lists can be zipped for the update.
    manual_grads = [d_W1, d_b1, d_W2, d_b2]

    matches = [
        cmp('losses', d_losses, losses),
        cmp('a2', d_a2, a2),
        cmp('hb2', d_hb2, hb2),
        cmp('b2', d_b2, b2),
        cmp('h2', d_h2, h2),
        cmp('d_W2', d_W2, W2),
        cmp('d_a1', d_a1, a1),
        cmp('d_hb1', d_hb1, hb1),
        cmp('d_b1', d_b1, b1),
        cmp('d_h1', d_h1, h1),
        cmp('d_W1', d_W1, W1),
        # cmp('d_X', d_X, X)
        # NOTE(review): left disabled as in the original. X.grad is never reset
        # between iterations (only the parameters are), so it presumably
        # accumulates across backward() calls -- confirm before enabling.
    ]

    print(f'{i+1}/{iterations}: {total_loss=}')

    # ---- updates ----
    # Every check must pass before any parameter is modified.
    assert all(matches), 'manual gradient disagrees with autograd'
    # Plain gradient-descent step using the manual gradients; no_grad() keeps
    # the in-place update out of the autograd graph.
    with torch.no_grad():
        for p, m in zip(parameters, manual_grads):
            p -= learning_rate * m


losses          | exact: True  | approximate: True  | maxdiff: 0.0
a2              | exact: True  | approximate: True  | maxdiff: 0.0
hb2             | exact: True  | approximate: True  | maxdiff: 0.0
b2              | exact: True  | approximate: True  | maxdiff: 0.0
h2              | exact: True  | approximate: True  | maxdiff: 0.0
d_W2            | exact: True  | approximate: True  | maxdiff: 0.0
d_a1            | exact: True  | approximate: True  | maxdiff: 0.0
d_hb1           | exact: True  | approximate: True  | maxdiff: 0.0
d_b1            | exact: True  | approximate: True  | maxdiff: 0.0
d_h1            | exact: True  | approximate: True  | maxdiff: 0.0
d_W1            | exact: True  | approximate: True  | maxdiff: 0.0
1/100: total_loss=tensor(2.4000, grad_fn=<MeanBackward0>)
losses          | exact: True  | approximate: True  | maxdiff: 0.0
a2              | exact: True  | approximate: True  | maxdiff: 0.0
hb2             | exact: True  | approximate: True  | maxdiff: 0.0
b2  

In [4]:
# Scratch check with hard-coded tensors. The literals look like values copied
# from printed tensors -- TODO confirm where they came from.
# xg and xg_bad differ only in their last three rows (0.3656 vs 0.1656).
xg = torch.tensor([[v] for v in [0.0] * 6 + [-0.1656] + [0.3656] * 3])

xg_bad = torch.tensor([[v] for v in [0.0] * 6 + [-0.1656] + [0.1656] * 3])

wg = torch.tensor([[0.7280, 0.7280]])

hg = torch.tensor(
    [[0.0, 0.0]] * 6
    + [[-0.0910, -0.0910]]
    + [[0.0910, 0.0910]] * 3
)

# (1, 2) @ (2, 10) -> one row holding the dot product of wg with each row of hg.
wg @ hg.T

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.1325,  0.1325,
          0.1325,  0.1325]])

In [5]:
# 3x3 grid of the integers 1..9; show the element in row 0, column 1.
a = torch.arange(1, 10).reshape(3, 3)
a[0, 1]

tensor(2)

In [24]:
# 100M standard-normal samples scaled by 5. A dedicated seeded generator makes
# the draw (and the statistics computed from it) reproducible on a fresh kernel.
k = torch.randn(100_000_000, generator=torch.Generator().manual_seed(2147483647)) * 5

In [27]:
# Standard deviation of k, which an earlier cell builds as torch.randn(...) * 5.
torch.std(k)

tensor(5.0003)

In [37]:
# 3x2 float matrix with every entry equal to 3.
a = torch.full((3, 2), 3.0)
a

tensor([[3., 3.],
        [3., 3.],
        [3., 3.]])

In [41]:
# 2x2x2 float tensor with every entry equal to 2.
b = torch.full((2, 2, 2), 2.0)
b


tensor([[[2., 2.],
         [2., 2.]],

        [[2., 2.],
         [2., 2.]]])

In [42]:
# Matrix product of the 2-D `a` with the 3-D `b`; the recorded output below has
# one 3x2 result per leading index of b.
torch.matmul(a, b)

tensor([[[12., 12.],
         [12., 12.],
         [12., 12.]],

        [[12., 12.],
         [12., 12.],
         [12., 12.]]])