In [1]:
import torch
import torch.nn as nn

# PyTorch custom derivative test

A test of how PyTorch takes derivatives w.r.t. x during backpropagation when the network has two or more layers. Custom functions allow us to completely change PyTorch's differentiation behavior. Probably x has a special attribute inside PyTorch that marks it as an input, and PyTorch tracks that.

Please, pay attention to the following part of the code:

```
            grad_input = grad_output.mm(weight) + dumb # ADDED INTENTIONALLY

```
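This is an incorrect derivative, introduced intentionally to check how PyTorch behaves in this case. We can see that backpropagating to the first layer takes a total derivative w.r.t. x: the `grad_output` that the first layer receives is exactly the (tampered) `grad_input` returned by the second layer.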



In [2]:
# Fixed random tensor that backward() deliberately adds to the input gradient.
dumb = torch.randn([100, 64])


class LinearFunction(torch.autograd.Function):
    """Linear map ``input @ weight.T (+ bias)`` with a hand-written backward pass.

    The backward pass is wrong on purpose: it adds the module-level ``dumb``
    tensor to the gradient w.r.t. ``input`` so that the effect of a tampered
    gradient on upstream layers can be observed.
    """

    # forward and backward must both be static methods.
    @staticmethod
    def forward(ctx, input, weight, bias=None):
        """Compute the affine output and stash the operands for backward.

        ``bias`` is optional and may be None. The output shape is printed as a
        trace.
        """
        ctx.save_for_backward(input, weight, bias)
        output = torch.mm(input, weight.t())
        print(output.shape)
        if bias is None:
            return output
        # Broadcast the bias over the rows and add it in place.
        output.add_(bias.unsqueeze(0).expand_as(output))
        return output

    # forward produces a single output, so backward receives a single gradient.
    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradients for ``(input, weight, bias)``.

        An entry stays None when the matching forward argument does not need a
        gradient (these checks only save work; returning a gradient that is not
        needed would not be an error). The incoming gradient, and the shape of
        the input gradient when it is computed, are printed as a trace.
        """
        print('grad_ouput', grad_output)
        saved_input, weight, saved_bias = ctx.saved_tensors

        grad_input = None
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight) + dumb # ADDED INTENTIONALLY
            print(grad_input.shape)

        grad_weight = grad_output.t().mm(saved_input) if ctx.needs_input_grad[1] else None

        # The bias test comes first so that needs_input_grad[2] is only read
        # when a bias was actually passed to forward.
        grad_bias = None
        if saved_bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0).squeeze(0)

        return grad_input, grad_weight, grad_bias


In [3]:
class Linear(torch.nn.Module):
    """Fully connected layer whose computation is delegated to ``LinearFunction``.

    Args:
        input_features: size of each input sample.
        output_features: size of each output sample.
        bias: when False the layer has no additive bias and ``self.bias`` is
            registered as None.

    Attributes:
        weight: parameter of shape ``(output_features, input_features)``,
            initialised uniformly in [-0.1, 0.1].
        bias: parameter of shape ``(output_features,)`` initialised the same
            way, or None.
    """

    def __init__(self, input_features, output_features, bias=True):
        super(Linear, self).__init__()
        self.input_features = input_features
        self.output_features = output_features

        # nn.Parameter is a special kind of Tensor that is automatically
        # registered as a parameter of the Module once it is assigned as an
        # attribute, so it shows up in .parameters() and is converted by e.g.
        # .cuda(). nn.Parameters require gradients by default.
        self.weight = nn.Parameter(torch.empty(output_features, input_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(output_features))
        else:
            # Always register every possible parameter; optional ones may be
            # None.
            self.register_parameter('bias', None)

        # Not a very smart way to initialize weights.
        nn.init.uniform_(self.weight, -0.1, 0.1)
        # Test the registered parameter, not the boolean flag: the flag is
        # never None, so checking it would dereference a missing bias when
        # bias=False.
        if self.bias is not None:
            nn.init.uniform_(self.bias, -0.1, 0.1)

    def forward(self, input):
        """Apply the custom autograd function to ``input``."""
        return LinearFunction.apply(input, self.weight, self.bias)

    def extra_repr(self):
        """Return the layer summary shown when the module is printed."""
        return 'in_features={}, out_features={}, bias={}'.format(
            self.input_features, self.output_features, self.bias is not None
        )


In [4]:
# Standalone custom Linear layer (3 inputs -> 64 outputs); the FC model below builds its own layers.
fc = Linear(3,64)

In [5]:
# Input batch: 100 identical rows of three ones (created without requires_grad).
tensor = torch.ones([100,3])

In [6]:
# Module-level slot that FC.forward overwrites with the first layer's output.
first_layer_x = 1


class FC(torch.nn.Module):
    """Two stacked custom ``Linear`` layers (3 -> 64 -> 128) with verbose tracing."""

    def __init__(self):
        super(FC, self).__init__()
        self.fc1 = Linear(3, 64, bias=True)
        self.fc2 = Linear(64, 128, bias=True)

    def forward(self, x):
        """Run both layers, printing each intermediate result.

        Side effect: the first layer's output is also stored in the global
        ``first_layer_x`` so it can be inspected after the call.
        """
        global first_layer_x

        hidden = self.fc1(x)
        print('A first layer output', hidden)
        first_layer_x = hidden
        print('First layer x', first_layer_x)
        print('x', hidden)

        result = self.fc2(hidden)
        print('A second layer output', result)
        return result
        

In [7]:
# Instantiate the two-layer network defined above.
model = FC()

In [8]:
# Plain SGD over all model parameters (lr = 0.001); no optimizer step is taken in the cells shown here.
optimizer = torch.optim.SGD(model.parameters(), lr = 0.001)

In [9]:
# Mean-squared-error loss with its default settings.
criterion = torch.nn.MSELoss()

In [10]:
# Materialize the parameters as a list for indexing; the cells below use
# param[0] / param[1] as fc1's weight / bias and param[2] as fc2's weight.
param = list(model.parameters())

In [11]:
# Recompute the first layer by hand (x @ W1^T + b1); the result should match
# the 'A first layer output' printed by the model's forward pass.
tensor.matmul(param[0].T) + param[1]

tensor([[-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        ...,
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155]],
       grad_fn=<AddBackward0>)

In [12]:
# Forward pass; the custom layers print shapes and intermediate activations as a side effect.
output = model(tensor)

torch.Size([100, 64])
A first layer output tensor([[-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        ...,
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155]],
       grad_fn=<LinearFunctionBackward>)
First layer x tensor([[-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        ...,
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155]],
       grad_fn=<LinearFunctionBack

In [13]:
# Show the network output (the second layer's result).
output

tensor([[ 0.0616, -0.0346, -0.0717,  ...,  0.0043, -0.0171,  0.1524],
        [ 0.0616, -0.0346, -0.0717,  ...,  0.0043, -0.0171,  0.1524],
        [ 0.0616, -0.0346, -0.0717,  ...,  0.0043, -0.0171,  0.1524],
        ...,
        [ 0.0616, -0.0346, -0.0717,  ...,  0.0043, -0.0171,  0.1524],
        [ 0.0616, -0.0346, -0.0717,  ...,  0.0043, -0.0171,  0.1524],
        [ 0.0616, -0.0346, -0.0717,  ...,  0.0043, -0.0171,  0.1524]],
       grad_fn=<LinearFunctionBackward>)

In [14]:
# MSE against an all-ones target with the same shape as the output, [100, 128].
loss = criterion(output, torch.ones([100,128]))

In [15]:
# Show the scalar loss value.
loss

tensor(1.0228, grad_fn=<MseLossBackward>)

Loss calculation according to: https://pytorch.org/docs/stable/_modules/torch/nn/functional.html

In [16]:
# Recompute the MSE by hand; it should equal the criterion's loss above.
torch.mean(torch.pow(output - torch.ones([100, 128]), 2))

tensor(1.0228, grad_fn=<MeanBackward0>)

The derivative of `torch.mean(torch.pow(output - torch.ones([100, 128]), 2))` w.r.t. `output` is `2 * (output - torch.ones([100, 128])) / 12800`, i.e. `(output - torch.ones([100, 128])) / 6400`.

In [17]:
# Number of output elements (100 * 128 = 12800) divided by the factor 2 that comes from d/dy (y - 1)^2.
100*128/2

6400.0

In [18]:
# Hand-computed gradient of the MSE loss w.r.t. output: 2 * (output - 1) / 12800.
(output - torch.ones([100, 128]))/6400

tensor([[-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        ...,
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001]],
       grad_fn=<DivBackward0>)

In [19]:
# Keep the hand-computed loss gradient for comparison with the grad_output printed during backward().
loss_gradient = ((output - torch.ones([100, 128]))/6400)

In [20]:
# Show the hand-computed loss gradient.
loss_gradient

tensor([[-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        ...,
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001]],
       grad_fn=<DivBackward0>)

In [21]:
# Backpropagate; LinearFunction.backward prints the grad_output that each layer receives.
loss.backward()

grad_ouput tensor([[-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        ...,
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001],
        [-0.0001, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0001]])
torch.Size([100, 64])
grad_ouput tensor([[ 0.0914, -1.4854,  0.1134,  ..., -0.2377,  1.0037, -0.7683],
        [ 0.4085,  0.1006, -0.6509,  ...,  1.7163,  0.9196, -0.5435],
        [-0.5303,  0.8741,  1.5429,  ..., -1.2192,  0.7187,  0.7971],
        ...,
        [-2.1106,  0.2159, -0.4652,  ...,  0.4250, -1.3479, -0.2294],
        [ 0.9369, -0.7744,  2.0606,  ..., -0.8292, -0.5899, -0.1206],
        [-1.0319,  0.6146, -0.3172,  ...,  0.2297, -1.4789,  1.0998]])


In [28]:
# Reproduce by hand the intentionally perturbed grad_input returned by the
# second layer: dL/d(output) @ W2 + dumb. Compare the last expression with the
# second 'grad_ouput' tensor printed during backward().
print(loss_gradient.shape)
print(param[2])
print(first_layer_x)
loss_gradient.matmul(param[2]) + dumb



torch.Size([100, 128])
Parameter containing:
tensor([[ 0.0806, -0.0817, -0.0623,  ...,  0.0449,  0.0076, -0.0028],
        [ 0.0179, -0.0492,  0.0152,  ..., -0.0833,  0.0754, -0.0307],
        [ 0.0373,  0.0031, -0.0319,  ...,  0.0046, -0.0867,  0.0502],
        ...,
        [-0.0001,  0.0490,  0.0659,  ..., -0.0413, -0.0097, -0.0660],
        [ 0.0660, -0.0291, -0.0094,  ..., -0.0817, -0.0620,  0.0590],
        [ 0.0987,  0.0350,  0.0048,  ..., -0.0320, -0.0627, -0.0613]],
       requires_grad=True)
tensor([[-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        ...,
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155],
        [-0.2975,  0.0106, -0.1670,  ...,  0.1100, -0.0406, -0.0155]],
       grad_fn=<LinearFunctionBackward>)


tensor([[ 0.0914, -1.4854,  0.1134,  ..., -0.2377,  1.0037, -0.7683],
        [ 0.4085,  0.1006, -0.6509,  ...,  1.7163,  0.9196, -0.5435],
        [-0.5303,  0.8741,  1.5429,  ..., -1.2192,  0.7187,  0.7971],
        ...,
        [-2.1106,  0.2159, -0.4652,  ...,  0.4250, -1.3479, -0.2294],
        [ 0.9369, -0.7744,  2.0606,  ..., -0.8292, -0.5899, -0.1206],
        [-1.0319,  0.6146, -0.3172,  ...,  0.2297, -1.4789,  1.0998]],
       grad_fn=<AddBackward0>)

In [27]:
# Print every model parameter.
for par in model.parameters():
    print(par)

Parameter containing:
tensor([[-0.0478, -0.0918, -0.0705],
        [ 0.0702,  0.0622, -0.0562],
        [-0.0820, -0.0493,  0.0142],
        [-0.0884,  0.0755, -0.0355],
        [-0.0109,  0.0869, -0.0419],
        [ 0.0202,  0.0658, -0.0808],
        [ 0.0736, -0.0859, -0.0946],
        [ 0.0393, -0.0757,  0.0435],
        [-0.0081, -0.0914,  0.0919],
        [ 0.0432, -0.0647, -0.0833],
        [-0.0751, -0.0023,  0.0267],
        [-0.0642,  0.0523,  0.0440],
        [-0.0812,  0.0039, -0.0312],
        [-0.0841, -0.0803, -0.0699],
        [-0.0367,  0.0693, -0.0602],
        [ 0.0396,  0.0787,  0.0327],
        [-0.0245, -0.0190, -0.0222],
        [-0.0507,  0.0758, -0.0181],
        [-0.0978, -0.0492, -0.0030],
        [ 0.0631, -0.0086, -0.0760],
        [-0.0721, -0.0774, -0.0514],
        [ 0.0171, -0.0902, -0.0275],
        [-0.0265, -0.0178, -0.0691],
        [ 0.0136,  0.0746,  0.0173],
        [ 0.0158, -0.0189,  0.0811],
        [ 0.0393, -0.0257, -0.0180],
        [-0.0094

In [23]:
import torch

In [2]:
# Second, independent check that uses only built-in autograd: a squared linear map.
# w and b are Parameters (tracked by autograd); x is a plain tensor.
w = torch.nn.Parameter(torch.randn([100,64]))
x = torch.randn([3,64])
b = torch.nn.Parameter(torch.randn([100]))
# output = (x @ w^T + b)^2, shape [3, 100].
output = (x.matmul(w.T) + b)**2

In [3]:
# MSE against an all-ones target, then backpropagate to populate w.grad and b.grad.
criterion = torch.nn.MSELoss()
loss = criterion(output, torch.ones([3,100]))
loss.backward()

In [4]:
# Hand-computed dL/d(output) for the MSE: 2 * (output - 1) / (3 * 100) = (output - 1) / 150.
with torch.no_grad():
    loss_gradient = ((output - torch.ones([3, 100]))/150)

In [5]:
# Show the hand-computed loss gradient.
print(loss_gradient)

tensor([[ 6.1762e-02, -6.3776e-03,  7.6651e-01,  6.4886e-01,  4.2851e-01,
          4.1092e-01,  7.7465e-01,  5.7314e-02,  4.7015e-01,  2.6201e-01,
          1.4099e-01,  8.8756e-02,  1.6239e-02,  3.1141e-02,  1.4890e-01,
          1.1460e-03,  4.6536e-02,  1.1856e+00,  2.2837e-01,  1.1026e+00,
          3.4822e-02, -6.0711e-03,  1.1283e-01,  1.9060e-01,  4.4046e-02,
          4.0397e-02,  1.2414e-01,  1.3519e-01,  3.5951e-01,  1.7745e+00,
          2.0226e-01,  2.9339e-01, -1.1761e-03,  7.1945e-01,  1.8567e-01,
          2.1091e+00,  2.3302e-02,  1.2636e+00,  1.2043e+00,  1.1705e+00,
          1.5859e+00,  1.8218e-01,  3.1120e-02,  1.1143e+00,  4.4103e-02,
          6.7623e-01,  1.7845e+00,  1.0779e+00,  1.2061e+00,  1.2667e+00,
          3.4298e-01,  3.3192e-01,  1.0344e+00,  1.1039e-02,  1.8017e-01,
          4.8738e-01,  6.7580e-02, -5.2218e-03,  1.1003e+00,  3.3413e-01,
          2.7089e-01,  1.4560e+00,  4.8384e-02,  1.8120e-01,  8.7348e-02,
          1.3658e-01,  3.6332e-01,  1.

In [33]:
# Chain rule by hand for b: dL/db = sum over the batch of dL/d(output) * 2 * (x @ w^T + b).
# The printed vector should match b.grad.
with torch.no_grad():
    print(loss_gradient.shape)
    print(((loss_gradient*2*(x.matmul(w.T) + b))).sum(0))


torch.Size([3, 100])
tensor([-4.3873e-01, -2.9728e+01, -4.9838e+01,  1.2509e+01, -4.6836e+00,
        -2.2341e+01, -1.6781e+01,  3.9827e-01,  7.9640e+00,  5.8687e+01,
         1.6957e+01, -5.2180e+00,  3.3703e-01,  2.0096e+01, -3.9937e+00,
        -1.5398e+01, -1.0486e-01,  3.1658e+01,  2.7188e+00, -6.2306e+00,
        -8.4976e+00, -9.8287e-01, -3.4577e+01, -8.3390e-01, -2.1032e-01,
         1.8376e+00,  2.4963e+00,  4.6461e+01,  7.9930e+00,  5.8139e+01,
        -2.3670e+01, -4.0903e+01,  2.5247e+00, -2.7125e+00,  2.0858e+00,
        -6.3743e+01, -1.2298e+01, -3.2662e+01, -1.0005e+01,  1.2866e+01,
         1.0230e+01,  3.1542e+00, -1.4912e-01, -1.8328e+01,  6.3512e+00,
        -2.5274e+01,  6.2754e+01, -2.7495e+01, -2.6257e+01,  3.5017e+01,
         4.8046e+01,  5.1212e+00,  1.6375e+01,  1.8017e+01, -3.2962e-01,
        -1.8105e+01,  3.7614e+00, -1.0906e+00, -4.1908e+01,  2.7313e+00,
         6.2380e+00,  3.9891e+01,  1.4830e+00,  1.1020e+02, -6.4700e-01,
         1.3245e+00, -5.2067e+

In [31]:
# Gradient of the loss w.r.t. b as computed by autograd.
b.grad

tensor([-4.3873e-01, -2.9728e+01, -4.9838e+01,  1.2509e+01, -4.6836e+00,
        -2.2341e+01, -1.6781e+01,  3.9827e-01,  7.9640e+00,  5.8687e+01,
         1.6957e+01, -5.2180e+00,  3.3703e-01,  2.0096e+01, -3.9937e+00,
        -1.5398e+01, -1.0486e-01,  3.1658e+01,  2.7188e+00, -6.2306e+00,
        -8.4976e+00, -9.8287e-01, -3.4577e+01, -8.3390e-01, -2.1032e-01,
         1.8376e+00,  2.4963e+00,  4.6461e+01,  7.9930e+00,  5.8139e+01,
        -2.3670e+01, -4.0903e+01,  2.5247e+00, -2.7125e+00,  2.0858e+00,
        -6.3743e+01, -1.2298e+01, -3.2662e+01, -1.0005e+01,  1.2866e+01,
         1.0230e+01,  3.1542e+00, -1.4912e-01, -1.8328e+01,  6.3512e+00,
        -2.5274e+01,  6.2754e+01, -2.7495e+01, -2.6257e+01,  3.5017e+01,
         4.8046e+01,  5.1212e+00,  1.6375e+01,  1.8017e+01, -3.2962e-01,
        -1.8105e+01,  3.7614e+00, -1.0906e+00, -4.1908e+01,  2.7313e+00,
         6.2380e+00,  3.9891e+01,  1.4830e+00,  1.1020e+02, -6.4700e-01,
         1.3245e+00, -5.2067e+01, -2.1265e+01,  5.8

In [31]:
# The last autograd node recorded for output is the power op (the **2 above).
output.grad_fn

<PowBackward0 at 0x7fe4207078d0>