In [1]:

import torch
import torch.nn as nn
from IPython.display import Image
import numpy as np

In [2]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(2,2)
        self.s1 = nn.Sigmoid()
        self.fc2 = nn.Linear(2,2)
        self.s2 = nn.Sigmoid()
        self.fc1.weight = torch.nn.Parameter(torch.Tensor([[0.15,0.2],[0.250,0.30]]))
        self.fc1.bias = torch.nn.Parameter(torch.Tensor([0.35]))
        self.fc2.weight = torch.nn.Parameter(torch.Tensor([[0.4,0.45],[0.5,0.55]]))
        self.fc2.bias = torch.nn.Parameter(torch.Tensor([0.6]))

    def forward(self, x):
        x= self.fc1(x)
        x = self.s1(x)
        x= self.fc2(x)
        x = self.s2(x)
        return x

net = Net()
print(net)

Net(
  (fc1): Linear(in_features=2, out_features=2, bias=True)
  (s1): Sigmoid()
  (fc2): Linear(in_features=2, out_features=2, bias=True)
  (s2): Sigmoid()
)


In [3]:
# parameters: weight and bias
print(list(net.parameters()))
# input data
weight2 = list(net.parameters())[2]
print('weight2',weight2)
data = torch.Tensor([0.05,0.1]);data

[Parameter containing:
tensor([[0.1500, 0.2000],
        [0.2500, 0.3000]], requires_grad=True), Parameter containing:
tensor([0.3500], requires_grad=True), Parameter containing:
tensor([[0.4000, 0.4500],
        [0.5000, 0.5500]], requires_grad=True), Parameter containing:
tensor([0.6000], requires_grad=True)]
weight2 Parameter containing:
tensor([[0.4000, 0.4500],
        [0.5000, 0.5500]], requires_grad=True)


tensor([0.0500, 0.1000])

In [4]:
# output of last layer
out = net(data)
target = torch.Tensor([0.01,0.99])  # a dummy target, for example
criterion = nn.MSELoss()
loss = criterion(out, target); loss

tensor(0.2984, grad_fn=<MseLossBackward0>)

In [5]:
# A simple hook class that returns the input and output of a layer during forward/backward pass
class Hook():
    net = Net()

    def __init__(self, module, backward=False):
        if backward==False:
            self.hook = module.register_forward_hook(self.hook_fn)

        else:
            self.hook = module.register_backward_hook(self.hook_fn)
    def hook_fn(self, module, input, output):
        self.input = input
        self.output = output
    def close(self):
        self.hook.remove()

In [6]:
# register hooks on each layer
print(list(net._modules.items()))

[('fc1', Linear(in_features=2, out_features=2, bias=True)), ('s1', Sigmoid()), ('fc2', Linear(in_features=2, out_features=2, bias=True)), ('s2', Sigmoid())]


In [7]:


hookF = [Hook(layer[1]) for layer in list(net._modules.items())]
hookB = [Hook(layer[1],backward=True) for layer in list(net._modules.items())]
# run a data batch
out=net(data)
# backprop once to get the backward hook results
out.backward(torch.tensor([1,1],dtype=torch.float),retain_graph=True)
#! loss.backward(retain_graph=True)  # doesn't work with backward hooks,
#! since it's not a network layer but an aggregated result from the outputs of last layer vs target

print('***'*3+'  Forward Hooks Inputs & Outputs  '+'***'*3)
for hook in hookF:
    print(hook.input)
    print(hook.output)
    print('---'*17)
print('\n')
print('***'*3+'  Backward Hooks Inputs & Outputs  '+'***'*3)
for hook in hookB:
    print(hook.input)
    print(hook.output)
    print('---'*17)

*********  Forward Hooks Inputs & Outputs  *********
(tensor([0.0500, 0.1000]),)
tensor([0.3775, 0.3925], grad_fn=<AddBackward0>)
---------------------------------------------------
(tensor([0.3775, 0.3925], grad_fn=<AddBackward0>),)
tensor([0.5933, 0.5969], grad_fn=<SigmoidBackward0>)
---------------------------------------------------
(tensor([0.5933, 0.5969], grad_fn=<SigmoidBackward0>),)
tensor([1.1059, 1.2249], grad_fn=<AddBackward0>)
---------------------------------------------------
(tensor([1.1059, 1.2249], grad_fn=<AddBackward0>),)
tensor([0.7514, 0.7729], grad_fn=<SigmoidBackward0>)
---------------------------------------------------


*********  Backward Hooks Inputs & Outputs  *********
(tensor([0.0392, 0.0435]), tensor([0.0827]))
(tensor([0.0392, 0.0435]),)
---------------------------------------------------
(tensor([0.0392, 0.0435]),)
(tensor([0.1625, 0.1806]),)
---------------------------------------------------
(tensor([0.1868, 0.1755]), tensor([0.3623]))
(tensor([0.18



## What is the input and output of forward and backward pass?
Things to notice:
Because backward pass runs from back to the start, it's parameter order should be reversed compared to the forward pass. Therefore, to be it clearer, I'll use a different naming convention below.
For forward pass, previous layer of layer 2 is layer1; for backward pass, previous layer of layer 2 is layer 3.
Model output is the output of last layer in forward pass.
layer.register_backward_hook(module, input, output)

Input: previous layer's output
Output: current layer's output
layer.register_backward_hook(module, grad_out, grad_in)

#### Grad_in: gradient of model output wrt. layer output       # from forward pass
= a tensor that represent the error of each neuron in this layer (= gradient of model output wrt. layer output = how much it should be improved)
For the last layer: eg. [1,1] <=> gradient of model output wrt. itself, which means calculate all gradients as normal
It can also be considered as a weight map: eg. [1,0] turn off the second gradient; [2,1] put double weight on first gradient etc.
#### Grad_out: Grad_in * (gradient of layer output wrt. layer input)
= next layer's error(due to chain rule)
Check the print from the cell above to confirm and enhance your understanding!

In [8]:
# Confirm the calculations with the print result above
# the 4th layer - sigmoid
forward_output = np.array([0.7514, 0.7729])
grad_in = np.array([1,1])  # sigmoid layer
# grad of sigmoid(x) wrt x is: sigmoid(x)(1-sigmoid(x))
grad_out = grad_in*(forward_output*(1-forward_output)); grad_out

array([0.18679804, 0.17552559])

In [9]:
# the 3th layer - linear
print([0.1868, 0.1755])  # grad_input * (grad of Wx+b = (w1*x1+w2*x2)+b wrt W)
print(0.1868 + 0.1755)   # grad of Wx+b wrt b o

grad_in = torch.Tensor(grad_out)
grad_in.view(1,-1) @ weight2;grad_out  # grad of layer output wrt input: wx+b => w



[0.1868, 0.1755]
0.36229999999999996


array([0.18679804, 0.17552559])

In [10]:
# the 2nd layer - sigmoid
forward_output=np.array([0.5933, 0.5969])
grad_in=np.array([0.1625, 0.1806])
grad_in*(forward_output*(1-forward_output)) # grad * (grad of sigmoid(x) wrt x)

array([0.03921046, 0.04345424])

In [11]:
# gradient of loss wrt parameters
net.zero_grad()
loss.backward(retain_graph=True)
[print(p.grad) for p in net.parameters()]

tensor([[0.0004, 0.0009],
        [0.0005, 0.0010]])
tensor([0.0187])
tensor([[ 0.0822,  0.0827],
        [-0.0226, -0.0227]])
tensor([0.1004])


[None, None, None, None]

## Modify gradients with hooks
Hook function doesn't change gradients by default
But if return is called, the returned value will be the gradient output

#### Guided backpropagation with hooks - Visualize CNN (deconv)

In [12]:
class Guided_backprop():
    """
        Visualize CNN activation maps with guided backprop.

        Returns: An image that represent what the network learnt for recognizing
        the given image.

        Methods: First layer input that minimize the error between the last layers output,
        for the given class, and the true label(=1).

        ! Call visualize(image) to get the image representation
    """
    def __init__(self,model):
        self.model = model
        self.image_reconstruction = None
        self.activation_maps = []
        # eval mode
        self.model.eval()
        self.register_hooks()

    def register_hooks(self):

        def first_layer_hook_fn(module, grad_out, grad_in):
            """ Return reconstructed activation image"""
            print('grad_out', grad_out[0])
            self.image_reconstruction = grad_out[0]

        def forward_hook_fn(module, input, output):
            """ Stores the forward pass outputs (activation maps)"""
            self.activation_maps.append(output)

        def backward_hook_fn(module, grad_out, grad_in):
            """ Output the grad of model output wrt. layer (only positive) """

            # Gradient of forward_output wrt. forward_input = error of activation map:
            # for relu layer: grad of zero = 0, grad of identity = 1
            grad = self.activation_maps[-1] # corresponding forward pass output
            print('grad.shape',grad.shape)
            grad[grad>0] = 1 # grad of relu when > 0

            # set negative output gradient to 0 #!???
            positive_grad_out = torch.clamp(input=grad_out[0],min=0.0)
            print('positive_grad_out.shape',positive_grad_out.shape)

            # backward grad_out = grad_out * (grad of forward output wrt. forward input)
            new_grad_out = positive_grad_out * grad

            del self.forward_outputs[-1]

            # For hook functions, the returned value will be the new grad_out
            return (new_grad_out,)

        # !!!!!!!!!!!!!!!! change the modules !!!!!!!!!!!!!!!!!!
        # only conv layers, no flattened fc linear layers
        modules = list(self.model.features._modules.items())

        # register hooks to relu layers
        for name, module in modules:
            if isinstance(module, nn.ReLU):
                module.register_forward_hook(forward_hook_fn)
                module.register_backward_hook(backward_hook_fn)

        # register hook to the first layer
        first_layer = modules[0][1]
        first_layer.register_backward_hook(first_layer_hook_fn)

    def visualize(self, input_image, target_class):
        # last layer output
        model_output = self.model(input_image)
        print(model_output.shape)
        self.model.zero_grad()

        # only calculate gradients wrt. target class
        # set the other classes to 0: eg. [0,0,1]
        grad_target_map = torch.zeros(model_output.shape,
                                      dtype=torch.float)
        grad_target_map[0][target_class] = 0

        model_output.backward(grad_target_map)

        # Convert Pytorch variable to numpy array
        # [0] to get rid of the first channel (1,3,224,224)
        result = self.image_reconstruction.data.numpy()[0]
        return result

In [13]:
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader
from torchvision.models import vgg19
transform = transforms.Compose([transforms.Resize((224, 224)),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

# define a 1 image dataset
dataset = datasets.ImageFolder(root='~/PYTORCH/data4/', transform=transform)

# define the dataloader to load that single image
dataloader = DataLoader(dataset=dataset, shuffle=False, batch_size=1)

In [14]:
#vgg19()

In [15]:
model = vgg19()
back = Guided_backprop(model=model)

In [16]:
img, target = next(iter(dataloader))
img.shape

torch.Size([1, 3, 224, 224])

In [17]:
back.register_hooks()

In [18]:
back.visualize(img, target)



torch.Size([1, 1000])
grad.shape torch.Size([1, 512, 14, 14])
positive_grad_out.shape torch.Size([1, 512, 14, 14])


AttributeError: 'Guided_backprop' object has no attribute 'forward_outputs'

In [None]:
a = torch.tensor(2., requires_grad=True)
b = torch.tensor(3., requires_grad=True)
c = a*b

In [None]:
d = torch.tensor(4., requires_grad=True)
e = c * d

In [None]:
e.backward()


In [None]:
print(a, a.grad)
print(b, b.grad)
print(d, d.grad)

In [None]:
a = torch.tensor(2., requires_grad=True)
b = torch.tensor(3., requires_grad=True)
c = a*b

def c_hook(grad):
    print(grad)
    return grad + 2

c.register_hook(c_hook)
c.register_hook(lambda grad : print(grad))
c.retain_grad()

In [None]:
d = torch.tensor(4., requires_grad=True)
d.register_hook(lambda grad: grad + 100)
e = c * d

e.retain_grad()
e.register_hook(lambda grad: grad * 2)
e.retain_grad()

In [None]:
e.backward()

In [19]:
c.grad

NameError: name 'c' is not defined

In [20]:
class SumNet(nn.Module):
    def __init__(self):
        super(SumNet, self).__init__()

    @staticmethod
    def forward(a,b,c):
        d =  a+b+c
        return d

def forward_pre_hook(module, inputs):
    print('inputs', inputs)
    a,b = inputs
    return a + 10, b

def forward_hook(module, inputs, outputs):
    print(inputs)
    print("ssss",outputs)
    return outputs + 100

def main():
    sum_net = SumNet()

    sum_net.register_forward_pre_hook(forward_pre_hook)
    sum_net.register_forward_hook(forward_hook)

    a = torch.tensor(1., requires_grad=True)
    b = torch.tensor(2., requires_grad=True)
    c = torch.tensor(3., requires_grad=True)
    print(a.grad)

    d = sum_net(a,b,c=c)
    d.backward()
    print(a.grad)

    print('d', d)

main()



None
inputs (tensor(1., requires_grad=True), tensor(2., requires_grad=True))
(tensor(11., grad_fn=<AddBackward0>), tensor(2., requires_grad=True))
ssss tensor(16., grad_fn=<AddBackward0>)
tensor(1.)
d tensor(116., grad_fn=<AddBackward0>)


In [21]:
class MyMultiply(nn.Module):
    def __init__(self):
        super(MyMultiply, self).__init__()

    @staticmethod
    def forward(a,b,c):
        return (a**b) * c


def backward_hook(module, grad_inputsss, grad_outputsss):
    print('module', module)
    print('grad_input', grad_inputsss)
    print('grad_output', grad_outputsss)


def main():
    my_multiply = MyMultiply()
    my_multiply.register_backward_hook(backward_hook)

    a = torch.tensor(1., requires_grad=True)
    b = torch.tensor(2., requires_grad=True)
    c = torch.tensor(3., requires_grad=True)


    d = my_multiply(a,b,c=c)

    d.backward()
    print(a.grad)
    print(b.grad)
    print(c.grad)

main()





module MyMultiply()
grad_input (tensor(3.), tensor(1.))
grad_output (tensor(1.),)
tensor(6.)
tensor(0.)
tensor(1.)


In [22]:
a = torch.randn(4)
print(a)

torch.clamp(a, min=-0.5, max=0.56)



tensor([-0.7431,  0.6251, -1.1688, -1.1759])


tensor([-0.5000,  0.5600, -0.5000, -0.5000])

In [23]:
min = torch.linspace(-1, 1, steps=4)
print(min)
torch.clamp(a, min=min)

tensor([-1.0000, -0.3333,  0.3333,  1.0000])


tensor([-0.7431,  0.6251,  0.3333,  1.0000])