In [27]:
import torch

In [45]:
# Automatic Differentiation with torch.autograd

# When training neural networks, the most frequently used algorithm is back propagation. 
# In this algorithm, parameters (model weights) are adjusted according to the gradient 
# of the loss function with respect to the given parameter.

# To compute those gradients, PyTorch has a built-in differentiation engine called 
# torch.autograd. It supports automatic computation of gradients for any computational graph.

# Consider the simplest one-layer neural network, with input x, parameters w and b, and 
# some loss function. It can be defined in PyTorch in the following manner:

# Seed the global RNG so the randomly initialised parameters (and every
# number printed further down) are the same on each Restart & Run All.
torch.manual_seed(0)

x = torch.ones(5)  # input tensor
w = torch.randn(5, 3, requires_grad=True)  # weights; tracked by autograd
b = torch.randn(3, requires_grad=True)  # bias; tracked by autograd
y = torch.zeros(3)  # expected output

In [46]:
# Forward pass of the single linear layer: multiply the input by the weights, then add the bias.
z = x @ w + b

In [47]:
# Binary cross-entropy evaluated directly on the raw outputs z against the expected values y.
loss = torch.nn.functional.binary_cross_entropy_with_logits(input=z, target=y)

In [48]:
# Display the loss tensor; the grad_fn shown in its repr indicates it is attached to the autograd graph.
loss

tensor(1.3836, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [49]:
# In this network, w and b are parameters, which we need to optimize. 
# Thus, we need to be able to compute the gradients of loss function 
# with respect to those variables. In order to do that, we set the 
# requires_grad property of those tensors.

# A function that we apply to tensors to construct computational graph 
# is in fact an object of class Function. This object knows how to 
# compute the function in the forward direction, and also how to compute 
# its derivative during the backward propagation step. A reference to 
# the backward propagation function is stored in grad_fn property of a 
# tensor. You can find more information about Function in the documentation.

# Report the backward-function objects referenced by the grad_fn of z and of loss, one per line.
print(
    f"Gradient function for z = {z.grad_fn}",
    f"Gradient function for loss = {loss.grad_fn}",
    sep="\n",
)

Gradient function for z = <AddBackward0 object at 0x000001612D616230>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x000001612BE16D40>


In [50]:
# Computing Gradients:
# To optimize the parameters of the neural network, we need to compute the derivatives 
# of our loss function with respect to parameters, namely, we need dloss/dw and dloss/db
# under some fixed values of x and y. To compute those derivatives, we call loss.backward(), 
# and then retrieve the values from w.grad and b.grad:

# Run the backward pass starting from the loss; afterwards the gradients are read from w.grad and b.grad.
loss.backward()

In [51]:
# We can only obtain the grad properties for the leaf nodes of
# the computational graph, which have requires_grad property 
# set to True. For all other nodes in our graph, gradients will not be available.

# We can only perform gradient calculations using backward once on 
# a given graph, for performance reasons. If we need to do several 
# backward calls on the same graph, we need to pass retain_graph=True 
# to the backward call.

# Show the gradients stored on the two leaf parameters, one tensor per line.
print(w.grad, b.grad, sep="\n")

tensor([[0.2567, 0.1565, 0.2903],
        [0.2567, 0.1565, 0.2903],
        [0.2567, 0.1565, 0.2903],
        [0.2567, 0.1565, 0.2903],
        [0.2567, 0.1565, 0.2903]])
tensor([0.2567, 0.1565, 0.2903])


In [52]:
# z was computed from w and b, which were both created with requires_grad=True; inspect the flag on z itself.
z.requires_grad

True

In [53]:
# Disabling Gradient Tracking:
# By default, all tensors with requires_grad=True are tracking their 
# computational history and support gradient computation. However, there 
# are some cases when we do not need to do that, for example, when we 
# have trained the model and just want to apply it to some input data, 
# i.e. we only want to do forward computations through the network. We 
# can stop tracking computations by surrounding our computation code with a torch.no_grad() block:
with torch.no_grad():
    # Same forward computation as for z, executed inside the no_grad context.
    z_nograd = x @ w + b
# Check whether the result is tracked by autograd.
print(z_nograd.requires_grad)
    

False


In [60]:
# Print z and its tracking flag on separate lines.
# Note: the "Z gradient" label reports the requires_grad flag, not a gradient value.
print(f"Z tensor: {z}", f"Z gradient: {z.requires_grad}", sep="\n")

Z tensor: tensor([ 1.2092, -0.1219,  1.9080], grad_fn=<AddBackward0>)
Z gradient: True


In [59]:
# Another way to achieve the same result is to use the detach() method on the tensor:
# Take a detached version of z, then print it together with its requires_grad flag.
z_no_grad = z.detach()
print(f"Z tensor: {z_no_grad}", f"Z gradient: {z_no_grad.requires_grad}", sep="\n")

Z tensor: tensor([ 1.2092, -0.1219,  1.9080])
Z gradient: False


In [None]:
# There are reasons you might want to disable gradient tracking:
# To mark some parameters in your neural network as frozen parameters.

# To speed up computations when you are only doing forward pass, because 
# computations on tensors that do not track gradients would be more efficient.