In [1]:
import torch
import numpy as np

In [23]:
# Draw three standard-normal samples and switch on gradient tracking, so every
# later operation involving x is recorded in the computational graph.
x = torch.randn(3).requires_grad_()
print(x)

tensor([-0.2716,  1.2864,  1.3919], requires_grad=True)


In [24]:
# Adding a constant to the tracked tensor x gives a result that carries a
# grad_fn (shown as AddBackward0 when printed) linking it back to x.
y = torch.add(x, 2)
print(y)

tensor([1.7284, 3.2864, 3.3919], grad_fn=<AddBackward0>)


In [25]:
# Element-wise square of y written as a product, so the recorded node is a
# multiplication (shown as MulBackward0 when printed).
z = torch.mul(y, y)
print(z)

tensor([ 2.9874, 10.8004, 11.5050], grad_fn=<MulBackward0>)


In [20]:
# backward() without arguments only works on a scalar output, so reduce the
# vector z to its mean first. The mean gets its own name instead of rebinding
# z: overwriting z made this cell fail when re-run and left the later
# z.backward(t) cell with a scalar instead of a vector.
z_mean = z.mean()
# retain_graph=True keeps the tensors saved for the backward pass, so the graph
# behind z can be back-propagated again afterwards.
z_mean.backward(retain_graph=True)  # d(z_mean)/dx, accumulated into x.grad
print(x.grad)  # x.grad is only populated once backward() has been called

tensor([ 0.6930, -0.0299,  0.9313])


#### `z.backward(t)`: when `z` is not a scalar, pass a tensor `t` of the same shape as `z` as the gradient argument. In practice the final output (the loss) is usually a scalar, so this form is rarely needed.

In [26]:
# backward() on a non-scalar tensor needs a gradient argument of the same
# shape as that tensor; autograd then computes the vector-Jacobian product
# of t with dz/dx.
# The vector-valued output is rebuilt from x here, so this cell does not rely
# on z still being a vector or on its graph not having been used by an earlier
# backward() call.
y = x + 2
z = y * y
t = torch.rand(z.shape)  # weights for the vector-Jacobian product, same shape as z
if x.grad is not None:
    # Gradients accumulate across backward() calls; clear any earlier result
    # so the printed value comes from this pass only.
    x.grad.zero_()
z.backward(t)
print(x.grad)

tensor([2.8817, 2.0288, 5.4211])


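#### Preventing PyTorch from tracking gradients, 3 ways:
1. `x.requires_grad_(False)` -> turns tracking off in place
2. `y = x.detach()` -> new tensor with the same values but no gradient tracking
3. `with torch.no_grad():` -> operations inside the block are not tracked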

In [32]:
# Method 1: switch gradient tracking off on the tensor itself.
x = torch.rand(4).requires_grad_(True)
print(x)
# Clearing the flag modifies x in place; the second print no longer shows
# requires_grad=True.
x.requires_grad = False
print(x)

tensor([0.6967, 0.9514, 0.9182, 0.9564], requires_grad=True)
tensor([0.6967, 0.9514, 0.9182, 0.9564])


In [34]:
# Method 2: detach() returns a tensor with the same values as x that is not
# tracked; x itself keeps requires_grad=True.
x = torch.rand(size=(4,), requires_grad=True)
print(x)
y = x.detach()
print(y)

tensor([0.2919, 0.2432, 0.4247, 0.1595], requires_grad=True)
tensor([0.2919, 0.2432, 0.4247, 0.1595])


In [39]:
# Method 3: run operations inside a torch.no_grad() block.
x = torch.rand(3, requires_grad=True)
with torch.no_grad():
    # Results computed in this block carry no grad_fn.
    y = torch.add(x, 2)
    z = torch.mul(y, y)
    print(z)
# The same computation outside the block is tracked again: z1 prints with a
# grad_fn even though its values match z.
y1 = torch.add(x, 2)
z1 = torch.mul(y1, y1)
print(z1)

tensor([5.4446, 5.4050, 4.2906])
tensor([5.4446, 5.4050, 4.2906], grad_fn=<MulBackward0>)


In [43]:
# Gradients must be reset after every iteration. With an optimizer the usual
# pattern is optimizer.step() (the weight update) followed by
# optimizer.zero_grad().
weights = torch.ones(4, requires_grad=True)
for epoch in range(5):
    model_op = weights.mul(5).sum()
    model_op.backward()

    print(weights.grad)
    # backward() adds to the existing .grad instead of replacing it, so clear
    # it in place; otherwise each iteration would show a growing, wrong value.
    weights.grad.zero_()

tensor([5., 5., 5., 5.])
tensor([5., 5., 5., 5.])
tensor([5., 5., 5., 5.])
tensor([5., 5., 5., 5.])
tensor([5., 5., 5., 5.])


#### Backprop -> the aim is to get the gradient of the loss with respect to the input parameters and then apply an optimisation technique. The computational graph tracks local gradients, which are later combined using the chain rule. (Each node tracks the gradient of the current operation with respect to its previous input.)

![alt text](.png "Title")

In [45]:
# One-weight example: input x, target y, and a trainable weight w.
x = torch.tensor(1.0)
y = torch.tensor(2.0)
w = torch.tensor(1.0, requires_grad=True)

# Forward pass: prediction, then squared-error loss.
y_hat = torch.mul(w, x)
loss = torch.pow(y_hat - y, 2)
print(loss)
# Expected loss: (1 - 2)**2 = 1

# Backward pass: fills in the gradient of every tensor that requires grad.
loss.backward()
# Chain rule: dloss/dw = dloss/dy_hat * dy_hat/dw = 2*(y_hat - y) * x = -2
print(w.grad)
# Next steps in a training loop:
#   1. update the weight using the gradient,
#   2. clear the gradient, otherwise the next backward() accumulates onto it,
#   3. repeat.


tensor(1., grad_fn=<PowBackward0>)
tensor(-2.)
