In [51]:
import torch

## 1. Theory Behind Multivariate Calculus Backprop

<div style="text-align:center;">
<span style="font-size:30px;">Chain Rule</span>
</div>
 
- This is required to do the back propagation. 

- <code>Derivatives = Partial Derivatives (Jacobian Matrix) X Gradient Vector</code>

<img src="images/image1.png" alt="image" width="500"> 
   
      

## 2. requires_grad, .backward() and .grad

- The autograd package provides automatic differentiation for all operations on Tensors. Under the hood it computes the vector-Jacobian product.

- requires_grad = True -----> tracks all operations on the tensor.   

- .backward() -----> get all the gradients computed automatically at the end.

- .grad -----> the partial derivatives w.r.t. the tensor are accumulated into this attribute.

In [52]:
# Leaf tensor: requires_grad=True asks autograd to record every operation on it.
x = torch.randn((3,), requires_grad=True)
print(x)

# Each derived tensor carries a grad_fn naming the operation that produced it.
y = x.add(2)
print(y)

a = y.mul(y).mul(3)
print(a)

# Reduce to a scalar so that backward() can be called without arguments.
b = torch.mean(a)
print(b)

# Backpropagate from b; the resulting db/dx lands in x.grad.
b.backward()
print(x.grad)

tensor([-0.1959,  0.0264,  0.3840], requires_grad=True)
tensor([1.8041, 2.0264, 2.3840], grad_fn=<AddBackward0>)
tensor([ 9.7643, 12.3191, 17.0509], grad_fn=<MulBackward0>)
tensor(13.0448, grad_fn=<MeanBackward0>)
tensor([3.6082, 4.0528, 4.7681])


- If requires_grad is not set to True, we get the following error.  

- RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# Same computation as the tracked example, but x is created WITHOUT
# requires_grad=True, so autograd records nothing and no result has a grad_fn.
x = torch.randn(3)
print(x)

y = x + 2
print(y)

a = y * y * 3
print(a)

b = a.mean()
print(b)

# backward() fails because b is not attached to any computation graph.
# The error is the point of this cell, so it is caught and displayed instead
# of aborting a "Restart Kernel -> Run All".
backward_error = None
try:
    b.backward()
except RuntimeError as err:
    backward_error = err
    print(f"RuntimeError: {err}")

# No gradient was ever computed for x, so this prints None.
print(x.grad)

- grad_fn: references a Function that has created the Tensor

## 3. Backprop for Non Scalar Tensor Outputs

- If we have more than one output in the final tensor we will get the following error.

- RuntimeError: grad can be implicitly created only for scalar outputs

In [None]:
x = torch.randn(3, requires_grad=True)
print(x)

y = x + 2
print(y)

# a has three elements, i.e. it is NOT a scalar.
a = y * y * 3
print(a)

# Calling backward() with no argument on a non-scalar tensor raises a
# RuntimeError. The error is the point of this cell, so it is caught and
# displayed instead of aborting a "Restart Kernel -> Run All".
backward_error = None
try:
    a.backward()
except RuntimeError as err:
    backward_error = err
    print(f"RuntimeError: {err}")

# The backward pass never ran, so this prints None.
print(x.grad)

- Reason - For a non-scalar output, the <code>derivative = Jacobian x Vector</code>, so a vector with the same shape as the output tensor must be passed to <code>.backward()</code> as an argument.

In [None]:
x = torch.randn((3,), requires_grad=True)
print(x)

# Double x eleven times in total (once here, ten more times in the loop),
# which leaves a equal to 2048 * x element-wise.
a = x.mul(2)
for _ in range(10):
    a = a.mul(2)

print(a)
print(a.shape)

# a is not a scalar, so backward() needs an externally supplied vector with
# the same shape as a; it is used for the vector-Jacobian product.
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)

a.backward(gradient=v)   # v is passed as the `gradient` argument
print(x.grad)

## 4. Stopping a Tensor From Tracking History

### 4.1. .requires_grad_()  
Changes an existing flag in-place.

In [None]:
# A freshly created tensor is untracked by default:
# .requires_grad reports whether it is tracked, .grad_fn which backward
# function produced it.
a = torch.randn((2, 2))
print(a.requires_grad, a.grad_fn, sep="\n")

# b is computed from an untracked tensor, so no backward function is recorded.
b = torch.div(a * 3, a - 1)
print(a.requires_grad, b.grad_fn, sep="\n")

# The trailing underscore means "in place": tracking is switched on for the
# existing tensor. a.grad_fn is still None because a was created directly
# rather than produced by an operation.
a.requires_grad_(True)
print(a.requires_grad, a.grad_fn, sep="\n")

In [None]:
x = torch.randn((3,), requires_grad=True)
print(x)

# Methods ending in an underscore modify the tensor in place:
# here tracking is switched off on the existing x.
x.requires_grad_(requires_grad=False)
print(x)

### 4.2. .detach()  
Get a new Tensor with the same content but no gradient computation:

In [None]:
# x is tracked by autograd.
x = torch.randn((3,), requires_grad=True)
print(x)

# y holds the same values as x but takes no part in gradient computation.
y = x.detach()
print(y)

### 4.3. torch.no_grad()

In [None]:
x = torch.randn((3,), requires_grad=True)
print(x)

# Operations executed inside the no_grad() context are not recorded,
# so y has no grad_fn.
with torch.no_grad():
    y = x.add(2)
print(y)

# The same operation outside the context is tracked as usual.
z = x.add(2)
print(z)

tensor([-0.0011,  0.1541,  0.1616], requires_grad=True)
tensor([1.9989, 2.1541, 2.1616])
tensor([1.9989, 2.1541, 2.1616], grad_fn=<AddBackward0>)


## 5. Avoiding The Accumulation Error of Gradients  

### 5.1. .grad.zero_() 

- backward() accumulates the gradient for this tensor into the .grad attribute. Across iterations this might accumulate gradients unnecessarily.

- Use .zero_() to empty the gradients before a new optimization step!

In [None]:
weights = torch.ones((4,), requires_grad=True)
print(weights)

# Every backward() call ADDS the new gradient to weights.grad, so the printed
# values grow from one iteration to the next.
for epoch in range(8):
    model_output = torch.sum(weights.mul(3))
    model_output.backward()
    print(weights.grad)

tensor([1., 1., 1., 1.], requires_grad=True)
tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])
tensor([12., 12., 12., 12.])
tensor([15., 15., 15., 15.])
tensor([18., 18., 18., 18.])
tensor([21., 21., 21., 21.])
tensor([24., 24., 24., 24.])


In [None]:
weights = torch.ones((4,), requires_grad=True)
print(weights)

for epoch in range(2):
    model_output = torch.sum(weights.mul(3))
    model_output.backward()
    print(weights.grad)
    # Reset the accumulated gradient in place so the next iteration starts
    # from zero.
    weights.grad.zero_()

tensor([1., 1., 1., 1.], requires_grad=True)
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


### 5.2. by optimizer.zero_grad()

In [None]:
weights = torch.ones(4, requires_grad=True)
print(weights)

# Optimizers take an ITERABLE of parameters (or parameter-group dicts);
# passing the bare tensor raises a TypeError, so wrap it in a list.
optimizer = torch.optim.SGD([weights], lr=0.01)

# During training
model_output = (weights * 3).sum()   # forward pass
model_output.backward()              # populate weights.grad
optimizer.step()                     # update weights using weights.grad
optimizer.zero_grad()                # clear gradients before the next iteration