In [None]:
import torch

# ---------------------------------------------------------------------------
# Part 1: gradient of a scalar function of a single variable.
# ---------------------------------------------------------------------------

# 1. Create a 0-dim tensor 'x' with the value 2.0 and enable gradient tracking.
x = torch.tensor(2.0, requires_grad=True)

# 2. Define a simple quadratic function: y = x^2 + 2*x + 1
y = x**2 + 2*x + 1

# 3. Expected result from calculus: dy/dx = 2x + 2, so at x = 2 the gradient
#    should be 2*(2) + 2 = 6.

# 4. Backpropagate from 'y'. No explicit 'gradient' argument is required
#    because 'y' holds a single element.
y.backward()

# 5. The computed gradient is stored on the leaf tensor's .grad attribute.
print("Gradient of x:", x.grad)

# ---------------------------------------------------------------------------
# Part 2: gradients of a loss with respect to the parameters of a linear model.
# ---------------------------------------------------------------------------

# 6. Create a tensor 'w' (shape (1,)) with a random value and enable gradient tracking.
w = torch.randn(1, requires_grad=True)

# 7. Create a tensor 'b' with the value -1.0 and enable gradient tracking.
b = torch.tensor(-1.0, requires_grad=True)

# 8. Define a simple linear model output: prediction = w * x + b
#    Here 'x' is only input data, so it is detached from the autograd graph.
#    Without the detach, loss.backward() would also accumulate d(loss)/dx into
#    x.grad and silently change the value computed in Part 1.
prediction = w * x.detach() + b

# 9. Define a target value: target = 5.0
target = torch.tensor(5.0)

# 10. Calculate the mean squared error (MSE) loss: loss = mean((prediction - target)^2)
#     'prediction' has shape (1,) because 'w' does; .mean() reduces the squared
#     error to a true 0-dim scalar. With one element the value is unchanged.
loss = ((prediction - target)**2).mean()

# 11. Compute the gradients of the loss with respect to 'w' and 'b'.
#     backward() without arguments needs a single-element output; 'loss' is a
#     0-dim scalar thanks to the reduction above.
loss.backward()

# 12. Access and print the gradients of 'w' and 'b'.
#     Analytically: d(loss)/dw = 2 * (prediction - target) * x and
#     d(loss)/db = 2 * (prediction - target).
print("Gradient of w:", w.grad)
print("Gradient of b:", b.grad)

# 13. Briefly explain in the comments what the .backward() method does and why it's crucial for training neural networks.
# Your explanation here:
# The .backward() method in PyTorch computes the gradient of the tensor it is called on with respect to every leaf tensor that has requires_grad=True and took part in computing it, and accumulates each result into that leaf's .grad attribute.
# It traverses the computational graph backwards from the tensor on which it's called, applying the chain rule of calculus to compute these gradients.
# This is crucial for training neural networks because it allows us to determine how much each parameter (weight and bias) contributes to the loss.
# These gradients are then used by optimization algorithms (like gradient descent) to update the parameters in a direction that reduces the loss, thus improving the network's performance over iterations.

Gradient of x: tensor(6.)
Gradient of w: tensor([1.])
Gradient of b: tensor(1.)
