# Backpropagation with PyTorch

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np

# Seed both random number generators used in this notebook so that a fresh
# "Restart Kernel -> Run All" reproduces the same random tensors.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Record the library version next to the results.
print(f"PyTorch Version: {torch.__version__}")

PyTorch Version: 2.6.0+cu124


## Backpropagation and Autograd
> PyTorch's `autograd` engine powers automatic differentiation. When you perform operations on tensors created with `requires_grad=True`, PyTorch builds a **Dynamic Computation Graph (DCG)** on the fly.

`torch.Tensor` is the fundamental data structure. Two of its attributes matter for autograd:
- `.grad`: stores the gradient accumulated by `autograd` after `.backward()` is called *(it is `None` by default)*.
- `.grad_fn`: refers to the function that created the tensor *(it is `None` for leaf tensors created directly by the user)*.

### Using Scalar Tensors
1. Create `x` with `requires_grad=True`.
2. Derive `y = x ** 2` and `z = 3 * y + 5` from `x`.
3. Call `z.backward()` to compute $dz/dx = 6x$, which is $12$ at $x = 2$.

In [5]:
# Leaf tensor: created directly by the user, so autograd tracks it but it
# has no grad_fn of its own.
x = torch.tensor(2.0, requires_grad=True)

# Build the graph z = 3 * x^2 + 5 in two steps; every intermediate result
# remembers the operation that produced it in .grad_fn.
y = x.pow(2)
z = y * 3 + 5

print(f"x: {x}, x.requires_grad: {x.requires_grad}, x.grad_fn: {x.grad_fn}")
print(f"y: {y}, y.requires_grad: {y.requires_grad}, y.grad_fn: {y.grad_fn}")
print(f"z: {z}, z.requires_grad: {z.requires_grad}, z.grad_fn: {z.grad_fn}")

# Backpropagate from the scalar z: dz/dx = 6 * x, i.e. 12 at x = 2.
z.backward()
print(f"Gradient of z with respect to x (dz/dx): {x.grad}")

x: 2.0, x.requires_grad: True, x.grad_fn: None
y: 4.0, y.requires_grad: True, y.grad_fn: <PowBackward0 object at 0x7cb133cebd30>
z: 17.0, z.requires_grad: True, z.grad_fn: <AddBackward0 object at 0x7cb133cebd30>
Gradient of z with respect to x (dz/dx): 12.0


### Multiple Inputs

In [8]:
# Two independent leaf tensors that both feed into the same output.
a = torch.tensor(3.0, requires_grad=True)
b = torch.tensor(4.0, requires_grad=True)

# c = 2a + b^2, so dc/da = 2 and dc/db = 2b.
linear_term = 2 * a
quadratic_term = b.pow(2)
c = linear_term + quadratic_term
print(f"c: {c}, c.grad_fn: {c.grad_fn}")

# A single backward pass fills in the gradient of every leaf that contributed to c.
c.backward()
print(f"Gradient of c w.r.t. a: {a.grad}")
print(f"Gradient of c w.r.t. b: {b.grad}")

c: 22.0, c.grad_fn: <AddBackward0 object at 0x7cb133c4ad70>
Gradient of c w.r.t. a: 2.0
Gradient of c w.r.t. b: 8.0


In [13]:
# Reset the stored gradients in place. A tensor that has never been through
# a backward pass still has .grad == None, so it is skipped.
for leaf in (a, b):
    if leaf.grad is not None:
        leaf.grad.zero_()
print(f"{a.grad}")

0.0


### Using Vector/Matrix Tensors

In [19]:
# A 1-D leaf tensor; the operations below are applied element-wise.
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
print(f"x: {x}")

# Element-wise square: y keeps the shape of x.
y = x.pow(2)
print(f"y: {y}, y.grad_fn: {y.grad_fn}")

# Reduce the vector to a scalar.
z = torch.sum(y)
print(f"z: {z}, z.grad_fn: {z.grad_fn}")

x: tensor([1., 2., 3.], requires_grad=True)
y: tensor([1., 4., 9.], grad_fn=<PowBackward0>), y.grad_fn: <PowBackward0 object at 0x7cb133a92710>
z: 14.0, z.grad_fn: <SumBackward0 object at 0x7cb133a92710>


In [25]:
# Random leaf vector; its values depend on the seed set at the top of the notebook.
x = torch.randn(3, requires_grad=True)
print(f"x: {x}")

# y is a vector (not a scalar), so y.backward() cannot be called without
# telling autograd how to weight each output element.
y = x * 2
print(f"y : {y}")

# Upstream gradient v, one weight per element of y. backward(gradient=v)
# computes the vector-Jacobian product v^T * (dy/dx). Here dy/dx = 2 * I,
# so x.grad ends up equal to 2 * v.
prefill_grad = torch.tensor([0.1, 1.0, 0.01])

y.backward(gradient=prefill_grad)
print(x.grad)

x: tensor([0.9580, 1.3221, 0.8172], requires_grad=True)
y : tensor([1.9159, 2.6443, 1.6344], grad_fn=<MulBackward0>)
tensor([0.2000, 2.0000, 0.0200])


## Backpropagation with NN

In [30]:
# Toy regression data: 3 samples with 2 features each; the target is y = x1 + x2.
X_numpy = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32)  # 3 samples, 2 features
Y_numpy = np.array([[3.0], [7.0], [11.0]], dtype=np.float32)                # 3 samples, 1 output target
X = torch.from_numpy(X_numpy)
Y_true = torch.from_numpy(Y_numpy)

# Parameters (weights and bias) for a linear layer: output = X @ W + b
# Input features: 2, Output features: 1
W = torch.randn(2, 1, requires_grad=True)  # Shape: (num_input_features, num_output_features)
b = torch.randn(1, 1, requires_grad=True)  # Shape: (1, num_output_features) - broadcasts over samples

print(f"Initial W:\n{W}")
print(f"Initial b:\n{b}")

# --- Forward Pass ---
# `@` is matrix multiplication; X.mm(W) + b is equivalent.
Y_pred = X @ W + b
print(f"Predictions Y_pred:\n{Y_pred}")

# --- Loss Calculation (Mean Squared Error) ---
# loss = (1/N) * sum((Y_pred - Y_true)^2)
loss = torch.mean((Y_pred - Y_true) ** 2)
print(f"Initial Loss: {loss.item()}")  # .item() gets the Python scalar from a single-element tensor

# --- Backward Pass ---
# Computes d(loss)/dW and d(loss)/db and stores them in W.grad and b.grad.
loss.backward()
print(f"Gradient d(loss)/dW:\n{W.grad}")
print(f"Gradient d(loss)/db:\n{b.grad}")

# --- Optimizer Step (Conceptual) ---
# In a real training loop an optimizer applies these gradients.
# Plain SGD: W <- W - learning_rate * W.grad
learning_rate = 0.01
# Writing `W = W - learning_rate * W.grad` would rebind W to a new non-leaf
# tensor produced by a subtraction, so later backward passes would no longer
# populate W.grad. Instead, update the existing leaf tensors in place with
# gradient tracking switched off, so the update itself is NOT recorded in
# the computation graph.
with torch.no_grad():
    W -= learning_rate * W.grad
    b -= learning_rate * b.grad

# --- Zero Gradients ---
# backward() accumulates into .grad, so clear it before the next iteration.
W.grad.zero_()
b.grad.zero_()
print(f"W after one update:\n{W}")
print(f"b after one update:\n{b}")

# One more forward pass to confirm that the loss decreased.
Y_pred_new = X @ W + b
loss_new = torch.mean((Y_pred_new - Y_true) ** 2)
print(f"Loss after one update: {loss_new.item()}")  # Should be less than the initial loss


Initial W:\ntensor([[-1.9245],
        [ 0.4336]], requires_grad=True)
Initial b:\ntensor([[0.6641]], requires_grad=True)
Predictions Y_pred:\ntensor([[-0.3933],
        [-3.3753],
        [-6.3572]], grad_fn=<AddBackward0>)
Initial Loss: 140.14463806152344
Gradient d(loss)/dW:\ntensor([[ -80.8702],
        [-101.6207]])
Gradient d(loss)/db:\ntensor([[-20.7505]])
W after one update:\ntensor([[-1.1158],
        [ 1.4498]], requires_grad=True)
b after one update:\ntensor([[0.8716]], requires_grad=True)
Loss after one update: 20.921161651611328


In [31]:
# Rebuild the input and target tensors from the NumPy arrays defined earlier,
# so this snippet starts from the same data.
X, Y_true = (torch.from_numpy(array) for array in (X_numpy, Y_numpy))
# Define a simple model using nn.Module
class SimpleLinearModel(nn.Module):
    """Model made of a single fully connected layer.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of output features per sample.
    """

    def __init__(self, input_dim: int, output_dim: int) -> None:
        super().__init__()
        # nn.Linear creates the weight (W) and bias (b) parameters and
        # registers them with the module.
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

# Instantiate the model
input_dim = 2
output_dim = 1
model = SimpleLinearModel(input_dim, output_dim)
print(f"Model structure:\n{model}")
print("\nModel parameters:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Define a loss function
criterion = nn.MSELoss()  # Mean Squared Error loss

# Define an optimizer
# It manages (and updates) the parameters of 'model'
optimizer = optim.SGD(model.parameters(), lr=0.01)

# --- Training Loop (Simplified single step) ---

# 1. Forward pass
Y_pred = model(X)

# 2. Compute loss
loss = criterion(Y_pred, Y_true)
print(f"\nInitial Loss: {loss.item()}")

# 3. Zero gradients (IMPORTANT: before the backward pass, because backward()
#    accumulates into .grad instead of overwriting it)
optimizer.zero_grad()
# Alternatively, for specific tensors: model.linear.weight.grad.zero_() if it exists

# 4. Backward pass (compute gradients)
loss.backward()

# Gradients are now in model.linear.weight.grad and model.linear.bias.grad
print(f"Gradient d(loss)/d(model.linear.weight):\n{model.linear.weight.grad}")
print(f"Gradient d(loss)/d(model.linear.bias):\n{model.linear.bias.grad}")

# 5. Optimizer step (update parameters using the stored gradients)
optimizer.step()

# Check updated parameters
print("\nParameters after one update:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# One more forward pass to confirm that the loss decreased
Y_pred_new = model(X)
loss_new = criterion(Y_pred_new, Y_true)
print(f"Loss after one update: {loss_new.item()}")


Model structure:\n SimpleLinearModel(
  (linear): Linear(in_features=2, out_features=1, bias=True)
)
\nModel parameters:
linear.weight tensor([[-0.2435,  0.2167]])
linear.bias tensor([-0.1473])
\nInitial Loss: 60.11170959472656
Gradient d(loss)/d(model.linear.weight):\ntensor([[-52.8770, -66.8994]])
Gradient d(loss)/d(model.linear.bias):\ntensor([-14.0224])
\nParameters after one update:
linear.weight tensor([[0.2853, 0.8857]])
linear.bias tensor([-0.0071])
Loss after one update: 8.638227462768555
