In [10]:
import torch

## Part 1: Chain rule for Back Propagation

In [60]:
# Define input values and parameters with gradient tracking enabled
x0 = torch.tensor(1.0)
x1 = torch.tensor(2.0)
w0 = torch.tensor(0.5)
w1 = torch.tensor(-0.3)
w2 = torch.tensor(0.1)  # Bias term

In [61]:
# Compute the linear transformation: z = w0*x0 + w1*x1 + w2
z = w0 * x0 + w1 * x1 + w2
k = torch.exp(-z)
h = 1 + k
f = 1/h

In [62]:
# Print the output and gradients
print("Output f (sigmoid(z)):", f.item())
print ("Output of h:", h.item())
print ("Output of k:", k.item())
print ("Output of z:", z.item())

Output f (sigmoid(z)): 0.5
Output of h: 2.0
Output of k: 1.0
Output of z: -2.2351741790771484e-08


In [66]:
# Gradient of f wrt h (df/dw0) = Gradient of f wrt h * Gradient of h wrt k * Gradient of k wrt z * Gradient of z wrt w0
df_dh = -1/(h**2)
dh_dk = 1.0
dk_dz = -torch.exp(-z)
dz_dw0 = x0
df_dw0 = df_dh * dh_dk * dk_dz * dz_dw0
print ("Gradient df/dh:", df_dh.item())
print("Gradient df/dw0:", df_dw0.item())
print ("Gradient df/dk:", dh_dk)
print ("Gradient df/dz:", dk_dz.item())

# Gradient of f wrt w1 (df/dw1) = Gradient of f wrt h * Gradient of h wrt k * Gradient of k wrt z * Gradient of z wrt w1
dz_dw1 = x1
df_dw1 = df_dh * dh_dk * dk_dz * dz_dw1
print("Gradient df/dw1:", df_dw1.item())

# Gradient of f wrt w2 (df/dw2) = Gradient of f wrt h * Gradient of h wrt k * Gradient of k wrt z * Gradient of z wrt w2
dz_dw2 = 1.0
df_dw2 = df_dh * dh_dk * dk_dz * dz_dw2
print("Gradient df/dw2:", df_dw2.item())

# Gradient of f wrt x0 (df/dx0) = Gradient of f wrt h * Gradient of h wrt k * Gradient of k wrt z * Gradient of z wrt x0
df_dx0 = df_dh * dh_dk * dk_dz * w0
print("Gradient df/dx0:", df_dx0.item())

# Gradient of f wrt x1 (df/dx1) = Gradient of f wrt h * Gradient of h wrt k * Gradient of k wrt z * Gradient of z wrt x1
df_dx1 = df_dh * dh_dk * dk_dz * w1
print("Gradient df/dx1:", df_dx1.item())

Gradient df/dh: -0.25
Gradient df/dw0: 0.25
Gradient df/dk: 1.0
Gradient df/dz: -1.0
Gradient df/dw1: 0.5
Gradient df/dw2: 0.25
Gradient df/dx0: 0.125
Gradient df/dx1: -0.07500000298023224


## Part 2: Back propagation using PyTorch

In [72]:
# Define input values and parameters with gradient tracking enabled
x0 = torch.tensor(1.0, requires_grad=True)
x1 = torch.tensor(2.0, requires_grad=True)
w0 = torch.tensor(0.5, requires_grad=True)
w1 = torch.tensor(-0.3, requires_grad=True)
w2 = torch.tensor(0.1, requires_grad=True)  # Bias term

In [73]:
# Compute the linear transformation: z = w0*x0 + w1*x1 + w2
z = w0 * x0 + w1 * x1 + w2
# Retain the gradient on z for inspection (z is an intermediate tensor)
z.retain_grad()

In [74]:
# Apply the Sigmoid activation function: f = sigmoid(z)
k = torch.exp(-z)
k.retain_grad()

h = 1 + k
h.retain_grad()

f = 1/h

In [75]:
# Perform backpropagation to compute gradients
f.backward()

In [79]:
# Print the output and gradients
print("Output f (sigmoid(z)):", f.item())
print ("Output of h:", h.item())
print ("Gradient df/dh:", h.grad.item())
print ("Output of k:", k.item())
print ("Gradient df/dk:", k.grad.item())
print ("Output of z:", z.item())
print("Gradient df/dz (z.grad):", z.grad.item())
print("Gradient df/dw0:", w0.grad.item())
print("Gradient df/dw1:", w1.grad.item())
print("Gradient df/dw2:", w2.grad.item())
print("Gradient df/dx0:", x0.grad.item())
print("Gradient df/dx1:", x1.grad.item())

Output f (sigmoid(z)): 0.5
Output of h: 2.0
Gradient df/dh: -0.25
Output of k: 1.0
Gradient df/dk: -0.25
Output of z: -2.2351741790771484e-08
Gradient df/dz (z.grad): 0.25
Gradient df/dw0: 0.25
Gradient df/dw1: 0.5
Gradient df/dw2: 0.25
Gradient df/dx0: 0.125
Gradient df/dx1: -0.07500000298023224


In [80]:
# Optionally, verify the derivative of the Sigmoid function manually
# The derivative of sigmoid is: sigmoid(z) * (1 - sigmoid(z))
expected_dfdz = f.item() * (1 - f.item())
print("Expected df/dz (from sigmoid derivative):", expected_dfdz)

Expected df/dz (from sigmoid derivative): 0.25
