Imports and Activation Functions

In [11]:
import numpy as np

# Define the sigmoid function and its derivative
def sigmoid(x):
    """Compute the logistic sigmoid 1 / (1 + exp(-x)) without overflow.

    Args:
        x: Real scalar or array-like.

    Returns:
        The element-wise sigmoid of ``x``: a NumPy scalar for scalar input,
        otherwise an ndarray with the same shape as ``x``.
    """
    x = np.asarray(x)
    if x.dtype.kind in "biu":
        # Booleans and integers are promoted so negation and exp work on floats.
        x = x.astype(float)
    # exp(-|x|) always lies in (0, 1], so it cannot overflow the way exp(-x)
    # does for very negative x.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same value.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays as they are.
    return result[()]

def sigmoid_derivative(x):
    """Return the derivative of the sigmoid at x, i.e. s * (1 - s) with s = sigmoid(x)."""
    s = sigmoid(x)
    return s * (1 - s)

# Define the ReLU function and its derivative
def relu(x):
    """Rectified linear unit: the element-wise maximum of 0 and x."""
    lower_bound = 0
    return np.maximum(lower_bound, x)

def relu_derivative(x):
    """Derivative of ReLU: 1 where x > 0, otherwise 0 (so 0 at x == 0 as well)."""
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

# Print a confirmation that lists the helper functions defined in this cell.
print("✅ Functions defined: sigmoid, sigmoid_derivative, relu, relu_derivative")

✅ Functions defined: sigmoid, sigmoid_derivative, relu, relu_derivative


Initialize Network Parameters

In [12]:
# Every layer starts from the same weight and a zero bias; both networks
# (sigmoid and ReLU) share these values.
w1 = w2 = w3 = 0.5
b1 = b2 = b3 = 0

# Single training example: the input and its target output
x, y = 1, 1

print(f"Weights: w1={w1}, w2={w2}, w3={w3}")
print(f"Input: x={x}, Target: y={y}")

Weights: w1=0.5, w2=0.5, w3=0.5
Input: x=1, Target: y=1


Sigmoid Network Forward Pass

In [13]:
print("=== Sigmoid Network ===")

# Forward pass (Sigmoid): feed the input through the three layers in order,
# keeping every pre-activation (z) and activation (a) along the way.
pre_activations_sigmoid = []
activations_sigmoid = []
signal_sigmoid = x
for weight, bias in ((w1, b1), (w2, b2), (w3, b3)):
    z = weight * signal_sigmoid + bias
    signal_sigmoid = sigmoid(z)
    pre_activations_sigmoid.append(z)
    activations_sigmoid.append(signal_sigmoid)

z1_sigmoid, z2_sigmoid, z3_sigmoid = pre_activations_sigmoid
a1_sigmoid, a2_sigmoid, y_pred_sigmoid = activations_sigmoid

# Loss: half the squared error between target and prediction
error_sigmoid = y - y_pred_sigmoid
loss_sigmoid = 0.5 * error_sigmoid ** 2

print("\nForward Pass:")
print(f"Layer 1 Output (a1): {a1_sigmoid:.4f}")
print(f"Layer 2 Output (a2): {a2_sigmoid:.4f}")
print(f"Predicted Output (y_pred): {y_pred_sigmoid:.4f}")
print(f"Loss: {loss_sigmoid:.4f}")

=== Sigmoid Network ===

Forward Pass:
Layer 1 Output (a1): 0.6225
Layer 2 Output (a2): 0.5772
Predicted Output (y_pred): 0.5717
Loss: 0.0917


Sigmoid Network Backward Pass

In [14]:
# Backward pass (Sigmoid): apply the chain rule from the loss back to w1.
# Each delta is dL/dz for one layer; it is shared by that layer's weight
# gradient and by the gradient handed to the layer below.
dL_dy_pred_sigmoid = -(y - y_pred_sigmoid)

# Output layer
dy_pred_dz3_sigmoid = sigmoid_derivative(z3_sigmoid)
delta3_sigmoid = dL_dy_pred_sigmoid * dy_pred_dz3_sigmoid
dL_dw3_sigmoid = delta3_sigmoid * a2_sigmoid

# Layer 2
dL_da2_sigmoid = delta3_sigmoid * w3
da2_dz2_sigmoid = sigmoid_derivative(z2_sigmoid)
delta2_sigmoid = dL_da2_sigmoid * da2_dz2_sigmoid
dL_dw2_sigmoid = delta2_sigmoid * a1_sigmoid

# Layer 1
dL_da1_sigmoid = delta2_sigmoid * w2
da1_dz1_sigmoid = sigmoid_derivative(z1_sigmoid)
delta1_sigmoid = dL_da1_sigmoid * da1_dz1_sigmoid
dL_dw1_sigmoid = delta1_sigmoid * x

print("\nBackward Pass (Gradients):")
print(f"Gradient w.r.t. w3: {dL_dw3_sigmoid:.6f}")
print(f"Gradient w.r.t. w2: {dL_dw2_sigmoid:.6f}")
print(f"Gradient w.r.t. w1: {dL_dw1_sigmoid:.6f}")


Backward Pass (Gradients):
Gradient w.r.t. w3: -0.060540
Gradient w.r.t. w2: -0.007967
Gradient w.r.t. w1: -0.001504


ReLU Network Forward Pass

In [15]:
print("\n=== ReLU Network ===")

# Forward pass (ReLU): feed the input through the three layers in order,
# keeping every pre-activation (z) and activation (a) along the way.
pre_activations_relu = []
activations_relu = []
signal_relu = x
for weight, bias in ((w1, b1), (w2, b2), (w3, b3)):
    z = weight * signal_relu + bias
    signal_relu = relu(z)
    pre_activations_relu.append(z)
    activations_relu.append(signal_relu)

z1_relu, z2_relu, z3_relu = pre_activations_relu
a1_relu, a2_relu, y_pred_relu = activations_relu

# Loss: half the squared error between target and prediction
error_relu = y - y_pred_relu
loss_relu = 0.5 * error_relu ** 2

print("\nForward Pass:")
print(f"Layer 1 Output (a1): {a1_relu:.4f}")
print(f"Layer 2 Output (a2): {a2_relu:.4f}")
print(f"Predicted Output (y_pred): {y_pred_relu:.4f}")
print(f"Loss: {loss_relu:.4f}")


=== ReLU Network ===

Forward Pass:
Layer 1 Output (a1): 0.5000
Layer 2 Output (a2): 0.2500
Predicted Output (y_pred): 0.1250
Loss: 0.3828


ReLU Network Backward Pass

In [16]:
# Backward pass (ReLU): apply the chain rule from the loss back to w1.
# Each delta is dL/dz for one layer; it is shared by that layer's weight
# gradient and by the gradient handed to the layer below.
dL_dy_pred_relu = -(y - y_pred_relu)

# Output layer
dy_pred_dz3_relu = relu_derivative(z3_relu)
delta3_relu = dL_dy_pred_relu * dy_pred_dz3_relu
dL_dw3_relu = delta3_relu * a2_relu

# Layer 2
dL_da2_relu = delta3_relu * w3
da2_dz2_relu = relu_derivative(z2_relu)
delta2_relu = dL_da2_relu * da2_dz2_relu
dL_dw2_relu = delta2_relu * a1_relu

# Layer 1
dL_da1_relu = delta2_relu * w2
da1_dz1_relu = relu_derivative(z1_relu)
delta1_relu = dL_da1_relu * da1_dz1_relu
dL_dw1_relu = delta1_relu * x

print("\nBackward Pass (Gradients):")
print(f"Gradient w.r.t. w3: {dL_dw3_relu:.6f}")
print(f"Gradient w.r.t. w2: {dL_dw2_relu:.6f}")
print(f"Gradient w.r.t. w1: {dL_dw1_relu:.6f}")


Backward Pass (Gradients):
Gradient w.r.t. w3: -0.218750
Gradient w.r.t. w2: -0.218750
Gradient w.r.t. w1: -0.218750


Comparison of Gradients

In [17]:
# Side-by-side summary of the weight gradients from both backward passes.
print("\n=== Gradient Comparison ===")
print("Sigmoid Gradients:")
# The "vanishing" note belongs on w1: it is the layer farthest from the loss,
# so its gradient has been multiplied by the most sigmoid derivatives. w3,
# next to the output, carries the largest sigmoid gradient of the three.
print(f"  w3: {dL_dw3_sigmoid:.6f}")
print(f"  w2: {dL_dw2_sigmoid:.6f}")
print(f"  w1: {dL_dw1_sigmoid:.6f} (Vanishes to near-zero!)")

print("\nReLU Gradients:")
print(f"  w3: {dL_dw3_relu:.6f} (Stable for active neurons)")
print(f"  w2: {dL_dw2_relu:.6f}")
print(f"  w1: {dL_dw1_relu:.6f}")


=== Gradient Comparison ===
Sigmoid Gradients:
  w3: -0.060540 (Vanishes to near-zero!)
  w2: -0.007967
  w1: -0.001504

ReLU Gradients:
  w3: -0.218750 (Stable for active neurons)
  w2: -0.218750
  w1: -0.218750
