In [7]:

import numpy as np

# Change: X, y
# X: one row per sample. The first column is all ones (presumably the bias
# input, matching the ones-column that forward_pass prepends to hidden outputs).
X = np.array([
    [1, 0.5, 1.2, -0.3],
    [1, 1, 0.8, 1.5],
    [1, 2.3, -0.7, 0.5],
    [1, 0, 1.5, -1],
])
# y: one target per sample, stored as a column vector of shape (4, 1).
y = np.array([[1], [2], [3], [1]])

# Change: initialized weights for each layer
# W1: weights from input to first hidden layer
# W2: weights from first hidden to output layer
# Add W3, W4, etc. for deeper networks
#
# NOTE(review): X has 4 columns but W1 has only 3 rows, so `X @ W1` in
# forward_pass raises the matmul ValueError ("size 3 is different from 4").
# W1 needs one row per column of X. TODO: supply the intended 4-row W1
# (and a W2 whose row count is 1 + the number of columns of W1).
W1 = np.array([
    [0, 0, 0],
    [0.01, -0.02, 0.03],
    [0.05, 0.04, -0.01],
])
# W2 is bound to the very same array object as W1 (an alias, not a copy).
W2 = W1

In [8]:
def ReLU(z):
    """Rectified linear unit: element-wise maximum of ``z`` and zero."""
    floor = 0
    return np.maximum(floor, z)

def ReLU_derivative(o):
    """Derivative of ReLU evaluated from the activation output ``o``.

    Returns a float array holding 1.0 where ``o`` is strictly positive and
    0.0 everywhere else.
    """
    is_active = o > 0
    return is_active.astype(float)
  
def squared_error_loss_derivative(y, y_hat):
    """Derivative of the squared error ``(y_hat - y)**2`` with respect to ``y_hat``."""
    residual = y_hat - y
    return 2 * residual

def softmax(Z, W, X, Y=None, eps=1e-15):
    """Softmax layer: class probabilities, cross-entropy cost and weight gradient.

    The targets ``Y`` and the clipping constant ``eps`` used to be read from
    names that are not defined anywhere in this notebook, so the function
    raised ``NameError`` on a fresh kernel. They are now explicit parameters.

    Parameters
    ----------
    Z : any
        Ignored. The logits are always recomputed as ``X @ W``; the parameter
        is kept only so existing positional calls ``softmax(Z, W, X)`` still
        bind ``W`` and ``X`` correctly.
    W : ndarray, shape (n_inputs, n_classes)
        Weight matrix of the layer.
    X : ndarray, shape (n_samples, n_inputs)
        Layer inputs, one row per sample.
    Y : ndarray, shape (n_samples, n_classes)
        Target matrix (e.g. one-hot rows). It has a default only to keep the
        three-positional-argument signature valid; it must be supplied.
    eps : float, default 1e-15
        Predictions are clipped to ``[eps, 1 - eps]`` so ``log`` never sees 0.

    Returns
    -------
    pred_Y : ndarray, shape (n_samples, n_classes)
        Clipped softmax probabilities.
    cost : float
        Mean cross-entropy ``sum(-Y * log(pred_Y)) / n_samples``.
    gradient : ndarray, shape (n_inputs, n_classes)
        ``X.T @ (pred_Y - Y) / n_samples``.

    Raises
    ------
    ValueError
        If ``Y`` is missing or its shape differs from that of ``X @ W``.
    """
    if Y is None:
        raise ValueError(
            "softmax() needs the target matrix Y (same shape as X @ W) "
            "to compute the cost and the gradient"
        )
    Y = np.asarray(Y, dtype=float)

    z = X @ W
    if Y.shape != z.shape:
        # Guard against silent broadcasting, e.g. an (N, 1) label column
        # against (N, K) probabilities.
        raise ValueError(
            f"Y has shape {Y.shape} but X @ W has shape {z.shape}; "
            "Y must hold one target row per sample (e.g. one-hot encoded)"
        )

    # Subtract the row-wise maximum before exponentiating to prevent overflow;
    # keepdims=True keeps the result 2-D so it broadcasts against z.
    z_max = np.max(z, axis=1, keepdims=True)
    exp_z = np.exp(z - z_max)
    pred_Y = exp_z / np.sum(exp_z, axis=1, keepdims=True)

    # Clip predictions to prevent log(0)
    pred_Y = np.clip(pred_Y, eps, 1 - eps)

    n_samples = X.shape[0]
    cost = np.sum(-(Y * np.log(pred_Y))) / n_samples
    gradient = X.T @ (pred_Y - Y) / n_samples

    return pred_Y, cost, gradient


def forward_pass(X, W1, W2, *more_weights):
    """Forward pass through a ReLU network with any number of hidden layers.

    Every weight matrix except the last feeds a ReLU hidden layer; the last
    one is the linear output layer. After each hidden layer a column of ones
    is prepended to its output so the next weight matrix's first row acts on
    a constant input.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_inputs)
        Network input, one row per sample.
    W1, W2 : ndarray
        Weights of the first hidden layer and of the following layer.
    *more_weights : ndarray
        Optional further weight matrices (W3, W4, ...) for deeper networks.
        With none given the network has exactly one hidden layer, as before.

    Returns
    -------
    tuple
        ``(y_hat, A2, O1)`` for one hidden layer,
        ``(y_hat, A2, O1, A3, O2)`` for two, and so on: the prediction first,
        then for each hidden layer ``l`` the pair ``A_{l+1}`` (its output with
        the ones column prepended) and ``O_l`` (its raw ReLU output).
    """
    weights = (W1, W2) + more_weights
    layer_input = X
    cache = []

    # All but the last weight matrix belong to hidden layers.
    for W in weights[:-1]:
        O = ReLU(layer_input @ W)
        layer_input = np.column_stack([np.ones((O.shape[0], 1)), O])
        cache.extend([layer_input, O])

    # Output layer is linear (no activation).
    y_hat = layer_input @ weights[-1]

    return (y_hat, *cache)

def backward_pass_output(y, y_hat, A_o, W_o, lr):
    """Backward pass for a softmax / cross-entropy output layer.

    The previous version unpacked the scalar *cost* returned by ``softmax`` as
    the error matrix, so ``A_o.T @ E_o`` could never work. The error signal of
    a softmax output with cross-entropy loss is ``softmax(y_hat) - y``; it is
    computed here directly from the logits.

    Parameters
    ----------
    y : ndarray, shape (n_samples, n_outputs)
        Targets with one row per sample (e.g. one-hot rows). Must have the
        same shape as ``y_hat``; integer class labels have to be one-hot
        encoded by the caller first.
    y_hat : ndarray, shape (n_samples, n_outputs)
        Output-layer logits, i.e. ``A_o @ W_o`` from the forward pass.
    A_o : ndarray, shape (n_samples, n_in)
        Input that was fed to the output layer.
    W_o : ndarray, shape (n_in, n_outputs)
        Current output-layer weights.
    lr : float
        Learning rate.

    Returns
    -------
    E_o : ndarray, shape (n_samples, n_outputs)
        Error at the output layer, ``softmax(y_hat) - y``.
    G_o : ndarray, shape (n_in, n_outputs)
        Gradient of the mean loss w.r.t. ``W_o``: ``A_o.T @ E_o / n_samples``.
    W_o_new : ndarray
        ``W_o`` after one gradient-descent step.

    Raises
    ------
    ValueError
        If ``y`` and ``y_hat`` do not have the same shape.
    """
    y = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)
    if y.shape != y_hat.shape:
        # Refuse to broadcast, e.g. an (N, 1) label column against (N, K) logits.
        raise ValueError(
            f"y has shape {y.shape} but y_hat has shape {y_hat.shape}; "
            "y must hold one target row per sample (e.g. one-hot encoded)"
        )

    N = y.shape[0]

    # Row-wise softmax; subtracting the row maximum prevents overflow in exp.
    shifted = y_hat - np.max(y_hat, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    E_o = probs - y
    G_o = (A_o.T @ E_o) / N
    W_o_new = W_o - lr * G_o
    return E_o, G_o, W_o_new

def backward_pass_hidden(E_ladd1, W_ladd1, A_l, O_l, W_l, lr):
    """Backward pass for one ReLU hidden layer.

    Parameters
    ----------
    E_ladd1 : ndarray
        Error of the next layer (the one closer to the output).
    W_ladd1 : ndarray
        Weights of that next layer; its first row is skipped because it
        multiplies the prepended ones column, not an output of this layer.
    A_l : ndarray
        Input that was fed to this layer.
    O_l : ndarray
        ReLU output of this layer.
    W_l : ndarray
        Current weights of this layer.
    lr : float
        Learning rate.

    Returns
    -------
    tuple
        ``(E_l, G_l, W_l_new)``: this layer's error, the gradient averaged
        over the rows of ``A_l``, and the weights after one descent step.
    """
    n_samples = A_l.shape[0]
    next_weights_no_bias = W_ladd1[1:]
    # Push the next layer's error back through its weights, then gate it by
    # the ReLU derivative of this layer's output.
    backpropagated = E_ladd1 @ next_weights_no_bias.T
    layer_error = backpropagated * ReLU_derivative(O_l)
    layer_grad = (A_l.T @ layer_error) / n_samples
    return layer_error, layer_grad, W_l - lr * layer_grad


In [10]:
# USAGE:

# lr=0.1

# y_hat, A2, O1, A3, O2 = forward_pass(X, W1, W2, W3)
# y_hat, A2, O1=forward_pass(X, W1, W2)
# print(f'Predicted output is {y_hat}')
# print(f'Input to output layer A2 is {A2}')

# E3, G3, W3_new = backward_pass_output(y, y_hat, A3, W3, lr)
# E2, G2, W2_new = backward_pass_output(y, y_hat, A2, W2, lr)
# print(f'Error at output layer E2 is {E2}')
# print(f'Gradient at output layer G2 is {G2}')
# print(f'Updated W2 is {W2_new}')

# E2, G2, W2_new = backward_pass_hidden(E3, W3, A2, O2, W2, lr)
# E1, G1, W1_new = backward_pass_hidden(E2, W2, X, O1, W1, lr)
# print(f'Error at hidden layer E1 is {E1}')
# print(f'Gradient at hidden layer G1 is {G1}')
# print(f'Updated W1 is {W1_new}')

# One gradient-descent step for the one-hidden-layer network (W1, W2).
# NOTE(review): with the X (4 columns) and W1 (3 rows) defined at the top of
# the notebook, forward_pass fails on `X @ W1`; that is the ValueError recorded
# in this cell's output. Fix the weight shapes before running this cell.
# lr is the learning rate used in the `W - lr * G` updates.
lr=0.1

# Forward pass: prediction, hidden output with the ones column prepended (A2),
# and the raw ReLU hidden output (O1).
y_hat, A2, O1=forward_pass(X, W1, W2)
# print(f'Predicted output is {y_hat}')
# print(f'Input to output layer A2 is {A2}')

# Output layer: error, gradient and updated weights.
E2, G2, W2_new = backward_pass_output(y, y_hat, A2, W2, lr)
# print(f'Error at output layer E2 is {E2}')
# print(f'Gradient at output layer G2 is {G2}')
# print(f'Updated W2 is {W2_new}')

# Hidden layer: note that the pre-update W2 (not W2_new) is passed in, so the
# error is propagated through the weights that produced y_hat.
E1, G1, W1_new = backward_pass_hidden(E2, W2, X, O1, W1, lr)
print(f'Error at hidden layer E1 is {E1}')
print(f'Gradient at hidden layer G1 is {G1}')
print(f'Updated W1 is {W1_new}')


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 3 is different from 4)

In [None]:
# =============================================================================
# COMPLETE EXAMPLE: Network with 2 hidden layers (template)
# =============================================================================
# Forward: X → W1 → O1 → W2 → O2 → W3 → y_hat
# Backward: y_hat ← W3 ← O2 ← W2 ← O1 ← W1 ← X
#
# # Configuration (add at top):
# W1 = np.array([...])  # input to hidden1
# W2 = np.array([...])  # hidden1 to hidden2
# W3 = np.array([...])  # hidden2 to output
#
# # Forward pass (modify function):
# def forward_pass(X, W1, W2, W3):
#     O1 = ReLU(X @ W1)
#     A2 = np.column_stack([np.ones((O1.shape[0], 1)), O1])
#     O2 = ReLU(A2 @ W2)
#     A3 = np.column_stack([np.ones((O2.shape[0], 1)), O2])
#     y_hat = A3 @ W3
#     return y_hat, A2, O1, A3, O2
#
# # Training:
# y_hat, A2, O1, A3, O2 = forward_pass(X, W1, W2, W3)
# E3, G3, W3_new = backward_pass_output(y, y_hat, A3, W3, lr)
# E2, G2, W2_new = backward_pass_hidden(E3, W3, A2, O2, W2, lr)
# E1, G1, W1_new = backward_pass_hidden(E2, W2, X, O1, W1, lr)

# =============================================================================
# COMPLETE EXAMPLE: Network with 3 hidden layers (template)
# =============================================================================
# Forward: X → W1 → O1 → W2 → O2 → W3 → O3 → W4 → y_hat
# Backward: y_hat ← W4 ← O3 ← W3 ← O2 ← W2 ← O1 ← W1 ← X
#
# # Configuration (add at top):
# W1 = np.array([...])  # input to hidden1
# W2 = np.array([...])  # hidden1 to hidden2
# W3 = np.array([...])  # hidden2 to hidden3
# W4 = np.array([...])  # hidden3 to output
#
# # Forward pass (modify function):
# def forward_pass(X, W1, W2, W3, W4):
#     O1 = ReLU(X @ W1)
#     A2 = np.column_stack([np.ones((O1.shape[0], 1)), O1])
#     O2 = ReLU(A2 @ W2)
#     A3 = np.column_stack([np.ones((O2.shape[0], 1)), O2])
#     O3 = ReLU(A3 @ W3)
#     A4 = np.column_stack([np.ones((O3.shape[0], 1)), O3])
#     y_hat = A4 @ W4
#     return y_hat, A2, O1, A3, O2, A4, O3
#
# # Training:
# y_hat, A2, O1, A3, O2, A4, O3 = forward_pass(X, W1, W2, W3, W4)
# E4, G4, W4_new = backward_pass_output(y, y_hat, A4, W4, lr)
# E3, G3, W3_new = backward_pass_hidden(E4, W4, A3, O3, W3, lr)
# E2, G2, W2_new = backward_pass_hidden(E3, W3, A2, O2, W2, lr)
# E1, G1, W1_new = backward_pass_hidden(E2, W2, X, O1, W1, lr)

# =============================================================================
# KEY PATTERNS TO REMEMBER
# =============================================================================
# 1. Forward pass parameters: X, W1, W2, ..., W_n (add weights as needed)
# 2. Forward pass returns: y_hat, A2, O1, A3, O2, ..., A_n, O_{n-1}
#    - Pattern: y_hat, then alternating A and O (A has bias, O doesn't)
# 3. Backward pass always starts with backward_pass_output()
# 4. Then call backward_pass_hidden() for each hidden layer in REVERSE order
# 5. Each backward_pass_hidden() needs: E_{l+1}, W_{l+1}, A_l, O_l, W_l, lr
#    - E and W from the NEXT layer (closer to output)
#    - A, O, W from the CURRENT layer