In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
# Load the Iris dataset; features live in iris.data and class labels in iris.target.
iris = load_iris()
# Wrap the feature matrix in a DataFrame for inspection only; the cells below train on the raw arrays.
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Preview the first rows of the feature table.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as a test set; the fixed random_state keeps the split repeatable.
# NOTE(review): X_test / y_test are not used by any cell visible in this notebook.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)


In [None]:
input_size = 4   # four input features per flower
output_size = 3  # three target classes
hidden_size = 5  # number of hidden neurons; a free hyperparameter, not a fixed rule

# Draw the initial weights from a seeded generator so that "Restart & Run All"
# reproduces the same starting point. The unseeded global RNG produced
# different weights (and different printed outputs) on every run.
SEED = 42
init_rng = np.random.default_rng(SEED)

# Small random weights break the symmetry between neurons; the 0.01 factor
# keeps the initial pre-activations close to zero.
W1 = init_rng.standard_normal((input_size, hidden_size)) * 0.01   # Layer 1 weights
b1 = np.zeros((1, hidden_size))                                    # Layer 1 biases
W2 = init_rng.standard_normal((hidden_size, output_size)) * 0.01  # Layer 2 weights
b2 = np.zeros((1, output_size))                                    # Layer 2 biases

print("W1 shape:", W1.shape)  # (4, 5) - 4 inputs, 5 hidden neurons
print("W2 shape:", W2.shape)  # (5, 3) - 5 hidden neurons, 3 outputs

# W1 = layer 1 (input -> hidden) weights
# W2 = layer 2 (hidden -> output) weights
# This network has only two weight layers, so there is no W3.

# Biases start at zero and are updated during training.

W1 shape: (4, 5)
W2 shape: (5, 3)


In [None]:
# Check how many training labels there are (one per training sample).
y_train.shape

(120,)

In [None]:
# Inspect the layer-2 biases (all zeros right after initialisation).
print(b2)

[[0. 0. 0.]]


In [None]:
# Inspect the current layer-1 weight matrix.
print(W1)

[[-0.01473486  0.02333305 -0.00904088 -0.00158036  0.01864921]
 [ 0.0146692   0.01898557  0.00670369 -0.0039497  -0.00203858]
 [ 0.00872726 -0.00454707 -0.00253922 -0.01116823 -0.01351247]
 [-0.01222766  0.01019446 -0.00577846 -0.00010357 -0.00147598]]


In [None]:
def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``.

    Works on a scalar or an array. Negative entries become 0; everything
    else passes through unchanged.
    """
    rectified = np.maximum(0, x)
    return rectified

relu(9)

np.int64(9)

In [None]:
def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    Each row is turned into a probability distribution (non-negative,
    summing to 1). The row maximum is subtracted before exponentiating so
    that large scores do not overflow; this does not change the result.
    """
    row_peak = np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(x - row_peak)
    row_totals = exponentials.sum(axis=1, keepdims=True)
    return exponentials / row_totals

In [None]:
def forward(X, W1, b1, W2, b2):
    """Run one forward pass through the two-layer network.

    Returns the tuple ``(a2, a1, z1, z2)``:
    output probabilities, hidden activations, hidden pre-activations and
    output pre-activations, in that order.
    """
    # Input -> hidden: weighted sum plus bias, then ReLU.
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = relu(hidden_pre)

    # Hidden -> output: weighted sum plus bias, then softmax probabilities.
    logits = np.dot(hidden_act, W2) + b2
    probs = softmax(logits)

    return probs, hidden_act, hidden_pre, logits


# In theory, forward propagation means passing the data on to the next layer.
# In practice, it means computing z for every neuron and applying the activation function.
# # Test forward pass
# probabilities = a2
# hidden_out = a1
# hidden_z = z1
# output_z = z2

# New weights → New Z1, Z2 → New A1, A2
# Cycle repeat until accurate predictions


# Smoke test: run the current (untrained) weights on the whole training set.
probabilities, hidden_out, hidden_z, output_z = forward(X_train, W1, b1, W2, b2)
# The second dimension of the output is the number of classes (expected: 3).
probabilities.shape[1]

3

In [None]:
# Look at the first three predictions; with tiny initial weights each class gets roughly 1/3.
probabilities[:3]

array([[0.33306053, 0.33322593, 0.33371354],
       [0.33299662, 0.33320533, 0.33379805],
       [0.33300389, 0.33323264, 0.33376347]])

In [None]:
def compute_loss(y, y_pred, eps=1e-12):
    """Mean categorical cross-entropy between integer labels and predicted probabilities.

    Parameters
    ----------
    y : array of shape (m,)
        Integer class index of every sample.
    y_pred : array of shape (m, n_classes)
        Predicted class probabilities, one row per sample.
    eps : float, optional
        Lower bound applied to the selected probabilities before taking the
        log. A probability that underflowed to exactly 0 would otherwise
        produce ``inf`` (and a RuntimeWarning); values >= eps are untouched.

    Returns
    -------
    The average of ``-log(p_true)`` over the m samples.

    Example: true-class probabilities [0.9, 0.7, 0.5] give per-sample losses
    [-log(0.9), -log(0.7), -log(0.5)] ~= [0.105, 0.357, 0.693].
    """
    y = np.asarray(y)
    m = y.shape[0]

    # Pair row i with column y[i]: the probability assigned to the true class.
    true_class_probs = np.asarray(y_pred)[np.arange(m), y]

    # Floor at eps so that log never sees an exact zero.
    true_class_probs = np.maximum(true_class_probs, eps)

    loss = np.sum(-np.log(true_class_probs)) / m
    return loss

# Loss of the untrained network on the training set.
compute_loss(y_train, probabilities)

# We could also score each sample as (1 - p_true), but -log(p_true) punishes confident wrong predictions much more heavily.

np.float64(1.0985923195560021)

In [None]:
# In `y_pred[range(m), y]`, y_pred is the probabilities matrix from the forward pass.
# With m = 10 it expands to, e.g., y_pred[[0,1,2,3,4,5,6,7,8,9], [2,0,1,2,1,0,2,1,0,1]].
# NumPy pairs the two index lists element by element: "row 0, column 2", then "row 1, column 0", and so on,
# so the result holds one value per sample: the probability predicted for that sample's true class.
# NumPy converts range(m) into an index array automatically (120 entries for the training set).

In [None]:
 ## **5. BACKWARD PROPAGATION (LEARNING)**
def backward(X, y, y_pred, hidden, W1, b1, W2, b2, learning_rate=0.01):
    """Backpropagate the cross-entropy loss and take one gradient-descent step.

    Parameters
    ----------
    X : array (m, n_features)
        Input batch used in the forward pass.
    y : array (m,)
        Integer class index of every sample.
    y_pred : array (m, n_classes)
        Softmax probabilities returned by the forward pass.
    hidden : array (m, n_hidden)
        Hidden-layer activations (after ReLU) from the forward pass.
    W1, b1, W2, b2 : arrays
        Current weights and biases of the two layers.
    learning_rate : float, optional
        Step size of the update (default 0.01, the value that used to be
        hard-coded inside the function).

    Returns
    -------
    (W1, b1, W2, b2) after the update. These are new arrays: the arrays
    passed in are left unmodified, so a caller can keep the old weights.
    """
    m = X.shape[0]

    # Output-layer error. For softmax + cross-entropy the gradient w.r.t. z2
    # is (predicted - one_hot(actual)); subtracting 1 at the true-class
    # column does exactly that. Dividing by m averages over the batch.
    dz2 = y_pred.copy()  # copy so the caller's predictions are not altered
    dz2[np.arange(m), y] -= 1
    dz2 /= m

    # Gradients of the output layer parameters.
    dW2 = np.dot(hidden.T, dz2)
    db2 = np.sum(dz2, axis=0, keepdims=True)

    # Push the error back to the hidden layer; ReLU passes gradient only
    # where the neuron was active (activation > 0) in the forward pass.
    dh = np.dot(dz2, W2.T)
    dz1 = dh * (hidden > 0)

    # Gradients of the hidden layer parameters.
    dW1 = np.dot(X.T, dz1)
    db1 = np.sum(dz1, axis=0, keepdims=True)

    # Gradient-descent step, written out-of-place so the inputs are not
    # mutated behind the caller's back.
    new_W1 = W1 - learning_rate * dW1
    new_b1 = b1 - learning_rate * db1
    new_W2 = W2 - learning_rate * dW2
    new_b2 = b2 - learning_rate * db2

    return new_W1, new_b1, new_W2, new_b2


In [None]:
# Manual shape check: print the shape of every array that takes part in a training step.
print("X_train shape:", X_train.shape)    # (120, 4)
print("W1 shape:", W1.shape)              # (4, 5)
print("W2 shape:", W2.shape)              # (5, 3)
print("b1 shape:", b1.shape)              # (1, 5)
print("b2 shape:", b2.shape)              # (1, 3)

# Forward pass test: only the probabilities and hidden activations are kept.
y_pred, hidden, _, _ = forward(X_train, W1, b1, W2, b2)
print("y_pred shape:", y_pred.shape)      # (120, 3)
print("hidden shape:", hidden.shape)      # (120, 5)

# Backward pass test: one update step.
# NOTE: this rebinds the global W1, b1, W2, b2, so every re-run of this cell applies another step.
W1, b1, W2, b2 = backward(X_train, y_train, y_pred, hidden, W1, b1, W2, b2)

X_train shape: (120, 4)
W1 shape: (4, 5)
W2 shape: (5, 3)
b1 shape: (1, 5)
b2 shape: (1, 3)
y_pred shape: (120, 3)
hidden shape: (120, 5)


In [None]:
# # Example:
# dz2_before = [[0.1, 0.8, 0.1], [0.9, 0.1, 0.0]]
# y = [1, 0]

# # dz2[[0,1], [1,0]] -= 1
# # dz2[0,1] = 0.8 - 1 = -0.2
# dz2_after = [[0.1, -0.2, 0.1], [-0.1, 0.1, 0.0]]

# Then every element is divided by m = 120 (averaging over the batch):
# [[0.1, -0.2, 0.1], ...] / 120 ≈ [[0.0008, -0.0017, 0.0008], ...]


In [None]:
#loss func : how much error
#back propagation: who is how much responsible for error?

# X = X_train (120×4) - Training features
# y = y_train (120,) - Actual labels
# y_pred = probabilities (120×3) - Forward pass
# hidden = a1 (120×5) - Hidden layer outputs (after ReLU)
# W2 = (5×3) - Hidden→Output weights


# dW2 = Input (a1) × Output Error (dz2)
# dW1 = Input (X) × Hidden Error (dz1)

# db2 sums dz2 down each of its 3 columns (one per class) and returns a 1x3 array, e.g. db2 = [[0.15, -0.08, -0.07]]  # example values (each dz2 row sums to 0, so these do too)

In [None]:
#  in forward :

# a1 (hidden) × W2 = z2 (output)

# Backward reverse:

# dz2 (output errors) × W2.T = dh (hidden errors)

# Code: dz2 × W2.T but conceptually:

# Output errors (dz2) → Weights (W2) → Hidden errors (dh)

# Forward: output = hidden × W2

# Backward: dh = dz2 × W2.T

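# The theory and the flow are the same; only the notation in the code differs.
# Multiplying by W2.T makes the dimensions match ((120x3) · (3x5) = (120x5)), but it is more than a shape trick:
# by the chain rule each hidden neuron collects the output errors weighted by the same connections it used in the forward pass.
# The error is already a batch average at this point, because dz2 was divided by m.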

# dz1 = Z1 gradient/error

# it tells: Error in Hidden layers raw calculations (z1)
# after applying ReLU

In [None]:
# We use a small learning rate because the gradient is only an estimate, not a perfect direction.
#  The exact optimal weight change might be 0.2 instead of the calculated 0.3.
#  By taking small steps, we safely approach the maximum accuracy point without overshooting it.

In [None]:
# dz2 = Average error of output layer (z2)

# dz1 = Error of hidden layer (z1)

# dW2, db2 = Output weights & biases error

# dW1, db1 = Hidden weights & biases error

# In the forward pass ReLU zeroed the neurons whose z1 was negative (inactive), so in backprop their gradient is zeroed too, because they did not contribute to the error.
# z1 = [0.8, -0.3, 1.2, -0.5, 0.6]  # Raw calculations
# a1 = relu(z1) = [0.8, 0, 1.2, 0, 0.6]  # After ReLU

# We pass W1, b1, W2, b2 in, and the backward function returns the updated weights and biases.

In [None]:
def train_simple_ann(X_train, y_train, frequency=1000, hidden_size=5, output_size=3, seed=None):
    """Train the two-layer network with full-batch gradient descent.

    Parameters
    ----------
    X_train : array (m, n_features)
        Training features. The input layer width is taken from
        ``X_train.shape[1]`` instead of being hard-coded to 4.
    y_train : array (m,)
        Integer class index of every training sample.
    frequency : int, optional
        Number of training iterations. Each iteration is one pass over the
        complete training set (what is usually called an epoch).
    hidden_size : int, optional
        Number of hidden neurons (default 5, the previous fixed value).
    output_size : int, optional
        Number of classes (default 3, the previous fixed value).
    seed : int or None, optional
        None (default) draws the initial weights from NumPy's global random
        state, exactly as before. An integer gives a repeatable
        initialisation without touching the global state.

    Returns
    -------
    (W1, b1, W2, b2, losses), where ``losses`` holds the loss measured at
    the start of every iteration, i.e. before that iteration's update.
    """
    input_size = X_train.shape[1]

    # np.random and RandomState both expose randn, so one code path serves
    # the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Small random weights, zero biases.
    W1 = rng.randn(input_size, hidden_size) * 0.01
    b1 = np.zeros((1, hidden_size))
    W2 = rng.randn(hidden_size, output_size) * 0.01
    b2 = np.zeros((1, output_size))

    losses = []

    for f in range(frequency):
        # Forward pass; z1 and z2 are not needed by the update and are dropped.
        y_pred, hidden, _, _ = forward(X_train, W1, b1, W2, b2)

        # Record the loss of the current weights.
        loss = compute_loss(y_train, y_pred)
        losses.append(loss)

        # Backward pass (learning): one gradient-descent step.
        W1, b1, W2, b2 = backward(X_train, y_train, y_pred, hidden, W1, b1, W2, b2)

        # Progress report every 100 iterations; the accuracy refers to the
        # predictions made before this iteration's update.
        if f % 100 == 0:
            accuracy = np.mean(np.argmax(y_pred, axis=1) == y_train)
            print(f"Frequency {f}: Loss = {loss:.4f}, Accuracy = {accuracy:.2f}")

    return W1, b1, W2, b2, losses

# Train the model from a fresh random initialisation.
# This rebinds the global W1, b1, W2, b2 to the trained values and keeps the loss history in `losses`.
W1, b1, W2, b2, losses = train_simple_ann(X_train, y_train)

Frequency 0: Loss = 1.0986, Accuracy = 0.34
Frequency 100: Loss = 1.0979, Accuracy = 0.34
Frequency 200: Loss = 1.0930, Accuracy = 0.34
Frequency 300: Loss = 1.0627, Accuracy = 0.33
Frequency 400: Loss = 0.9674, Accuracy = 0.61
Frequency 500: Loss = 0.8464, Accuracy = 0.66
Frequency 600: Loss = 0.7652, Accuracy = 0.66
Frequency 700: Loss = 0.7066, Accuracy = 0.66
Frequency 800: Loss = 0.6611, Accuracy = 0.67
Frequency 900: Loss = 0.6246, Accuracy = 0.69


In [None]:
# `_` in `y_pred, hidden, _, _ = forward(X_train, W1, b1, W2, b2)` means "I don't need this value": it is a throwaway name.
# The forward function also returns z1 and z2, but they are discarded (_) because the training loop does not use them.

# frequency = 1000 is not a fixed value; 1 "frequency" (usually called an epoch) = one pass over all 120 training flowers, i.e. the complete data.
# if f % 100 == 0: report loss and accuracy on every 100th iteration (0, 100, 200, ...).
# np.argmax(y_pred, axis=1) gives the index of the most probable class (0, 1 or 2), unlike np.max, which gives the probability itself.
# np.argmax(y_pred, axis=1) == y_train then gives booleans such as [True, False, True]; their mean, (1 + 0 + 1) / 3, is the accuracy.