In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd

## Load data and preprocessing

In [2]:

# Fetch the Iris dataset and split it into a feature matrix and a label column.
iris = load_iris()
# X columns: sepal length, sepal width, petal length, petal width.
# y is reshaped from a flat label vector into a single-column 2-D array.
X, y = iris.data, iris.target.reshape(-1, 1)

In [3]:
# Show the raw feature matrix.
# NOTE(review): this prints the entire array; a slice such as X[:5] would keep the output short.
print(X)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [4]:
# List the distinct label values present in y.
print(np.unique(y))

[0 1 2]


## One-hot encode labels (3 classes)

In [5]:
# One-hot encode the integer class labels into one column per class.
# The encoder is fed from iris.target rather than from `y` itself, so re-running
# this cell never passes an already-encoded matrix back through the encoder.
encoder = OneHotEncoder(sparse_output=False)  # return a dense ndarray
y = encoder.fit_transform(iris.target.reshape(-1, 1))


In [6]:
# Display the one-hot encoded label matrix produced by the encoder.
y

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0

In [7]:
# Standardise each feature column with StandardScaler.
# The scaler reads the raw iris.data rather than `X`, so re-running this cell
# always starts from the unscaled measurements (the cell is idempotent).
# NOTE(review): the scaler is fitted on every sample before the train/test split,
# so test-set statistics influence preprocessing; fitting on the training split
# only would avoid that leakage.
scaler = StandardScaler()
X = scaler.fit_transform(iris.data)

In [8]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
def sigmoid(z):
    """Logistic sigmoid, σ(z) = 1 / (1 + e^(-z)), applied element-wise.

    Evaluated as exp(-log(1 + e^(-z))) through ``np.logaddexp`` so that large
    negative inputs do not overflow ``np.exp`` (the naive form computes
    e^(+|z|), which overflows to inf and emits a RuntimeWarning).

    Args:
        z: scalar or array of pre-activations.

    Returns:
        Value(s) in [0, 1] with the same shape as ``z``.
    """
    # logaddexp(0, -z) == log(e^0 + e^(-z)) == log(1 + e^(-z)), computed stably.
    return np.exp(-np.logaddexp(0, -z))

In [10]:
def sigmoid_derivative(a):
    """Derivative of the sigmoid, expressed through its output.

    Since σ'(z) = σ(z) · (1 - σ(z)), the argument must be the activation
    ``a = σ(z)`` itself, not the pre-activation ``z``.

    Args:
        a: sigmoid output(s), scalar or array.

    Returns:
        ``a * (1 - a)`` evaluated element-wise.
    """
    complement = 1 - a
    return a * complement

In [11]:
def softmax(z):
    """Row-wise softmax: turn each row of scores into a probability distribution.

    Args:
        z: 2-D array of scores, one row per sample.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting each row's maximum leaves the result unchanged but keeps the
    # exponentials from overflowing.
    row_max = np.max(z, axis=1, keepdims=True)
    exponentials = np.exp(z - row_max)
    # Normalise each row by the sum of its exponentials.
    row_totals = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_totals

In [12]:
def cross_entropy_loss(y_true, y_pred):
    """Mean categorical cross-entropy, L = -(1/m) Σ y_true · log(y_pred).

    Args:
        y_true: one-hot labels, one row per sample.
        y_pred: predicted class probabilities, same shape as ``y_true``.

    Returns:
        The loss summed over all entries and divided by the number of rows.
    """
    n_samples = y_true.shape[0]
    # The small constant keeps log() finite when a predicted probability is 0.
    log_probs = np.log(y_pred + 1e-9)
    total = np.sum(y_true * log_probs)
    return -total / n_samples

## Forward pass — inputs:

- X → shape (m, 4) (m samples, 4 features each)

- W1 → shape (4, 8) (weights input → hidden)

- b1 → shape (1, 8) (bias hidden layer)

- W2 → shape (8, 3) (weights hidden → output)

- b2 → shape (1, 3) (bias output layer)

| Variable | Shape    | Meaning                                                               |
| -------- | -------- | --------------------------------------------------------------------- |
| **X**    | `(m, 4)` | Input features (m samples, 4 features per sample).                    |
| **W1**   | `(4, 8)` | Weights from input → hidden layer. Each column is a neuron’s weights. |
| **b1**   | `(1, 8)` | Bias for each hidden neuron.                                          |
| **Z1**   | `(m, 8)` | Weighted sum going into hidden neurons: `Z1 = X·W1 + b1`.             |
| **A1**   | `(m, 8)` | Activation output of hidden neurons: `A1 = sigmoid(Z1)`.              |
| **W2**   | `(8, 3)` | Weights from hidden → output layer.                                   |
| **b2**   | `(1, 3)` | Bias for each output neuron.                                          |
| **Z2**   | `(m, 3)` | Weighted sum going into output neurons: `Z2 = A1·W2 + b2`.            |
| **A2**   | `(m, 3)` | Final output probabilities: `A2 = softmax(Z2)`.                       |


In [13]:
def forward_pass(X, W1, b1, W2, b2):
    """Run the two-layer network forward: sigmoid hidden layer, softmax output.

    Args:
        X: input features, one row per sample.
        W1, b1: weights and bias of the input → hidden layer.
        W2, b2: weights and bias of the hidden → output layer.

    Returns:
        Tuple ``(Z1, A1, Z2, A2)``: hidden pre-activation, hidden activation,
        output pre-activation, and output class probabilities.
    """
    # Hidden layer: affine map followed by the sigmoid non-linearity.
    hidden_linear = np.dot(X, W1) + b1
    hidden_activation = sigmoid(hidden_linear)

    # Output layer: affine map followed by a row-wise softmax.
    logits = np.dot(hidden_activation, W2) + b2
    probabilities = softmax(logits)

    return hidden_linear, hidden_activation, logits, probabilities

## Forward pass — outputs:

- Z1 → shape (m, 8)

- A1 → shape (m, 8)

- Z2 → shape (m, 3)

- A2 → shape (m, 3)

| Variable    | Shape    | Meaning                                                                                        |
| ----------- | -------- | ---------------------------------------------------------------------------------------------- |
| **y\_true** | `(m, 3)` | One-hot encoded labels for each sample.                                                        |
| **dZ2**     | `(m, 3)` | Error signal at output layer: `A2 - y_true`. This is how far predictions are from true labels. |
| **dW2**     | `(8, 3)` | Gradient of loss wrt W2: how much each hidden→output weight should change.                     |
| **db2**     | `(1, 3)` | Gradient of loss wrt b2: how much each output neuron’s bias should change.                     |
| **dA1**     | `(m, 8)` | Error signal propagated backwards into hidden layer: `dZ2·W2.T`.                               |
| **dZ1**     | `(m, 8)` | Error signal at hidden layer after activation derivative: `dA1 * sigmoid_derivative(A1)`.      |
| **dW1**     | `(4, 8)` | Gradient of loss wrt W1: how much each input→hidden weight should change.                      |
| **db1**     | `(1, 8)` | Gradient of loss wrt b1: how much each hidden neuron’s bias should change.                     |


## Backward pass — inputs:

- X → shape (m, 4)

- y_true → shape (m, 3) (one-hot encoded labels)

- Z1, A1, A2 (from forward pass)

- W2 → shape (8, 3)

In [14]:
def backward_pass(X, y, Z1, A1, A2, W2):
    """Back-propagate the error and return gradients for all parameters.

    Args:
        X: input features, one row per sample.
        y: one-hot labels, same shape as ``A2``.
        Z1: hidden pre-activation; accepted but not used in the computation.
        A1: hidden-layer activations from the forward pass.
        A2: output probabilities from the forward pass.
        W2: hidden → output weights.

    Returns:
        Tuple ``(dW1, db1, dW2, db2)`` of gradients, each averaged over the
        number of rows in ``X``.
    """
    inv_m = 1 / X.shape[0]  # averaging factor over the batch

    # Output layer: the error signal is the gap between predictions and labels.
    delta_out = A2 - y
    grad_W2 = inv_m * np.dot(A1.T, delta_out)
    grad_b2 = inv_m * np.sum(delta_out, axis=0, keepdims=True)

    # Hidden layer: push the error back through W2, then through the sigmoid,
    # whose derivative is evaluated from the activation A1.
    delta_hidden = np.dot(delta_out, W2.T) * sigmoid_derivative(A1)
    grad_W1 = inv_m * np.dot(X.T, delta_hidden)
    grad_b1 = inv_m * np.sum(delta_hidden, axis=0, keepdims=True)

    return grad_W1, grad_b1, grad_W2, grad_b2


## Backward pass — outputs:

- dW1 → (4, 8)

- db1 → (1, 8)

- dW2 → (8, 3)

- db2 → (1, 3)

Forward pass takes weights + X → returns activations & predictions.

Backward pass takes activations + true labels → returns weight & bias gradients.

In [15]:
# Seed NumPy's global generator so the initial weights are reproducible.
np.random.seed(42)

# Layer widths: inputs come from the feature count, outputs from the number of
# one-hot label columns; the hidden width is chosen by hand.
input_size = X_train.shape[1]
output_size = y_train.shape[1]
hidden_size = 8

# Input → hidden parameters: small random weights, zero biases.
W1 = 0.01 * np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))

# Hidden → output parameters, drawn after W1 from the same seeded stream.
W2 = 0.01 * np.random.randn(hidden_size, output_size)
b2 = np.zeros((1, output_size))

In [16]:
# Full-batch gradient descent on the training split.
lr, epochs = 0.1, 500  # learning rate and number of passes over the data

for epoch in range(epochs):
    # Forward pass, then the loss for the current parameters (before the update).
    Z1, A1, Z2, A2 = forward_pass(X_train, W1, b1, W2, b2)
    loss = cross_entropy_loss(y_train, A2)

    # Gradients of the loss with respect to every parameter.
    dW1, db1, dW2, db2 = backward_pass(X_train, y_train, Z1, A1, A2, W2)

    # Gradient-descent step; `-=` updates each parameter array in place.
    for param, grad in zip((W1, b1, W2, b2), (dW1, db1, dW2, db2)):
        param -= lr * grad

    # Report progress every 50th epoch.
    if not (epoch + 1) % 50:
        print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

Epoch 50, Loss: 1.0981
Epoch 100, Loss: 1.0952
Epoch 150, Loss: 1.0691
Epoch 200, Loss: 0.9253
Epoch 250, Loss: 0.6961
Epoch 300, Loss: 0.5580
Epoch 350, Loss: 0.4831
Epoch 400, Loss: 0.4336
Epoch 450, Loss: 0.3958
Epoch 500, Loss: 0.3645


In [None]:

# Run the trained network on the held-out split.
_, A1_test, _, A2_test = forward_pass(X_test, W1, b1, W2, b2)

# Reduce probability rows and one-hot rows to class indices.
predictions = A2_test.argmax(axis=1)
true_labels = y_test.argmax(axis=1)

# Accuracy is the fraction of samples whose predicted class matches the label.
accuracy = (predictions == true_labels).mean()

print(f"\nTest Accuracy: {accuracy*100:.2f}%")


Test Accuracy: 93.33%
