In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# loading in the data, then one-hot encoding the target variable (because it's categorical) and scaling the input variables (because they're continuous and on different scales, so we want to normalise them), finally splitting the data into training and testing sets

iris = datasets.load_iris()
X = iris.data
y = iris.target

encoder = OneHotEncoder(sparse_output=False)
y_onehot = encoder.fit_transform(y.reshape(-1, 1))

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_onehot, test_size=0.2, random_state=42
)

In [5]:
# we'll now define the architecture of MLP
input_size = X_train.shape[1]
hidden_size = 10
output_size = y_train.shape[1]

# also initialising weights and biases
# we use np.random.randn() to initialise the weights and np.zeros() to initialise the biases because it's a good practice to initialise the weights randomly and the biases to zero
W1 = np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size)
b2 = np.zeros((1, output_size))

In [7]:
# we will need an activation function, I have decided to go with ReLU - the purpose of this function, in simple terms, is to basically round off all negative values to zero and keep the positive values as they are
def relu(x):
    return np.maximum(0, x)

# we will also need a softmax function for the output layer to get the probabilities of each class (3 in this case because of the iris dataset) - we use keepdims=True to keep the dimensions of the output the same as the input so that we can use it in the backpropagation step (which I will define later)
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

# now we will define the forward propagation function which will take the input and return the output of the network, you can see that we are using the weights and biases that we initialised earlier and then each step is shown as per the architecture of the network and the defined activation functions for each layer
def forward_propagation(X):
    Z1 = np.dot(X, W1) + b1
    A1 = relu(Z1)
    Z2 = np.dot(A1, W2) + b2
    A2 = softmax(Z2)
    return A1, A2

In [8]:
# with the functions defined above, we can do an example forward pass to see the output of the network
A1, A2 = forward_propagation(X_train)
print("Output of forward pass:", A2)

# the output of the network at this stage is not very useful because the weights and biases are random, but we can see that the output is a probability distribution over the 3 classes for each input

Output of forward pass: [[8.91156965e-06 9.99147198e-01 8.43890830e-04]
 [5.69179183e-05 8.38383645e-01 1.61559437e-01]
 [6.36602721e-02 8.24455668e-02 8.53894161e-01]
 [4.93263922e-05 9.95415627e-01 4.53504678e-03]
 [1.76209943e-05 9.98712312e-01 1.27006743e-03]
 [3.75302528e-01 2.21604673e-01 4.03092800e-01]
 [7.06543580e-02 5.10117646e-02 8.78333877e-01]
 [2.69319528e-04 9.85215858e-01 1.45148224e-02]
 [6.12449484e-05 9.94804300e-01 5.13445501e-03]
 [1.05728739e-05 9.94459804e-01 5.52962273e-03]
 [4.87194324e-01 1.14081787e-01 3.98723890e-01]
 [9.97451226e-02 1.91558826e-01 7.08696052e-01]
 [5.35614364e-02 4.75562339e-02 8.98882330e-01]
 [1.59472379e-04 9.90660491e-01 9.18003622e-03]
 [3.01221907e-04 9.80415641e-01 1.92831368e-02]
 [1.27761659e-01 4.91732570e-01 3.80505771e-01]
 [2.75074121e-01 2.02186507e-01 5.22739372e-01]
 [5.83103073e-02 1.56644789e-02 9.26025214e-01]
 [1.08612802e-01 1.33663091e-01 7.57724107e-01]
 [6.84082825e-04 3.38097606e-04 9.98977820e-01]
 [3.17537441e-01