In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [6]:
# PREPROCESSING

# loading in the data, then one-hot encoding the target variable (because it's categorical) and scaling the input variables (because they're continuous and on different scales, so we want to normalise them), finally splitting the data into training and testing sets

iris = datasets.load_iris()
X = iris.data
y = iris.target

encoder = OneHotEncoder(sparse_output=False)
y_onehot = encoder.fit_transform(y.reshape(-1, 1))

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_onehot, test_size=0.2, random_state=42
)

In [3]:
# FUNCTIONS

# we will need an activation function, I have decided to go with ReLU - the purpose of this function, in simple terms, is to basically round off all negative values to zero and keep the positive values as they are
def relu(x):
    return np.maximum(0, x)

# we will also need a softmax function for the output layer to get the probabilities of each class (3 in this case because of the iris dataset) - we use keepdims=True to keep the dimensions of the output the same as the input so that we can use it in the backpropagation step (which I will define later)
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

# now we will define the forward propagation function which will take the input and return the output of the network, you can see that we are using the weights and biases that we initialised earlier and then each step is shown as per the architecture of the network and the defined activation functions for each layer
def forward_propagation(X):
    Z1 = np.dot(X, W1) + b1
    A1 = relu(Z1)
    Z2 = np.dot(A1, W2) + b2
    A2 = softmax(Z2)
    return A1, A2

# now we will define the loss function, which is the cross-entropy loss function in this case, which is used for classification problems
def compute_loss(y_true, y_pred):
    m = y_true.shape[0]
    loss = -np.sum(y_true * np.log(y_pred + 1e-8)) / m
    return loss

# now we will define the backpropagation function which will take the input and the output of the network and return the gradients of the weights and biases
def backward_propagation(X, y_true, A1, A2):
    m = X.shape[0]
    
    dZ2 = A2 - y_true
    dW2 = np.dot(A1.T, dZ2) / m
    db2 = np.sum(dZ2, axis=0, keepdims=True) / m
    
    dA1 = np.dot(dZ2, W2.T)
    dZ1 = dA1 * (A1 > 0)
    dW1 = np.dot(X.T, dZ1) / m
    db1 = np.sum(dZ1, axis=0, keepdims=True) / m
    
    return dW1, db1, dW2, db2

In [4]:
# MLP ARCHITECTURE

# we'll now define the architecture of MLP
input_size = X_train.shape[1]
hidden_size = 10
output_size = y_train.shape[1]

# also initialising weights and biases
# we use np.random.randn() to initialise the weights and np.zeros() to initialise the biases because it's a good practice to initialise the weights randomly and the biases to zero
W1 = np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size)
b2 = np.zeros((1, output_size))

In [5]:
# TRAINING

# now we will train the model
A1, A2 = forward_propagation(X_train)
loss = compute_loss(y_train, A2)
dW1, db1, dW2, db2 = backward_propagation(X_train, y_train, A1, A2)

# printing the loss and the gradients
print("Cross-entropy loss:", loss)
print("Gradients for W1:", dW1)
print("Gradients for b1:", db1)
print("Gradients for W2:", dW2)
print("Gradients for b2:", db2)

Cross-entropy loss: 4.334043860996714
Gradients for W1: [[ 0.15463296  0.36888635 -0.00958456 -0.40463802  0.16478134 -0.70570292
   0.35978339 -0.01281693  0.42710569 -0.24857933]
 [-0.07113153  0.09322556 -0.44499477  0.3464897   0.03177741  0.53449126
  -0.01285199 -0.11271197  0.08267068 -0.02454444]
 [ 0.16929814  0.36801776  0.10225235 -0.51020906  0.15721051 -0.86603342
   0.38315878  0.04003184  0.42186072 -0.26447068]
 [ 0.1618078   0.39164172  0.05737263 -0.48653422  0.1600853  -0.82946049
   0.4044847   0.03906057  0.44127529 -0.27410666]]
Gradients for b1: [[-0.15512208  0.25026316  0.22208807  0.34206629 -0.19958147  0.68282602
   0.33506686 -0.02515413  0.32272691 -0.21493174]]
Gradients for W2: [[-0.33931468  0.01907505  0.32023963]
 [ 0.0530791   0.37947252 -0.43255162]
 [-0.1937552  -0.15043805  0.34419325]
 [-0.76010726 -0.11808922  0.87819648]
 [-0.34367272 -0.1556986   0.49937132]
 [-0.95303032  0.04112224  0.91190807]
 [ 0.00293688  0.49218091 -0.49511779]
 [-0.028