## **Load iris data**

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Fetch the raw iris CSV into the working directory.
# -nc (no-clobber) skips the download when iris.data already exists, so
# re-running this cell does not pile up iris.data.1, iris.data.2, ... copies.
!wget -nc 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

--2025-04-25 08:48:31--  https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘iris.data’

iris.data               [ <=>                ]   4.44K  --.-KB/s    in 0s      

2025-04-25 08:48:31 (62.7 MB/s) - ‘iris.data’ saved [4551]



In [None]:
# Parse the downloaded comma-separated file; header=None makes pandas number
# the columns instead of consuming the first data row as a header.
iris = pd.read_csv("iris.data", sep=",", header=None)

In [None]:
# Create disjoint, reproducible train and test splits.

n_train = 110
n_test = 40
split_seed = 42  # fixed seed so the same rows are selected on every run

# Draw the training rows first, then draw the test rows ONLY from the rows
# that were not selected for training. Sampling both sets independently from
# the full frame would let the same observation land in both, leaking
# training data into the evaluation.
iris_train = iris.sample(n_train, random_state=split_seed)
iris_test = iris.drop(iris_train.index).sample(n_test, random_state=split_seed)


In [None]:
# Show the sampled training rows; the next cell reads columns 0-3 as the
# features and column 4 as the species label.
print(iris_train)

       0    1    2    3                4
41   4.5  2.3  1.3  0.3      Iris-setosa
119  6.0  2.2  5.0  1.5   Iris-virginica
36   5.5  3.5  1.3  0.2      Iris-setosa
73   6.1  2.8  4.7  1.2  Iris-versicolor
118  7.7  2.6  6.9  2.3   Iris-virginica
..   ...  ...  ...  ...              ...
94   5.6  2.7  4.2  1.3  Iris-versicolor
140  6.7  3.1  5.6  2.4   Iris-virginica
19   5.1  3.8  1.5  0.3      Iris-setosa
62   6.0  2.2  4.0  1.0  Iris-versicolor
0    5.1  3.5  1.4  0.2      Iris-setosa

[110 rows x 5 columns]


In [None]:
# Column 4 holds the species name; the first four columns are the measurements.
labels = iris_train.iloc[:, 4]
# Integer class code per row, derived from the categories present in the training labels.
targets = labels.astype("category").cat.codes
features = iris_train.iloc[:, :4]

In [None]:
# Show the species label attached to each training row.
print(labels)

41         Iris-setosa
119     Iris-virginica
36         Iris-setosa
73     Iris-versicolor
118     Iris-virginica
            ...       
94     Iris-versicolor
140     Iris-virginica
19         Iris-setosa
62     Iris-versicolor
0          Iris-setosa
Name: 4, Length: 110, dtype: object


![neural net](./nn-backprop.jpg)

## **Define activation and loss functions**

In [None]:
def relu(x):
    """Rectified linear unit: element-wise max(0, x)."""
    rectified = np.maximum(0, x)
    return rectified

In [None]:
def softmax(x):
    """Softmax along the last axis.

    The maximum along the last axis is subtracted before exponentiating, so
    the largest exponent is exp(0) = 1 and large inputs cannot overflow.
    keepdims=True keeps the reduced axis so the results broadcast back
    against x (for a 2-D array: one max / one sum per row).
    """
    row_peak = np.max(x, axis=-1, keepdims=True)
    exponentials = np.exp(x - row_peak)
    normaliser = np.sum(exponentials, axis=-1, keepdims=True)
    return exponentials / normaliser

In [None]:
def cross_entropy_loss(y, y_pred):
    """Mean categorical cross-entropy between one-hot targets and predictions.

    Parameters
    ----------
    y : array-like
        One-hot encoded targets, classes along the last axis.
    y_pred : array-like
        Predicted class probabilities, same shape as ``y``.

    Returns
    -------
    float
        ``-sum_k y_k * log(p_k)`` for each observation, averaged over the
        observations.
    """
    # Clip so that a predicted probability of exactly 0 cannot produce log(0) = -inf.
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # Sum over the class axis first, then average over observations. Taking
    # the mean over every element would also divide by the number of
    # classes and under-report the loss by that factor.
    per_observation = -np.sum(y * np.log(y_pred), axis=-1)
    return np.mean(per_observation)


## **Initialise the layers**

In [None]:
num_classes = 3  # Number of classes from predictions
X = np.array(features.values)  # Input layer
# Integer class codes -> one-hot rows: indexing the identity matrix by the
# code picks the row with a single 1 in that class's column.
y = np.eye(num_classes)[np.array(targets.values, dtype="i")]


In [None]:
# Standardise every feature column: subtract its mean, divide by its standard deviation.
mu, sd = X.mean(axis=0), X.std(axis=0)
X = (X - mu) / sd

In [None]:
# Dimensions of our neural net
#
# Sizes:
#  - m  : number of examples in the dataset
#  - nx : input size (number of variables being considered)
#  - ny : output size (number of distinct classes in the targets)
#  - nh : layer widths as [input size, hidden units]
#  - L  : number of layers in the network (len(nh) + 1)

m, nx = X.shape
ny = len(np.unique(targets.values))
nh = [nx, 8]
L = len(nh) + 1

## **Neural network (Softmax activation)**

In [None]:
# We build a class to keep the code cleaner and reusable.
# The code highlights the simplicity and elegance of the mathematics.
# It is also an interesting point that we find it hard to visualise anything
# greater than 3 dimensions but can easily solve equations in
# > 768 dimensional space with linear algebra and A-level calculus!

class NN_softmax:
    """One-hidden-layer classifier: ReLU hidden layer, softmax output layer.

    Biases are folded into the weight matrices as an extra last column, and a
    column of ones is appended to each layer's input, so every weighted sum
    (including its bias) is a single matrix product. Because of how matrix
    multiplication works, the input to each layer always has m rows, where m
    is the number of observations in the batch.

    Parameters
    ----------
    n_inputs, n_hidden, n_outputs : int, optional
        Layer sizes. When omitted they fall back to the notebook-level
        globals ``nx``, ``nh[1]`` and ``ny`` respectively, so a bare
        ``NN_softmax()`` behaves as it always did.
    learning_rate : float, optional
        Step size applied by ``optimise`` (default 0.01).

    Attributes
    ----------
    W : ndarray, shape (n_hidden, n_inputs + 1)
        Hidden-layer weights; the last column holds the biases.
    WL : ndarray, shape (n_outputs, n_hidden + 1)
        Output-layer weights; the last column holds the biases.
    """

    def __init__(self, n_inputs=None, n_hidden=None, n_outputs=None, learning_rate=0.01):
        n_inputs = nx if n_inputs is None else n_inputs
        n_hidden = nh[1] if n_hidden is None else n_hidden
        n_outputs = ny if n_outputs is None else n_outputs
        self.learning_rate = learning_rate

        # Each row holds the weights of one neuron in the next layer; each
        # column corresponds to one neuron of the current layer.
        # (The draw order W then WL is kept so a given seed yields the same
        # initial weights as before.)
        weights_hidden = np.random.rand(n_hidden, n_inputs)
        weights_output = np.random.rand(n_outputs, n_hidden)

        # Biases start at zero, one per neuron of the hidden / output layer.
        bias_hidden = np.zeros([1, n_hidden])
        bias_output = np.zeros([1, n_outputs])

        # Tack the biases onto the weights as a final column. Inputs get a
        # matching column of ones so the weighted sum plus bias is one product.
        self.W = np.concatenate((weights_hidden, bias_hidden.T), axis=1)
        self.WL = np.concatenate((weights_output, bias_output.T), axis=1)

    def forward(self, X, y):
        """Run a forward pass and return the cross-entropy loss.

        Caches the augmented input (``self.X``), targets (``self.y``),
        hidden pre-activation (``self.z``), augmented hidden activation
        (``self.a``) and output probabilities (``self.aL``) for ``backprop``.

        Parameters
        ----------
        X : ndarray, shape (m, n_inputs)
        y : ndarray, shape (m, n_outputs)
            One-hot encoded targets.
        """
        m = len(y)  # number of observations
        ones = np.ones([m, 1])
        self.X = np.concatenate((X, ones), axis=1)  # append column of 1s to X
        self.y = y
        self.z = self.X @ self.W.T  # + b is implicit
        self.a = np.concatenate((relu(self.z), ones), axis=1)  # append column of 1s
        zL = self.a @ self.WL.T  # + bL is implicit
        self.aL = softmax(zL)
        return cross_entropy_loss(y, self.aL)

    def backprop(self):
        """Compute loss gradients w.r.t. both weight matrices.

        Reads the values cached by ``forward`` and stores the results in
        ``self.dC_dwL`` (same shape as ``WL``) and ``self.dC_dw`` (same shape
        as ``W``). Gradients are summed, not averaged, over the batch.
        """
        # --- output layer gradients ---
        # Softmax + cross-entropy with one-hot targets collapses to aL - y.
        dC_dzL = self.aL - self.y  # (m, n_outputs)

        # Vector chain rule: dC/dwL = dC/dzL @ dzL/dwL. The matrix product
        # here sums the per-observation partials into a total contribution
        # to the loss; it is NOT applying the total derivative.
        self.dC_dwL = dC_dzL.T @ self.a  # (n_outputs, m) @ (m, n_hidden + 1)

        # --- hidden layer gradients ---
        # How the hidden layer's output influences zL; the last column of WL
        # is excluded because it holds the biases, which have no upstream input.
        dzL_da = self.WL[:, :-1]  # (n_outputs, n_hidden)
        # The ReLU derivative is a matrix of 1s and 0s.
        da_dz = (self.z > 0).astype(int)  # (m, n_hidden)
        # This matrix product DOES introduce the total derivative: each hidden
        # unit collects its influence through every output unit.
        dC_dz = (dC_dzL @ dzL_da) * da_dz  # (m, n_outputs) @ (n_outputs, n_hidden) * (m, n_hidden)

        # Finally sum the partials across observations, as for the output
        # layer (again not a total derivative). dz/dw is the augmented input.
        self.dC_dw = dC_dz.T @ self.X  # (n_hidden, m) @ (m, n_inputs + 1)
        return

    def optimise(self):
        """Take one gradient-descent step using the gradients from ``backprop``."""
        # Biases need no separate update: they live in the last column of WL and W.
        self.WL = self.WL - (self.learning_rate * self.dC_dwL)
        self.W = self.W - (self.learning_rate * self.dC_dw)
        return



## **Training and Evaluation**

### **Train**

In [None]:
# Seed NumPy's global RNG so the np.random.rand weight initialisation in
# NN_softmax.__init__ is repeatable across runs.
# NOTE(review): this cell runs after the iris.sample(...) split cell, so it
# cannot influence which rows were selected there.
np.random.seed(42)

In [None]:
# Build the network; called without arguments, its layer sizes come from the
# notebook globals nx, nh and ny defined above.
nn = NN_softmax()

In [None]:
# Batch gradient descent: every epoch uses the entire training set at once,
# as opposed to mini-batches or stochastic gradient descent (batch size 1).
import time
import tqdm

epochs = 10000
cost = 0
pbar = tqdm.tqdm(range(epochs), desc="Epochs")

for e in pbar:
    # forward pass -> report loss -> gradients -> weight update
    cost = nn.forward(X, y)
    pbar.set_postfix_str(f"Cost: {cost}")
    nn.backprop()
    nn.optimise()

Epochs: 100%|██████████| 10000/10000 [00:13<00:00, 756.64it/s, Cost: 7.31810206063411e-05]


In [None]:
#nn.W.shape

### **Evaluate**

In [None]:
# Raw test features and species labels. The labels are taken from iris_test
# (not iris_train) so they line up row-for-row with the test features.
features_test = iris_test.iloc[:, 0:4]
labels_test = np.array(iris_test.iloc[:, 4].values)  # species names

# Encode the test labels with the categories learned from the TRAINING
# labels, so a species maps to the same integer code / one-hot column in both
# sets even if the test sample happens to miss a class.
train_label_dtype = labels.astype("category").dtype
targets_test = iris_test.iloc[:, 4].astype(train_label_dtype).cat.codes
# A code of -1 marks a label that was never seen in training.
assert (targets_test >= 0).all(), "test set contains a species absent from training"

m_test = len(iris_test)

X_test = np.array(features_test.values)  # Input layer
y_test = np.array(targets_test.values, dtype="i")  # integer class codes

# Standardise with the training mean / standard deviation. The network was
# fitted on features scaled by those statistics, so the test features must
# go through the identical transform; they are recomputed from the raw
# training features here rather than from the test set.
train_features = np.array(features.values)
X_test = (X_test - train_features.mean(axis=0)) / train_features.std(axis=0)

num_classes = len(train_label_dtype.categories)  # Number of classes from predictions
y_test = np.eye(num_classes)[y_test]  # one-hot encoded for categories



In [None]:
# Forward pass over the test set: the displayed value is the cross-entropy
# loss, and the predicted class probabilities are cached in nn.aL for the
# metrics cell below.
nn.forward(X_test,y_test)

np.float64(0.0001426394671996145)

### **Metrics**

In [None]:
# Class probabilities cached by the forward pass over the test set
preds = nn.aL

# One-hot encode each prediction: a 1 in the column with the highest probability
preds_onehot = np.zeros_like(preds)
preds_onehot[range(preds.shape[0]), preds.argmax(axis=1)] = 1

# A row counts as correct only when its one-hot prediction equals the one-hot target
true_positives = np.all(preds_onehot == y_test, axis=1)

accuracy = true_positives.sum() / preds.shape[0]

print("Accuracy on test:", accuracy)


Accuracy on test: 1.0
