Load dataset and preprocess

In [88]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from sklearn.preprocessing import OneHotEncoder

(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
# Ensure data and dimensions are correct
assert (x_train.shape == (50000, 32, 32, 3))
assert (y_train.shape == (50000, 1))
assert (x_test.shape == (10000, 32, 32, 3))
assert (y_test.shape == (10000,1))

# Preprocessing
x_train = x_train.astype(np.float32) / 255.0
x_test = x_test.astype(np.float32) / 255.0

# flatten x_train and x_test arrays
# New layout (50000, 32x32x3)
x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)

# One-hot encode y_train and y_test
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(y_train)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [92]:
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

class Sigmoid:
    def forward(self, x):
        # x can be any shape
        return sigmoid(x)

    def backward(self, a):
        # d/dx sigmoid(x) = s * (1 - s)
        s = sigmoid(a)
        return s * (1.0 - s)

class ReLU:
    def forward(self, x):
        return np.maximum(0.0, x)

    def backward(self, x):
        return (x > 0).astype(float)
    
class LeakyReLU:
    def __init__(self, a=0.01):
        # hyperparam a value, so we can tune it for testing
        self.a = a

    def forward(self, x):
        return np.maximum(x, self.a * x)
    
    def backward(self, x):
        return np.where(x > 0, 1.0, self.a)

In [90]:
class Softmax:
    def __init__(self):
        self.y_pred = None

    def forward(self, x):
        #LSE trick
        x_stable = np.exp(x - np.max(x, axis = 1, keepdims=True)) #If x is a batch (in the shape of: [batch_size, num_classes]), then np.max(x) would only return a single scalar value, which is the same value subtracted from all rows
        exp_sum = np.sum(x_stable, axis=1, keepdims=True)
        self.y_pred = x_stable / exp_sum
        return self.y_pred

    # Simplified dCE_Loss/dSoftMax 
    def backward(self, y_actual):
        return self.y_pred - y_actual

In [91]:
class Dropout:
    def __init__ (self, drop_rate):
        # Noting that the drop_rate resembles the probability that a unit will be set to 0. (e.g. 0.5 for 50%)
        self.drop_rate = drop_rate
        # Mask will store the indices of the units which are kept (set to 1) during training.
        self.mask = None


    def forward(self, x, training=True):
        """
        Performs the forward pass for the dropout layer

        args:
            x (np.array) is the input data
            training (bool) if true, apply dropout, else return input as is.
        
        returns:
            The output data after applying dropout (if training is set to True), or data is just passing through.
        """
        # Mask is created, 1 (True) to keep the unit at 0 (False) to drop it.
        # (1-drop_rate) is the probability of keeping a unit.
        if training:
            self.mask = np.random.rand(*x.shape) > self.drop_rate
            # Multiply the mask by 1/p to maintain the expected value of values.
            return x * self.mask*1/(1-self.drop_rate)
        else:
            return x
    
    def backward(self, d_out):
        """
        Performs the backwards pass for the dropout layer.

        args:
            d_out (np.array) is the gradient from the subsequent layer

        returns:
            np.ndarray is the gradient passed to the preceding layer
        """
        # The gradient only flows through the neurons that weren't dropped in the forward pass, and the same inverted scaling factor
        # 1/(1-drop_rate) is applied to the gradient.
        return d_out * self.mask * 1/(1-self.drop_rate)

Input layer: 3072 nodes<br>
Output classification layer: 10 nodes<br><br>To do:<br> Implement L1/L2 Regularisers<br>Look at bottom where training and testing is happening, Add metrics for analysis (for graphs)<br>Add Comments

There maybe some problems with Dropout where it isn't performing as well, test different dropout rates and see what performs well.<br><br>Generally everything else works so don't break anything if your adding

In [None]:
class NeuralNetwork:
    
    def __init__(self, x_train, y_train, hidden_layers, hidden_layer_sizes, activation_func, optimiser, learning_rate=0.03, dropout_rate=0.5):
        
        self.inputs = x_train
        self.outputs = y_train

        self.hidden_layers = hidden_layers

        if (len(hidden_layer_sizes) != hidden_layers):
            raise ValueError("Neurons array length mismatch with hidden layers amount")
        self.hidden_layers_sizes = hidden_layer_sizes

        if not isinstance(activation_func, (Sigmoid, ReLU, LeakyReLU)):
            raise TypeError("Activation function must be of type Sigmoid, ReLU or LeakyReLU")
        self.activation_func = activation_func

        self.learning_rate = learning_rate

        self.weights, self.biases = self.initalise_parameters()

        self.activations = [None] * (self.hidden_layers + 2)

        # Dropout instance per hidden layer
        self.dropout_layers = [Dropout(dropout_rate) for _ in range(self.hidden_layers)]
        
        # Might remove - recheck
        self.dropout_outputs = [None] * self.hidden_layers

        self.softmax = Softmax()

        # For Optimisers
        self.optimiser = optimiser    
        # RMS_Prop
        if optimiser == "RMS_Prop":
            self.r_weights, self.r_biases = self.create_zeroed_matrix_per_parameter()
        # Adam
        if optimiser == "Adam":
            self.adam_moment1_weights, self.adam_moment1_biases = self.create_zeroed_matrix_per_parameter()
            self.adam_moment2_weights, self.adam_moment2_biases = self.create_zeroed_matrix_per_parameter()
            self.adam_timestep = 0


    def initalise_parameters(self):
        network_layout = [len(self.inputs[0])]
        for i in range(self.hidden_layers):
            network_layout.append(self.hidden_layers_sizes[i])
        network_layout.append(len(self.outputs[0]))
        
        weights = []
        biases = []

        # Sigmoid weight initialisation
        if isinstance(self.activation_func, Sigmoid):
            for i in range(len(network_layout) -  1):
                input = network_layout[i]
                output = network_layout[i+1]
                weight_init = (np.random.uniform(low=-1, high=1, size=(output, input))) / (np.sqrt(input))
                bias = np.zeros(output)
                weights.append(weight_init)
                biases.append(bias)
        
        # ReLU weight initialisation
        else:
            for i in range(len(network_layout) -  1):
                input = network_layout[i]
                output = network_layout[i+1]
                weight_init = (np.random.uniform(low=-1, high=1, size=(output, input))) * (np.sqrt(2.0 / input))
                bias = np.zeros(output)
                weights.append(weight_init)
                biases.append(bias)

        return weights, biases
    
    
     # Creates empty matrices for weights/bias needed for the optimisers
    def create_zeroed_matrix_per_parameter(self):
        weights_velocity = []
        biases_velocity = []
        for i in range(len(self.weights)):
            weights_velocity.append(np.zeros(self.weights[i].shape))
            biases_velocity.append(np.zeros(self.biases[i].shape))
        return weights_velocity, biases_velocity
    
    
    def train(self, epochs):
        loss_history = []
        batch_size = 64
        
        for epoch in range(epochs):
            # Shuffle the training set every epoch
            x_shuffled, y_shuffled = self.shuffle_dataset()

            # Average Loss across all batches per epoch
            epoch_loss = 0
            batches_amount = 0
            
            for j in range(0, len(self.inputs), batch_size):
                # Mini-Batch GD, batch size of 64    
                batch_x = x_shuffled[j:j+batch_size]
                batch_y = y_shuffled[j:j+batch_size]

                cache = self.forwardpass(batch_x)

                loss = self.cross_entropy_loss(self.softmax.y_pred, batch_y)

                weight_gradients, bias_gradients = self.backprop(cache,batch_y, batch_size)

                self.update_weights(weight_gradients, bias_gradients)
                
                batches_amount += 1
                epoch_loss += loss
            
            avg_loss = epoch_loss / batches_amount
            # Loss tracking at every epoch rather every batch
            loss_history.append(avg_loss)
        
        return loss_history 

    
    def forwardpass(self, batch_x, training=True):
        cache = []
        self.activations[0] = batch_x 
        for j in range(self.hidden_layers + 1):
            
            # Recheck this block and usage of self.dropout_outputs if necessary
            if j==0:
                input_to_layer = self.activations[j]
            else:
                input_to_layer = self.dropout_outputs[j-1]

            
            z1 = np.dot(input_to_layer, self.weights[j].T) + self.biases[j]
                
            # Softmax
            if j == (len(self.weights) - 1):
                self.softmax.forward(z1)
                self.activations[j+1] = self.softmax.y_pred
                cache.append(z1)
                cache.append(self.softmax.y_pred)

            # All other layers
            else:
                a1 = self.activation_func.forward(z1)
                
                # Integrated Dropout
                # Recheck this block and usage of self.dropout_outputs if necessary
                if training:
                    a_dropout = self.dropout_layers[j].forward(a1, training=True)
                else:
                    a_dropout = a1

                self.activations[j+1] = a1
                self.dropout_outputs[j] = a_dropout
                cache.append(z1)
                cache.append(a1)
            
        return cache

    
    def backprop(self, cache, batch_y, batch_size=64):
        weight_gradients = []
        bias_gradients = []
        
        # Softmax layer backprop
        dsMax = self.softmax.backward(batch_y)
        dw_sMax = np.dot(dsMax.T, self.activations[-2])
        db_sMax = np.sum(dsMax, axis=0)

        # Average over batch size
        dw_sMax /= batch_size
        db_sMax /= batch_size

        # Store gradients in weight and bias lists
        weight_gradients.append(dw_sMax)
        bias_gradients.append(db_sMax)
        
        # Running gradient across entire network
        grad = dsMax

        j = len(self.weights) - 2
        for i in range(len(cache) - 4, -1, -2):
            z = cache[i]
            a = cache[i+1]
            prev_a = cache[i-1] if i > 0 else self.activations[0]

            da = np.dot(grad, self.weights[j+1])
            
            # Recheck this block and usage of self.dropout_outputs if necessary
            da = self.dropout_layers[j].backward(da)
            
            dz = da * self.activation_func.backward(z)
            
            dw = np.dot(dz.T, prev_a) / batch_size
            weight_gradients.append(dw)

            db = np.sum(dz, axis=0) / batch_size
            bias_gradients.append(db)
            
            grad = dz
            j -= 1

        weight_gradients.reverse()
        bias_gradients.reverse()
        return weight_gradients, bias_gradients
    
    
    def update_weights(self, weight_gradients, bias_gradients):
        if self.optimiser == "None":
            self.vanille_gradient_descent(weight_gradients, bias_gradients)
        elif self.optimiser == "RMS_Prop":
            self.RMS_Prop(weight_gradients, bias_gradients)
        elif self.optimiser == "Adam":
            self.adam(weight_gradients, bias_gradients)
        else:
            raise ValueError("Optimiser should be either: None, RMS_Prop or Adam")

    
    # Vanilla Mini-Batch GD 
    def vanille_gradient_descent(self, weight_gradients, bias_gradients):
        for i in range(len(weight_gradients)):
            self.weights[i] = self.weights[i] - (self.learning_rate * weight_gradients[i])
            self.biases[i] = self.biases[i] - (self.learning_rate * bias_gradients[i])
        return self.weights

    
    # Optimiser 1 - RMSProp
    # decay rate hyperparam can be tuned but 0.9 is stable
    def RMS_Prop(self, weight_gradients, bias_gradients, decay_rate=0.9, epsilon=1e-8):
        for i in range(len(self.weights)):
            self.r_weights[i] =  (decay_rate * self.r_weights[i]) + ((1 - decay_rate) * (weight_gradients[i] ** 2))
            self.r_biases[i] =  (decay_rate * self.r_biases[i]) + ((1 - decay_rate) * (bias_gradients[i] ** 2))
            self.weights[i] -= (self.learning_rate * weight_gradients[i]) / np.sqrt(self.r_weights[i] + epsilon)
            self.biases[i] -= (self.learning_rate * bias_gradients[i]) / np.sqrt(self.r_biases[i] + epsilon)
        return self.weights

    
    
    # Optimiser 2 - Adam
    def adam(self, weight_gradients, bias_gradients, momentum_Beta=0.9, RMS_Prop_decay=0.999, epsilon=1e-8):
        self.adam_timestep += 1
        # Uses Momentum and RMSProp
        for i in range(len(self.weights)):
            # Moment 1 - Momentum style
            self.adam_moment1_weights[i] = (momentum_Beta * self.adam_moment1_weights[i]) + ((1 - momentum_Beta) * weight_gradients[i])
            self.adam_moment1_biases[i] = (momentum_Beta * self.adam_moment1_biases[i]) + ((1 - momentum_Beta) * bias_gradients[i])
            
            # Moment 2 - RMS_Prop style
            self.adam_moment2_weights[i] = (RMS_Prop_decay * self.adam_moment2_weights[i]) + ((1 - RMS_Prop_decay) * (weight_gradients[i] ** 2))
            self.adam_moment2_biases[i] = (RMS_Prop_decay * self.adam_moment2_biases[i]) + ((1 - RMS_Prop_decay) * (bias_gradients[i] ** 2))
            
            # Use timestep, which is incremented per batch to correct biases
            moment1_weights_bias_corrected = self.adam_moment1_weights[i] / (1 - (momentum_Beta ** self.adam_timestep))
            moment1_biases_bias_corrected = self.adam_moment1_biases[i] / (1 - (momentum_Beta ** self.adam_timestep))
            moment2_weights_bias_corrected = self.adam_moment2_weights[i] / (1 - (RMS_Prop_decay ** self.adam_timestep))
            moment2_biases_bias_corrected = self.adam_moment2_biases[i] / (1 - (RMS_Prop_decay ** self.adam_timestep))
            # Actual update
            self.weights[i] = self.weights[i] - ((self.learning_rate * moment1_weights_bias_corrected) / np.sqrt(moment2_weights_bias_corrected + epsilon))
            self.biases[i] = self.biases[i] - ((self.learning_rate * moment1_biases_bias_corrected) / np.sqrt(moment2_biases_bias_corrected + epsilon))
        return self.weights

    
    
    # Runs the test data (not part of training)
    def run(self, x_test):
        original_inputs = self.inputs
        self.inputs = x_test
        # Ignore cache
        cache = self.forwardpass(x_test, training=False)
        # Creates 1d array where highest probability index = predicted class
        y_pred = np.argmax(self.softmax.y_pred, axis=1)

        self.inputs = original_inputs
        return y_pred

    
    
    def cross_entropy_loss(self, y_pred, y_true, epsilon=1e-8):
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        loss = -np.sum(y_true * np.log(y_pred), axis=1)
        return np.mean(loss)
    
    
    
    # Shuffle dataset for Mini-Batch GD, so it is representative of entire dataset
    def shuffle_dataset(self):
        random_indices = np.random.permutation(len(self.inputs)) # Set of random indices between 0 and end of dataset 
        x_shuffled = self.inputs[random_indices]
        y_shuffled = self.outputs[random_indices]
        return x_shuffled, y_shuffled

Training works correctly, loss is decreasing.<br>Test different hyperparameters to maximise performance

In [None]:
# Train the Neural Network
training_samples = 10000
x_train_small = x_train[:training_samples]
y_train_small = y_train[:training_samples]


relu = ReLU()
np.random.seed(20) # Set seed so results are reproducible across runs
nn = NeuralNetwork(x_train_small, y_train_small, hidden_layers=3, hidden_layer_sizes=[256, 128, 64] , activation_func=LeakyReLU(), optimiser="RMS_Prop", learning_rate=0.001, dropout_rate=0.1)
loss_history = nn.train(100)



# Check if loss is decreasing
print(f"\nFirst 10 epochs loss:")
for i in range(10):
    print(f"Epoch {i+1}: {loss_history[i]:.4f}")

print(f"\nLast 10 epochs loss:")
for i in range(-10, 0):
    print(f"Epoch {100+i+1}: {loss_history[i]:.4f}")


Layer 0 grad mean: 9.124788650536921e-06
Layer 1 grad mean: 8.096623098448365e-05
Layer 2 grad mean: 0.0007611731318984509
Layer 3 grad mean: 0.010733770372844186


KeyboardInterrupt: 

Basic Testing, Accuracy is currently around 42-43% for training set.

In [99]:
# Test the Neural Network
y_pred = nn.run(x_test)
y_true = np.argmax(y_test, axis=1)
# Convert into boolean array and then sum over all true elements
correct_predictions = np.sum((y_pred == y_true))
accuracy = correct_predictions / len(y_true)
print(correct_predictions)
print(f"\nAccuracy: {accuracy:.2%}")

3294

Accuracy: 32.94%
