Load dataset and preprocess

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from sklearn.preprocessing import OneHotEncoder

(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
# Sanity-check that the download has the expected CIFAR-10 layout
assert x_train.shape == (50000, 32, 32, 3)
assert y_train.shape == (50000, 1)
assert x_test.shape == (10000, 32, 32, 3)
assert y_test.shape == (10000, 1)

# Preprocessing

# Min-max scale normalisation, returns values between 0-1 for the training set.
# The statistics are taken from the training set only and re-used for the test
# set, so both splits are on the same scale without peeking at test data.
# Converting to Python floats first avoids uint8 wrap-around in the subtraction.
train_min = float(np.min(x_train))
train_range = float(np.max(x_train)) - train_min
x_train = (x_train - train_min) / train_range
x_test = (x_test - train_min) / train_range
# Change type to 4 byte float for performance if needed, currently 8 byte float
print(type(x_train[0][0][0][0]))

# Flatten x_train and x_test arrays
# New layout (n_samples, 32x32x3)
x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)
print(x_train.shape)

# One-hot encode y_train and y_test; the encoder is fitted on the training
# labels only and then applied to both splits
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(y_train)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)
print(y_train.shape)

<class 'numpy.float64'>
(50000, 3072)
(50000, 10)


In [2]:
import numpy as np

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) element-wise.

    The value is computed from exp(-|z|), which is always in (0, 1], so the
    exponential can never overflow for large-magnitude inputs:
        z >= 0:  1 / (1 + exp(-z))
        z <  0:  exp(z) / (1 + exp(z))

    Args:
        z: scalar or array of any shape.

    Returns:
        Values in [0, 1] with the same shape as z.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    return np.where(z >= 0, 1.0, decay) / (1.0 + decay)

class Sigmoid:
    """Sigmoid activation layer that remembers its last output for backprop."""

    def __init__(self):
        # Output of the most recent forward() call; None until forward() runs.
        self.out = None

    def forward(self, x):
        """Apply the sigmoid to x (any shape) and cache the result."""
        activated = sigmoid(x)
        self.out = activated
        return activated

    def backward(self, dout):
        """Scale the upstream gradient by the local slope s * (1 - s).

        Uses the output cached by the last forward() call as s.
        """
        s = self.out
        return dout * s * (1.0 - s)


class ReLU:
    """Rectified linear unit layer that remembers its last output for backprop."""

    def __init__(self):
        # Output of the most recent forward() call; None until forward() runs.
        self.out = None

    def forward(self, x):
        """Clamp negative entries of x to zero and cache the result."""
        rectified = np.maximum(0.0, x)
        self.out = rectified
        return rectified

    def backward(self, dout):
        """Return a copy of dout with entries zeroed where the unit was inactive.

        A unit counts as inactive when its cached forward output is <= 0.
        The caller's array is left untouched.
        """
        inactive = self.out <= 0.0
        grad = dout.copy()
        grad[inactive] = 0.0
        return grad
    
class LeakyReLU:
    """Leaky ReLU layer: forward computes max(x, a * x) element-wise."""

    def __init__(self, a):
        # Output of the most recent forward() call; None until forward() runs.
        self.out = None
        # Element-wise local slope (1 or a) of the most recent forward() call.
        self.slope = None
        # hyperparam a value, so we can tune it for testing
        self.a = a

    def forward(self, x):
        """Apply the leaky ReLU to x and cache the output and the local slope."""
        leaked = self.a * x
        # The slope is 1 wherever the identity branch wins the max, else a.
        self.slope = np.where(x >= leaked, 1.0, self.a)
        self.out = np.maximum(x, leaked)
        return self.out

    def backward(self, dout):
        """Scale the upstream gradient by the slope of the last forward pass.

        Like Sigmoid.backward and ReLU.backward, this returns the gradient
        with respect to the layer input, i.e. dout * (1 or a). The slope comes
        from the input seen by forward(), not from the sign of dout.
        """
        return dout * self.slope

In [3]:
import numpy as np
class Softmax:
    """Softmax output layer intended to be paired with cross-entropy loss."""

    def __init__(self):
        # Probabilities produced by the most recent forward() call.
        self.y_pred = None

    def forward(self, x, y=None):
        """Turn logits into probabilities along the last axis.

        Args:
            x: logits, e.g. shape (batch, classes); each row is normalised
                independently.
            y: unused; accepted only so existing two-argument calls keep working.

        Returns:
            Array with the same shape as x whose last axis sums to 1.
        """
        x = np.asarray(x, dtype=float)
        # Subtracting the row maximum leaves the result unchanged but keeps
        # every exponent <= 0, so np.exp cannot overflow.
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        self.y_pred = exps / np.sum(exps, axis=-1, keepdims=True)
        return self.y_pred

    # Using Cross-Entropy-Loss
    def backward(self, y_actual):
        """Return y_pred - y_actual for the probabilities cached by forward().

        This is the gradient of the cross-entropy loss with respect to the
        logits when y_actual is one-hot encoded.
        """
        return self.y_pred - y_actual

In [1]:
import numpy as np

class Dropout:
    """Inverted dropout layer."""

    def __init__(self, drop_rate):
        """
        args:
            drop_rate (float) is the probability that a unit is set to 0 (e.g. 0.5 for 50%).
                Must satisfy 0 <= drop_rate < 1.

        raises:
            ValueError if drop_rate is outside [0, 1); a rate of 1 would divide by zero
            when rescaling the kept units.
        """
        if not 0.0 <= drop_rate < 1.0:
            raise ValueError("drop_rate must be in the range [0, 1)")
        self.drop_rate = drop_rate
        # Boolean mask of the units kept by the last training forward pass.
        # None when no mask applies (before any forward pass, or after an inference pass).
        self.mask = None

    def forward(self, x, training=True):
        """
        Performs the forward pass for the dropout layer

        args:
            x (np.array) is the input data
            training (bool) if true, apply dropout, else return input as is.

        returns:
            The output data after applying dropout (if training is set to True), or the
            input unchanged.
        """
        if not training:
            # Forget any earlier mask so backward() passes gradients straight through.
            self.mask = None
            return x

        keep_prob = 1.0 - self.drop_rate
        # True keeps the unit, False drops it. `>=` keeps every unit when drop_rate is 0.
        self.mask = np.random.rand(*x.shape) >= self.drop_rate
        # Divide by the keep probability so the expected value of each unit is unchanged.
        return x * self.mask / keep_prob

    def backward(self, d_out):
        """
        Performs the backwards pass for the dropout layer.

        args:
            d_out (np.array) is the gradient from the subsequent layer

        returns:
            np.ndarray is the gradient passed to the preceding layer
        """
        if self.mask is None:
            # Last forward pass was the identity (or none has run), so is the gradient.
            return d_out
        # The gradient only flows through the units that were kept, with the same
        # 1/(1-drop_rate) scaling that was applied in the forward pass.
        return d_out * self.mask / (1.0 - self.drop_rate)

Input layer: 3072 nodes<br>
Output classification layer: 10 nodes<br><br>To do: Simplify where possible; there may still be bugs<br> Implement L1/L2 regularisers<br>Activation functions should be chosen per layer; currently there is one for the entire network<br>Add logging so we can use that information to create graphs for analysis

In [None]:
import numpy as np
class NeuralNetwork:
    """Fully connected classifier trained with mini-batch gradient descent.

    Every hidden layer uses the same activation object followed by its own
    Dropout layer; the output layer is a Softmax trained with cross-entropy
    loss. Weight matrices are stored as (n_out, n_in), so a layer computes
    ``a_in @ W.T + b``.
    """

    def __init__(self, x_train, y_train, hidden_layers, hidden_layer_sizes, activation_func, learning_rate=0.03):
        """
        Args:
            x_train: training inputs, indexed as (n_samples, n_features).
            y_train: one-hot training targets, indexed as (n_samples, n_classes).
            hidden_layers: number of hidden layers.
            hidden_layer_sizes: number of units per hidden layer; its length
                must equal hidden_layers.
            activation_func: a Sigmoid, ReLU or LeakyReLU instance used for
                every hidden layer.
            learning_rate: step size used by every optimiser.

        Raises:
            ValueError: if len(hidden_layer_sizes) != hidden_layers.
            TypeError: if activation_func is not one of the supported types.
        """
        # The full training set is kept separately because `inputs`/`outputs`
        # are overwritten with the current mini-batch during training.
        self.x_train = x_train
        self.y_train = y_train
        self.inputs = x_train
        self.outputs = y_train

        self.hidden_layers = hidden_layers

        if len(hidden_layer_sizes) != hidden_layers:
            raise ValueError("Neurons array length mismatch with hidden layers amount")
        self.hidden_layers_sizes = hidden_layer_sizes

        if not isinstance(activation_func, (Sigmoid, ReLU, LeakyReLU)):
            raise TypeError("Activation function must be of type Sigmoid, ReLU or LeakyReLU")
        self.activation_func = activation_func

        self.learning_rate = learning_rate

        self.weights, self.biases = self.create_weight_matrices()

        # activations[0] is the batch input, activations[k] the output of layer k-1.
        self.activations = [None] * (self.hidden_layers + 2)

        # Dropout instance per hidden layer
        self.dropout_layers = [Dropout(drop_rate=0.5) for _ in range(self.hidden_layers)]

        # Optimiser state, one buffer per parameter array.
        # Momentum
        self.weights_velocity, self.biases_velocity = self.create_zeroed_matrix_per_parameter()

        # RMS_Prop
        self.r_weights, self.r_biases = self.create_zeroed_matrix_per_parameter()

        # Adam
        self.adam_moment1_weights, self.adam_moment1_biases = self.create_zeroed_matrix_per_parameter()
        self.adam_moment2_weights, self.adam_moment2_biases = self.create_zeroed_matrix_per_parameter()
        # Number of Adam updates applied so far (used for bias correction).
        self.adam_timestep = 0

    def create_weight_matrices(self):
        """Create randomly initialised weights and zero biases for every layer.

        Weights are drawn uniformly from [-s / sqrt(n_in), s / sqrt(n_in)] with
        s = 1 for Sigmoid and s = 2 for ReLU / LeakyReLU.

        Returns:
            (weights, biases): lists with one entry per layer; weights[i] has
            shape (n_out, n_in) and biases[i] has shape (n_out,).
        """
        layer_sizes = [len(self.inputs[0]), *self.hidden_layers_sizes, len(self.outputs[0])]
        scale = 1.0 if isinstance(self.activation_func, Sigmoid) else 2.0

        weights = []
        biases = []
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            limit = scale / np.sqrt(fan_in)
            weights.append(np.random.uniform(low=-limit, high=limit, size=(fan_out, fan_in)))
            biases.append(np.zeros(fan_out))
        return weights, biases

    def train(self, epochs=100, batch_size=64, optimiser="adam"):
        """Train on the data passed to the constructor.

        Exactly one optimiser update is applied per mini-batch.

        Args:
            epochs: number of passes over the training set.
            batch_size: number of samples per mini-batch; the final batch of
                an epoch may be smaller.
            optimiser: one of "sgd", "momentum", "rmsprop" or "adam".

        Returns:
            List with the sample-weighted mean training loss of each epoch.

        Raises:
            ValueError: if optimiser is not a supported name.
        """
        update_rules = {
            "sgd": self.update_gradients,
            "momentum": self.update_gradients_momentum,
            "rmsprop": self.RMS_Prop,
            "adam": self._adam_step,
        }
        if optimiser not in update_rules:
            raise ValueError(f"Unknown optimiser {optimiser!r}; expected one of {sorted(update_rules)}")
        apply_update = update_rules[optimiser]

        n_samples = len(self.x_train)
        loss_history = []
        for _ in range(epochs):
            epoch_loss = 0.0
            for start in range(0, n_samples, batch_size):
                self.inputs = self.x_train[start:start + batch_size]
                self.outputs = self.y_train[start:start + batch_size]

                loss, cache, softmax = self.forwardpass(training=True)
                weight_gradients, bias_gradients = self.backprop(cache, softmax)
                apply_update(weight_gradients, bias_gradients)

                # Weight by batch length so a short final batch is not over-counted.
                epoch_loss += loss * len(self.inputs)
            # Loss tracking at every epoch rather than every batch
            loss_history.append(epoch_loss / n_samples)
        return loss_history

    def forwardpass(self, training=True):
        """Run the current batch (self.inputs) through the network.

        Args:
            training: forwarded to every Dropout layer; pass False to disable
                dropout.

        Returns:
            (ce_loss, cache, softmax): the mean cross-entropy loss against
            self.outputs, a flat list [z_0, a_0, z_1, a_1, ...] holding each
            layer's pre-activation and output (hidden outputs are stored after
            dropout), and the Softmax object holding the predictions.
        """
        cache = []
        self.activations[0] = self.inputs
        softmax = Softmax()
        last_layer = len(self.weights) - 1
        for j, (weight, bias) in enumerate(zip(self.weights, self.biases)):
            z = np.dot(self.activations[j], weight.T) + bias

            if j == last_layer:
                # Output layer. The targets are passed along because
                # Softmax.forward is declared with a second parameter.
                softmax.forward(z, self.outputs)
                a = softmax.y_pred
            else:
                a = self.activation_func.forward(z)
                # Hidden layer j owns dropout layer j (same index as in backprop).
                a = self.dropout_layers[j].forward(a, training=training)

            self.activations[j + 1] = a
            cache.append(z)
            cache.append(a)

        # Compute CE_loss
        ce_loss = self.cross_entropy_loss(softmax.y_pred, self.outputs)
        return ce_loss, cache, softmax

    def backprop(self, cache, softmax, batch_size=None):
        """Compute batch-averaged gradients for every weight matrix and bias.

        Must be called after forwardpass() on the same batch, because it reads
        self.activations, self.outputs and the dropout masks.

        Args:
            cache: the cache list returned by forwardpass().
            softmax: the Softmax object returned by forwardpass().
            batch_size: divisor used for averaging; defaults to the number of
                samples in the current batch.

        Returns:
            (weight_gradients, bias_gradients): lists ordered like
            self.weights / self.biases, with matching shapes.
        """
        if batch_size is None:
            batch_size = len(self.outputs)

        n_layers = len(self.weights)
        weight_gradients = [None] * n_layers
        bias_gradients = [None] * n_layers

        # Gradient w.r.t. the output-layer pre-activation (softmax + cross-entropy).
        dz = softmax.backward(self.outputs)

        for j in range(n_layers - 1, -1, -1):
            if j < n_layers - 1:
                # Pull the gradient back through layer j+1's weights,
                # (batch, out) @ (out, in) -> (batch, in), then through this
                # layer's dropout and activation.
                da = np.dot(dz, self.weights[j + 1])
                da = self.dropout_layers[j].backward(da)
                dz = da * self._activation_derivative(cache[2 * j])

            # (out, batch) @ (batch, in) -> (out, in), the same layout as weights[j].
            weight_gradients[j] = np.dot(dz.T, self.activations[j]) / batch_size
            bias_gradients[j] = np.sum(dz, axis=0) / batch_size

        return weight_gradients, bias_gradients

    def _activation_derivative(self, z):
        """Return the element-wise derivative of the hidden activation at z.

        Computed directly from the pre-activation so it does not depend on the
        state cached inside the activation object, which is shared by all
        hidden layers.
        """
        activation = self.activation_func
        if isinstance(activation, Sigmoid):
            s = activation.forward(z)
            return s * (1.0 - s)
        if isinstance(activation, LeakyReLU):
            return np.where(z >= 0, 1.0, activation.a)
        # ReLU
        return (z > 0).astype(float)

    # Creates empty matrices for weights/bias needed for the optimisers
    def create_zeroed_matrix_per_parameter(self):
        """Return zero-filled buffers shaped like every weight matrix and bias.

        Returns:
            (weight_buffers, bias_buffers): two lists of arrays.
        """
        weight_buffers = [np.zeros_like(weight) for weight in self.weights]
        bias_buffers = [np.zeros_like(bias) for bias in self.biases]
        return (weight_buffers, bias_buffers)

    # Regular Mini-Batch GD
    def update_gradients(self, weight_gradients, bias_gradients):
        """Apply one plain gradient-descent step and return self.weights."""
        for i in range(len(weight_gradients)):
            self.weights[i] = self.weights[i] - (self.learning_rate * weight_gradients[i])
            self.biases[i] = self.biases[i] - (self.learning_rate * bias_gradients[i])
        return self.weights

    # Momentum Mini-Batch GD
    # Beta hyperparam can be tuned, but 0.9 seems stable
    def update_gradients_momentum(self, weight_gradients, bias_gradients, Beta=0.9):
        """Apply one momentum step and return self.weights.

        The velocity is an exponential moving average of the gradients; the
        parameters move against it by learning_rate.
        """
        for i in range(len(self.weights)):
            self.weights_velocity[i] = (Beta * self.weights_velocity[i]) + ((1 - Beta) * weight_gradients[i])
            self.biases_velocity[i] = (Beta * self.biases_velocity[i]) + ((1 - Beta) * bias_gradients[i])
            self.weights[i] = self.weights[i] - (self.learning_rate * self.weights_velocity[i])
            self.biases[i] = self.biases[i] - (self.learning_rate * self.biases_velocity[i])
        return self.weights

    # Optimiser 1 - RMSProp
    # decay rate hyperparam can be tuned but 0.9 is stable
    def RMS_Prop(self, weight_gradients, bias_gradients, decay_rate=0.9, epsilon=1e-8):
        """Apply one RMSProp step and return self.weights.

        Each gradient is divided by the root of a running average of its
        squared values (plus epsilon for numerical safety).
        """
        for i in range(len(self.weights)):
            self.r_weights[i] = (decay_rate * self.r_weights[i]) + ((1 - decay_rate) * (weight_gradients[i] ** 2))
            self.r_biases[i] = (decay_rate * self.r_biases[i]) + ((1 - decay_rate) * (bias_gradients[i] ** 2))
            self.weights[i] -= (self.learning_rate * weight_gradients[i]) / np.sqrt(self.r_weights[i] + epsilon)
            self.biases[i] -= (self.learning_rate * bias_gradients[i]) / np.sqrt(self.r_biases[i] + epsilon)
        return self.weights

    def _adam_step(self, weight_gradients, bias_gradients):
        """Advance the Adam timestep by one and apply an Adam update."""
        self.adam_timestep += 1
        return self.adam(weight_gradients, bias_gradients, self.adam_timestep)

    # Optimiser 2 - Adam
    def adam(self, weight_gradients, bias_gradients, timestep, momentum_Beta=0.9, RMS_Prop_decay=0.999, epsilon=1e-8):
        """Apply one Adam step and return self.weights.

        Args:
            weight_gradients, bias_gradients: gradients ordered like
                self.weights / self.biases.
            timestep: 1-based count of Adam updates, used to bias-correct the
                moment estimates (timestep 0 would divide by zero).
            momentum_Beta: decay rate of the first-moment average.
            RMS_Prop_decay: decay rate of the second-moment average.
            epsilon: added to the denominator for numerical safety.
        """
        first_correction = 1 - (momentum_Beta ** timestep)
        second_correction = 1 - (RMS_Prop_decay ** timestep)
        for i in range(len(self.weights)):
            # Moment 1 - Momentum style
            self.adam_moment1_weights[i] = (momentum_Beta * self.adam_moment1_weights[i]) + ((1 - momentum_Beta) * weight_gradients[i])
            self.adam_moment1_biases[i] = (momentum_Beta * self.adam_moment1_biases[i]) + ((1 - momentum_Beta) * bias_gradients[i])

            # Moment 2 - RMS_Prop style
            self.adam_moment2_weights[i] = (RMS_Prop_decay * self.adam_moment2_weights[i]) + ((1 - RMS_Prop_decay) * (weight_gradients[i] ** 2))
            self.adam_moment2_biases[i] = (RMS_Prop_decay * self.adam_moment2_biases[i]) + ((1 - RMS_Prop_decay) * (bias_gradients[i] ** 2))

            # Bias-correct the zero-initialised moment estimates
            m_hat_weights = self.adam_moment1_weights[i] / first_correction
            m_hat_biases = self.adam_moment1_biases[i] / first_correction
            v_hat_weights = self.adam_moment2_weights[i] / second_correction
            v_hat_biases = self.adam_moment2_biases[i] / second_correction

            # Actual update: epsilon is added outside the square root
            self.weights[i] = self.weights[i] - ((self.learning_rate * m_hat_weights) / (np.sqrt(v_hat_weights) + epsilon))
            self.biases[i] = self.biases[i] - ((self.learning_rate * m_hat_biases) / (np.sqrt(v_hat_biases) + epsilon))
        return self.weights

    def run(self):
        """Placeholder; not implemented yet."""
        pass

    def cross_entropy_loss(self, y_pred, y_actual, epsilon=1e-8):
        """Return the mean cross-entropy between predictions and one-hot targets.

        Predictions are clipped to [epsilon, 1 - epsilon] so the logarithm
        never receives 0.
        """
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        loss = -np.sum(y_actual * np.log(y_pred), axis=1)
        return np.mean(loss)