In [1]:
import numpy as np    

In [73]:
class NN:
    
    def __init__(self, layers, activation):
        self.layers = layers
        self.n_layers = len(layers)
        self.n_weights = len(layers) - 1
        
        self.activations = {
            "relu": self.relu,
            "sigmoid": self.sigmoid,
            "tanh": self.tanh
        }

        self.activation_function = self.activations[activation]
        
        self.weights = self.default_weights_init(layers)
        self.biases = self.default_bias_init(layers)
        
        self.loss_function = self.mse
        self.loss_function_der = self.mse_der


    # gettting the weights from the normal distribution
    def default_weights_init(self, layers):
        weights = []
        for i in range(self.n_weights):
            weights.append(np.random.randn(layers[i], layers[i + 1]))
        return weights

    # setting all biases to zero
    def default_bias_init(self, layers):
        biases = []
        for i in layers[1:]:
            biases.append(np.random.randn(1, i))
        return biases


    # data set - array of tuples, where the first element is as input array and the second element is the desired output array
    # ratios are the ratios of train, validation and test data
    def load_data(self, data, ratios):
        # checking if the data matches the NN architecture - I/O
        assert (len(data[0][0]) == self.layers[0]) and (len(data[0][1]) == self.layers[-1])
        idx1 = int(ratios[0] * len(data))
        idx2 = int(sum(ratios[:2]) * len(data))
        np.random.shuffle(data)
        self.X_train = np.array([t[0] for t in data[:idx1]])
        self.Y_train = np.array([t[1] for t in data[:idx1]])
        self.X_valid = np.array([t[0] for t in data[idx1:idx2]])
        self.Y_valid = np.array([t[1] for t in data[idx1:idx2]])
        self.X_test = np.array([t[0] for t in data[idx2:]])
        self.Y_test = np.array([t[1] for t in data[idx2:]])

    
    def train_NN(self, n_epochs, batch_size, learning_rate):
        for epoch in range(n_epochs):
            for _ in range (int(len(self.X_train / batch_size))):
                # creating a minibatch on the fly
                minibatch_indexes = np.random.choice(range(0, len(self.X_train)), batch_size, replace=False)
                minibatch_X = self.X_train[minibatch_indexes]
                minibatch_Y = self.Y_train[minibatch_indexes]
                self.update_minibatch(minibatch_X, minibatch_Y, learning_rate, batch_size)
            if epoch % 100 == 0:
                print(f"epoch {epoch}: loss {self.evaluate()}")

    # calculating gradients for a single minibatch + updating NN parameters
    def update_minibatch(self, minibatch_X, minibatch_Y, learning_rate, batch_size):
        nabla_weights = [np.zeros(w.shape) for w in self.weights]
        nabla_biases = [np.zeros(b.shape) for b in self.biases]
        for (X,Y) in zip(minibatch_X, minibatch_Y):
            # backpropagation algorithm
            delta_nabla_weights, delta_nabla_biases = self.backpropagation(X, Y)
            # updating the nablas - adding gradients
            nabla_weights = [nw + dnw for (nw, dnw) in zip(nabla_weights, delta_nabla_weights)]
            nabla_biases = [nb + dnb for (nb, dnb) in zip(nabla_biases, delta_nabla_biases)]
        # updating the NN parameters, averiging the gradients + multiplying by the learning rate
        self.weights = [w - (learning_rate / batch_size) * delta_w for (w, delta_w) in zip(self.weights, nabla_weights)]
        self.biases = [b - (learning_rate / batch_size) * delta_b for (b, delta_b) in zip(self.biases, nabla_biases)]


    # expects input as an 1 x n  numpy array (n = neurons in input layer)
    # only for evaluation, custom input to the network
    def forward_pass(self, input):
        for i in range(self.n_weights):
            z = input @ self.weights[i] + self.biases[i]
            activation = self.activation_function(z)
            input = activation
        return input

    # backpropagation algorithm
    def backpropagation(self, x, y):
        # matrices of gradients - all zeros
        nabla_weights = [np.zeros(w.shape) for w in self.weights]
        nabla_biases = [np.zeros(b.shape) for b in self.biases]
        # storing activations and z's for each layer
        # storing the input as the first activation
        activations = [x]
        weighted_inputs = []

        # feed forward + storing all information
        for (w, b) in zip(self.weights, self.biases):
            z = x @ w + b
            x = self.activation_function(z)
            # storing weighted input + activation for further use
            weighted_inputs.append(z)
            activations.append(x)

        # backward pass
        # error in the output layer
        delta = (self.loss_function_der(activations[-1], y) * self.activation_function(weighted_inputs[-1], derivation=True))
        # gradient of biases = error in the corresponding layer
        nabla_biases[-1] = delta
        # gradient of weights = error in the corresponding layer dotted/weighted with activations of the previous layer
        nabla_weights[-1] = np.dot(activations[-2].T.reshape(-1, 1), delta)
        # iterating layers second to last to the second (first one won't be updated)
        for l in range(self.n_layers - 2, 0, -1):
            current_z = weighted_inputs[l - 1]
            current_z_der = self.activation_function(current_z, derivation=True)
            #              1x2           2x4                      1x4
            delta = np.dot(delta, self.weights[l].T) * current_z_der
            nabla_biases[l - 1] = delta
            nabla_weights[l - 1] = np.dot(activations[l - 1].T.reshape(-1, 1), delta)
        
        return (nabla_weights, nabla_biases)


    def evaluate(self):
        n = 0
        correct = 0
        for (X, Y) in zip(self.X_valid, self.Y_valid):
            n += 1
            if np.round(self.forward_pass(X)) == Y:
                correct += 1
        return 1 - (correct / n)

    # ------------------------------------- Activation Functions ------------------------------------- #
    
    # ReLU - 0 if x < 0 else x
    def relu(self, input, derivation=False):
        if not derivation:
            return np.maximum(0, input)
        else:
            return np.where(input > 0, 1, 0)

    # Sigmoid - 1 / (1 + e^-x)
    def sigmoid(self, input, derivation=False):
        if not derivation:
            return 1 / (1 + np.exp(-input))
        else:
            return self.sigmoid(input)*(1-self.sigmoid(input))

    # Tanh - 1 - 2 / (e^2x + 1)
    def tanh(self, input, derivation=False):
        if not derivation:
            return 1 - 2 / (np.exp(2 * input) + 1)
        else:
            return 1 - self.tanh(input) ** 2
            
    
    # ------------------------------------- Loss Functions ------------------------------------- #

    # mean square error - categorization loss function
    # Y - true values that SHOULD be predicted by the model
    # Y_hat - ACTUAL prediction by the model
    def mse(self, y, y_hat):
        return 0.5 * (y - y_hat) ** 2

    # mean square error derivative
    def mse_der(self, y, y_hat):
        return (y - y_hat)
        


        

In [74]:
example = NN([2, 3, 1], "tanh")

In [75]:
example.load_data(dummy, (0.7, 0.2, 1))

In [76]:
example.train_NN(300, 12, 2)

epoch 0: loss 0.11111111111111116
epoch 100: loss 0.0
epoch 200: loss 0.0


In [78]:
print(example.forward_pass([1, 1]), 0)
print(example.forward_pass([1, 0]), 1)
print(example.forward_pass([0, 1]), 1)
print(example.forward_pass([0, 0]), 0)

[[0.03863464]] 0
[[0.98436839]] 1
[[0.99018412]] 1
[[0.03014505]] 0
