## Imports

In [1]:
import numpy as np

## Network Class

In [71]:
"""
Simple_NN
Class to create a simple neural network
    Learning Algorithm: stochastic gradient descent with backpropagation
    Activation Function: sigmoid
    Cost Function: MSE
"""
class Simple_NN(object):
    """Fully connected feed-forward network trained with mini-batch SGD.

    Learning algorithm: stochastic gradient descent with backpropagation.
    Activation function: sigmoid.
    Cost function: MSE.

    Indexing conventions (layer 0 is the input layer):
        self.weights[l][j, i]  weight into neuron j of layer l + 1 from
                               neuron i of layer l.
        self.biases[l][k]      bias of neuron k of layer l + 1.
        self.Z[l], self.activations[l], self.errors[l]
                               2-D arrays indexed [neuron, training_example]
                               for layer l + 1.

    All stored quantities are plain ``numpy.ndarray`` objects; ``np.matrix``
    inputs are accepted and converted.
    """

    def __init__(self, layers, activation_function="sigmoid", cost_function="MSE"):
        """Create a network with randomly initialised weights and biases.

        Args:
            layers: sequence where the ith number is how many neurons are in
                the ith layer of the network (index 0 is the input layer).
            activation_function: name of the neuron activation function.
                The only supported option is "sigmoid" (default).
            cost_function: name of the cost function. The only supported
                option is "MSE" (default).
        """
        self.layers = layers
        self.num_layers = len(layers)

        # One weight matrix and one bias vector per non-input layer, drawn
        # from a standard normal distribution.
        self.weights = []
        self.biases = []
        for layer_num in range(1, self.num_layers):
            self.weights.append(
                np.random.randn(layers[layer_num], layers[layer_num - 1]))
            self.biases.append(np.random.randn(layers[layer_num]))

        self.activation_function = activation_function
        self.cost_function = cost_function
        # Set by train(); used by grad_descent() when no rate is passed.
        self.learning_rate = None

        # Filled in by evaluate().
        self.Z = []
        self.activations = []
        # Filled in by backpropagate().
        self.errors = []
        self.gradC_b = []
        self.gradC_w = []

    @staticmethod
    def _as_columns(X):
        """Return X as a 2-D float array whose columns are examples."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # A flat vector is a single example, i.e. one column.
            X = X.reshape(-1, 1)
        return X

    def train(self, training_data, batch_size, num_epochs, learning_rate):
        """Train the network with stochastic gradient descent.

        Training data should be given in the following format:
            [x11, x12, ..., x1i, y1
             x21, x22, ..., x2i, y2
             ...
             xm1, xm2, ..., xmi, ym]
        where each row corresponds to one training example with i input
        values followed by its class label (see output_matrix for how labels
        map to output neurons).

        Args:
            training_data: 2-D array-like (ndarray, np.matrix or nested
                lists). It is copied, so the caller's data is not reordered.
            batch_size: number of examples per mini-batch (>= 1). The last
                batch of an epoch may be smaller.
            num_epochs: number of passes over the training data.
            learning_rate: step size used for every gradient descent update.

        Raises:
            ValueError: if batch_size is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        # Work on a private copy so shuffling does not touch the caller's data.
        data = np.array(training_data, dtype=float)
        num_examples = data.shape[0]
        self.learning_rate = learning_rate

        for epoch in range(num_epochs):
            print("EPOCH: %d" % (epoch + 1))
            # Randomize the order of training examples (rows).
            np.random.shuffle(data)
            # Separate inputs from outputs.
            inputs = data[:, :-1]
            labels = data[:, -1]
            for start in range(0, num_examples, batch_size):
                # Columns of X are the training examples of this batch, so a
                # whole batch is propagated with one matrix product W X.
                X = inputs[start:start + batch_size].T
                Y = labels[start:start + batch_size]
                self.evaluate(X)            # feedforward
                self.backpropagate(X, Y)    # gradients
                self.grad_descent(learning_rate)

    def evaluate(self, X):
        """Feed X forward through the network.

        Stores the weighted inputs in self.Z and the activations in
        self.activations, both indexed [layer][neuron, training_example].

        Args:
            X: input column vector, or a matrix with one training example per
                column. A flat 1-D sequence is treated as a single example.

        Returns:
            The activations of the output layer, one column per example.
        """
        current = self._as_columns(X)
        self.Z = []
        self.activations = []
        for weights, biases in zip(self.weights, self.biases):
            # Reshaping the bias to a column lets broadcasting add it to
            # every training example (column) at once.
            bias_column = np.asarray(biases, dtype=float).reshape(-1, 1)
            z = np.dot(np.asarray(weights), current) + bias_column
            current = self.activation(z)
            self.Z.append(z)
            self.activations.append(current)
        return current

    def backpropagate(self, X, Y):
        """Compute errors and cost gradients for one batch.

        Must be called after evaluate(X), because it reads self.Z and
        self.activations. Sets:
            self.errors[layer][neuron, training_example]  error per neuron
            self.gradC_b[layer]  gradient wrt biases, averaged over the batch
            self.gradC_w[layer]  gradient wrt weights, averaged over the batch

        Args:
            X: the inputs given to evaluate(), one example per column.
            Y: class labels, one per training example.
        """
        X = self._as_columns(X)
        num_examples = X.shape[1]

        # Error of the output layer: [i, j] is the error of output neuron i
        # for training example j.
        delta = np.multiply(
            self.cost_derivative(self.activations[-1], Y),
            self.activation_derivative(self.Z[-1]))
        errors = [delta]
        # Walk from the last hidden layer back towards the input; the error
        # of a layer is the next layer's error pulled back through the
        # connecting weights, scaled by the local activation slope.
        for k in range(self.num_layers - 3, -1, -1):
            delta = np.multiply(
                np.dot(np.transpose(np.asarray(self.weights[k + 1])), delta),
                self.activation_derivative(self.Z[k]))
            errors.append(delta)
        errors.reverse()
        self.errors = errors

        # Activations feeding each layer: the raw inputs for the first layer,
        # then the activations of every layer except the output layer.
        layer_inputs = [X] + list(self.activations[:-1])
        self.gradC_b = []
        self.gradC_w = []
        for delta, prev_activations in zip(self.errors, layer_inputs):
            # dC/db is the error itself; average it over the batch.
            self.gradC_b.append(np.mean(delta, axis=1))
            # delta . prev^T sums dC/dw over the batch; divide by the real
            # batch size so a short final batch is averaged correctly.
            self.gradC_w.append(
                np.dot(delta, np.transpose(prev_activations)) / num_examples)

    def grad_descent(self, learning_rate=None):
        """Update weights and biases using the gradients from backpropagate().

        Args:
            learning_rate: step size. When omitted, the rate most recently
                passed to train() is used.

        Raises:
            ValueError: if no learning rate is given and train() has not
                stored one.
        """
        if learning_rate is None:
            learning_rate = self.learning_rate
        if learning_rate is None:
            raise ValueError(
                "learning_rate must be given or set by a previous train() call")
        for layer in range(self.num_layers - 1):
            # ravel() keeps biases as flat vectors regardless of the shape of
            # the stored gradient.
            bias_step = learning_rate * np.asarray(self.gradC_b[layer]).ravel()
            self.biases[layer] = (
                np.asarray(self.biases[layer], dtype=float).ravel() - bias_step)
            self.weights[layer] = (
                np.asarray(self.weights[layer])
                - learning_rate * np.asarray(self.gradC_w[layer]))

    def activation(self, z):
        """Apply the activation function element-wise.

        Args:
            z: weighted inputs, where entry [i, j] belongs to the ith neuron
                in the jth training example.

        Raises:
            ValueError: if self.activation_function is not supported.
        """
        if self.activation_function == "sigmoid":
            return 1.0 / (1.0 + np.exp(-np.asarray(z, dtype=float)))
        raise ValueError(
            "Unsupported activation function: %r" % (self.activation_function,))

    def activation_derivative(self, z):
        """Return the element-wise derivative of the activation at z.

        Raises:
            ValueError: if self.activation_function is not supported.
        """
        if self.activation_function == "sigmoid":
            # sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)); evaluate it once.
            s = self.activation(z)
            return np.multiply(1 - s, s)
        raise ValueError(
            "Unsupported activation function: %r" % (self.activation_function,))

    def output_matrix(self, activations, outputs):
        """Turn class labels into a one-hot target matrix.

        Outputs of the network are assumed to be disjoint categories, so only
        one output neuron should fire per example.

        Args:
            activations: output-layer activations; only the number of rows
                (output neurons) is used.
            outputs: one label per training example, in any shape. Labels are
                converted to integers.

        Returns:
            Array indexed [neuron, training_example] with a single 1 per
            column at row ``label - 1``. Because of ordinary negative
            indexing, label 0 selects the last output neuron.
        """
        labels = np.asarray(outputs).astype(int).ravel()
        targets = np.zeros((np.shape(activations)[0], labels.size))
        targets[labels - 1, np.arange(labels.size)] = 1
        return targets

    def calculate_cost(self, activations, outputs):
        """Return the scalar cost of a batch.

        For "MSE" this is half the sum of squared errors of each training
        example, averaged over all examples.

        Args:
            activations: output activations indexed [neuron, training_example].
            outputs: class labels, one per training example.

        Raises:
            ValueError: if self.cost_function is not supported.
        """
        targets = self.output_matrix(activations, outputs)
        if self.cost_function == "MSE":
            squared_errors = np.square(np.asarray(activations) - targets)
            # Average sum of squared errors over each training example.
            return 0.5 * np.average(np.sum(squared_errors, 0))
        raise ValueError("Unsupported cost function: %r" % (self.cost_function,))

    def cost_derivative(self, activations, outputs):
        """Return the derivative of the cost wrt the output activations.

        Entry [i, j] of the result is the gradient of the cost function with
        respect to the activation of the ith neuron in the jth training
        example.

        Raises:
            ValueError: if self.cost_function is not supported.
        """
        targets = self.output_matrix(activations, outputs)
        if self.cost_function == "MSE":
            return np.asarray(activations) - targets
        raise ValueError("Unsupported cost function: %r" % (self.cost_function,))

    # TODO:
    #   predict(data) -> call evaluate, return neuron number in output layer
    #       with max output
    #   test(testing_data) -> add to each epoch in train to get the number of
    #       correct predictions using predict and Y

    def print_network(self, debug=0):
        """Print the network state to stdout.

        Args:
            debug: when equal to 1, also print weighted inputs, errors and
                both gradient lists.
        """
        print("Number of layers: ")
        print(self.num_layers)
        sections = [
            ("Weights: ", self.weights),
            ("\nBiases:", self.biases),
            ("\nActivations:", self.activations),
        ]
        # Print extra info about the network.
        if debug == 1:
            sections.extend([
                ("\nWeighted Inputs:", self.Z),
                ("\nErrors:", self.errors),
                ("\nBias Gradients:", self.gradC_b),
                ("\nWeight Gradients", self.gradC_w),
            ])
        for title, per_layer in sections:
            print(title)
            for layer in per_layer:
                print(layer)

In [72]:
# TEST NETWORK CREATION
# Smoke test: build a five-layer network, train it on a tiny hand-written
# data set, run one example through it and dump the resulting state.
test = Simple_NN([3, 7, 5, 4, 6])

# Six training examples, one per row: three input values followed by the
# class label in the last column.
random_data = np.matrix([
    [1, 2, 3, 0],
    [11, 12, 13, 1],
    [21, 22, 23, 2],
    [31, 32, 33, 3],
    [41, 42, 43, 4],
    [51, 52, 53, 5],
])

# Hyperparameters, kept as module-level names.
batch_size = 2
num_epochs = 10
learning_rate = 0.5

test.train(random_data, batch_size, num_epochs, learning_rate)
print("Done Training")

# A single input example as a 3x1 column vector.
test_input = np.matrix([[1], [2], [3]])
test.evaluate(test_input)

test.print_network()


EPOCH: 1
EPOCH: 2
EPOCH: 3
EPOCH: 4
EPOCH: 5
EPOCH: 6
EPOCH: 7
EPOCH: 8
EPOCH: 9
EPOCH: 10
Done Training
Number of layers: 
5
Weights: 
[[-1.57385241 -0.98113612 -0.62163248]
 [-0.16552989 -0.08024697  0.08835607]
 [ 0.90473293 -0.78300384 -0.42353519]
 [ 0.30717572 -0.38364937 -0.36020769]
 [-0.40471432 -0.03408016 -1.05373029]
 [ 1.73568318  0.00372334  0.25753556]
 [ 0.19085972 -0.48560685  0.14483556]]
[[ 0.55722924 -0.81221202 -0.19445222  1.57406828 -0.33503165  0.63293282
   1.46264311]
 [-0.82334928 -2.0436962   0.98549067  2.0283247   0.13466573  2.33253479
   0.78327198]
 [ 2.58965687  1.30094233  0.33424546  0.68120173  0.11688864  1.05984171
   1.81946981]
 [-0.41389148 -0.14637531  1.05729996  0.08461584 -0.86509125  1.66868837
   0.28443579]
 [ 0.15225072 -2.0888847   0.65296554  1.63008856  0.63175085 -0.87351167
  -0.89370605]]
[[-1.13970394 -0.79415168 -0.18912557 -0.98659181  2.19564678]
 [ 0.11983706 -1.45035694  2.19737104 -0.25051697  0.06667381]
 [ 0.77386591  1.3