In [1]:
# Imports

import random
import numpy as np
from sklearn.datasets import load_iris
import pandas as pd
import sklearn.model_selection
from sklearn.preprocessing import StandardScaler

In [2]:
# Declare constants

NUM_INPUT_NODES = 4
NUM_OUTPUT_NODES = 3
NUM_HIDDEN_NODES = 8
NUM_HIDDEN_LAYERS = 1

In [3]:
class Node:
    """Each node has a weight and bias"""
    def __init__(self, num_weights):
        """
        num_weights: the number of nodes in the previous layer
        """
        self.z = 0
        self.activation = 0
        self.weights = [random.uniform(0, 0.01) for _ in range(num_weights)]
        self.bias = random.uniform(0.0, 0.01)

# Consider using a separate class for input node, not necessary
class InputNode:
    def __init__(self):
        self.activation = 0

In [4]:
# activation functions

def relu(x):
    return np.maximum(0, x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def tanh(x):
    return np.tanh(x)

def leaky_relu(x, alpha=0.01):
    return max(alpha * x, x)

def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits))  # for numerical stability
    return exp_logits / np.sum(exp_logits, axis=0)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def sigmoid_derivative(x):
    return x * (1 - x)

def leaky_relu_derivative(x, alpha=0.01):
    return np.where(x > 0, 1, alpha)

In [5]:
class MLP:
    "Create a multilayer perceptron"
    def __init__(self, n_in, n_out, n_hidden, hidden_layers):
        self.n_hidden_layers = hidden_layers;
        self.nodes = []
        self.create(n_in, n_out, n_hidden)
        self.gradient = []

    def create(self, n_in, n_out, n_hidden):
        nodes = []
        
        input_layer = [InputNode() for _ in range(n_in)]
        nodes.append(input_layer)
        
        for i in range(self.n_hidden_layers):
            nodes.append([Node(num_weights=n_in) for _ in range(n_hidden)])
            
        output_layer = [Node(num_weights=n_hidden) for _ in range(n_out)]
        nodes.append(output_layer)
        
        self.nodes = nodes

    def forward_pass(self, X_row):
        # input layer
        for i, input_node in enumerate(self.nodes[0]):
            input_node.activation = X_row[i]

        # input layer to hidden layer
        for L in range(1, self.n_hidden_layers + 1):  # note +1 must be used bc the range is not inclusive of upper
            for node_j in self.nodes[L]:
                node_j.z = sum(node_k.activation * node_j.weights[k] for k, node_k in enumerate(self.nodes[L - 1])) + node_j.bias
                node_j.activation = leaky_relu(node_j.z)

        # calculate z of output layer
        for output_node_j in self.nodes[-1]:
            output_node_j.z = sum(node_k.activation * output_node_j.weights[k] for k, node_k in enumerate(self.nodes[-2])) + output_node_j.bias

        # apply softmax to output layer activations as a vector
        softmax_output = softmax([output_node.z for output_node in self.nodes[-1]])
        
        #update activations of output layer
        for j, output_node_j in enumerate(self.nodes[-1]):
            output_node_j.activation = softmax_output[j]

        output = [node.activation for node in mlp.nodes[-1]]
        return output

    def compute_gradient(self, y):
        ohw = []  # contains partial derivative of cost w respect to weights btwn output and hidden
        ohb = []  # contains partial derivative of cost w respect to biases btwn output and hidden
        hiw = []  # contains partial derivative of cost w respect to weights btwn hidden and input
        hib = []  # contains partial derivative of cost w respect to biases btwn hidden and input

        # weights btwn output and hidden 
        for j, output_node_j in enumerate(self.nodes[-1]):
            for k, weight_jk in enumerate(output_node_j.weights):
                a_k = self.nodes[-2][k].activation
                z_j = output_node_j.z
                a_j = output_node_j.activation
                partial = a_k * leaky_relu_derivative(z_j) * 2 * (a_j - y[j])
                ohw.append(partial)
                
        # biases btwn output and hidden
        for j, output_node_j in enumerate(self.nodes[-1]):
            z_j = output_node_j.z
            a_j = output_node_j.activation
            partial = leaky_relu_derivative(z_j) * 2 * (a_j - y[j])
            ohb.append(partial)

        # weights btwn hidden and input
        for L in range(1, self.n_hidden_layers + 1):  # note +1 must be used bc the range is not inclusive of upper
            for j, hidden_node_j in enumerate(self.nodes[L]):
                for k, weight_jk in enumerate(hidden_node_j.weights):
                    # notice it uses the updated weight from the output layer, recursive
                    partialC_partiala_j = sum([output_node.weights[j] * leaky_relu_derivative(output_node.z) * 2 * (output_node.activation - y[i]) for i, output_node in enumerate(self.nodes[L+1])])    
                    a_k = self.nodes[L-1][k].activation
                    z_j = hidden_node_j.z
                    partial = a_k * leaky_relu_derivative(z_j) * partialC_partiala_j
                    hiw.append(partial)

        # biases btwn hidden and input
        for L in range(1, self.n_hidden_layers + 1):  # note +1 must be used bc the range is not inclusive of upper
            for j, hidden_node_j in enumerate(self.nodes[L]):
                # notice it uses the updated weight from the output layer, recursive
                partialC_partiala_j = sum([output_node.weights[j] * leaky_relu_derivative(output_node.z) * 2 * (output_node.activation - y[i]) for i, output_node in enumerate(self.nodes[L+1])])    
                a_k = self.nodes[L-1][k].activation
                z_j = hidden_node_j.z
                partial = leaky_relu_derivative(z_j) * partialC_partiala_j
                hib.append(partial)
        self.gradient = []
        self.gradient.append(ohw)
        self.gradient.append(ohb)
        self.gradient.append(hiw)
        self.gradient.append(hib)

    def update(self, eta):
        indexer = 0  # gradient indexer
        
        # update weights between output and second to last layer
        for j, output_node_j in enumerate(self.nodes[-1]):
            for k, weight_jk in enumerate(output_node_j.weights):
                weight_jk -= eta * self.gradient[0][indexer]
                output_node_j.weights[k] = weight_jk
                indexer += 1
        indexer = 0
        
        # update biases between output and second to last layer
        for j, output_node_j in enumerate(self.nodes[-1]):
            bias_j = output_node_j.bias - eta * self.gradient[1][indexer]
            output_node_j.bias = bias_j
            indexer += 1
        indexer = 0

        # TODO implement recursive backpropogation for multiple hidden layers
        # update weights between hidden and previous layers
        for L in range(1, self.n_hidden_layers + 1):  # note +1 must be used bc the range is not inclusive of upper
            for j, hidden_node_j in enumerate(self.nodes[L]):
                for k, weight_jk in enumerate(hidden_node_j.weights):
                    weight_jk -= eta * self.gradient[2][indexer]
                    hidden_node_j.weights[k] = weight_jk
                    indexer += 1
        indexer = 0

        # update biases between hidden and previous layers
        for L in range(1, self.n_hidden_layers + 1):  # note +1 must be used bc the range is not inclusive of upper
            for j, hidden_node_j in enumerate(self.nodes[L]):
                bias_j = output_node_j.bias - eta * self.gradient[3][indexer]
                output_node_j.bias = bias_j
                indexer += 1
        indexer = 0

    def evaluate(self, X_test, y_test):
        errors = []
        for i in range(len(X_test)):
            prediction = self.forward_pass(X_test[i])
            error = abs(prediction - y_test[i])
            errors.append(error)
        accuracy = 1 - np.mean(errors)
        return accuracy
                    
    def show_weights(self):
        for layer in mlp.nodes[1:]:
            print([node.weights for node in layer])
            

In [6]:
# Create MLP

mlp = MLP(NUM_INPUT_NODES, NUM_OUTPUT_NODES, NUM_HIDDEN_NODES, NUM_HIDDEN_LAYERS)

In [7]:
iris = load_iris()

In [8]:
X = iris.data
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [9]:
y = iris.target

In [10]:
# Number of classes
num_classes = np.max(y) + 1

# One-hot encode the target variable y
y = np.eye(num_classes)[y]

In [11]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.33)

In [12]:
mlp.nodes[-1][0].weights

[0.00014148481690323344,
 0.0070306806965436145,
 0.0008680063724662568,
 0.0014796119380785823,
 0.006663263768033457,
 0.004256902080828936,
 0.0011275532346807194,
 0.005908726800250265]

In [13]:
# training loop
learning_rate = 1.0
epochs = 40
for i in range(1, epochs+1):
    print("epoch " + str(i))
    for i, row in enumerate(X_train):
        output = mlp.forward_pass(X_train[i])
        # input = [node.activation for node in mlp.nodes[0]]
        # hidden = [node.activation for node in mlp.nodes[1]]
        #output = [node.activation for node in mlp.nodes[-1]]
        mlp.compute_gradient(y_train[i])
        mlp.update(learning_rate)
        print(output)
        print(y_train[i])

epoch 1
[0.33411315007766296, 0.3329063546106797, 0.3329804953116573]
[1. 0. 0.]
[0.7871882953773987, 0.10640596256119203, 0.10640574206140944]
[1. 0. 0.]
[0.8499920329736081, 0.07490738469443547, 0.07510058233195634]
[0. 1. 0.]
[0.5099283559024055, 0.2506960976379796, 0.23937554645961495]
[0. 0. 1.]
[0.2728418765680525, 0.3700050759745419, 0.35715304745740567]
[0. 1. 0.]
[0.27020001204231836, 0.3748184185507449, 0.35498156940693665]
[0. 0. 1.]
[0.2689483291790636, 0.37152711341436107, 0.3595245574065753]
[0. 1. 0.]
[0.2678283507868107, 0.37564313357056633, 0.35652851564262295]
[1. 0. 0.]
[0.27203734516802497, 0.373412504023415, 0.35455015080855995]
[0. 1. 0.]
[0.2704376541266872, 0.37770802954846744, 0.35185431632484526]
[0. 1. 0.]
[0.2687602064581422, 0.3821305963622293, 0.34910919717962846]
[0. 1. 0.]
[0.26708106761598716, 0.3865409506831955, 0.3463779817008174]
[0. 1. 0.]
[0.2653923483771986, 0.39092559665328486, 0.34368205496951654]
[1. 0. 0.]
[0.2697286318559671, 0.38845756352759

In [14]:
accuracy = mlp.evaluate(X_test, y_test)

TypeError: MLP.evaluate() takes 2 positional arguments but 3 were given