In [36]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras import layers, models, Model
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import random
import time

In [37]:
activation_functions = {
    'tanh': tf.tanh,
    'relu': tf.nn.relu,
    'sigmoid': tf.nn.sigmoid,
    'linear': tf.keras.activations.linear,
    'softmax': tf.nn.softmax,
    'sign': tf.sign,
    'sin': tf.sin,
    'exp': tf.exp
}

**Loading Data**

In [70]:
SUBSET = 1.0    # subset (in percentage) of X_test used during training

In [71]:
%%time

# numpy
_, (X_test, y_test) = mnist.load_data() # only care  about X_test

selection = np.random.choice(np.arange(X_test.shape[0]),
                             int(SUBSET * X_test.shape[0]),
                             replace=False)

X_test = X_test.reshape(10000, 784).astype(np.float32)[selection] / 255.0
y_test = to_categorical(y_test)[selection]  # one-hot encoding

# tensorflow
X_test = tf.convert_to_tensor(np.transpose(X_test))
y_test = tf.convert_to_tensor(np.transpose(y_test))

CPU times: total: 234 ms
Wall time: 241 ms


**Network Definition**

In [72]:
MUTATE_RATE_MATRIX = 0.3
MUTATE_RATE_BIAS = 0.1
MUTATE_RATE_ACTIVATION_FUNCTION = 0.1
CROSSOVER_RATE = 0.7
GAUSSIAN_NOISE_STDDEV = 1   # mutation applies additive gaussian noise
UNIFORM_CROSSOVER = False   # if True, performs crossover of matrices element-wise, else row-wise
HIDDEN_LAYER_WIDTH = 32     # TODO: for now all hidden layers same width

In [73]:
class MultiLayerPerceptron(Model):
    def __init__(self, **params):
        """
        Weight gnostic multi-layer feed forward neural network
        :param params: Params have to be in form: (matrix1=..., bias1=..., activations1=..., matrix2=..., ...)
        """
        super(MultiLayerPerceptron, self).__init__()

        self.n_layers = max([int(param_name[-1]) for param_name in params.keys()])   # = number of hidden layers + 1 (output layer)

        for (param_name, param) in params.items():
            assert param_name[:-1] in ('matrix', 'bias', 'activation'), 'Invalid attribute!'
            setattr(self, param_name, param)

    def call(self, inputs):
        x = inputs

        for layer in range(1, self.n_layers + 1):
            x = getattr(self, 'matrix' + str(layer)) @ x
            x += getattr(self, 'bias' + str(layer))
            x = activation_functions[getattr(self, 'activation' + str(layer))](x)

        return x

    # TODO: without custom function?
    def evaluate(self):
        y_pred = np.argmax(self.call(X_test), axis=0)
        y_true = np.argmax(y_test, axis=0)
        return np.mean(y_pred == y_true)

    def mutate(self):
        for layer in range(1, self.n_layers + 1):
            # matrix
            matrix = getattr(self, 'matrix' + str(layer))
            mutation_stencil = tf.cast(tf.reshape(tf.random.categorical(
                tf.math.log([[1 - MUTATE_RATE_MATRIX, MUTATE_RATE_MATRIX]]),
                matrix.shape[0] * matrix.shape[1]), matrix.shape), tf.float32)
            noise = tf.random.normal(mean=0.0, stddev=GAUSSIAN_NOISE_STDDEV, shape=matrix.shape)
            matrix = matrix + tf.multiply(mutation_stencil, noise)
            setattr(self, 'matrix' + str(layer), matrix)

            # bias
            bias = getattr(self, 'bias' + str(layer))
            mutation_stencil = tf.cast(tf.reshape(tf.random.categorical(
                tf.math.log([[1 - MUTATE_RATE_BIAS, MUTATE_RATE_BIAS]]),
                bias.shape[0]), bias.shape), tf.float32)
            noise = tf.random.normal(mean=0.0, stddev=GAUSSIAN_NOISE_STDDEV, shape=bias.shape)
            bias = bias + tf.multiply(mutation_stencil, noise)
            setattr(self, 'bias' + str(layer), bias)

            # activation
            cleaner = lambda x: 'softmax' if x=='softmax_v2' else x
            activation = cleaner(getattr(self, 'activation' + str(layer)))
            if random.uniform(0, 1) < MUTATE_RATE_ACTIVATION_FUNCTION:
                activation = random.choice(list(activation_functions.keys()))
            setattr(self, 'activation' + str(layer), activation)

    def summary(self):
        dash = '-' * 75
        ddash = '=' * 75
        print(dash)
        print('Model')
        print(ddash)

        n_params = 0
        for layer in range(1, self.n_layers + 1):
            # get values
            matrix = getattr(self, 'matrix' + str(layer))
            bias = getattr(self, 'bias' + str(layer))
            cleaner = lambda x: 'softmax' if x=='softmax_v2' else x
            activation = cleaner(getattr(self, 'activation' + str(layer)))

            n_params += matrix.shape[0] * matrix.shape[1] + bias.shape[0] + 1

            # print adjustments
            activation = '({})'.format(activation)
            layer_IO = '(in={}, out={})'.format(matrix.shape[1], matrix.shape[0],)

            print('Linear {:<20}{:<30}#Params: {}'.format(activation, layer_IO, matrix.shape[0] * matrix.shape[1] + bias.shape[0] + 1))

        print(ddash)
        print('Total params: {}'.format(n_params))
        print('Accuracy: {}\n'.format(self.evaluate()))

In [74]:
class Population:
    # TODO: n_layers evolvable
    def __init__(self, size=10, n_survivors=5, n_hidden_layers=1):
        """
        :param size: population size
        :param n_survivors: number of survivors after each generation (rest is killed and unable to pass on its genes)
        :param n_hidden_layers: number of hidden layers
        """
        self.generation = 0
        self.size = size
        self.n_survivors = n_survivors
        self.n_hidden_layers = n_hidden_layers
        self.elite = None
        self.fitness = None # cache fitness for increased speed
        self.fitness_generation = -1  # generation when fitness was evaluated

        # initialization (gaussian)
        # TODO: hidden layer width fixed for now
        self.organisms = []
        for _ in range(size):
            params = {}

            # first hidden layer
            params['matrix1'] = tf.random.normal(mean=0.0, stddev=1.0, shape=[HIDDEN_LAYER_WIDTH, 784])
            params['bias1'] = tf.random.normal(mean=0.0, stddev=1.0, shape=[HIDDEN_LAYER_WIDTH, 1])
            params['activation1'] = 'sigmoid'

            for hidden_layer in range(2, n_hidden_layers + 2):
                params['matrix' + str(hidden_layer)] = tf.random.normal(mean=0.0, stddev=1.0, shape=[HIDDEN_LAYER_WIDTH, HIDDEN_LAYER_WIDTH])
                params['bias' + str(hidden_layer)] = tf.random.normal(mean=0.0, stddev=1.0, shape=[HIDDEN_LAYER_WIDTH, 1])
                params['activation' + str(hidden_layer)] = 'sigmoid'

            # output layer
            params['matrix' + str(self.n_hidden_layers + 1)] = tf.random.normal(mean=0.0, stddev=1.0, shape=[10, HIDDEN_LAYER_WIDTH])
            params['bias' + str(self.n_hidden_layers + 1)] = tf.random.normal(mean=0.0, stddev=1.0, shape=[10, 1])
            params['activation' + str(self.n_hidden_layers + 1)] = 'softmax'

            model = MultiLayerPerceptron(**params)
            self.organisms.append(model)

        self.history = [(max(self.organism_fitness()), self.average_fitness())]   # fitness of population over all generations

    def organism_fitness(self):
        if self.generation != self.fitness_generation:
            self.fitness = [organism.evaluate() for organism in self.organisms]
            self.fitness_generation = self.generation

        return self.fitness

    def average_fitness(self):
        organism_fitness = self.organism_fitness()
        return sum(organism_fitness) / len(organism_fitness)

    def max_fitness(self):
        return max(self.organism_fitness())

    def selection(self):
        organism_fitness = self.organism_fitness()

        # elitism (n=1)
        elite_index = np.argmax(organism_fitness)
        self.elite = self.organisms.pop(elite_index)
        organism_fitness.pop(elite_index)

        probabilities = [fitness / sum(organism_fitness) for fitness in organism_fitness]  # normalized
        survivors = np.random.choice(self.organisms,
                                     size=self.n_survivors - 1,
                                     p=probabilities,
                                     replace=False)
        return [survivor for survivor in survivors]

    def crossover(self, parents):
        children = []
        while len(children) < int(CROSSOVER_RATE * (self.size - 1)):
            [father, mother] = random.sample(parents + [self.elite], k=2)  # sample without replacement

            child_params = {}
            for layer in range(1, father.n_layers + 1):
                if UNIFORM_CROSSOVER:
                    # matrix - uniform crossover
                    father_matrix = getattr(father, 'matrix' + str(layer))
                    mother_matrix = getattr(mother, 'matrix' + str(layer))

                    father_mask = tf.round(tf.random.uniform(father_matrix.shape))
                    mother_mask = - (father_mask - 1)

                    child_matrix = tf.multiply(father_mask, father_matrix) + tf.multiply(mother_mask, mother_matrix)
                    child_params['matrix' + str(layer)] = child_matrix
                else:
                    # matrix - row-wise (neuron-wise) crossover
                    father_matrix = getattr(father, 'matrix' + str(layer))
                    mother_matrix = getattr(mother, 'matrix' + str(layer))

                    n_rows = father_matrix.shape[0]
                    father_mask = np.random.choice([True, False], size=n_rows)

                    child_matrix = tf.convert_to_tensor([father_matrix[row, :] if father_mask[row] \
                                                         else mother_matrix[row, :] for row in range(n_rows)])
                    child_params['matrix' + str(layer)] = child_matrix

                # bias - uniform crossover
                father_bias = getattr(father, 'bias' + str(layer))
                mother_bias = getattr(mother, 'bias' + str(layer))

                father_mask = tf.round(tf.random.uniform(father_bias.shape))
                mother_mask = - (father_mask - 1)

                child_bias = tf.multiply(father_mask, father_bias) + tf.multiply(mother_mask, mother_bias)
                child_params['bias' + str(layer)] = child_bias

                # activation
                cleaner = lambda x: 'softmax' if x=='softmax_v2' else x
                father_activation = cleaner(getattr(father, 'activation' + str(layer)))
                mother_activation = cleaner(getattr(mother, 'activation' + str(layer)))

                child_activation = father_activation if (random.uniform(0, 1) < 0.5) else mother_activation
                child_params['activation' + str(layer)] = child_activation

            model = MultiLayerPerceptron(**child_params)
            children.append(model)

        # if CROSSOVER_RATE != 100% allow some individuals to pass on their genes without crossover
        while len(children) < (self.size - 1):
            children.append(random.sample(parents + [self.elite], k=1))

        return children

    def mutate(self, organisms):
        for organism in organisms:
            organism.mutate()

    def breed(self):
        parents = self.selection()
        children = self.crossover(parents)
        self.mutate(children)  # TODO: mGA or GA?
        self.organisms = children + [self.elite]
        self.generation += 1
        self.history.append((self.max_fitness(), self.average_fitness()))

    def plot(self):
        # plot evolution
        plt.figure()
        plt.plot(np.arange(self.generation + 1), [score[0] for score in self.history],
                 label='max fitness')
        plt.plot(np.arange(self.generation + 1), [score[1] for score in self.history],
                 label='avg fitness', alpha=0.6)
        plt.title('Population fitness' + ' (n=' + str(self.size) + ')')
        plt.xlabel('Generations')
        plt.ylabel('Fitness score (accuracy)')
        plt.legend()
        plt.show()

        # plot best performing final network
        organism_fitness = self.organism_fitness()
        elite_index = np.argmax(organism_fitness)
        self.organisms[elite_index].summary()

**Training**

In [75]:
# initialization
GENERATIONS = 5000
POPULATION_SIZE = 30
SURVIVORS = 15
N_HIDDEN_LAYERS = 1

In [None]:
%%time
# initial population
print('Starting training')
t_training = time.time()
population = Population(size=POPULATION_SIZE, n_survivors=SURVIVORS, n_hidden_layers=N_HIDDEN_LAYERS)
avg_population_fitness = population.average_fitness()
max_fitness = population.max_fitness()
t2 = time.time()
print('Gen {} {:<3} avg: {:.3f} {:^3} max: {:.3f} ({:<3}s)'.format(
    0, ':', round(avg_population_fitness, 3), '-', round(max_fitness, 3), round(t2 - t_training, 2)))

# future populations
for generation in range(1, GENERATIONS):
    # breed new population
    t1 = time.time()
    population.breed()

    # evaluate new population
    avg_population_fitness = population.average_fitness()
    max_fitness = population.max_fitness()
    t2 = time.time()

    print('Gen {} {:<3} avg: {:.3f} {:^3} max: {:.3f} ({:<3}s)'.format(
        generation, ':', round(avg_population_fitness, 3), '-', round(max_fitness, 3), round(t2 - t1, 2)))

print('Finished training ({})'.format(round(time.time() - t_training, 2)))

# performance of population
population.plot()

Starting training
Gen 0 :   avg: 0.101  -  max: 0.144 (0.32s)
Gen 1 :   avg: 0.105  -  max: 0.171 (0.69s)
Gen 2 :   avg: 0.103  -  max: 0.171 (0.73s)
Gen 3 :   avg: 0.095  -  max: 0.171 (0.8s)
Gen 4 :   avg: 0.102  -  max: 0.171 (0.68s)
Gen 5 :   avg: 0.108  -  max: 0.200 (0.92s)
Gen 6 :   avg: 0.101  -  max: 0.200 (0.66s)
Gen 7 :   avg: 0.109  -  max: 0.200 (0.68s)
Gen 8 :   avg: 0.107  -  max: 0.200 (0.66s)
Gen 9 :   avg: 0.107  -  max: 0.200 (0.66s)
Gen 10 :   avg: 0.108  -  max: 0.200 (0.64s)
Gen 11 :   avg: 0.109  -  max: 0.200 (0.62s)
Gen 12 :   avg: 0.112  -  max: 0.200 (0.62s)
Gen 13 :   avg: 0.108  -  max: 0.200 (0.61s)
Gen 14 :   avg: 0.114  -  max: 0.200 (0.62s)
Gen 15 :   avg: 0.112  -  max: 0.200 (0.68s)
Gen 16 :   avg: 0.112  -  max: 0.200 (0.67s)
Gen 17 :   avg: 0.108  -  max: 0.200 (0.67s)
Gen 18 :   avg: 0.108  -  max: 0.200 (0.67s)
Gen 19 :   avg: 0.107  -  max: 0.200 (0.64s)
Gen 20 :   avg: 0.112  -  max: 0.200 (0.62s)
Gen 21 :   avg: 0.116  -  max: 0.200 (0.64s)
Gen