In [13]:
import numpy as np
import tensorflow as tf

In [19]:
# Fashion-MNIST test dataset

def load_data():
    """Load Fashion-MNIST and prepare it for training.

    Images are cast to float32 and divided by 255; integer labels are turned
    into 10-class one-hot vectors. The TensorFlow version and the resulting
    array shapes are printed as a side effect.

    Returns:
        ``((x_train, y_train), (x_test, y_test))``.
    """
    import tensorflow as tf
    print('Using tensorflow version {}.'.format(tf.__version__))

    train_split, test_split = tf.keras.datasets.fashion_mnist.load_data()

    # Pixel arrays: float32, divided by 255.
    x_train, x_test = (
        images.astype('float32') / 255
        for images in (train_split[0], test_split[0])
    )
    # Labels: one-hot encoded over the 10 classes.
    y_train, y_test = (
        tf.keras.utils.to_categorical(labels, num_classes=10)
        for labels in (train_split[1], test_split[1])
    )

    print('Loaded Fashion-MNIST into x_train, y_train, x_test, y_test.')
    shapes = (x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    print('Shapes: x_train: {}, y_train: {}, x_test: {}, y_test: {}'.format(*shapes))
    return ((x_train, y_train), (x_test, y_test))

(x_train, y_train), (x_test, y_test) = load_data()

### loading data ###
batch_size = 32

# Training pipeline: shuffle with a 1024-element buffer, then batch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(buffer_size=1024)
    .batch(batch_size)
)

# Validation pipeline: batched in the stored order, no shuffling.
val_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)

Using tensorflow version 2.7.1.
Loaded Fashion-MNIST into x_train, y_train, x_test, y_test.
Shapes: x_train: (60000, 28, 28), y_train: (60000, 10), x_test: (10000, 28, 28), y_test: (10000, 10)


In [6]:
# 2 - gradient teleportation net
# Scratch version built from stock Dense layers; a later cell rebinds
# inp0, d1..d3, out0, model and loss_fn with the custom-layer network.
tf.keras.backend.clear_session() # names init
inp0 = tf.keras.Input(shape=(1, 3)) # fix shape

# Four 64-unit ReLU layers followed by a 10-unit ReLU layer feeding the softmax.
d1 = tf.keras.layers.Dense(64, activation="relu")(inp0)
d2 = tf.keras.layers.Dense(64, activation="relu")(d1)
d3 = tf.keras.layers.Dense(64, activation="relu")(d2)
d4 = tf.keras.layers.Dense(64, activation="relu")(d3)
d5 = tf.keras.layers.Dense(10, activation="relu")(d4)
out0 = tf.keras.layers.Softmax()(d5)

model = tf.keras.Model([inp0], [out0]) # <- this is the step that defines the training op (way 1 of doing things)
loss_fn = tf.keras.losses.CategoricalCrossentropy()

opt = tf.keras.optimizers.SGD(learning_rate=1e-3)
# NOTE(review): compiled with loss=None, so loss_fn above is not attached to the model here.
model.compile(optimizer=opt, loss=None)

2022-08-14 16:14:22.496063: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# model.fit, model.compile define the training cycles - they can be written in a custom way using tensorflow!

In [12]:
# this is NOT low level enough for use with teleporting weights.
class CustomModel(tf.keras.Model):
    """Keras model whose per-batch training step is written out by hand."""

    def train_step(self, data):
        """Run one optimisation step on a ``(inputs, targets)`` batch.

        Returns:
            dict mapping each metric's name to its current result.
        """
        features, targets = data

        # Record the forward pass so the loss can be differentiated.
        with tf.GradientTape() as tape:
            predictions = self(features, training=True)
            # Compiled loss plus the model's regularisation losses.
            loss = self.compiled_loss(targets, predictions, regularization_losses=self.losses)

        variables = self.trainable_variables
        grads = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(grads, variables))

        # Feed the compiled metrics with this batch, then report all metrics.
        self.compiled_metrics.update_state(targets, predictions)
        results = {}
        for metric in self.metrics:
            results[metric.name] = metric.result()
        return results

##### implementation

In [550]:
# Reset Keras' global state before the custom layer and model below are (re)defined.
tf.keras.backend.clear_session()

def repeat_const(tensor, myconst):
    """Repeat `myconst` along axis 0, once per entry of `tensor`'s first dimension."""
    leading_dim = tf.shape(tensor)[0]
    return tf.repeat(myconst, repeats=leading_dim, axis=0)

class Leaky_MergedDense(tf.keras.layers.Layer):
    """Leaky-ReLU dense layer whose bias is merged into the weight matrix.

    Kernel and bias live in one trainable ``params`` matrix of shape
    ``(input_dim + 1, width)``; a column of ones is appended to the input so a
    single matmul applies both (helps the teleportation norm op).

    Args:
        width: number of output units.
        g: teleportation role. ``None`` gives a plain layer, ``'curr'``
            right-multiplies ``params`` by the pseudo-inverse of ``mat``
            before use, ``'prev'`` applies ``mat`` to the layer's activations.
        mat: teleportation matrix, required whenever ``g`` is given. It is
            rescaled by its norm; the caller's object is left untouched.

    Raises:
        ValueError: if ``g`` is given without ``mat``.
    """

    def __init__(self, width, g=None, mat=None):
        super().__init__()
        if g is not None and mat is None:
            raise ValueError("`mat` must be provided when `g` is {!r}".format(g))
        self.width = width
        self.shape = width  # alias of `width`, read by model_teleport_update
        self.g = g
        self.mat = mat
        if g is not None:
            # Out-of-place division: `/=` would modify a NumPy array passed by the caller.
            self.mat = mat / tf.norm(mat)
        self.__name__ = 'md'  # tag model_teleport_update uses to recognise these layers

    def build(self, input_shape):
        self.params = self.add_weight(
          name="params",
          shape=(input_shape[-1]+1, self.width), # changed shape for biases
          # paper uses uniform[0, 1] init, but is inconsistent wrt datasets and loss funcs
          trainable=True
        )

    def call(self, inputs):
        # Append a column of ones so the last row of `params` acts as the bias.
        bias_column = tf.ones_like(inputs[:, :1])
        concatd = tf.concat([inputs, bias_column], axis=1)
        if self.g == 'prev':
            # NOTE(review): only the argument placement and dtype handling of this
            # branch were repaired (tf.cast previously received no dtype and
            # 'float32' was passed to tf.matmul positionally). The shape algebra
            # below is unverified for batch-first inputs - confirm against the
            # teleportation derivation before relying on it.
            mat = tf.cast(self.mat, 'float32')
            m1 = tf.nn.leaky_relu(tf.matmul(concatd, self.params), alpha=0.01)
            m2 = tf.matmul(mat, tf.cast(tf.nn.leaky_relu(m1, alpha=0.01), 'float32'))
            # alpha = 1/0.01 is the inverse of a leaky ReLU with alpha = 0.01.
            m3 = tf.matmul(tf.nn.leaky_relu(m2, alpha=1/0.01), tf.linalg.pinv(concatd))
            assert m1.shape == m3.shape
            return tf.nn.leaky_relu(m3, alpha=0.01)
        elif self.g == 'curr':
            # Effective weights are params @ pinv(mat).
            m1 = tf.matmul(self.params, tf.cast(tf.linalg.pinv(self.mat), 'float32'))
            m2 = tf.matmul(concatd, m1)
            m3 = tf.nn.leaky_relu(m2, alpha=0.01)
            return m3
        else:
            return tf.nn.leaky_relu(tf.matmul(concatd, self.params), alpha=0.01)

# 2 - gradient teleportation net
# activations must be bijective
tf.keras.backend.clear_session() # names init
inp0 = tf.keras.Input(shape=(28, 28)) # fix shape
inpf = tf.keras.layers.Flatten()(inp0)

d1 = Leaky_MergedDense(64)(inpf)
d2 = Leaky_MergedDense(64)(d1)
d3 = Leaky_MergedDense(10)(d2)

out0 = tf.keras.layers.Softmax()(d3)

model = tf.keras.Model([inp0], [out0])
# The model already ends in Softmax, so the loss must treat its output as
# probabilities; from_logits=True would apply softmax a second time.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)

optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)
# Compile with the optimizer defined in this cell, not the `opt` left over from an earlier cell.
model.compile(optimizer=optimizer, loss=loss_fn)

train_acc_metric = tf.keras.metrics.CategoricalAccuracy()
val_acc_metric = tf.keras.metrics.CategoricalAccuracy()
# `model` is already a module-level name here, so no `global` declaration is needed.

In [552]:
# generate invertible teleporters
# !x96! <- changeable, preferably by an ascenting scheme

import numpy as np
def gen_gln(n): # this should be group-specific (ideally SOn) for better results
    """Sample a random n x n teleportation matrix, scaled by its norm.

    Entries are drawn with ``np.random.rand``; the diagonal is then overwritten
    with each row's absolute sum, which is what the invertibility of the
    result relies on. The matrix is finally divided by its ``tf.norm`` to
    ensure size preserving maps.
    """
    candidate = np.random.rand(n, n)
    row_abs_sums = np.abs(candidate).sum(axis=1)
    np.fill_diagonal(candidate, row_abs_sums)
    scale = tf.norm(tf.convert_to_tensor(candidate))
    return candidate / scale

def generate_teleport_candidates(matrix_size, num_candidates=10):
    """Return a list of `num_candidates` tensors, each built from a fresh `gen_gln(matrix_size)` sample."""
    candidates = []
    for _ in range(num_candidates):
        candidates.append(tf.convert_to_tensor(gen_gln(matrix_size)))
    return candidates
# !x96!

def model_teleport_update(model, layer_num, mat):
    """Rebuild `model` as a Sequential model with teleport-aware layers swapped in.

    Layers tagged ``__name__ == 'md'`` are counted from 1 in model order. The
    one whose count equals ``layer_num`` is replaced by a ``g='curr'``
    Leaky_MergedDense and the one before it (count ``layer_num - 1``) by a
    ``g='prev'`` one. Every other layer is reused as-is, so it keeps sharing
    its weights with `model`.

    Replacement layers take the width and the trained ``params`` of the layer
    they replace, so the teleport transforms the learned weights instead of
    restarting from a random initialisation.

    NOTE(review): `layer_num` is interpreted here as a 1-based count of 'md'
    layers. A caller that derives it from a 0-based index into
    ``model.trainable_weights`` addresses a different layer - confirm the
    intended convention.
    NOTE(review): if the replaced layer was itself produced by an earlier
    teleport, only its ``params`` are carried over; its own ``mat`` is not
    composed with the new one.

    Args:
        model: built Keras model containing Leaky_MergedDense layers.
        layer_num: 1-based position, among the 'md' layers, of the layer to teleport.
        mat: teleportation matrix handed to the replacement layers.

    Returns:
        A new ``tf.keras.Sequential`` model (not compiled).
    """
    new_model = tf.keras.Sequential()
    # Mirror the source model's input instead of assuming a fixed image size.
    new_model.add(tf.keras.Input(shape=tuple(model.inputs[0].shape[1:])))

    md_count = 0
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.InputLayer):
            # The input was declared explicitly above.
            continue
        if getattr(layer, '__name__', None) != 'md':
            new_model.add(layer)
            continue

        md_count += 1
        if md_count == layer_num - 1:
            role = 'prev'
        elif md_count == layer_num:
            role = 'curr'
        else:
            new_model.add(layer)
            continue

        # Same width as the layer being replaced, so downstream layers still fit.
        new_layer = Leaky_MergedDense(layer.width, g=role, mat=mat)
        new_model.add(new_layer)  # adding to a model with a known input builds the layer
        new_layer.set_weights(layer.get_weights())
    return new_model

In [541]:
# Inspect the shapes of the second and third entries of model.trainable_weights.
model.trainable_weights[1].shape, model.trainable_weights[2].shape

(TensorShape([65, 64]), TensorShape([65, 10]))

        `teleport_schedule` is the set of epochs in which teleportation is performed.
        `teleport_duration` is the number of teleports performed during the first minibatches of such an epoch.
        `num_candidates` is the number of teleportation candidates to consider. Because candidates are sampled at random rather than found by gradient ascent, the search is hit or miss, so a high number is preferred.
        ``
        

In [567]:
# no longer subclassing keras.model directly
# here stuff like the opt is defined directly
# also true for the loss function, as opposed to being added to the model beforehand
# train_dataset = [[x_train[:10000]], [y_train[:10000]]]

# Schedule settings read as module-level globals by train_nn.
num_epochs = 40 # number of iterations of train_nn's epoch loop
teleport_schedule = {5} # does teleports at this epoch val.
                        # tests show one is good enough for the 64, 64, 10 net.
teleport_duration = 1 # number of teleports in the first few steps of an epoch

def _select_teleport_matrix(model, x_batch, y_batch, num_candidates):
    """Pick a layer index and the sampled matrix that maximises its transformed gradient norm."""
    with tf.GradientTape() as tape:
        predictions = model(x_batch, training=True)
        loss_value = loss_fn(y_batch, predictions)

    # Interior index into model.trainable_weights (first and last entries are excluded).
    layer_to_teleport = np.random.randint(1, len(model.trainable_weights)-1)
    # The gradient does not depend on the candidate, so compute it once.
    layer_gradient = tape.gradient(loss_value, model.trainable_weights)[layer_to_teleport]
    matrix_size = model.trainable_weights[layer_to_teleport].shape[-1] # this should be the layer outp. dim

    # Random sampling stands in for the gradient ascent used in the paper.
    teleport_candidates = generate_teleport_candidates(matrix_size, num_candidates=num_candidates)
    scores = [
        float(tf.linalg.norm(layer_gradient @ tf.cast(tf.linalg.inv(candidate), 'float32')))
        for candidate in teleport_candidates
    ]
    return layer_to_teleport, teleport_candidates[int(np.argmax(scores))]


def train_nn(model, num_candidates=1000):
    """Train `model` with SGD, teleporting its weights in the scheduled epochs.

    Every epoch makes one pass over `train_dataset`. In an epoch listed in
    `teleport_schedule`, each of the first `teleport_duration` minibatches
    first replaces the model with a teleported copy (see
    `model_teleport_update`) and then takes an ordinary gradient step on it.
    Training and validation accuracy are printed after each epoch.

    Reads the module-level names `num_epochs`, `teleport_schedule`,
    `teleport_duration`, `train_dataset`, `val_dataset`, `loss_fn`,
    `optimizer`, `train_acc_metric` and `val_acc_metric`.

    NOTE(review): the layer is scored through a 0-based index into
    ``model.trainable_weights`` while `model_teleport_update` counts 'md'
    layers from 1 - confirm both address the same layer.

    Args:
        model: compiled Keras model with at least three trainable weight tensors.
        num_candidates: number of random teleportation matrices to sample per
            teleport; the sampling is hit or miss, so a high number is preferred.

    Returns:
        The trained model. This is a different object from the argument once a
        teleport has happened, because teleporting rebuilds the model.
    """
    for epoch in range(num_epochs):
        teleport_epoch = epoch in teleport_schedule

        for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
            if teleport_epoch and step < teleport_duration:
                layer_to_teleport, teleport_matrix = _select_teleport_matrix(
                    model, x_batch_train, y_batch_train, num_candidates)
                model = model_teleport_update(model, layer_to_teleport, teleport_matrix)
                model.compile(optimizer=optimizer,
                              loss=loss_fn)
                print('teleporting! (epoch {}, layer {})'.format(epoch, layer_to_teleport))

            # Ordinary descent step (on the teleported model if one was just built).
            with tf.GradientTape() as tape:
                predictions = model(x_batch_train, training=True) # softmaxed model outputs
                loss_value = loss_fn(y_batch_train, predictions)
            gradients = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(gradients, model.trainable_weights))
            train_acc_metric.update_state(y_batch_train, predictions)

        ### EPOCH METRICS ###
        train_acc = train_acc_metric.result()
        print("Training acc over epoch {}: %.4f".format(epoch) % (float(train_acc),))
        train_acc_metric.reset_states()

        ### Run a validation loop at the end of each epoch.
        for x_batch_val, y_batch_val in val_dataset:
            val_predictions = model(x_batch_val, training=False)
            val_acc_metric.update_state(y_batch_val, val_predictions)
        val_acc = val_acc_metric.result()
        val_acc_metric.reset_states()
        print("Validation acc for epoch {}: %.4f".format(epoch) % (float(val_acc),))

    return model

In [566]:
# g = train_nn(model)

Training acc over epoch 0: 0.3490
Validation acc for epoch 0: 0.3473
Training acc over epoch 1: 0.3470
Validation acc for epoch 1: 0.3554
Training acc over epoch 2: 0.3631
Validation acc for epoch 2: 0.3775
Training acc over epoch 3: 0.3833
Validation acc for epoch 3: 0.3937
Training acc over epoch 4: 0.3962
Validation acc for epoch 4: 0.4025
teleporting! (epoch 5, layer 1)
Training acc over epoch 5: 0.4088
Validation acc for epoch 5: 0.5109
Training acc over epoch 6: 0.5129
Validation acc for epoch 6: 0.5162
Training acc over epoch 7: 0.5181
Validation acc for epoch 7: 0.5211
Training acc over epoch 8: 0.5225
Validation acc for epoch 8: 0.5254
Training acc over epoch 9: 0.5264
Validation acc for epoch 9: 0.5291
Training acc over epoch 10: 0.5308
Validation acc for epoch 10: 0.5323
Training acc over epoch 11: 0.5328
Validation acc for epoch 11: 0.5349
Training acc over epoch 12: 0.5349
Validation acc for epoch 12: 0.5372
Training acc over epoch 13: 0.5370
Validation acc for epoch 13: 0

In [9]:
# tf.keras.utils.plot_model(model, "dnn.png", show_shapes=True)

In [None]:
# # 2 - running the net
# batch_size=source_shape.shape[0]
# model.fit(x=[source_shape, target_shape], y=None, epochs=1000, verbose=1, batch_size=batch_size);
# plt.plot(model.history.history['loss']);
# print(model.history.history['loss'][-1])