In [0]:
import keras 
from keras.datasets import mnist
from keras.models import Sequential 
from keras.optimizers import Optimizer 
from keras.layers import Dense, Flatten, Dropout
from keras.layers import Conv2D, MaxPooling2D
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from keras import backend as K
from keras.legacy import interfaces
from keras.layers import GaussianNoise

Using TensorFlow backend.


In [0]:
learning_rate = 0.001
num_epochs = 20
display_step = 500
num_classes = 10
# layer = GaussianNoise(0.1)

In [0]:
def plot_history(history):
    train_loss_history = history.history['loss']

    train_acc_history = history.history['acc']

    epoch = [i for i in range(1, num_epochs+1)]

    plt.figure(figsize = (20, 8))
    plt.subplot(1, 2, 1)
    # plt.plot(epoch, val_loss_history, 'bo-', label = 'val loss')
    plt.plot(epoch, train_loss_history, 'ro-', label = 'train loss')
    plt.ylabel("loss")
    plt.xlabel("Epoch")
    plt.legend(['Training'], loc = 'upper right')

In [0]:
def plot_multiple_history(h1, h2, h3, h4):

    loss_1 = h1.history['loss']
    acc_1 = h1.history['acc']
    loss_2 = h2.history['loss']
    acc_2 = h2.history['acc']
    loss_3 = h3.history['loss']
    acc_3 = h3.history['acc']
    loss_4 = h4.history['loss']
    acc_4 = h4.history['acc']

    epoch = [i for i in range(1, num_epochs+1)]

    plt.figure(figsize = (20, 8))
    plt.subplot(1, 2, 1)
    # plt.plot(epoch, val_loss_history, 'bo-', label = 'val loss')
    plt.plot(epoch, loss_1, label = 'GD')
    plt.plot(epoch, loss_2, label = 'SGD')
    plt.plot(epoch, loss_3, label = 'CNC_PGD')
    plt.plot(epoch, loss_4, label = 'ISO_PGD')
    plt.ylabel("loss")
    plt.xlabel("Epoch")
    plt.legend(loc = 'best')
    plt.show()


In [0]:
def mse(y, y_predict):
    return ((y_predict - y).T.dot(y_predict - y))/y.shape[0]

In [0]:
def sgd(xtrain, ytrain, xtest, ytest):
    xtrain = xtrain.reshape(-1, 28, 28, 1)
    xtest = xtest.reshape(-1, 28, 28, 1)
    model = Sequential()
    xtrain = xtrain.astype('float32')
    xtest = xtest.astype('float32')
    xtrain /= 255
    xtest /= 255
    # print('x_train shape:', xtrain.shape)
    # print(xtrain.shape[0], 'train samples')
    # print(xtest.shape[0], 'test samples')

    ytrain = keras.utils.to_categorical(ytrain, num_classes)
    ytest = keras.utils.to_categorical(ytest, num_classes)

    model.add(Conv2D(filters=10, activation= 'relu', kernel_size=(5,5), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
  
    model.add(Conv2D(filters=8, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
  
    model.add(Conv2D(filters=5, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
  
    model.add(Conv2D(filters=5, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
  
    model.add(Flatten(input_shape=(28,28)))
    model.add(Dense(10, activation = 'softmax'))

    optimizer = keras.optimizers.SGD(lr=learning_rate, momentum=0.1, nesterov=True)

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    history = model.fit(xtrain, ytrain, epochs = num_epochs)

    # plot_history(history)
    return history

In [0]:
def ISO_PGD(xtrain, ytrain, xtest, ytest):
    gd_loss = np.empty(num_epochs)
    xtrain = xtrain.reshape(-1, 28, 28, 1)
    xtest = xtest.reshape(-1, 28, 28, 1)
    
    model = Sequential()
    xtrain = xtrain.astype('float32')
    xtest = xtest.astype('float32')
    
    xtrain /= 255
    xtest /= 255
    model.add(GaussianNoise(0.01))
    # Adadelta is gradient descent optimizer for cnn

    ytrain = keras.utils.to_categorical(ytrain, num_classes)
    ytest = keras.utils.to_categorical(ytest, num_classes)

    model.add(Conv2D(filters=10, activation= 'relu', kernel_size=(5,5), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
    model.add(GaussianNoise(0.1))
    model.add(Conv2D(filters=8, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
    # model.add(GaussianNoise(0.3))
    model.add(Conv2D(filters=5, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
    # model.add(GaussianNoise(0.1))
    model.add(Conv2D(filters=5, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
  
    model.add(Flatten(input_shape=(28,28)))
    model.add(Dense(10, activation = 'softmax'))

    optimizer = Test_ISO_SGD(lr=0.001)    

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    history = model.fit(xtrain, ytrain, epochs = num_epochs)

    return history


In [0]:
# Find thetas using gradient descent
def gradient_descent(xtrain, ytrain, xtest, ytest):
    
    gd_loss = np.empty(num_epochs)
    xtrain = xtrain.reshape(-1, 28, 28, 1)
    xtest = xtest.reshape(-1, 28, 28, 1)
    
    model = Sequential()
    xtrain = xtrain.astype('float32')
    xtest = xtest.astype('float32')
    xtrain /= 255
    xtest /= 255

    # Adadelta is gradient descent optimizer for cnn

    ytrain = keras.utils.to_categorical(ytrain, num_classes)
    ytest = keras.utils.to_categorical(ytest, num_classes)

    model.add(Conv2D(filters=10, activation= 'relu', kernel_size=(5,5), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
  
    model.add(Conv2D(filters=8, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
  
    model.add(Conv2D(filters=5, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
  
    model.add(Conv2D(filters=5, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
  
    model.add(Flatten(input_shape=(28,28)))
    model.add(Dense(10, activation = 'softmax'))

    optimizer = keras.optimizers.Adadelta(lr=0.001, rho=0.95)    

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    history = model.fit(xtrain, ytrain, epochs = num_epochs)

    # plot_history(history)

    return history

In [0]:
class CNC_GD_Optimizer(Optimizer):
    def __init__(self, learning_rate=1.0, rho=0.95, momentum=0, **kwargs):
        self.initial_decay = kwargs.pop('decay', 0.0)
        self.epsilon = kwargs.pop('epsilon', K.epsilon())
        learning_rate = kwargs.pop('lr', learning_rate)
        super(CNC_GD_Optimizer, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.learning_rate = K.variable(learning_rate, name='learning_rate')
            self.decay = K.variable(self.initial_decay, name='decay')
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.momentum = K.variable(momentum, name='momentum')
        self.rho = rho

    @interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        shapes = [K.int_shape(p) for p in params]
        accumulators = [K.zeros(shape, name='accumulator_' + str(i))
                        for (i, shape) in enumerate(shapes)]
        delta_accumulators = [K.zeros(shape, name='delta_accumulator_' + str(i))
                              for (i, shape) in enumerate(shapes)]
        self.weights = [self.iterations] + accumulators + delta_accumulators
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.learning_rate
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))
            
        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(shape, name='moment_' + str(i)) for (i, shape) in enumerate(shapes)]

        for p, g, a, d_a, m in zip(params, grads, accumulators, delta_accumulators, moments):
            v = self.momentum * m - lr * g
            # update accumulator
            new_a = self.rho * a + (1. - self.rho) * K.square(g)
            self.updates.append(K.update(a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
            new_p = p - lr * update + v # Add Stochastic Gradient Step Here?

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
            self.updates.append(K.update(d_a, new_d_a))
        return self.updates

    def set_weights(self, weights):
        params = self.weights
        # Override set_weights for backward compatibility of Keras 2.2.4 optimizer
        # since it does not include iteration at head of the weight list. Set
        # iteration to 0.
        if len(params) == len(weights) + 1:
            weights = [np.array(0)] + weights
        super(Adadelta, self).set_weights(weights)

    def get_config(self):
        config = {'learning_rate': float(K.get_value(self.learning_rate)),
                  'rho': self.rho,
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adadelta, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


In [0]:
def CNC_GD (xtrain, ytrain, xtest, ytest):
    
    gd_loss = np.empty(num_epochs)
    xtrain = xtrain.reshape(-1, 28, 28, 1)
    xtest = xtest.reshape(-1, 28, 28, 1)
    
    model = Sequential()
    xtrain = xtrain.astype('float32')
    xtest = xtest.astype('float32')
    xtrain /= 255
    xtest /= 255
    
    # Adadelta is gradient descent optimizer for cnn
    
    ytrain = keras.utils.to_categorical(ytrain, num_classes)
    ytest = keras.utils.to_categorical(ytest, num_classes)
    
    model.add(Conv2D(filters=10, activation= 'relu', kernel_size=(5,5), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
    
    model.add(Conv2D(filters=8, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
    
    model.add(Conv2D(filters=5, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
    
    model.add(Conv2D(filters=5, activation= 'relu', kernel_size=(3,3), strides=1))
    model.add(MaxPooling2D(pool_size=(2,2), strides=1))
    
    model.add(Flatten(input_shape=(28,28)))
    model.add(Dense(10, activation = 'softmax'))
    
    optimizer = CNC_GD_Optimizer(lr=0.001, rho=0.95)    
    
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    history = model.fit(xtrain, ytrain, epochs = num_epochs)
    
    # plot_history(history)
    
    return history

In [0]:
class Test_ISO_SGD(Optimizer):
    """Stochastic gradient descent optimizer.
    Includes support for momentum,
    learning rate decay, and Nesterov momentum.
    # Arguments
        learning_rate: float >= 0. Learning rate.
        momentum: float >= 0. Parameter that accelerates SGD
            in the relevant direction and dampens oscillations.
        nesterov: boolean. Whether to apply Nesterov momentum.
    """

    # 1. Take out Nesterov support from originial SGD code

    def __init__(self, learning_rate=0.01, momentum=0., **kwargs):
        learning_rate = kwargs.pop('lr', learning_rate)
        self.initial_decay = kwargs.pop('decay', 0.0)
        super(Test_ISO_SGD, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.learning_rate = K.variable(learning_rate, name='learning_rate')
            self.momentum = K.variable(momentum, name='momentum')
            self.decay = K.variable(self.initial_decay, name='decay')

    @interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.learning_rate
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,K.dtype(self.decay))))
        # momentum
        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(shape, name='moment_' + str(i))
                   for (i, shape) in enumerate(shapes)]
        self.weights = [self.iterations] + moments
        for p, g, m in zip(params, grads, moments):
            v = self.momentum * m - lr * g  # velocity
            self.updates.append(K.update(m, v))

            new_p = p + v

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'learning_rate': float(K.get_value(self.learning_rate)),
                  'momentum': float(K.get_value(self.momentum)),
                  'decay': float(K.get_value(self.decay)),
                  'nesterov': self.nesterov}
        base_config = super(SGD, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [0]:
if __name__ == '__main__':
  # xtrain = 60000 of 28 by 28 arrays
  (xtrain, ytrain), (xtest, ytest) = mnist.load_data()
  
  # num_epochs=10
  h1 = gradient_descent(xtrain, ytrain, xtest, ytest)
  h2 = sgd(xtrain, ytrain, xtest, ytest)
  h4 = ISO_PGD(xtrain, ytrain, xtest, ytest)
  h3 = CNC_GD(xtrain, ytrain, xtest, ytest)
  plot_multiple_history(h1, h2, h3, h4)
  
  
  

Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/20





Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
 1408/60000 [..............................] - ETA: 1:20 - loss: 0.3096 - acc: 0.9000