Importing the libraries.

In [1]:
from __future__ import print_function
import keras
from keras.datasets import cifar100
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD
from keras.regularizers import l2
from keras.callbacks import Callback, LearningRateScheduler, TensorBoard, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import print_summary, to_categorical
from keras import backend as K
import sys
import os
import numpy as np

Using TensorFlow backend.


Initializing the parameters.

In [2]:
BATCH_SIZE = 100
NUM_CLASSES = 100
EPOCHS = 165000
INIT_DROPOUT_RATE = 0.5
MOMENTUM_RATE = 0.9
INIT_LEARNING_RATE = 0.01
L2_DECAY_RATE = 0.0005
CROP_SIZE = 32
LOG_DIR = './logs'
MODEL_PATH = './models/keras_cifar100_model.h5'

Thanks to Keras, we can load the dataset easily.

In [3]:
(x_train, y_train), (x_test, y_test) = cifar100.load_data()

We also need to convert the labels in the dataset into categorical matrix structure from 1-dim numpy array structure.

In [4]:
y_train = to_categorical(y_train, NUM_CLASSES)
y_test = to_categorical(y_test, NUM_CLASSES)

Once bitten twice shy, we will not forget it for this time. We need to normalize the images in the dataset.

In [5]:
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255.0
x_test /= 255.0

The following experiments should highlight the generalization capabilities of ELU networks.  The
CNN architecture is more sophisticated than in the previous subsection and consists of 18 convolutional layers arranged in stacks of ([1×384×3],[1×384×1,1×384×2,2×640×2],[1×640×1,3×768×2],[1×768×1,2×896×2],[1×896×3,2×1024×2],[1×1024×1,1×1152×2],[1×1152×1],[1×100×1])

In [6]:
model = Sequential()
model.add(ZeroPadding2D(4, input_shape=x_train.shape[1:]))
model.add(Conv2D(384, (3, 3), padding='same', kernel_regularizer=l2(0.01)))
model.add(Activation('elu'))
model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
model.add(Dropout(INIT_DROPOUT_RATE))

model.add(Conv2D(384, (1, 1), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Conv2D(384, (2, 2), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Conv2D(640, (2, 2), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Conv2D(640, (2, 2), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Activation('elu'))
model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
model.add(Dropout(INIT_DROPOUT_RATE))

model.add(Conv2D(640, (3, 3), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Conv2D(768, (2, 2), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Conv2D(768, (2, 2), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Conv2D(768, (2, 2), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Activation('elu'))
model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
model.add(Dropout(INIT_DROPOUT_RATE))

model.add(Conv2D(768, (1, 1), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Conv2D(896, (2, 2), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Conv2D(896, (2, 2), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Activation('elu'))
model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
model.add(Dropout(INIT_DROPOUT_RATE))

model.add(Conv2D(896, (3, 3), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Conv2D(1024, (2, 2), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Conv2D(1024, (2, 2), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Activation('elu'))
model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
model.add(Dropout(INIT_DROPOUT_RATE))

model.add(Conv2D(1024, (1, 1), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Conv2D(1152, (2, 2), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Activation('elu'))
model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
model.add(Dropout(INIT_DROPOUT_RATE))

model.add(Conv2D(1152, (1, 1), padding='same', kernel_regularizer=l2(L2_DECAY_RATE)))
model.add(Activation('elu'))
model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
model.add(Dropout(INIT_DROPOUT_RATE))

model.add(Flatten())
model.add(Dense(NUM_CLASSES))
model.add(Activation('softmax'))

In [7]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
zero_padding2d_1 (ZeroPaddin (None, 40, 40, 3)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 40, 40, 384)       10752     
_________________________________________________________________
activation_1 (Activation)    (None, 40, 40, 384)       0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 20, 20, 384)       0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 20, 384)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 20, 20, 384)       147840    
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 20, 20, 384)       590208    
__________

In [8]:
type(model.layers)

list

We need to use callbacks to make these adjustments. First, we will write the schedulers for learning rate and the drop-out rate.

In [9]:
def lr_scheduler(epoch, lr, step_decay = 0.1):
    if epoch == 35000:
        return float(lr * step_decay)
    elif epoch == -1:
        return float(lr * step_decay * 5)
    else:
        return float(lr)

In [11]:
def dr_scheduler(epoch, layers, rate_list = [0.0, .1, .2, .3, .4, .5, 0.0], rate_factor = 1.5):
    if epoch == 85000:
        for i, layer in enumerate([l for l in layers if "dropout" in np.str.lower(l.name)]):
            layer.rate = layer.rate + rate_list[i]
    elif epoch == 135000:
        for i, layer in enumerate([l for l in layers if "dropout" in np.str.lower(l.name)]):
            layer.rate = layer.rate + layer.rate * rate_factor if layer.rate <= 0.66 else 1
    return layers

Then, we can define our custom callback objects for the learning rate and drop-out rate

In [10]:
class StepLearningRateSchedulerAt(LearningRateScheduler):
    def __init__(self, schedule, verbose = 0): 
        super(LearningRateScheduler, self).__init__()
        self.schedule = schedule
        self.verbose = verbose
    
    def on_epoch_begin(self, epoch, logs=None): 
        if not hasattr(self.model.optimizer, 'lr'):
            raise ValueError('Optimizer must have a "lr" attribute.')
            
        lr = float(K.get_value(self.model.optimizer.lr))
        lr = self.schedule(epoch, lr)
       
        if not isinstance(lr, (float, np.float32, np.float64)):
            raise ValueError('The output of the "schedule" function ' 'should be float.')
        
        K.set_value(self.model.optimizer.lr, lr)

        if self.verbose > 0: 
            print('\nEpoch %05d: LearningRateScheduler reducing learning ' 'rate to %s.' % (epoch + 1, lr))

In [12]:
class DropoutRateScheduler(Callback):
    def __init__(self, schedule, verbose = 0):
        super(Callback, self).__init__()
        self.schedule = schedule
        self.verbose = verbose
        
    def on_epoch_begin(self, epoch, logs=None):
        if not hasattr(self.model, 'layers'):
            raise ValueError('Model must have a "layers" attribute.')
            
        layers = self.model.layers
        layers = self.schedule(epoch, layers)
        
        if not isinstance(layers, list):
            raise ValueError('The output of the "schedule" function should be list.')
        
        self.model.layers = layers
        
        if self.verbose > 0:
            for layer in [l for l in self.model.layers if "dropout" in np.str.lower(l.name)]:
                print('\nEpoch %05d: Dropout rate for layer %s: %s.' % (epoch + 1, layer.name, layer.rate))

By applying zero padding four 0 pixels at all borders, we will randomly crop the images by 32x32. To achieve this, we need to create custom generator which takes ImageDataGenerator object as an input and yields each batch of images by cropping them.

In [13]:
def random_crop(img, random_crop_size):
    height, width = img.shape[0], img.shape[1]
    dy, dx = random_crop_size
    x = np.random.randint(0, width - dx + 1)
    y = np.random.randint(0, height - dy + 1)
    return img[y:(y+dy), x:(x+dx), :]

In [14]:
def crop_generator(batches, crop_length, num_channel = 3):
    while True:
        batch_x, batch_y = next(batches)
        batch_crops = np.zeros((batch_x.shape[0], crop_length, crop_length, num_channel))
        for i in range(batch_x.shape[0]):
            batch_crops[i] = random_crop(batch_x[i], (crop_length, crop_length))
        yield (batch_crops, batch_y)

Defining the optimizer (Mini-batch Stochastic Gradient Descent with Momentum)

In [15]:
opt = SGD(lr=INIT_LEARNING_RATE, momentum=MOMENTUM_RATE)

Here is the part that I be loved. Callbacks! Let's create callback objects. First one is our custom learning scheduler to decrease the learning rate after a certain number of epoch. Also, we have another custom callback for adjusting the drop-out rates in the stack layers. Next, we will record what our model has done during the training process. And lastly, we will save our trained model in each epoch that has better result than previous one.

In [16]:
lr_rate_scheduler = StepLearningRateSchedulerAt(lr_scheduler)
dropout_scheduler = DropoutRateScheduler(dr_scheduler)
tensorboard = TensorBoard(log_dir=LOG_DIR, batch_size=BATCH_SIZE)
checkpointer = ModelCheckpoint(MODEL_PATH, monitor='val_loss', verbose=1, save_best_only=True)

We are ready to compile. GO GO GO!!!

In [17]:
model.compile(optimizer=opt,
              loss='categorical_crossentropy',
              metrics=['accuracy', 'top_k_categorical_accuracy'])

We will use ImageDataGenerator object to handle the data pre-processing on real time and make sure that the process goes randomly. Just for reminding, in the article, global contrast normalization (sample-wise centering) and ZCA whitening and horizontal flipping methods should be used for augmenting the data.

In [18]:
datagen = ImageDataGenerator(samplewise_center=True,
                             zca_whitening=True,
                             horizontal_flip=True,
                             validation_split=0.2)



ATTENTION!
If we use sample-wise or feature-wise centering methods, we have to fit the training data to the generator. Otherwise, these methods do not work.

In [19]:
datagen.fit(x_train)

Now, we will flow the data using our custom generator object for cropping the images. Here is the flowing methods for training and validation data. Since we define the rate of splitting the data to training and validation in the ImageDataGenerator object, it is enough to specify the subset as "training" or "validation" in the flowing method to split the data.

In [20]:
train_flow = datagen.flow(x_train, y_train, batch_size=BATCH_SIZE, subset="training")
train_flow_w_crops = crop_generator(train_flow, CROP_SIZE)
valid_flow = datagen.flow(x_train, y_train, batch_size=BATCH_SIZE, subset="validation")

WOW! Ready to train, huh?

In [21]:
model.fit_generator(train_flow_w_crops,
                    epochs=EPOCHS,
                    steps_per_epoch=len(x_train) / BATCH_SIZE,
                    callbacks=[lr_rate_scheduler, dropout_scheduler, tensorboard, checkpointer],
                    validation_data=valid_flow,
                    validation_steps=len(x_train) / BATCH_SIZE)

Epoch 1/165000

Epoch 00001: val_loss improved from inf to 10.53320, saving model to ./models/keras_cifar100_model.h5
Epoch 2/165000

Epoch 00002: val_loss improved from 10.53320 to 9.32350, saving model to ./models/keras_cifar100_model.h5
Epoch 3/165000

Epoch 00003: val_loss improved from 9.32350 to 8.63947, saving model to ./models/keras_cifar100_model.h5
Epoch 4/165000

Epoch 00004: val_loss improved from 8.63947 to 7.76259, saving model to ./models/keras_cifar100_model.h5
Epoch 5/165000

Epoch 00005: val_loss improved from 7.76259 to 7.14185, saving model to ./models/keras_cifar100_model.h5
Epoch 6/165000
 89/500 [====>.........................] - ETA: 1:38 - loss: 7.3028 - acc: 0.2217 - top_k_categorical_accuracy: 0.5192

KeyboardInterrupt: 

165.000 epochs! COME ON!
As I mentioned earlier, I cannot finish the training process with my resources (by the way, it is 1080Ti). So, we do not have model to test at the end of this episode. If you have better GPU/s and have never ending patient during the training (for me it was expected to run at least 40 days -nonstop-), you can go for it -but I won't-.

In [None]:
test_datagen = ImageDataGenerator(samplewise_center=True,
                                  zca_whitening=True)
test_datagen.fit(x_test)

In [None]:
test_flow = test_datagen.flow(x_test, y_test, batch_size=BATCH_SIZE)
results = model.evaluate_generator(test_flow, steps=len(x_test) / BATCH_SIZE)

In [None]:
print('Test loss: ' + str(results[0]))
print('Accuracy: ' + str(results[1]))
print('Top-5 Accuracy: ' + str(results[2]))