### ResNet50 Model

References:

https://towardsdatascience.com/understand-and-implement-resnet-50-with-tensorflow-2-0-1190b9b52691

https://github.com/suvoooo/Learn-TensorFlow/blob/master/resnet/Implement_Resnet_TensorFlow.ipynb

https://appliedmachinelearning.blog/2018/03/24/achieving-90-accuracy-in-object-recognition-task-on-cifar-10-dataset-with-keras-convolutional-neural-networks/

Batch Size: 256

Number of Dropout Layers: 2

Number of BatchNorm Layers: 2

Dropout Probability: 0.2

Data Augmentation: true

In [3]:
import matplotlib.pyplot as plt
from matplotlib.patches import  Rectangle
import tensorflow as tf

import os
import tensorflow.keras as keras
from keras.models import Sequential
from keras.layers import Input, Dense, Conv2D, MaxPool2D,MaxPooling2D, Flatten,BatchNormalization, Dropout,ZeroPadding2D, AveragePooling2D, Add, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras import activations
from keras.preprocessing.image import ImageDataGenerator
import numpy as np

from tensorflow.keras.datasets import cifar10
from keras.utils import to_categorical

import time
import pickle

In [2]:
#Define the Model
#Removing all regularizers

def res_identity(x, filters, num_batchnorm = 0, num_dropout=0, dropout_prob = 0):
    """Bottleneck residual block whose skip connection is the unmodified input.

    Three stride-1 convolutions (1x1 -> 3x3 -> 1x1) are applied to ``x``. Each
    convolution is followed by an optional BatchNormalization, a ReLU and an
    optional Dropout. The result is added to the original input and passed
    through a final ReLU.

    Note: unlike ``res_conv``, the third convolution here is also followed by a
    ReLU (and optionally Dropout) *before* the addition.

    Args:
        x: input tensor. Presumably it must already have ``filters[1]``
            channels so the addition is shape-compatible -- verify at call site.
        filters: pair ``(f1, f2)``; ``f1`` is used by the first two
            convolutions, ``f2`` by the last one.
        num_batchnorm: budget of BatchNormalization layers; it is spent on the
            convolutions in order, one per convolution, until exhausted.
        num_dropout: budget of Dropout layers, spent the same way.
        dropout_prob: rate passed to every Dropout layer created here.

    Returns:
        The output tensor of the block.
    """
    shortcut = x
    inner_filters, outer_filters = filters
    bn_left = num_batchnorm
    dropout_left = num_dropout

    # (filters, kernel size, padding) of the three convolutions, in order.
    # The 3x3 convolution uses 'same' padding so the spatial size is kept.
    conv_plan = (
        (inner_filters, (1, 1), 'valid'),
        (inner_filters, (3, 3), 'same'),
        (outer_filters, (1, 1), 'valid'),
    )

    for n_filters, kernel, pad in conv_plan:
        x = Conv2D(n_filters, kernel_size=kernel, strides=(1, 1), padding=pad)(x)
        if bn_left > 0:
            x = BatchNormalization()(x)
            bn_left -= 1
        x = Activation(activations.relu)(x)
        if dropout_left > 0:
            x = Dropout(dropout_prob)(x)
            dropout_left -= 1

    # Merge with the untouched input, then apply the block's final ReLU.
    merged = Add()([x, shortcut])
    return Activation(activations.relu)(merged)

def res_conv(x, s, filters, num_batchnorm = 0, num_dropout=0, dropout_prob = 0):
    """Bottleneck residual block with a projection (1x1 convolution) shortcut.

    Main path: 1x1 conv with stride ``s`` -> 3x3 conv -> 1x1 conv. The first two
    convolutions are each followed by an optional BatchNormalization, a ReLU and
    an optional Dropout; the third is followed only by an optional
    BatchNormalization. The shortcut is a 1x1 convolution with stride ``s``
    followed by an unconditional BatchNormalization. Both paths are added and
    passed through a ReLU.

    Args:
        x: input tensor.
        s: stride of the first main-path convolution and of the shortcut
            convolution (``s = 2`` downsizes the feature map).
        filters: pair ``(f1, f2)``; ``f1`` for the first two convolutions,
            ``f2`` for the third one and for the shortcut.
        num_batchnorm: budget of main-path BatchNormalization layers, spent on
            the convolutions in order. The shortcut's BatchNormalization is not
            counted against it.
        num_dropout: budget of Dropout layers; at most two can be placed.
        dropout_prob: rate passed to every Dropout layer created here.

    Returns:
        The output tensor of the block.
    """
    block_input = x
    inner_filters, outer_filters = filters
    bn_left = num_batchnorm
    dropout_left = num_dropout

    # First two convolutions: (kernel size, strides, padding).
    leading_convs = (
        ((1, 1), (s, s), 'valid'),
        ((3, 3), (1, 1), 'same'),
    )
    for kernel, stride, pad in leading_convs:
        x = Conv2D(inner_filters, kernel_size=kernel, strides=stride, padding=pad)(x)
        if bn_left > 0:
            x = BatchNormalization()(x)
            bn_left -= 1
        x = Activation(activations.relu)(x)
        if dropout_left > 0:
            x = Dropout(dropout_prob)(x)
            dropout_left -= 1

    # Third convolution: no activation or dropout before the merge.
    x = Conv2D(outer_filters, kernel_size=(1, 1), strides=(1, 1), padding='valid')(x)
    if bn_left > 0:
        x = BatchNormalization()(x)

    # Projection shortcut so the skip path matches the main path's stride and filters.
    projected = Conv2D(outer_filters, kernel_size=(1, 1), strides=(s, s), padding='valid')(block_input)
    projected = BatchNormalization()(projected)

    merged = Add()([x, projected])
    return Activation(activations.relu)(merged)

def resnet50(num_batchnorm = 0, bn_pooling = False, dropout_prob=0, num_dropout_conv = 0, num_dropout_id = 0, num_dropout=0):
    """Build a ResNet50-style Keras model for 32x32x3 (CIFAR-10 sized) images.

    Layout: zero padding -> 7x7/2 convolution (+ optional BatchNormalization)
    -> ReLU -> 3x3/2 max pooling -> four residual stages -> average pooling ->
    flatten -> 10-way softmax Dense layer. Each residual stage is one
    ``res_conv`` block followed by 2, 3, 5 and 2 ``res_identity`` blocks
    respectively.

    Args:
        num_batchnorm: forwarded as ``num_batchnorm`` to every residual block.
        bn_pooling: if true, add BatchNormalization after the first convolution.
        dropout_prob: Dropout rate used both here and inside the blocks.
        num_dropout_conv: forwarded as ``num_dropout`` to every ``res_conv``.
        num_dropout_id: forwarded as ``num_dropout`` to every ``res_identity``.
        num_dropout: budget of Dropout layers outside the residual blocks; the
            first is placed after the max pooling, the second before the final
            Dense layer.

    Returns:
        A ``Model`` named ``'Resnet50'``.
    """
    # Keyword arguments shared by all blocks of a given kind.
    conv_block_opts = dict(num_batchnorm=num_batchnorm, num_dropout=num_dropout_conv, dropout_prob=dropout_prob)
    id_block_opts = dict(num_batchnorm=num_batchnorm, num_dropout=num_dropout_id, dropout_prob=dropout_prob)
    outer_dropout_left = num_dropout

    input_im = Input(shape=(32, 32, 3))  # CIFAR-10 image size
    x = ZeroPadding2D(padding=(3, 3))(input_im)

    # Stage 1: the only stage that uses max pooling.
    x = Conv2D(64, kernel_size=(7, 7), strides=(2, 2))(x)
    if bn_pooling:
        x = BatchNormalization()(x)
    x = Activation(activations.relu)(x)
    x = MaxPooling2D((3, 3), strides=(2, 2))(x)
    if outer_dropout_left > 0:
        x = Dropout(dropout_prob)(x)
        outer_dropout_left -= 1

    # Stages 2-5: (filters, stride of the conv block, number of identity blocks).
    stage_plan = (
        ((64, 256), 1, 2),
        ((128, 512), 2, 3),
        ((256, 1024), 2, 5),
        ((512, 2048), 2, 2),
    )
    for stage_filters, stride, identity_count in stage_plan:
        x = res_conv(x, s=stride, filters=stage_filters, **conv_block_opts)
        for _ in range(identity_count):
            x = res_identity(x, filters=stage_filters, **id_block_opts)

    # Classification head.
    x = AveragePooling2D((2, 2), padding='same')(x)
    x = Flatten()(x)
    if outer_dropout_left > 0:
        x = Dropout(dropout_prob)(x)
    x = Dense(10, activation='softmax')(x)  # multi-class output

    return Model(inputs=input_im, outputs=x, name='Resnet50')

In [4]:
#to measure Time to Accuracy
#https://keras.io/guides/writing_your_own_callbacks/

class timeToAccuracy(keras.callbacks.Callback):
    """Keras callback that checkpoints weights and records time-to-accuracy.

    Every ``epoch_ckpt`` epochs (starting with the first) the model weights are
    saved to ``<model_name>.h5``. The first time the training ``accuracy``
    metric reaches ``targetAcc``, the elapsed wall-clock time since
    ``startTime`` is printed and pickled to ``<model_name>_tta.pkl``.

    Args:
        startTime: reference timestamp (as returned by ``time.time()``).
        epoch_ckpt: checkpoint period, in epochs.
        model_name: prefix for the weight file and the time-to-accuracy file.
        targetAcc: training accuracy threshold to time; defaults to 0.87.
    """

    def __init__(self, startTime, epoch_ckpt, model_name, targetAcc=0.87):
        super(timeToAccuracy, self).__init__()
        self.targetAcc = targetAcc
        self.foundTarget = False
        self.startTime = startTime
        self.epoch = 0  # epochs seen by this callback instance
        self.epoch_ckpt = epoch_ckpt
        self.model_name = model_name
        self.prev_loss = None  # currently unused; kept so existing attribute access still works

    def on_epoch_end(self, epoch, logs=None):
        """Checkpoint periodically and record when the target accuracy is first hit."""
        if self.epoch % self.epoch_ckpt == 0:
            print(self.epoch, epoch)
            name = self.model_name + '.h5'
            self.model.save_weights(name)
            print('end', logs)

        self.epoch += 1

        if self.foundTarget:
            return

        # `logs` may be None or may lack the metric; treat that as "not reached".
        accuracy = (logs or {}).get('accuracy')
        if accuracy is None or accuracy < self.targetAcc:
            return

        elapsed = time.time() - self.startTime
        print("Time to reach {} accuracy: {} s".format(self.targetAcc, elapsed))
        # Use the instance's own model name rather than a notebook global so the
        # file always matches the model this callback is attached to.
        with open('{}_tta.pkl'.format(self.model_name), 'wb') as file:
            pickle.dump(elapsed, file)
        self.foundTarget = True

In [5]:
# Load the CIFAR-10 train/test split.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Normalize the images by dividing by 255 and convert the labels with to_categorical.
x_train, x_test = x_train / 255, x_test / 255
y_train, y_test = to_categorical(y_train), to_categorical(y_test)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [6]:
# Data augmentation: random horizontal flips, shifts of up to 10% of the
# width/height, and rotations of up to 15 degrees.
datagen = ImageDataGenerator(
    horizontal_flip=True,
    width_shift_range=0.1,
    height_shift_range=0.1,
    rotation_range=15,
)
# NOTE(review): Keras documents fit() as needed only for featurewise
# normalization / ZCA whitening, none of which is enabled here -- this call
# looks redundant; confirm before removing.
datagen.fit(x_train)

In [11]:
# Suppress every Python warning from this point on to keep the long training
# logs readable. This also hides deprecation notices, so disable it when debugging.
import warnings
warnings.filterwarnings(action="ignore")

In [20]:
def fit_resnet_dataaug(model, xtrain, ytrain, xtest, ytest, model_name, convergence=False):
    """Compile and train ``model`` on augmented data, then evaluate it.

    The model is compiled with Adam (learning rate 1e-3) and categorical
    cross-entropy, trained on batches produced by the notebook-level
    ``datagen`` augmenter, and finally evaluated on the test data. A
    ``timeToAccuracy`` callback checkpoints the weights and records the time to
    the target accuracy.

    Args:
        model: Keras model to train (it is re-compiled on every call).
        xtrain, ytrain: training inputs and labels fed to ``datagen.flow``.
        xtest, ytest: data passed to ``model.evaluate``.
        model_name: file-name prefix handed to ``timeToAccuracy``.
        convergence: if true, train for up to 500 epochs with early stopping on
            the training loss (patience 20); otherwise train for 100 epochs.

    Returns:
        ``(train_time, acc)``: wall-clock training time in seconds and the
        second value returned by ``model.evaluate`` (the accuracy metric).
    """
    EPOCHS = 500 if convergence else 100
    BATCH_SIZE = 256
    VERBOSITY = 0    # silent fit; progress is reported by the callback instead
    EPOCH_CKPT = 15  # save model weights every N epochs

    opt = keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy'])

    start = time.time()
    print('Fitting with BS ', BATCH_SIZE)

    # Only add early stopping when requested: a None entry in the callbacks
    # list is not a valid callback.
    callbacks = [timeToAccuracy(startTime=start, epoch_ckpt=EPOCH_CKPT, model_name=model_name)]
    if convergence:
        callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20))

    # Train on the data passed in, not on whatever the notebook globals hold.
    model.fit(datagen.flow(xtrain, ytrain, batch_size=BATCH_SIZE),
              verbose=VERBOSITY,
              epochs=EPOCHS,
              callbacks=callbacks)
    train_time = time.time() - start

    # evaluate() returns [loss, accuracy] for the metrics compiled above.
    score = model.evaluate(xtest, ytest)
    acc = score[1]

    return train_time, acc

In [21]:
# Experiment settings: batch-norm budget per block, dropout budget per block,
# and the single dropout rate shared by all dropout layers.
bn, dp, prob = 2, 2, 0.2

print('Training BN{}, DPID{}, DPCONV{}, DPPROB{} till convergence with data augmentation'.format(bn, dp, dp, prob))
model_name = 'model_bn{}_dpid_{}_dpconv_{}_dpprob_{}_convergence_dataaug'.format(bn, dp, dp, prob)

# num_dropout is left at its default, so no dropout is added outside the residual blocks.
resnet_model = resnet50(
    num_batchnorm=bn,
    bn_pooling=True,
    num_dropout_conv=dp,
    num_dropout_id=dp,
    dropout_prob=prob,
)

# convergence=True: up to 500 epochs, with early stopping on the training loss.
tt, acc = fit_resnet_dataaug(
    resnet_model, x_train, y_train, x_test, y_test, model_name,
    convergence=True,
)

print('Final accuracy {} reached in {}'.format(acc, tt))

Training BN2, DPID2, DPCONV2, DPPROB0.2 till convergence with data augmentation
Fitting with BS  256
0 0
end {'loss': 1.9386812448501587, 'accuracy': 0.369159996509552}
15 15
end {'loss': 0.7020004391670227, 'accuracy': 0.7520800232887268}
45 45
end {'loss': 0.4356963634490967, 'accuracy': 0.8458399772644043}
105 105
end {'loss': 0.25176045298576355, 'accuracy': 0.9099799990653992}
120 120
end {'loss': 0.22453036904335022, 'accuracy': 0.9193599820137024}
150 150
end {'loss': 0.17964017391204834, 'accuracy': 0.9366599917411804}
165 165
end {'loss': 0.1622748076915741, 'accuracy': 0.9424800276756287}
180 180
end {'loss': 0.14877501130104065, 'accuracy': 0.9458199739456177}
195 195
end {'loss': 0.1401999145746231, 'accuracy': 0.9498199820518494}
210 210
end {'loss': 0.12873868644237518, 'accuracy': 0.9539399743080139}
225 225
end {'loss': 0.24877601861953735, 'accuracy': 0.9218400120735168}
240 240
end {'loss': 0.11262156069278717, 'accuracy': 0.9605000019073486}
255 255
end {'loss': 0.10

In [22]:
# Sanity check on early stopping: fit the already-trained resnet_model again
# (the model is re-compiled inside fit_resnet_dataaug) and see when it stops.
tt, acc = fit_resnet_dataaug(
    resnet_model, x_train, y_train, x_test, y_test, model_name,
    convergence=True,
)

print('Final accuracy {} reached in {}'.format(acc, tt))

Fitting with BS  256
0 0
end {'loss': 0.0736548975110054, 'accuracy': 0.9740399718284607}
Time to reach 0.87 accuracy: 30.87569832801819 s
15 15
end {'loss': 0.0672106072306633, 'accuracy': 0.9768999814987183}
30 30
end {'loss': 0.0706273764371872, 'accuracy': 0.9757400155067444}
45 45
end {'loss': 0.1117670014500618, 'accuracy': 0.962660014629364}
60 60
end {'loss': 0.0533704049885273, 'accuracy': 0.981660008430481}
75 75
end {'loss': 0.060485806316137314, 'accuracy': 0.9794399738311768}
Final accuracy 0.8313999772071838 reached in 2095.3354258537292


## Non-Uniform Dropout Probability

Testing the effect of varying the dropout probability across layers. 

Batch Size: 256

Number of Dropout Layers: 3

Number of BatchNorm Layers: 2

Dropout Probability: [0.1, 0.2, 0.3]

Data Augmentation: true


In [1]:
import matplotlib.pyplot as plt
from matplotlib.patches import  Rectangle
import tensorflow as tf

import os
import tensorflow.keras as keras
from keras.models import Sequential
from keras.layers import Input, Dense, Conv2D, MaxPool2D,MaxPooling2D, Flatten,BatchNormalization, Dropout,ZeroPadding2D, AveragePooling2D, Add, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras import activations
from keras.preprocessing.image import ImageDataGenerator
import numpy as np

from tensorflow.keras.datasets import cifar10
from keras.utils import to_categorical

import time
import pickle

Using TensorFlow backend.


In [8]:
#Define the Model
#Removing all regularizers

def res_identity(x, filters, num_batchnorm = 0, num_dropout=0, dropout_prob = [0,0,0]):
    """Bottleneck residual block whose skip connection is the unmodified input.

    Three stride-1 convolutions (1x1 -> 3x3 -> 1x1) are applied to ``x``. Each
    convolution is followed by an optional BatchNormalization, a ReLU and an
    optional Dropout whose rate depends on the position in the block. The
    result is added to the original input and passed through a final ReLU.

    Note: unlike ``res_conv``, the third convolution here is also followed by a
    ReLU (and optionally Dropout) *before* the addition.

    Args:
        x: input tensor. Presumably it must already have ``filters[1]``
            channels so the addition is shape-compatible -- verify at call site.
        filters: pair ``(f1, f2)``; ``f1`` is used by the first two
            convolutions, ``f2`` by the last one.
        num_batchnorm: budget of BatchNormalization layers; it is spent on the
            convolutions in order, one per convolution, until exhausted.
        num_dropout: budget of Dropout layers, spent the same way.
        dropout_prob: sequence of rates; entry ``i`` is used by the Dropout
            after convolution ``i`` (0-based). It is only read, never modified.

    Returns:
        The output tensor of the block.
    """
    shortcut = x
    inner_filters, outer_filters = filters
    bn_left = num_batchnorm
    dropout_left = num_dropout

    # (filters, kernel size, padding) of the three convolutions, in order.
    # The 3x3 convolution uses 'same' padding so the spatial size is kept.
    conv_plan = (
        (inner_filters, (1, 1), 'valid'),
        (inner_filters, (3, 3), 'same'),
        (outer_filters, (1, 1), 'valid'),
    )

    for position, (n_filters, kernel, pad) in enumerate(conv_plan):
        x = Conv2D(n_filters, kernel_size=kernel, strides=(1, 1), padding=pad)(x)
        if bn_left > 0:
            x = BatchNormalization()(x)
            bn_left -= 1
        x = Activation(activations.relu)(x)
        if dropout_left > 0:
            x = Dropout(dropout_prob[position])(x)
            dropout_left -= 1

    # Merge with the untouched input, then apply the block's final ReLU.
    merged = Add()([x, shortcut])
    return Activation(activations.relu)(merged)

def res_conv(x, s, filters, num_batchnorm = 0, num_dropout=0, dropout_prob = [0,0,0]):
    """Bottleneck residual block with a projection (1x1 convolution) shortcut.

    Main path: 1x1 conv with stride ``s`` -> 3x3 conv -> 1x1 conv. The first two
    convolutions are each followed by an optional BatchNormalization, a ReLU and
    an optional Dropout; the third is followed only by an optional
    BatchNormalization. The shortcut is a 1x1 convolution with stride ``s``
    followed by an unconditional BatchNormalization. Both paths are added and
    passed through a ReLU.

    Args:
        x: input tensor.
        s: stride of the first main-path convolution and of the shortcut
            convolution (``s = 2`` downsizes the feature map).
        filters: pair ``(f1, f2)``; ``f1`` for the first two convolutions,
            ``f2`` for the third one and for the shortcut.
        num_batchnorm: budget of main-path BatchNormalization layers, spent on
            the convolutions in order. The shortcut's BatchNormalization is not
            counted against it.
        num_dropout: budget of Dropout layers; at most two can be placed.
        dropout_prob: sequence of rates; only entries 0 and 1 are read here
            (after the first and second convolution respectively).

    Returns:
        The output tensor of the block.
    """
    block_input = x
    inner_filters, outer_filters = filters
    bn_left = num_batchnorm
    dropout_left = num_dropout

    # First two convolutions: (kernel size, strides, padding).
    leading_convs = (
        ((1, 1), (s, s), 'valid'),
        ((3, 3), (1, 1), 'same'),
    )
    for position, (kernel, stride, pad) in enumerate(leading_convs):
        x = Conv2D(inner_filters, kernel_size=kernel, strides=stride, padding=pad)(x)
        if bn_left > 0:
            x = BatchNormalization()(x)
            bn_left -= 1
        x = Activation(activations.relu)(x)
        if dropout_left > 0:
            x = Dropout(dropout_prob[position])(x)
            dropout_left -= 1

    # Third convolution: no activation or dropout before the merge.
    x = Conv2D(outer_filters, kernel_size=(1, 1), strides=(1, 1), padding='valid')(x)
    if bn_left > 0:
        x = BatchNormalization()(x)

    # Projection shortcut so the skip path matches the main path's stride and filters.
    projected = Conv2D(outer_filters, kernel_size=(1, 1), strides=(s, s), padding='valid')(block_input)
    projected = BatchNormalization()(projected)

    merged = Add()([x, projected])
    return Activation(activations.relu)(merged)

def resnet50(num_batchnorm = 0, bn_pooling = False, dropout_prob=[0,0,0], num_dropout_conv = 0, num_dropout_id = 0, num_dropout=0):
    """Build a ResNet50-style Keras model for 32x32x3 (CIFAR-10 sized) images.

    Layout: zero padding -> 7x7/2 convolution (+ optional BatchNormalization)
    -> ReLU -> 3x3/2 max pooling -> four residual stages -> average pooling ->
    flatten -> 10-way softmax Dense layer. Each residual stage is one
    ``res_conv`` block followed by 2, 3, 5 and 2 ``res_identity`` blocks
    respectively.

    Args:
        num_batchnorm: forwarded as ``num_batchnorm`` to every residual block.
        bn_pooling: if true, add BatchNormalization after the first convolution.
        dropout_prob: sequence of Dropout rates, forwarded unchanged to the
            blocks. Outside the blocks, entry 0 is used after the max pooling
            and entry 1 before the final Dense layer.
        num_dropout_conv: forwarded as ``num_dropout`` to every ``res_conv``.
        num_dropout_id: forwarded as ``num_dropout`` to every ``res_identity``.
        num_dropout: budget of Dropout layers outside the residual blocks; the
            first is placed after the max pooling, the second before the final
            Dense layer.

    Returns:
        A ``Model`` named ``'Resnet50'``.
    """
    # Keyword arguments shared by all blocks of a given kind.
    conv_block_opts = dict(num_batchnorm=num_batchnorm, num_dropout=num_dropout_conv, dropout_prob=dropout_prob)
    id_block_opts = dict(num_batchnorm=num_batchnorm, num_dropout=num_dropout_id, dropout_prob=dropout_prob)
    outer_dropout_left = num_dropout

    input_im = Input(shape=(32, 32, 3))  # CIFAR-10 image size
    x = ZeroPadding2D(padding=(3, 3))(input_im)

    # Stage 1: the only stage that uses max pooling.
    x = Conv2D(64, kernel_size=(7, 7), strides=(2, 2))(x)
    if bn_pooling:
        x = BatchNormalization()(x)
    x = Activation(activations.relu)(x)
    x = MaxPooling2D((3, 3), strides=(2, 2))(x)
    if outer_dropout_left > 0:
        x = Dropout(dropout_prob[0])(x)
        outer_dropout_left -= 1

    # Stages 2-5: (filters, stride of the conv block, number of identity blocks).
    stage_plan = (
        ((64, 256), 1, 2),
        ((128, 512), 2, 3),
        ((256, 1024), 2, 5),
        ((512, 2048), 2, 2),
    )
    for stage_filters, stride, identity_count in stage_plan:
        x = res_conv(x, s=stride, filters=stage_filters, **conv_block_opts)
        for _ in range(identity_count):
            x = res_identity(x, filters=stage_filters, **id_block_opts)

    # Classification head.
    x = AveragePooling2D((2, 2), padding='same')(x)
    x = Flatten()(x)
    if outer_dropout_left > 0:
        x = Dropout(dropout_prob[1])(x)
    x = Dense(10, activation='softmax')(x)  # multi-class output

    return Model(inputs=input_im, outputs=x, name='Resnet50')

In [3]:
#to measure Time to Accuracy
#https://keras.io/guides/writing_your_own_callbacks/

class timeToAccuracy(keras.callbacks.Callback):
    """Keras callback that checkpoints weights and records time-to-accuracy.

    Every ``epoch_ckpt`` epochs (starting with the first) the model weights are
    saved to ``<model_name>.h5``. The first time the training ``accuracy``
    metric reaches ``targetAcc``, the elapsed wall-clock time since
    ``startTime`` is printed and pickled to ``<model_name>_tta.pkl``.

    Args:
        startTime: reference timestamp (as returned by ``time.time()``).
        epoch_ckpt: checkpoint period, in epochs.
        model_name: prefix for the weight file and the time-to-accuracy file.
        targetAcc: training accuracy threshold to time; defaults to 0.87.
    """

    def __init__(self, startTime, epoch_ckpt, model_name, targetAcc=0.87):
        super(timeToAccuracy, self).__init__()
        self.targetAcc = targetAcc
        self.foundTarget = False
        self.startTime = startTime
        self.epoch = 0  # epochs seen by this callback instance
        self.epoch_ckpt = epoch_ckpt
        self.model_name = model_name
        self.prev_loss = None  # currently unused; kept so existing attribute access still works

    def on_epoch_end(self, epoch, logs=None):
        """Checkpoint periodically and record when the target accuracy is first hit."""
        if self.epoch % self.epoch_ckpt == 0:
            print(self.epoch, epoch)
            name = self.model_name + '.h5'
            self.model.save_weights(name)
            print('end', logs)

        self.epoch += 1

        if self.foundTarget:
            return

        # `logs` may be None or may lack the metric; treat that as "not reached".
        accuracy = (logs or {}).get('accuracy')
        if accuracy is None or accuracy < self.targetAcc:
            return

        elapsed = time.time() - self.startTime
        print("Time to reach {} accuracy: {} s".format(self.targetAcc, elapsed))
        # Use the instance's own model name rather than a notebook global so the
        # file always matches the model this callback is attached to.
        with open('{}_tta.pkl'.format(self.model_name), 'wb') as file:
            pickle.dump(elapsed, file)
        self.foundTarget = True

In [4]:
# Load the CIFAR-10 train/test split.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Normalize the images by dividing by 255 and convert the labels with to_categorical.
x_train, x_test = x_train / 255, x_test / 255
y_train, y_test = to_categorical(y_train), to_categorical(y_test)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [5]:
# Data augmentation: random horizontal flips, shifts of up to 10% of the
# width/height, and rotations of up to 15 degrees.
datagen = ImageDataGenerator(
    horizontal_flip=True,
    width_shift_range=0.1,
    height_shift_range=0.1,
    rotation_range=15,
)
# NOTE(review): Keras documents fit() as needed only for featurewise
# normalization / ZCA whitening, none of which is enabled here -- this call
# looks redundant; confirm before removing.
datagen.fit(x_train)

In [6]:
def fit_resnet_dataaug(model, xtrain, ytrain, xtest, ytest, model_name, convergence=False):
    """Compile and train ``model`` on augmented data, then evaluate it.

    The model is compiled with Adam (learning rate 1e-3) and categorical
    cross-entropy, trained on batches produced by the notebook-level
    ``datagen`` augmenter, and finally evaluated on the test data. A
    ``timeToAccuracy`` callback checkpoints the weights and records the time to
    the target accuracy.

    Args:
        model: Keras model to train (it is re-compiled on every call).
        xtrain, ytrain: training inputs and labels fed to ``datagen.flow``.
        xtest, ytest: data passed to ``model.evaluate``.
        model_name: file-name prefix handed to ``timeToAccuracy``.
        convergence: if true, train for up to 500 epochs with early stopping on
            the training loss (patience 20); otherwise train for 100 epochs.

    Returns:
        ``(train_time, acc)``: wall-clock training time in seconds and the
        second value returned by ``model.evaluate`` (the accuracy metric).
    """
    EPOCHS = 500 if convergence else 100
    BATCH_SIZE = 256
    VERBOSITY = 0    # silent fit; progress is reported by the callback instead
    EPOCH_CKPT = 15  # save model weights every N epochs

    opt = keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy'])

    start = time.time()
    print('Fitting with BS ', BATCH_SIZE)

    # Only add early stopping when requested: a None entry in the callbacks
    # list is not a valid callback.
    callbacks = [timeToAccuracy(startTime=start, epoch_ckpt=EPOCH_CKPT, model_name=model_name)]
    if convergence:
        callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20))

    # Train on the data passed in, not on whatever the notebook globals hold.
    model.fit(datagen.flow(xtrain, ytrain, batch_size=BATCH_SIZE),
              verbose=VERBOSITY,
              epochs=EPOCHS,
              callbacks=callbacks)
    train_time = time.time() - start

    # evaluate() returns [loss, accuracy] for the metrics compiled above.
    score = model.evaluate(xtest, ytest)
    acc = score[1]

    return train_time, acc

In [9]:
# Experiment settings: batch-norm budget per block, dropout budget per block,
# and one dropout rate per position inside a block.
bn, dp = 2, 3
prob = [0.1,0.2,0.3]

print('Training BN{}, DPID{}, DPCONV{}, DPPROB{} till convergence with data augmentation'.format(bn, dp, dp, prob))
model_name = 'model_bn{}_dpid_{}_dpconv_{}_dpprob_{}_convergence_dataaug'.format(bn, dp, dp, prob)

# num_dropout is left at its default, so no dropout is added outside the residual blocks.
resnet_model = resnet50(
    num_batchnorm=bn,
    bn_pooling=True,
    num_dropout_conv=dp,
    num_dropout_id=dp,
    dropout_prob=prob,
)

# convergence=True: up to 500 epochs, with early stopping on the training loss.
tt, acc = fit_resnet_dataaug(
    resnet_model, x_train, y_train, x_test, y_test, model_name,
    convergence=True,
)

print('Final accuracy {} reached in {}'.format(acc, tt))

Training BN2, DPID3, DPCONV3, DPPROB[0.1, 0.2, 0.3] till convergence with data augmentation
Fitting with BS  256
0 0
end {'loss': 2.1267759799957275, 'accuracy': 0.31859999895095825}
15 15
end {'loss': 0.739000141620636, 'accuracy': 0.7390400171279907}
30 30
end {'loss': 0.565538227558136, 'accuracy': 0.8022199869155884}
45 45
end {'loss': 0.4523443877696991, 'accuracy': 0.8400800228118896}
60 60
end {'loss': 0.38265860080718994, 'accuracy': 0.8648999929428101}
Time to reach 0.87 accuracy: 1536.7051684856415 s
75 75
end {'loss': 0.32152536511421204, 'accuracy': 0.8859400153160095}
90 90
end {'loss': 0.28127822279930115, 'accuracy': 0.8996800184249878}
105 105
end {'loss': 0.25219547748565674, 'accuracy': 0.9106199741363525}
120 120
end {'loss': 0.21934325993061066, 'accuracy': 0.9209799766540527}
135 135
end {'loss': 0.1972755491733551, 'accuracy': 0.9305599927902222}
150 150
end {'loss': 0.18034103512763977, 'accuracy': 0.9360399842262268}
165 165
end {'loss': 0.16317425668239594, 'ac