**HOMEWORK**

* **1bp** Use kernel/bias regularization to improve the behaviour of
AlexNet on the CIFAR10 dataset, using the random initialization.
Experiment with multiple
[initializers](https://keras.io/api/layers/initializers/).
Report your findings.

In [None]:

import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, BatchNormalization, Dropout, Flatten, Conv2D, MaxPooling2D


np.random.seed(1000)


def create_alex_net(kernel_initializer, bias_initializer):
    AlexNet = Sequential()
    # block 1
    AlexNet.add(Conv2D(filters=96, input_shape=(32,32,3), kernel_size=(11,11), strides=(4,4), padding='same'
                       , kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    AlexNet.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'))

    #block 2
    AlexNet.add(Conv2D(filters=256, kernel_size=(5, 5), strides=(1,1), padding='same'
                       , kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    AlexNet.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'))

    #block 3
    AlexNet.add(Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), padding='same'
                       , kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))

    #block 4
    AlexNet.add(Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), padding='same'
                       , kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))

    #block 5
    AlexNet.add(Conv2D(filters=256, kernel_size=(3,3), strides=(1,1), padding='same'
                       , kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    AlexNet.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'))

    AlexNet.add(Flatten())
    AlexNet.add(Dense(4096, input_shape=(32,32,3,)
                      , kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    AlexNet.add(Dropout(0.4))
    AlexNet.add(Dense(4096
                      , kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    AlexNet.add(Dropout(0.4))
    AlexNet.add(Dense(1000
                      , kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('relu'))
    AlexNet.add(Dropout(0.4))
    AlexNet.add(Dense(10
                      , kernel_initializer=kernel_initializer, bias_initializer=bias_initializer))
    AlexNet.add(BatchNormalization())
    AlexNet.add(Activation('softmax'))

    AlexNet.compile(loss = keras.losses.categorical_crossentropy, optimizer= 'adam', metrics=['accuracy'])
    return AlexNet


In [None]:
AlexNet = create_alex_net('zeros', 'zeros')
AlexNet.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_20 (Conv2D)          (None, 8, 8, 96)          34944     
                                                                 
 batch_normalization_34 (Bat  (None, 8, 8, 96)         384       
 chNormalization)                                                
                                                                 
 activation_34 (Activation)  (None, 8, 8, 96)          0         
                                                                 
 max_pooling2d_12 (MaxPoolin  (None, 4, 4, 96)         0         
 g2D)                                                            
                                                                 
 conv2d_21 (Conv2D)          (None, 4, 4, 256)         614656    
                                                                 
 batch_normalization_35 (Bat  (None, 4, 4, 256)       

In [None]:

# load the CIFAR 10 data
from keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical

def load_cifar10_data():
  (x_train, y_train), (x_test, y_test) = cifar10.load_data()
  y_train = to_categorical(y_train)
  y_test = to_categorical(y_test)
  return x_train, x_test, y_train, y_test


In [None]:
X_train, X_test, Y_train, Y_test = load_cifar10_data()
initializers = [
    'random_normal',
    'random_uniform',
    'truncated_normal',
    'zeros',
    'ones',
    'glorot_normal',
    'glorot_uniform',
    'glorot_uniform',
    'he_uniform',
    'he_normal',
]
for init in initializers:
    AlexNet = create_alex_net(init, init)
    AlexNet.fit(X_train, Y_train, epochs=5)
    loss, acc = AlexNet.evaluate(X_test, Y_test, verbose=0)
    print(init, loss, acc)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random_normal 1.1380503177642822 0.6111999750137329
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
random_uniform 1.3054465055465698 0.5561000108718872
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
truncated_normal 1.3752315044403076 0.5264000296592712
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
zeros 2.3109729290008545 0.10000000149011612
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ones 2.0341594219207764 0.19539999961853027
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
glorot_normal 1.1748121976852417 0.5884000062942505
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
glorot_uniform 1.6946511268615723 0.43959999084472656
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
glorot_uniform 1.1936275959014893 0.5752999782562256
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
he_uniform 1.1416836977005005 0.6007000207901001
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
he_normal 1.2005150318145752 0.582300007343

We can see that random_normal and he_uniform gave the best results, but they were not significantly better.
Using a zero  initializer would make the network not trainable, and for a network initialized with ones,
it will need a long warm up period to be able to adjust the weights.

* **1bp** Perform K-Fold CV on a sequence of models, aiming for polynomial regression over a dataset of synthetically generated data. Are the complexe models chosen over the simpler models? Why/ why not?

* **1bp** Fit ResNet50 on CIFAR10 using other types of optimizations techniques (RMSProp, AdaGrad). Can you explain the results by the specific of the algorithms?

In [None]:
import tensorflow as tf


def resnet50(inputs):

  resnet50 = tf.keras.applications.resnet.ResNet50(input_shape=(32, 32, 3),
                                               include_top=False,
                                               weights=None)(inputs)
  return resnet50


def classifier(inputs):
    x = tf.keras.layers.Flatten()(inputs)
    x = tf.keras.layers.Dense(512, activation="relu")(x)
    x = tf.keras.layers.Dense(10, activation="softmax", name="classification")(x)
    return x

def final_model(inputs):
    resnet = resnet50(inputs)
    output = classifier(resnet)
    return output

def model(optimizer):
    inputs = tf.keras.layers.Input(shape=(32,32,3))
    output = final_model(inputs) 
    model = tf.keras.Model(inputs=inputs, outputs = output)
 
    model.compile(optimizer=optimizer, 
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                metrics = ['accuracy'])
  
    return model


In [None]:

X_train, X_test, Y_train, Y_test = load_cifar10_data()

resnet50 = model('rmsprop')
resnet50.fit(X_train, Y_train, epochs=5, verbose=1)
print(resnet50.evaluate(X_test, Y_test, verbose=1))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[2.1364195346832275, 0.41449999809265137]


In [None]:

X_train, X_test, Y_train, Y_test = load_cifar10_data()

resnet50 = model('adagrad')
resnet50.fit(X_train, Y_train, epochs=5, verbose=1)
print(resnet50.evaluate(X_test, Y_test, verbose=1))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[1.4140031337738037, 0.5828999876976013]


Adagrad:
 * adapts depending on the size of the gradient
 * It is inverse dependant on the size of the gradient
 * based on recently past gradients
 * advantages features that appear rarely
 * However, it can become  blocked on plateaus

RMSprop:
 * same as Adagrad, but instead of keeping track of past gradients, it uses an exponential running average
 * on each iteration it divides the learning rate by exponential running average gradient


As we can see, adagrad performs better than rmsprop in the short term. This may be due to the fact that
unlike RMSProp, which uses an exponential running average, the past gradients used
by adagrad help adjust the learning rate better, trading memory for convergence.
Also, it could be that Adagrad performed better in the first few epochs because
it didn't yet encounter plateaus.


* **1bp** Use the simple LeNet demo to do transfer learning, classifying on the STL 10 dataset. Compare the transfer learning model performance with the one randomly initialized.


In [2]:
!pip install extra-keras-datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting extra-keras-datasets
  Downloading extra_keras_datasets-1.2.0-py3-none-any.whl (12 kB)
Installing collected packages: extra-keras-datasets
Successfully installed extra-keras-datasets-1.2.0


In [57]:
from extra_keras_datasets import stl10
from tensorflow.keras.utils import to_categorical
(input_train, target_train), (input_test, target_test) = stl10.load_data()
target_train = target_train - 1
target_test = target_test  -1


INFO:root:Loading dataset = stl-10


In [58]:
target_train = to_categorical(target_train)
target_test = to_categorical(target_test)

In [59]:
input_train.shape, target_train.shape

((5000, 96, 96, 3), (5000, 10))

In [5]:

import keras
from keras.datasets import mnist
from keras.layers import Conv2D, AveragePooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical


def get_mnist_data():
  (X_train, Y_train), (X_test, Y_test) = mnist.load_data()
  X_train = X_train.reshape((X_train.shape[0], 28, 28, 1))
  X_test = X_test.reshape((X_test.shape[0], 28, 28, 1))
  Y_train = to_categorical(Y_train) # use one-hot encoding
  Y_test = to_categorical(Y_test)
  
  return X_train, Y_train, X_test, Y_test



In [12]:
from keras.layers import AveragePooling2D, InputLayer

def create_lenet():
    model = keras.Sequential()
    model.add(InputLayer(input_shape=(28,28,1)))
    model.add(Conv2D(filters=6, kernel_size=(3, 3), activation='relu'))
    model.add(AveragePooling2D())
    model.add(Conv2D(filters=16, kernel_size=(3, 3), activation='relu'))
    model.add(AveragePooling2D())
    model.add(Flatten())
    model.add(Dense(units=120, activation='relu'))
    model.add(Dense(units=84, activation='relu'))
    model.add(Dense(units=10, activation = 'softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [70]:
def train_on_mnist(model):
    X_train, Y_train, _, _ = get_mnist_data()
    model.fit(X_train, Y_train, epochs=10)
    return model

In [71]:
def create_for_stl_10(train):
    model = create_lenet()
    if train:
        train_on_mnist(model)
    new_model = keras.Sequential()
    new_model.add(InputLayer(input_shape=(96,96,3)))
    new_model.add(Conv2D(filters=6, kernel_size=(5, 5), activation='relu'))
    new_model.add(AveragePooling2D(strides=(2, 2)))
    new_model.add(Conv2D(filters=6, kernel_size=(5, 5), activation='relu'))
    new_model.add(Conv2D(filters=6, kernel_size=(5, 5), activation='relu'))
    new_model.add(Conv2D(filters=6, kernel_size=(5, 5), activation='relu'))
    new_model.add(Conv2D(filters=6, kernel_size=(5, 5), activation='relu'))
    new_model.add(Conv2D(filters=1, kernel_size=(3, 3), activation='relu'))
    for layer in model.layers[:]:
        new_model.add(layer)
    new_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return new_model

In [72]:
model = create_for_stl_10(True)
model.fit(input_train, target_train, epochs=20)
model.evaluate(input_test, target_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[3.970431089401245, 0.29237499833106995]

In [73]:
model = create_for_stl_10(False)
model.fit(input_train, target_train, epochs=20)
model.evaluate(input_test, target_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[1.8606065511703491, 0.3608750104904175]

The results are not better when using the pretrained model. Even though it seems that the training accuracy is better due to having an increase in convergence speed due to the pretrained weights, the features learnt in the mnist dataset are not relevant to the STL_10 dataset. 

Mnist is smaller, (28, 28, 1), while stl 10 is (96, 96, 3). 
Mnist has images from the same class, numbers, while the classes in stl 10 are more diverse.

The pretrained model clearly shows signs of overfitting. 

However, the transfer learning experiment was not performed well.
In order to broadcast the stl 10 images to a shape which is also convenient for mnist, some more convolution layers were added before. 

maybe a better solution would have been to do upsampling for the mnist images to (96, 96, 3), because increasing the feature space might bring better results than 'forcefully' decreasing the feature space to make the 2 shapes compatible and trainable with the same model. 
