# Part1

## Resnet50 on ImageNet


In [None]:
# Prepare the pre-trained ResNet50 (ImageNet weights)

import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
import numpy as np


# ResNet50 with ImageNet weights, built on an explicit 224x224 RGB input tensor.
inputs = tf.keras.Input(shape=(224, 224, 3))

# The original passed the string 'False' here. A non-empty string is truthy, so
# the classification head was included all along. Spell that out with a real
# boolean: the later cell that takes `ImageNet_model.layers[-2].output` relies
# on the head being present (it cuts the network just below the final layer).
ImageNet_model = ResNet50(include_top=True, weights='imagenet', input_tensor=inputs)



Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5


In [None]:
# Print the layer-by-layer summary of the ImageNet ResNet50 model built above.

ImageNet_model.summary()

Model: "resnet50"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, 230, 230, 3)  0           ['input_1[0][0]']                
                                                                                                  
 conv1_conv (Conv2D)            (None, 112, 112, 64  9472        ['conv1_pad[0][0]']              
                                )                                                                 
                                                                                           

## Drop the last FC layer and add a new FC layer (10 neurons)




In [None]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, Lambda
import tensorflow as tf
from tensorflow.keras.models import Model

# Cut the network just below its final layer: the output of the second-to-last
# layer becomes the output of a new feature-extractor model.
output = ImageNet_model.layers[-2].output
drop_resnet = Model(inputs=ImageNet_model.input, outputs=output)

# Freeze every layer of the truncated network so that only the new head trains.
for layer in drop_resnet.layers:
    layer.trainable = False

# Stack the frozen feature extractor and a new softmax head with one unit per
# target class.
num_classes = 10
New_Cifar10_model = Sequential()
New_Cifar10_model.add(drop_resnet)
New_Cifar10_model.add(Dense(num_classes, activation='softmax'))
New_Cifar10_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (Functional)          (None, 2048)              23587712  
                                                                 
 dense (Dense)               (None, 10)                20490     
                                                                 
Total params: 23,608,202
Trainable params: 20,490
Non-trainable params: 23,587,712
_________________________________________________________________


## Custom DataLoader

In [None]:
# Create DataGenerator

import numpy as np
import tensorflow.keras as keras
from tensorflow.keras.utils import to_categorical
import cv2

class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves resized image batches with one-hot labels.

    Batches are built on the fly from the in-memory collections ``X`` and ``y``:
    every image is resized with ``cv2.resize`` to ``dim`` and the integer labels
    are one-hot encoded with ``keras.utils.to_categorical``.

    Args:
        X: indexable collection of images (``X[k]`` is one image array).
        y: indexable collection of integer class labels aligned with ``X``;
            each entry may be a scalar or a length-1 array.
        batch_size: number of samples per batch.
        dim: target ``(height, width)`` of every image in a batch.
        n_channels: number of channels of the resized images.
        n_classes: number of classes used for the one-hot encoding.
        shuffle: if true, the sample order is reshuffled at construction time
            and again at the end of every epoch.
    """

    def __init__(self, X, y, batch_size=32, dim=(32,32,32), n_channels=1,
                 n_classes=10, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.y = y
        self.X = X
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        # Builds (and optionally shuffles) the initial sample order.
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        # Only full batches are served; a trailing partial batch is dropped.
        return int(np.floor(len(self.X) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Sample positions that make up batch number `index`.
        batch_ids = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # float32 represents the pixel values exactly and halves the memory of
        # numpy's default float64 buffer.
        X = np.empty((len(batch_ids), *self.dim, self.n_channels), dtype=np.float32)
        y = np.empty(len(batch_ids), dtype=int)

        # Resize to the configured `dim` instead of a hard-coded 224x224.
        # cv2.resize expects the target size as (width, height).
        target_size = (self.dim[1], self.dim[0])
        for i, k in enumerate(batch_ids):
            resized = cv2.resize(self.X[k], target_size)
            # The reshape restores a trailing channel axis if the resize
            # returned a 2-D (single-channel) image.
            X[i] = np.asarray(resized).reshape(X.shape[1:])
            # Labels may be scalars or length-1 arrays; take the scalar
            # explicitly rather than assigning an array into a scalar slot.
            y[i] = np.asarray(self.y[k]).reshape(-1)[0]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.X))
        if self.shuffle:
            np.random.shuffle(self.indexes)





## Train procedure on Cifar10 using Transfer learning

In [None]:
from tensorflow.keras.datasets import cifar10


New_Cifar10_model.compile(loss='categorical_crossentropy', optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])


# Parameters shared by the training and validation generators.
params = {'dim': (224,224),
          'batch_size': 32,
          'n_classes': 10,
          'n_channels': 3,
          'shuffle': True}


(x_train, y_train) , (x_val, y_val) = cifar10.load_data()


print("train input size: ", np.shape(x_train))
print("validation input size: ", np.shape(x_val))

# NOTE(review): the generators feed raw pixel values; `preprocess_input` is
# imported at the top of the notebook but never applied. Confirm whether the
# pre-trained backbone should receive preprocessed inputs.
training_generator = DataGenerator(x_train, y_train, **params)
validation_generator = DataGenerator(x_val, y_val, **params)


# `fit_generator` is deprecated (it is what triggers the warning shown in this
# cell's output); `fit` accepts a keras.utils.Sequence directly.
New_Cifar10_model.fit(training_generator,
                    validation_data=validation_generator,
                    epochs=3,
                    use_multiprocessing=True,
                    workers=6)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
train input size:  (50000, 32, 32, 3)
validation input size:  (10000, 32, 32, 3)
Epoch 1/3


  New_Cifar10_model.fit_generator(generator=training_generator,


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f5d9847c9a0>

## Evaluation

**Because the model starts from weights pre-trained on the ImageNet dataset, the validation loss and accuracy start from a good point. After 3 epochs, we reach 86% accuracy on the validation set.**

# Part2

## Keras Distiller implementation

In [None]:
# Use keras doc https://keras.io/examples/vision/knowledge_distillation/

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

class Distiller(keras.Model):
    """Knowledge-distillation wrapper that trains ``student`` to mimic ``teacher``.

    Only the student's trainable variables are updated. The teacher is called in
    inference mode. The outputs of both models are divided by the temperature and
    passed through a softmax before the distillation loss is computed, i.e. they
    are treated as logits.
    """

    def __init__(self, student, teacher):
        """Store the two models; ``teacher`` guides, ``student`` is trained."""
        super().__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """Configure the distiller.

        Args:
            optimizer: optimizer applied to the student's trainable variables.
            metrics: metrics updated with (labels, student predictions).
            student_loss_fn: callable ``(y_true, student_predictions)`` giving
                the hard-label loss.
            distillation_loss_fn: callable ``(softened teacher, softened student)``
                giving the distillation loss.
            alpha: weight of the student loss; the distillation loss is
                weighted by ``1 - alpha``.
            temperature: divisor applied to both models' outputs before softmax.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        self.alpha = alpha
        self.temperature = temperature
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn

    def train_step(self, data):
        """Run one optimisation step on the student and return the logged values."""
        inputs, labels = data
        temp = self.temperature

        # The teacher is evaluated outside the tape and in inference mode.
        teacher_out = self.teacher(inputs, training=False)

        with tf.GradientTape() as tape:
            student_out = self.student(inputs, training=True)

            # Hard-label loss on the student's own predictions.
            hard_loss = self.student_loss_fn(labels, student_out)

            # Softened distributions, as in https://arxiv.org/abs/1503.02531.
            soft_targets = tf.nn.softmax(teacher_out / temp, axis=1)
            soft_student = tf.nn.softmax(student_out / temp, axis=1)

            # Soft-target gradients scale as 1/T^2, so the loss is multiplied
            # by T^2 when it is mixed with the hard-label loss.
            soft_loss = self.distillation_loss_fn(soft_targets, soft_student) * temp**2

            loss = self.alpha * hard_loss + (1 - self.alpha) * soft_loss

        # Gradients are taken with respect to the student's variables only.
        student_vars = self.student.trainable_variables
        grads = tape.gradient(loss, student_vars)
        self.optimizer.apply_gradients(zip(grads, student_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(labels, student_out)

        results = {metric.name: metric.result() for metric in self.metrics}
        results["student_loss"] = hard_loss
        results["distillation_loss"] = soft_loss
        return results

    def test_step(self, data):
        """Evaluate the student on one batch; the teacher is not involved."""
        inputs, labels = data

        student_out = self.student(inputs, training=False)
        hard_loss = self.student_loss_fn(labels, student_out)

        self.compiled_metrics.update_state(labels, student_out)

        results = {metric.name: metric.result() for metric in self.metrics}
        results["student_loss"] = hard_loss
        return results

## Resnet 18 implementation

In [None]:
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Conv2D,  MaxPool2D, Flatten, GlobalAveragePooling2D,  BatchNormalization, Layer, Add
from keras.models import Sequential
from keras.models import Model
import tensorflow as tf


class ResnetBlock(Model):
    """
    A standard resnet block: two 3x3 convolutions, each followed by batch
    normalization, plus a shortcut connection that is added before the final
    ReLU. With ``down_sample=True`` the first convolution uses stride 2 and the
    shortcut goes through a strided 1x1 convolution and batch normalization so
    that both branches have matching shapes.
    """

    def __init__(self, channels: int, down_sample=False):
        """
        channels: same as number of convolution kernels
        down_sample: if true, halve the spatial resolution in this block
        """
        super().__init__()

        # Single-underscore names on purpose: double-underscore attributes are
        # name-mangled by Python, and AutoGraph reports "mangled names are not
        # yet supported" (the warning shown in this notebook's training output)
        # when `call` references them.
        self._block_channels = channels
        self._use_down_sample = down_sample
        self._conv_strides = [2, 1] if down_sample else [1, 1]

        KERNEL_SIZE = (3, 3)
        # use He initialization, instead of Xavier (a.k.a 'glorot_uniform' in Keras)
        INIT_SCHEME = "he_normal"

        self.conv_1 = Conv2D(self._block_channels, strides=self._conv_strides[0],
                             kernel_size=KERNEL_SIZE, padding="same", kernel_initializer=INIT_SCHEME)
        self.bn_1 = BatchNormalization()
        self.conv_2 = Conv2D(self._block_channels, strides=self._conv_strides[1],
                             kernel_size=KERNEL_SIZE, padding="same", kernel_initializer=INIT_SCHEME)
        self.bn_2 = BatchNormalization()
        self.merge = Add()

        if self._use_down_sample:
            # perform down sampling on the shortcut branch using stride of 2
            self.res_conv = Conv2D(
                self._block_channels, strides=2, kernel_size=(1, 1), kernel_initializer=INIT_SCHEME, padding="same")
            self.res_bn = BatchNormalization()

    def call(self, inputs):
        """Apply the block to ``inputs`` and return the activated sum of both branches."""
        res = inputs

        x = self.conv_1(inputs)
        x = self.bn_1(x)
        x = tf.nn.relu(x)
        x = self.conv_2(x)
        x = self.bn_2(x)

        if self._use_down_sample:
            # Project the shortcut so its shape matches the main branch.
            res = self.res_conv(res)
            res = self.res_bn(res)

        # if not perform down sample, then add a shortcut directly
        x = self.merge([x, res])
        out = tf.nn.relu(x)
        return out


class ResNet18(Model):
    """ResNet-18 style classifier assembled from ``ResnetBlock`` units.

    Layout: 7x7 stride-2 convolution, batch normalization, ReLU, max pooling,
    eight residual blocks (64, 64, 128, 128, 256, 256, 512, 512 channels, with
    down-sampling at the first block of each wider stage), global average
    pooling, flatten, and a softmax ``Dense`` layer.
    """

    def __init__(self, num_classes, **kwargs):
        """
            num_classes: number of units of the final softmax layer, i.e. the
                number of classes in the classification task.
        """
        super().__init__(**kwargs)

        # Stem
        self.conv_1 = Conv2D(filters=64, kernel_size=(7, 7), strides=2,
                             padding="same", kernel_initializer="he_normal")
        self.init_bn = BatchNormalization()
        self.pool_2 = MaxPool2D(pool_size=(2, 2), strides=2, padding="same")

        # Residual stages; the first block of stages 2-4 down-samples.
        self.res_1_1 = ResnetBlock(64)
        self.res_1_2 = ResnetBlock(64)
        self.res_2_1 = ResnetBlock(128, down_sample=True)
        self.res_2_2 = ResnetBlock(128)
        self.res_3_1 = ResnetBlock(256, down_sample=True)
        self.res_3_2 = ResnetBlock(256)
        self.res_4_1 = ResnetBlock(512, down_sample=True)
        self.res_4_2 = ResnetBlock(512)

        # Classification head
        self.avg_pool = GlobalAveragePooling2D()
        self.flat = Flatten()
        self.fc = Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Return the softmax class scores for a batch of images."""
        features = self.pool_2(tf.nn.relu(self.init_bn(self.conv_1(inputs))))

        residual_blocks = (
            self.res_1_1, self.res_1_2,
            self.res_2_1, self.res_2_2,
            self.res_3_1, self.res_3_2,
            self.res_4_1, self.res_4_2,
        )
        for block in residual_blocks:
            features = block(features)

        pooled = self.flat(self.avg_pool(features))
        return self.fc(pooled)

## Teacher_Student model

In [None]:
import tensorflow.keras as keras

# Teacher: the ResNet50-based classifier defined earlier in the notebook.
teacher = New_Cifar10_model
num_classes = 10

# Student: a ResNet18 built for batches of 224x224 RGB images.
student = resnet18 = ResNet18(10)
student.build(input_shape=(None, 224, 224, 3))



In [None]:
# Print the layer-by-layer summary of the student network.
student.summary()

Model: "res_net18_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_40 (Conv2D)          multiple                  9472      
                                                                 
 batch_normalization_40 (Bat  multiple                 256       
 chNormalization)                                                
                                                                 
 max_pooling2d_2 (MaxPooling  multiple                 0         
 2D)                                                             
                                                                 
 resnet_block_16 (ResnetBloc  multiple                 74368     
 k)                                                              
                                                                 
 resnet_block_17 (ResnetBloc  multiple                 74368     
 k)                                                    

In [None]:
# Create DataGenerator

import numpy as np
import tensorflow.keras as keras

class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves resized image batches with one-hot labels.

    NOTE: this re-defines (and therefore shadows) the ``DataGenerator`` class
    declared earlier in the notebook.

    Batches are built on the fly from the in-memory collections ``X`` and ``y``:
    every image is resized with ``cv2.resize`` to ``dim`` and the integer labels
    are one-hot encoded with ``keras.utils.to_categorical``.

    Args:
        X: indexable collection of images (``X[k]`` is one image array).
        y: indexable collection of integer class labels aligned with ``X``;
            each entry may be a scalar or a length-1 array.
        batch_size: number of samples per batch.
        dim: target ``(height, width)`` of every image in a batch.
        n_channels: number of channels of the resized images.
        n_classes: number of classes used for the one-hot encoding.
        shuffle: if true, the sample order is reshuffled at construction time
            and again at the end of every epoch.
    """

    def __init__(self, X, y, batch_size=32, dim=(32,32,32), n_channels=1,
                 n_classes=10, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.y = y
        self.X = X
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        # Builds (and optionally shuffles) the initial sample order.
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        # Only full batches are served; a trailing partial batch is dropped.
        return int(np.floor(len(self.X) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Sample positions that make up batch number `index`.
        batch_ids = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # float32 represents the pixel values exactly and halves the memory of
        # numpy's default float64 buffer.
        X = np.empty((len(batch_ids), *self.dim, self.n_channels), dtype=np.float32)
        y = np.empty(len(batch_ids), dtype=int)

        # Resize to the configured `dim` instead of a hard-coded 224x224.
        # cv2.resize expects the target size as (width, height).
        target_size = (self.dim[1], self.dim[0])
        for i, k in enumerate(batch_ids):
            resized = cv2.resize(self.X[k], target_size)
            # The reshape restores a trailing channel axis if the resize
            # returned a 2-D (single-channel) image.
            X[i] = np.asarray(resized).reshape(X.shape[1:])
            # Labels may be scalars or length-1 arrays; take the scalar
            # explicitly rather than assigning an array into a scalar slot.
            y[i] = np.asarray(self.y[k]).reshape(-1)[0]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.X))
        if self.shuffle:
            np.random.shuffle(self.indexes)


# Generator settings shared by the training and validation pipelines.
params = dict(
    dim=(224, 224),
    batch_size=32,
    n_classes=10,
    n_channels=3,
    shuffle=True,
)

# CIFAR-10 arrays: (train images, train labels), (validation images, validation labels).
(x_train, y_train), (x_val, y_val) = cifar10.load_data()

training_generator = DataGenerator(x_train, y_train, **params)
validation_generator = DataGenerator(x_val, y_val, **params)



### Test knowledge distillation test1 : ( alpha = 0.1 , temperature = 10)

In [None]:
# Start from a freshly initialised student so that each hyper-parameter setting
# is evaluated on its own rather than continuing the training of a previous run
# (this also makes the cell safe to re-run).
student = ResNet18(10)
student.build(input_shape=(None, 224, 224, 3))

distiller = Distiller(student=student, teacher=teacher)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.CategoricalAccuracy()],
    # Use a loss object here. keras.metrics.CategoricalCrossentropy is a
    # stateful Metric: calling it returns a running average read from its state
    # rather than this batch's loss, so it is not a usable training objective.
    student_loss_fn=keras.losses.CategoricalCrossentropy(),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=10,
)

# NOTE(review): teacher and student both end in a softmax layer, while
# Distiller applies softmax(prediction / temperature) again. Consider
# distilling from logits instead.

# `fit_generator` is deprecated (see the warning in this cell's output);
# `fit` accepts a keras.utils.Sequence directly.
distiller.fit(training_generator,
                    validation_data=validation_generator,
                    epochs=3,
                    use_multiprocessing=True,
                    workers=6)



Epoch 1/3


  distiller.fit_generator(generator=training_generator,


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f28def9ee50>

### Test knowledge distillation test2 : ( alpha = 0.8 , temperature = 10)




In [None]:
# Start from a freshly initialised student so that each hyper-parameter setting
# is evaluated on its own rather than continuing the training of a previous run
# (this also makes the cell safe to re-run).
student = ResNet18(10)
student.build(input_shape=(None, 224, 224, 3))

distiller = Distiller(student=student, teacher=teacher)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.CategoricalAccuracy()],
    # Use a loss object here. keras.metrics.CategoricalCrossentropy is a
    # stateful Metric: calling it returns a running average read from its state
    # rather than this batch's loss, so it is not a usable training objective.
    student_loss_fn=keras.losses.CategoricalCrossentropy(),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.8,
    temperature=10,
)

# NOTE(review): teacher and student both end in a softmax layer, while
# Distiller applies softmax(prediction / temperature) again. Consider
# distilling from logits instead.

# `fit_generator` is deprecated (see the warning in this cell's output);
# `fit` accepts a keras.utils.Sequence directly.
distiller.fit(training_generator,
                    validation_data=validation_generator,
                    epochs=3,
                    use_multiprocessing=True,
                    workers=6)



Epoch 1/3


  distiller.fit_generator(generator=training_generator,


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f28def9e340>

### Test knowledge distillation test3 : ( alpha = 0.8 , temperature = 3)

In [None]:
# Start from a freshly initialised student so that each hyper-parameter setting
# is evaluated on its own rather than continuing the training of a previous run
# (this also makes the cell safe to re-run).
student = ResNet18(10)
student.build(input_shape=(None, 224, 224, 3))

distiller = Distiller(student=student, teacher=teacher)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.CategoricalAccuracy()],
    # Use a loss object here. keras.metrics.CategoricalCrossentropy is a
    # stateful Metric: calling it returns a running average read from its state
    # rather than this batch's loss, so it is not a usable training objective.
    student_loss_fn=keras.losses.CategoricalCrossentropy(),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.8,
    temperature=3,
)

# NOTE(review): teacher and student both end in a softmax layer, while
# Distiller applies softmax(prediction / temperature) again. Consider
# distilling from logits instead.

# `fit_generator` is deprecated (see the warning in this cell's output);
# `fit` accepts a keras.utils.Sequence directly.
distiller.fit(training_generator,
                    validation_data=validation_generator,
                    epochs=3,
                    use_multiprocessing=True,
                    workers=6)

Epoch 1/3


  distiller.fit_generator(generator=training_generator,


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f267ee9d4c0>

### Test knowledge distillation test4 : ( alpha = 0.9 , temperature = 2)

In [None]:
# Start from a freshly initialised student so that each hyper-parameter setting
# is evaluated on its own rather than continuing the training of a previous run
# (this also makes the cell safe to re-run).
student = ResNet18(10)
student.build(input_shape=(None, 224, 224, 3))

distiller = Distiller(student=student, teacher=teacher)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.CategoricalAccuracy()],
    # Use a loss object here. keras.metrics.CategoricalCrossentropy is a
    # stateful Metric: calling it returns a running average read from its state
    # rather than this batch's loss, so it is not a usable training objective.
    student_loss_fn=keras.losses.CategoricalCrossentropy(),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.9,
    temperature=2,
)

# NOTE(review): teacher and student both end in a softmax layer, while
# Distiller applies softmax(prediction / temperature) again. Consider
# distilling from logits instead.

# `fit_generator` is deprecated (see the warning in this cell's output);
# `fit` accepts a keras.utils.Sequence directly.
distiller.fit(training_generator,
                    validation_data=validation_generator,
                    epochs=3,
                    use_multiprocessing=True,
                    workers=6)

Epoch 1/3


  distiller.fit_generator(generator=training_generator,


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f267eff6c40>

## Evaluation

**Based on the 4 experiments, alpha = 0.9 and temperature = 2 are the best hyperparameters for the teacher–student model.**

# Part3

**Use [ part2 -> Resnet Implementation and part1 -> CustomDataloader] to train and evaluate Resnet18 on Cifar10**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, Lambda
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import cifar10

(x_train, y_train) , (x_val, y_val) = cifar10.load_data()



# Parameters shared by the training and validation generators.
params = {'dim': (224,224),
          'batch_size': 32,
          'n_classes': 10,
          'n_channels': 3,
          'shuffle': True}


training_generator = DataGenerator(x_train, y_train, **params)
validation_generator = DataGenerator(x_val, y_val, **params)

# Plain ResNet18 baseline (no teacher), built for 224x224 RGB batches.
resnet18 = ResNet18(10)
resnet18.build(input_shape = (None,224,224,3))

resnet18.compile(loss='categorical_crossentropy', optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])

# `fit_generator` is deprecated (see the warning in this cell's output);
# `fit` accepts a keras.utils.Sequence directly.
resnet18.fit(training_generator,
                    validation_data=validation_generator,
                    epochs=3,
                    use_multiprocessing=True,
                    workers=6)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


Cause: mangled names are not yet supported


Cause: mangled names are not yet supported
Epoch 1/3


  resnet18.fit_generator(generator=training_generator,


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f28e0dfc8e0>

## Evaluation

**We can see that the plain ResNet18 model performs worse than the teacher–student model (alpha=0.9, temperature=2): its validation loss is higher and its validation accuracy is lower.**

**Knowledge Distillation (Resnet50 : teacher , resnet18: student)**

1.   Best validation loss : 0.6737
2.   Best validation accuracy : 0.7689

**Resnet18**

1.   Best validation loss : 1.09
2.   Best validation accuracy : 0.6546 (65.46%)

The reason for the difference is that knowledge distillation acts as a kind of transfer learning: the trained ResNet50 teacher had already learned important features of the CIFAR-10 images with the help of its ImageNet pre-training, and the student benefits from that knowledge. By tuning the hyperparameters (alpha and temperature), we adjusted how strongly the student depends on the teacher.





# Part4

**Use Part1 to train and evaluate Resnet50 on Cifar10**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, Lambda
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import cifar10

# ResNet50 with ImageNet weights
inputs = tf.keras.Input(shape=(224, 224, 3))

# The original passed the string 'False' for include_top. A non-empty string is
# truthy, so the classification head was included all along; say so explicitly,
# because `layers[-2]` below relies on the head being present.
# NOTE(review): weights='imagenet' means this run fine-tunes the pre-trained
# network with every layer trainable. It does not start from random weights,
# although the evaluation text below calls it "from scratch". Pass weights=None
# if true from-scratch training is intended.
ImageNet_model = ResNet50(include_top=True, weights='imagenet', input_tensor=inputs)



# get the output just before the last fc layer
output = ImageNet_model.layers[-2].output
drop_resnet = Model(ImageNet_model.input, output)

# Unlike Part 1, keep every backbone layer trainable (nothing is frozen here).
for layer in drop_resnet.layers:
    layer.trainable = True


New_Cifar10_model = Sequential()
New_Cifar10_model.add(drop_resnet) # truncated ResNet50 as feature extractor



# Add last Fully connected layer based on number of classes
num_classes = 10
New_Cifar10_model.add(Dense(num_classes, activation='softmax'))
New_Cifar10_model.summary()



New_Cifar10_model.compile(loss='categorical_crossentropy', optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])


# Parameters shared by the training and validation generators.
params = {'dim': (224,224),
          'batch_size': 32,
          'n_classes': 10,
          'n_channels': 3,
          'shuffle': True}


(x_train, y_train) , (x_val, y_val) = cifar10.load_data()


print("train input size: ", np.shape(x_train))
print("validation input size: ", np.shape(x_val))

training_generator = DataGenerator(x_train, y_train, **params)
validation_generator = DataGenerator(x_val, y_val, **params)


# `fit_generator` is deprecated (see the warning in this cell's output);
# `fit` accepts a keras.utils.Sequence directly.
New_Cifar10_model.fit(training_generator,
                    validation_data=validation_generator,
                    epochs=3,
                    use_multiprocessing=True,
                    workers=6)



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model_1 (Functional)        (None, 2048)              23587712  
                                                                 
 dense_2 (Dense)             (None, 10)                20490     
                                                                 
Total params: 23,608,202
Trainable params: 23,555,082
Non-trainable params: 53,120
_________________________________________________________________
train input size:  (50000, 32, 32, 3)
validation input size:  (10000, 32, 32, 3)
Epoch 1/3


  New_Cifar10_model.fit_generator(generator=training_generator,


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f297c041970>

## Evaluation

**As in the previous section, the results of the pre-trained ResNet50 are better than those of the ResNet50 trained from scratch.**

**pre_trained resnet50 on ImageNet (trained on Cifar10)** 

1.   Best validation loss : 0.4176
2.   Best validation accuracy : 86.15

**Resnet50 from scratch**

1.   Best validation loss : 0.5682
2.   Best validation accuracy : 81.18

**Another important difference between these two models is that the ResNet50 trained from scratch starts overfitting from the second epoch, whereas we did not observe this with the pre-trained ResNet50.**

**The reason for the difference in the metrics of these two approaches is that transfer learning reuses features learned by another model on another dataset: the pre-trained model has already learned general features, such as edges, from a much larger dataset with a well-tuned model. For the new dataset it therefore only has to learn the subtle, dataset-specific features.**




