<a href="https://colab.research.google.com/github/aryanasadianuoit/Adaptive-Graph-Based-Cohort-Creation-For-Deep-MutualLearning/blob/master/KD_Different_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt 
import numpy as np
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Conv2D,GlobalAveragePooling2D,Dense,Softmax,Flatten,MaxPooling2D,Dropout,Activation, Lambda, concatenate
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import kullback_leibler_divergence as KLD_Loss, categorical_crossentropy as logloss
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import categorical_accuracy
import seaborn as sns

  import pandas.util.testing as tm


In [2]:
# Number of target classes; also the column offset used when targets are split later on.
NUM_CLASSES = 10

# Load the CIFAR-10 train/test splits (fetched from the network when not cached locally).
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
print("x_train shape:", x_train.shape, "y_train shape:", y_train.shape)

# Cast the images to float32 and divide by 255.
x_train, x_test = (images.astype('float32') / 255 for images in (x_train, x_test))

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
x_train shape: (50000, 32, 32, 3) y_train shape: (50000, 1)


In [6]:
def teacher_generator():
  """Build and compile the teacher CNN.

  The model takes 32x32x3 inputs and stacks two Conv2D/MaxPooling2D stages,
  Flatten, Dense(256), Dense(64) named "teacher_target_layer", Dropout(0.5),
  Dense(10) and a separate softmax Activation layer. It is compiled with
  sparse categorical cross-entropy, the Adam optimizer and an accuracy metric.

  Returns:
    The compiled (untrained) Sequential model.
  """
  layer_stack = [
      # The first layer declares the input shape for the whole network.
      Conv2D(filters=32, kernel_size=2, padding='same', activation='relu', input_shape=(32,32,3)),
      MaxPooling2D(pool_size=2),
      Conv2D(filters=64, kernel_size=2, padding='same', activation='relu'),
      MaxPooling2D(pool_size=2),
      Flatten(),
      Dense(256, activation='relu'),
      Dense(64, activation='relu', name="teacher_target_layer"),
      Dropout(0.5),
      # Dense(10) and the softmax are kept as two separate layers.
      Dense(10),
      Activation('softmax'),
  ]
  teacher = Sequential(layer_stack)

  teacher.compile(
      optimizer='adam',
      loss='sparse_categorical_crossentropy',
      metrics=['accuracy'],
  )
  return teacher

In [9]:
def student_generator():
  """Build the small fully-connected student network.

  The model flattens 32x32x3 inputs and applies Dense(64, relu) followed by
  Dense(10) with no activation. The model is returned uncompiled.

  Returns:
    The uncompiled Sequential model.
  """
  layer_stack = [
      # The first layer declares the input shape for the whole network.
      Flatten(input_shape=(32,32,3)),
      Dense(64, activation='relu'),
      Dense(10),
  ]
  return Sequential(layer_stack)

In [14]:
# Instantiating a teacher model and training it ===> for




#                                                                 OFFLINE KNOWLEDGE DISTILLATION


# Build a fresh teacher and train it on the hard labels.
teacher_model = teacher_generator()
# The teacher is compiled with metrics=['accuracy'], which tf.keras (TF 2.x) logs
# as 'val_accuracy' on the validation split. Monitoring a key that is never logged
# ('val_acc') makes ModelCheckpoint skip saving, so the best model would not be kept.
myCP = ModelCheckpoint(save_best_only=True,filepath='teacher_model.h5',monitor = 'val_accuracy')
teacher_model.fit(x_train,
         y_train,
         batch_size=128,
         epochs=20,
         validation_split = 0.2,
         callbacks=[myCP])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f61eb05c400>

In [16]:
#                                                                OFFLINE KNOWLEDGE DISTILLATION

# Phase 1 ===> rebuild the TEACHER so that its softmax is applied to the logits divided by a pre-defined TEMPERATURE

# Softening temperature; the Lambda layer below reads this global when it is called.
Temperature = 3.25

# layers[-2] is the layer right before the final softmax Activation, i.e. the Dense(10) output.
teacher_logits_model = Model(inputs=teacher_model.input, outputs=teacher_model.layers[-2].output)

# Divide the logits by the temperature, then apply softmax to get the softened probabilities.
T_layer = Lambda(lambda logits: logits / Temperature)(teacher_logits_model.output)
Softmax_layer = Activation('softmax')(T_layer)
teacher_soften_model = Model(inputs=teacher_model.input, outputs=Softmax_layer)
     

In [17]:
#                                                                OFFLINE KNOWLEDGE DISTILLATION

# Phase 2 ===> Extend the training labels with the softened probabilities produced by the pre-trained TEACHER


# Softened teacher probabilities for every train and test image.
y_train_new = teacher_soften_model.predict(x_train)
y_test_new = teacher_soften_model.predict(x_test)

# Each target row becomes [one-hot hard label | softened teacher probabilities].
# num_classes is passed explicitly so the one-hot part is always NUM_CLASSES columns
# wide (the offset KD_loss splits on), instead of being inferred from the largest
# label present in the array.
y_train_new = np.c_[to_categorical(y_train, num_classes=NUM_CLASSES),y_train_new]
y_test_new = np.c_[to_categorical(y_test, num_classes=NUM_CLASSES),y_test_new]

In [19]:
#                                                                OFFLINE KNOWLEDGE DISTILLATION

# Phase 3 ===> Instantiate a STUDENT. student_generator builds models whose last layer has no ACTIVATION,
# so the output of that layer is taken as the logits and two SOFTMAX heads are attached to it.


student_model = student_generator()
student_model.summary()

# Output of the final Dense layer (no activation applied).
student_logits = student_model.layers[-1].output

# Softmax head on the raw logits
probs = Activation("softmax")(student_logits)

# Softmax head on the logits divided by Temperature
logits_T = Lambda(lambda logits: logits / Temperature)(student_logits)
probs_T = Activation("softmax")(logits_T)

# Join both heads on the last axis: [probs | probs_T]
CombinedLayers = concatenate([probs, probs_T], axis=-1)

student_model_with_softmax = Model(inputs=student_model.input, outputs=CombinedLayers)


Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_9 (Flatten)          (None, 3072)              0         
_________________________________________________________________
dense_18 (Dense)             (None, 64)                196672    
_________________________________________________________________
dense_19 (Dense)             (None, 10)                650       
Total params: 197,322
Trainable params: 197,322
Non-trainable params: 0
_________________________________________________________________


In [20]:
#                                                                OFFLINE KNOWLEDGE DISTILLATION

# Phase 4 ===>  KNOWLEDGE DISTILLATION custom LOSS

def KD_loss(y_true,y_pred,lambd=0.5,T=10.0):
  """Weighted sum of a cross-entropy term and a temperature-scaled KL-divergence term.

  Both tensors are split along axis 1 at NUM_CLASSES: the first NUM_CLASSES
  columns feed the cross-entropy term, the remaining columns feed the
  KL-divergence term.

  Args:
    y_true: targets; columns [:NUM_CLASSES] and [NUM_CLASSES:] are used separately.
    y_pred: predictions laid out the same way as y_true.
    lambd: weight of the cross-entropy term; the KL term is weighted by (1 - lambd).
    T: temperature; the KL term is multiplied by T**2.

  Returns:
    lambd * cross-entropy + (1 - lambd) * T**2 * KL-divergence.
  """
  hard_targets, soft_targets = y_true[:, :NUM_CLASSES], y_true[:, NUM_CLASSES:]
  hard_preds, soft_preds = y_pred[:, :NUM_CLASSES], y_pred[:, NUM_CLASSES:]

  # Plain cross-entropy on the un-tempered columns.
  ce_term = logloss(hard_targets, hard_preds)
  # KL divergence on the tempered columns, rescaled by T squared.
  kl_term = (T ** 2) * KLD_Loss(soft_targets, soft_preds)

  return lambd * ce_term + (1 - lambd) * kl_term

def accuracy(y_true,y_pred):
  """Categorical accuracy computed on the first NUM_CLASSES columns only.

  KD_loss treats columns [:NUM_CLASSES] as the hard-label part and the
  remaining columns as the temperature-softened part. Taking the argmax over
  the whole row would let the softened columns take part in the comparison,
  so both tensors are cut to the hard-label columns first. For tensors that
  are exactly NUM_CLASSES wide the slice is a no-op.

  Args:
    y_true: targets whose first NUM_CLASSES columns hold the hard labels.
    y_pred: predictions whose first NUM_CLASSES columns hold the un-tempered outputs.

  Returns:
    The result of categorical_accuracy on the sliced tensors.
  """
  return categorical_accuracy(y_true[:, :NUM_CLASSES], y_pred[:, :NUM_CLASSES])

In [21]:
#                                                                OFFLINE KNOWLEDGE DISTILLATION

# Phase 5 - END ===> Train the complete STUDENT model (with its SOFTMAX heads) using the custom KD LOSS.

student_model_with_softmax.compile(
    optimizer='adam',
    # The lambda fixes lambd and T so the loss has the (y_true, y_pred) signature.
    loss=lambda y_true,y_pred: KD_loss(y_true, y_pred,lambd=0.5,T=Temperature),
    metrics=[accuracy],
)
# Checkpoint callback: monitors 'val_accuracy' and writes only when it improves.
myCP = ModelCheckpoint(
    filepath='student_model_trained_regular_kd.h5',
    monitor='val_accuracy',
    save_best_only=True,
)

student_model_with_softmax.fit(
    x=x_train,
    y=y_train_new,
    batch_size=128,
    epochs=50,
    validation_split=0.15,
    callbacks=[myCP],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f61eaf606a0>

In [22]:
# STANDALONE student model trained from SCRATCH, ONLY from the DATASET
aloneModel = student_generator()
aloneModel.summary()
# student_generator ends with Dense(10) and no activation, so this model outputs
# logits. The string loss 'sparse_categorical_crossentropy' expects probabilities,
# so the loss object with from_logits=True is used to apply the softmax inside the loss.
aloneModel.compile(optimizer='adam',loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),metrics=['accuracy'])

# save_best_only keeps only the best checkpoint. With metrics=['accuracy'],
# tf.keras (TF 2.x) logs the validation metric as 'val_accuracy'.
myCP = ModelCheckpoint(save_best_only=True,filepath='alone.h5',monitor = 'val_accuracy')

aloneModel.fit(x_train,y_train,epochs=50,validation_split=0.15,batch_size=128,callbacks=[myCP])

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_10 (Flatten)         (None, 3072)              0         
_________________________________________________________________
dense_20 (Dense)             (None, 64)                196672    
_________________________________________________________________
dense_21 (Dense)             (None, 10)                650       
Total params: 197,322
Trainable params: 197,322
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Ep

<tensorflow.python.keras.callbacks.History at 0x7f61e7b7ab70>

In [None]:
#                                                             INTERNAL OFFLINE KNOWLEDGE DISTILLATION
# Phase 1 ===> get the TARGET layer of the TEACHER
# NOTE(review): this cell is unfinished; it only displays the trained teacher.
# Presumably the target is the Dense layer named "teacher_target_layer" -- confirm.
teacher_model
