In [1]:
import keras
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential, Model
from keras.layers import concatenate
from keras.layers.core import Dense, Activation, Dropout, Flatten, Lambda 
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from scipy.ndimage.interpolation import rotate, shift, zoom
from keras.constraints import max_norm
from keras.layers import Dense, Dropout
from keras.layers import Flatten,  MaxPooling2D, Conv2D
from keras.callbacks import TensorBoard
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import json
from PIL import Image
import h5py
import matplotlib.pyplot as plt
from keras.utils import to_categorical
from prediction_utils import predict_proba
from shufflenet import get_shufflenet

Using TensorFlow backend.


In [2]:
# Remember the current TensorFlow log verbosity in `old_v`, then switch the
# logger to ERROR level for the rest of the notebook.
# NOTE(review): `old_v` is not restored anywhere in the visible cells.
old_v = tf.logging.get_verbosity()
tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
def SOFTMAX(s_):
    """Numerically stable softmax along axis 0.

    Args:
        s_: array-like of scores. A 1-D input is normalised as a single
            distribution; for a 2-D input every column is normalised
            independently (axis 0 is the class axis).

    Returns:
        A float64 ``numpy.ndarray`` with the same shape as ``s_`` whose
        entries along axis 0 sum to 1. An empty input is returned unchanged.
    """
    scores = np.asarray(s_, dtype=np.float64)
    if scores.size == 0:
        # Nothing to normalise; also avoids np.max raising on an empty array.
        return scores
    # Subtracting the per-distribution maximum leaves the softmax unchanged
    # mathematically but keeps np.exp from overflowing to inf (inf / inf = nan)
    # when the scores are large.
    shifted = scores - np.max(scores, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=0, keepdims=True)

In [4]:
def soft_targets(temp, logits, n_classes):
    """Turn teacher logits into temperature-softened probability targets.

    Args:
        temp: temperature the logits are divided by before the softmax.
        logits: 2-D array-like with one row of ``n_classes`` logits per sample.
        n_classes: expected number of columns in ``logits``.

    Returns:
        A float64 array of shape ``(len(logits), n_classes)`` in which every
        row is ``SOFTMAX(row / temp)``.

    Raises:
        ValueError: if ``logits`` is not 2-D with ``n_classes`` columns.
    """
    if len(logits) == 0:
        return np.zeros((0, n_classes))
    logits = np.asarray(logits, dtype=np.float64)
    if logits.ndim != 2 or logits.shape[1] != n_classes:
        raise ValueError(
            'logits must have shape (n_samples, %d), got %r'
            % (n_classes, logits.shape))
    # SOFTMAX normalises along axis 0, so put the class axis first, soften all
    # samples in one vectorised call instead of one Python iteration per row,
    # and transpose back to (n_samples, n_classes).
    return np.ascontiguousarray(SOFTMAX((logits / temp).T).T)

In [5]:
def knowledge_distillation_loss(y_true, y_pred, alpha):
    """Combined hard-label / soft-target cross-entropy.

    Both ``y_true`` and ``y_pred`` are split column-wise at the module-level
    ``nb_classes``: the first ``nb_classes`` columns form the hard part, the
    remaining columns the soft part. The result is
    ``alpha * CE(hard_true, hard_pred) + CE(soft_true, soft_pred)``.
    """
    crossentropy = tf.keras.losses.categorical_crossentropy

    # Hard part: leading nb_classes columns of each tensor.
    hard_targets = y_true[:, :nb_classes]
    hard_outputs = y_pred[:, :nb_classes]
    # Soft part: everything after the first nb_classes columns.
    softened_targets = y_true[:, nb_classes:]
    softened_outputs = y_pred[:, nb_classes:]

    weighted_hard_term = alpha * crossentropy(hard_targets, hard_outputs)
    soft_term = crossentropy(softened_targets, softened_outputs)
    return weighted_hard_term + soft_term

def acc(y_true, y_pred):
    """Categorical accuracy computed on the first ``nb_classes`` columns only.

    Columns from index ``nb_classes`` onwards are ignored for both tensors.
    """
    hard_columns = slice(None, nb_classes)
    return tf.keras.metrics.categorical_accuracy(
        y_true[:, hard_columns], y_pred[:, hard_columns])

In [6]:
# JSON configuration of the trained run; this file is created after training
# is finished.
config_path = 'logs/run0/model_config.txt'

# folder where validation dataset is
# NOTE(review): not referenced by any of the visible cells.
validation_images = 'tiny-imagenet-200/validation/'

# folder name -> class index mapping; this file is created when you run
# `image_dataset_to_tfrecords.py`
class_encoder_path = 'tiny-imagenet-200/class_encoder.npy'

# folder name -> human readable class name table; this file comes with the
# dataset
class_names_file = 'tiny-imagenet-200/words.txt'

In [7]:
# folder name -> class name in human readable format
# (the file is read as a headerless, tab-separated table: column 0 holds the
# key, column 1 the readable name)
class_names = pd.read_csv(class_names_file, sep='\t', header=None)
names = class_names.set_index(0)[1].to_dict()

# folder name -> class index
# The .npy file stores a dict inside a 0-d object array, which NumPy can only
# restore by unpickling; recent NumPy versions refuse to do that unless
# allow_pickle=True is passed explicitly.
# SECURITY: unpickling can execute arbitrary code - only load an encoder file
# you generated yourself.
encoder = np.load(class_encoder_path, allow_pickle=True).item()

# class index -> class name in human readable format
decoder = {class_index: names[folder] for folder, class_index in encoder.items()}

In [8]:
# Teacher logits for the training and validation images. Each .npz archive is
# opened in a `with` block so its file handle is closed as soon as the array
# stored under the key 'arr_0' has been read into memory.
with np.load('new_teacher_train_logits.npz') as archive:
    teacher_train_logits = archive['arr_0']
with np.load('new_teacher_val_logits.npz') as archive:
    teacher_val_logits = archive['arr_0']

In [9]:
# Validation and training inputs. The archives are opened in `with` blocks so
# their file handles are closed once the 'arr_0' arrays are in memory.
with np.load('x_val.npz') as archive:
    x_val = archive['arr_0']
with np.load('new_x_train.npz') as archive:
    x_train = archive['arr_0']

In [10]:
# Training and validation labels. The archives are opened in `with` blocks so
# their file handles are closed once the 'arr_0' arrays are in memory.
with np.load('new_y_train.npz') as archive:
    y_train = archive['arr_0']
with np.load('y_val.npz') as archive:
    y_val = archive['arr_0']

In [None]:
# Let's verify that the teacher's predictions on the validation set are the same as before:

In [11]:
# Read the JSON configuration of the trained run. `config_path` is already
# defined in the paths cell near the top of the notebook, so it is not
# repeated here; the `with` block makes sure the file handle is closed.
with open(config_path) as config_file:
    config = json.load(config_file)

# Build the ShuffleNet graph with the hyper-parameters stored in the config.
graph, ops = get_shufflenet(
    groups=config['groups'],
    complexity_scale_factor=config['complexity_scale_factor'],
)

In [12]:
# Run the network built above on the validation inputs, then keep the index
# of the largest output in every row as the predicted class.
predictions_val = predict_proba(graph, ops, x_val, run=config['run'])
predictions_argmax = np.argmax(predictions_val, axis=1)

In [13]:
# Fraction of validation labels that equal the predicted class index.
(y_val == predictions_argmax).mean()

0.5072213181448332

In [None]:
# Let's verify that the teacher's predictions on the training set are the same as before:

In [15]:
# Predict on the training set in chunks of `batch_size` rows so the whole set
# never has to go through the network at once.
batch_size = 10000
predictions_training = []
# Stepping the start offset by batch_size yields only non-empty slices, even
# when the number of rows is an exact multiple of batch_size (counting batches
# as `n // batch_size + 1` would append one empty batch in that case).
for batch_start in range(0, x_train.shape[0], batch_size):
    X_batch = x_train[batch_start:batch_start + batch_size]
    batch_pred = predict_proba(graph, ops, X_batch, run=config['run'])
    predictions_training.append(batch_pred)
# Join all batches with a single concatenate instead of re-copying the growing
# result once per batch.
concatenated_predictions_training = np.concatenate(predictions_training, axis=0)


In [17]:
# Predicted class per training row: index of the largest output in that row.
# Note that this rebinds `predictions_argmax`, which previously held the
# validation predictions.
predictions_argmax = np.argmax(concatenated_predictions_training, axis=1)

In [18]:
# Fraction of training labels that equal the predicted class index.
(y_train == predictions_argmax).mean()

0.598620886340256

In [15]:
# encoding labels in one hot

In [14]:
nb_classes = 200
# One-hot representation. The width is fixed explicitly: without num_classes,
# to_categorical derives it from the largest label present, so a split that
# happens to lack the highest class index would get fewer than nb_classes
# columns and no longer line up with the nb_classes-wide model outputs.
# NOTE: this rebinds y_train / y_val, so re-running the cell without reloading
# the integer labels would one-hot encode the already encoded arrays.
y_train = to_categorical(y_train, num_classes=nb_classes)
y_val = to_categorical(y_val, num_classes=nb_classes)

In [None]:
#Training without distillation:

In [16]:
# Baseline trained without distillation: the 56x56x3 input is flattened and
# passed through two 800-unit ReLU layers and an nb_classes-way softmax.
model = Sequential([
    Flatten(input_shape=(56, 56, 3)),
    Dense(800, activation='relu'),
    Dense(800, activation='relu'),
    Dense(nb_classes),
    Activation('softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# TensorBoard callback handed to model.fit for this baseline run.
tensor_board = TensorBoard(log_dir='./logs/tiny-imagenet-t')

In [17]:
# Train the baseline on the one-hot labels, validating on the validation
# split and logging through the TensorBoard callback.
model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=50,
    verbose=1,
    validation_data=(x_val, y_val),
    callbacks=[tensor_board],
)

Train on 98179 samples, validate on 9832 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f5e0c1d8d68>

In [32]:
#Training with distillation:

In [18]:
# Soft targets: teacher logits divided by the temperature `temp` and passed
# through a softmax, for both the training and the validation split.
temp = 20
nb_classes = 200
train_soft_targets = soft_targets(
    temp=temp, logits=teacher_train_logits, n_classes=nb_classes)
val_soft_targets = soft_targets(
    temp=temp, logits=teacher_val_logits, n_classes=nb_classes)

In [20]:
# Put the one-hot labels and the soft targets side by side (labels first), so
# a single target array carries both parts for the distillation loss.
Y_train_new = np.concatenate((y_train, train_soft_targets), axis=1)
Y_val_new = np.concatenate((y_val, val_soft_targets), axis=1)

In [23]:
# Weight later passed as `alpha` to the distillation loss: 0.5 / temp^2.
loss_weight = 0.5 / temp**2

In [24]:
# Display the weight computed in the previous cell.
loss_weight

0.00125

In [25]:
# Sanity check: with temp = 20, 0.5 / temp**2 equals 1 / 800.
1 / 800

0.00125

In [39]:
# Student model: the same MLP as the baseline, except that its last Dense
# layer emits raw logits from which two softmax heads are derived.
#
# `temp` is intentionally NOT reassigned in this cell. The soft half of
# Y_train_new / Y_val_new was produced by soft_targets(temp, ...) in the
# soft-targets cell, and the student's softened head must use that same
# temperature; overriding it here would train the student against targets
# softened at a different temperature. To try another temperature, change it
# in the soft-targets cell and re-run from there.
nb_classes = 200
student_base = Sequential([
    Flatten(input_shape=(56, 56, 3)),
    Dense(800, activation='relu'),
    Dense(800, activation='relu'),
    Dense(nb_classes),  # logits only, no softmax
])
# The logits-only network is never trained on its own, so it is not compiled;
# only the two-headed Model built below is.

logits = student_base.layers[-1].output
# Head 1: ordinary softmax, matched against the one-hot labels.
probs = Activation('softmax')(logits)

# Head 2: softmax of logits / temp, matched against the teacher soft targets.
# The temperature is handed over through `arguments` so the layer does not
# depend on a notebook global inside the lambda.
logits_T = Lambda(lambda x, temperature: x / temperature,
                  arguments={'temperature': temp})(logits)
probs_T = Activation('softmax')(logits_T)

# Same column layout as the targets: hard part first, soft part second.
output = concatenate([probs, probs_T])

student_m = Model(student_base.input, output)

# Weight applied to the hard-label term of the distillation loss.
loss_weight = 0.5 / temp**2
student_m.compile(optimizer='adam',
                  loss=lambda y_true, y_pred: knowledge_distillation_loss(y_true, y_pred, loss_weight),
                  metrics=[acc])

In [33]:
# Train the student on the combined (one-hot + soft) targets.
student_m.fit(
    x_train,
    Y_train_new,
    batch_size=256,
    epochs=100,
    verbose=1,
    validation_data=(x_val, Y_val_new),
)

Train on 98179 samples, validate on 9832 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/10

<keras.callbacks.History at 0x7f5da6e53c18>

In [23]:
# Second fit call on the same `student_m` object, with the same data and
# batch size but 50 epochs.
student_m.fit(
    x_train,
    Y_train_new,
    batch_size=256,
    epochs=50,
    verbose=1,
    validation_data=(x_val, Y_val_new),
)

Train on 98179 samples, validate on 9832 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f987784beb8>