In [1]:
import keras
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential, Model
from keras.layers import concatenate
from keras.layers.core import Dense, Activation, Dropout, Flatten, Lambda 
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from scipy.ndimage.interpolation import rotate, shift, zoom
from keras.constraints import max_norm
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import json
from PIL import Image
import h5py
import matplotlib.pyplot as plt
from keras.utils import to_categorical
from prediction_utils import predict_proba
from shufflenet import get_shufflenet

Using TensorFlow backend.


In [2]:
# Remember TensorFlow's current log level, then raise the threshold so that only
# errors are reported.
# NOTE(review): `old_v` is not read again in the cells visible here; call
# tf.logging.set_verbosity(old_v) later if the quieter logging is meant to be temporary.
old_v = tf.logging.get_verbosity()
tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
def SOFTMAX(s_):
    """Numerically stable softmax along the first axis of ``s_``.

    Args:
        s_: array-like of scores. A 1-D input is treated as a single score
            vector; for a 2-D input every column is normalised independently.

    Returns:
        float64 ndarray with the same shape as ``s_`` whose entries sum to 1
        along axis 0.
    """
    scores = np.asarray(s_, dtype=np.float64)
    if scores.size == 0:
        # Nothing to normalise; hand back an empty array of the same shape.
        return scores.copy()
    # Softmax is invariant to adding a constant, so subtract the maximum first:
    # the largest exponent becomes exp(0) = 1 and np.exp can no longer overflow
    # to inf (which previously turned the result into NaN for large logits).
    shifted = scores - scores.max(axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=0, keepdims=True)

In [4]:
def soft_targets(temp, logits, n_classes):
    """Turn logits into temperature-softened probability rows.

    Each row of ``logits`` is divided by ``temp`` and passed through a softmax,
    so larger temperatures give flatter distributions.

    Args:
        temp: softmax temperature; must be non-zero.
        logits: 2-D array-like of shape (n_samples, n_classes).
        n_classes: expected number of columns in ``logits``.

    Returns:
        float64 ndarray of shape (len(logits), n_classes); every row sums to 1.

    Raises:
        ValueError: if ``temp`` is zero or ``logits`` is not
            (n_samples, n_classes).
    """
    if temp == 0:
        raise ValueError("temp must be non-zero")
    if len(logits) == 0:
        return np.zeros((0, n_classes))

    # The division allocates a fresh array, so the in-place steps below never
    # modify the caller's logits.
    scaled = np.asarray(logits, dtype=np.float64) / temp
    if scaled.ndim != 2 or scaled.shape[1] != n_classes:
        raise ValueError(
            "logits must have shape (n_samples, %d), got %r"
            % (n_classes, scaled.shape)
        )

    # Row-wise softmax in one vectorised pass. Subtracting each row's maximum
    # leaves the result unchanged but keeps np.exp from overflowing.
    scaled -= scaled.max(axis=1, keepdims=True)
    np.exp(scaled, out=scaled)
    scaled /= scaled.sum(axis=1, keepdims=True)
    return scaled

In [5]:
def knowledge_distillation_loss(y_true, y_pred, alpha):
    """Hard-label cross-entropy (weighted by ``alpha``) plus soft-target cross-entropy.

    Both tensors are split at column ``nb_classes`` (a notebook-level global):
    the first ``nb_classes`` columns carry the one-hot labels / student
    probabilities, the remaining columns carry the soft targets / softened
    student probabilities.

    Args:
        y_true: target tensor laid out as [one-hot | soft targets].
        y_pred: prediction tensor laid out the same way.
        alpha: weight applied to the hard-label term.

    Returns:
        The per-sample loss tensor ``alpha * CE(hard) + CE(soft)``.
    """
    # Hard part: everything left of the split.
    hard_true = y_true[:, :nb_classes]
    hard_pred = y_pred[:, :nb_classes]
    # Soft part: everything right of the split.
    soft_true = y_true[:, nb_classes:]
    soft_pred = y_pred[:, nb_classes:]

    hard_loss = tf.keras.losses.categorical_crossentropy(hard_true, hard_pred)
    soft_loss = tf.keras.losses.categorical_crossentropy(soft_true, soft_pred)
    return alpha * hard_loss + soft_loss

def acc(y_true, y_pred):
    """Categorical accuracy computed on the hard-label columns only.

    Only the first ``nb_classes`` columns (a notebook-level global) of each
    tensor are compared; the soft-target columns are ignored.
    """
    return tf.keras.metrics.categorical_accuracy(
        y_true[:, :nb_classes],
        y_pred[:, :nb_classes],
    )

In [6]:
# Model config file; it is created after training is finished.
# NOTE(review): not referenced by any of the cells visible here.
config_path = 'logs/run0/model_config.txt'

# Folder that holds the validation dataset.
# NOTE(review): not referenced by any of the cells visible here.
validation_images = 'tiny-imagenet-200/validation/'

# Class encoder; this file is created when you run `image_dataset_to_tfrecords.py`.
class_encoder_path = 'tiny-imagenet-200/class_encoder.npy'

# Class-name table; this file comes with the dataset.
class_names_file = 'tiny-imagenet-200/words.txt'

In [7]:
# folder name -> class name in human readable format
# (the file is read as tab-separated without a header: column 0 is the folder name,
# column 1 the readable name)
class_names = pd.read_csv(class_names_file, sep='\t', header=None)
names = dict(class_names.set_index(0)[1])

# folder name -> class index
# The .npy file holds a Python dict wrapped in a 0-d object array. NumPy >= 1.16.3
# refuses to unpickle object arrays unless allow_pickle=True is passed, so opt in
# explicitly and unwrap the dict with .item().
# NOTE: unpickling can execute arbitrary code - only load encoder files you created yourself.
encoder = np.load(class_encoder_path, allow_pickle=True).item()

# class index -> class name in human readable format
decoder = {class_index: names[folder] for folder, class_index in encoder.items()}

In [8]:
# Teacher logits for the training and validation images.
# np.load on an .npz archive returns an NpzFile that keeps the file open; reading the
# array inside a `with` block closes the handle as soon as the data is in memory.
# 'arr_0' is the key np.savez assigns to an array that was saved positionally.
with np.load('teacher_logits_train.npz') as npz:
    teacher_train_logits = npz['arr_0']
with np.load('teacher_logits_val.npz') as npz:
    teacher_val_logits = npz['arr_0']

In [9]:
# Input arrays for validation and training (fed to the student model's fit call).
# The `with` blocks close each .npz archive once its array has been read.
with np.load('x_val.npz') as npz:
    x_val = npz['arr_0']
with np.load('x_train.npz') as npz:
    x_train = npz['arr_0']

In [10]:
# File names of the training images that were kept; the part before the first '_' is
# later used as the class folder name. The `with` block closes the .npz archive once
# the array has been read.
with np.load('teacher_kept_image_filenames_train.npz') as npz:
    x_train_files = npz['arr_0']

In [11]:
# Validation labels as stored on disk (presumably integer class indices, since they are
# one-hot encoded afterwards - TODO confirm). The `with` block closes the .npz archive
# once the array has been read.
with np.load('y_val.npz') as npz:
    y_val = npz['arr_0']
# getting labels category numbers , ex: n02056570
# (the text before the first '_' of each kept file name)
labels_number_list = [kept_filename.split('_')[0] for kept_filename in x_train_files]
# translating the label category number into the corresponding index
y_train = [encoder[label_nb] for label_nb in labels_number_list]

In [12]:
#one hot representation
# NOTE(review): this cell rebinds y_train / y_val in place, so running it a second time
# would one-hot encode the already encoded arrays; re-run the label-loading cell first.
# NOTE(review): no num_classes is passed, so each width is inferred from the largest
# label present. The loss splits the targets at nb_classes columns - confirm that both
# arrays really come out nb_classes wide.
y_train = to_categorical(y_train)
y_val= to_categorical(y_val)

In [13]:
#soft targets
# Softmax temperature applied to the teacher logits; the student model's Lambda layer
# divides its logits by this same global.
temp = 20
# Number of classes. Read as a global by knowledge_distillation_loss and acc, which
# split the concatenated targets and predictions at this column.
nb_classes = 200
# Temperature-softened teacher probabilities, one row per entry of the logits arrays.
train_soft_targets = soft_targets(temp, teacher_train_logits, nb_classes)
val_soft_targets = soft_targets(temp, teacher_val_logits, nb_classes)

In [14]:
# Glue the one-hot labels and the soft targets side by side: columns [0, nb_classes)
# hold the hard labels, the remaining columns hold the teacher's softened probabilities.
Y_train_new = np.hstack((y_train, train_soft_targets))
Y_val_new = np.hstack((y_val, val_soft_targets))

In [15]:
#student model:
# A plain MLP on flattened 56x56x3 inputs whose last Dense layer emits raw logits.
# It only serves as the layer stack for the distillation model built below, so it is
# deliberately left uncompiled; only the distillation model is compiled and trained.
student_base = Sequential()
student_base.add(Flatten(input_shape=(56,56,3)))
student_base.add(Dense(800, activation='relu'))
student_base.add(Dense(800, activation='relu'))
student_base.add(Dense(nb_classes))
# summary() prints the table itself and returns None, so it is not wrapped in print().
student_base.summary()

# Two softmax heads on the same logits: a plain one for the hard labels and one on
# logits divided by `temp` for matching the teacher's softened distribution.
logits = student_base.layers[-1].output
probs = Activation('softmax')(logits)

logits_T = Lambda(lambda x: x / temp)(logits)
probs_T = Activation('softmax')(logits_T)

# Output layout is [probs | probs_T], the same [hard | soft] layout as
# Y_train_new / Y_val_new, so the loss can split both at nb_classes.
output = concatenate([probs, probs_T])

student_m = Model(student_base.input, output)

# The third argument is alpha = 1: the hard-label cross-entropy gets the same weight
# as the soft-target term.
# NOTE(review): the soft term is not rescaled by temp**2, which distillation setups
# often do to keep its gradient magnitude comparable - confirm this is intended.
student_m.compile(optimizer='SGD',
                  loss=lambda y_true, y_pred: knowledge_distillation_loss(y_true, y_pred, 1),
                  metrics=[acc])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 9408)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 800)               7527200   
_________________________________________________________________
dense_2 (Dense)              (None, 800)               640800    
_________________________________________________________________
dense_3 (Dense)              (None, 200)               160200    
Total params: 8,328,200
Trainable params: 8,328,200
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
# Train the distillation model on the concatenated [hard | soft] targets and evaluate
# on the validation split after every epoch.
student_m.fit(
    x=x_train,
    y=Y_train_new,
    validation_data=(x_val, Y_val_new),
    batch_size=256,
    epochs=50,
    verbose=1,
)

Train on 98179 samples, validate on 9832 samples
Epoch 1/50

KeyboardInterrupt: 