In [23]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from CNN import ThreeLayerConvNet
from model import myModel

from data_utils import unpickle
from ResNet_Tensorflow.utils import load_cifar100
from ResNet_Tensorflow.ResNet import ResNet
from keras.models import Model
from cifar100.cifar_resnet import SmallResNet
from cifar100.cifar100vgg import cifar100vgg

import time
import pickle

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [77]:
# Load the CIFAR-100 train/test splits through the project's ResNet_Tensorflow helper.
# The shape printout that follows shows NHWC image arrays and 100-column label arrays.
x_train, y_train, x_test, y_test = load_cifar100()

In [48]:
# Sanity-check the array shapes of both splits (images first, labels second).
for split_x, split_y in ((x_train, y_train), (x_test, y_test)):
    print(split_x.shape, split_y.shape)

(50000, 32, 32, 3) (50000, 100)
(10000, 32, 32, 3) (10000, 100)


In [78]:
# Prepare data to train with the small model.
# Images are moved from NHWC to NCHW (the small net is built with input_dim=(3, 32, 32)),
# one-hot labels are collapsed to class indices, and the last 1000 training
# samples are held out as the validation split.
data = dict(
    X_train=x_train[:49000].transpose(0, 3, 1, 2).copy(),
    y_train=np.argmax(y_train[:49000], axis=1),
    X_val=x_train[49000:].transpose(0, 3, 1, 2).copy(),
    y_val=np.argmax(y_train[49000:], axis=1),
    X_test=x_test.transpose(0, 3, 1, 2).copy(),
    y_test=np.argmax(y_test, axis=1),
)

In [53]:
# Show the image/label shapes of each split stored in `data`.
for x_key, y_key in (('X_train', 'y_train'), ('X_val', 'y_val'), ('X_test', 'y_test')):
    print(data[x_key].shape, data[y_key].shape)

(49000, 3, 32, 32) (49000,)
(1000, 3, 32, 32) (1000,)
(10000, 3, 32, 32) (10000,)


In [51]:
# Mean pixel value of the first training image: a quick look at the scale of the inputs.
print(np.mean(x_train[0]))

0.2736978722097671


In [27]:
# Small three-layer conv net with 5x5 filters; no logit_distill is passed,
# so this run is the non-distilled reference for the experiments below.
net = ThreeLayerConvNet(
    input_dim=(3, 32, 32),
    num_filters=16,
    filter_size=5,
    hidden_dim=512,
    num_classes=100,
    reg=0.001,
    weight_scale=0.1,
)

small_model = myModel(
    net,
    data,
    num_epochs=2,
    batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True,
    print_every=100,
    checkpoint_name='checkpoints/small_model_cifar100',
)

# Wall-clock timing of the whole training run.
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ', toc - tic)

test_accuracy = small_model.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: {}'.format(test_accuracy))

(Iteration 1 / 980) loss: 4.611361
Saving checkpoint to "checkpoints/small_model_cifar100_epoch_0.pkl"
(Epoch 0 / 2) train acc: 0.016673; val_acc: 0.010000
(Iteration 101 / 980) loss: 4.040595
(Iteration 201 / 980) loss: 3.700560
(Iteration 301 / 980) loss: 3.088319
(Iteration 401 / 980) loss: 3.812060
Saving checkpoint to "checkpoints/small_model_cifar100_epoch_1.pkl"
(Epoch 1 / 2) train acc: 0.227633; val_acc: 0.209000
(Iteration 501 / 980) loss: 3.471729
(Iteration 601 / 980) loss: 3.433722
(Iteration 701 / 980) loss: 3.290821
(Iteration 801 / 980) loss: 3.303398
(Iteration 901 / 980) loss: 3.190131
Saving checkpoint to "checkpoints/small_model_cifar100_epoch_2.pkl"
(Epoch 2 / 2) train acc: 0.293041; val_acc: 0.234000
Execution time:  1476.2186529636383
Test accuracy: 0.2561


In [12]:
# Same small net, but with 7x7 filters and a longer 5-epoch schedule;
# checkpoints go to a separate '_5epoch' prefix.
net = ThreeLayerConvNet(
    input_dim=(3, 32, 32),
    num_filters=16,
    filter_size=7,
    hidden_dim=512,
    num_classes=100,
    reg=0.001,
    weight_scale=0.1,
)

small_model = myModel(
    net,
    data,
    num_epochs=5,
    batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True,
    print_every=100,
    checkpoint_name='checkpoints/small_model_cifar100_5epoch',
)

# Wall-clock timing of the whole training run.
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ', toc - tic)

test_accuracy = small_model.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: {}'.format(test_accuracy))

(Iteration 1 / 2450) loss: 4.611355
Saving checkpoint to "checkpoints/small_model_cifar100_5epoch_epoch_0.pkl"
(Epoch 0 / 5) train acc: 0.020878; val_acc: 0.026000
(Iteration 101 / 2450) loss: 3.761836
(Iteration 201 / 2450) loss: 3.713803
(Iteration 301 / 2450) loss: 3.501951
(Iteration 401 / 2450) loss: 3.283360
Saving checkpoint to "checkpoints/small_model_cifar100_5epoch_epoch_1.pkl"
(Epoch 1 / 5) train acc: 0.242082; val_acc: 0.197000
(Iteration 501 / 2450) loss: 3.376592
(Iteration 601 / 2450) loss: 3.289839
(Iteration 701 / 2450) loss: 3.061102
(Iteration 801 / 2450) loss: 2.918731
(Iteration 901 / 2450) loss: 2.930805
Saving checkpoint to "checkpoints/small_model_cifar100_5epoch_epoch_2.pkl"
(Epoch 2 / 5) train acc: 0.306776; val_acc: 0.256000
(Iteration 1001 / 2450) loss: 3.341122
(Iteration 1101 / 2450) loss: 3.110064
(Iteration 1201 / 2450) loss: 3.280368
(Iteration 1301 / 2450) loss: 2.934712
(Iteration 1401 / 2450) loss: 2.807596
Saving checkpoint to "checkpoints/small_mod

# Distill knowledge from ResNet18

In [79]:
# To save time, load the cached logits of the ResNet18 teacher from disk
# instead of running the teacher again.
# This model has 87.26% train accuracy and 68.81% test accuracy
logits_train_res18 = unpickle('./ResNet-Tensorflow/resnet18_logits_train.txt')

In [80]:
# Train the small model while distilling from ResNet18 (temperature 5).
# temperature and logit_distill are forwarded to myModel; how they enter the
# loss is defined in model.py, which is not shown in this notebook.
small_net = ThreeLayerConvNet(
    input_dim=(3, 32, 32),
    num_filters=16,
    filter_size=5,
    hidden_dim=512,
    num_classes=100,
    reg=0.001,
    weight_scale=0.1,
)

distill_model = myModel(
    small_net,
    data,
    num_epochs=2,
    batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True,
    print_every=100,
    temperature=5.0,
    logit_distill=logits_train_res18,
)

# Wall-clock timing of the whole training run.
tic = time.time()
distill_model.train()
toc = time.time()
print('Execution time: ', toc - tic)

test_accuracy = distill_model.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: {}'.format(test_accuracy))

(Iteration 1 / 980) loss: 59.873485
(Epoch 0 / 2) train acc: 0.012245; val_acc: 0.017000
(Iteration 101 / 980) loss: 58.004536
(Iteration 201 / 980) loss: 57.094500
(Iteration 301 / 980) loss: 56.669959
(Iteration 401 / 980) loss: 56.520184
(Epoch 1 / 2) train acc: 0.250633; val_acc: 0.224000
(Iteration 501 / 980) loss: 56.381089
(Iteration 601 / 980) loss: 56.571088
(Iteration 701 / 980) loss: 56.215117
(Iteration 801 / 980) loss: 55.882912
(Iteration 901 / 980) loss: 56.269351
(Epoch 2 / 2) train acc: 0.329918; val_acc: 0.269000
Execution time:  1548.218759059906
Test accuracy: 0.2908


In [81]:
# Repeat the ResNet18 distillation with a lower temperature (2.0);
# every other setting matches the temperature-5 run.
small_net = ThreeLayerConvNet(
    input_dim=(3, 32, 32),
    num_filters=16,
    filter_size=5,
    hidden_dim=512,
    num_classes=100,
    reg=0.001,
    weight_scale=0.1,
)

distill_model = myModel(
    small_net,
    data,
    num_epochs=2,
    batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True,
    print_every=100,
    temperature=2.0,
    logit_distill=logits_train_res18,
)

# Wall-clock timing of the whole training run.
tic = time.time()
distill_model.train()
toc = time.time()
print('Execution time: ', toc - tic)

test_accuracy = distill_model.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: {}'.format(test_accuracy))

(Iteration 1 / 980) loss: 11.519054
(Epoch 0 / 2) train acc: 0.026224; val_acc: 0.034000
(Iteration 101 / 980) loss: 10.375120
(Iteration 201 / 980) loss: 9.378027
(Iteration 301 / 980) loss: 8.848267
(Iteration 401 / 980) loss: 9.114535
(Epoch 1 / 2) train acc: 0.241388; val_acc: 0.194000
(Iteration 501 / 980) loss: 8.459070
(Iteration 601 / 980) loss: 9.038652
(Iteration 701 / 980) loss: 8.882652
(Iteration 801 / 980) loss: 8.657711
(Iteration 901 / 980) loss: 8.091252
(Epoch 2 / 2) train acc: 0.328714; val_acc: 0.248000
Execution time:  1549.500007867813
Test accuracy: 0.285


In [None]:
# Train the small model while distilling from ResNet18 with a high temperature (20.0);
# every other setting matches the earlier ResNet18 distillation runs.
small_net = ThreeLayerConvNet(
    input_dim=(3, 32, 32),
    num_filters=16,
    filter_size=5,
    hidden_dim=512,
    num_classes=100,
    reg=0.001,
    weight_scale=0.1,
)

distill_model = myModel(
    small_net,
    data,
    num_epochs=2,
    batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True,
    print_every=100,
    temperature=20.0,
    logit_distill=logits_train_res18,
)

# Wall-clock timing of the whole training run.
tic = time.time()
distill_model.train()
toc = time.time()
print('Execution time: ', toc - tic)

test_accuracy = distill_model.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: {}'.format(test_accuracy))

# Distill knowledge from ResNet110

In [53]:
# Build the ResNet teacher and load the pretrained ResNet-110 CIFAR-100 weights file.
# top_activation=None presumably leaves off the final softmax so predict() returns
# raw logits (they are passed through SoftMax further down) — TODO confirm in cifar_resnet.
net_res = SmallResNet(n=18, top_activation=None, weights='./cifar100/resnet-110_cifar100.model.h5')

In [54]:
# Print the layer-by-layer summary of the teacher network.
net_res.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
conv0 (Conv2D)                  (None, None, None, 1 448         input_7[0][0]                    
__________________________________________________________________________________________________
bn0 (BatchNormalization)        (None, None, None, 1 64          conv0[0][0]                      
__________________________________________________________________________________________________
activation_1117 (Activation)    (None, None, None, 1 0           bn0[0][0]                        
__________________________________________________________________________________________________
res1-1x (C

In [57]:
# Teacher outputs for the first 49000 training images, the same slice that
# forms data['X_train'], so each student sample has a matching teacher row.
logits_train_res = net_res.predict(x_train[:49000], verbose = 1)



In [39]:
def SoftMax(s, axis=1):
    """Numerically stable softmax along one axis.

    Parameters
    ----------
    s : array_like
        Scores/logits. With the default ``axis=1`` this is an N x K matrix
        (one row per sample, one column per class).
    axis : int, optional
        Axis holding the class scores. Defaults to 1, which matches the
        original row-wise behaviour.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``s`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    # Subtract the per-slice maximum so the largest exponent is exp(0) = 1;
    # this avoids overflow for large scores without changing the result.
    shifted = s - np.max(s, axis=axis, keepdims=True)
    # Exponentiate once and reuse it for both numerator and normalizer.
    exp_shifted = np.exp(shifted)
    return exp_shifted / exp_shifted.sum(axis=axis, keepdims=True)

In [58]:
# Inspect the teacher's training logits, then score its top-1 predictions
# against the labels of the same 49000-sample slice.
print(logits_train_res.shape)
print(logits_train_res[0])

teacher_probs = SoftMax(logits_train_res)
y_pred_big = np.argmax(teacher_probs, axis=1)
y_true = np.argmax(y_train[:49000], axis=1)

is_correct = (y_true == y_pred_big)
print('Train accuracy of big model: {}'.format(np.mean(is_correct)))

(49000, 100)
[-1.5965376   2.0916662  -1.7080394  -2.3401544  -1.5550946  -7.9265337
 -1.9221675   1.0908899   3.6988983   4.865921    3.9860163  -4.4878407
 -1.5065367   4.4253283   2.149107   -4.5261736   1.1044347  -2.214517
 -0.60959107  2.3700893  -4.7188826   4.562145   -2.2339377  -0.43608427
  1.4681711  -0.21188563 -2.7075388   1.8217117  -3.0712423  -1.8068831
 -5.9322743   6.629902    3.0470107   3.1984599  -1.5094794   1.6511922
  4.640464    4.141948   -2.356608   -4.3477554  -2.7679079   4.5643945
 -1.0097531  -4.699317   -7.1383376  -3.9733102   5.1767006   1.4382753
  1.668138   -1.7402356  -0.0684002   2.003531   -1.9643902  -6.250108
 -0.2044217  -5.8434987   3.341186   -0.6098274  -0.32815063  3.3661585
  1.5814054  -6.343765    0.4769208   2.1709983  -0.5469464  -0.8419374
  1.518573   -1.3634694   2.8196032  -0.73783785 -3.8844328  -3.6488805
  5.6056137   4.701839    5.568709    5.429322    2.1272514  -0.21784377
 -0.08637193 -0.1858269   5.4145494   8.120225    0

In [59]:
# Cache the teacher's training logits on disk so a later cell can reload them
# instead of re-running predict().
# NOTE: the file is a binary pickle despite its .txt extension.
with open('./cifar100/resnet110_logits_train.txt', 'wb') as fp:
    pickle.dump(logits_train_res, fp)

In [55]:
# Run the ResNet teacher on the whole test split to get its test-set outputs.
logits_test = net_res.predict(x_test, verbose = 1)



In [56]:
# Show the first test logit vector, then score the teacher's top-1
# predictions against the test labels.
print(logits_test[0])

teacher_probs = SoftMax(logits_test)
y_pred_big = np.argmax(teacher_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

is_correct = (y_true == y_pred_big)
print('Test accuracy of big model: {}'.format(np.mean(is_correct)))

[-6.068798   -4.523218   -4.731172   -1.6792998   3.5741763  -0.15125123
 -1.2078682  -3.6526413   3.4743977  -3.005215   -3.1354995   2.5808342
  5.7947254   2.6016715  -2.0448287   5.173783   -2.3950481   3.4126356
 -3.6132193  -3.5915787  -6.2181554  -0.9666601   2.5290911   0.872355
  0.7275995   0.7811909  -3.4272017  -2.6549697  -2.0854492   0.10899204
  2.1402147   4.6930556   4.711023    3.0502515  -2.0846872   2.606184
 -1.7049557  -0.37278196 -0.36674178  0.09816748  6.8902707  -4.270845
  2.2859044  -0.65528595 -5.1995964  -1.9651096   5.2476544  -2.3290915
  2.1142879   8.304985   -1.3609399   2.003549   -1.9033939  -3.1886997
 -0.3502258  -0.5843327   4.484993   -1.893204    0.34000242  2.0272462
  1.6472216   0.9460925  -1.1196889   1.7885422   0.69869184 -2.1220722
  1.6792787  -0.23104388  8.554266    0.89954746 -5.844656    2.5973814
 -2.5295947  -2.0598173  -2.4477658   1.7708652   8.152147   -0.26964808
 -6.7581735   9.770423    3.3933237   0.15353978  3.8578472  -7.

In [44]:
# To save time, reload the cached ResNet110 training logits from the pickle
# written earlier instead of running the teacher again.
logits_train_res = unpickle('./cifar100/resnet110_logits_train.txt')

In [29]:
# Train the small model while distilling from ResNet110 (temperature 5).
# temperature and logit_distill are forwarded to myModel; how they enter the
# loss is defined in model.py, which is not shown in this notebook.
small_net = ThreeLayerConvNet(
    input_dim=(3, 32, 32),
    num_filters=16,
    filter_size=5,
    hidden_dim=512,
    num_classes=100,
    reg=0.001,
    weight_scale=0.1,
)

distill_model = myModel(
    small_net,
    data,
    num_epochs=2,
    batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True,
    print_every=100,
    temperature=5.0,
    logit_distill=logits_train_res,
    checkpoint_name='checkpoints/distill_model_cifar100',
)

# Wall-clock timing of the whole training run.
tic = time.time()
distill_model.train()
toc = time.time()
print('Execution time: ', toc - tic)

test_accuracy = distill_model.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: {}'.format(test_accuracy))

(Iteration 1 / 980) loss: 59.873581
Saving checkpoint to "checkpoints/distill_model_cifar100_epoch_0.pkl"
(Epoch 0 / 2) train acc: 0.013327; val_acc: 0.015000
(Iteration 101 / 980) loss: 56.560818
(Iteration 201 / 980) loss: 55.696028
(Iteration 301 / 980) loss: 54.858248
(Iteration 401 / 980) loss: 54.276219
Saving checkpoint to "checkpoints/distill_model_cifar100_epoch_1.pkl"
(Epoch 1 / 2) train acc: 0.224857; val_acc: 0.183000
(Iteration 501 / 980) loss: 54.723139
(Iteration 601 / 980) loss: 52.461798
(Iteration 701 / 980) loss: 52.780953
(Iteration 801 / 980) loss: 53.014063
(Iteration 901 / 980) loss: 52.673768
Saving checkpoint to "checkpoints/distill_model_cifar100_epoch_2.pkl"
(Epoch 2 / 2) train acc: 0.310224; val_acc: 0.261000
Execution time:  1475.1574792861938
Test accuracy: 0.2802


In [54]:
# Repeat the ResNet110 distillation with a lower temperature (2.0);
# every other training setting matches the temperature-5 run.
small_net = ThreeLayerConvNet(input_dim=(3, 32, 32), num_filters=16, filter_size=5,
                 hidden_dim=512, num_classes=100, reg=0.001, weight_scale=0.1)

# Use a temperature-specific checkpoint prefix: the temperature-5 run saves under
# 'checkpoints/distill_model_cifar100', and reusing that prefix here would
# overwrite its per-epoch checkpoint files.
distill_model = myModel(small_net, data,
                      num_epochs=2, batch_size=100,
                      optimizer='adam',
                      optim_config={
                          'learning_rate': 1e-3,},
                      verbose=True, print_every=100,
                      temperature=2.0,logit_distill=logits_train_res,
                      checkpoint_name='checkpoints/distill_model_cifar100_T2')

# Wall-clock timing of the whole training run.
tic = time.time()
distill_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(distill_model.check_accuracy(data['X_test'],data['y_test'])))

(Iteration 1 / 980) loss: 11.519195
Saving checkpoint to "checkpoints/distill_model_cifar100_epoch_0.pkl"
(Epoch 0 / 2) train acc: 0.013551; val_acc: 0.020000
(Iteration 101 / 980) loss: 9.875374
(Iteration 201 / 980) loss: 8.976517
(Iteration 301 / 980) loss: 8.953095
(Iteration 401 / 980) loss: 9.079626
Saving checkpoint to "checkpoints/distill_model_cifar100_epoch_1.pkl"
(Epoch 1 / 2) train acc: 0.231592; val_acc: 0.205000
(Iteration 501 / 980) loss: 8.623214
(Iteration 601 / 980) loss: 7.994780
(Iteration 701 / 980) loss: 8.240799
(Iteration 801 / 980) loss: 7.790274
(Iteration 901 / 980) loss: 7.640798
Saving checkpoint to "checkpoints/distill_model_cifar100_epoch_2.pkl"
(Epoch 2 / 2) train acc: 0.311776; val_acc: 0.249000
Execution time:  1504.2091999053955
Test accuracy: 0.2694


In [55]:
# Repeat the ResNet110 distillation with a high temperature (20.0);
# no checkpoint_name is passed for this run.
small_net = ThreeLayerConvNet(
    input_dim=(3, 32, 32),
    num_filters=16,
    filter_size=5,
    hidden_dim=512,
    num_classes=100,
    reg=0.001,
    weight_scale=0.1,
)

distill_model = myModel(
    small_net,
    data,
    num_epochs=2,
    batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True,
    print_every=100,
    temperature=20.0,
    logit_distill=logits_train_res,
)

# Wall-clock timing of the whole training run.
tic = time.time()
distill_model.train()
toc = time.time()
print('Execution time: ', toc - tic)

test_accuracy = distill_model.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: {}'.format(test_accuracy))

(Iteration 1 / 980) loss: 923.342696
(Epoch 0 / 2) train acc: 0.014327; val_acc: 0.007000
(Iteration 101 / 980) loss: 921.857613
(Iteration 201 / 980) loss: 921.056706
(Iteration 301 / 980) loss: 920.527488
(Iteration 401 / 980) loss: 920.742419
(Epoch 1 / 2) train acc: 0.251816; val_acc: 0.227000
(Iteration 501 / 980) loss: 919.912761
(Iteration 601 / 980) loss: 920.733250
(Iteration 701 / 980) loss: 920.239117
(Iteration 801 / 980) loss: 919.971648
(Iteration 901 / 980) loss: 919.344459
(Epoch 2 / 2) train acc: 0.327082; val_acc: 0.284000
Execution time:  1500.6566090583801
Test accuracy: 0.2822


# Distill knowledge from VGG

In [56]:
import keras
from keras.datasets import cifar100

In [57]:
# Reload CIFAR-100 through Keras for the VGG experiments.
# NOTE: this rebinds x_train / y_train / x_test / y_test, replacing the arrays
# loaded earlier with load_cifar100().
(x_train, y_train), (x_test, y_test) = cifar100.load_data()

# Cast the images to float32 and expand the labels to 100-way one-hot vectors.
x_train, x_test = x_train.astype('float32'), x_test.astype('float32')
y_train, y_test = [keras.utils.to_categorical(labels, 100) for labels in (y_train, y_test)]

In [58]:
# Verify the shapes of the freshly loaded splits (images first, labels second).
for split_x, split_y in ((x_train, y_train), (x_test, y_test)):
    print(split_x.shape, split_y.shape)

(50000, 32, 32, 3) (50000, 100)
(10000, 32, 32, 3) (10000, 100)


In [32]:
# Instantiate the VGG teacher wrapper with train=False.
# Presumably this loads pretrained weights instead of training from scratch — TODO confirm in cifar100vgg.
net_vgg = cifar100vgg(train=False)

In [17]:
# Print the layer-by-layer summary of the Keras model wrapped by net_vgg.
net_vgg.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 32, 32, 64)        1792      
_________________________________________________________________
activation_1 (Activation)    (None, 32, 32, 64)        0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 32, 32, 64)        256       
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 32, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 32, 64)        36928     
_________________________________________________________________
activation_2 (Activation)    (None, 32, 32, 64)        0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 32, 32, 64)        256       
__________

In [61]:
# Normalize both splits with the VGG wrapper's own normalize(); the two numbers it
# prints are presumably the mean and std it applies — TODO confirm in cifar100vgg.
# NOTE(review): x_train / x_test are overwritten in place, so re-running this cell
# would pass already-normalized data back into normalize(). Reload the raw data first.
x_train, x_test = net_vgg.normalize(x_train,x_test)

121.93584
68.38902


In [34]:
# Run the full VGG model on the test split; the check in the next cell shows
# each output row sums to ~1, i.e. these are class probabilities.
prob_test = net_vgg.model.predict(x_test, verbose = 1)



In [35]:
# Confirm the first prediction row sums to ~1 (probabilities, not logits),
# then score the VGG teacher's top-1 predictions against the test labels.
print(sum(prob_test[0]))
y_pred_big = np.argmax(prob_test,axis=1)
y_true = np.argmax(y_test,axis=1)
print('Test accuracy of big model: {}'.format(np.mean(y_true==y_pred_big)))

0.9999999016124406
Test accuracy of big model: 0.7048


In [36]:
# Sub-model that shares the VGG input but stops at the second-to-last layer.
# Presumably the last layer is the softmax activation, so this exposes the raw logits — TODO confirm.
net_vgg_woSM = Model(inputs=net_vgg.model.input, outputs=net_vgg.model.layers[-2].output)

In [37]:
# Run the truncated VGG model on the test split to get its pre-softmax outputs.
logits_test = net_vgg_woSM.predict(x_test, verbose = 1)



In [40]:
# Inspect the truncated model's test outputs, then score top-1 predictions
# (after SoftMax) against the test labels.
print(logits_test.shape)
print(logits_test[0])

teacher_probs = SoftMax(logits_test)
y_pred_big = np.argmax(teacher_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

is_correct = (y_true == y_pred_big)
print('Test accuracy of big model: {}'.format(np.mean(is_correct)))

(10000, 100)
[-1.1773857  -2.3598697   1.379565    1.7740533  -0.43864304 -4.053173
 -2.957971    1.0075067   5.64918    -2.6610117  -4.3843617  -1.078161
  6.2216654  -1.3275986  -2.4417372   3.7754247  -1.8505068   4.599816
  2.036851    1.7919738  -5.447965   -0.79682076  0.80301005  1.6044474
 -2.500914   -3.6765857  -0.91894567  1.3104973  -4.375844   -1.4203635
  2.317135    1.5534563  -2.6738296   2.795367   -0.7494376  -1.8716614
 -3.869698    3.1157846   1.8392999  -3.396057    0.19659424  0.18485183
 -2.5569162  -1.676392    1.4789174  -3.97746     2.8022676  -1.7627606
  0.5107993   4.509336   -3.742899    0.49826813 -0.8908025  -1.6613612
 -1.9512595   2.3769374   1.4075581  -0.28709927 -1.6575047   2.83997
  7.7976484  -2.7571177  -3.3368654  -1.2154697  -0.9536209  -0.10844386
 -4.472899   -0.8980129  10.274249    5.9781027  -0.03573143  5.0046697
  5.292143    0.45626858 -2.348977   -2.4027076   4.855569    5.6626554
 -0.88871896  6.448552    2.3205538  -0.33526862  0.88

In [41]:
# Run the truncated VGG model on the first 49000 training images (the slice used
# as the student's training split) to get the teacher outputs for distillation.
logits_train_vgg = net_vgg_woSM.predict(x_train[:49000], verbose = 1)



In [42]:
# Show the first training logit vector, then score the VGG teacher's top-1
# predictions against the labels of the same 49000-sample training slice.
print(logits_train_vgg[0])
y_pred_big = np.argmax(SoftMax(logits_train_vgg),axis=1)
y_true = np.argmax(y_train[:49000],axis=1)
# This is measured on training data, so the message says "Train", not "Test".
print('Train accuracy of big model: {}'.format(np.mean(y_true==y_pred_big)))

[-3.6869254e+00 -9.9567795e-01 -3.7708497e-01  6.7680111e+00
  7.0529473e-01 -2.3600364e+00 -1.5549344e-01  9.8614573e-01
  6.2138867e-01 -1.3383497e+00 -9.2372143e-01  2.5031939e+00
 -9.4144213e-01  1.4102044e+00  4.4262552e+00  7.3418713e+00
 -4.0618072e+00  2.9724872e-01 -5.0880604e+00  1.8497730e+01
  1.0002916e+00 -1.2709975e-02 -6.9960408e+00 -5.6045527e+00
 -2.9017096e+00 -4.3465281e-01  1.5875976e+00 -3.5531640e+00
 -2.3916948e-01  7.2946680e-01 -5.0264592e+00  5.4473872e+00
  4.7827392e+00  2.1768814e-01  7.4618239e+00 -1.1913402e+00
 -1.3756633e-01  2.6253901e+00  8.6846657e+00 -1.3063462e+01
 -1.1678722e+00  3.2733631e+00  1.8129163e+00  3.7221885e+00
 -1.4965668e+00 -5.3872437e+00  5.6498661e+00 -3.4365425e+00
  1.4811699e+00 -2.8586533e+00  9.4321239e-01  4.7666812e+00
 -1.8643762e+00 -2.3422599e+00 -3.9670011e-01  5.9071261e-01
  7.6323128e-01 -3.8382895e+00  3.6487432e+00 -3.6542647e+00
 -1.1571419e-01 -4.9820099e+00  3.0264869e+00 -4.5632582e+00
  1.2380736e+00  6.12513

In [43]:
# Cache the VGG training logits on disk so a later cell can reload them
# instead of re-running predict().
# NOTE: the file is a binary pickle despite its .txt extension.
with open('./cifar100/vgg_logits_train.txt', 'wb') as fp:
    pickle.dump(logits_train_vgg, fp)

In [59]:
# To save time, reload the cached VGG training logits from the pickle written
# earlier instead of running the teacher again.
logits_train_vgg = unpickle('./cifar100/vgg_logits_train.txt')

In [62]:
# Prepare data for training the small model in the VGG experiments.
# Images are moved from NHWC to NCHW (the small net is built with input_dim=(3, 32, 32)),
# one-hot labels are collapsed to class indices, and the last 1000 training
# samples are held out as the validation split.
data = dict(
    X_train=x_train[:49000].transpose(0, 3, 1, 2).copy(),
    y_train=np.argmax(y_train[:49000], axis=1),
    X_val=x_train[49000:].transpose(0, 3, 1, 2).copy(),
    y_val=np.argmax(y_train[49000:], axis=1),
    X_test=x_test.transpose(0, 3, 1, 2).copy(),
    y_test=np.argmax(y_test, axis=1),
)

In [63]:
# Train the small model without distilling from VGG: no temperature or
# logit_distill is passed, so this is the reference run for the VGG section.
small_net = ThreeLayerConvNet(
    input_dim=(3, 32, 32),
    num_filters=16,
    filter_size=5,
    hidden_dim=512,
    num_classes=100,
    reg=0.001,
    weight_scale=0.1,
)

small_model = myModel(
    small_net,
    data,
    num_epochs=2,
    batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True,
    print_every=100,
)

# Wall-clock timing of the whole training run.
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ', toc - tic)

test_accuracy = small_model.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: {}'.format(test_accuracy))

(Iteration 1 / 980) loss: 4.611179
(Epoch 0 / 2) train acc: 0.012082; val_acc: 0.016000
(Iteration 101 / 980) loss: 3.808190
(Iteration 201 / 980) loss: 3.663060
(Iteration 301 / 980) loss: 3.430246
(Iteration 401 / 980) loss: 3.501492
(Epoch 1 / 2) train acc: 0.232959; val_acc: 0.214000
(Iteration 501 / 980) loss: 3.307973
(Iteration 601 / 980) loss: 3.057731
(Iteration 701 / 980) loss: 3.279852
(Iteration 801 / 980) loss: 2.956263
(Iteration 901 / 980) loss: 2.969794
(Epoch 2 / 2) train acc: 0.300633; val_acc: 0.236000
Execution time:  1384.906968832016
Test accuracy: 0.2679


In [64]:
# Train the small model while distilling from VGG (temperature 5).
# temperature and logit_distill are forwarded to myModel; how they enter the
# loss is defined in model.py, which is not shown in this notebook.
small_net = ThreeLayerConvNet(
    input_dim=(3, 32, 32),
    num_filters=16,
    filter_size=5,
    hidden_dim=512,
    num_classes=100,
    reg=0.001,
    weight_scale=0.1,
)

distill_model = myModel(
    small_net,
    data,
    num_epochs=2,
    batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True,
    print_every=100,
    temperature=5.0,
    logit_distill=logits_train_vgg,
)

# Wall-clock timing of the whole training run.
tic = time.time()
distill_model.train()
toc = time.time()
print('Execution time: ', toc - tic)

test_accuracy = distill_model.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: {}'.format(test_accuracy))

(Iteration 1 / 980) loss: 59.873586
(Epoch 0 / 2) train acc: 0.012102; val_acc: 0.009000
(Iteration 101 / 980) loss: 58.185014
(Iteration 201 / 980) loss: 57.930904
(Iteration 301 / 980) loss: 57.435695
(Iteration 401 / 980) loss: 57.035439
(Epoch 1 / 2) train acc: 0.239449; val_acc: 0.213000
(Iteration 501 / 980) loss: 56.381810
(Iteration 601 / 980) loss: 56.496848
(Iteration 701 / 980) loss: 56.395677
(Iteration 801 / 980) loss: 56.058263
(Iteration 901 / 980) loss: 56.434930
(Epoch 2 / 2) train acc: 0.326184; val_acc: 0.249000
Execution time:  1373.5005910396576
Test accuracy: 0.2851


In [66]:
# Repeat the VGG distillation with a lower temperature (2.0);
# every other setting matches the temperature-5 run.
small_net = ThreeLayerConvNet(
    input_dim=(3, 32, 32),
    num_filters=16,
    filter_size=5,
    hidden_dim=512,
    num_classes=100,
    reg=0.001,
    weight_scale=0.1,
)

distill_model = myModel(
    small_net,
    data,
    num_epochs=2,
    batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True,
    print_every=100,
    temperature=2.0,
    logit_distill=logits_train_vgg,
)

# Wall-clock timing of the whole training run.
tic = time.time()
distill_model.train()
toc = time.time()
print('Execution time: ', toc - tic)

test_accuracy = distill_model.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: {}'.format(test_accuracy))

(Iteration 1 / 980) loss: 11.519137
(Epoch 0 / 2) train acc: 0.016286; val_acc: 0.023000
(Iteration 101 / 980) loss: 9.913101
(Iteration 201 / 980) loss: 9.217385
(Iteration 301 / 980) loss: 8.961465
(Iteration 401 / 980) loss: 8.390139
(Epoch 1 / 2) train acc: 0.231735; val_acc: 0.211000
(Iteration 501 / 980) loss: 8.452644
(Iteration 601 / 980) loss: 8.163462
(Iteration 701 / 980) loss: 8.318298
(Iteration 801 / 980) loss: 8.502651
(Iteration 901 / 980) loss: 7.830879
(Epoch 2 / 2) train acc: 0.307000; val_acc: 0.250000
Execution time:  1367.8328320980072
Test accuracy: 0.2609


In [65]:
# Repeat the VGG distillation with a high temperature (20.0);
# every other setting matches the other VGG distillation runs.
small_net = ThreeLayerConvNet(
    input_dim=(3, 32, 32),
    num_filters=16,
    filter_size=5,
    hidden_dim=512,
    num_classes=100,
    reg=0.001,
    weight_scale=0.1,
)

distill_model = myModel(
    small_net,
    data,
    num_epochs=2,
    batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True,
    print_every=100,
    temperature=20.0,
    logit_distill=logits_train_vgg,
)

# Wall-clock timing of the whole training run.
tic = time.time()
distill_model.train()
toc = time.time()
print('Execution time: ', toc - tic)

test_accuracy = distill_model.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: {}'.format(test_accuracy))

(Iteration 1 / 980) loss: 923.342945
(Epoch 0 / 2) train acc: 0.019510; val_acc: 0.012000
(Iteration 101 / 980) loss: 922.124840
(Iteration 201 / 980) loss: 921.592591
(Iteration 301 / 980) loss: 921.630974
(Iteration 401 / 980) loss: 921.394343
(Epoch 1 / 2) train acc: 0.244857; val_acc: 0.238000
(Iteration 501 / 980) loss: 921.246561
(Iteration 601 / 980) loss: 921.265330
(Iteration 701 / 980) loss: 921.407883
(Iteration 801 / 980) loss: 921.448293
(Iteration 901 / 980) loss: 920.884021
(Epoch 2 / 2) train acc: 0.305184; val_acc: 0.243000
Execution time:  1372.3521888256073
Test accuracy: 0.269
