In [6]:
import numpy as np
import matplotlib.pyplot as plt

from data_utils import get_CIFAR10_data, get_MNIST_data
from CNN import ThreeLayerConvNet
from model import myModel

from ResNet164.resnet164 import ResNet164
from keras.models import Model

import h5py
import time
import pickle

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [1]:
def unpickle(file):
    """Load and return the object pickled in ``file``.

    The file is opened in binary mode and unpickled with
    ``encoding='bytes'``, so 8-bit strings written by Python 2 are returned
    as ``bytes`` objects.

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only use
    this on files you produced yourself or otherwise trust.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

## Test with CIFAR-10

In [9]:
# Load the (preprocessed) CIFAR10 data.

cifar10_data = get_CIFAR10_data()
for k, v in cifar10_data.items():
  print('%s: ' % k, v.shape)

X_train:  (49000, 3, 32, 32)
y_train:  (49000,)
X_val:  (1000, 3, 32, 32)
y_val:  (1000,)
X_test:  (1000, 3, 32, 32)
y_test:  (1000,)


In [10]:
net = ThreeLayerConvNet(reg=0.001,weight_scale=0.1)

small_model = myModel(net, cifar10_data,
                num_epochs=1, batch_size=50,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)

(Iteration 1 / 980) loss: 2.315664
(Epoch 0 / 1) train acc: 0.124265; val_acc: 0.121000
(Iteration 101 / 980) loss: 1.780725
(Iteration 201 / 980) loss: 1.696385
(Iteration 301 / 980) loss: 1.507722
(Iteration 401 / 980) loss: 1.603677
(Iteration 501 / 980) loss: 1.371052
(Iteration 601 / 980) loss: 1.662679
(Iteration 701 / 980) loss: 1.441571
(Iteration 801 / 980) loss: 1.572337
(Iteration 901 / 980) loss: 1.395455
(Epoch 1 / 1) train acc: 0.503878; val_acc: 0.500000


## Test with MNIST

In [3]:
# Load MNIST data
mnist_data = get_MNIST_data(num_test=10000)
for k, v in mnist_data.items():
  print('%s: ' % k, v.shape)

X_train:  (59000, 1, 28, 28)
y_train:  (59000,)
X_val:  (1000, 1, 28, 28)
y_val:  (1000,)
X_test:  (10000, 1, 28, 28)
y_test:  (10000,)


In [4]:
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=10,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, mnist_data,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)

(Iteration 1 / 590) loss: 2.413948
(Epoch 0 / 1) train acc: 0.195034; val_acc: 0.181000
(Iteration 101 / 590) loss: 0.808981
(Iteration 201 / 590) loss: 0.480308
(Iteration 301 / 590) loss: 0.333304
(Iteration 401 / 590) loss: 0.365276
(Iteration 501 / 590) loss: 0.183830
(Epoch 1 / 1) train acc: 0.959441; val_acc: 0.961000
Execution time:  447.68891406059265


In [5]:
X_test, y_test = mnist_data['X_test'], mnist_data['y_test']

print('Test accuracy: {}'.format(small_model.check_accuracy(X_test,y_test)))

Test accuracy: 0.9578


### With our own preprocessing of the data, ResNet164 gives ~87% test accuracy

In [7]:
# Data for ResNet164
x_train = mnist_data['X_train'].transpose(0,2,3,1).copy()
x_val = mnist_data['X_val'].transpose(0,2,3,1).copy()
x_test = mnist_data['X_test'].transpose(0,2,3,1).copy()
print(x_train.shape)

(59000, 28, 28, 1)


## Distill ResNet to small_model for MNIST

### Check accuracy of big model (ResNet164)

In [4]:
from ResNet164.utils import load_mnist

Using TensorFlow backend.


In [5]:
# Data used in ResNet164
(x_train, y_train), (x_val, y_val), (x_test, y_test) = load_mnist()

In [25]:
print(x_train.shape)
print(y_train.shape)

(50000, 28, 28, 1)
(50000, 10)


In [6]:
# Load ResNet164, which achieves 99.7% test accuracy
big_model = ResNet164()

big_model.compile()
# Load pre-trained model
big_model.load_weights('ResNet164/ResNet164.h5')

# Remove softmax from ResNet164 (use the output of the second-to-last layer)
big_model_woSM = Model(inputs=big_model.model.input, outputs=big_model.model.layers[-2].output)

Instructions for updating:
Colocations handled automatically by placer.


In [37]:
logits_train = big_model_woSM.predict(x_train, verbose = 1)
print(logits_train[0])

[-6.2030087  -3.9134192  -6.317644    6.6796656  -2.957327   12.809154
 -3.4207807   0.18087192  2.3757007   0.04373608]


In [38]:
# logits_train / y_train belong to the training split (x_train), so this is
# the big model's *train* accuracy, not its test accuracy.
# Softmax is monotonic within each row, so the argmax of the raw logits is the
# predicted class; this also keeps the cell runnable before the SoftMax helper
# is defined further down the notebook.
y_pred_big = np.argmax(logits_train, axis=1)
y_true = np.argmax(y_train, axis=1)
print('Train accuracy of big model: {}'.format(np.mean(y_true == y_pred_big)))

Test accuracy of big model: 0.99966


In [39]:
print(logits_train.shape)
with open('resnet164_logits_train.txt', 'wb') as fp:
    pickle.dump(logits_train, fp)

(50000, 10)


In [5]:
# To save time, load the cached ResNet164 logits from file
logits_train = unpickle('resnet164_logits_train.txt')

In [23]:
logits_test = big_model_woSM.predict(x_test, verbose = 1)



In [18]:
def SoftMax(s):
    """Row-wise softmax of a 2-D score matrix.

    Parameters
    ----------
    s : array_like, shape (N, K)
        Scores (logits), one row per sample.

    Returns
    -------
    numpy.ndarray, shape (N, K)
        Probabilities; every row sums to 1.
    """
    s = np.asarray(s)
    # Subtract each row's max before exponentiating so large scores cannot
    # overflow; the shift cancels out in the ratio below.
    exp_shifted = np.exp(s - np.max(s, axis=1, keepdims=True))
    return exp_shifted / exp_shifted.sum(axis=1, keepdims=True)

In [27]:
y_pred_big = np.argmax(SoftMax(logits_test),axis=1)
y_true = np.argmax(y_test,axis=1)
print('Test accuracy of big model: {}'.format(np.mean(y_true==y_pred_big)))

Test accuracy of big model: 0.997


### Compare accuracy of small model with and without using distillation

In [11]:
# Prepare data to train with small model
data = {'X_train': x_train.transpose(0,3,1,2).copy(), 'y_train': np.argmax(y_train,axis=1),
        'X_val': x_val.transpose(0,3,1,2).copy(), 'y_val': np.argmax(y_val,axis=1),
        'X_test': x_test.transpose(0,3,1,2).copy(), 'y_test': np.argmax(y_test,axis=1),
       }

In [30]:
print(x_train.shape, y_train.shape)

print(data['X_train'].shape, data['y_train'].shape)

(50000, 28, 28, 1) (50000, 10)
(50000, 1, 28, 28) (50000,)


In [31]:
# Train small model without distilling
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)

(Iteration 1 / 500) loss: 3.470583
(Epoch 0 / 1) train acc: 0.172860; val_acc: 0.175800
(Iteration 101 / 500) loss: 0.316082
(Iteration 201 / 500) loss: 0.309362
(Iteration 301 / 500) loss: 0.142844
(Iteration 401 / 500) loss: 0.179971
(Epoch 1 / 1) train acc: 0.975500; val_acc: 0.973300
Execution time:  394.203412771225


In [34]:
print('Test accuracy: {}'.format(small_model.check_accuracy(data['X_test'],data['y_test'])))

Test accuracy: 0.9702


In [40]:
# Train small model with distilling knowledge from big model
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                temperature=5.0,logit_distill=logits_train,
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)

(Iteration 1 / 500) loss: 31.128427
(Epoch 0 / 1) train acc: 0.207020; val_acc: 0.219400
(Iteration 101 / 500) loss: 12.530449
(Iteration 201 / 500) loss: 11.674805
(Iteration 301 / 500) loss: 11.588711
(Iteration 401 / 500) loss: 11.688246
(Epoch 1 / 1) train acc: 0.980920; val_acc: 0.981300
Execution time:  402.96113777160645


In [41]:
print('Test accuracy: {}'.format(small_model.check_accuracy(data['X_test'],data['y_test'])))

Test accuracy: 0.9816


In [6]:
# Train small model with batch normalization (without distilling)
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32,batchnorm=True)
small_model = myModel(net, data,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)

(Iteration 1 / 500) loss: 2.732309
(Epoch 0 / 1) train acc: 0.386420; val_acc: 0.399900
(Iteration 101 / 500) loss: 0.279304
(Iteration 201 / 500) loss: 0.232153
(Iteration 301 / 500) loss: 0.154236
(Iteration 401 / 500) loss: 0.149580
(Epoch 1 / 1) train acc: 0.979100; val_acc: 0.975600
Execution time:  1348.2941009998322


In [7]:
print('Test accuracy: {}'.format(small_model.check_accuracy(data['X_test'],data['y_test'])))

Test accuracy: 0.9755


In [10]:
# Train small model with batch normalization and with distilling knowledge from big model
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32,batchnorm=True)
small_model = myModel(net, data,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                temperature=5.0,logit_distill=logits_train,
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(small_model.check_accuracy(data['X_test'],data['y_test'])))

(Iteration 1 / 500) loss: 30.617924
(Epoch 0 / 1) train acc: 0.241300; val_acc: 0.246700
(Iteration 101 / 500) loss: 17.103761
(Iteration 201 / 500) loss: 13.742485
(Iteration 301 / 500) loss: 11.510487
(Iteration 401 / 500) loss: 11.518176
(Epoch 1 / 1) train acc: 0.984660; val_acc: 0.983600
Execution time:  655.8659062385559
Test accuracy: 0.983


### Use knowledge from big model to train small model using unlabeled data

In [12]:
# Train small model (no batch normalization) with distilling knowledge from big model, using alpha=1.0
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                temperature=5.0,logit_distill=logits_train,alpha=1.0,
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(small_model.check_accuracy(data['X_test'],data['y_test'])))

(Iteration 1 / 500) loss: 58.469911
(Epoch 0 / 1) train acc: 0.267340; val_acc: 0.266700
(Iteration 101 / 500) loss: 26.392003
(Iteration 201 / 500) loss: 23.601190
(Iteration 301 / 500) loss: 23.303173
(Iteration 401 / 500) loss: 22.829914
(Epoch 1 / 1) train acc: 0.979360; val_acc: 0.979400
Execution time:  386.4370698928833
Test accuracy: 0.979


### Training data without digit 3

In [48]:
def remove_class(data, class_out=3):
    """Return a copy of ``data`` whose training split omits one class.

    Only 'X_train' and 'y_train' are filtered; every other entry is shared
    with the input dict (shallow copy), and ``data`` itself is not modified.

    Returns
    -------
    (dict, ndarray of bool)
        The filtered dict and the boolean keep-mask over the original
        training samples (True where the label differs from ``class_out``).
    """
    keep = data['y_train'] != class_out
    filtered = data.copy()
    filtered.update(X_train=data['X_train'][keep], y_train=data['y_train'][keep])
    return filtered, keep

In [49]:
data_not_3, mask = remove_class(data, 3)

In [50]:
# Verify original data which should not change
print(data_not_3['X_train'].shape, data_not_3['y_train'].shape)
print(data['X_train'].shape, data['y_train'].shape)

(44899, 1, 28, 28) (44899,)
(50000, 1, 28, 28) (50000,)


In [51]:
print(data['y_train'][:20].T)
print(data_not_3['y_train'][:20].T)

[5 0 4 1 9 2 1 3 1 4 3 5 3 6 1 7 2 8 6 9]
[5 0 4 1 9 2 1 1 4 5 6 1 7 2 8 6 9 4 0 9]


In [20]:
# Train small model without distilling
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data_not_3,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(small_model.check_accuracy(data_not_3['X_test'],data_not_3['y_test'])))

(Iteration 1 / 448) loss: 3.319217
(Epoch 0 / 1) train acc: 0.242856; val_acc: 0.215200
(Iteration 101 / 448) loss: 0.257129
(Iteration 201 / 448) loss: 0.183139
(Iteration 301 / 448) loss: 0.157314
(Iteration 401 / 448) loss: 0.240851
(Epoch 1 / 1) train acc: 0.972227; val_acc: 0.873700
Execution time:  502.87588810920715
Test accuracy: 0.8742


In [24]:
mask_3 = data_not_3['y_test']==3
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_not_3['X_test'][mask_3],data_not_3['y_test'][mask_3])))

Test accuracy: 0.0


In [52]:
mask_3 = data_not_3['y_test']==3
# remove the digit-3 samples from the knowledge (logits) coming from the big model
logits_train_not_3 = logits_train[mask].copy()

In [53]:
# verify new and original data
print(logits_train.shape, logits_train_not_3.shape)

(50000, 10) (44899, 10)


In [10]:
# Train small model with distilling
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data_not_3,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                temperature=3.0, logit_distill=logits_train_not_3,
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(small_model.check_accuracy(data_not_3['X_test'],data_not_3['y_test'])))

(Iteration 1 / 448) loss: 12.767096
(Epoch 0 / 1) train acc: 0.305508; val_acc: 0.271700
(Iteration 101 / 448) loss: 2.125419
(Iteration 201 / 448) loss: 1.406824
(Iteration 301 / 448) loss: 1.317867
(Iteration 401 / 448) loss: 1.553746
(Epoch 1 / 1) train acc: 0.980512; val_acc: 0.879000
Execution time:  367.10596680641174
Test accuracy: 0.8813


In [11]:
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_not_3['X_test'][mask_3],data_not_3['y_test'][mask_3])))

Test accuracy: 0.005940594059405941


In [12]:
0.005940594059405941*sum(mask_3)

6.0

In [13]:
# Train small model with distilling
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data_not_3,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                temperature=2.5, logit_distill=logits_train_not_3,
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(small_model.check_accuracy(data_not_3['X_test'],data_not_3['y_test'])))

(Iteration 1 / 448) loss: 8.714978
(Epoch 0 / 1) train acc: 0.311009; val_acc: 0.289900
(Iteration 101 / 448) loss: 1.726750
(Iteration 201 / 448) loss: 0.588200
(Iteration 301 / 448) loss: 0.818060
(Iteration 401 / 448) loss: 0.530997
(Epoch 1 / 1) train acc: 0.978196; val_acc: 0.878300
Execution time:  365.2879328727722
Test accuracy: 0.8768


In [14]:
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_not_3['X_test'][mask_3],data_not_3['y_test'][mask_3])))

Test accuracy: 0.0019801980198019802


In [15]:
# Train small model with distilling
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data_not_3,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                temperature=4.0, logit_distill=logits_train_not_3,
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(small_model.check_accuracy(data_not_3['X_test'],data_not_3['y_test'])))

(Iteration 1 / 448) loss: 19.946362
(Epoch 0 / 1) train acc: 0.158868; val_acc: 0.143800
(Iteration 101 / 448) loss: 5.611085
(Iteration 201 / 448) loss: 4.812587
(Iteration 301 / 448) loss: 5.758645
(Iteration 401 / 448) loss: 4.807010
(Epoch 1 / 1) train acc: 0.980400; val_acc: 0.882100
Execution time:  361.6439278125763
Test accuracy: 0.8833


In [16]:
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_not_3['X_test'][mask_3],data_not_3['y_test'][mask_3])))

Test accuracy: 0.034653465346534656


In [17]:
0.034653465346534656*sum(mask_3)

35.0

In [18]:
# Train small model with distilling
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data_not_3,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                temperature=20.0, logit_distill=logits_train_not_3,
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(small_model.check_accuracy(data_not_3['X_test'],data_not_3['y_test'])))

(Iteration 1 / 448) loss: 461.625588
(Epoch 0 / 1) train acc: 0.164347; val_acc: 0.152600
(Iteration 101 / 448) loss: 446.038888
(Iteration 201 / 448) loss: 446.381245
(Iteration 301 / 448) loss: 445.294606
(Iteration 401 / 448) loss: 443.807440
(Epoch 1 / 1) train acc: 0.980935; val_acc: 0.893700
Execution time:  360.6434180736542
Test accuracy: 0.8934


In [19]:
mask_3 = data_not_3['y_test']==3
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_not_3['X_test'][mask_3],data_not_3['y_test'][mask_3])))

Test accuracy: 0.11188118811881188


In [20]:
0.11188118811881188*sum(mask_3)

113.0

In [21]:
# Train small model with distilling
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data_not_3,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                temperature=50.0, logit_distill=logits_train_not_3,
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(small_model.check_accuracy(data_not_3['X_test'],data_not_3['y_test'])))

(Iteration 1 / 448) loss: 2879.750401
(Epoch 0 / 1) train acc: 0.231698; val_acc: 0.207200
(Iteration 101 / 448) loss: 2866.701205
(Iteration 201 / 448) loss: 2865.685626
(Iteration 301 / 448) loss: 2865.922503
(Iteration 401 / 448) loss: 2865.657721
(Epoch 1 / 1) train acc: 0.979086; val_acc: 0.879700
Execution time:  349.06136202812195
Test accuracy: 0.8819


In [22]:
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_not_3['X_test'][mask_3],data_not_3['y_test'][mask_3])))

Test accuracy: 0.01782178217821782


### Add bias 

In [23]:
# Check bias of 3-digit
print(small_model.model.params.keys())

dict_keys(['W1', 'b1', 'W2', 'b2', 'W3', 'b3'])


In [25]:
print(small_model.model.params['b3'])

[-0.00169301  0.02463553  0.00436699 -0.00590691  0.00758079 -0.00038451
  0.00223532 -0.01131989 -0.01801676  0.00932435]


In [26]:
bias_3 = np.zeros_like(small_model.model.params['b3'])
bias_3[3] = 3.5
small_model.model.params['b3'] += bias_3
print(small_model.model.params['b3'])

[-1.6930116e-03  2.4635525e-02  4.3669855e-03  3.4940932e+00
  7.5807851e-03 -3.8450916e-04  2.2353220e-03 -1.1319885e-02
 -1.8016757e-02  9.3243457e-03]


In [27]:
# accuracy on class 3 after increasing the bias at 3
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_not_3['X_test'][mask_3],data_not_3['y_test'][mask_3])))

Test accuracy: 0.4792079207920792


In [54]:
# test accuracy of the distilled model after increasing the bias at 3
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_not_3['X_test'],data_not_3['y_test'])))

Test accuracy: 0.9279


### Train distillation model with only 2 classes or 1 class

In [55]:
# Boolean masks over the training labels: digit 7, digit 8, and either one.
mask_7 = data['y_train']==7
mask_8 = data['y_train']==8
# Elementwise OR states the intent directly; `+` on boolean arrays only
# happens to behave like OR.
mask_7_8 = mask_7 | mask_8
# ndarray.sum() counts the True entries without a Python-level loop.
print(mask_7_8.sum(), mask_7.sum(), mask_8.sum())

10017 5175 4842


In [39]:
print(data['y_train'][:50])

[5 0 4 1 9 2 1 1 4 5 6 1 7 2 8 6 9 4 0 9 1 1 2 4 2 7 8 6 9 0 5 6 0 7 6 1 8
 7 9 9 8 5 9 0 7 4 9 8 0 9]


In [56]:
data_7_8 = data.copy()
data_7_8['X_train'] = data_7_8['X_train'][mask_7_8]
data_7_8['y_train'] = data_7_8['y_train'][mask_7_8]
print(data_7_8['X_train'].shape, data_7_8['y_train'].shape)

(10017, 1, 28, 28) (10017,)


In [58]:
print(data['X_train'].shape, data['y_train'].shape)
print(data_7_8['X_train'].shape, data_7_8['y_train'].shape)

(50000, 1, 28, 28) (50000,)
(10017, 1, 28, 28) (10017,)


In [60]:
# Train small model without distilling, on digits 7 and 8 only
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data_7_8,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(small_model.check_accuracy(data_7_8['X_test'],data_7_8['y_test'])))

(Iteration 1 / 100) loss: 2.175337
(Epoch 0 / 1) train acc: 0.905361; val_acc: 0.192200
(Epoch 1 / 1) train acc: 0.991814; val_acc: 0.208400
Execution time:  94.81561994552612
Test accuracy: 0.1981


In [63]:
mask_not_7_8 = np.logical_and(data['y_test']!=7, data['y_test']!=8)
print(sum(mask_not_7_8))
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_7_8['X_test'][mask_not_7_8],data_7_8['y_test'][mask_not_7_8])))

7998
Test accuracy: 0.0


In [64]:
# keep only the digit-7 and digit-8 samples of the knowledge (logits) coming from the big model
logits_train_7_8 = logits_train[mask_7_8].copy()
# verify new and original data
print(logits_train.shape, logits_train_7_8.shape)

(50000, 10) (10017, 10)


In [65]:
# Train small model with distilling
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data_7_8,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                temperature=20.0, logit_distill=logits_train_7_8,
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(small_model.check_accuracy(data_7_8['X_test'],data_7_8['y_test'])))

(Iteration 1 / 100) loss: 464.934103
(Epoch 0 / 1) train acc: 0.544375; val_acc: 0.115700
(Epoch 1 / 1) train acc: 0.994010; val_acc: 0.208500
Execution time:  95.38177514076233
Test accuracy: 0.1988


In [66]:
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_7_8['X_test'][mask_not_7_8],data_7_8['y_test'][mask_not_7_8])))

Test accuracy: 0.00012503125781445363


In [67]:
print(small_model.model.params['b3'])

[ 0.01286919  0.00903602 -0.0188286   0.00395877 -0.00949581  0.00819844
 -0.01376974  0.00276909  0.00058601 -0.00525341]


In [68]:
bias_7_8 = np.zeros_like(small_model.model.params['b3'])
bias_7_8[7] = 7.6
bias_7_8[8] = 7.6
small_model.model.params['b3'] -= bias_7_8
print(small_model.model.params['b3'])

[ 1.2869191e-02  9.0360213e-03 -1.8828603e-02  3.9587682e-03
 -9.4958069e-03  8.1984363e-03 -1.3769737e-02 -7.5972309e+00
 -7.5994139e+00 -5.2534081e-03]


In [69]:
# Accuracy on the classes other than 7 & 8 (mask_not_7_8)
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_7_8['X_test'][mask_not_7_8],data_7_8['y_test'][mask_not_7_8])))

Test accuracy: 0.12253063265816454


In [70]:
# Accuracy on whole test set
print('Test accuracy: {}'.format(small_model.check_accuracy(data_7_8['X_test'],data_7_8['y_test'])))

Test accuracy: 0.2909


In [71]:
# Train small model with distilling
net = ThreeLayerConvNet(input_dim=(1, 28, 28),num_filters=28,filter_size=5,hidden_dim=50,
                        reg=0.001,weight_scale=1,dtype=np.float32)
small_model = myModel(net, data_7_8,
                num_epochs=1, batch_size=100,
                optimizer='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                temperature=50.0, logit_distill=logits_train_7_8,
                verbose=True, print_every=100)
tic = time.time()
small_model.train()
toc = time.time()
print('Execution time: ',toc-tic)
print('Test accuracy: {}'.format(small_model.check_accuracy(data_7_8['X_test'],data_7_8['y_test'])))

(Iteration 1 / 100) loss: 2880.486162
(Epoch 0 / 1) train acc: 0.579714; val_acc: 0.124200
(Epoch 1 / 1) train acc: 0.994809; val_acc: 0.209900
Execution time:  90.94003295898438
Test accuracy: 0.2


In [72]:
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_7_8['X_test'][mask_not_7_8],data_7_8['y_test'][mask_not_7_8])))

Test accuracy: 0.0017504376094023505


In [74]:
print(small_model.model.params['b3'])
# reduce bias on 7 and 8 class
small_model.model.params['b3'] -= bias_7_8
print(small_model.model.params['b3'])

[ 0.00720396  0.00685708 -0.00248807 -0.00164515 -0.01292054 -0.00159284
 -0.00695635  0.00915938 -0.00256593  0.01327047]
[ 7.2039641e-03  6.8570757e-03 -2.4880739e-03 -1.6451530e-03
 -1.2920539e-02 -1.5928353e-03 -6.9563491e-03 -7.5908403e+00
 -7.6025658e+00  1.3270469e-02]


In [75]:
# Accuracy on the classes other than 7 & 8 (mask_not_7_8)
print('Test accuracy: {}'.
      format(small_model.check_accuracy(data_7_8['X_test'][mask_not_7_8],data_7_8['y_test'][mask_not_7_8])))
# Accuracy on whole test set
print('Test accuracy: {}'.format(small_model.check_accuracy(data_7_8['X_test'],data_7_8['y_test'])))

Test accuracy: 0.19617404351087772
Test accuracy: 0.3485


In [36]:
a = np.arange(10)
a[[7,8]] = 0
print(a)

[0 1 2 3 4 5 6 0 0 9]
