In [1]:
import numpy as np
import matplotlib.pyplot as plt

from data_utils import get_CIFAR10_data, get_MNIST_data
from CNN import ThreeLayerConvNet
from model import Model
from ResNet164.resnet164 import ResNet164

import h5py
import time
import pickle

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
# Show image pixels without smoothing and default to a grayscale colormap.
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
# Mode 2 re-imports edited project modules before each cell execution.
%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


In [2]:
def unpickle(file):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    file : str or path-like
        Path of the pickle file to read (opened in binary mode).

    Returns
    -------
    object
        Whatever object was pickled into ``file``.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while deserializing.
    Only call this on files you created yourself or otherwise trust.
    """
    with open(file, 'rb') as fo:
        # encoding='bytes' makes 8-bit strings written by Python 2 come back
        # as bytes objects instead of being decoded.
        return pickle.load(fo, encoding='bytes')

## Test with CIFAR-10

In [9]:
# Fetch the preprocessed CIFAR-10 splits and list the shape of every array
# in the returned mapping.
cifar10_data = get_CIFAR10_data()

for name, array in cifar10_data.items():
    print('{}: '.format(name), array.shape)

X_train:  (49000, 3, 32, 32)
y_train:  (49000,)
X_val:  (1000, 3, 32, 32)
y_val:  (1000,)
X_test:  (1000, 3, 32, 32)
y_test:  (1000,)


In [10]:
# Train the three-layer conv net on CIFAR-10 for a single epoch with Adam
# (learning rate 1e-3, batches of 50) and report how long training took.
net = ThreeLayerConvNet(reg=0.001, weight_scale=0.1)

small_model = Model(
    net, cifar10_data,
    num_epochs=1, batch_size=50,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True, print_every=100,
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
tic = time.perf_counter()
small_model.train()
toc = time.perf_counter()
print('Execution time: ', toc - tic)

(Iteration 1 / 980) loss: 2.315664
(Epoch 0 / 1) train acc: 0.124265; val_acc: 0.121000
(Iteration 101 / 980) loss: 1.780725
(Iteration 201 / 980) loss: 1.696385
(Iteration 301 / 980) loss: 1.507722
(Iteration 401 / 980) loss: 1.603677
(Iteration 501 / 980) loss: 1.371052
(Iteration 601 / 980) loss: 1.662679
(Iteration 701 / 980) loss: 1.441571
(Iteration 801 / 980) loss: 1.572337
(Iteration 901 / 980) loss: 1.395455
(Epoch 1 / 1) train acc: 0.503878; val_acc: 0.500000


## Test with MNIST

In [3]:
# Fetch the MNIST splits (10000 test samples requested) and list the shape
# of every array in the returned mapping.
mnist_data = get_MNIST_data(num_test=10000)

for name, array in mnist_data.items():
    print('{}: '.format(name), array.shape)

X_train:  (59000, 1, 28, 28)
y_train:  (59000,)
X_val:  (1000, 1, 28, 28)
y_val:  (1000,)
X_test:  (10000, 1, 28, 28)
y_test:  (10000,)


In [4]:
# Train the three-layer conv net on MNIST (1x28x28 inputs, 28 filters of
# size 5, hidden layer of 10 units) for one epoch with Adam and time it.
net = ThreeLayerConvNet(
    input_dim=(1, 28, 28), num_filters=28, filter_size=5, hidden_dim=10,
    reg=0.001, weight_scale=1, dtype=np.float32,
)
small_model = Model(
    net, mnist_data,
    num_epochs=1, batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True, print_every=100,
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
tic = time.perf_counter()
small_model.train()
toc = time.perf_counter()
print('Execution time: ', toc - tic)

(Iteration 1 / 590) loss: 2.413948
(Epoch 0 / 1) train acc: 0.195034; val_acc: 0.181000
(Iteration 101 / 590) loss: 0.808981
(Iteration 201 / 590) loss: 0.480308
(Iteration 301 / 590) loss: 0.333304
(Iteration 401 / 590) loss: 0.365276
(Iteration 501 / 590) loss: 0.183830
(Epoch 1 / 1) train acc: 0.959441; val_acc: 0.961000
Execution time:  447.68891406059265


In [5]:
# Score the freshly trained small model on the MNIST test split.
X_test = mnist_data['X_test']
y_test = mnist_data['y_test']

print('Test accuracy: {}'.format(small_model.check_accuracy(X_test, y_test)))

Test accuracy: 0.9578


### Using data with our own preprocessing, ResNet164 gives ~87% test accuracy

In [7]:
# Data for ResNet164: move the channel axis last, i.e. (N, C, H, W) -> (N, H, W, C),
# and copy so each result is its own contiguous array.
x_train = mnist_data['X_train'].transpose((0, 2, 3, 1)).copy()
x_val = mnist_data['X_val'].transpose((0, 2, 3, 1)).copy()
x_test = mnist_data['X_test'].transpose((0, 2, 3, 1)).copy()

print(x_train.shape)

(59000, 28, 28, 1)


## Distill ResNet to small_model for MNIST

In [20]:
from ResNet164.utils import load_mnist

In [21]:
# Load the MNIST train/val/test splits through the ResNet164 loader, so the
# inputs match what the big model was trained on.
# NOTE: this rebinds x_train, x_val, x_test and y_test that earlier cells assigned.
# NOTE(review): labels look one-hot encoded (later cells take argmax over axis 1) — confirm in load_mnist.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = load_mnist()

In [25]:
# Shapes of the training images and labels, one per line.
print(x_train.shape, y_train.shape, sep='\n')

(50000, 28, 28, 1)
(50000, 10)


In [6]:
# Build the big model for distillation: a ResNet164 that achieves 99.7% test
# accuracy (measured in the "Check accuracy of big model" section).
big_model = ResNet164()

# NOTE(review): compile() precedes load_weights() here — presumably required by
# the ResNet164 wrapper; confirm before reordering.
big_model.compile()
# Load the pre-trained weights from the .h5 file kept under ResNet164/.
big_model.load_weights('ResNet164/ResNet164.h5')

Instructions for updating:
Colocations handled automatically by placer.


In [37]:
# Run the big model over the training images; the outputs are later passed to
# the small model as distillation targets (logit_distill).
logits_train = big_model.predict(x_train, verbose = 1)
# Show the first output row as a quick sanity check.
print(logits_train[0])

[-6.2030087  -3.9134192  -6.317644    6.6796656  -2.957327   12.809154
 -3.4207807   0.18087192  2.3757007   0.04373608]


In [38]:
# Accuracy of the big model on the *training* split (logits_train was computed
# from x_train, y_train holds one label row per training sample).
# Softmax is strictly increasing within a row, so the arg-max of the raw logits
# is already the predicted class; this avoids depending on the SoftMax helper,
# which is only defined in a later cell of the notebook.
y_pred_big = np.argmax(logits_train, axis=1)
y_true = np.argmax(y_train, axis=1)
print('Train accuracy of big model: {}'.format(np.mean(y_true == y_pred_big)))

Test accuracy of big model: 0.99966


In [39]:
# Cache the big model's training outputs on disk so later sessions can skip
# the expensive predict() call. Despite the .txt suffix the file is a binary pickle.
print(logits_train.shape)

with open('resnet164_logits_train.txt', 'wb') as logits_file:
    pickle.dump(logits_train, logits_file)

(50000, 10)


In [4]:
# Shortcut: reload the ResNet164 training outputs that this notebook pickled
# to disk, instead of recomputing them with big_model.predict.
# The file is a binary pickle even though it is named .txt.
logits_train = unpickle('resnet164_logits_train.txt')

### Check accuracy of big model (ResNet164)

In [23]:
# Run the big model over the test images; the outputs are scored against
# y_test further down.
logits_test = big_model.predict(x_test, verbose = 1)



In [18]:
def SoftMax(s):
    """Compute the row-wise softmax of a score matrix.

    Parameters
    ----------
    s : array_like, shape (N, K)
        Scores (logits); each row holds the K class scores of one sample.

    Returns
    -------
    numpy.ndarray, shape (N, K)
        Probabilities; every row is non-negative and sums to 1.
    """
    # Subtract each row's maximum before exponentiating so that large scores
    # cannot overflow; the shift cancels out in the ratio below.
    shifted = s - np.max(s, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    # Normalize each row by its own sum (keepdims keeps the shape broadcastable).
    p = exp_scores / exp_scores.sum(axis=1, keepdims=True)  # matrix of size NxK
    return p

In [27]:
# Score the big model on the test split: most probable class per row versus
# the class encoded in the y_test label rows.
y_pred_big = SoftMax(logits_test).argmax(axis=1)
y_true = np.argmax(y_test, axis=1)

print('Test accuracy of big model: {}'.format((y_true == y_pred_big).mean()))

Test accuracy of big model: 0.997


In [33]:
# Repackage the ResNet164 splits for the small model: images go from
# (N, H, W, C) back to (N, C, H, W) and label rows are collapsed to class
# indices with argmax.
data = dict(
    X_train=x_train.transpose((0, 3, 1, 2)).copy(),
    y_train=np.argmax(y_train, axis=1),
    X_val=x_val.transpose((0, 3, 1, 2)).copy(),
    y_val=np.argmax(y_val, axis=1),
    X_test=x_test.transpose((0, 3, 1, 2)).copy(),
    y_test=np.argmax(y_test, axis=1),
)

In [30]:
# Compare the training shapes before and after repackaging for the small model.
print(x_train.shape, y_train.shape)
print(*(data[key].shape for key in ('X_train', 'y_train')))

(50000, 28, 28, 1) (50000, 10)
(50000, 1, 28, 28) (50000,)


In [31]:
# Train small model without distilling: same data as the big model, hard
# labels only. This is the baseline for the distillation run.
net = ThreeLayerConvNet(
    input_dim=(1, 28, 28), num_filters=28, filter_size=5, hidden_dim=50,
    reg=0.001, weight_scale=1, dtype=np.float32,
)
small_model = Model(
    net, data,
    num_epochs=1, batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    verbose=True, print_every=100,
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
tic = time.perf_counter()
small_model.train()
toc = time.perf_counter()
print('Execution time: ', toc - tic)

(Iteration 1 / 500) loss: 3.470583
(Epoch 0 / 1) train acc: 0.172860; val_acc: 0.175800
(Iteration 101 / 500) loss: 0.316082
(Iteration 201 / 500) loss: 0.309362
(Iteration 301 / 500) loss: 0.142844
(Iteration 401 / 500) loss: 0.179971
(Epoch 1 / 1) train acc: 0.975500; val_acc: 0.973300
Execution time:  394.203412771225


In [34]:
# Test-split accuracy of whichever small_model was trained most recently.
print('Test accuracy: {}'.format(small_model.check_accuracy(data['X_test'],data['y_test'])))

Test accuracy: 0.9702


In [40]:
# Train small model with distilling knowledge from big model: same
# architecture and optimizer settings as the non-distilled run, plus the big
# model's training outputs and a temperature.
# NOTE(review): temperature / logit_distill are interpreted inside Model —
# presumably they enable a softened-target loss term; confirm in model.py.
net = ThreeLayerConvNet(
    input_dim=(1, 28, 28), num_filters=28, filter_size=5, hidden_dim=50,
    reg=0.001, weight_scale=1, dtype=np.float32,
)
small_model = Model(
    net, data,
    num_epochs=1, batch_size=100,
    optimizer='adam',
    optim_config={'learning_rate': 1e-3},
    temperature=5.0, logit_distill=logits_train,
    verbose=True, print_every=100,
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
tic = time.perf_counter()
small_model.train()
toc = time.perf_counter()
print('Execution time: ', toc - tic)

(Iteration 1 / 500) loss: 31.128427
(Epoch 0 / 1) train acc: 0.207020; val_acc: 0.219400
(Iteration 101 / 500) loss: 12.530449
(Iteration 201 / 500) loss: 11.674805
(Iteration 301 / 500) loss: 11.588711
(Iteration 401 / 500) loss: 11.688246
(Epoch 1 / 1) train acc: 0.980920; val_acc: 0.981300
Execution time:  402.96113777160645


In [41]:
# Test-split accuracy of whichever small_model was trained most recently.
print('Test accuracy: {}'.format(small_model.check_accuracy(data['X_test'],data['y_test'])))

Test accuracy: 0.9816
