In [1]:
# Notebook-wide configuration.
%matplotlib inline
# Root folder of the dataset; later cells append TYPE, 'train' or 'valid' to it.
path = "../data/dl1/dogscats/"
# Alternative root (presumably a smaller sample of the data); uncomment to use it.
# path = "../data/dl1/dogscats/sample/"
# Sub-folder of `path` whose images are used when extracting representations.
TYPE = 'train'
# Seed passed to the vgg.get_batches calls below.
SEED = 1101

In [2]:
from __future__ import division,print_function
import os, json
from glob import glob
import numpy as np
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt
import scipy.spatial.distance
import utils; reload(utils)
from utils import plots
from time import time

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


In [3]:
# import vgg class
import vgg16; reload(vgg16)
from vgg16 import Vgg16

In [4]:
# Define sub-models to extract representations at a given layer.
from keras.models import Model
# Positions in the base model's `layers` list at which representations are read.
idx = [0,5,10,17,24,31,33,35,37]

# Layer found at each position of the pretrained model, as printed by get_models:
#    0 -> lambda_1
#    5 -> maxpooling2d_1
#   10 -> maxpooling2d_2
#   17 -> maxpooling2d_3
#   24 -> maxpooling2d_4
#   31 -> maxpooling2d_5
#   33 -> dense_1
#   35 -> dense_2
#   37 -> dense_3

def get_models(idx,base):
    """Build one truncated model per requested layer of `base`.

    Parameters
    ----------
    idx : sequence of int
        Positions into `base.layers` of the layers to expose.
    base : model
        Object providing `layers`, `input` and `get_layer(name)`.

    Returns
    -------
    (layers, models, outshapes)
        The selected layer objects, one `Model` per layer going from
        `base.input` to that layer's output, and each layer's
        `output_shape` converted to a list.

    The name of every selected layer is printed, followed by 'Done.'.
    """
    picked = []
    for position in idx:
        chosen = base.layers[position]
        print(chosen.name)
        picked.append(chosen)

    # Each sub-model shares the base input and stops at the layer looked up by name.
    submodels = [Model(input=base.input, output=base.get_layer(chosen.name).output)
                 for chosen in picked]
    shapes = [list(chosen.output_shape) for chosen in picked]

    print('Done.')
    return picked, submodels, shapes

In [5]:
# Instantiate the base VGG16 model and build one sub-model per entry of `idx`.
vgg = Vgg16()
# The model object wrapped by the Vgg16 helper; get_models reads its layers.
base = vgg.model
# layers / models / outshapes are parallel lists, one entry per index in `idx`.
layers,models,outshapes = get_models(idx,base)

lambda_1
maxpooling2d_1
maxpooling2d_2
maxpooling2d_3
maxpooling2d_4
maxpooling2d_5
dense_1
dense_2
dense_3
Done.


### Extract representations without finetuning

In [None]:
# Extract the representations of batch_size * nbatches = 2000 images taken from
# the training or validation set (selected by TYPE).
datapath = path + TYPE + '/'
# Folder where the representations and distance matrices are saved.
repath = '../data/dl1/objnc/rep/dogscats/nofinetune/'
batch_size = 50
nbatches   = 40
# No iterator is created here: the extraction loop below builds its own, once per layer.
#batches = vgg.get_batches(datapath, batch_size=batch_size, shuffle=True, seed=1101)
print('N.of batches ' + str(nbatches))

N.of batches 40


In [None]:
# For every selected layer: extract the representations of nbatches * batch_size
# images, save them, then compute their pairwise Euclidean distances.
# Notice that in case TYPE is 'train' you need shuffle=True in get_batches. This
# is not needed when TYPE is 'valid', since there we use the whole validation data.

# Fail early (before the slow extraction) instead of at np.save time.
if not os.path.isdir(repath):
    os.makedirs(repath)

n_images = nbatches * batch_size
D = []      # one condensed distance vector (pdist output) per layer
D_sq = []   # one square distance matrix per layer
for layer,model in zip(layers,models):
    print('Processing model : ' + layer.name)
    # A fresh iterator with the same seed for each layer; presumably this makes
    # every layer see the same images in the same order.
    batches = vgg.get_batches(datapath, batch_size=batch_size, shuffle=True, seed=SEED)
    ti = time()

    # Preallocate the float64 result and fill it batch by batch, rather than
    # keeping a list of all batches and converting it in one go (which holds
    # both copies in memory at once).
    R = np.empty((n_images,) + tuple(layer.output_shape[1:]), dtype=float)
    for n in range(nbatches):
        imgs,_ = next(batches)
        rep = model.predict(imgs)
        # A short batch would otherwise be broadcast or fail with an obscure error.
        if len(rep) != batch_size:
            raise ValueError('Batch %d has %d images, expected %d'
                             % (n, len(rep), batch_size))
        R[n*batch_size:(n+1)*batch_size] = rep
    print(R.shape)
    np.save(repath +  layer.name + '_' + TYPE, R)

    # compute distances on the flattened representations (one row per image)
    R = R.reshape(n_images, -1)
    d = scipy.spatial.distance.pdist(R, 'euclidean')
    d_sq = scipy.spatial.distance.squareform(d, force='no', checks=True)
    D.append(d)
    D_sq.append(d_sq)
    # Elapsed time covers prediction, saving and the distance computation.
    te = time() - ti
    print(te)

np.save(repath + 'D'    + '_' + TYPE, D)
np.save(repath + 'D_sq' + '_' + TYPE, D_sq)
print('Done.')

Processing model : lambda_1
Found 23000 images belonging to 2 classes.
(2000, 3, 224, 224)
440.202852011
Processing model : maxpooling2d_1
Found 23000 images belonging to 2 classes.
(2000, 64, 112, 112)


### Finetune

In [None]:
# Define the batches used for finetuning. The batch size is no longer constrained
# to be 50; these batches are used for training only.
# NOTE: this overwrites the `batch_size` (50) and `batches` names used by the
# extraction cells, which therefore have to set them again before running.
batch_size = 64
batches = vgg.get_batches(path+'train', batch_size=batch_size, seed=SEED)
val_batches = vgg.get_batches(path+'valid', batch_size=batch_size, seed=SEED)

In [None]:
# Prepare the model for finetuning on the training batches.
# NOTE(review): vgg.finetune lives in vgg16.py and is not shown here; presumably
# it replaces the final layer of the network - confirm.
vgg.finetune(batches)

In [None]:
# Train for 5 epochs on `batches`; `val_batches` is presumably used for validation.
vgg.fit(batches, val_batches, nb_epoch=5)

##### Observation 
I notice that training is doing strange things, but this is not important, since we just need to recompute the representations of the last hidden layer - the new one inserted to replace the last hidden layer of the original VGG.

In [None]:
# Folder for everything produced from the finetuned model.
repath   = '../data/dl1/objnc/rep/dogscats/finetune/'
# Create the folder if needed, so the save cannot fail on a missing directory
# after the training above has already been paid for.
if not os.path.isdir(repath):
    os.makedirs(repath)
# Persist the finetuned weights.
vgg.model.save_weights(repath + 'w_ft.h5')

### Extract representations of the finetuned last hidden layer

In [None]:
# Rebuild the sub-models from the model as it is after finetuning, using the
# same layer positions (`idx`) as before.
base = vgg.model
layers,models,outshapes = get_models(idx,base)

In [None]:
# Restore the extraction settings: `batch_size` and `batches` were overwritten
# by the finetuning cells (batch size 64, training data).
datapath = path + TYPE + '/'
batch_size = 50
nbatches   = 40
# Iterator consumed by the extraction cell below.
batches = vgg.get_batches(datapath, batch_size=batch_size, shuffle=True, seed = SEED)
print('N.of batches ' + str(nbatches))

In [None]:
# Extract, save and compare the representations of the last selected layer of
# the finetuned model.
repath = '../data/dl1/objnc/rep/dogscats/finetune/'
# Fail early (before the slow extraction) instead of at np.save time.
if not os.path.isdir(repath):
    os.makedirs(repath)

D = []      # condensed distance vector (pdist output), kept in a list as for the other layers
D_sq = []   # square distance matrix
# Only the last entry of `idx` is re-extracted after finetuning.
layer = layers[-1]
model = models[-1]
print('Processing model : ' + layer.name)

# Build the iterator in this cell, as the non-finetuned extraction loop does,
# instead of consuming one left over from another cell: re-running this cell
# then starts from a fresh iterator rather than continuing mid-stream.
batches = vgg.get_batches(datapath, batch_size=batch_size, shuffle=True, seed=SEED)

n_images = nbatches * batch_size
ti = time()
# Preallocate the float64 result and fill it batch by batch, rather than keeping
# a list of all batches and converting it in one go.
R = np.empty((n_images,) + tuple(layer.output_shape[1:]), dtype=float)
for n in range(nbatches):
    imgs,_ = next(batches)
    rep = model.predict(imgs)
    # A short batch would otherwise be broadcast or fail with an obscure error.
    if len(rep) != batch_size:
        raise ValueError('Batch %d has %d images, expected %d'
                         % (n, len(rep), batch_size))
    R[n*batch_size:(n+1)*batch_size] = rep
print(R.shape)
np.save(repath +  layer.name + '_' + TYPE, R)

# compute distances on the flattened representations (one row per image)
R = R.reshape(n_images, -1)
d = scipy.spatial.distance.pdist(R, 'euclidean')
d_sq = scipy.spatial.distance.squareform(d, force='no', checks=True)
D.append(d)
D_sq.append(d_sq)
# Elapsed time covers prediction, saving and the distance computation.
te = time() - ti
print(te)

np.save(repath + 'D'    + '_' + TYPE, D)
np.save(repath + 'D_sq' + '_' + TYPE, D_sq)
print('Done.')