In [1]:
from __future__ import print_function

import os
import sys
import utils
import glob
from vgg16 import Vgg16
import numpy as np

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


##### Global constants

In [16]:
# Switch for the one-time setup cell: 1 creates the directory structure,
# 0 (the normal value) skips it.
DO_SETUP = 0

# Size settings for the validation and sample subsets.
# NOTE(review): neither value is referenced in the visible cells; presumably
# they were used by the image-moving step -- confirm.
VALIDATION_SET_SZ = 1000
SAMPLE_SET_SZ = 100

# Root of the data set. Swap in the commented line to work on the sample subset.
ddir = "/home/ubuntu/nbs/courses/deeplearning1/nbs/data/myredux"
#ddir = "/home/ubuntu/nbs/courses/deeplearning1/nbs/data/myredux/sample"

# Sub-directories of the root. Each keeps its trailing slash because later
# cells append file names to these strings directly.
resdir, testdir, traindir, validdir = (
    "{}/{}/".format(ddir, sub) for sub in ("results", "test", "train", "valid")
)

### Prepare directory structure

In [3]:
# One-time setup: build the directory tree under ddir and move the raw test
# images into a single class folder. Only runs when DO_SETUP is set to 1 in
# the constants cell. The %mkdir / %mv lines are IPython shell aliases;
# $name is replaced with the value of the Python variable of that name.
if (DO_SETUP == 1):
    print("DO_SETUP=1")
    # Directory where weights and prediction arrays are saved by later cells.
    %mkdir -p $resdir
    # One sub-directory per class (cats, dogs) for the training set.
    %mkdir -p $traindir/cats $traindir/dogs
    %mkdir -p $ddir/sample $ddir/valid
    # Same per-class layout for the validation set.
    %mkdir -p $ddir/valid/cats $ddir/valid/dogs
    # The sample tree mirrors the full layout (train/valid/test/results).
    %mkdir -p $ddir/sample/train $ddir/sample/train/cats $ddir/sample/train/dogs $ddir/sample/test $ddir/sample/results 
    %mkdir -p $ddir/sample/valid $ddir/sample/valid/cats $ddir/sample/valid/dogs
    # Test images have no labels, so they all go under a single "unknown" folder.
    # NOTE(review): presumably required because the batch loader expects one
    # sub-directory per class -- confirm against Vgg16.get_batches.
    %mkdir -p $ddir/test/unknown $ddir/sample/test/unknown
    # Not repeatable: once the .jpg files have been moved, a second run finds
    # nothing matching the glob in $testdir.
    %mv $testdir/*.jpg $testdir/unknown

DO_SETUP=1


In [5]:
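# Move images to appropriate directories
# NOTE(review): this cell contains no commands; the step that populated the
# train/, valid/ and sample/ directories is not recorded in this notebook.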


-

### Fine-tune the existing VGG16 model to classify only the 2 classes in the data set

In [17]:
# Mini-batch size reused by every batch generator and prediction call below.
batchsz = 64

vgg = Vgg16()

# Both generators are created with the same options; training first, then
# validation, so the "Found N images" messages keep their order.
batch_opts = {"batch_size": batchsz}
train_batches = vgg.get_batches(traindir, **batch_opts)
valid_batches = vgg.get_batches(validdir, **batch_opts)

# Adapt the model to the classes present in the training batches.
# NOTE(review): exact behaviour lives in Vgg16.finetune -- confirm there.
vgg.finetune(train_batches)

Found 23000 images belonging to 2 classes.
Found 2000 images belonging to 2 classes.


### Train the finetuned model

In [18]:
# Number of training passes; weights are checkpointed after each one.
nb_epoch = 3

for e in range(nb_epoch):
    print("Running Epoch {:d}".format(e))
    # Each call is reported as "Epoch 1/1" in the cell output, i.e. one pass.
    vgg.fit(train_batches, valid_batches)
    # Checkpoint name carries the zero-based pass index: ft0.h5, ft1.h5, ...
    wfname = "{}ft{:d}.h5".format(resdir, e)
    vgg.model.save_weights(wfname)

Running Epoch 0
Epoch 1/1
Running Epoch 1
Epoch 1/1
Running Epoch 2
Epoch 1/1


### Test model results against validation data

In [74]:
# Restore the checkpoint written after pass index 1 of the training loop,
# which is the one treated as "best" for evaluation and submission.
# NOTE(review): the execution counts show this cell (74) was run after the
# validation cells that follow it (63-65); confirm the confusion matrix was
# actually produced with these weights.
best_wfname = "".join([resdir, "ft1.h5"])
vgg.model.load_weights(best_wfname)

In [63]:
# Run the model over the validation directory.
# Note: this rebinds valid_batches, which the fine-tuning cell created with
# vgg.get_batches; from here on it is the object returned by vgg.test.
(valid_batches, valid_preds) = vgg.test(validdir, batch_size=batchsz)

Found 2000 images belonging to 2 classes.


In [64]:
# Reference labels as recorded on the validation batch object.
expected_valid_preds = valid_batches.classes

# Turn column 0 into a hard 0/1 label: values of column 0 above 0.5 round to
# 0, values below 0.5 round to 1.
# NOTE(review): presumably column 0 is the 'cat' probability, so 1 means 'dog' -- confirm.
got_valid_preds = (1 - valid_preds[:, 0]).round()

#### Get confusion matrix on validation set

In [65]:
from sklearn.metrics import confusion_matrix

# Rows are the expected labels, columns are the predicted labels.
cm = confusion_matrix(
    y_true=expected_valid_preds,
    y_pred=got_valid_preds,
)
cm

array([[986,  14],
       [ 28, 972]])

### Run predictions on test data using finetuned model

In [75]:
# Run the model over the unlabeled test directory; the batch object is kept
# because its file names are needed to build the submission ids.
(test_batches, test_preds) = vgg.test(testdir, batch_size=batchsz)

Found 12500 images belonging to 1 classes.


In [76]:
# Saving test results
utils.save_array(resdir + "test_fnames.dat", test_batches.filenames)
utils.save_array(resdir + "test_preds.dat", test_preds)

In [77]:
# Retrieve test results
fnames=utils.load_array(resdir + "test_fnames.dat")
preds=utils.load_array(resdir + "test_preds.dat")

In [78]:
# Column 1 holds the probabilities indicating a 'dog' prediction.
isdog = preds[:, 1]

##### Clip predictions away from 0 and 1, because the log-loss metric heavily penalizes confident wrong predictions

In [83]:
# Keep every probability inside [0.001, 0.999]. Re-running this cell is
# harmless: values already inside the range are left unchanged.
isdog = np.clip(isdog, 0.001, 0.999)

In [84]:
# Numeric image id for every test file, e.g. "unknown/123.jpg" -> 123.
# The id is taken from the file's base name (text before the first dot)
# rather than from a fixed character offset, so the result no longer depends
# on the class sub-directory being named exactly "unknown".
imgids = np.array([int(os.path.basename(f).split('.', 1)[0]) for f in fnames])

In [85]:
# Two-column table: image id in column 0, clipped probability in column 1.
subm = np.column_stack((imgids, isdog))

In [86]:
# The file name records which checkpoint (ft1) and clip bound (0.001) were used.
submission_file_name = "{}/submission_using_ft1_clip0.001.csv".format(ddir)

# One "id,label" row per image; comments='' stops numpy from prefixing the
# header line with '#'.
np.savetxt(
    submission_file_name,
    subm,
    fmt='%d,%.5f',
    header='id,label',
    comments='',
)