# Enter Dogs vs. Cats Redux

In [3]:
import os
# The notebook's launch directory is the lesson home; the competition data
# lives in data/redux beneath it.
LESSON_HOME_DIR = os.getcwd()
current_dir = LESSON_HOME_DIR
DATA_HOME_DIR = '{}/data/redux'.format(LESSON_HOME_DIR)

In [None]:
import sys

# Allow relative imports: add the parent of sys.path[0] to the module search
# path (presumably so the `utils` package imported below resolves -- confirm
# against the repository layout).
sys.path.insert(1, os.path.join(sys.path[0], '..'))

from utils.utils import *
from utils.vgg16 import Vgg16

In [None]:
%matplotlib inline

In [None]:
# Images per training batch; validation and test use twice this value.
batch_size = 64

To do:
1. create validation set and sample
2. move to separate dirs for each set
3. finetune and train
4. submit

## Create validation set and sample

In [None]:
#Create directories
%cd $DATA_HOME_DIR
%mkdir valid
%mkdir results
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results
%mkdir -p test/unknown

In [None]:
%cd $DATA_HOME_DIR/train

In [None]:
# Move a random subset of up to 2000 images from the current directory into valid/.
# Slicing (rather than indexing range(2000)) avoids an IndexError when fewer
# than 2000 .jpg files match, e.g. when the cell is re-run after the images
# have already been moved into class sub-directories.
g = glob('*.jpg')
shuf = np.random.permutation(g)
for fname in shuf[:2000]: os.rename(fname, DATA_HOME_DIR+'/valid/' + fname)

In [None]:
from shutil import copyfile

In [None]:
# Copy a random subset of up to 200 remaining training images into sample/train/.
# Slicing keeps the cell from raising IndexError when fewer than 200 files match.
g = glob('*.jpg')
shuf = np.random.permutation(g)
for fname in shuf[:200]: copyfile(fname, DATA_HOME_DIR+'/sample/train/' + fname)

In [None]:
%cd $DATA_HOME_DIR/valid

In [None]:
# Copy a random subset of up to 50 validation images into sample/valid/.
# Slicing keeps the cell from raising IndexError when fewer than 50 files match.
g = glob('*.jpg')
shuf = np.random.permutation(g)
for fname in shuf[:50]: copyfile(fname, DATA_HOME_DIR+'/sample/valid/' + fname)

## Move to separate dirs for each set

#### Create cats/dogs directories

In [None]:
%cd $DATA_HOME_DIR/sample/train
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

%cd $DATA_HOME_DIR/sample/valid
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

%cd $DATA_HOME_DIR/valid
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

%cd $DATA_HOME_DIR/train
%mkdir cats
%mkdir dogs
%mv cat.*.jpg cats/
%mv dog.*.jpg dogs/

In [None]:
# Create single 'unknown' class for test set: move every test image into the
# test/unknown/ sub-directory created by the directory-setup cell.
%cd $DATA_HOME_DIR/test
%mv *.jpg unknown/

## Finetune and train

In [4]:
%cd $DATA_HOME_DIR

# Root for train/, valid/ and results/. To work on the small sample set
# instead, set this to DATA_HOME_DIR + '/sample/'.
path = DATA_HOME_DIR + '/'
# Root for the test/ directory (also used for the validation pass in the
# visualization section); it does not follow `path` to the sample set.
test_path = DATA_HOME_DIR + '/'

/home/ubuntu/workplace/deep_learning_course/lesson1/data/redux


In [None]:
# Instantiate the Vgg16 wrapper imported from utils.vgg16.
vgg = Vgg16()

In [None]:
# Batch iterators over the train/ and valid/ directories under `path`;
# the validation iterator uses twice the training batch size.
batches = vgg.get_batches(path+'train', batch_size=batch_size)
val_batches = vgg.get_batches(path+'valid', batch_size=batch_size*2)
# Adapt the model to these batches (see Vgg16.finetune for what is changed).
vgg.finetune(batches)

In [None]:
# Train for 15 epochs, passing the validation batches alongside the training batches.
vgg.fit(batches, val_batches, nb_epoch=15)

In [None]:
# Save the weights to results/ft1.h5; the visualization section reloads this file.
vgg.model.save_weights(path+'results/ft1.h5')

#### Run a few more epochs...

In [None]:
#vgg.fit(batches, val_batches, nb_epoch=1)

In [None]:
#vgg.model.save_weights(path+'results/ft2.h5')

In [None]:
#vgg.model.optimizer.lr = 0.01

In [None]:
#vgg.fit(batches, val_batches, nb_epoch=1)

In [None]:
#vgg.model.save_weights(path+'results/ft2_1.h5')

In [None]:
#vgg.fit(batches, val_batches, nb_epoch=1)

In [None]:
#vgg.model.save_weights(path+'results/ft2_2.h5')

## Submit

In [None]:
# Run the model over the test set. test_path already ends with '/', so no
# extra separator is added (consistent with the test_path+'valid' call below).
# NOTE: this rebinds `batches` from the training iterator to the test iterator.
batches, preds = vgg.test(test_path+'test', batch_size = batch_size*2)

In [None]:
# File names reported by the test iterator (presumably in the same order as `preds` -- confirm).
filenames = batches.filenames

In [None]:
preds[:5]

In [None]:
filenames[:5]

In [None]:
# Store the test predictions and file names under results/ so the submission
# can be rebuilt without re-running the model.
save_array(path+'results/test_preds.dat', preds)
save_array(path+'results/filenames.dat', filenames)

In [None]:
# Reload the arrays written by the previous cell.
preds = load_array(path+'results/test_preds.dat')
filenames = load_array(path+'results/filenames.dat')

In [None]:
%cd ..

In [None]:
from PIL import Image
# Build the path from test_path instead of relying on the working directory:
# after the preceding `%cd ..` a relative 'test/...' path would be resolved
# against the parent of DATA_HOME_DIR rather than the data directory itself.
Image.open(test_path + 'test/' + filenames[2])

In [None]:
# Column 1 of the predictions is used as the dog probability
# (assumes class order cats=0, dogs=1 -- confirm against the iterator's class_indices).
isdog = preds[:,1] #grab dogs column
isdog[:5]

In [None]:
# Derive the numeric image id from each file name ("<dir>/<id>.jpg").
# basename() drops the directory part, so the code no longer depends on the
# prefix being exactly 8 characters long ("unknown/"); the id is the text
# before the first '.' of the file name.
ids = [int(os.path.basename(f).split('.')[0]) for f in filenames]
ids[:5]

In [None]:
# Two-column table: image id in column 0, dog probability in column 1.
subm = np.column_stack((ids, isdog))
subm[:5]

In [None]:
# Write "id,label" rows: integer id, probability with five decimals.
# comments='' stops numpy from prefixing the header line with '# '.
np.savetxt(DATA_HOME_DIR+'/subm98.csv', subm, fmt='%d,%.5f', header='id,label', comments='')

In [None]:
# Return to the lesson directory so the relative path below resolves, then
# render a link to the submission file.
%cd $LESSON_HOME_DIR
from IPython.display import FileLink
FileLink('data/redux/subm98.csv')

## Visualizing results

Keras' *fit()* function conveniently shows us the value of the loss function, and the accuracy, after every epoch ("*epoch*" refers to one full run through all training examples). The most important metrics for us to look at are for the validation set, since we want to check for over-fitting. 

- **Tip**: with our first model we should try to overfit before we start worrying about how to reduce over-fitting - there's no point even thinking about regularization, data augmentation, etc if you're still under-fitting! (We'll be looking at these techniques shortly).

As well as looking at the overall metrics, it's also a good idea to look at examples of each of:
1. A few correct labels at random
2. A few incorrect labels at random
3. The most correct labels of each class (ie those with highest probability that are correct)
4. The most incorrect labels of each class (ie those with highest probability that are incorrect)
5. The most uncertain labels (ie those with probability closest to 0.5).

Let's see what we can learn from these examples. (In general, this is a particularly useful technique for debugging problems in the model. However, since this model is so simple, there may not be too much to learn at this stage.)

Calculate predictions on validation set, so we can find correct and incorrect examples:

In [None]:
# Restore the weights saved after the initial fine-tuning run.
vgg.model.load_weights(path+'results/ft1.h5')

In [None]:
# Run the model over the validation directory.
# NOTE: this rebinds `val_batches` to the iterator returned by vgg.test.
val_batches, probs = vgg.test(test_path+'valid', batch_size = batch_size)

In [None]:
# Class labels and file names reported by the validation iterator.
# NOTE: `filenames` previously held the test-set file names; it now refers to validation files.
labels = val_batches.classes
filenames = val_batches.filenames

In [None]:
# Keep only column 0, which the cells below treat as the cat probability.
# The ndim guard makes the cell safe to re-run: without it a second run would
# try to index an array that is already one-dimensional and fail.
if probs.ndim == 2: probs = probs[:,0]
# Predicted label: 1 (dog) when the cat probability is below 0.5, otherwise 0 (cat).
preds = np.round(1-probs)
probs[:8]

In [None]:
preds[:8]

In [None]:
# How many images each of the visualization cells below displays.
n_view = 4

Helper function to plot images by index in the validation set:

In [None]:
def plots_idx(idx, titles=None):
    """Plot the validation images found at the given positions.

    idx:    iterable of integer positions into the global `filenames` list.
    titles: optional per-image titles, forwarded unchanged to `plots`.
    """
    valid_dir = test_path + 'valid/'
    imgs = []
    for i in idx:
        imgs.append(image.load_img(valid_dir + filenames[i]))
    plots(imgs, titles=titles)

In [None]:
#1. A few correct labels at random
# Positions where the prediction matches the label; titles show the cat probability.
correct = np.where(preds==labels)[0]
idx = permutation(correct)[:n_view]
plots_idx(idx, probs[idx])

In [None]:
#2. A few incorrect labels at random
# Positions where the prediction differs from the label; titles show the cat probability.
incorrect = np.where(preds!=labels)[0]
idx = permutation(incorrect)[:n_view]
plots_idx(idx, probs[idx])

In [None]:
#3. The images we were most confident were cats, and are actually cats
# `probs` is treated as the cat probability, so sort descending ([::-1]) to put
# the most confident cat predictions first.
correct_cats = np.where((preds==0) & (preds==labels))[0]
most_correct_cats = np.argsort(probs[correct_cats])[::-1][:n_view]
plots_idx(correct_cats[most_correct_cats], probs[correct_cats][most_correct_cats])

In [None]:
#3. The images we were most confident were dogs, and are actually dogs
# `probs` is treated as the cat probability, so the most confident dog
# predictions are the smallest values (ascending sort).
correct_dogs = np.where((preds==1) & (preds==labels))[0]
most_correct_dogs = np.argsort(probs[correct_dogs])[:n_view]
plots_idx(correct_dogs[most_correct_dogs], probs[correct_dogs][most_correct_dogs])

In [None]:
#4. The images we were most confident were cats, but are actually dogs
# Descending sort on the cat probability: the most confident wrong "cat" calls first.
incorrect_cats = np.where((preds==0) & (preds!=labels))[0]
most_incorrect_cats = np.argsort(probs[incorrect_cats])[::-1][:n_view]
plots_idx(incorrect_cats[most_incorrect_cats], probs[incorrect_cats][most_incorrect_cats])

In [None]:
#4. The images we were most confident were dogs, but are actually cats
# Ascending sort on the cat probability: the most confident wrong "dog" calls first.
incorrect_dogs = np.where((preds==1) & (preds!=labels))[0]
most_incorrect_dogs = np.argsort(probs[incorrect_dogs])[:n_view]
plots_idx(incorrect_dogs[most_incorrect_dogs], probs[incorrect_dogs][most_incorrect_dogs])

In [None]:
#5. The most uncertain labels (ie those with probability closest to 0.5).
# Order all validation images by distance from 0.5, then show the first n_view.
# The titles are sliced to the same n_view entries as the images, matching the
# other visualization cells (previously the whole sorted array was passed).
most_uncertain = np.argsort(np.abs(probs-0.5))
plots_idx(most_uncertain[:n_view], probs[most_uncertain[:n_view]])

Perhaps the most common way to analyze the result of a classification model is to use a [confusion matrix](http://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/). Scikit-learn has a convenient function we can use for this purpose:

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix with true labels first, predicted labels second.
cm = confusion_matrix(labels, preds)

We can just print out the confusion matrix, or we can show a graphical view (which is mainly useful for dependent variables with a larger number of categories).

In [None]:
# Graphical view of the matrix, using the validation iterator's class_indices for the class names.
plot_confusion_matrix(cm, val_batches.class_indices)

In [None]:
# Rebuild the submission table with the dog probability limited to [0.05, 0.95]
# (presumably to soften the penalty for confident mistakes -- confirm against the scoring metric).
imgIds = subm[:,0]
isdog = subm[:,1].clip(0.05, 0.95)
newsubmission = np.column_stack((imgIds, isdog))
newsubmission[:5]

In [None]:
# Write the clipped submission in the same "id,label" format as subm98.csv.
# comments='' stops numpy from prefixing the header line with '# '.
np.savetxt(DATA_HOME_DIR+'/subm2.csv', newsubmission, fmt='%d,%.5f', header='id,label', comments='')

In [None]:
# Return to the lesson directory so the relative path resolves; FileLink was
# imported in the earlier submission cell.
%cd $LESSON_HOME_DIR
FileLink('data/redux/subm2.csv')