In [1]:
from __future__ import division,print_function
import os, sys
os.chdir("/home/ubuntu/fastai/")
print("current dir:", os.getcwd() )
#import modules
from utils import *
from vgg16 import Vgg16

from glob import glob
import numpy as np
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt

#Instantiate plotting tool
#In Jupyter notebooks, you will need to run this command before doing any plotting
%matplotlib inline

current dir: /home/ubuntu/fastai


 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5110)
Using Theano backend.


In [2]:
#Create references to important directories we will use over and over
DATA_HOME_DIR = "/home/ubuntu/fastai/data/dogscats/"
os.chdir(DATA_HOME_DIR)
print("current dir:", os.getcwd() )

#Set path to sample/ path if desired
test_path = DATA_HOME_DIR + 'test1/' #We use all the test data
results_path=DATA_HOME_DIR + 'results/'
train_path=DATA_HOME_DIR + 'train/'
valid_path=DATA_HOME_DIR + 'valid/'
print ("test_path:", test_path)
print ("results_path:", results_path)
print ("train_path:", train_path)
print ("valid_path:", valid_path)

current dir: /home/ubuntu/fastai/data/dogscats
test_path: /home/ubuntu/fastai/data/dogscats/test1/
results_path: /home/ubuntu/fastai/data/dogscats/results/
train_path: /home/ubuntu/fastai/data/dogscats/train/
valid_path: /home/ubuntu/fastai/data/dogscats/valid/


In [7]:
# As large as you can, but no larger than 64 is recommended. 
# If you have an older or cheaper GPU, you'll run out of memory, so will have to decrease this.
batch_size=64
no_of_epochs=3

In [8]:
vgg = Vgg16()

In [9]:
#Finetune the model
print ("start")
batches = vgg.get_batches(train_path, batch_size=batch_size)
val_batches = vgg.get_batches(valid_path, batch_size=batch_size*2)
vgg.finetune(batches)

#Not sure if we set this for all fits
vgg.model.optimizer.lr = 0.01

start
Found 23000 images belonging to 2 classes.
Found 2000 images belonging to 2 classes.


In [10]:
#Notice we are passing in the validation dataset to the fit() method
#For each epoch we test our model against the validation set
#latest_weights_filename = None
for epoch in range(no_of_epochs):
    print ("Running epoch: %d" % epoch)
    vgg.fit(batches, val_batches, nb_epoch=1)
    #latest_weights_filename = 'ft%d.h5' % epoch
    #vgg.model.save_weights(results_path+latest_weights_filename)
print ("Completed %s fit operations" % no_of_epochs)

Running epoch: 0
Epoch 1/1
Running epoch: 1
Epoch 1/1
Running epoch: 2
Epoch 1/1
Completed 3 fit operations


In [11]:
print ("start")
batches, preds = vgg.test(test_path, batch_size = batch_size*2)

start
Found 12500 images belonging to 1 classes.


In [None]:
val_batches, probs = vgg.test(valid_path, batch_size = batch_size)

Found 2000 images belonging to 2 classes.


In [12]:
filenames = val_batches.filenames
expected_labels = val_batches.classes #0 or 1

In [17]:
print (preds.shape)
print (preds[0:10, 0])

(12500, 2)
[ 1.  0.  0.  1.  1.  0.  1.  1.  0.  0.]


In [14]:
#Round our predictions to 0/1 to generate labels
our_predictions = probs[:,0]
our_labels = np.round(1-our_predictions)
#NB: later we adjust this method to 0.05 and 0.95 for better scoring in kaggle

In [22]:
print (our_predictions.shape)
print ("our_predictions:", our_predictions[0:10])
print ("our_labels:", our_labels[0:10])
print (expected_labels.shape)

(12500,)
our_predictions: [ 1.  0.  0.  1.  1.  0.  1.  1.  0.  0.]
our_labels: [ 0.  1.  1.  0.  0.  1.  0.  0.  1.  1.]
(2000,)


In [15]:
correct = np.where(our_labels==expected_labels)[0]
print ("Found %d correct labels" % len(correct))
incorrect = np.where(our_labels!=expected_labels)[0]
print ("Found %d incorrect labels" % len(incorrect))
correct_cats = np.where((our_labels==0) & (our_labels==expected_labels))[0]
print ("Found %d confident correct cats labels" % len(correct_cats))
correct_dogs = np.where((our_labels==1) & (our_labels==expected_labels))[0]
print ("Found %d confident correct dogs labels" % len(correct_dogs))
incorrect_cats = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
print ("Found %d incorrect cats" % len(incorrect_cats))
incorrect_dogs = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
print ("Found %d incorrect dogs" % len(incorrect_dogs))


Found 0 correct labels
Found 1 incorrect labels
Found 0 confident correct cats labels
Found 0 confident correct dogs labels
Found 6126 incorrect cats
Found 6374 incorrect dogs


  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until
  """
  import sys
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(expected_labels, our_labels)

In [None]:
plot_confusion_matrix(cm, val_batches.class_indices)

In [None]:
#Grab the dog prediction column
isdog = preds[:,1]

In [None]:
#Extract imageIds from the filenames in our test/unknown directory 
filenames = batches.filenames
ids = np.array([int(f[8:f.find('.')]) for f in filenames])

In [None]:
subm = np.stack([ids,isdog], axis=1)
subm[:5]

In [None]:
submission_file_name = 'submission2.csv'
print ("submission_file_name:", submission_file_name)
np.savetxt(results_path+submission_file_name, subm, fmt='%d,%.5f', header='id,label', comments='')

In [None]:
from IPython.display import FileLink
%cd $results_path
FileLink(submission_file_name)