In [6]:
# set up Python environment: numpy for numerical routines, and matplotlib for plotting
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import time

# display plots in this notebook
%matplotlib inline

# set display defaults
plt.rcParams['figure.figsize'] = (10, 10)        # large images
plt.rcParams['image.interpolation'] = 'nearest'  # don't interpolate: show square pixels
plt.rcParams['image.cmap'] = 'gray'  # use grayscale output rather than a (potentially misleading) color heatmap

# The caffe module needs to be on the Python path;
#  we'll add it here explicitly.
import sys
caffe_root = '/home/marek/caffe/caffe/'  # this file should be run from {caffe_root}/examples (otherwise change this line)
sys.path.insert(0, caffe_root + 'python')

import caffe
# If you get "No module named _caffe", either you have not built pycaffe or you have the wrong path.

import os
if os.path.isfile(caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'):
    print 'CaffeNet found.'
else:
    print 'Downloading pre-trained CaffeNet model...'
    !../scripts/download_model_binary.py ../models/bvlc_reference_caffenet
    

model_def = caffe_root + 'models/bvlc_reference_caffenet/deploy.prototxt'
model_weights = caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'

net = caffe.Net(model_def,      # defines the structure of the model
                model_weights,  # contains the trained weights
                caffe.TEST)     # use test mode (e.g., don't perform dropout)

# load the mean ImageNet image (as distributed with Caffe) for subtraction
mu = np.load(caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy')
mu = mu.mean(1).mean(1)  # average over pixels to obtain the mean (BGR) pixel values
print 'mean-subtracted values:', zip('BGR', mu)

# create transformer for the input called 'data'
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})

transformer.set_transpose('data', (2,0,1))  # move image channels to outermost dimension
transformer.set_mean('data', mu)            # subtract the dataset-mean value in each channel
transformer.set_raw_scale('data', 255)      # rescale from [0, 1] to [0, 255]
transformer.set_channel_swap('data', (2,1,0))  # swap channels from RGB to BGR

# load ImageNet labels
labels_file = caffe_root + 'data/ilsvrc12/synset_words.txt'
if not os.path.exists(labels_file):
    !../data/ilsvrc12/get_ilsvrc_aux.sh
    
labels = np.loadtxt(labels_file, str, delimiter='\t')
labels = np.array(labels)

# Run Caffe on the GPU; with several GPUs installed, index 0 picks the first one.
gpu_id = 0
caffe.set_device(gpu_id)
caffe.set_mode_gpu()

CaffeNet found.
mean-subtracted values: [('B', 104.0069879317889), ('G', 116.66876761696767), ('R', 122.6789143406786)]


In [8]:
def flatten(l):
    """Return a flat list with the leaves of an arbitrarily nested list.

    Only objects whose type is exactly ``list`` are descended into; anything
    else (tuples, strings, numbers, list subclasses, ...) is kept as a leaf.
    A non-list argument is returned wrapped in a one-element list.

    Empty lists, at the top level or nested, contribute nothing to the result.
    The traversal is iterative, so neither long nor deeply nested inputs can
    hit the interpreter's recursion limit.
    """
    if type(l) is not list:
        return [l]

    flat = []
    pending = [iter(l)]  # stack of partially consumed iterators, innermost last
    while pending:
        for item in pending[-1]:
            if type(item) is list:
                # Descend now; the parent iterator resumes after this one is done.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Innermost iterator exhausted without finding another nested list.
            pending.pop()
    return flat


def get_labels(image_path):
    """Classify one image and return its five most probable labels as text.

    The image is loaded with caffe.io, preprocessed by the module-level
    ``transformer``, and pushed through the module-level ``net``. The result
    has the form ``"<prob>:<label>;<prob>:<label>;..."``, most probable first.
    Each label is the matching entry of the module-level ``labels`` array with
    its first space-separated token dropped (presumably the synset id - verify
    against the labels file) and the remaining tokens joined by commas.
    """
    picture = caffe.io.load_image(image_path)
    # The assignment broadcasts the preprocessed image over the whole input blob.
    net.blobs['data'].data[...] = transformer.preprocess('data', picture)
    net.forward()

    # Probabilities for the first item of the batch.
    probs = net.blobs['prob'].data[0]
    # Indices of the five largest probabilities, in descending order.
    best = probs.argsort()[::-1][:5]

    entries = [
        str(prob) + ":" + ",".join(label.split(" ")[1:])
        for prob, label in zip(probs[best].tolist(), labels[best].tolist())
    ]
    return ";".join(entries)

def absoluteFilePaths(directory):
    """Lazily yield ``(file_id, absolute_path)`` for every file under ``directory``.

    The tree is walked recursively with os.walk. ``file_id`` is the part of the
    file name before its first dot, so ``"12.tar.gz"`` gives ``"12"`` and a
    name starting with a dot gives an empty string.
    """
    for root, _subdirs, names in os.walk(directory):
        for name in names:
            stem = name.partition(".")[0]
            full_path = os.path.abspath(os.path.join(root, name))
            yield (stem, full_path)

# Classify every image in the selected image folders and write one CSV of
# (id, labels) rows per folder.
for i in [5,6]:
    for j in [8,9]:
        mpath = '/home/marek/kaggle_ads/input/images/Images_%d/%d%d/' % (i, i, j)
        nm = 'input/labels/labels%d%d.csv' % (i, j)
        # An existing CSV marks the folder as already done, so re-running the
        # cell resumes where a previous run stopped.
        if os.path.exists(nm):
            print("Skipping " + nm)
            continue

        # Create the output folder before the long run, so a missing directory
        # cannot throw the results away at the final to_csv call.
        out_dir = os.path.dirname(nm)
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        plist = []
        # Descriptive names instead of `iter` / `id`, which would overwrite the
        # builtins of the same name for the rest of the notebook session.
        n_files = 0
        start_time = time.time()

        for (image_id, path) in absoluteFilePaths(mpath):
            n_files += 1
            try:
                image_labels = get_labels(path)
            except Exception as err:
                # Report the failure and record nothing for this image; the
                # labels of the previously processed image must not be reused.
                # `Exception` leaves Ctrl-C (KeyboardInterrupt) able to stop the run.
                print("skipping %s (%s)" % (path, err))
            else:
                plist.append((image_id, image_labels))
            if n_files % 1000 == 0:
                print("Iteration of %d%d : %dk" % (i, j, n_files // 1000))

        if n_files == 0:
            # Nothing to time or save; writing an empty CSV would also make
            # later runs skip this folder for good.
            print("No images found in " + mpath)
            continue

        elapsed = time.time() - start_time
        df = pd.DataFrame(plist, columns=['id', 'labels'])
        print("Saving: " + nm)
        print('Evaluation took: {} minutes'.format(round(elapsed/60, 2)))
        print('Time for a single image: %d ms' % (elapsed*1000 / n_files))
        df.to_csv(nm)

Iteration of 58 : 1k
Iteration of 58 : 2k
Iteration of 58 : 3k
Iteration of 58 : 4k
Iteration of 58 : 5k
Iteration of 58 : 6k
Iteration of 58 : 7k
Iteration of 58 : 8k
Iteration of 58 : 9k
Iteration of 58 : 10k
Iteration of 58 : 11k
Iteration of 58 : 12k
Iteration of 58 : 13k
Iteration of 58 : 14k
Iteration of 58 : 15k
Iteration of 58 : 16k
Iteration of 58 : 17k
Iteration of 58 : 18k
Iteration of 58 : 19k
Iteration of 58 : 20k
Iteration of 58 : 21k
Iteration of 58 : 22k
Iteration of 58 : 23k
Iteration of 58 : 24k
Iteration of 58 : 25k
Iteration of 58 : 26k
Iteration of 58 : 27k
Iteration of 58 : 28k
Iteration of 58 : 29k
Iteration of 58 : 30k
Iteration of 58 : 31k
Iteration of 58 : 32k
Iteration of 58 : 33k
Iteration of 58 : 34k
Iteration of 58 : 35k
Iteration of 58 : 36k
Iteration of 58 : 37k
Iteration of 58 : 38k
Iteration of 58 : 39k
Iteration of 58 : 40k
Iteration of 58 : 41k
Iteration of 58 : 42k
Iteration of 58 : 43k
Iteration of 58 : 44k
Iteration of 58 : 45k
Iteration of 58 : 4