In [1]:
import functools
import io
import itertools
import os
import pickle
import threading
import time

import bson
import keras
import numpy as np
import pandas as pd
from keras import backend as K
from keras.applications.inception_v3 import InceptionV3
from keras.applications.vgg16 import VGG16
from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Model
from keras.preprocessing import image
from scipy.misc import imread
from sklearn.preprocessing import LabelEncoder


Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
def grouper(n, iterable):
    '''
    Split an iterable into consecutive tuples of at most n items.

    Chunks are produced lazily. The final chunk holds whatever is left
    over, so it may be shorter than n. An empty iterable yields nothing.
    '''
    source = iter(iterable)
    # Two-argument iter(): keep slicing until islice hands back the empty
    # tuple, which means the source has been exhausted.
    for piece in iter(lambda: tuple(itertools.islice(source, n)), ()):
        yield piece
        
class threadsafe_iter:
    """
    Wrap an iterable so that its items can be pulled from several threads.

    Every call to ``__next__`` is made while holding a lock, so only one
    thread at a time advances the underlying iterator.

    Parameters
    ----------
    it : iterator or iterable
        Iterators and generators are used as given. Any other iterable
        (list, range, ...) is converted with ``iter()`` first.
    """
    def __init__(self, it):
        # Objects that already expose __next__ are kept untouched so existing
        # callers see no change; plain iterables would otherwise fail on the
        # first __next__ call.
        self.it = it if hasattr(it, '__next__') else iter(it)
        self.lock = threading.Lock()

    def __iter__(self):
        return self

    def __next__(self):
        # StopIteration from the wrapped iterator propagates unchanged; the
        # lock is released by the with-block either way.
        with self.lock:
            return next(self.it)

def threadsafe_generator(f):
    """
    A decorator that takes a generator function and makes it thread-safe.

    Each call to the decorated function creates the generator as usual and
    returns it wrapped in ``threadsafe_iter``.

    ``functools.wraps`` copies the name, docstring and other metadata of
    ``f`` onto the wrapper, so ``help()`` and tracebacks refer to the
    decorated function rather than to an anonymous ``g``.
    """
    @functools.wraps(f)
    def g(*a, **kw):
        return threadsafe_iter(f(*a, **kw))
    return g

@threadsafe_generator
def get_features_label(documents, batch_size=32, return_labels=True):
    '''
    Yield batches of decoded images, optionally with their encoded labels.

    Parameters
    ----------
    documents : iterable of dict-like
        Each document must provide an 'imgs' sequence whose first entry has
        a 'picture' field with the encoded image bytes. A 'category_id'
        field is optional. Only the first image of a document is used.
    batch_size : int
        Maximum number of documents per batch; the last batch may be smaller.
    return_labels : bool
        If True yield ``(X, y)`` tuples, otherwise yield ``X`` alone.

    Yields
    ------
    X : np.ndarray
        The decoded images of the batch, cast to float32 and divided by 255.
    y : np.ndarray
        Only when ``return_labels`` is True. One entry per document: the
        result of ``labelencoder.transform([category_id])``, or None when
        the document has no (truthy) 'category_id'.

    Notes
    -----
    Labels are encoded with the module-level ``labelencoder``. It is only
    touched when ``return_labels`` is True, so label-free feature extraction
    neither pays for the encoding nor fails on categories the encoder has
    not seen.
    '''
    for batch in grouper(batch_size, documents):
        images = []
        labels = []

        for document in batch:
            first_img = document.get('imgs')[0]
            data = io.BytesIO(first_img.get('picture', None))
            # NOTE(review): scipy.misc.imread is deprecated and absent from
            # recent SciPy releases - confirm the pinned SciPy version.
            im = imread(data).astype('float32') / 255.0
            images.append(im)

            if not return_labels:
                # The caller discards labels, so skip the encoder entirely.
                continue

            category = document.get('category_id', '')
            labels.append(labelencoder.transform([category]) if category else None)

        if return_labels:
            yield np.array(images), np.array(labels)
        else:
            yield np.array(images)

In [3]:
# Load the cached label encoder and category table, or build and cache them.
# The cache is reused only when BOTH files exist: the pickle is written before
# the CSV, so an interrupted run can leave the pickle behind on its own.
if os.path.isfile('labelencoder.pkl') and os.path.isfile('categories.csv'):
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache
    # file that this notebook produced itself.
    with open('labelencoder.pkl', 'rb') as f:
        labelencoder = pickle.load(f)
    # The CSV is written with the DataFrame index as its first column; read it
    # back as the index so both branches give the same columns ('id', 'cat').
    categories = pd.read_csv('categories.csv', index_col=0)

else:
    # Get the category ID for each document in the training set. The file is
    # closed as soon as the last document has been decoded.
    with open('train_example.bson', 'rb') as bson_file:
        categories = [(d['_id'], d['category_id'])
                      for d in bson.decode_file_iter(bson_file)]
    categories = pd.DataFrame(categories, columns=['id', 'cat'])

    # Create a label encoder for all the labels found
    labelencoder = LabelEncoder()
    labelencoder.fit(categories['cat'].unique())

    with open('labelencoder.pkl', 'wb') as f:
        pickle.dump(labelencoder, f)

    categories.to_csv('categories.csv')

In [13]:
# VGG16 with ImageNet weights and without its top (classifier) layers, used
# here as a feature extractor.
model = VGG16(weights='imagenet', include_top=False)

# NOTE(review): get_features_label scales pixels to [0, 1]; the ImageNet
# weights presumably expect inputs prepared with
# keras.applications.vgg16.preprocess_input - confirm before relying on
# these features.

predictions = []

# Stream the documents from disk; the file is closed once every batch has
# been consumed.
with open('train_example.bson', 'rb') as bson_file:
    generator = get_features_label(bson.decode_file_iter(bson_file), return_labels=False)

    for batch in generator:
        print(batch.shape)
        output = model.predict(batch)
        predictions.append(output)
    

(32, 180, 180, 3)
(32, 180, 180, 3)
(18, 180, 180, 3)


In [12]:
# Shape of the model output collected for the first batch.
predictions[0].shape

(32, 5, 5, 512)

In [15]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0         
__________