# Extract Features For CatsAndDogs Test Dataset

**Objective:** Extract features for the test split of the CatsAndDogs dataset using the ResNet50 architecture.

## Load ResNet50 pretrained weights

In [1]:
from keras.applications import ResNet50
# load the ResNet50 network
# include_top=False leaves out the ImageNet classification head, and
# pooling='avg' applies global average pooling to the last convolutional
# output, so predict() yields one flat feature vector per image.
print('Loading ResNet50 network weights')
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

Using Theano backend.


Loading ResNet50 network weights


## Get the Test Image Names


In [2]:
import pathlib
import random

ImageDir = './datasets/test'

# Recursively walk the dataset folder and collect the image file names.
# rglob() yields entries in a filesystem-dependent order, so sort them to keep
# the row order of the extracted features reproducible from run to run.
ImageList = sorted(pathlib.Path(ImageDir).rglob('*.jpg'))

# This is test data, so there is no need to shuffle it.

print(len(ImageList))

12500


In [3]:
# Peek at the first few paths to confirm the glob picked up the expected files.
print(ImageList[0:4])

[PosixPath('datasets/test/1.jpg'), PosixPath('datasets/test/10.jpg'), PosixPath('datasets/test/100.jpg'), PosixPath('datasets/test/1000.jpg')]


### Get the Test Image ID

In [4]:
import os

In [5]:
# The numeric test-image ID is the file name up to its first dot
# (e.g. 'datasets/test/10.jpg' -> 10).
# Path.name is used rather than splitting as_posix() on os.path.sep:
# as_posix() always uses '/', so that split does not isolate the file name
# on platforms where os.path.sep is '\\'.
ids = [int(imagePath.name.split('.')[0]) for imagePath in ImageList]

In [6]:
# Sanity check: the IDs should match the leading file names listed earlier.
print(ids[0:3])

[1, 10, 100]


## Configurations

In [7]:
### Configuration Settings

# Total number of images to process.
# The full dataset is used by default; for a quick smoke test of the script,
# temporarily override this with a small value (e.g. NUM_IMAGES = 25).
NUM_IMAGES = len(ImageList)

# Number of feature rows kept in memory before they are flushed to disk.
BUFF_SIZE = 1000

# Number of images pushed through the network per predict() call.
BATCH_SIZE = 25

## DataSetWriter

In [8]:
import h5py
import os

In [9]:
# Make sure the output directory exists: opening the file in 'w' mode does not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs('./output', exist_ok=True)
# 'w' creates the file, replacing the output of any previous run.
db = h5py.File('./output/cats_and_dogs_test_features.hdf5', 'w')

In [10]:
# Create the two dictionary-like dataset entries inside the HDF5 file.

# One 2048-wide row per image: the globally average-pooled ResNet50 output
# (the model is built with pooling='avg').
features = db.create_dataset('features', shape=(NUM_IMAGES, 2048), dtype='float')
# One integer image ID per row, aligned by index with the 'features' rows.
labels = db.create_dataset('ID', shape=(NUM_IMAGES,), dtype='int')

In [11]:
# In-memory staging area for the features and their labels; its contents are
# flushed to disk whenever it fills up.
buffer = dict(features=[], labels=[])

# Row offset in the HDF5 datasets at which the next flush starts writing.
feature_idx = 0

### Buffer Utilities

In [12]:
# write buffer utilities

## add the features and labels from buffer -> db
def flush_the_buffer():
    """Write the buffered rows to the HDF5 datasets and empty the buffer.

    The rows in ``buffer['features']`` / ``buffer['labels']`` are written to
    the module-level ``features`` / ``labels`` datasets starting at row
    ``feature_idx``. Afterwards ``feature_idx`` is advanced past the rows just
    written and ``buffer`` is replaced with a fresh, empty one.

    Calling this with an empty buffer is a no-op: no write is attempted and
    ``feature_idx`` is left unchanged.
    """
    # Only the names that are rebound need a `global` declaration; the
    # datasets are merely indexed into.
    global feature_idx
    global buffer

    num_rows = len(buffer['features'])
    if num_rows == 0:
        # Nothing pending -- avoid issuing a zero-row write to the datasets.
        return

    to_idx = feature_idx + num_rows
    features[feature_idx:to_idx] = buffer['features']
    labels[feature_idx:to_idx] = buffer['labels']

    # The next flush continues right after the rows written above.
    feature_idx = to_idx

    # Start over with an empty buffer.
    buffer = {'features': [], 'labels': []}

In [13]:
## add the features and labels to the buffer
def add_to_buffer(feature_entries, labels):
    """Stage a batch of feature rows and their labels for writing.

    Both sequences are appended to the module-level ``buffer``. Once the
    buffer holds at least ``BUFF_SIZE`` feature rows, it is flushed to disk
    via ``flush_the_buffer()``.

    Args:
        feature_entries: iterable of feature rows to append.
        labels: iterable of labels, one per feature row.
    """
    pending_features = buffer['features']
    pending_features.extend(feature_entries)
    buffer['labels'].extend(labels)

    # Spill to disk as soon as enough rows have accumulated.
    if len(pending_features) >= BUFF_SIZE:
        flush_the_buffer()

In [14]:
## close the db
def close_the_database():
    """Flush any rows still in the buffer, then close the HDF5 file.

    The file handle ``db`` is closed even if the final flush raises, so a
    failed write does not leave the file open; the exception from the flush
    still propagates to the caller.
    """
    try:
        if len(buffer['features']) > 0:
            flush_the_buffer()
    finally:
        db.close()

## Extract Features

* Load each image and resize it to 224x224 for ResNet50
* Preprocess the image for ResNet

* Use model.predict() method

In [15]:
import numpy as np
import cv2
from keras.applications import imagenet_utils
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img

In [16]:
## resize them to one size
def preprocess_image(img, width, height, interpolation=cv2.INTER_AREA):
    """Resize an image to exactly ``width`` x ``height`` pixels.

    The aspect ratio is not preserved; the image is stretched or squeezed to
    the requested size.

    Args:
        img: image array as accepted by ``cv2.resize``.
        width: target width in pixels.
        height: target height in pixels.
        interpolation: OpenCV interpolation flag (defaults to ``cv2.INTER_AREA``).

    Returns:
        The resized image.
    """
    # The flag must be passed by keyword: the third positional parameter of
    # cv2.resize is `dst`, so a positional flag never reaches `interpolation`.
    # NOTE: features extracted before this fix used a different interpolation;
    # keep the train and test extraction consistent.
    return cv2.resize(img, (width, height), interpolation=interpolation)

In [17]:
# go over all the images to extract the features in batches
try:
    for i in range(0, NUM_IMAGES, BATCH_SIZE):
        #
        # process the images in batches
        #
        # Clamp the last batch to NUM_IMAGES: when NUM_IMAGES is smaller than
        # the image list and not a multiple of BATCH_SIZE, an unclamped slice
        # would produce more rows than the HDF5 datasets were created with.
        batch_end = min(i + BATCH_SIZE, NUM_IMAGES)
        batchImageList = ImageList[i:batch_end]
        batchLabelsList = ids[i:batch_end]
        batchImages = []

        # loop over each image in the batch
        for imagePath in batchImageList:
            # load the image; cv2.imread returns None instead of raising when
            # the file cannot be read, so fail early with the offending path
            img = cv2.imread(str(imagePath))
            if img is None:
                raise ValueError('Could not read image: {}'.format(imagePath))

            # cv2.imread returns channels in BGR order, while preprocess_input
            # expects RGB input (its default mode does the RGB->BGR conversion
            # itself). Without this conversion the channels end up swapped.
            # NOTE: features used to train a downstream classifier must be
            # extracted with the same channel order as these test features.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = preprocess_image(img, 224, 224)

            # convert to a float array
            img = img_to_array(img)

            # the network needs a 4D input, so add a leading batch axis
            # before applying the ImageNet preprocessing
            img = np.expand_dims(img, axis=0)
            img = imagenet_utils.preprocess_input(img)

            # add the image to the batch
            batchImages.append(img)

        # pass the whole batch through the ResNet50 network
        batchImages = np.vstack(batchImages)
        extracted_features = model.predict(batchImages, batch_size=BATCH_SIZE)

        # The model is built with pooling='avg', so each image yields one
        # 2048-dimensional vector after global average pooling; the reshape
        # just pins the (batch, 2048) layout expected by the dataset.
        extracted_features = extracted_features.reshape((extracted_features.shape[0], 2048))

        # add the features to the HDF5 dataset
        add_to_buffer(extracted_features, batchLabelsList)
finally:
    # Always flush what was extracted so far and close the HDF5 file, even if
    # an image fails to load or the network raises part-way through.
    close_the_database()

In [18]:
# Completion marker for the feature-extraction run.
print('Feature extractions for Test Dataset, Done!')

Feature extractions for Test Dataset, Done!
