# Intel Scene Feature Extraction for Test Dataset

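**Objective:** Extract ResNet50 (ImageNet-pretrained) feature vectors for the images of the Intel Scene test dataset and store them, together with their image IDs, in an HDF5 file.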

## Load ResNet50

In [1]:
from keras.applications import ResNet50

Using Theano backend.


In [2]:
# ImageNet-pretrained ResNet50 used as a fixed feature extractor:
# include_top=False drops the classification head and pooling='avg'
# requests global average pooling, so predict() yields one flat vector
# per image (2048 values, per the reshape in the extraction loop).
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

## Get Test files

In [3]:
from my_utils.datasets import DataSetLoader
import pandas as pd

In [4]:
# Dataset root folder. The image files listed in test.csv are read from its
# 'train' sub-folder (see the resolved paths printed further down), hence
# the TRAIN_ prefix even though this notebook handles the test split.
IMAGE_DIR = 'dataset/train-scene classification'
TRAIN_IMAGE_LOC = '{}/train'.format(IMAGE_DIR)

In [5]:
# test.csv sits in the dataset root and lists the test image file names.
test_csv_df = pd.read_csv('{}/test.csv'.format(IMAGE_DIR))

In [6]:
# Sanity check: (rows, columns) of the test CSV.
test_csv_df.shape

(7301, 1)

In [7]:
# Peek at the first rows; the only column is 'image_name'.
test_csv_df.head()

Unnamed: 0,image_name
0,3.jpg
1,5.jpg
2,6.jpg
3,11.jpg
4,14.jpg


In [8]:
# Project helper (my_utils.datasets) used below to build the image file paths.
# NOTE(review): `DataLoader` is an instance despite its class-like name.
DataLoader = DataSetLoader()

In [9]:
# Resolve every file name from test.csv to a path under TRAIN_IMAGE_LOC.
test_files = DataLoader.get_filename_list(
    TRAIN_IMAGE_LOC,
    test_csv_df['image_name'].tolist(),
)

In [10]:
# Spot-check the first five resolved paths.
test_files[0:5]

['dataset/train-scene classification/train/3.jpg',
 'dataset/train-scene classification/train/5.jpg',
 'dataset/train-scene classification/train/6.jpg',
 'dataset/train-scene classification/train/11.jpg',
 'dataset/train-scene classification/train/14.jpg']

In [11]:
# Number of resolved paths; compare with the row count of the test CSV.
len(test_files)

7301

## Get the Test File Name IDs

In [20]:
# Start from the raw file names ('<id>.jpg'); later cells turn them into integer IDs.
test_ids = test_csv_df['image_name'].tolist()

In [21]:
# First few raw IDs, still carrying the '.jpg' extension.
test_ids[0:4]

['3.jpg', '5.jpg', '6.jpg', '11.jpg']

In [15]:
# Inspect the last ID. Negative indexing avoids hard-coding the list length
# (index 7300 is only valid when the CSV has exactly 7301 rows).
test_ids[-1]

'24333.jpg'

In [22]:
# Drop the '.jpg' extension so that only the numeric part of each name is left.
test_ids = [name.replace('.jpg', '') for name in test_ids]

In [24]:
# Confirm the extension is gone (values are still strings at this point).
test_ids[0:4]

['3', '5', '6', '11']

In [25]:
# Turn the numeric strings into integers; these are stored as the image IDs.
test_ids = list(map(int, test_ids))

In [26]:
# Confirm the IDs are now integers.
test_ids[0:4]

[3, 5, 6, 11]

## Configurations

In [27]:
# NOTE(review): 20 looks like a quick smoke-test size. The HDF5 writer below is
# created with dims (NUM_FILES, 2048) and the extraction loop stops at
# NUM_FILES, so with this value only the first 20 test images get features.
NUM_FILES = 20
# Full run: use len(test_files); a trailing "- 1" would skip the last image.
#NUM_FILES = len(test_files)

# Number of images sliced off per loop iteration and passed to model.predict().
BATCH_SIZE = 10

## Create HDF5 Database

In [28]:
from my_utils.io import HDF5DataSetWriter

In [30]:
# ResNet50's predict() returns a vector of size 2048 per image, so the
# writer is created with dims NUM_FILES x 2048.
db = HDF5DataSetWriter(
    (NUM_FILES, 2048),
    "./output/IntelSceneTestExtractedFeatures.hdf5",
    dataKey="test_features",
    valueKey='ID'
)


In [31]:
import cv2
import numpy as np
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing.image import img_to_array, load_img

In [34]:
## set up the progress bar
import progressbar

In [35]:
# Progress-bar layout: label, percentage, bar and ETA, in that order.
widgets = [
    "Extracting Test Features: ",
    progressbar.Percentage(),
    " ",
    progressbar.Bar(),
    " ",
    progressbar.ETA(),
]

# The bar runs from 0 to NUM_FILES and is started immediately.
pbar = progressbar.ProgressBar(maxval=NUM_FILES, widgets=widgets)
pbar = pbar.start()

Extracting Test Features:   0% |                               | ETA:  --:--:--

In [36]:
# Walk over the first NUM_FILES test images in batches of BATCH_SIZE, run each
# batch through ResNet50 and store the feature vectors with their image IDs.
# The try/finally makes sure the HDF5 writer is closed even if a batch fails.
try:
    for i in np.arange(0, NUM_FILES, BATCH_SIZE):

        # Clamp the end of the batch: when NUM_FILES is not a multiple of
        # BATCH_SIZE the last slice must not reach past NUM_FILES, otherwise
        # more rows are added than the (NUM_FILES, 2048) dims passed to the writer.
        batchEnd = min(i + BATCH_SIZE, NUM_FILES)

        batchFileList = test_files[i:batchEnd]
        batchLabelList = test_ids[i:batchEnd]

        # pre-processed images of the current batch, each of shape (1, 224, 224, 3)
        batchImages = []

        for fileName in batchFileList:
            # cv2.imread reports a missing/unreadable file by returning None
            # rather than raising, so fail here with the offending path.
            img = cv2.imread(fileName)
            if img is None:
                raise IOError('Could not read image: {}'.format(fileName))

            # NOTE(review): cv2.imread returns pixels in BGR order while the
            # Keras ResNet50 preprocess_input is documented for RGB input.
            # Left unchanged here; confirm it matches how the training
            # features were extracted before altering the channel order.
            img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_AREA)

            # convert to a float array and add the leading batch axis
            # before applying the ResNet50-specific preprocessing
            img = img_to_array(img)
            img = np.expand_dims(img, axis=0)
            img = preprocess_input(img)

            batchImages.append(img)

        # stack the single-image arrays into one (batch, 224, 224, 3) array
        batchImages = np.vstack(batchImages)

        # extract the features for the batch
        extracted_features = model.predict(batchImages, batch_size=BATCH_SIZE)

        # one row of 2048 features per image
        extracted_features = extracted_features.reshape(
            (extracted_features.shape[0], 2048))

        # add to db, then report the number of images processed so far
        db.add(extracted_features, batchLabelList)
        pbar.update(batchEnd)
finally:
    db.close()

pbar.finish()

Extracting Test Features: 100% |################################| Time: 0:00:51
