# This notebook can be used to process raw modeling images contained in "renault" and "other" folders in order to create a split into train and cross-validation images

The output consists of two CSV files listing the image path and label for the training and cross-validation sets.  
Optionally, we can also create HDF5 files containing the images as reduced-size arrays together with their labels.

In [1]:
import glob
import random
from os import listdir
from os.path import isfile, join
from random import shuffle

import h5py
import numpy as np
import pandas as pd
import scipy
from scipy import ndimage

## Specify framework for HDF5 data creation
Use 'th' for Theano (channels-first arrays: N x 3 x 224 x 224) or 'tf' for TensorFlow (channels-last arrays: N x 224 x 224 x 3).

In [2]:
# Axis layout of the image arrays written to the HDF5 files:
# 'th' -> (N, 3, 224, 224), 'tf' -> (N, 224, 224, 3).
data_order = 'th'

## Indicating the path for each class folder

In [3]:
# NOTE(review): the saved output of train.head() shows '../data/...' paths;
# confirm the relative prefix matches the notebook's working directory.
class1_path = './data/modeling/renault/*.jpg' # glob pattern matching the class 1 ("renault") images
class2_path = './data/modeling/other/*.jpg' # glob pattern matching the class 2 ("other") images

## List all images of each class

In [5]:
# glob returns matches in arbitrary, filesystem-dependent order. Sort the
# listings so the seeded train/cross-validation split that follows picks the
# same images on every machine and on every run.
# Backslashes are normalised to forward slashes so Windows paths match the
# format used everywhere else.
class1_list = sorted(path.replace('\\', '/') for path in glob.glob(class1_path))
class2_list = sorted(path.replace('\\', '/') for path in glob.glob(class2_path))

print('Found', len(class1_list), 'class 1 images.')
print('Found', len(class2_list), 'class 2 images.')

Found 4657 class 1 images.
Found 4657 class 2 images.


# List Train/CV images and their labels

In [6]:
# Fixed seed so the random split is repeatable.
random.seed(2210)
train_ratio = 0.9  # fraction of each class that goes to the training set

# Draw the training images of each class without replacement.
train_class1 = random.sample(class1_list, int(train_ratio * len(class1_list)))
train_class2 = random.sample(class2_list, int(train_ratio * len(class2_list)))

# Everything not drawn for training becomes cross-validation data.
# Membership is tested against sets (O(1) per lookup) instead of scanning the
# training lists for every image; the original list order is preserved.
train_class1_set = set(train_class1)
train_class2_set = set(train_class2)
cval_class1 = [img for img in class1_list if img not in train_class1_set]
cval_class2 = [img for img in class2_list if img not in train_class2_set]

# The two subsets must partition each class exactly.
assert len(train_class1) + len(cval_class1) == len(class1_list), 'class 1 split lost or duplicated images'
assert len(train_class2) + len(cval_class2) == len(class2_list), 'class 2 split lost or duplicated images'

print('train_class1:', len(train_class1))
print('train_class2:', len(train_class2))
print('cval_class1:', len(cval_class1))
print('cval_class2', len(cval_class2))

train_class1: 4191
train_class2: 4191
cval_class1: 466
cval_class2 466


In [7]:
# Pair every image path with its label: 1 for class 1, 0 for class 2.
train_pairs = [(path, 1) for path in train_class1] + [(path, 0) for path in train_class2]
cval_pairs = [(path, 1) for path in cval_class1] + [(path, 0) for path in cval_class2]

# Shuffle paths and labels together so they stay aligned, then split the
# pairs back into two parallel tuples. The training set is shuffled first,
# then the cross-validation set, both from the same seeded generator.
random.seed(2210)
shuffle(train_pairs)
train_addrs, train_labels = zip(*train_pairs)

shuffle(cval_pairs)
cval_addrs, cval_labels = zip(*cval_pairs)

for caption, sequence in (
    ('train_addrs:', train_addrs),
    ('train_labels:', train_labels),
    ('cval_addrs:', cval_addrs),
    ('cval_labels:', cval_labels),
):
    print(caption, len(sequence))

train_addrs: 8382
train_labels: 8382
cval_addrs: 932
cval_labels: 932


# Save train and cross-validation images in pandas dataframes

In [8]:
# One row per image: its path and its 0/1 label.
columns = ['image', 'label']
train = pd.DataFrame(list(zip(train_addrs, train_labels)), columns=columns)
cval = pd.DataFrame(list(zip(cval_addrs, cval_labels)), columns=columns)

# Write both tables as semicolon-separated files without the index column.
for frame, csv_path in ((train, './munge/train.csv'), (cval, './munge/cval.csv')):
    frame.to_csv(csv_path, sep=';', index=False)

In [9]:
# Preview the first rows of the shuffled training table (image path and label).
train.head(20)

Unnamed: 0,image,label
0,../data/modeling/renault/renault_koleos_gris_4...,1
1,../data/modeling/other/07685.jpg,0
2,../data/modeling/other/05516.jpg,0
3,../data/modeling/renault/renault_captur_gris_2...,1
4,../data/modeling/renault/renault_kadjar_gris_1...,1
5,../data/modeling/renault/renault_scenic_67.jpg,1
6,../data/modeling/renault/renault_fluence_6.jpg,1
7,../data/modeling/renault/renault_megane_55.jpg,1
8,../data/modeling/renault/renault_wing_sport_19...,1
9,../data/modeling/renault/renault_koleos_noir_6...,1


# Create a HDF5 file (Optional, not necessary for transfer learning)

There are two main libraries to work with HDF5 format, namely h5py and tables (PyTables).
We will be doing this task using h5py. For implementing the "tables" method, please refer to http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html  
  
In h5py we create an array using create_dataset. Note that the exact size of the array must be specified when it is defined. We can use create_dataset for the labels as well and fill them in immediately. The dtype of an array can be set directly using numpy dtypes.

In [9]:
# check the order of data and choose proper data shape to save images
if data_order == 'th':
    train_shape = (len(train_addrs), 3, 224, 224)
    cval_shape = (len(cval_addrs), 3, 224, 224)
elif data_order == 'tf':
    train_shape = (len(train_addrs), 224, 224, 3)
    cval_shape = (len(cval_addrs), 224, 224, 3)

# specify location of HDF5 files
train_hdf5_path = './munge/train.hdf5' # address where to save the train hdf5 file
cval_hdf5_path = './munge/cval.hdf5' # address where to save the test hdf5 file

# open a new train hdf5 file and create arrays
train_hdf5_file = h5py.File(train_hdf5_path, mode='w')
train_hdf5_file.create_dataset('features', train_shape, np.int8)
train_hdf5_file.create_dataset('labels', (len(train_addrs),), np.int8)
train_hdf5_file['labels'][...] = train_labels

# open a new test hdf5 file and create arrays
cval_hdf5_file = h5py.File(cval_hdf5_path, mode='w')
cval_hdf5_file.create_dataset('features', cval_shape, np.int8)
cval_hdf5_file.create_dataset('labels', (len(cval_addrs),), np.int8)
cval_hdf5_file['labels'][...] = cval_labels

In [10]:
# Show the array shapes reserved in the two HDF5 files.
print('Train shape: {}'.format(train_shape))
print('C-Validation shape: {}'.format(cval_shape))

Train shape: (8382, 3, 224, 224)
C-Validation shape: (932, 3, 224, 224)


Now we can read the images one by one, apply preprocessing (only a resize in our code) and then save them.

In [11]:
# loop over train addresses
for i in range(len(train_addrs)):
    
    # print how many images are saved every 1000 images
    if i % 500 == 0 and i > 1:
        print('Train data: {}/{}'.format(i, len(train_addrs)))
    
    # read an image and resize to (224, 224)
    # cv2 load images as BGR, convert it to RGB
    addr = train_addrs[i]
    img = np.array(ndimage.imread(addr, flatten=False))
    img = scipy.misc.imresize(img, size=(224, 224))
    
    # if the data order is Theano, axis orders should change
    if data_order == 'th':
        img = np.rollaxis(img, 2)
        
    # save the image features
    train_hdf5_file['features'][i, ...] = img[None]

# close hdf5 file
train_hdf5_file.close()
print('Done!')

`imread` is deprecated in SciPy 1.0.0.
Use ``matplotlib.pyplot.imread`` instead.
`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.


Train data: 500/8382
Train data: 1000/8382
Train data: 1500/8382
Train data: 2000/8382
Train data: 2500/8382
Train data: 3000/8382
Train data: 3500/8382
Train data: 4000/8382
Train data: 4500/8382
Train data: 5000/8382
Train data: 5500/8382
Train data: 6000/8382
Train data: 6500/8382
Train data: 7000/8382
Train data: 7500/8382
Train data: 8000/8382
Done!


In [12]:
# loop over cross-validation addresses
for i in range(len(cval_addrs)):
    
    # print how many images are saved every 100 images
    if i % 100 == 0 and i > 1:
        print('C-Validation data: {}/{}'.format(i, len(cval_addrs)))
    
    # read an image and resize to (224, 224)
    addr = cval_addrs[i]
    img = np.array(ndimage.imread(addr, flatten=False))
    img = scipy.misc.imresize(img, size=(224, 224))
    
    # if the data order is Theano, axis orders should change
    if data_order == 'th':
        img = np.rollaxis(img, 2)
        
    # save the image features
    cval_hdf5_file['features'][i, ...] = img[None]
    
cval_hdf5_file.close()
print('Done!')

`imread` is deprecated in SciPy 1.0.0.
Use ``matplotlib.pyplot.imread`` instead.
`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.


C-Validation data: 100/932
C-Validation data: 200/932
C-Validation data: 300/932
C-Validation data: 400/932
C-Validation data: 500/932
C-Validation data: 600/932
C-Validation data: 700/932
C-Validation data: 800/932
C-Validation data: 900/932
Done!
