# Build the Environment

First, I build the environment in which the development will be done. I use an Anaconda environment based on Python 3.5.

After creating the environment based on Python 3.5, I can install the required libraries with the **conda install** command. These libraries can be found on the Anaconda website.

```
# It creates the environment
conda create -n hadibakalim_env python=3.5

# It activates the environment
source activate hadibakalim_env

# Install the libraries
conda install -c menpo opencv3
conda install -c conda-forge matplotlib 
conda install -c anaconda keras-gpu 
conda install -c anaconda tensorflow-gpu
conda install -c conda-forge h5py 

```

# Load the Libraries

I should add the libraries that I need.

In [None]:
import os
from os.path import isdir
import numpy as np # library for scientific computing in Python
%matplotlib inline
import matplotlib.pyplot as plt # for plotting
import h5py # for reading and writing HDF5 files from Python.
import cv2 # OpenCV 3.2
from random import randint # to generate random numbers

## Rename the Images

I collect many images from online sources and sort them into folders whose names correspond to the classes. In this work, I have seven classes. I will give a name to every single image consisting of the **class label** followed by an **index number**, like *passat0000.jpg ..... passat4014.jpg*

In [None]:
def rename_image(details):
    """Rename every image to <class label><zero-padded index><image type>.

    Each folder in details['file_name'] is paired, by position, with the label
    in details['image_label']. The entries of a folder are sorted by name and
    renamed in that order, e.g. passat0000.jpg, passat0001.jpg, ...

    details['source_path']: path prefix the class folder names are appended to
    details['file_name']: names of the class folders
    details['image_label']: name prefix to use, one per class folder
    details['zero_base']: width the index is zero-padded to
    details['image_type']: extension appended to every new name

    The renaming is done in two passes (current name -> temporary name ->
    final name). Renaming directly could give a file a name that is still held
    by an entry waiting for its own turn, e.g. when the function is run a second
    time on the same folder, and that entry would then be replaced and lost.
    """
    # Pair every class folder with its label
    image_dict = dict(zip(details['file_name'], details['image_label']))

    # Do it for every class folder
    for path_img, image_name in image_dict.items():

        class_dir = details['source_path'] + path_img + '/'

        # Sorted so that the index order is reproducible
        file_names = sorted(os.listdir(class_dir))

        final_names = [image_name + str(index).zfill(details['zero_base']) + details['image_type']
                       for index in range(len(file_names))]

        # Pick a temporary prefix that no current or final name starts with
        temp_prefix = '.renaming_'
        taken = set(file_names) | set(final_names)
        while any(name.startswith(temp_prefix) for name in taken):
            temp_prefix += '_'

        # First pass: move every entry out of the way
        temp_names = []
        for index, name in enumerate(file_names):
            temp_name = temp_prefix + str(index)
            os.rename(class_dir + name, class_dir + temp_name)
            temp_names.append(temp_name)

        # Second pass: hand out the final names in the original sorted order
        for temp_name, final_name in zip(temp_names, final_names):
            os.rename(class_dir + temp_name, class_dir + final_name)

In [None]:
# Settings shared by the cells of this notebook
details = {
    # Names of the folders that contain the images, one folder per class
    'file_name': ['VW-Passat', 'RENO-Fluence', 'FIAT-Linea', 'VW-Polo',
                  'RENO-Toros', 'FIAT-Dogan', 'OtherClass'],

    # Label of every class, in the same order as the folder names above
    'image_label': ['passat', 'fluence', 'linea', 'polo', 'toros', 'dogan', 'other'],

    # Main path that holds the class folders named in 'file_name'
    'source_path': '../main/source/path/',

    # Width the image index is padded to with zeros on the left
    'zero_base': 4,

    # Extension given to every renamed image
    'image_type': '.jpg',
}

In [None]:
# Rename the entries of every class folder on disk, using the settings in details
rename_image(details)

# Create dictionary

Here I create a dictionary that stores the image names belonging to each class folder.

Keys are the folder names listed in *details['file_name']*.
Values are lists that contain the image names for every key.

In [None]:
def createDict(details):
    """Map every class folder to the sorted list of entry names it contains.

    details['source_path']: path prefix the class folder names are appended to
    details['file_name']: names of the class folders that should be included

    Only directories under the source path that are listed in
    details['file_name'] and that are not empty get a key in the result.
    The names of all directories found are printed.

    Entries are sorted ascending by the number formed by all digits in their
    name. Entries without any digit are placed after the numbered ones, and
    equal numbers are ordered by name so that the result is reproducible.

    Returns a dict: key = class folder name, value = list of entry names.
    """
    source_path = details['source_path']

    # Keep only the directories under the source path
    classes = [each for each in os.listdir(source_path)
               if os.path.isdir(source_path + each)]
    print(classes)

    def numeric_key(name):
        # A name without digits would make int('') raise, so it is sorted last instead
        digits = ''.join(filter(str.isdigit, name))
        if digits:
            return (0, int(digits), name)
        return (1, 0, name)

    # Dictionary that keeps one list of names per class folder
    database_detail = {}

    for each in classes:

        # Skip the folders that were not asked for
        if each not in details['file_name']:
            continue

        files = os.listdir(source_path + each)

        # An empty folder gets no key at all
        if not files:
            continue

        database_detail[each] = sorted(files, key=numeric_key)

    return database_detail

In [None]:
# Collect the image names of every class folder and keep them next to the other settings
details['database_detail'] = createDict(details)

Let's check the outcome

In [None]:
# Number of classes we have
print(len(details['database_detail']))
# Number of images that belong to each class.
# The dictionary lives in details['database_detail']; a bare database_detail
# name is not defined at this level and would raise a NameError.
print([len(image_names) for image_names in details['database_detail'].values()])

## (Optional) Remove the images that have an unwanted size

If about 95% of the images in your dataset have the same shape and you want to get rid of the others, you can use this snippet.

In [None]:
def removeAnormalShape(database_detail, hdf5_path=None, save_img_size=None):
    """Drop from database_detail every image that does not have the wanted shape.

    database_detail: dict, class folder name -> list of image names.
                     The lists are edited in place; nothing is returned and
                     no file is deleted from disk.
    hdf5_path: not used; kept so that calls passing it keep working
    save_img_size: wanted shape, compared against img.shape of the loaded
                   image. Falls back to details['save_img_size'] when None.

    The images are read from details['source_path'] + class folder name.
    An image that cannot be read is dropped from the list as well.
    """
    if save_img_size is None:
        save_img_size = details['save_img_size']

    # img.shape is a tuple; a list would never compare equal to it
    wanted_shape = tuple(save_img_size)

    for class_name, image_names in database_detail.items():
        # path of the class folder
        path = details['source_path'] + class_name

        kept = []
        for each in image_names:
            img = cv2.imread(path + '/' + each)

            # cv2.imread gives None instead of raising when the file cannot be read
            if img is not None and img.shape == wanted_shape:
                kept.append(each)

        # Build the filtered list first and then swap the content: removing
        # items from a list while looping over it skips the following item.
        # Slice assignment keeps the same list object, so every reference sees it.
        image_names[:] = kept

In [None]:
# Shape an image must have to be kept; it is compared against img.shape of the
# image loaded with cv2 (presumably height, width, channels - confirm)
details['save_img_size'] = (300, 300, 3)

In [None]:
# Drop the images with an unwanted shape from the dictionary.
# All three arguments are passed; the hdf5 path is not needed for this step.
removeAnormalShape(details['database_detail'], None, details['save_img_size'])

In [None]:
# How many classes the dictionary holds
print(len(details['database_detail']))
# How many images each of those classes holds
print([len(image_names) for image_names in details['database_detail'].values()])

## Save all Classes to HDF5 Files

In [None]:
def classFiles2hdf5(database_detail):
    """Write one HDF5 file per class with its images, labels and image names.

    database_detail: dict, class folder name -> list of image names

    Read from the notebook-level details dictionary:
    details['file_name']: class folder names; the position of a class in this
                          list is stored as its label
    details['source_path']: path prefix of the class folders
    details['save_path'], details['prefix']: the file is written to
                          save_path + prefix + class name + '.hdf5'
    details['save_img_size']: the three dimensions of every stored image
    details['hdf5_dataset_label']: dataset names; index 0 = images,
                          1 = labels, 3 = image names (index 2 is not written here)

    A class of details['file_name'] that has no entry in database_detail is
    skipped. Raises ValueError when an image cannot be read.
    """
    batch_key = details['hdf5_dataset_label'][0]
    label_key = details['hdf5_dataset_label'][1]
    name_key = details['hdf5_dataset_label'][3]

    for class_name in details['file_name']:

        # Nothing to store for a class without a list of images
        if class_name not in database_detail:
            continue

        image_names = database_detail[class_name]
        n_images = len(image_names)

        hdf5_path = details['save_path'] + details['prefix'] + class_name + '.hdf5'

        # Mode 'w' creates the file (or truncates an existing one), so it does
        # not have to be created beforehand.
        # opening like this, solves the problem of 'Unable to truncate a file which is already open'
        with h5py.File(hdf5_path, 'w') as hdf5_file:

            data_shape = (n_images, details['save_img_size'][0],
                          details['save_img_size'][1], details['save_img_size'][2])
            images = hdf5_file.create_dataset(batch_key, data_shape, np.uint8)

            # Every image of the file gets the same label: the position of the class
            labels = hdf5_file.create_dataset(label_key, (n_images, ), np.uint8)
            labels[...] = details['file_name'].index(class_name)

            # We will add the hotlabel on next part

            # It is good to save the image with its name for easing to find when it is needed
            dt = h5py.special_dtype(vlen=bytes)
            names = hdf5_file.create_dataset(name_key, (n_images, ), dtype=dt)

            for index, each in enumerate(image_names):

                img_path = details['source_path'] + class_name + '/' + each
                img = cv2.imread(img_path)

                # cv2.imread gives None instead of raising when the file cannot be read
                if img is None:
                    raise ValueError('Could not read image: ' + img_path)

                # OpenCV loads the channels as BGR; store them as RGB
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                images[index, ...] = img
                names[index] = each
            # The with block closes the file, hdf5_file.close() is not needed

In [None]:
# Settings used when the classes are written to and read from hdf5 files
details.update({
    # Names of the datasets inside every hdf5 file
    'hdf5_dataset_label': ['batch', 'label', 'hotlabel', 'img_name'],

    # The image sizes that hdf5 files have
    'save_img_size': (300, 300, 3),

    # We will save the hdf5 file under this path
    'save_path': 'save/to/this/path/',

    # I want to give a prefix for hdf5 files, use it if you want
    # Leave it empty if you don't want to add a prefix
    'prefix': 'orig_',
})

In [None]:
# call the function that turn files into hdf5 files
# Write one hdf5 file per class from the image names collected in the dictionary
classFiles2hdf5(details['database_detail'])

# Test the HDF5 Files

In [None]:
def show_test(test_path, hdf5_dataset, image_label):
    """Show one randomly chosen sample of an HDF5 file to check its content.

    Prints the image name, the number of stored images and the label (as a
    number and as the matching entry of image_label), and plots the image.

    test_path: path of the HDF5 file to open
    hdf5_dataset: dataset names inside the file; index 0 = images,
                  1 = labels, 3 = image names
    image_label: sequence that is indexed with the stored label
    """
    # open the hdf5 file
    with h5py.File(test_path, 'r') as hdf5_file:

        # The datasets of the file
        X_train = hdf5_file[hdf5_dataset[0]]
        Y_train = hdf5_file[hdf5_dataset[1]]
        img_name = hdf5_file[hdf5_dataset[3]]

        # randomly pick a number for index
        # randint includes both end points, so the upper bound is the last valid index
        random_index = randint(0, len(Y_train) - 1)

        img = X_train[random_index]
        print('Image name : ' + str(img_name[random_index]))
        print('Length : ' + str(len(X_train)))

        plt.imshow(img)
        plt.show()

        print('Label_class : ' + str(Y_train[random_index]))
        print('Label_class : ' + image_label[Y_train[random_index]])

In [None]:
# Pick one of the hdf5 files
# Pick one of the hdf5 files
test_path = 'call/from/save/path/orig_RENO-Fluence.hdf5'

# Show one randomly chosen image of that file together with its name and label
show_test(test_path, details['hdf5_dataset_label'], details['image_label'])