### Import Library

In [7]:
import os
import imgaug as ia
from imgaug import augmenters as iaa
from imgaug import parameters as iap
import imageio
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split

import random
from shutil import copyfile

from config import *

### 1. Create Synthetic Image

In [4]:
def trasnform(image):
    """
    Input an array of images, go through effect-related augmentation, then return back the array of images
    """
    # Below effects will be added to all images sequentially with randomised degree, normal distribution, details please refer to imgaug library
    p = iaa.Sequential([
        iaa.CropAndPad(percent=iap.Normal(0, 0.02)),
        iaa.Affine(scale = {"x": iap.Normal(1, 0.015), "y": iap.Normal(1, 0.015)}),
        iaa.Affine(translate_percent = {"x": iap.Normal(0, 0.015), "y": iap.Normal(0, 0.015)}),
        iaa.Affine(rotate = iap.Normal(0, 3)),
        iaa.Affine(shear = {"x": iap.Normal(0, 0.5), "y": iap.Normal(0, 0.5)}),
        iaa.AddToHueAndSaturation(value= iap.Normal(0, 20), per_channel = True),
        iaa.AddToBrightness(add = iap.Normal(0, 50)),
        iaa.GammaContrast(gamma = (0.2,1.5))   
    ], random_order=True)
    
    # Some of the below effects would be added to images with with randomised degree
    e = iaa.SomeOf((0, None), [
        iaa.GaussianBlur(sigma = iap.ChiSquare(2)),
        iaa.AdditiveGaussianNoise(scale=(0, 0.1*255)),
        iaa.ElasticTransformation(alpha=(0, 2.5), sigma=(0.25, 1)),
        iaa.Cutout(nb_iterations = (1, 3), size = (0.02, 0.07), fill_mode="gaussian", fill_per_channel=True)
    ], random_order=True)
    
    # Resize image to 1024 x 768 to ensure it fit to our designed network input
    r = iaa.Resize({"height": 1024, "width": 768})
    
    image = p(image=image)
    image = e(image=image)
    image = r(image=image)
    
    return image

In [5]:
def create_data(load_dir, save_dir, num_of_syn = None, train_val_test_ratio = None):
    '''input the load and save directory, the number of synthetic to be created and a list of train, validation and test ratio, i.e. [0.6, 0.2, 0.2], perform the synthetic creation and save into the saving directory.'''
    
    # handling None input value and check input validation of train_val_test_ratio
    if train_val_test_ratio == None:
        train_val_test_ratio = [0.6, 0.2, 0.2]
    if sum(train_val_test_ratio) != 1:
        print("Error! the sum of dataset ratio is not equal to 1")
        return 
    if num_of_syn == None:
        num_of_syn = 200
    
    # save a list of images name
    raw_data_list = os.listdir(load_dir)
    
    # removve ipynb checkpoint from the list if any
    while (raw_data_list[0] == '.ipynb_checkpoints')|(raw_data_list[0] == '.DS_Store'):
        raw_data_list = raw_data_list[1:]
    raw_data_list.sort()
    
    k = 0
    current_classes = "nothing"
    test_path = save_dir +  "/test/test" 
    os.makedirs(test_path)
    
    
    # loop through all raw data and perform synthetic transformation
    for i in raw_data_list:
        now = datetime.now()
        
        classes = i[:5]
        
        if classes != current_classes:
            train_path = save_dir +  "/train/" + classes
            val_path = save_dir +  "/validation/" + classes
            os.makedirs(train_path)
            os.makedirs(val_path)
            current_classes = classes
        
        
        img = load_dir + "/" + i
        img = imageio.imread(img)
        images = [trasnform(img) for _ in range(num_of_syn)]
        
        # create an array to indicate which dataset (train/validation/test) belong to each created image.
        train, test = train_test_split(np.linspace(0, num_of_syn, num=num_of_syn, endpoint=False).astype(int), 
                                   test_size=train_val_test_ratio[2])
        train, val = train_test_split(train, test_size = train_val_test_ratio[1]/(train_val_test_ratio[0] + train_val_test_ratio[1]))
        
        idx = np.repeat(0, num_of_syn)
        idx[train] = 1
        idx[val] = 2
        idx[test] = 3
        
        # load through all created image and define the saved link.
        for j in range(num_of_syn):
            if idx[j] == 1:
                saved_path = train_path
            elif idx[j] == 2:
                saved_path = val_path
            else:
                saved_path = test_path
            
            # find the correct item number to be saved
            saved_img_list = os.listdir(saved_path)
            rank = len(saved_img_list) + 1 
            
            save_path = saved_path + "/" + classes + "_" + str(rank) + ".jpeg"
            
            imageio.imwrite(save_path, images[j])
    
        k = k+1
        if k % 10 == 9:
            print("Finished", k+1, "images transformation, used", datetime.now()-now,"from pervious checking point.")

In [6]:
create_data(input_dir, synthetic_dir)

Finished 10 images transformation, used 0:04:33.226601 from pervious checking point.
Finished 20 images transformation, used 0:04:30.123661 from pervious checking point.
Finished 30 images transformation, used 0:02:36.083978 from pervious checking point.
Finished 40 images transformation, used 0:02:29.762521 from pervious checking point.
Finished 50 images transformation, used 0:02:23.570362 from pervious checking point.
Finished 60 images transformation, used 0:02:16.030326 from pervious checking point.
Finished 70 images transformation, used 0:02:22.032877 from pervious checking point.
Finished 80 images transformation, used 0:02:28.979680 from pervious checking point.
Finished 90 images transformation, used 0:02:19.159746 from pervious checking point.
Finished 100 images transformation, used 0:02:28.930212 from pervious checking point.
Finished 110 images transformation, used 0:02:18.317539 from pervious checking point.
Finished 120 images transformation, used 0:02:30.141957 from pe

### 2. Create file directory fitted to Siamese Network. i.e. Anchor, Positive & Negative

In [8]:
arr = os.listdir(synthetic_dir)
arr.sort()

for i in range(len(arr)):
    ds = arr[i]
    ds_dir = synthetic_dir + "/" + ds
    classes = os.listdir(ds_dir)
    classes.sort()
    if classes[0] == '.ipynb_checkpoints':
        classes = classes[1:]
    
    for j in range(len(classes)):
        
        now = datetime.now()
        
        imgs_dir = ds_dir + "/" + classes[j]
        imgs = os.listdir(imgs_dir) # a list of all images in the same classes
        imgs.sort()
        if imgs[0] == '.ipynb_checkpoints':
            imgs = imgs[1:]
        
        if ds == "test":
            anchor_dir = output_dir + "/" + ds + "/" + ds
            os.makedirs(anchor_dir)
        elif ds != "test":
            
            # Make directory only for train and validation set
            anchor_dir = output_dir + "/" + ds + "/anchor/" + classes[j]
            positive_dir = output_dir + "/" + ds + "/positive/" + classes[j]
            negative_dir = output_dir + "/" + ds + "/negative/" + classes[j]
            os.makedirs(anchor_dir)
            os.makedirs(positive_dir)
            os.makedirs(negative_dir)
            
            reorder_imgs = imgs.copy() 
            random.shuffle(reorder_imgs) # shuffle images in the same classes 
            other_classes = classes.copy() 
            other_classes.pop(j) # keep other classes
            
        for k in range(len(imgs)):
            ori_path = imgs_dir + "/" + imgs[k]
            if ds == "test":
                save_path = anchor_dir + "/" + imgs[k]
                copyfile(ori_path, save_path)
            elif ds != "test":
                # relocate the anchor 
                number_of_image_in_the_folder = str(len(os.listdir(anchor_dir))+1)
                anchor_path = anchor_dir + "/" + number_of_image_in_the_folder + ".jpeg"
                copyfile(ori_path, anchor_path)
                
                # save the shuffled images as the positive sample for the anchor
                ori_path = imgs_dir + "/" + reorder_imgs[k] 
                positive_path = positive_dir + "/" + number_of_image_in_the_folder + ".jpeg"
                copyfile(ori_path, positive_path)
                
                # randomly pick an image from another classes and save it as negative sample
                pick_dir = ds_dir + "/" + random.choice(other_classes) 
                other_imgs = os.listdir(pick_dir)
                other_imgs.sort()
            
                if other_imgs[0] == '.ipynb_checkpoints':
                    other_imgs = other_imgs[1:]
                
                ori_path = pick_dir + "/" + random.choice(other_imgs)
                negative_path = negative_dir + "/" + number_of_image_in_the_folder + ".jpeg"
                copyfile(ori_path, negative_path)
                
        #report time taken
        if j % 10 == 9:
            print("The", j+1, "/", len(classes), "-th images of", arr[i], "data set has been performed, used", datetime.now()-now,"time")
        

The 10 / 53 -th images of train data set has been performed, used 0:00:01.919996 time
The 20 / 53 -th images of train data set has been performed, used 0:00:01.551921 time
The 30 / 53 -th images of train data set has been performed, used 0:00:01.298729 time
The 40 / 53 -th images of train data set has been performed, used 0:00:01.100435 time
The 50 / 53 -th images of train data set has been performed, used 0:00:00.996069 time
The 10 / 53 -th images of validation data set has been performed, used 0:00:00.716045 time
The 20 / 53 -th images of validation data set has been performed, used 0:00:00.509220 time
The 30 / 53 -th images of validation data set has been performed, used 0:00:00.477276 time
The 40 / 53 -th images of validation data set has been performed, used 0:00:00.407891 time
The 50 / 53 -th images of validation data set has been performed, used 0:00:00.306233 time
