# Preprocessing

The clean and preprocess section will divide the batch images into separate class folders. 

The split section will divide those folders into train and validation sets. 

In [26]:
import os
import imghdr
import itertools
import numpy as np
import imageio
import scipy.misc
import matplotlib.pyplot as plt
from shutil import copyfile, move, copytree

In [27]:
# Remember the directory the notebook is running from; the cells below build
# their absolute paths from it.
cwd = os.getcwd()
print(cwd)

/Users/aske/Desktop/Project_AI/project_ai


# Batch Concatenation

Uncomment and run the cell below if the augmented data has been received in multiple batches.  
Add the unzipped batch folders into the "Received_Batches" folder, then run the cell below. 

In [28]:

source_folder = 'Received_Batches'
dst = 'batch'
print(dst)

# All received batches are merged into <cwd>/batch as sequentially numbered folders.
source_root = os.path.join(cwd, source_folder)
batch_root = os.path.join(cwd, dst)
os.makedirs(batch_root, exist_ok=True)

subfolders = os.listdir(source_root)
print(subfolders)

# Running index used to name the destination folders ("00", "01", ..., "10", ...).
i = 0

# Iterate in sorted order so the numbering does not depend on the arbitrary
# order in which the file system lists the entries.
for sub in sorted(subfolders):
    sub_path = os.path.join(source_root, sub)

    # Only real batch directories are processed; this also skips the
    # '.DS_Store' file and any other stray file, which os.listdir cannot open.
    if sub == '.DS_Store' or not os.path.isdir(sub_path):
        continue

    for subsub in sorted(os.listdir(sub_path)):
        # Skip Finder metadata entries such as '.DS_Store'.
        if "Store" in subsub:
            continue

        # Pick the next unused number. shutil.move() puts the source *inside*
        # the destination when the destination is an existing directory, so
        # reusing a number (e.g. when this cell is run a second time) would
        # nest folders instead of adding new ones.
        dest = os.path.join(batch_root, "{:02d}".format(i))
        while os.path.exists(dest):
            i += 1
            dest = os.path.join(batch_root, "{:02d}".format(i))

        move(os.path.join(sub_path, subsub), dest)
        i += 1


# End up with a folder called batch with numbered subfolders

# Do the optional eye extraction after this and before splitting etc. 

batch
['batch_processed-2', '.DS_Store', 'batch_processed']


# Optional Eye Region Extraction
Set `EXTRACT_EYE_REGION` to `True` to extract a region around the glasses in the augmented frames.  

In [29]:
# Helper Classes and Functions

class MaskRegion:
    """Plain container for the row and column limits of a mask bounding box."""

    def __init__(self, rmin, rmax, cmin, cmax):
        # Row limits first, then column limits.
        self.rmin, self.rmax = rmin, rmax
        self.cmin, self.cmax = cmin, cmax
 
# https://stackoverflow.com/questions/31400769/bounding-box-of-numpy-array
def bbox(img):
    """Return the bounding box of the positive entries of a 2-D array.

    Parameters
    ----------
    img : numpy.ndarray
        Array whose entries greater than zero are treated as foreground.

    Returns
    -------
    (rmin, rmax, cmin, cmax)
        Index of the first and last row, and of the first and last column,
        that contain a positive entry. Both ends are inclusive.

    Notes
    -----
    If no entry is positive, every argmax below returns 0, so the result
    spans the whole array: (0, n_rows - 1, 0, n_cols - 1).
    """
    foreground = img > 0
    row_hits = foreground.any(axis=1)
    col_hits = foreground.any(axis=0)
    n_rows, n_cols = foreground.shape[:2]

    # argmax on a boolean vector gives the position of the first True;
    # scanning the reversed vector gives the distance of the last True
    # from the end.
    rmin = np.argmax(row_hits)
    rmax = n_rows - 1 - np.argmax(row_hits[::-1])
    cmin = np.argmax(col_hits)
    cmax = n_cols - 1 - np.argmax(col_hits[::-1])
    return rmin, rmax, cmin, cmax

In [30]:

EXTRACT_EYE_REGION = True
W_PAD = 5 # Width Padding (extra columns kept left and right of the mask box)
H_PAD = 5 # Height Padding (extra rows kept above and below the mask box)


def _mask_key(file_name):
    """Build the key that pairs a frame image with its mask image.

    The key is made of the '_'-separated fields 1, 3, 4 and 5 of the file
    name. Returns None when the name has fewer than six fields, so callers
    can skip files that do not follow the naming scheme.
    """
    parts = file_name.split("_")
    if len(parts) < 6:
        return None
    return "_".join((parts[1], parts[3], parts[4], parts[5]))


if(EXTRACT_EYE_REGION):

    # CREATE EYE REGION BATCH FOLDER
    result_dir_name = 'batch_eye_region'
    os.makedirs(result_dir_name, exist_ok=True)

    image_folder = "batch"
    subfolders = os.listdir(os.path.join(cwd, image_folder))

    for sub in subfolders:
        sub_path = os.path.join(cwd, image_folder, sub)

        # The original tested an undefined name (`value`) here, which raises
        # NameError on a fresh kernel; the loop variable is `sub`. Stray files
        # such as '.DS_Store' are skipped as well.
        if sub == '.DS_Store' or not os.path.isdir(sub_path):
            continue

        # Cropped images of folder `sub` are written to batch_eye_region/<sub>.
        dst = os.path.join(cwd, result_dir_name, sub)
        os.makedirs(dst, exist_ok=True)

        for subdir, dirs, files in os.walk(sub_path):
            # Create Mask dict: key derived from the file name -> bounding box
            mask_dict = {}

            for file in files:
                if "mask" not in file:
                    continue
                mask_name = _mask_key(file)
                if mask_name is None:
                    continue
                # Read in image (from the directory currently being walked)
                mask_image = imageio.imread(os.path.join(subdir, file))
                # Extract coordinates from bounding box.
                # NOTE(review): indexing channel 1 assumes the mask is stored
                # as a multi-channel image - confirm against the data.
                rmin, rmax, cmin, cmax = bbox(mask_image[:, :, 1])
                mask_dict[mask_name] = MaskRegion(rmin, rmax, cmin, cmax)

            # Use mask dict to extract eye region from images.
            for file in files:
                if "mask" in file or '_' not in file:
                    continue

                src = os.path.join(subdir, file)
                if imghdr.what(src) != 'png':
                    continue

                mask_name = _mask_key(file)
                mr = mask_dict.get(mask_name) if mask_name is not None else None
                if mr is None:
                    # Without a matching mask there is no region to crop.
                    print("No mask found for {}; skipping".format(src))
                    continue

                # Read in image
                im = imageio.imread(src)

                # Extract eye region. Rows are padded with H_PAD and columns
                # with W_PAD. The start is clamped at 0 because a negative
                # slice start would wrap around to the other side of the
                # image. rmax/cmax are inclusive indices, hence the +1 on the
                # (exclusive) slice end; an end beyond the image is truncated
                # by numpy.
                top = max(mr.rmin - H_PAD, 0)
                left = max(mr.cmin - W_PAD, 0)
                eye_region = im[top:mr.rmax + 1 + H_PAD, left:mr.cmax + 1 + W_PAD, :]

                # scipy.misc.imsave no longer exists in current SciPy releases;
                # imageio (already imported) writes the array instead.
                imageio.imwrite(os.path.join(dst, file), eye_region)



## Split, Clean and Preprocess

In [32]:
TEST_SPLIT = 0.2
TRAIN_SPLIT = 1.0 - TEST_SPLIT
SPLIT_SEED = 42  # Fixed seed so a re-run assigns the same folders to train / val

print('Splitting data into {} training data and {} validation data'.format(TRAIN_SPLIT, TEST_SPLIT))

frames = ["HazzBerry", "GerretLight", "Enzo", 'M14', 'M10']
#colors = ['obsidianBlack']
colors = ['obsidianBlack', 'glamourRed', 'goldPotato', 'tornadoBlue', 'lushGreen']

result_dir_name = 'processed_data'
train_dir = 'train'
val_dir = 'val'
image_folder = 'batch'

# When the eye regions were extracted, split the cropped images instead.
if(EXTRACT_EYE_REGION):
    image_folder = 'batch_eye_region'


# Create processed_data/{train,val}/<frame>_<color> for every class.
for split_dir in (train_dir, val_dir):
    os.makedirs(os.path.join(result_dir_name, split_dir), exist_ok=True)
    for pair in itertools.product(frames, colors):
        folder_name = '_'.join(pair)
        os.makedirs(os.path.join(result_dir_name, split_dir, folder_name), exist_ok=True)

# List the folders of the image set that is actually copied from
# (`image_folder`), not always 'batch'. Only directories are kept, so the count
# is right whether or not a '.DS_Store' file is present, and the list is
# sorted so the seeded split below does not depend on listing order.
image_root = os.path.join(cwd, image_folder)
subfolders = sorted(
    name for name in os.listdir(image_root)
    if os.path.isdir(os.path.join(image_root, name))
)

print("Total folders " + str(len(subfolders)))


# Divide into train and test

rng = np.random.RandomState(SPLIT_SEED)

for sub in subfolders:
    # The draw is made once per folder, so all images of one folder end up
    # in the same split.
    if rng.rand() < TEST_SPLIT:
        split_folder = val_dir
    else:
        split_folder = train_dir

    for subdir, dirs, files in os.walk(os.path.join(image_root, sub)):

        for file in files:
            if("mask" in file):
                continue

            source = os.path.join(subdir, file)
            if imghdr.what(source) != 'png':
                continue

            # The class folder is built from fields 1 and 2 of the file name;
            # names with fewer than three '_'-separated fields are skipped.
            file_elements = file.split("_")
            if len(file_elements) < 3:
                continue

            folder = file_elements[1] + '_' + file_elements[2]
            # Prefix with the source folder name so files from different
            # folders cannot overwrite each other.
            dst = os.path.join(cwd, result_dir_name, split_folder, folder, sub + '_' + file)
            copyfile(source, dst)
                        

Splitting data into 0.8 training data and 0.2 validation data
Total folders 47
