# Preprocessing

The clean and preprocess section will divide batch images into seperate class folders. 

The split section will divide those folders into train and validation sets. 

In [1]:
import os
import imghdr
import itertools
import numpy as np
import imageio
import scipy.misc
import matplotlib.pyplot as plt
import random
from math import floor
from shutil import copyfile, move, copytree

random.seed(101)

In [2]:
cwd = os.getcwd()
print(cwd)

/Users/aske/Desktop/Project_AI/project_ai


# Batch Concatenation

Uncomment and run below cell if augmented data has been received in multiple batches.   
Add the unzipped batch folders into the "Received_Batches" folder, then run the cell below. 

In [3]:

source_folder = 'Received_Batches'
dst = 'batch'

if not os.path.exists(dst):
    os.makedirs(dst)
    
subfolders = os.listdir(os.path.join(cwd, source_folder))
print(subfolders)

i = 0

for sub in subfolders:
    if(sub != '.DS_Store'):

        subsubfolders = os.listdir(os.path.join(os.path.join(cwd, source_folder), sub))
        
        for subsub in subsubfolders:
    
            if("Store" in subsub):
                    continue
                
            if(sub != '.DS_Store'):
                #print(subsub)
                if(i < 10):
                    foldername = "0" + str(i)
                else:
                    foldername = str(i)
                src = cwd + "/" + source_folder + "/" + sub + "/" + subsub
                dest = cwd + "/" + dst + "/" + foldername
                #print(src)
                #print(dest)
                move(src, dest)
                i += 1
                

['.DS_Store', 'batch 7', 'batch 6', 'batch_processed-9', 'batch_processed-8', 'batch', 'batch 4', 'batch 3', 'batch 2', 'batch 5']


# Optional Eye Region Extraction
Set EXTRACT_EYE_REGION to true to extract a region around the glasses in the augmented frames.  

In [4]:
# Helper Classes and Functions

class MaskRegion:
    
    def __init__(self, rmin, rmax, cmin, cmax):
        self.rmin = rmin
        self.rmax = rmax
        self.cmin = cmin
        self.cmax = cmax
 
# https://stackoverflow.com/questions/31400769/bounding-box-of-numpy-array
def bbox(img):
    img = (img > 0)
    rows = np.any(img, axis=1)
    cols = np.any(img, axis=0)
    rmin, rmax = np.argmax(rows), img.shape[0] - 1 - np.argmax(np.flipud(rows))
    cmin, cmax = np.argmax(cols), img.shape[1] - 1 - np.argmax(np.flipud(cols))
    return rmin, rmax, cmin, cmax

In [5]:

EXTRACT_EYE_REGION = True
W_PAD = 10 # Width Padding
H_PAD = 10 # Height Padding


if(EXTRACT_EYE_REGION):

    # CREATE EYE REGION BATCH FOLDER
    result_dir_name = 'batch_eye_region'

    if not os.path.exists(result_dir_name):
        os.makedirs(result_dir_name)

    image_folder = "batch"
    subfolders = os.listdir(os.path.join(cwd,'batch'))

    for sub in subfolders:
        if(sub != '.DS_Store'):        
            for subdir, dirs, files in os.walk(os.path.join(cwd, image_folder, sub)):
                # Create Mask dict
                mask_dict = {}
                
                dst = os.path.join(cwd, result_dir_name, sub)
                if not os.path.exists(dst):
                    os.makedirs(dst)
                
                for file in files:
                    if("mask" in file):
                        # Read in image
                        mask_image = imageio.imread(os.path.join(cwd, image_folder, sub, file))
                        # Extract coordinates from bounding box
                        rmin, rmax, cmin, cmax = bbox(mask_image[:, :, 1])
                        mask_region = MaskRegion(rmin, rmax, cmin, cmax)
                        elements = file.split("_")
                        mask_name = "_".join((elements[1], elements[3], elements[4], elements[5]))
                        mask_dict[mask_name] = mask_region

                # Use mask dict to extract eye region from images. 
                for file in files:
                    if("mask" in file):
                        continue
                    
                    if(imghdr.what(os.path.join(subdir, file)) == 'png'):
                        if('_' in file):
                            elements = file.split("_")
                            mask_name = mask_name = "_".join((elements[1], elements[3], elements[4], elements[5]))
                            # Read in image
                            im = imageio.imread(os.path.join(cwd, image_folder, sub, file))
                            # Extract eye region
                            mr = mask_dict[mask_name]
                            eye_region = im[mr.rmin-W_PAD:mr.rmax+W_PAD, mr.cmin-H_PAD:mr.cmax+H_PAD, :]
                            scipy.misc.imsave(os.path.join(cwd, dst, file), eye_region)



`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.


## Split, Clean and Preprocess

In [6]:
TEST_SPLIT = 0.2
TRAIN_SPLIT = 1.0 - TEST_SPLIT

# Helper function to divide folder list
# From https://stackoverflow.com/questions/42471570/how-to-split-documents-into-training-set-and-test-set
def get_training_and_testing_sets(file_list):
    split = TRAIN_SPLIT
    split_index = floor(len(file_list) * split)
    training = file_list[:split_index]
    testing = file_list[split_index:]
    return training, testing

print('Splitting data into {} training data and {} validation data'.format(TRAIN_SPLIT, TEST_SPLIT))

frames = ["HazzBerry", "GerretLight", "Enzo", 'M14', 'M10']
colors = ['obsidianBlack', 'glamourRed', 'goldPotato', 'tornadoBlue', 'lushGreen']

result_dir_name = 'processed_data'
train_dir = 'train'
val_dir = 'val'
image_folder = 'batch'

if(EXTRACT_EYE_REGION):
    image_folder = 'batch_eye_region'


if not os.path.exists(result_dir_name):
        os.makedirs(result_dir_name)
        
if not os.path.exists(os.path.join(result_dir_name, train_dir)):
        os.makedirs(os.path.join(result_dir_name, train_dir))
        
if not os.path.exists(os.path.join(result_dir_name, val_dir)):
        os.makedirs(os.path.join(result_dir_name, val_dir))

for pair in itertools.product(frames, colors):
    folder_name = '_'.join(pair)
    dir_in_train = result_dir_name +'/' + train_dir + '/' + folder_name
    dir_in_val = result_dir_name +'/' + val_dir + '/' + folder_name

    if not os.path.exists(dir_in_train):
        os.makedirs(dir_in_train)
    
    if not os.path.exists(dir_in_val):
        os.makedirs(dir_in_val)
    
subfolders = os.listdir(os.path.join(cwd,'batch'))

# Shuffle the list
random.shuffle(subfolders)

# Divide into train and test
training, testing = get_training_and_testing_sets(subfolders)

total = (len(subfolders))
t_count = len(training)
v_count = len(testing)

print("Dividing {} image folders into {} folders for training and {} for testing".format(total, t_count, v_count))
      
# Divide into train and test
for sub in subfolders:
    if(sub != '.DS_Store'):
        if sub in training:
            split_folder = train_dir
        else:
            split_folder = val_dir
                
        for subdir, dirs, files in os.walk(cwd+'/batch/' + sub):
            
            for file in files:
                if("mask" in file):
                    continue
                    
                if(imghdr.what(os.path.join(subdir, file)) == 'png'):
                    if('_' in file):
                        file_elements = file.split("_")
                        folder = file_elements[1] + '_' + file_elements[2]
                        source = os.path.join(cwd,image_folder, sub, file)
                        dst = os.path.join(cwd, result_dir_name, split_folder, folder) + '/' +  sub + '_' + file
                        copyfile(source, dst)

Splitting data into 0.8 training data and 0.2 validation data
Dividing 200 image folders into 160 folders for training and 40 for testing
