In [None]:
import tensorflow as tf
import json
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path
import albumentations as alb
import cv2
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil

### 2.2 Limit GPU Memory Growth

In [None]:
# Avoid OOM errors by setting GPU Memory Consumption Growth.
# The devices are listed through the stable `tf.config.list_physical_devices`
# (the same call the next cell uses) instead of its `experimental` alias.
# When no GPU is found the list is empty and the loop does nothing.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
# Display the GPUs TensorFlow can see (an empty list means none were found).
tf.config.list_physical_devices('GPU')

### 2.3 Load Image into TF Data Pipeline

In [None]:
# Dataset root and its two sub-folders: the .jpg images and the .json labels.
data_wd = Path('/data', 'raspi_face_detection')
img_path = data_wd.joinpath('images')
labels_path = data_wd.joinpath('labels')

In [None]:
# Dataset of image file names matching the .jpg pattern; shuffle=False is
# passed so the listing is not randomly shuffled.
jpg_pattern = f'{img_path}/*.jpg'
images = tf.data.Dataset.list_files(jpg_pattern, shuffle=False)

In [None]:
# Peek at the first element of the dataset (a file name at this stage).
next(images.as_numpy_iterator())

In [None]:
def load_image(x):
    """Read the file at path `x` and decode its contents as a JPEG.

    Args:
        x: Path of the JPEG file; handed straight to `tf.io.read_file`.

    Returns:
        The image tensor returned by `tf.io.decode_jpeg`.
    """
    return tf.io.decode_jpeg(tf.io.read_file(x))

In [None]:
# Replace every file name in the dataset with its decoded image.
# NOTE(review): `images` is rebound here, so re-running this cell on its own
# maps `load_image` over already-decoded images instead of file names; re-run
# the `list_files` cell first.
images = images.map(load_image)

In [None]:
# Peek at the first decoded image as a NumPy array.
next(images.as_numpy_iterator())

In [None]:
# Show which dataset class the mapped pipeline is.
type(images)

### 2.4 View Raw Images with Matplotlib

In [None]:
# Iterator that yields the images in batches of four as NumPy arrays.
image_batches = images.batch(4)
image_generator = image_batches.as_numpy_iterator()

In [None]:
# Pull the next batch; re-run this cell to advance to the following one.
plot_images = next(image_generator)

In [None]:
# Draw the current batch side by side, one image per column.
fig, ax = plt.subplots(ncols=4, figsize=(20,20))
for axis, image in zip(ax, plot_images):
    axis.imshow(image)
plt.show()

# 3. Partition Unaugmented Data

### 3.1 MANUALLY SPLIT DATA INTO TRAIN, TEST AND VAL

#### 3.1.1 Find Errors Where x/y-Max < x/y-min

In [None]:
errors = list()

# The augmentation pipeline is used here purely as a validator: it is run once
# per labelled box and every ValueError it raises is recorded. Per the section
# heading, the failures of interest are boxes whose x/y max is below x/y min.
augmentor = alb.Compose([alb.RandomCrop(width=450, height=450),
                         alb.HorizontalFlip(p=0.5),
                         alb.RandomBrightnessContrast(p=0.2),
                         alb.RandomGamma(p=0.2),
                         alb.RGBShift(p=0.2),
                         alb.VerticalFlip(p=0.5)],
                         bbox_params=alb.BboxParams(format='albumentations',
                                                    label_fields=['class_labels']))

# Image stems that are excluded from the check.
EMPTY_PHOTOS = ['de1c5f22-0cbd-11ef-9a53-dca632a68397', 'dce7e950-0cbd-11ef-9a53-dca632a68397', '22814110-0d58-11ef-abd8-dca632a68397', 'dbb39962-0cbd-11ef-9a53-dca632a68397']

# Only .jpg files are images (the loop opens `<stem>.jpg` anyway); sorting makes
# the order of `errors` independent of the filesystem's listing order.
fns = sorted(img_path.glob('*.jpg'))
for fn in fns:
    stem = fn.stem

    if stem in EMPTY_PHOTOS:
        continue

    img_fn   = img_path / f'{stem}.jpg'
    label_fn = labels_path / f'{stem}.json'

    # An image without a label file has no boxes to validate; report and move on
    # instead of aborting the whole scan with FileNotFoundError.
    if not label_fn.exists():
        print(f'Skipping {stem}: no label file')
        continue

    img = cv2.imread(str(img_fn))
    # cv2.imread returns None (it does not raise) when a file cannot be decoded.
    if img is None:
        print(f'Skipping {stem}: image could not be read')
        continue
    # Normalise by the image's own size rather than an assumed 640x480.
    img_height, img_width = img.shape[:2]

    with open(label_fn, 'r') as f:
        label = json.load(f)

    for shape in label['shapes']:
        # The first two points are taken as the box corners and scaled by the
        # image size; bbox format 'albumentations' takes normalised
        # [x_min, y_min, x_max, y_max].
        points = shape['points']
        coords = [
            points[0][0] / img_width,
            points[0][1] / img_height,
            points[1][0] / img_width,
            points[1][1] / img_height,
        ]

        try:
            # Only the raised/not-raised outcome matters; the result is discarded.
            augmentor(image=img, bboxes=[coords], class_labels=['face'])
        except ValueError:
            errors.append([stem, shape['label']])

print(len(errors))

In [None]:
# Dump the collected [stem, label] error rows as CSV text for copy/paste.
errors_csv = pd.DataFrame(errors).to_csv()
print(errors_csv)

#### 3.1.2 Programmatically Split Data into Train/Test/Val

In [None]:
# One record per label file: the image file name and a comma-separated, sorted
# list of the shape labels it contains (the key the split is stratified on).
# Only *.json files are read, and in sorted order: glob order depends on the
# filesystem, and the row order of these records feeds the seeded
# train/test split, so an unsorted listing makes the split non-reproducible.
labels = list()
for label_fn in sorted(labels_path.glob('*.json')):
    with open(label_fn, 'r') as f:
        label_json = json.load(f)
    labels.append(dict(
        image_path=Path(label_json['imagePath']).name,
        labels=', '.join(sorted(x['label'] for x in label_json['shapes']))
    ))

In [None]:
# Tabulate the label records and count how many images share each
# label combination.
df_labels = pd.DataFrame(labels)
df_labels['labels'].value_counts()

In [None]:
# 70 % train; the remaining 30 % is halved into test and validation.
# Both splits are stratified on the label combination and use a fixed seed.
train, holdout = train_test_split(
    df_labels,
    test_size=0.3,
    random_state=42,
    stratify=df_labels.labels,
)
test, val = train_test_split(
    holdout,
    test_size=0.5,
    random_state=42,
    stratify=holdout.labels,
)

In [None]:
# Print the label distribution of each split to check the stratification.
for heading, subset in [
    ('Training Set', train),
    ('\nTest Set', test),
    ('\nValidation Set', val),
]:
    print(heading)
    print(subset.labels.value_counts().to_string(header=False))

In [None]:
# Image file names belonging to each split, keyed by split name.
model_sets_files = {
    'train': train.image_path.tolist(),
    'test': test.image_path.tolist(),
    'val': val.image_path.tolist(),
}

# Sanity check: absolute size of each split and its share of all labelled images.
set_sizes = {name: len(files) for name, files in model_sets_files.items()}
print('Model Set Percentages Check')
print(set_sizes)
print({name: size / len(df_labels) for name, size in set_sizes.items()})

### 3.2 Moving Images and Labels

In [None]:
# Folder that holds one sub-folder per split.
model_sets_path = data_wd.joinpath('model_sets')

In [None]:
# Show what currently exists directly under the model-sets folder.
[entry for entry in model_sets_path.glob('*')]

In [None]:
# Create the folder layout for a single split (train): <split>/images and
# <split>/labels.
# parents=True lets this run on a fresh data directory where `model_sets`
# itself does not exist yet; exist_ok=True keeps the cell re-runnable.
folder = 'train'
model_set_filepath = model_sets_path / folder
model_set_filepath_images = model_set_filepath / 'images'
model_set_filepath_labels = model_set_filepath / 'labels'
for p in [model_set_filepath, model_set_filepath_images, model_set_filepath_labels]:
    p.mkdir(parents=True, exist_ok=True)

In [None]:
# Copy every image of each split, together with its label file, into
# <model_sets>/<split>/images and <model_sets>/<split>/labels.
# NOTE: this only ever adds files. Anything copied by an earlier run with a
# different split stays where it is and has to be cleaned out by hand.
for folder in ['train','test','val']:
    model_set_filepath = model_sets_path / folder
    model_set_filepath_images = model_set_filepath / 'images'
    model_set_filepath_labels = model_set_filepath / 'labels'
    for p in [model_set_filepath, model_set_filepath_images, model_set_filepath_labels]:
        # parents=True: do not require `model_sets` to exist beforehand.
        p.mkdir(parents=True, exist_ok=True)

    for img_filename in model_sets_files[folder]:
        src_img_filepath, dst_img_filepath = [p / img_filename for p in [img_path, model_set_filepath_images]]
        shutil.copyfile(src_img_filepath, dst_img_filepath)

        # Replace only the final extension; splitting on the first '.' would
        # truncate file names that contain additional dots.
        label_filename = Path(img_filename).stem + '.json'
        src_label_filepath, dst_label_filepath = [p / label_filename for p in [labels_path, model_set_filepath_labels]]
        shutil.copyfile(src_label_filepath, dst_label_filepath)