# Preprocessing and Splitting the Datasets
Before training the model, you must preprocess the dataset(s). Preprocessing creates directories containing the preprocessed and augmented images. These images are used to train the model and can also be used during inference.

In [1]:
import os, shutil
from pathlib import Path

# Root folder of the raw datasets and root folder for everything this notebook generates.
source_path = 'raw_data'
dest_path = 'data'

# Remove every generated folder that is still present from an earlier (possibly interrupted) run.
# Each folder is checked on its own, so a missing 'Validation' folder or a leftover
# 'Binarized'/'Annotations' folder no longer makes this cell fail.
for folder in ('Binarized', 'Annotations', 'Train', 'Validation'):
    folder_path = os.path.join(dest_path, folder)
    if os.path.isdir(folder_path):
        shutil.rmtree(folder_path)

# Working folders that hold the binarized images and their annotations until the split is made.
# os.makedirs also creates dest_path itself when it does not exist yet.
os.makedirs(os.path.join(dest_path, 'Binarized'))
os.makedirs(os.path.join(dest_path, 'Annotations'))

# Final layout: <dest_path>/<Train|Validation>/<Images|Annotations>
for split in ('Train', 'Validation'):
    for sub_folder in ('Images', 'Annotations'):
        os.makedirs(os.path.join(dest_path, split, sub_folder))

## Binarize each dataset and dump Images and Annotations

In [2]:
import cv2

def binarize(image): # image <- cv2 image
    """Return a black/white (0 or 255) version of a cv2 image.

    The image is first smoothed with a 5x5 Gaussian blur, then thresholded with
    Gaussian-weighted adaptive thresholding (neighbourhood size 21, constant 10).
    The callers in this notebook pass images loaded with cv2.imread(path, 0).
    """
    smoothed = cv2.GaussianBlur(image, (5, 5), 0)
    return cv2.adaptiveThreshold(
        smoothed,
        255,                             # value assigned to pixels above the local threshold
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        21,                              # neighbourhood (block) size
        10,                              # constant subtracted from the weighted mean
    )

# Every entry of source_path is treated as one dataset that should contain an 'Images' folder
# and an 'Annotations' folder. Binarized images and annotation files of all datasets are
# written into the shared <dest_path>/Binarized and <dest_path>/Annotations folders.
# NOTE(review): files are stored under their bare file name, so two datasets containing a file
# with the same name overwrite each other here.
all_datasets = os.listdir(source_path)
for dataset in all_datasets:
    dataset_path = os.path.join(source_path, dataset)
    images_dir = os.path.join(dataset_path, 'Images')
    annotations_dir = os.path.join(dataset_path, 'Annotations')

    # Stray files in source_path, or datasets lacking one of the two folders, are reported
    # through the "images not found" branch below instead of raising from os.listdir.
    all_images = os.listdir(images_dir) if os.path.isdir(images_dir) else []
    all_annotations = os.listdir(annotations_dir) if os.path.isdir(annotations_dir) else []

    if len(all_images) > 0:
        processed = 0
        for img in all_images:
            # Flag 0 loads the file as a single-channel grayscale image.
            image = cv2.imread(os.path.join(images_dir, img), 0)
            if image is None:
                # cv2.imread returns None for files it cannot decode.
                print(f"Skipping unreadable image '{img}' in dataset '{dataset}'")
                continue
            binarized = binarize(image)
            cv2.imwrite(os.path.join(dest_path, 'Binarized', img), binarized)
            processed += 1

        for gt in all_annotations :
            shutil.copy(os.path.join(annotations_dir, gt), os.path.join(dest_path, 'Annotations', gt))
        print(f"Completed '{dataset}' preprocessing : {processed} images")
    else:
        print(f"Dataset {dataset} images not found")

Completed 'Bongabdo' preprocessing : 3 images


# Augmenting the Preprocessed Data and Duplicating the Ground-Truth (GT) Annotations

In [3]:
import numpy as np
from PIL import Image, ImageEnhance

img_path = os.path.join(dest_path, 'Binarized')
gt_path = os.path.join(dest_path, 'Annotations')
all_images = os.listdir(img_path)
print(f"Before Augmentation, Number of Images : {len(all_images)}")

# Rotation angles in degrees, passed straight to Image.rotate (positive = anticlockwise).
rotation_angles = (2.5, 5, -2.5, -5)
# Factors passed to ImageEnhance.Brightness.enhance: first the "increase", then the "decrease".
brightness_factors = (5, 0.85)


def save_augmented(variant, stem, index):
    """Save one augmented image and duplicate the annotation of its source image.

    The image is written to '<img_path>/<stem>_<index>.jpg' and '<gt_path>/<stem>.txt'
    is copied unchanged to '<gt_path>/<stem>_<index>.txt'.
    """
    variant.save(os.path.join(img_path, f"{stem}_{index}.jpg"))
    shutil.copy(os.path.join(gt_path, f"{stem}.txt"), os.path.join(gt_path, f"{stem}_{index}.txt"))


# NOTE(review): all_images is whatever Binarized contains right now, so running this cell a
# second time would also augment the files produced by the first run.
# NOTE(review): the annotation is copied as-is even for rotated images, whose canvas grows
# because of expand=True. If the .txt files hold pixel coordinates they will no longer match
# the rotated image - confirm against the annotation format.
for img in all_images:
    image = img.rsplit('.', maxsplit = 1)[0]  # file name without its extension

    # The context manager closes the source file once all six variants have been written.
    with Image.open(os.path.join(img_path, img)) as original:
        c = 0

        # Variants 1-4: small rotations; the area exposed by the rotation is filled with 255.
        for angle in rotation_angles:
            c += 1
            save_augmented(original.rotate(angle, expand=True, fillcolor=255), image, c)

        # Variants 5-6: brightness changes.
        enhancer = ImageEnhance.Brightness(original)
        for factor in brightness_factors:
            c += 1
            save_augmented(enhancer.enhance(factor), image, c)

print(f"After Augmentation, Number of Images : {len(os.listdir(os.path.join(img_path)))}")

Before Augmentation, Number of Images : 3
After Augmentation, Number of Images : 21


The code below checks that, after augmentation, every image in `Binarized` has a matching annotation file in `Annotations`. The data has not been split into Train/Validation yet, even though the printed labels say "Train". It should print `True` if every image has a corresponding annotation.

In [4]:
# File names without extension of every image and every annotation produced so far.
# os.listdir returns entries in arbitrary order, so both lists are sorted before the
# element-wise comparison; otherwise correctly paired folders could still compare unequal.
all_imgs = sorted(i.rsplit('.', maxsplit=1)[0] for i in os.listdir(os.path.join(dest_path, 'Binarized')))
all_gt = sorted(i.rsplit('.', maxsplit=1)[0] for i in os.listdir(os.path.join(dest_path, 'Annotations')))

print(f"Train set check : {all_imgs == all_gt}")
print(f"Train images : {len(all_imgs)}\tTrain Annotations : {len(all_gt)}\t{len(all_imgs) == len(all_gt)}")

Train set check : True
Train images : 21	Train Annotations : 21	True


## Resize Augmented Images

In [5]:
def resizeAugmentedImages(image : str):
    """Pad an image file to a 9:16 (width:height) canvas and resize it to 360x640.

    The file is read as grayscale and placed in the top-left corner of a canvas filled
    with 255. The canvas is wider than the image when the image is too tall for 9:16,
    otherwise it is taller. The padded canvas is then resized to width 360, height 640.

    Args:
        image: path of the image file to read.

    Returns:
        The resized image as a 2-D uint8 array of shape (640, 360).

    Raises:
        ValueError: if cv2.imread cannot read the file (it returns None in that case).
    """
    img = cv2.imread(image, 0)
    if img is None:
        raise ValueError(f"Could not read image file: {image}")

    h, w = img.shape[0], img.shape[1] # (h, w)
    # Height a 9:16 canvas needs in order to keep the current width.
    new_height = int(np.ceil((w * 16)/9))
    if h > new_height :
        # Image is taller than 9:16 allows: keep the height and pad on the right.
        new_width = int(np.ceil((h * 9)/16))
        new_img = np.full((h, new_width), 255, dtype=np.uint8) # In case of np.full -> (h, w)
        new_img[:, :w] = img # (h, w)
    else:
        # Image is at most as tall as 9:16 allows: keep the width and pad at the bottom.
        new_img = np.full((new_height, w), 255, dtype=np.uint8)
        new_img[:h, :] = img # (h, w)
    # cv2.resize takes the target size as (width, height).
    return cv2.resize(new_img, (360, 640))

# Replace every file in Binarized (originals and augmented variants) by its padded and
# resized version, written back under the same path.
binarized_dir = os.path.join(dest_path, 'Binarized')
all_imgs = os.listdir(binarized_dir)
for img in all_imgs:
    target = os.path.join(binarized_dir, img)
    resized = resizeAugmentedImages(target)
    cv2.imwrite(target, resized)

# Creating Train and Validation split

In [6]:
train_ratio = 0.85
split_seed = 42  # fixed seed so that re-running the notebook reproduces the same split

import numpy as np
# os.listdir order is arbitrary; sorting it makes the seeded permutation below pick the same
# files on every run.
all_images = np.array(sorted(os.listdir(os.path.join(dest_path, 'Binarized'))))
total = len(all_images)
train_size = int(total * train_ratio)
val_size = total - train_size
random_permutation = np.random.default_rng(split_seed).permutation(total)

# NOTE(review): the split is made per file after augmentation, so augmented variants of one
# source image can end up in both Train and Validation.
img_train = all_images[random_permutation[:train_size]]
# Slice from train_size instead of [-val_size:]: with val_size == 0 the latter is [-0:],
# which selects every image and would make the Validation loop move files that are already gone.
img_val = all_images[random_permutation[train_size:]]
print((len(img_train)+len(img_val)) == total)

# Move each image together with its annotation ('<name>.txt') into the folders of its split.
for split, split_images in (('Train', img_train), ('Validation', img_val)):
    for img in split_images :
        shutil.move(os.path.join(dest_path, 'Binarized', img), os.path.join(dest_path, split, 'Images', img))
        gt = img.rsplit('.', maxsplit = 1)[0] + '.txt'
        shutil.move(os.path.join(dest_path, 'Annotations', gt), os.path.join(dest_path, split, 'Annotations', gt))

# Both working folders should be empty now. Anything left over is printed so it can be
# inspected; an empty folder is removed.
for folder in ('Binarized', 'Annotations'):
    folder_path = os.path.join(dest_path, folder)
    leftovers = os.listdir(folder_path)
    if len(leftovers) > 0 :
        print(leftovers)
    else:
        shutil.rmtree(folder_path)

True


The code below checks that, within each of the Train and Validation splits, every image has a matching annotation file. Both checks should print `True` if the images and their corresponding annotations were moved correctly.

In [7]:
# For each split, compare the file names (without extension) of the images with those of the
# annotations. os.listdir returns entries in arbitrary order, so both lists are sorted first;
# otherwise a correctly paired split could still be reported as False.
all_imgs = sorted(i.rsplit('.', maxsplit=1)[0] for i in os.listdir(os.path.join(dest_path, 'Train', 'Images')))
all_gt = sorted(i.rsplit('.', maxsplit=1)[0] for i in os.listdir(os.path.join(dest_path, 'Train', 'Annotations')))

print(f"Train set check : {all_imgs == all_gt}")

all_imgs = sorted(i.rsplit('.', maxsplit=1)[0] for i in os.listdir(os.path.join(dest_path, 'Validation', 'Images')))
all_gt = sorted(i.rsplit('.', maxsplit=1)[0] for i in os.listdir(os.path.join(dest_path, 'Validation', 'Annotations')))

print(f"Validation set check : {all_imgs == all_gt}")

Train set check : True
Validation set check : True
