In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd drive/MyDrive/JnJ Synthetic/data_conversion

/content/drive/MyDrive/JnJ Synthetic/data_conversion


In [3]:
import glob
import os
import json
import re
from distutils.dir_util import copy_tree
from random import shuffle
import shutil
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

from src.create_annotations import *

# Pre-processing

Tidy up the data output from Unity and organise it into training and validation sets for Mask R-CNN.

In [None]:
main_folder = '/content/drive/MyDrive/JnJ Synthetic/data_conversion/data/labo_5'

In [None]:
# function to rename files to make them unique

def _prefix_files(directory, prefix):
  """Rename every entry of `directory` to `<prefix>_<name>`, skipping entries already prefixed."""
  marker = f'{prefix}_'
  for file in os.listdir(directory):
    if file.startswith(marker):
      # already renamed by an earlier run; renaming again would stack prefixes
      continue
    os.rename(os.path.join(directory, file), os.path.join(directory, marker + file))


def unique_files(main_folder):
  """Make file names unique across scenes by prefixing them with their scene folder name.

  For every scene folder inside `main_folder`, the files in its first sub-folder
  starting with 'RGB' and its first sub-folder starting with 'Semantic' are renamed
  in place from `<name>` to `<scene folder>_<name>`.

  The function is safe to re-run: files that already carry the prefix are left alone.
  Entries of `main_folder` that are not directories, and the 'final' output folder,
  are skipped. A scene folder missing one of the two sub-folders only logs a warning.

  Args:
    main_folder: path of the directory holding one sub-folder per Unity scene.
  """
  # run through every folder generated by the different scenes in Unity
  for folder in os.listdir(main_folder):
    path = os.path.join(main_folder, folder)
    if folder == 'final' or not os.path.isdir(path):
      continue

    inner_folders = os.listdir(path)
    for kind in ('RGB', 'Semantic'):
      matches = [i for i in inner_folders if i.startswith(kind)]
      if not matches:
        logging.warning(f'No {kind} folder found in {folder} folder, skipping it.')
        continue
      # rename each file to include the scene folder name in its name
      _prefix_files(os.path.join(path, matches[0]), folder)

    logging.info(f'Completed renaming of files for {folder} folder!')

In [None]:
unique_files(main_folder)

2021-09-16 05:31:54 [INFO]: Completed renaming of files for aa42780f-bf8e-4e3b-a124-71dc0396d961 folder!
2021-09-16 05:31:54 [INFO]: Completed renaming of files for c064b49a-ee5f-48af-ac65-56d0c714af1b folder!
2021-09-16 05:31:55 [INFO]: Completed renaming of files for aa0beb8d-e18b-4a67-951c-1d48dcf3427a folder!
2021-09-16 05:31:56 [INFO]: Completed renaming of files for 8a305747-f032-44d7-9081-0545c49df4f3 folder!
2021-09-16 05:31:56 [INFO]: Completed renaming of files for e03361ec-eba0-4bf5-8684-489755d4a0d7 folder!
2021-09-16 05:31:57 [INFO]: Completed renaming of files for 416e4f97-dec7-490b-bde1-d984b8b52a13 folder!
2021-09-16 05:31:57 [INFO]: Completed renaming of files for 45438485-fa0d-4493-8d17-f859ad6a8f0e folder!


In [None]:
# function to add all files into a single directory
def _copy_dir_contents(src, dst):
  """Recursively copy everything inside `src` into `dst`, creating directories as needed.

  Existing files in `dst` with the same name are overwritten.
  """
  for root, _dirs, files in os.walk(src):
    rel = os.path.relpath(root, src)
    target = dst if rel == os.curdir else os.path.join(dst, rel)
    os.makedirs(target, exist_ok=True)
    for name in files:
      # copy2 also carries over file metadata (timestamps, mode)
      shutil.copy2(os.path.join(root, name), os.path.join(target, name))


def group_data(main_folder):
  """Gather the RGB and Semantic files of every scene into one 'final' folder.

  Creates `<main_folder>/final/images` and `<main_folder>/final/semantic` (if they do
  not exist yet) and copies into them the contents of each scene's first 'RGB*' and
  first 'Semantic*' sub-folder respectively.

  Entries of `main_folder` that are not directories, and the 'final' folder itself,
  are skipped. A scene folder missing either sub-folder is skipped with a warning.

  Args:
    main_folder: path of the directory holding one sub-folder per Unity scene.
  """
  # create a new final folder with sub-folders of 'images' and 'semantic'
  main_path = os.path.join(main_folder, 'final')
  final_path = os.path.join(main_path, 'images')
  final_path_2 = os.path.join(main_path, 'semantic')
  # makedirs per folder: a partially created tree from an earlier run is completed
  os.makedirs(final_path, exist_ok=True)
  os.makedirs(final_path_2, exist_ok=True)

  for folder in os.listdir(main_folder):
    path = os.path.join(main_folder, folder)
    if folder == 'final' or not os.path.isdir(path):
      continue

    inner_folders = os.listdir(path)

    # get the name of the RGB and Semantic folders
    img_folders = [i for i in inner_folders if i.startswith('RGB')]
    semantic_folders = [i for i in inner_folders if i.startswith('Semantic')]
    if not img_folders or not semantic_folders:
      logging.warning(f'No RGB/Semantic folder pair found in {folder} folder, skipping it.')
      continue

    # copy all items from these paths to the final folders
    _copy_dir_contents(os.path.join(path, img_folders[0]), final_path)
    _copy_dir_contents(os.path.join(path, semantic_folders[0]), final_path_2)

    logging.info(f'Completed copying files from {folder} folder!')

In [None]:
group_data(main_folder)

2021-09-16 05:33:01 [INFO]: Completed copying files from aa42780f-bf8e-4e3b-a124-71dc0396d961 folder!
2021-09-16 05:33:11 [INFO]: Completed copying files from c064b49a-ee5f-48af-ac65-56d0c714af1b folder!
2021-09-16 05:33:23 [INFO]: Completed copying files from aa0beb8d-e18b-4a67-951c-1d48dcf3427a folder!
2021-09-16 05:34:14 [INFO]: Completed copying files from 8a305747-f032-44d7-9081-0545c49df4f3 folder!
2021-09-16 05:35:03 [INFO]: Completed copying files from e03361ec-eba0-4bf5-8684-489755d4a0d7 folder!
2021-09-16 05:35:29 [INFO]: Completed copying files from 416e4f97-dec7-490b-bde1-d984b8b52a13 folder!
2021-09-16 05:35:54 [INFO]: Completed copying files from 45438485-fa0d-4493-8d17-f859ad6a8f0e folder!


In [None]:
# split into train n val folders

def _copy_image_pairs(names, img_path, semantic_path, dst_img_path, dst_seg_path):
  """Copy each RGB image in `names` and its matching segmentation file to the destination folders."""
  for name in names:
    shutil.copy(os.path.join(img_path, name), os.path.join(dst_img_path, name))

    # the segmentation file shares the image name with 'rgb' swapped for 'segmentation'
    seg_name = name.replace('rgb', 'segmentation')
    shutil.copy(os.path.join(semantic_path, seg_name), os.path.join(dst_seg_path, seg_name))


def train_val_split(main_folder, train_ratio=0.8, seed=None):
  """Randomly split the grouped images into train and validation folders.

  Reads `<main_folder>/final/images` and `<main_folder>/final/semantic` and copies
  each image together with its segmentation file into
  `<main_folder>/final/{train,val}/{images,segmentation}`.

  Args:
    main_folder: path of the directory that contains the 'final' folder.
    train_ratio: fraction of images assigned to the training set (default 0.8).
    seed: optional seed for a reproducible shuffle; when None the module-level
      `shuffle` is used, as before.

  Raises:
    FileNotFoundError: if an image has no matching segmentation file.
  """
  main_path = os.path.join(main_folder, 'final')
  img_path = os.path.join(main_path, 'images')
  semantic_path = os.path.join(main_path, 'semantic')

  # create required folders (train & val)
  train_path = os.path.join(main_path, 'train')
  train_img_path = os.path.join(train_path, 'images')
  train_seg_path = os.path.join(train_path, 'segmentation')

  val_path = os.path.join(main_path, 'val')
  val_img_path = os.path.join(val_path, 'images')
  val_seg_path = os.path.join(val_path, 'segmentation')

  # makedirs per folder so a partially created tree is completed instead of skipped
  for directory in (train_img_path, train_seg_path, val_img_path, val_seg_path):
    os.makedirs(directory, exist_ok=True)

  # shuffle images randomly; sort first so a given seed always yields the same split
  images = sorted(os.listdir(img_path))
  if seed is None:
    shuffle(images)
  else:
    import random
    random.Random(seed).shuffle(images)

  # split into train and val sets; the val slice runs to the end of the list so
  # that the last image is not dropped
  no_of_images = int(train_ratio * len(images))
  train_set = images[:no_of_images]
  val_set = images[no_of_images:]

  # copy images and segmentations into new train and val folders
  _copy_image_pairs(train_set, img_path, semantic_path, train_img_path, train_seg_path)
  logging.info('Completed copying image and segmentation files to train folder!')

  _copy_image_pairs(val_set, img_path, semantic_path, val_img_path, val_seg_path)
  logging.info('Completed copying image and segmentation files to validation folder!')

In [None]:
train_val_split(main_folder)

2021-09-16 10:21:28 [INFO]: Completed copying image and segmentation files to train folder!
2021-09-16 10:21:30 [INFO]: Completed copying image and segmentation files to validation folder!


# Processing into Mask RCNN format

Get image details from RGB images and segmentation details from segmentation images and convert them into COCO format.

In [4]:
# Label ids of the dataset: maps each category name to its integer category id.
category_ids = {
    "labo": 0
}

# Define which colors match which categories in the images.
# Keys are colour strings such as "(0, 0, 255)"; values are category ids from category_ids.
# images_annotations_info looks up every sub-mask colour in this mapping.
# NOTE(review): "multiply value in json by 255" presumably means the colours in the
# exported JSON are 0-1 floats that must be scaled to 0-255 here — confirm.
category_colors = {
    "(0, 0, 255)": 0, # labo
}

# Define the ids that are a multipolygon, e.g. wall, roof and sky.
# Polygons of these categories are combined into one MultiPolygon annotation;
# with an empty list every polygon becomes its own annotation.
multipolygon_ids = []

In [5]:
# Get "images" and "annotations" info 
def images_annotations_info(main_directory):
  """Build the COCO "images" and "annotations" entries for one dataset folder.

  Every '.png' mask in `<main_directory>/segmentation` yields one image entry,
  named after the corresponding RGB file ('segmentation' replaced by 'rgb' in the
  mask file name), plus one annotation per polygon found in the mask. For
  categories listed in `multipolygon_ids` the polygons are combined into a single
  annotation instead.

  Masks are processed in sorted file-name order so image and annotation ids are
  the same on every run.

  Relies on `category_colors`, `multipolygon_ids` and the helpers brought in by
  `from src.create_annotations import *`.
  NOTE(review): `Image` is assumed to be PIL's Image module — confirm.

  Args:
    main_directory: folder containing the 'segmentation' sub-folder.

  Returns:
    Tuple `(images, annotations, annotation_id)`, where `annotation_id` equals
    the number of annotations created.

  Raises:
    KeyError: if a mask contains a colour that is not a key of `category_colors`.
  """
  mask_path = os.path.join(main_directory, 'segmentation')

  # These ids are increased automatically as we go
  annotation_id = 0
  image_id = 0
  annotations = []
  images = []

  for mask_name in sorted(os.listdir(mask_path)):
    if not mask_name.endswith(".png"):
      continue

    # Decide on the file name only: testing the full path would mark every
    # mask as augmented whenever a parent directory contains '_aug'.
    is_augmented = '_aug' in mask_name
    short_mask_name = mask_name.replace('_aug', '') if is_augmented else mask_name

    # The mask is in the segmentation folder but the image entry refers to the
    # RGB file; COCO file names do not include the path.
    rgb_name = short_mask_name.replace('segmentation', 'rgb')
    if is_augmented:
      # put the '_aug' identifier back so the entry points at the augmented image
      rgb_name = rgb_name.replace('.png', '_aug.png')

    # Open the mask and (to be sure) convert it to RGB; the context manager
    # closes the underlying file once the converted copy exists.
    with Image.open(os.path.join(mask_path, mask_name)) as mask_file:
      mask_image_open = mask_file.convert("RGB")
    w, h = mask_image_open.size

    # "images" info
    images.append(create_image_annotation(rgb_name, w, h, image_id))

    sub_masks = create_sub_masks(mask_image_open, w, h)
    for color, sub_mask in sub_masks.items():
      category_id = category_colors[color]

      # "annotations" info
      polygons, segmentations = create_sub_mask_annotation(sub_mask)

      # Check if we have classes that are a multipolygon
      if category_id in multipolygon_ids:
        # Combine the polygons to calculate the bounding box and area
        multi_poly = MultiPolygon(polygons)
        annotation = create_annotation_format(multi_poly, segmentations, image_id, category_id, annotation_id)
        annotations.append(annotation)
        annotation_id += 1
      else:
        for polygon in polygons:
          # Cleaner to recalculate this per polygon than to index `segmentations`
          segmentation = [np.array(polygon.exterior.coords).ravel().tolist()]
          annotation = create_annotation_format(polygon, segmentation, image_id, category_id, annotation_id)
          annotations.append(annotation)
          annotation_id += 1
    image_id += 1
  return images, annotations, annotation_id

In [6]:
def run_pipeline(main_directory):
    """Write a COCO-format `annotations.json` for one dataset folder.

    The JSON is assembled from `get_coco_json_format()`, the categories built from
    `category_ids`, and the images/annotations returned by
    `images_annotations_info(main_directory)`. It is saved next to the RGB images
    in `<main_directory>/images`, and the number of annotations is logged.

    Args:
        main_directory: folder containing the 'images' and 'segmentation' sub-folders.
    """
    # The JSON goes into the same folder as the RGB images
    annot_path = os.path.join(main_directory, 'images')
    mask_path = os.path.join(main_directory, 'segmentation')

    # Start from the standard COCO skeleton and fill in the category section
    coco_format = get_coco_json_format()
    coco_format["categories"] = create_category_annotation(category_ids)

    # Fill in the images and annotations sections
    images, annotations, annotation_cnt = images_annotations_info(main_directory)
    coco_format["images"] = images
    coco_format["annotations"] = annotations

    with open(f"{annot_path}/annotations.json", "w") as outfile:
        json.dump(coco_format, outfile)

    logging.info("Created %d annotations for images in folder: %s" % (annotation_cnt, mask_path))

In [7]:
# do for training set
train_dir = '/content/drive/MyDrive/JnJ Synthetic/data_conversion/data/labo_5/final/train'
run_pipeline(train_dir)

2021-09-17 03:28:49 [INFO]: Created 761 annotations for images in folder: /content/drive/MyDrive/JnJ Synthetic/data_conversion/data/labo_5/final/train/segmentation


In [8]:
# do for val set
val_dir = '/content/drive/MyDrive/JnJ Synthetic/data_conversion/data/labo_5/final/val'
run_pipeline(val_dir)

2021-09-17 03:30:13 [INFO]: Created 197 annotations for images in folder: /content/drive/MyDrive/JnJ Synthetic/data_conversion/data/labo_5/final/val/segmentation


# Create test set JSON file

In [None]:
# Label ids of the dataset: maps each category name to its integer category id.
# (Same values as the definition in the train/val section above.)
category_ids = {
    "labo": 0
}

# Define which colors match which categories in the images.
# Keys are colour strings such as "(0, 0, 255)"; values are category ids from category_ids.
# NOTE(review): "multiply value in json by 255" presumably means the colours in the
# exported JSON are 0-1 floats that must be scaled to 0-255 here — confirm.
category_colors = {
    "(0, 0, 255)": 0, # labo
}

# Define the ids that are a multipolygon, e.g. wall, roof and sky.
multipolygon_ids = []

In [None]:
# Get "images" and "annotations" info 
def test_json(main_directory):
  """Collect COCO "images" entries for an unlabelled (test) image folder.

  Each '.png' file directly inside `main_directory` is opened to read its size
  and gets one image entry with a consecutive id starting at 0. No annotations
  are produced.

  Args:
    main_directory: folder containing the test images.

  Returns:
    Tuple `(images, annotations, annotation_id)` where `annotations` is an empty
    list and `annotation_id` is 0.
  """
  images = []

  png_names = (entry for entry in os.listdir(main_directory) if entry.endswith(".png"))
  for image_id, file_name in enumerate(png_names):
    # Open the image and (to be sure) convert it to RGB before reading its size
    picture = Image.open(os.path.join(main_directory, file_name)).convert("RGB")
    width, height = picture.size

    # "images" info; the entry stores the bare file name, not the path
    images.append(create_image_annotation(file_name, width, height, image_id))

  return images, [], 0

In [None]:
def run_test(main_directory):
    """Write a COCO-format `annotations.json` for a test image folder.

    The JSON combines `get_coco_json_format()`, the categories built from
    `category_ids`, and the result of `test_json(main_directory)`. It is saved
    inside `main_directory` itself and the annotation count is logged.

    Args:
        main_directory: folder containing the test images.
    """
    # The JSON goes into the same folder as the images
    annot_path = main_directory

    # Start from the standard COCO skeleton and fill in the category section
    coco_format = get_coco_json_format()
    coco_format["categories"] = create_category_annotation(category_ids)

    # Fill in the images and annotations sections
    images, annotations, annotation_cnt = test_json(main_directory)
    coco_format["images"] = images
    coco_format["annotations"] = annotations

    with open(f"{annot_path}/annotations.json", "w") as outfile:
        json.dump(coco_format, outfile)

    logging.info("Created %d annotations for images in folder" % (annotation_cnt))

In [None]:
# do for test set
test_dir = '/content/drive/MyDrive/JnJ Synthetic/data_conversion/data/labo_5/final/test'
run_test(test_dir)

2021-09-16 05:49:44 [INFO]: Created 0 annotations for images in folder


# Image Augmentation

Run this on the training and validation datasets to add visual variety to the training data.

In [None]:
pip install -U albumentations

Collecting albumentations
  Downloading albumentations-1.0.3-py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 5.0 MB/s 
Collecting opencv-python-headless>=4.1.1
  Downloading opencv_python_headless-4.5.3.56-cp37-cp37m-manylinux2014_x86_64.whl (37.1 MB)
[K     |████████████████████████████████| 37.1 MB 45 kB/s 
Installing collected packages: opencv-python-headless, albumentations
  Attempting uninstall: albumentations
    Found existing installation: albumentations 0.1.12
    Uninstalling albumentations-0.1.12:
      Successfully uninstalled albumentations-0.1.12
Successfully installed albumentations-1.0.3 opencv-python-headless-4.5.3.56


In [None]:
main_folder = '/content/drive/MyDrive/JnJ Synthetic/data_conversion/data/labo_5'

In [None]:
import albumentations as A
import cv2
from matplotlib import pyplot as plt
%matplotlib inline

- OpenCV reads images in BGR.
- matplotlib displays images in RGB.

In [None]:
def _augment_folder(folder, transform):
  """Write one augmented `<name>_aug.png` copy next to every original PNG in `folder`."""
  for name in sorted(os.listdir(folder)):
    # skip non-image files (e.g. annotations.json) and the outputs of an earlier
    # run, so re-running does not produce '_aug_aug' files
    if not name.endswith('.png') or name.endswith('_aug.png'):
      continue

    # read and process one image at a time instead of holding the whole set in memory
    image = cv2.imread(os.path.join(folder, name))
    if image is None:
      # cv2.imread returns None rather than raising when a file cannot be decoded
      logging.warning(f'Could not read image {name}, skipping it.')
      continue

    transformed = transform(image=image)['image']
    # save it with a new name (add '_aug' behind the file name)
    new_name = name[:-len('.png')] + '_aug.png'
    cv2.imwrite(os.path.join(folder, new_name), transformed)


def img_augmentation(main_folder):
  """Add one randomly augmented copy of every train and validation image.

  For each original '.png' in `<main_folder>/final/train/images` and
  `<main_folder>/final/val/images`, an augmented version is saved in the same
  folder as `<name>_aug.png`. Only the RGB images are written here; no `_aug`
  segmentation files are created.

  Images stay in OpenCV's BGR channel order from `cv2.imread` to `cv2.imwrite`.
  NOTE(review): albumentations documents its colour transforms for RGB input —
  confirm that skipping the BGR/RGB conversion is intentional.
  NOTE(review): RandomContrast / RandomBrightness appear to be deprecated in later
  albumentations releases — confirm against the installed version before upgrading.

  Args:
    main_folder: path of the directory that contains the 'final' folder.
  """
  train_path = os.path.join(main_folder, 'final/train/images')
  val_path = os.path.join(main_folder, 'final/val/images')

  # Declare an augmentation pipeline (pixel-level transforms only)
  transform = A.Compose([
      A.HueSaturationValue(p=0.3),
      A.RandomContrast(p=0.3, limit=0.5),
      A.RandomBrightness(p=0.3, limit=0.4),
      A.ToGray(p=0.5),
      A.GaussNoise(var_limit=(50,70), p=0.3),
      A.ISONoise(p=0.3, intensity=(0.3,0.7)),
      A.MotionBlur(p=0.3),
      A.GaussianBlur(p=0.3)
  ])

  # start with the training set
  _augment_folder(train_path, transform)
  logging.info('Completed augmentations for train folder!')

  # do for validation set
  _augment_folder(val_path, transform)
  logging.info('Completed augmentations for val folder!')

In [None]:
img_augmentation(main_folder)

2021-09-16 10:22:50 [INFO]: Completed augmentations for train folder!
2021-09-16 10:23:01 [INFO]: Completed augmentations for val folder!
