## Data preperation for SAI dataset

This file consists of script that 
 1. convert SAI dataset into Detectron2 compatible format
 2. shuffle dataset into train, val and test.

Note that dataset must organised in the standard data structure illustrated in the README.md.

### Helper functions

In [3]:
import os, json, shutil


def split_dataset(label_dicts: list, split_ratio: list) -> dict:
    r"""
    split dataset into three subsets: train, val and test.

    Args:
     - label_dicts (list): is the label dictionary. User must shuffle before split.
     - split_ratio (list): this is a list consisting of the ratio between train, val, test.
       e.g. split_ratio = [8,1,1] denotes train:val:test = 8:1:1
    """

    # split dataset
    train_num = int(len(label_dicts) * (split_ratio[0] / sum(split_ratio)))
    val_num = int(len(label_dicts) * (split_ratio[1] / sum(split_ratio)))
    test_num = int(len(label_dicts) * (split_ratio[2] / sum(split_ratio)))
    print(f"set [train:val:test] to [{train_num}:{val_num}:{test_num}]")

    # initialise variables
    label_cat = {}
    label_cat["train"] = label_dicts[:train_num]
    # When there is test set
    if split_ratio[2] != 0:
        label_cat["val"] = label_dicts[train_num : train_num + val_num]
        label_cat["test"] = label_dicts[train_num + val_num :]
    # Test set may consit of data cuz we use int(.) to convert length*ratio to index
    # code below guarantees no data in test set.
    else:
        label_cat["val"] = label_dicts[train_num:]
        label_cat["test"] = []

    return label_cat


def save_ext_dataset(dataset_name: str, data_path: str, label_cat: dict):
    r"""
    save dataset and label to its correponding folders.

    Args:
     - dataset_name (str): name of the dataset.
     - dataset_path (str): path of the dataset.
     - label_cat (dict): a dictionary consisting of the labels of train, val and test set. Get this with function split_dataset()
    """

    # iterate through different subset.
    for d in ["train", "val", "test"]:

        # INITIALISATION for {d}
        # set up output dir for {d}
        sub_img_dir = os.path.join(data_path, dataset_name, "images", d)
        shutil.rmtree(sub_img_dir, ignore_errors=True)
        os.makedirs(sub_img_dir, exist_ok=True)

        # set up label file for {d}
        sub_label_filenames = os.path.join(
            data_path, dataset_name, "labels", f"labels_{d}.json"
        )
        open(sub_label_filenames, "w").close()  # reset file

        # CREATE CORRESPONDING SET for {d}
        # construct label file
        with open(sub_label_filenames, "a") as f:
            f.write("[")

        # extract training and validation imgs and labels from /images and labels.json
        for idx, v in enumerate(label_cat[d]):

            # copy training set from images to /train
            orig = os.path.join(
                data_path,
                dataset_name,
                "images",
                "all",
                os.path.basename(v["file_name"]),
            )
            dst = os.path.join(sub_img_dir, os.path.basename(v["file_name"]))
            shutil.copyfile(orig, dst)

            # save dict to label_train.json
            with open(sub_label_filenames, "a") as f:
                json.dump(v, f)
                if idx < len(label_cat[d]) - 1:  # do not add , at the end of the
                    f.write(",")

        with open(sub_label_filenames, "a") as f:
            f.write("]")
            f.close()

### Convert, shuffle and split
This script does:
1. convert COCO labeling format to Detectron2 compatible
2. shuffle and split dataset into train, val, and test sets

In [23]:
import os, json, random

"""_SOURCE FORMAT
Source dataset follow stantard coco format. It consists
    "images": (a list of image object below)
        "file_name": (string)
        "width": (int)
        "height": (int)
        "id": (this is used to link with the annotations)
    "annotations": (list of annotation object below) 
        "bbox": (a list consisting of four values)
        "area": (int)
        "iscorwd": (default set to 0)
        "category_id": (this is used to link with the categories)
        "keypoints": (a list of key points)
        "segmentations": (a list of segmentation list)
        "num_keypoints": (int)
        "image_id": (this is used to link with the images)
        "id": (this is the id of the annotation)
    "categories": (list of category object below)
        "id": (int, category id)
        "name": (string, name of the category)

See https://www.immersivelimit.com/tutorials/create-coco-annotations-from-scratch/#coco-dataset-format for further instruction.

"""

"""_summary_
TARGET FORMAT
Target format is a list of dictionaries.
Each dictionary consists of the following entry
    "file_name": (string)
    "image_id": (int)
    "height": (int)
    "width": (int)
    "annotations": (list of annotations below)
        "bbox": (a list consisting of four values)
        "bbox_mode": (default set to 0)
        "segmentation": (a list of segmentation list)
        "category_id": (default set to 0)
"""


dataset_dir = "../../../google-drive/stomaVDP"
dataset_name = "2023-SAI-arabidopsis-42"
seed_num = 28825252
split_ratio = [8, 2, 0]  # The ratio stands for train:val:test

img_dir = os.path.join(dataset_dir, dataset_name, "images", "all")
label_input = os.path.join(dataset_dir, dataset_name, "labels", "labels.json")
label_output_dir = os.path.join(dataset_dir, dataset_name, "labels")

print(f"Start parsing data from {label_input}...")

# Load Coco format JSON file
source_annotations = None
with open(label_input) as f:
    source_annotations = json.load(f)

img_num = len(source_annotations["images"])
print(f"Number of images loaded: {img_num}")

# init label list
label_dicts = []

print("processing......")
for source_img in source_annotations["images"]:
    # init image dict
    target_img = {}

    # set basic info
    target_img["file_name"] = source_img["file_name"]
    target_img["image_id"] = source_img["id"]
    target_img["height"] = source_img["height"]
    target_img["width"] = source_img["width"]

    # init annotation list
    target_img["annotations"] = []

    for source_anno in source_annotations["annotations"]:
        # init annotation dict
        target_anno = {}

        if (
            source_img["id"] == source_anno["image_id"]
        ):  # the annotation belongs to the image
            # set annotation info
            target_anno["bbox"] = source_anno["bbox"]
            target_anno["bbox_mode"] = 0  # this value is by default
            target_anno["category_id"] = (
                0  # this value is by default as we only have one class
            )

            # init segmentation list
            target_anno["segmentation"] = []

            for seg in source_anno["segmentation"]:
                # set segmentation info
                target_anno["segmentation"].append(seg)

            # append annotation to img dict
            target_img["annotations"].append(target_anno)

    print(len(target_img["annotations"]))
    # append img to label list
    label_dicts.append(target_img)

# Shuffle, split and save datas and labels
random.seed(seed_num)
random.shuffle(label_dicts)
label_cat = split_dataset(label_dicts, split_ratio)

# Save to files
save_ext_dataset(dataset_name, dataset_dir, label_cat)
print(f"processed data is saved to {os.path.join(dataset_dir, dataset_name)}")

Start parsing data from ../2023-SAI-arabidopsis-42/labels/labels.json...
Number of images loaded: 42
processing......
4
4
7
4
8
6
4
13
4
7
8
3
10
4
6
12
4
7
6
5
3
9
5
8
10
5
8
7
9
5
6
10
9
8
4
6
9
6
10
6
10
11
set [train:val:test] to [33:8:0]
processed data is saved to ../2023-SAI-arabidopsis-42
