In [184]:
from pathlib import Path
from yamlu import ls
from yamlu.coco_read import CocoReader

In [185]:
import os
import json
import collections
import shutil
import random

# Creating the Yolo Dataset from the COCO Datasets

The `handwritten-diagram-datasets/datasets/` directory contains several datasets in COCO format. We need to convert these to YOLO format.

### Step 1: Set up destination directories/source directories

In [186]:
# Create the staging directories for the converted dataset.
# exist_ok=True keeps this cell re-runnable: plain os.mkdir raises
# FileExistsError as soon as the directories are left over from an earlier run.
os.makedirs("yolo_dataset/images", exist_ok=True)
os.makedirs("yolo_dataset/labels", exist_ok=True)

In [187]:
# Source datasets (COCO format), keyed by their short name.
data_dirs = dict(
    fa="handwritten-diagram-datasets/datasets/fa",  # finite automata - FINAL STATES ARE NOT LABELED CORRECTLY FOR OUR PURPOSES: SKIP
    fca="handwritten-diagram-datasets/datasets/fca",  # online flowchart: INCLUDE
    fcb="handwritten-diagram-datasets/datasets/fcb",  # offline flowchart: INCLUDE
    fcb_scan="handwritten-diagram-datasets/datasets/fcb_scan",  # DUPLICATE OF fcb: SKIP
    didi="handwritten-diagram-datasets/datasets/didi",
)

### Step 2: Export the COCO labels from the json files to YOLO labels in new directory

In [188]:
# Class-name lists. The position of a name inside a list is the class index
# that ends up in the YOLO label files.
labels_dict = dict(
    final=['circle', 'rectangle', 'parallelogram', 'diamond', 'arrow', 'text'],
    general=['connection', 'data', 'decision', 'process', 'terminator', 'text', 'arrow'],
)

# What each COCO label should become in the YOLO labels.
label_map = dict(
    connection='circle',
    data='parallelogram',
    decision='diamond',
    process='rectangle',
    terminator='circle',
    text='text',
    arrow='arrow',
)

In [189]:
def parse_coco_json_to_yolo_with_cocoreader(coco_dir, dest_dir, labels, label_map=None, train=True, doPrint=False):
    """Write one YOLO label file per image of a COCO split.

    For every image of the chosen split a ``<image stem>.txt`` file is created
    in ``dest_dir``. Each kept annotation becomes one line
    ``<class index> <xcenter> <ycenter> <width> <height>``, where the box
    values are divided by the image width/height.

    Args:
        coco_dir: Dataset directory handed to ``CocoReader``.
        dest_dir: Existing directory that receives the label files.
        labels: Ordered sequence of output class names; the position of a
            name is the class index written to the label file.
        label_map: Optional mapping from COCO category name to a name in
            ``labels``. When falsy, the category name itself is looked up
            in ``labels``.
        train: Parse the "train" split when true, otherwise the "test" split.
        doPrint: When true, echo every written row to stdout.

    Annotations whose (mapped) category is not in ``labels`` are skipped with
    a printed warning. A label file is still created for images that end up
    with no rows.
    """
    # Name -> class index. setdefault keeps the first occurrence, which is
    # what labels.index() would return.
    class_index = {}
    for idx, name in enumerate(labels):
        class_index.setdefault(name, idx)

    scan_reader = CocoReader(Path(coco_dir))
    ann_imgs = scan_reader.parse_split("train" if train else "test")

    for image in ann_imgs:
        img_name = image.filename
        img_width, img_height = image.size
        # splitext drops the extension whatever its length (a fixed [:-4]
        # slice is only right for three-letter extensions).
        label_path = os.path.join(dest_dir, os.path.splitext(img_name)[0] + '.txt')
        with open(label_path, 'w') as f:
            for annotation in image.annotations:
                category = annotation.category
                # .get() so a category missing from label_map is reported and
                # skipped below instead of raising KeyError.
                target = label_map.get(category) if label_map else category
                if target not in class_index:
                    print(f"WARNING: There is an annotation not in the label set/map with label {category}.")
                    continue

                # Box edges as exposed by the annotation: left, top, right, bottom.
                xmin = annotation.bb.l
                ymin = annotation.bb.t
                xmax = annotation.bb.r
                ymax = annotation.bb.b

                # Box centre and size, normalised by the image dimensions.
                xcenter = ((xmin + xmax) / 2) / img_width
                ycenter = ((ymin + ymax) / 2) / img_height
                width = (xmax - xmin) / img_width
                height = (ymax - ymin) / img_height

                cls = class_index[target]
                f.write(f"{cls} {xcenter} {ycenter} {width} {height}\n")
                if doPrint:
                    print(img_name, cls, xcenter, ycenter, width, height)


In [190]:
# FCB train labels -> YOLO label files (remapped to the final class list)
parse_coco_json_to_yolo_with_cocoreader(
    coco_dir=data_dirs['fcb'],
    dest_dir="yolo_dataset/labels",
    labels=labels_dict['final'],
    label_map=label_map,
)

100%|███████████████████████████████████████████████████████████████████████████████| 280/280 [00:02<00:00, 102.65it/s]


In [191]:
# FCB test labels -> YOLO label files (remapped to the final class list)
parse_coco_json_to_yolo_with_cocoreader(
    coco_dir=data_dirs['fcb'],
    dest_dir="yolo_dataset/labels",
    labels=labels_dict['final'],
    label_map=label_map,
    train=False,
)

100%|████████████████████████████████████████████████████████████████████████████████| 196/196 [00:02<00:00, 94.23it/s]


In [192]:
# FCA train labels -> YOLO label files (remapped to the final class list)
parse_coco_json_to_yolo_with_cocoreader(
    coco_dir=data_dirs['fca'],
    dest_dir="yolo_dataset/labels",
    labels=labels_dict['final'],
    label_map=label_map,
)

100%|███████████████████████████████████████████████████████████████████████████████| 248/248 [00:02<00:00, 108.06it/s]


In [193]:
# FCA test labels -> YOLO label files (remapped to the final class list)
parse_coco_json_to_yolo_with_cocoreader(
    coco_dir=data_dirs['fca'],
    dest_dir="yolo_dataset/labels",
    labels=labels_dict['final'],
    label_map=label_map,
    train=False,
)

100%|███████████████████████████████████████████████████████████████████████████████| 171/171 [00:01<00:00, 105.77it/s]


### Step 3: Copy/move the images from the handwritten datasets into the YOLO dataset directory

In [194]:
def move_all_imgs(src_dir, dest_dir, keep=4):
    """Transfer every ``.jpg`` / ``.png`` file from ``src_dir`` into ``dest_dir``.

    The first ``keep`` images (in sorted file-name order) are copied, so they
    also remain in ``src_dir``; every remaining image is moved out of
    ``src_dir``.

    Args:
        src_dir: Directory to scan (not recursive).
        dest_dir: Existing directory that receives the images.
        keep: Number of images that are copied instead of moved.
    """
    image_suffixes = (".jpg", ".png")
    # Match on the file extension: a substring test such as `"png" in name`
    # would also pick up files like "notes_png.txt".
    # Sorting makes the set of kept files deterministic, since os.listdir
    # returns entries in arbitrary order.
    images = sorted(
        name for name in os.listdir(src_dir)
        if name.lower().endswith(image_suffixes)
    )
    for position, img in enumerate(images):
        transfer = shutil.copy if position < keep else shutil.move
        transfer(os.path.join(src_dir, img), os.path.join(dest_dir, img))

In [195]:
# FCB train images -> yolo_dataset/images
move_all_imgs(
    src_dir=os.path.join(data_dirs["fcb"], "train"),
    dest_dir="yolo_dataset/images",
)

In [196]:
# FCB test images -> yolo_dataset/images
move_all_imgs(
    src_dir=os.path.join(data_dirs["fcb"], "test"),
    dest_dir="yolo_dataset/images",
)

In [197]:
# FCA train images -> yolo_dataset/images
move_all_imgs(
    src_dir=os.path.join(data_dirs["fca"], "train"),
    dest_dir="yolo_dataset/images",
)

In [198]:
# FCA test images -> yolo_dataset/images
move_all_imgs(
    src_dir=os.path.join(data_dirs["fca"], "test"),
    dest_dir="yolo_dataset/images",
)

### Step 4: Inspect the COCO categories and create the labels.txt file

In [199]:
def get_labels_txt_file(json_file):
    """Return the category names of a COCO annotation file, in file order.

    Args:
        json_file: Path to a JSON file with a top-level "categories" list
            whose entries each carry a "name" key.

    Returns:
        A list with the "name" value of every category entry.
    """
    with open(json_file, 'r') as handle:
        coco = json.load(handle)

    return [category['name'] for category in coco["categories"]]

In [200]:
def write_labels_txt_file(labels, dest_file):
    """Write the class names to ``dest_file``, one per line.

    Args:
        labels: Iterable of class-name strings, written in iteration order.
        dest_file: Path of the text file to create (overwritten if present).
    """
    with open(dest_file, 'w') as out:
        out.writelines(name + '\n' for name in labels)

In [201]:
# Category names declared by the FCA train split
get_labels_txt_file(f"{data_dirs['fca']}/train.json")

['connection', 'data', 'decision', 'process', 'terminator', 'text', 'arrow']

In [202]:
# Category names declared by the FCB train split
get_labels_txt_file(f"{data_dirs['fcb']}/train.json")

['connection', 'data', 'decision', 'process', 'terminator', 'text', 'arrow']

In [203]:
# labels.txt lists the final (remapped) class names, in class-index order
write_labels_txt_file(
    labels=labels_dict['final'],
    dest_file="yolo_dataset/labels.txt",
)

### Step 5: Random Train/Test Split

In [204]:
# Create yolo_dataset/{train,test}/{images,labels}.
# os.makedirs with exist_ok=True keeps the cell re-runnable; plain os.mkdir
# raises FileExistsError when the directories already exist.
for subset in ("train", "test"):
    for kind in ("images", "labels"):
        os.makedirs(os.path.join("yolo_dataset", subset, kind), exist_ok=True)

In [205]:
p = 0.1 # proportion of images to be put in test set

# A dedicated, seeded generator plus a sorted listing make the split
# reproducible: os.listdir order is arbitrary and the module-level random
# state differs between kernel sessions.
split_rng = random.Random(42)

img_list = sorted(os.listdir("yolo_dataset/images"))
for img in img_list:
    # Each image goes to the test set with probability p, otherwise to train.
    subset = "test" if split_rng.uniform(0, 1) <= p else "train"
    # The label file shares the image's stem; splitext strips the extension
    # regardless of its length.
    label_file = os.path.splitext(img)[0] + '.txt'
    shutil.move(os.path.join("yolo_dataset/images", img), os.path.join(f"yolo_dataset/{subset}/images", img))
    shutil.move(os.path.join("yolo_dataset/labels", label_file), os.path.join(f"yolo_dataset/{subset}/labels", label_file))

In [206]:
# Remove the staging directories. os.rmdir only removes empty directories,
# so this fails if any image or label file was not moved into a split.
for staging_dir in ("yolo_dataset/images", "yolo_dataset/labels"):
    os.rmdir(staging_dir)

# ...We now have a yolo_dataset directory with the correct files from the COCO datasets