In [11]:
from pycocotools.coco import COCO
import re
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
import random
import json
import os
import shutil
from pathlib import Path
import os.path
import subprocess
import os
import glob
import sys
pylab.rcParams['figure.figsize'] = (8.0, 10.0)

def coco_to_yolo_bb(x1, y1, w, h, image_w, image_h):
    """Convert an (x1, y1, w, h) box into a normalized, center-based box.

    The center is computed as x1 + w/2 and y1 + h/2, so (x1, y1) is the
    minimum-coordinate corner of the box. Every returned value is divided by
    the matching image dimension.

    Returns:
        [x_center, y_center, width, height], each relative to the image size.
    """
    x_center = (2*x1 + w)/(2*image_w)
    y_center = (2*y1 + h)/(2*image_h)
    rel_width = w/image_w
    rel_height = h/image_h
    return [x_center, y_center, rel_width, rel_height]

def mkdir(dir):
    """Create `dir` (including missing parents) unless the path already exists."""
    if os.path.exists(dir):
        # Anything already at this path is left untouched.
        return
    os.makedirs(dir)

def rmdir(dataset_dir):
    """Recursively delete `dataset_dir`; do nothing when the path does not exist."""
    if not os.path.exists(dataset_dir):
        return
    shutil.rmtree(dataset_dir)

def get_cat_data(coco):
    """Return the category ids and a map from category id to a 0-based index.

    The index is the position of each category in the list returned by
    `coco.loadCats(coco.getCatIds())`.

    Returns:
        (catIds, cat_map): the ids from `coco.getCatIds()` and a dict
        {category id: position}.
    """
    categories = coco.loadCats(coco.getCatIds())
    cat_map = {category['id']: position for position, category in enumerate(categories)}
    return coco.getCatIds(), cat_map

def get_random_img_ids(numImgs,coco):
    """Return up to `numImgs` randomly chosen image ids that are usable for labels.

    An id is kept only if it has at least one annotation
    (`img_id_has_annotation`) and none of its annotations contains a negative
    bbox value (`img_id_has_positive_bboxes`).

    The ids are shuffled first and then filtered lazily, so the annotation
    lookups stop as soon as `numImgs` usable images have been found instead of
    scanning the whole dataset.

    Args:
        numImgs: maximum number of ids to return. `None` or a negative value
            keeps plain list-slice semantics (`ids[:numImgs]`), which requires
            filtering every image.
        coco: object providing `getImgIds`, `getAnnIds` and `loadAnns`.

    Returns:
        A list of image ids in shuffled order.
    """
    import itertools

    imgIds = coco.getImgIds()
    random.shuffle(imgIds)
    usable = (
        imgId for imgId in imgIds
        if img_id_has_annotation(imgId, coco) and img_id_has_positive_bboxes(imgId, coco)
    )
    #add a filter for labels outside frame
    #have a discussion about these images probably
    if numImgs is None or numImgs < 0:
        # islice rejects negative stops; fall back to slicing the full list.
        return list(usable)[:numImgs]
    return list(itertools.islice(usable, numImgs))

def img_id_has_positive_bboxes(imgId,coco):
    """Return True when no annotation of `imgId` has a negative bbox value.

    An image without annotations also yields True.
    """
    annotations = coco.loadAnns(coco.getAnnIds(imgIds=imgId))
    return not any(
        value < 0
        for annotation in annotations
        for value in annotation['bbox']
    )

def img_id_has_annotation(imgId,coco):
    """Return True when `coco.getAnnIds` reports at least one annotation id for `imgId`."""
    return len(coco.getAnnIds(imgIds=imgId)) > 0

def write_label_files(numImgs, cat_map, catIds, label_dir1, coco):
    """Write one label text file per randomly selected image.

    Images are chosen by `get_random_img_ids`, which skips images that have no
    annotations or that have negative bbox values.

    Each file is named after the part of the image's `file_name` before the
    first '.', with a '.txt' suffix, and holds one line per annotation: the
    mapped category index followed by the four values returned by
    `coco_to_yolo_bb`.

    Args:
        numImgs: number of images requested from `get_random_img_ids`.
        cat_map: dict from annotation `category_id` to the index written out.
        catIds: category ids used to select the annotations of each image.
        label_dir1: directory the label files are written into.
        coco: object providing `loadImgs`, `getAnnIds` and `loadAnns`.
    """
    out_dir = Path(label_dir1)

    for imgId in get_random_img_ids(numImgs,coco):
        # loadImgs returns a list; a single id gives a single entry.
        img = coco.loadImgs(imgId)[0]
        label_name = img['file_name'].split('.')[0] + '.txt'
        with open(out_dir/label_name, 'w') as file:
            selected_ids = coco.getAnnIds(imgIds = imgId, catIds=catIds)
            for ann in coco.loadAnns(selected_ids):
                x_center, y_center, width, height = coco_to_yolo_bb(*ann['bbox'], img['width'], img['height'])
                file.write(f'{cat_map[ann["category_id"]]} {x_center} {y_center} {width} {height}\n')

def copy_images_to_dataset_dir(src_image_dir, dest_image_dir, image_extension, src_label_dir):
    """Copy the image that matches each label file into the dataset image dir.

    For every entry in `src_label_dir` (except '.DS_Store'), the text before
    the first '.' plus `image_extension` is looked up in `src_image_dir` and
    copied to `dest_image_dir`. A missing source image is reported on stdout
    and skipped.

    Args:
        src_image_dir: directory holding the source images; must support the
            `/` operator (e.g. a `pathlib.Path`).
        dest_image_dir: destination directory; must support the `/` operator.
        image_extension: image suffix including the dot, e.g. '.png'.
        src_label_dir: directory whose entries determine which images to copy.
    """
    for label_name in os.listdir(src_label_dir):
        if label_name == '.DS_Store':
            continue
        stem = label_name.split('.')[0]
        src = str(src_image_dir/stem) + image_extension
        dest = str(dest_image_dir/stem) + image_extension
        if not os.path.isfile(src):
            print(f'{src} is not a file')
            continue
        shutil.copyfile(src, dest)

def dataset_helper(dataset_name):
    """Resolve a dataset name into source/destination paths and reset its folder.

    `dataset_name` has the form '<source><count><split>', e.g. 'n2500t':
      * source: 'c' (coco, '.jpg' images) or 'n' (nightowls, '.png' images)
      * count:  number of images to sample
      * split:  't' (training) or 'v' (validation)

    The name is validated before anything on disk is touched. For a supported
    name, '/usr/src/datasets/<dataset_name>' is removed and recreated with
    empty 'base/labels' and 'base/images' sub-directories.

    Returns:
        (src_img_dir, label_dir, img_dir, annotations_json_file, num_imgs,
        file_extension), or None for the coco training split, which is
        reported as unavailable.

    Raises:
        ValueError: if the name does not split into the three parts above or
            names an unknown source/split combination.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    #i.e. [n, 2500, t]
    parts = re.split(r'(\d+)', dataset_name)
    if len(parts) != 3:
        raise ValueError(f'Cannot parse dataset name {dataset_name!r}')
    name, num_imgs, split = parts

    if name == 'c' and split == 't':
        print('coco train set not download')
        return None

    # (source, split) -> (image extension, image directory, annotation file)
    sources = {
        ('c', 'v'): (
            '.jpg',
            '/usr/src/raw_datasets/coco/val2017',
            '/usr/src/raw_datasets/coco/annotations/instances_val2017.json',
        ),
        ('n', 't'): (
            '.png',
            '/usr/src/raw_datasets/nightowls/nightowls_training',
            '/usr/src/raw_datasets/nightowls/nightowls_training.json',
        ),
        ('n', 'v'): (
            '.png',
            '/usr/src/raw_datasets/nightowls/nightowls_validation',
            '/usr/src/raw_datasets/nightowls/nightowls_validation.json',
        ),
    }
    try:
        file_extension, src_img_dir, annotations_json_file = sources[(name, split)]
    except KeyError:
        # Fail before the existing dataset directory is deleted below.
        raise ValueError(f'Unsupported dataset name {dataset_name!r}') from None

    dataset_dir = f'/usr/src/datasets/{dataset_name}'
    base_dir = f'{dataset_dir}/base'
    label_dir =  f'{base_dir}/labels'
    img_dir = f'{base_dir}/images'

    rmdir(dataset_dir)
    print(f'{dataset_dir} removed')

    for directory in (dataset_dir, base_dir, label_dir, img_dir):
        mkdir(directory)
    print(f'{dataset_dir} created')
    print(f'{base_dir} created')
    print(f'{img_dir} created')
    print(f'{label_dir} created')

    return src_img_dir, label_dir, img_dir, annotations_json_file, int(num_imgs), file_extension


def produce_dataset(dataset_name):
    """Build the dataset folder for `dataset_name`.

    Steps: resolve paths with `dataset_helper`, load the annotation file with
    `COCO`, write the label files, copy the matching images, then print the
    number of non-hidden entries in the image and label directories.

    Does nothing further when `dataset_helper` returns None (it does so for
    the coco training split after printing a message).
    """
    config = dataset_helper(dataset_name)
    if config is None:
        # Unpacking None would raise TypeError; the helper already reported why.
        return
    src_img_dir, label_dir, img_dir, annotations_json_file, num_imgs, file_extension = config

    print(f'Using {annotations_json_file}')
    coco=COCO(annotations_json_file)

    catIds, cat_map = get_cat_data(coco)
    write_label_files(num_imgs, cat_map, catIds, label_dir,coco)
    copy_images_to_dataset_dir(Path(src_img_dir),Path(img_dir), file_extension, label_dir) #dataset/images

    # Same counts as `ls <dir> | wc -l` (hidden entries excluded), without a shell.
    for directory in (img_dir, label_dir):
        print(sum(1 for entry in os.listdir(directory) if not entry.startswith('.')))
    return

if __name__ == "__main__":

    # Dataset names are parsed by dataset_helper as '<source><count><split>':
    # 'n100t' requests 100 images from the nightowls training set.
    #produce_nightowls_training_set(100) #max = 130064
    produce_dataset('n100t') #max = 130064

    #coco=COCO('/usr/src/raw_datasets/nightowls/nightowls_validation.json')
    #produce_nightowls_validation_set(100)
#
#    annotations_json_file = '/Users/azakaria/Documents/yolo datasets/coco/instances_train2017.json'
#    coco=COCO(annotations_json_file)
#    produce_coco_training_set(480)
#
#    annotations_json_file = '/usr/src/raw_datasets/coco/annotations/instances_val2017.json'
#    coco=COCO(annotations_json_file)
#    produce_coco_validation_set(5000)

/usr/src/datasets/n100t removed
/usr/src/datasets/n100t created
/usr/src/datasets/n100t/base created
/usr/src/datasets/n100t/base/images created
/usr/src/datasets/n100t/base/labels created
Using /usr/src/raw_datasets/nightowls/nightowls_training.json
loading annotations into memory...
Done (t=0.64s)
creating index...
index created!
100
100


In [10]:
# NOTE(review): get_cat_data is defined as get_cat_data(coco); calling it with
# no argument raises the TypeError recorded in this cell's output. A loaded
# COCO instance has to be passed in.
catIds, cat_map = get_cat_data()
#print(catIds)
#print(cat_map)

def confirm_files_origin(dir0, dir1):
    """Check that every file name in `dir1` also exists in `dir0`.

    `dir0` is the source directory and `dir1` the derived one. Only names are
    compared, not file contents. The first name missing from `dir0` is
    reported on stdout and the check stops; otherwise a confirmation is
    printed.

    Args:
        dir0: directory the files are expected to originate from.
        dir1: directory whose entries are checked.
    """
    dir1_images = os.listdir(dir1)
    # Set lookup keeps the check linear for large source directories.
    dir0_images = set(os.listdir(dir0))
    for f in dir1_images:
        if f not in dir0_images:
            # Report the directory path, not its whole listing.
            print(f'{f} not in {dir0}')
            return
    print("All image files originate from the correct source dataset")
    print("Note: label files are derived from annotations.json")
    return

#confirm_files_origin('/usr/src/raw_datasets/nightowls/nightowls_validation','/usr/src/datasets/source/n100v/images')

#def test_get_random_img_ids():
    #for v in get_random_img_ids(100):


TypeError: get_cat_data() missing 1 required positional argument: 'coco'