In [18]:
from glob import glob
import labelme2coco
import os
import shutil
import json
import numpy as np
from tqdm import tqdm

### Clean data annotations

Move all the annotation files into one folder.

In [None]:
# Snapshot of the files currently in the merged "all" folder and of the
# candidate files sitting in the "22*" folders.
cwd = os.getcwd()
data_folders = glob(cwd + '/../data/birds/annotations/all/*')
data_files = glob(cwd + '/../data/birds/annotations/22*/*')

# Display how many entries each pattern matched.
(len(data_folders), len(data_files))

In [None]:
def CopyAnno(data_files, data_folders, out_dir=None):
    """Copy annotation files that are not yet present in the merged folder.

    A source file is copied only when its base name does not appear among
    the base names of ``data_folders``.

    Parameters
    ----------
    data_files : iterable of str
        Paths of the candidate source files.
    data_folders : iterable of str
        Paths of the files already present in the destination; only their
        base names are used.
    out_dir : str, optional
        Destination directory. Defaults to
        ``os.getcwd() + '/../data/birds/annotations/all'``.

    Returns
    -------
    None
        A one-line summary of the number of copied files is printed.
    """
    if out_dir is None:
        out_dir = os.getcwd() + '/../data/birds/annotations/all'
    # If the directory were missing, shutil.copy would treat out_dir as a
    # file name and every copy would overwrite that single file.
    os.makedirs(out_dir, exist_ok=True)

    # os.path.basename honours the platform's path separators (glob returns
    # backslash-separated components on Windows, which split('/') misses);
    # a set keeps the membership test cheap.
    out_fns = {os.path.basename(f) for f in data_folders}
    n = 0

    for p in tqdm(data_files):
        in_fn = os.path.basename(p)
        if in_fn not in out_fns:
            n += 1
            shutil.copy(p, out_dir)
    if n != 0:
        print('Done copying', n, ' files.')
    else:
        print('No files copied.')

Normalize the `imagePath` of every annotation file (backslashes to forward slashes) so that the labelme2coco functions can be applied.

In [None]:
# Every file currently sitting in the merged annotation folder.
data_folders = glob(os.getcwd() + '/../data/birds/annotations/all/*')

for anno_path in tqdm(data_folders):
    # Load the annotation and switch its image path to forward slashes.
    with open(anno_path, "r") as src:
        anno_json = json.loads(src.read())
    anno_json['imagePath'] = anno_json['imagePath'].replace('\\', '/')

    # Write the updated annotation back over the same file.
    with open(anno_path, "w") as dst:
        dst.write(json.dumps(anno_json))

In [None]:
# import functions
# (save_json is imported here but only used by the export cell further down)
from labelme2coco import get_coco_from_labelme_folder, save_json

# set labelme training data directory
# (the merged folder whose files had their imagePath normalized above)
labelme_folder = os.getcwd() + '/../data/birds/annotations/all'

# convert labelme annotations to coco
# `out` is kept in memory; its `.json` attribute is what gets exported later.
out = get_coco_from_labelme_folder(labelme_folder)

In [None]:
# set path for coco json to be saved
# The file is written into the annotations folder because that is where the
# data cleaning step loads it from (path_in); exporting one level higher
# left that step reading a file this notebook never produced.
export_dir = os.getcwd() + '/../data/birds/annotations/'

# export train coco json
save_json(out.json, export_dir + "coco_annotations.json")

## Data cleaning

In [2]:
# Location of the COCO annotation file to clean.
path_in = os.getcwd() + '/../data/birds/annotations/coco_annotations.json'

# Parse the whole file into a dictionary.
with open(path_in) as handle:
    anno = json.loads(handle.read())


In [4]:
# Top-level sections of the loaded annotation dictionary.
anno.keys()

dict_keys(['images', 'annotations', 'categories'])

In [7]:
# Fields stored for an image record (looking at the first one).
anno['images'][0].keys()

dict_keys(['height', 'width', 'id', 'file_name'])

In [8]:
# Fields stored for an annotation record (looking at the first one).
anno['annotations'][0].keys()

dict_keys(['iscrowd', 'image_id', 'bbox', 'segmentation', 'category_id', 'id', 'area'])

In [79]:
# Category table: maps each category id to its name.
anno['categories']

[{'id': 0, 'name': 'bird', 'supercategory': 'bird'},
 {'id': 1, 'name': 'car', 'supercategory': 'car'},
 {'id': 2, 'name': 'dog', 'supercategory': 'dog'},
 {'id': 3, 'name': 'human', 'supercategory': 'human'}]

Remove the annotations of categories other than bird ('car', 'dog', 'human'), then drop the images that are left without any bird annotation.

    (1) find the images containing these annotations
    (2) check whether birds are also present on these images
    (3) remove the non-bird annotations

In [30]:
# Walk the annotations once, collecting in parallel the category id and
# the image id of each record.
anno_id, img_id = [], []
for entry in anno['annotations']:
    anno_id.append(entry['category_id'])
    img_id.append(entry['image_id'])

In [53]:
# Flag every annotation whose category id is not 0 (the 'bird' category).
cond = [a != 0 for a in anno_id]
print('There are ', np.count_nonzero(cond), 'annotations not bird.')

# Count the flagged annotations per category id and label each count with
# the name from the category table, so that the summary stays correct when
# a category is absent or the table changes.
cat, count = np.unique(np.array(anno_id)[cond], return_counts=True)
cat_names = {c['id']: c['name'] for c in anno['categories']}
summary = ', '.join(
    '{} "{}"'.format(n, cat_names.get(int(c), c)) for c, n in zip(cat, count)
)
if summary:
    print('Other annotations of', summary + '.')

# Image ids carrying the flagged annotations.
img_id_other = np.array(img_id)[cond]
print('Image IDs ', img_id_other)

There are  6 annotations not bird.
Other annotations of 3 "car", 1 "dog", 2 "human".
Image IDs  [ 295  296  778 1215 1216 1525]


In [71]:
## Remove Annotations other than birds
anno_list = [a for a in anno['annotations'] if a['category_id'] == 0]

## Remove images left without any bird annotation
img_ids = np.unique([a['image_id'] for a in anno_list])
# Membership is tested against a set: `x in ndarray` rescans the whole
# array for every image record.
keep_ids = set(img_ids.tolist())
img_list = [a for a in anno['images'] if a['id'] in keep_ids]

In [73]:
# Number of image records before and after the filtering.
len(anno['images']), len(img_list)

(2194, 2186)

In [78]:
# Distinct image ids referenced by the kept annotations vs. by all annotations.
len(np.unique(img_ids)), len(np.unique(img_id))

(2186, 2191)

In [82]:
# First entry of the category table.
anno['categories'][0]

{'id': 0, 'name': 'bird', 'supercategory': 'bird'}

In [88]:
# Assemble the cleaned dataset: the kept images, the kept annotations and
# a category table reduced to a single entry.
new_anno = {
    'images': img_list,
    'annotations': anno_list,
    'categories': [{'id': 0, 'name': 'bird'}],
}

In [89]:
# Serialize the cleaned annotations and overwrite the file they were
# originally loaded from.
with open(path_in, "w") as fh:
    fh.write(json.dumps(new_anno))