In [193]:
import json
import numpy as np
from pycocotools.coco import COCO
import os
import os.path as osp
import copy

In [194]:
# Locations of every annotation file this notebook reads.
data_dir = '../data'
train_path = osp.join(data_dir, 'train.json')
train_all_path = osp.join(data_dir, 'train_all.json')
org_test_path = osp.join(data_dir, 'test_original.json')
new_test_path = osp.join(data_dir, 'test.json')
leak_path = osp.join(data_dir, 'batch_03', 'data.json')


def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object.

    The file is decoded explicitly as UTF-8 (the encoding the save cell writes
    with) instead of the platform default, which is not UTF-8 on every system.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Raw dictionaries, one per annotation file.
train_json = _read_json(train_path)          # train.json
train_all_json = _read_json(train_all_path)  # train_all.json
org_test_json = _read_json(org_test_path)    # test_original.json (original)
new_test_json = _read_json(new_test_path)    # test.json (new)
data_json = _read_json(leak_path)            # data.json (leakage)

# pycocotools wrappers around the same files; coco_leak is queried later for
# the annotations that belong to each leaked image.
coco_org = COCO(org_test_path) # test_original.json
coco_new = COCO(new_test_path) # test.json
coco_leak = COCO(leak_path) # data.json

# Independent copies that receive the extra records, so the originals stay
# available for the before/after comparison.
new_train_json = copy.deepcopy(train_json)
new_train_all_json = copy.deepcopy(train_all_json)

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.51s)
creating index...
index created!


In [195]:
# File names listed by the original test split and by the new one.
org_img_fpaths = [entry['file_name'] for entry in org_test_json['images']]
new_img_fpaths = [entry['file_name'] for entry in new_test_json['images']]

# Anything present in the original split but absent from the new one is
# treated as leaked; sorted so the order is deterministic.
leak_img_fpaths = sorted(set(org_img_fpaths).difference(new_img_fpaths))

for _template, _fpaths in (
    ("[Original number of images]: {}", org_img_fpaths),
    ("[New number of images]: {}", new_img_fpaths),
    ("[Number of leakages]: {}", leak_img_fpaths),
):
    print(_template.format(len(_fpaths)))

[Original number of images]: 819
[New number of images]: 624
[Number of leakages]: 195


In [196]:
# [1] Check whether each leaked image actually has segmentation annotations.
#     [1.1] From data.json, pick the image records whose file_name is in
#           leak_img_fpaths and collect their image ids (e.g. [0, 10, 334, ...]).
#     [1.2] For every image that has at least one usable annotation, copy its
#           'images' and 'annotations' records into the structures built below.
# [2] Merging into train.json / train_all.json happens in a later cell.


def _next_free_id(records):
    """Return one more than the largest 'id' found in `records`.

    The maximum is used instead of the last element's id so that the result is
    still collision-free when the records are not sorted by id. An empty
    sequence yields 0.
    """
    return max((record['id'] for record in records), default=-1) + 1


# A set gives constant-time membership tests instead of scanning the list
# once per image in data.json.
leak_fpath_set = set(leak_img_fpaths)
leak_img_infos = [img_info for img_info in data_json['images'] if img_info['file_name'] in leak_fpath_set]
leak_img_ids = [img_info['id'] for img_info in leak_img_infos]

# DS for Extra data
new_train_img_infos = []
new_train_ann_infos = []
new_train_all_img_infos = []
new_train_all_ann_infos = []

# First unused image id / annotation id in train.json and train_all.json
train_json_img_next_id = _next_free_id(train_json['images'])
train_json_ann_next_id = _next_free_id(train_json['annotations'])
train_all_json_img_next_id = _next_free_id(train_all_json['images'])
train_all_json_ann_next_id = _next_free_id(train_all_json['annotations'])

for img_id, img_info in zip(leak_img_ids, leak_img_infos):
    ann_ids = coco_leak.getAnnIds(imgIds=img_id)
    ann_infos = coco_leak.loadAnns(ann_ids)

    # Keep only annotations with a non-zero category and a non-empty segmentation.
    usable_ann_infos = [
        ann_info for ann_info in ann_infos
        if ann_info['category_id'] != 0 and len(ann_info['segmentation']) > 0
    ]
    if not usable_ann_infos:
        # Step [1.2]: an image without any usable annotation is not added, and
        # no id is consumed for it, so the new ids stay contiguous.
        continue

    for ann_info in usable_ann_infos:
        # Annotations: one re-indexed copy per target file, because the two
        # files use independent id sequences.
        copy1 = copy.deepcopy(ann_info)
        copy2 = copy.deepcopy(ann_info)

        copy1.update({
            'id': train_json_ann_next_id,
            'image_id': train_json_img_next_id
        })

        copy2.update({
            'id': train_all_json_ann_next_id,
            'image_id': train_all_json_img_next_id
        })

        new_train_ann_infos.append(copy1)
        new_train_all_ann_infos.append(copy2)

        train_json_ann_next_id += 1
        train_all_json_ann_next_id += 1

    # Images: same idea, the record is copied and given the id that the
    # annotations above already point at.
    img_copy1 = copy.deepcopy(img_info)
    img_copy2 = copy.deepcopy(img_info)

    img_copy1.update({'id':train_json_img_next_id})
    img_copy2.update({'id':train_all_json_img_next_id})

    new_train_img_infos.append(img_copy1)
    new_train_all_img_infos.append(img_copy2)

    train_json_img_next_id += 1
    train_all_json_img_next_id += 1

In [197]:
# How many extra image / annotation records were collected for each target file.
for _label, _records in (
    ("[Number of new images for train.json]:", new_train_img_infos),
    ("[Number of new annotations for train.json]:", new_train_ann_infos),
    ("[Number of new images for train_all.json]:", new_train_all_img_infos),
    ("[Number of new annotations for train_all.json]:", new_train_all_ann_infos),
):
    print(_label, len(_records))

[Number of new images for train.json]: 195
[Number of new annotations for train.json]: 691
[Number of new images for train_all.json]: 195
[Number of new annotations for train_all.json]: 691


In [198]:
# Start again from fresh copies of the originals so that re-running this cell
# does not append the extra records a second time.
new_train_json = copy.deepcopy(train_json)
new_train_json['images'].extend(new_train_img_infos)
new_train_json['annotations'].extend(new_train_ann_infos)

new_train_all_json = copy.deepcopy(train_all_json)
new_train_all_json['images'].extend(new_train_all_img_infos)
new_train_all_json['annotations'].extend(new_train_all_ann_infos)

In [199]:
# Before/after summary: extra records, original sizes, and combined sizes,
# each group followed by a separator line.
_separator = "-"*60
_report_groups = (
    (
        ("[Number of bonus images for train.json]:", new_train_img_infos),
        ("[Number of bonus annotations for train.json]:", new_train_ann_infos),
        ("[Number of bonus images for train_all.json]:", new_train_all_img_infos),
        ("[Number of bonus annotations for train_all.json]:", new_train_all_ann_infos),
    ),
    (
        ("[Number of original images for train.json]:", train_json['images']),
        ("[Number of original annotations for train.json]:", train_json['annotations']),
        ("[Number of original images for train_all.json]:", train_all_json['images']),
        ("[Number of original annotations for train_all.json]:", train_all_json['annotations']),
    ),
    (
        ("[Number of combined images for train.json]:", new_train_json['images']),
        ("[Number of combined annotations for train.json]:", new_train_json['annotations']),
        ("[Number of combined images for train_all.json]:", new_train_all_json['images']),
        ("[Number of combined annotations for train_all.json]:", new_train_all_json['annotations']),
    ),
)

print(_separator)
for _group in _report_groups:
    for _label, _records in _group:
        print(_label, len(_records))
    print(_separator)

------------------------------------------------------------
[Number of bonus images for train.json]: 195
[Number of bonus annotations for train.json]: 691
[Number of bonus images for train_all.json]: 195
[Number of bonus annotations for train_all.json]: 691
------------------------------------------------------------
[Number of original images for train.json]: 2617
[Number of original annotations for train.json]: 20988
[Number of original images for train_all.json]: 3272
[Number of original annotations for train_all.json]: 26240
------------------------------------------------------------
[Number of combined images for train.json]: 2812
[Number of combined annotations for train.json]: 21679
[Number of combined images for train_all.json]: 3467
[Number of combined annotations for train_all.json]: 26931
------------------------------------------------------------


> # Save the new JSON files

In [200]:
# Write next to the input files: the paths are derived from data_dir so the
# outputs follow the inputs if the data directory is ever changed.
train_json_save_path = osp.join(data_dir, 'train_new.json')
train_all_json_save_path = osp.join(data_dir, 'train_all_new.json')

# The originals are left untouched; the merged datasets go to *_new.json.
with open(train_json_save_path, 'w', encoding='utf-8') as f:
    json.dump(new_train_json, f)

with open(train_all_json_save_path, 'w', encoding='utf-8') as f:
    json.dump(new_train_all_json, f)