In [1]:
import pandas as pd

import json

from tqdm import tqdm
import os
import copy

In [2]:
# google drive data 폴더에 있는 ver3 내부 csv 파일들을 받아야 합니다.
def data_cleasing(json_file):
    print(f'***{json_file}***')


    # 전처리할 데이터 불러오기
    data_dir = '../../../data/stratified_kfold'

    with open(os.path.join(data_dir, json_file), 'r', encoding='UTF-8') as data_json:
        data = json.load(data_json)
    
        info = data['info']
        licenses = data['licenses']
        categories = data['categories']
    
        images = data['images']
        annotations = data['annotations']

    # 제외할 이미지 목록 불러오기
    df_image_exclude = pd.read_csv('./ver3/image.csv')
    img_exclude_list = list(df_image_exclude.file_name)

    # 이미지 제외하기
    new_images = []
    new_annotations = []

    annotation_id = 0
    image_id = 0

    cnt = 0

    print('excluding images...')
    for img in tqdm(images):

        file_name = img['file_name']

        for ann in annotations:
            # 제외할 목록에 있다면 넘어가기
            if file_name in img_exclude_list:
                continue
        
            if ann['image_id'] == img['id']:
                new_annotations.append(copy.deepcopy(ann))
                new_annotations[-1]['id'] = annotation_id
                new_annotations[-1]['image_id'] = image_id
                annotation_id += 1

        if file_name in img_exclude_list:
            cnt += 1
        else:
            new_images.append(copy.deepcopy(img))
            new_images[-1]['id'] = image_id
            image_id += 1

    print(f'{cnt} images excluded.')

    # 잘못된 category_id 변경해주기
    df_anno = pd.read_csv('./ver3/anno.csv')

    wrong_anno_files = list(df_anno.file_name)
    wrong_cats = df_anno.category
    
    cnt = 0

    for f, c in zip(wrong_anno_files, wrong_cats):
        # Plastic bag 내부 object의 category가 2가지인 경우도 있다.
        c = c.split(', ')

        for img in new_images:
            # revise해야할 file_name과 겹친다면
            if f == img['file_name']:
                for i in range(len(new_annotations)):
                    if new_annotations[i]['image_id'] == img['id']:
                        # 바뀌어야 할 category와 같다면 (Plastic bag 내부의 object라면)
                        if new_annotations[i]['category_id'] == int(c[0]):
                            new_annotations[i]['category_id'] = 8
                            cnt += 1

                        if len(c) > 1:
                            if new_annotations[i]['category_id'] == int(c[1]):
                                new_annotations[i]['category_id'] = 8
                                cnt += 1
    
    print(f'{cnt} categories changed.')

    # Plastic bag 내부 object category가 외부 object category와 겹친다면,
    # Plastic bag bbox 안 object의 category 변경하기
    df_bbox = pd.read_csv('./ver3/bbox.csv')

    wrong_img_id = []

    for img in new_images:
        if img['file_name'] in list(df_bbox.file_name):
            wrong_img_id.append(img['id'])

    cnt = 0

    for img_id in wrong_img_id:
        for i in range(len(new_annotations)):
            # Plastic bag의 bbox 구하기
            if img_id == new_annotations[i]['image_id'] and new_annotations[i]['category_id'] == 8:
                bbox_pb = new_annotations[i]['bbox']

                x_min_pb = bbox_pb[0]
                y_min_pb = bbox_pb[1]
                x_max_pb = x_min_pb + bbox_pb[2]
                y_max_pb = y_min_pb + bbox_pb[3]

                for j in range(len(new_annotations)):
                    if j != i and new_annotations[j]['image_id'] == img_id:
                        bbox = new_annotations[j]['bbox']

                        x_min = bbox[0]
                        y_min = bbox[1]
                        x_max = x_min + bbox[2]
                        y_max = y_min + bbox[3]

                        # Plastic bag bbox 안에 object가 있다면 Plastic bag으로 바꿔주기
                        if x_min_pb <= x_min and y_min_pb <= y_min and x_max_pb >= x_max and y_max_pb >= y_max:
                            new_annotations[j]['category_id'] = 8
                            cnt += 1

    print(f'{cnt} categories changed.')

    json_file = json_file.split('.')[0]
    print(f'saving revised {json_file}...')
    print()

    with open(os.path.join(data_dir, f'{json_file}_revised.json'), 'w', encoding='UTF-8') as new_json:
        json.dump({ 'info': info, 'licenses': licenses, 'images': new_images, 
            'annotations': new_annotations, 'categories': categories}, new_json, indent=1)


In [3]:
data_cleasing('train0.json')
data_cleasing('train1.json')
data_cleasing('train2.json')
data_cleasing('train3.json')
data_cleasing('train4.json')
data_cleasing('val0.json')
data_cleasing('val1.json')
data_cleasing('val2.json')
data_cleasing('val3.json')
data_cleasing('val4.json')

***train0.json***
excluding images...


100%|██████████| 2775/2775 [01:18<00:00, 35.40it/s]


39 images excluded.
47 categories changed.
16 categories changed.
saving revised train0...

***train1.json***
excluding images...


100%|██████████| 2771/2771 [01:20<00:00, 34.53it/s]


41 images excluded.
65 categories changed.
15 categories changed.
saving revised train1...

***train2.json***
excluding images...


100%|██████████| 2774/2774 [01:21<00:00, 34.13it/s]


52 images excluded.
54 categories changed.
23 categories changed.
saving revised train2...

***train3.json***
excluding images...


100%|██████████| 2770/2770 [01:20<00:00, 34.41it/s]


45 images excluded.
48 categories changed.
27 categories changed.
saving revised train3...

***train4.json***
excluding images...


100%|██████████| 2778/2778 [01:18<00:00, 35.53it/s]


43 images excluded.
46 categories changed.
27 categories changed.
saving revised train4...

***val0.json***
excluding images...


100%|██████████| 692/692 [00:07<00:00, 92.78it/s] 


16 images excluded.
18 categories changed.
11 categories changed.
saving revised val0...

***val1.json***
excluding images...


100%|██████████| 696/696 [00:06<00:00, 100.77it/s]


14 images excluded.
0 categories changed.
12 categories changed.
saving revised val1...

***val2.json***
excluding images...


100%|██████████| 693/693 [00:06<00:00, 102.19it/s]


3 images excluded.
11 categories changed.
4 categories changed.
saving revised val2...

***val3.json***
excluding images...


100%|██████████| 697/697 [00:06<00:00, 103.16it/s]


10 images excluded.
17 categories changed.
0 categories changed.
saving revised val3...

***val4.json***
excluding images...


100%|██████████| 689/689 [00:07<00:00, 94.45it/s] 


12 images excluded.
19 categories changed.
0 categories changed.
saving revised val4...

