In [10]:
from pycocotools.coco import COCO
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## JSON to DataFrame

In [11]:
# Location of the COCO-style training annotations.
data_dir = '/opt/ml/detection/dataset'
anno_root = os.path.join(data_dir, 'train.json')

# JSON is UTF-8 by specification; do not rely on the platform default encoding.
with open(anno_root, encoding='utf-8') as json_file:
    anns = json.load(json_file)

# Category names in file order (kept as a list under its original name).
label_name = [ann_dict['name'] for ann_dict in anns['categories']]
# Resolve names by category id instead of by list position, so the lookup
# stays correct even if ids are not 0..N-1 or not listed in order.
category_name_by_id = {ann_dict['id']: ann_dict['name'] for ann_dict in anns['categories']}

# One row per annotation.
df = pd.json_normalize(anns['annotations'])

# Spread the 4-element bbox list over four scalar columns, then drop the list column.
df[["X_min", "Y_min", "Width", "Height"]] = list(df.bbox)
df = df.drop(columns='bbox')

# Derived columns: image file path (image id zero-padded to 4 digits),
# square root of the bbox area, and the human-readable category name.
df['image_path'] = 'train/' + df['image_id'].astype(str).str.zfill(4) + '.jpg'
df['sqrt_area'] = np.sqrt(df['area'])
df['category_name'] = df['category_id'].map(category_name_by_id)

# Fix the column order used by the rest of the notebook.
df = df[['id', 'image_id', 'image_path', 'category_id', 'category_name', 'area', 'sqrt_area', 'X_min', 'Y_min', 'Width', 'Height', 'iscrowd']]

# 'iscrowd' carries no information when every row holds the same value.
if df['iscrowd'].nunique(dropna=False) == 1:
    df = df.drop(columns='iscrowd')

print(len(df))
df.head()

23144


Unnamed: 0,id,image_id,image_path,category_id,category_name,area,sqrt_area,X_min,Y_min,Width,Height
0,0,0,train/0000.jpg,0,General trash,257301.66,507.24911,197.6,193.7,547.8,469.7
1,1,1,train/0001.jpg,3,Metal,10402.56,101.992941,0.0,407.4,57.6,180.6
2,2,1,train/0001.jpg,7,Plastic bag,26259.36,162.0474,0.0,455.6,144.6,181.6
3,3,1,train/0001.jpg,4,Glass,69096.17,262.861503,722.3,313.4,274.3,251.9
4,4,1,train/0001.jpg,5,Plastic,24164.58,155.449606,353.2,671.0,233.7,103.4


## bbox 면적이 20000 이하인 데이터

In [12]:
# Ids of annotations whose bbox area is at most 20000. The bound is inclusive
# so this preview agrees with the section heading ("20000 or less") and with
# get_area_ids(df, maximum=20000) used for the export further down.
query = df.loc[df['area'] <= 20000, 'id']
query

1            1
7            7
8            8
12          12
17          17
         ...  
23131    23131
23132    23132
23136    23136
23137    23137
23141    23141
Name: id, Length: 8543, dtype: int64

## image당 bbox의 개수가 5개 이하인 데이터

In [13]:
# Number of annotation rows (bounding boxes) per image, indexed by image_id.
query = df.groupby('image_id').size()

# Despite the name, these are the distinct image ids in ascending order.
image_paths = sorted(df['image_id'].unique())

# Keep the images that have at most five bounding boxes.
fivebbox = []
for image_id in image_paths:
    if query[image_id] <= 5:
        fivebbox.append(image_id)

fivebbox[:5]

[0, 2, 3, 6, 7]

## export JSON

In [14]:
def read_json(root):
    """Load a JSON file and return the parsed content.

    Args:
        root: Path of the JSON file to read.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces for it).
    """
    # Read as UTF-8 explicitly: save_json writes UTF-8, and the platform
    # default encoding is not guaranteed to match.
    with open(root, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
    return json_data


def get_area_ids(dataframe, minimum=None, maximum=None):
    """Return the ids of rows whose 'area' lies within the given bounds.

    Both bounds are inclusive. Either one may be omitted to leave that side
    unbounded, but at least one must be given.

    Args:
        dataframe: DataFrame with 'id' and 'area' columns.
        minimum: Lower bound on 'area', or None for no lower bound.
        maximum: Upper bound on 'area', or None for no upper bound.

    Returns:
        A 1-D array with the 'id' values of the matching rows, or None (after
        printing a hint) when neither bound is supplied.
    """
    # Compare against None rather than relying on truthiness, so that a
    # bound of 0 is honoured instead of being treated as "not given".
    if minimum is None and maximum is None:
        print("input limitation values!!")
        return None

    lower = float('-inf') if minimum is None else minimum
    upper = float('inf') if maximum is None else maximum

    # Series.between is inclusive on both ends and evaluates element-wise;
    # Python's `and` cannot combine two boolean Series.
    in_range = dataframe['area'].between(lower, upper)
    return dataframe.loc[in_range, 'id'].values


def get_bbox_ids(dataframe, minimum=None, maximum=None):
    """Return the image ids whose number of rows lies within the given bounds.

    Each row of ``dataframe`` counts as one bounding box of its 'image_id'.
    Both bounds are inclusive. Either one may be omitted to leave that side
    unbounded, but at least one must be given.

    Args:
        dataframe: DataFrame with an 'image_id' column, one row per box.
        minimum: Minimum number of boxes per image, or None for no lower bound.
        maximum: Maximum number of boxes per image, or None for no upper bound.

    Returns:
        A list of matching image ids in ascending order, or None (after
        printing a hint) when neither bound is supplied.
    """
    # Compare against None rather than relying on truthiness, so that a
    # bound of 0 is honoured instead of being treated as "not given".
    if minimum is None and maximum is None:
        print("input limitation values!!")
        return None

    lower = float('-inf') if minimum is None else minimum
    upper = float('inf') if maximum is None else maximum

    # Count boxes per image from the frame that was passed in (not from a
    # notebook-level global); groupby returns the image ids already sorted.
    box_counts = dataframe.groupby('image_id').size()
    selected = box_counts[box_counts.between(lower, upper)]
    return selected.index.tolist()


def save_json(anno_root, ids, save_dir):
    """Write a copy of an annotation file restricted to the given image ids.

    Entries of 'images' are kept when their 'id' is in ``ids``; entries of
    'annotations' are kept when their 'image_id' is in ``ids``. All other
    top-level keys of the source file are written back unchanged.

    Args:
        anno_root: Path of the source annotation JSON file.
        ids: Iterable of image ids to keep.
        save_dir: Path of the JSON file to write (a file path, despite the name).
    """
    # Build the lookup once: set membership is O(1), whereas testing against
    # a list/array rescans it for every image and annotation.
    keep_ids = set(ids)

    json_file = read_json(anno_root)
    json_file['images'] = [image for image in json_file['images'] if image['id'] in keep_ids]
    json_file['annotations'] = [anno for anno in json_file['annotations'] if anno['image_id'] in keep_ids]

    with open(save_dir, 'w', encoding='utf-8') as save_file:
        json.dump(json_file, save_file, indent='\t')

    print(f'json file: {save_dir} save done!')


In [16]:
anno_root = os.path.join(data_dir, 'train.json')
save_dir = '../utils/area20000.json'

# Annotation ids whose bbox area is at most 20000.
ids = get_area_ids(df, maximum=20000)
print(ids[:10])

# save_json filters on image ids (image['id'] / anno['image_id']), so the
# annotation ids must first be translated to the images that contain them.
# NOTE(review): save_json keeps every annotation of a selected image, not
# only the small ones -- confirm this is the intended export.
image_ids = df.loc[df['id'].isin(ids), 'image_id'].unique()

save_json(anno_root, image_ids, save_dir)

[ 1  7  8 12 17 19 20 21 22 23]
json file: ../utils/area20000.json save done!
