In [122]:
import pandas as pd; pd.options.mode.chained_assignment = None
import numpy as np

# coco format Dataset annotation file에서 원하는 정보를 쉽게 가져올 수 있도록 도와주는 API
from pycocotools.coco import COCO


In [123]:
def df_gen(coco_obj):
    """Flatten a COCO annotation object into a one-row-per-box DataFrame.

    Parameters
    ----------
    coco_obj : pycocotools.coco.COCO
        Loaded annotation set. Only ``getImgIds``, ``loadImgs``,
        ``getAnnIds`` and ``loadAnns`` are called on it.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_id`` (the image's ``file_name``), ``class_name``,
        ``class_id``, ``x_min``, ``y_min``, ``x_max``, ``y_max``. Rows follow
        the order of ``getImgIds()`` and, within one image, of ``loadAnns()``.
    """
    # Category names, looked up by each annotation's ``category_id``.
    label_names = (
        "General trash", "Paper", "Paper pack", "Metal", "Glass",
        "Plastic", "Styrofoam", "Plastic bag", "Battery", "Clothing",
    )
    column_order = ('image_id', 'class_name', 'class_id',
                    'x_min', 'y_min', 'x_max', 'y_max')
    collected = {column: [] for column in column_order}

    for img_id in coco_obj.getImgIds():              # called without filters
        img_meta = coco_obj.loadImgs(img_id)[0]      # first record returned for this id
        annotations = coco_obj.loadAnns(coco_obj.getAnnIds(imgIds=img_meta['id']))
        file_name = img_meta['file_name']

        for annotation in annotations:
            category = annotation['category_id']
            box = annotation['bbox']
            left, top = float(box[0]), float(box[1])

            collected['image_id'].append(file_name)
            collected['class_name'].append(label_names[category])
            collected['class_id'].append(category)
            collected['x_min'].append(left)
            collected['y_min'].append(top)
            # bbox[2] / bbox[3] are added to the origin, i.e. used as width / height.
            collected['x_max'].append(left + float(box[2]))
            collected['y_max'].append(top + float(box[3]))

    # Assign column by column so the frame is built the same way for any input size.
    frame = pd.DataFrame()
    for column in column_order:
        frame[column] = collected[column]

    return frame

### Dataframe generation

In [None]:
# Reference annotation file that the fold-0 split is checked against below.
init_train = COCO('./dataset/coco_annotations.json')
init_df = df_gen(init_train)

# Fold-0 training annotations.
coco_train = COCO('./dataset/train_fold0.json')
train_df = df_gen(coco_train)

# Fold-0 validation annotations.
coco_val = COCO('./dataset/val_fold0.json')
val_df = df_gen(coco_val)

In [125]:
# Display the box table built from ./dataset/coco_annotations.json.
init_df

Unnamed: 0,image_id,class_name,class_id,x_min,y_min,x_max,y_max
0,train/0000.jpg,General trash,0,197.6,193.7,745.4,663.4
1,train/0001.jpg,Metal,3,0.0,407.4,57.6,588.0
2,train/0001.jpg,Plastic bag,7,0.0,455.6,144.6,637.2
3,train/0001.jpg,Glass,4,722.3,313.4,996.6,565.3
4,train/0001.jpg,Plastic,5,353.2,671.0,586.9,774.4
...,...,...,...,...,...,...,...
23139,train/4882.jpg,Plastic,5,0.0,116.2,944.1,930.3
23140,train/4882.jpg,Plastic bag,7,302.1,439.3,567.3,655.4
23141,train/4882.jpg,General trash,0,511.3,451.1,570.0,481.3
23142,train/4882.jpg,Paper,1,255.0,421.4,526.7,616.5


In [126]:
# Display the box table built from ./dataset/train_fold0.json.
train_df

Unnamed: 0,image_id,class_name,class_id,x_min,y_min,x_max,y_max
0,train/0001.jpg,Metal,3,0.0,407.4,57.6,588.0
1,train/0001.jpg,Plastic bag,7,0.0,455.6,144.6,637.2
2,train/0001.jpg,Glass,4,722.3,313.4,996.6,565.3
3,train/0001.jpg,Plastic,5,353.2,671.0,586.9,774.4
4,train/0001.jpg,Plastic,5,3.7,448.5,781.9,690.5
...,...,...,...,...,...,...,...
18496,train/4882.jpg,Plastic,5,0.0,116.2,944.1,930.3
18497,train/4882.jpg,Plastic bag,7,302.1,439.3,567.3,655.4
18498,train/4882.jpg,General trash,0,511.3,451.1,570.0,481.3
18499,train/4882.jpg,Paper,1,255.0,421.4,526.7,616.5


In [127]:
# Display the box table built from ./dataset/val_fold0.json.
val_df

Unnamed: 0,image_id,class_name,class_id,x_min,y_min,x_max,y_max
0,train/0000.jpg,General trash,0,197.6,193.7,745.4,663.4
1,train/0003.jpg,Paper pack,2,462.2,369.4,696.1,624.0
2,train/0003.jpg,Styrofoam,6,773.3,3.0,961.7,431.4
3,train/0007.jpg,Clothing,9,109.0,104.7,860.7,581.6
4,train/0007.jpg,Clothing,9,95.4,438.7,804.1,991.1
...,...,...,...,...,...,...,...
4638,train/4875.jpg,Paper pack,2,441.2,482.0,656.9,597.8
4639,train/4875.jpg,Paper pack,2,450.4,309.3,644.4,495.0
4640,train/4876.jpg,General trash,0,199.6,552.3,307.0,637.0
4641,train/4876.jpg,General trash,0,290.8,722.3,465.8,1023.7


In [None]:
# Share of training boxes per class, in percent, ordered by class name.
train_df.class_name.value_counts().sort_index() / len(train_df)*100

### Test Code
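1. Total number of boxes: 23144
2. Total number of images: 4883 (0000.jpg ~ 4882.jpg)
3. Mean box area: the train and validation means should differ by less than 10% (relative to the train mean)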


In [59]:
def check_num_box(df_ori, df_train, df_val):
    """Report whether the train and val row counts add up to the original's.

    Each DataFrame holds one row per box, so the split is consistent when
    ``len(df_train) + len(df_val) == len(df_ori)``. The outcome is printed;
    nothing is returned.

    Parameters
    ----------
    df_ori : pandas.DataFrame
        Rows from the unsplit annotation set.
    df_train, df_val : pandas.DataFrame
        Rows from the two halves of the split.
    """
    n_total = len(df_ori.index)
    n_train = len(df_train.index)
    n_val = len(df_val.index)

    # Guard clause: report the mismatch first and bail out.
    if n_total != n_train + n_val:
        print('Wrong Split result!')
        print(f'Total: {n_total}, Train: {n_train}, Val: {n_val}')
        return

    print(f'Total number of boxes : {n_total}')
    print(f'Train: {n_train}, Val: {n_val}')
    print('Correct!')

In [60]:
# Check that the train and val box counts sum to the original total.
check_num_box(init_df, train_df, val_df)

Total number of boxes : 23144
Train: 18501, Val: 4643
Correct!


In [96]:
# Concatenate the train/val results, sort them, then compare the count and each row.
def check_img_dup_and_cls(df_ori, df_train, df_val):
    """Check that the split keeps every image once, with its original classes.

    For each frame the unique ``class_id`` values are collected per
    ``image_id``. The train and val results are concatenated and compared
    with the original:

    * If an image appears in both splits, or the number of images differs
      from the original, ``'image duplication happened!'`` is printed.
    * Otherwise every original image is compared by class set. Images that
      are missing from the splits or whose class set differs are listed.

    The outcome is printed; nothing is returned.

    Parameters
    ----------
    df_ori, df_train, df_val : pandas.DataFrame
        Frames with at least ``image_id`` and ``class_id`` columns.
    """
    se_ori = df_ori.groupby('image_id')["class_id"].unique().sort_index()
    se_train = df_train.groupby('image_id')["class_id"].unique()
    se_val = df_val.groupby('image_id')["class_id"].unique()
    se_concat = pd.concat([se_train, se_val]).sort_index()

    # An image in both splits shows up as a repeated index label. Check this
    # explicitly: one duplicated plus one missing image keeps the lengths equal.
    if se_concat.index.has_duplicates or len(se_concat) != len(se_ori):
        print('image duplication happened!')
        return

    not_same_id = []
    for idx in se_ori.index:
        # Equal lengths do not guarantee the same labels, so guard the lookup.
        if idx not in se_concat.index:
            not_same_id.append(idx)
        # Both sides come from .unique(), so set equality covers the
        # "same length and same members" comparison.
        elif set(se_ori.loc[idx]) != set(se_concat.loc[idx]):
            not_same_id.append(idx)

    if not_same_id:
        print(f'Different value found at {not_same_id}!')
    else:
        print('Same class number in each images! \nNo problem!')

In [97]:
# Check that every image keeps the same set of classes after the split.
check_img_dup_and_cls(init_df, train_df, val_df)

Same class number in each images! 
No problem!


In [119]:
def check_area(df_train, df_val):
    """Compare the mean box area of the train and val splits.

    The mean area of each frame is computed from its ``x_min``, ``y_min``,
    ``x_max`` and ``y_max`` columns and truncated to an int. A warning is
    printed when the two means differ by 10% or more of the train mean.
    Nothing is returned.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Frames with ``x_min``, ``y_min``, ``x_max`` and ``y_max`` columns.
    """
    def _mean_area(frame):
        # (x_max - x_min) * (y_max - y_min) per row, averaged, then truncated.
        widths = frame.x_max - frame.x_min
        heights = frame.y_max - frame.y_min
        return int((widths * heights).values.mean())

    area_train = _mean_area(df_train)
    area_val = _mean_area(df_val)

    # Relative gap in percent, measured against the train mean.
    gap_pct = abs(area_train - area_val) / area_train * 100

    if gap_pct < 10.0:
        print('No problem about area!')
    else:
        print("Area diff is too big!")
        print(f'train mean area: {area_train}, val mean area: {area_val}')
    

In [120]:
def check_dataset_avail(df_ori, df_train, df_val):
    """Run all split sanity checks in order: box count, per-image classes, mean area.

    Each check prints its own result; nothing is returned.
    """
    split_frames = (df_train, df_val)

    check_num_box(df_ori, *split_frames)
    check_img_dup_and_cls(df_ori, *split_frames)
    check_area(*split_frames)

In [121]:
# Run all three split checks (box count, per-image classes, mean area) in one go.
check_dataset_avail(init_df, train_df, val_df)

Total number of boxes : 23144
Train: 18501, Val: 4643
Correct!
Same class number in each images! 
No problem!
No problem about area!
