In [1]:
from pycocotools.coco import COCO
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
import json
import random
# Seed Python's built-in RNG. The train/validation split further down passes
# its own random_state to train_test_split rather than relying on this seed.
random.seed(0)

In [2]:
# Path to the COCO-format annotation file.
annotation = '../../../detection/dataset/train.json'
coco = COCO(annotation)
# Shallow copy of the annotation-id -> annotation-record mapping held by `coco`.
annotations = {ann_id: ann for ann_id, ann in coco.anns.items()}

loading annotations into memory...
Done (t=0.08s)
creating index...
index created!


In [3]:
# Display the annotation record stored under key 3 to see which fields a record carries.
annotations[3]

{'image_id': 1,
 'category_id': 4,
 'area': 69096.17,
 'bbox': [722.3, 313.4, 274.3, 251.9],
 'iscrowd': 0,
 'id': 3}

In [4]:
# Map each image id to a {category index: bool} dict; a flag is True when the
# image has at least one annotation of that category.
n_categories = len(coco.cats)
images = {}
for ann in tqdm(annotations.values()):
    img_id = ann['image_id']
    if img_id not in images:
        # First annotation seen for this image: start with every flag off.
        images[img_id] = dict.fromkeys(range(n_categories), False)
    images[img_id][ann['category_id']] = True

  0%|          | 0/23144 [00:00<?, ?it/s]

In [5]:
# Empty frame with an image-id column followed by one column per category index.
df = pd.DataFrame(columns=['img_id', *range(len(coco.cats))])

In [6]:
# Build the per-image table in one go instead of appending row by row.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and each call
# copied the whole frame (quadratic in the number of images). Building fresh
# records also leaves the dicts in `images` unmodified and makes this cell safe
# to re-run without duplicating rows.
records = [{'img_id': img_id, **flags} for img_id, flags in tqdm(images.items())]
if records:
    # 'img_id' is the first key of every record, so it stays the first column,
    # followed by the category flags in their insertion order.
    df = pd.DataFrame(records)
# With no images at all, the empty frame created earlier (with its column
# layout) is kept as-is.

  0%|          | 0/4883 [00:00<?, ?it/s]

In [7]:
# Display the per-image table: one row per image, one boolean column per category.
df

Unnamed: 0,img_id,0,1,2,3,4,5,6,7,8,9
0,0,True,False,False,False,False,False,False,False,False,False
1,1,True,False,False,True,True,True,False,True,False,False
2,2,False,False,False,True,False,False,False,False,False,False
3,3,False,False,True,False,False,False,True,False,False,False
4,4,True,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
4878,4878,True,False,False,False,False,False,False,False,False,False
4879,4879,True,False,False,False,False,True,False,True,False,False
4880,4880,True,False,False,False,False,False,False,False,False,False
4881,4881,False,True,False,False,False,False,False,True,False,False


In [8]:
# Column-wise totals. For the category columns this counts the images that
# contain that category; the 'img_id' total is just a sum of ids and carries no meaning.
df.sum(axis=0)

img_id    11919403
0             2105
1             1714
2              642
3              598
4              340
5             1369
6              512
7             1893
8               46
9              229
dtype: object

In [9]:
# 90/10 split of the per-image table. The labels are the category flag columns;
# stratification uses only categories 8 and 9, the two with the fewest images
# in the column totals shown above.
category_columns = list(range(len(coco.cats)))
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df[category_columns],
    test_size=0.1,
    random_state=0,
    shuffle=True,
    stratify=df[[8, 9]],
)

In [10]:
# Undo the shuffle order inside each split by sorting on the row label.
X_train, X_test, y_train, y_test = (
    part.sort_index() for part in (X_train, X_test, y_train, y_test)
)

In [11]:
# Row counts of the full table, the training split and the validation split.
len(df), len(X_train), len(X_test)

(4883, 4394, 489)

In [12]:
# Display the training split (same columns as df, rows sorted by label).
X_train

Unnamed: 0,img_id,0,1,2,3,4,5,6,7,8,9
1,1,True,False,False,True,True,True,False,True,False,False
2,2,False,False,False,True,False,False,False,False,False,False
3,3,False,False,True,False,False,False,True,False,False,False
4,4,True,True,False,False,False,False,False,False,False,False
5,5,True,True,False,False,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
4878,4878,True,False,False,False,False,False,False,False,False,False
4879,4879,True,False,False,False,False,True,False,True,False,False
4880,4880,True,False,False,False,False,False,False,False,False,False
4881,4881,False,True,False,False,False,False,False,True,False,False


In [13]:
# One row per annotation: the image it belongs to and its category.
# The frame is built from a list of records in a single call. The previous
# row-by-row DataFrame.append was quadratic and the method no longer exists in
# pandas >= 2.0. Rows keep the iteration order of `annotations` and get the
# default 0..n-1 index, as before.
df_ann = pd.DataFrame(
    [
        {'img_id': ann['image_id'], 'category': ann['category_id']}
        for ann in tqdm(annotations.values())
    ],
    columns=['img_id', 'category'],  # keeps the schema even when there are no annotations
)

  0%|          | 0/23144 [00:00<?, ?it/s]

In [14]:
# Row labels of df_ann whose image belongs to the training split.
# Images are matched on the 'img_id' column, not on X_train's row labels: the two
# only coincide while the table's index happens to equal the image id. A single
# isin() pass also replaces one full scan of df_ann per image.
train_img_ids = X_train['img_id'].tolist()
a = df_ann.index[df_ann['img_id'].isin(train_img_ids)].tolist()
len(a)

20800

In [15]:
# Row labels of df_ann whose image belongs to the validation split.
# Images are matched on the 'img_id' column, not on X_test's row labels: the two
# only coincide while the table's index happens to equal the image id. A single
# isin() pass also replaces one full scan of df_ann per image.
valid_img_ids = X_test['img_id'].tolist()
a = df_ann.index[df_ann['img_id'].isin(valid_img_ids)].tolist()
len(a)

2344

## Make Train & Validation Dataset JSON

In [16]:
def makeJson(df, filename):
    """Write a COCO-style JSON file restricted to the images listed in ``df``.

    The 'info', 'licenses' and 'categories' sections are copied unchanged from
    the module-level ``coco`` object. 'images' and 'annotations' keep only the
    entries whose image id appears in ``df['img_id']``, in the order they have
    in ``coco.dataset``.

    Images and annotations are selected by their actual ids ('id' and
    'image_id'). The function therefore does not rely on the image id being
    equal to the row label of ``df`` or to the position of the record inside
    ``coco.dataset['images']`` / ``coco.dataset['annotations']``.

    Args:
        df: DataFrame with an 'img_id' column holding the image ids to keep.
        filename: Path of the JSON file to create (overwritten if it exists).
    """
    # tolist() yields plain Python values, so set membership behaves the same
    # regardless of the column's dtype.
    selected_ids = set(df['img_id'].tolist())
    json_config = {
        'info': coco.dataset['info'],
        'licenses': coco.dataset['licenses'],
        'images': [
            img for img in coco.dataset['images'] if img['id'] in selected_ids
        ],
        'categories': coco.dataset['categories'],
        'annotations': [
            ann for ann in coco.dataset['annotations']
            if ann['image_id'] in selected_ids
        ],
    }
    with open(filename, 'w') as fp:
        json.dump(json_config, fp)

In [17]:
# Write one COCO JSON per split: training first, then validation.
for split_df, out_path in ((X_train, 'split_train.json'), (X_test, 'split_valid.json')):
    makeJson(split_df, out_path)