In [15]:
import random
import numpy as np
import pandas as pd
from pycocotools.coco import COCO
from collections import Counter, defaultdict


In [16]:

# Load the COCO-format training annotations; pycocotools builds its index here.
coco = COCO('/opt/ml/detection/dataset/train.json')

# One row per annotation record of the loaded dataset.
df = pd.DataFrame(data=coco.dataset['annotations'])

# Columns used by the fold-splitting cells:
#   X      - annotation (object) id       (original note: 0~23143)
#   y      - category id of each object   (original note: 0~9)
#   groups - id of the owning image       (original note: 0~4882)
X, y, groups = df['id'], df['category_id'], df['image_id']

# Seed for the shuffle and the number of folds.
seed, k = 777, 5


loading annotations into memory...
Done (t=0.16s)
creating index...
index created!


In [61]:
# Count the image ids returned by the COCO index (no filter arguments are given).
len(coco.getImgIds())

4883

In [17]:

labels_num = y.max() + 1
# https://stackoverflow.com/a/39132900/14019325
# Same as the first loop of the original code: count, for every image, how
# many annotations it has of each label (rows: image_id, columns: category_id).
y_counts_per_group = df.groupby(['image_id', 'category_id']).size().unstack(fill_value=0)
# Running per-fold label totals. The width comes from the columns actually
# present in y_counts_per_group instead of labels_num: unstack() creates no
# column for a category id that never occurs (e.g. non-contiguous ids), and a
# (k, labels_num) array would then fail to broadcast in the loop below.
y_counts_per_fold = np.zeros((k, y_counts_per_group.shape[1]))

# Pre-compute the scaling to save work: divide every label column by its
# total over all images, so each column sums to 1.
y_norm_counts_per_group = y_counts_per_group / y_counts_per_group.sum()
# shuffle & sort: shuffle the images with the fixed seed, then order them by
# the standard deviation of their normalised label counts, largest first.
shuffled_and_sorted_index = y_norm_counts_per_group.sample(frac=1, random_state=seed).std(axis=1).sort_values(ascending=False).index
y_norm_counts_per_group = y_norm_counts_per_group.loc[shuffled_and_sorted_index]

# fold index -> set of image ids assigned to that fold
groups_per_fold = defaultdict(set)

# Greedy assignment: each image goes to the fold that keeps the per-label
# totals most even across folds.
for g, y_counts in zip(y_norm_counts_per_group.index, y_norm_counts_per_group.values):
    best_fold = None
    min_eval = None
    for fold_i in range(k):
        # Same as eval_y_counts_per_fold in the original code: temporarily add
        # the image to this fold, score the result, then undo the addition.
        y_counts_per_fold[fold_i] += y_counts
        # Score = std across folds for each label, averaged over labels
        # (numpy is used to simplify the computation).
        fold_eval = y_counts_per_fold.std(axis=0).mean()
        y_counts_per_fold[fold_i] -= y_counts
        # Strict '<' keeps the lowest fold index when scores tie.
        if min_eval is None or fold_eval < min_eval:
            min_eval = fold_eval
            best_fold = fold_i
    y_counts_per_fold[best_fold] += y_counts
    groups_per_fold[best_fold].add(g)

In [27]:
def get_distribution(y_vals, num_labels=None):
    """Return the share of each label in ``y_vals`` as percentage strings.

    Args:
        y_vals: Sequence of non-negative integer labels (list, NumPy array
            or pandas Series).
        num_labels: Number of labels to report, i.e. the length of the
            result. When omitted it is ``max(y_vals) + 1``, so the length
            then depends on the largest label present in ``y_vals``. Pass it
            explicitly to get rows of equal length for subsets that may lack
            the highest label. Labels >= ``num_labels`` still count towards
            the total but are not reported.

    Returns:
        list of str: element ``i`` is the fraction of ``y_vals`` equal to
        ``i``, formatted with ``'.2%'``. Empty input gives ``'0.00%'`` for
        each of ``num_labels`` labels (an empty list when ``num_labels`` is
        omitted).
    """
    y_distr = Counter(y_vals)
    y_vals_sum = sum(y_distr.values())
    if num_labels is None:
        # np.max raises on empty input, so only consult it when there is data.
        num_labels = np.max(y_vals) + 1 if y_vals_sum else 0
    if not y_vals_sum:
        # Nothing to count: avoid dividing by zero.
        return [f'{0:.2%}' for _ in range(num_labels)]
    return [f'{y_distr[i] / y_vals_sum:.2%}' for i in range(num_labels)]

# Every image id that owns at least one annotation.
all_groups = set(groups)

# The first row of the table describes the full annotation set.
index = ['training set']
distrs = [get_distribution(y)]

for i in range(k):
    # Images assigned to fold i form its validation split; the rest train.
    test_groups = groups_per_fold[i]
    train_groups = all_groups.difference(groups_per_fold[i])

    # Select the annotations whose image falls in each split.
    train_df = df.loc[groups.isin(train_groups)]
    valid_df = df.loc[groups.isin(test_groups)]

    # Append the train row first, then the validation row, for this fold.
    for row_name, split_df in (
        (f'train set - fold {i + 1}', train_df),
        (f'validation set - fold {i + 1}', valid_df),
    ):
        distrs.append(get_distribution(split_df['category_id'].values))
        index.append(row_name)

print('Distribution per class:')
pd.DataFrame(
    distrs,
    index=index,
    columns=[f'Label {label}' for label in range(np.max(y) + 1)],
)

Distribution per class:


Unnamed: 0,Label 0,Label 1,Label 2,Label 3,Label 4,Label 5,Label 6,Label 7,Label 8,Label 9
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
train set - fold 1,17.14%,27.45%,3.88%,4.04%,4.24%,12.71%,5.45%,22.37%,0.69%,2.02%
validation set - fold 1,17.14%,27.45%,3.87%,4.04%,4.24%,12.73%,5.47%,22.37%,0.67%,2.03%
train set - fold 2,17.13%,27.45%,3.88%,4.05%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
validation set - fold 2,17.15%,27.42%,3.87%,4.04%,4.25%,12.72%,5.46%,22.37%,0.69%,2.03%
train set - fold 3,17.14%,27.45%,3.87%,4.05%,4.24%,12.72%,5.45%,22.37%,0.69%,2.03%
validation set - fold 3,17.13%,27.44%,3.89%,4.04%,4.24%,12.71%,5.47%,22.39%,0.69%,2.01%
train set - fold 4,17.14%,27.44%,3.87%,4.05%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
validation set - fold 4,17.13%,27.45%,3.89%,4.04%,4.25%,12.70%,5.44%,22.38%,0.69%,2.03%
train set - fold 5,17.14%,27.44%,3.88%,4.04%,4.24%,12.71%,5.46%,22.38%,0.69%,2.03%


In [101]:
# Scratch cell: draw one integer from 0 to 4 inclusive. No seed is set for the
# `random` module in the visible cells, so the displayed value is not expected
# to repeat on a re-run. `random` is already imported in the first cell.
import random
random.randint(0,4)

1

In [35]:
# Display the validation annotations left over from the fold loop: valid_df is
# reassigned on every iteration, so this is the split of the last fold processed.
valid_df

Unnamed: 0,image_id,category_id,area,bbox,iscrowd,id
41,7,9,358485.73,"[109.0, 104.7, 751.7, 476.9]",0,41
42,7,9,391485.88,"[95.4, 438.7, 708.7, 552.4]",0,42
51,10,0,102452.04,"[377.9, 255.0, 368.4, 278.1]",0,51
77,15,6,4887.96,"[844.2, 499.5, 48.3, 101.2]",0,77
78,15,3,138850.34,"[423.9, 390.4, 424.1, 327.4]",0,78
...,...,...,...,...,...,...
23139,4882,5,768591.81,"[0.0, 116.2, 944.1, 814.1]",0,23139
23140,4882,7,57309.72,"[302.1, 439.3, 265.2, 216.1]",0,23140
23141,4882,0,1772.74,"[511.3, 451.1, 58.7, 30.2]",0,23141
23142,4882,1,53008.67,"[255.0, 421.4, 271.7, 195.1]",0,23142
