In [1]:
import json
import numpy as np

IMAGE_NUM = 4883
CLASS_NUM = 10

In [2]:
json_file = open('/opt/ml/detection/dataset/train.json')
infos = json.load(json_file)
json_file.close()


annotations = infos['annotations']
#print(annotations)

In [5]:
import pandas as pd

image_id = []
category_id = []
area = []
bbox = []
iscrowd = []
id = []

for i in range(len(annotations)):
    image_id.append(annotations[i]["image_id"])
    category_id.append(annotations[i]["category_id"])
    area.append(annotations[i]["area"])
    bbox.append(annotations[i]["bbox"])
    iscrowd.append(annotations[i]["iscrowd"])
    id.append(annotations[i]["id"])

train = pd.DataFrame({"image_id":image_id, 
"category_id":category_id, 
"area": area, 
"bbox": bbox, 
"iscrowd": iscrowd, 
"id": id})
train.head()

#train.to_csv("train_strategies.csv",mode='w')

Unnamed: 0,image_id,category_id,area,bbox,iscrowd,id
0,0,0,257301.66,"[197.6, 193.7, 547.8, 469.7]",0,0
1,1,3,10402.56,"[0.0, 407.4, 57.6, 180.6]",0,1
2,1,7,26259.36,"[0.0, 455.6, 144.6, 181.6]",0,2
3,1,4,69096.17,"[722.3, 313.4, 274.3, 251.9]",0,3
4,1,5,24164.58,"[353.2, 671.0, 233.7, 103.4]",0,4


In [6]:
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

In [7]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [8]:
#train_x = pd.read_csv('../input/train/train.csv')
train_x = train
train_y = train_x.category_id.values#category id로 나누기
groups = np.array(train_x.id.values)#id로 그룹

def get_distribution(y_vals):
        y_distr = Counter(y_vals)
        y_vals_sum = sum(y_distr.values())
        return [f'{y_distr[i] / y_vals_sum:.2%}' for i in range(np.max(y_vals) + 1)]

In [11]:
distrs = [get_distribution(train_y)]
index = ['training set']

train_dict = dict()
valid_dict = dict()

train_dict['info'] = infos['info']
train_dict['licenses'] = infos['licenses']
train_dict['images'] = list()
train_dict['annotations'] = list()
train_dict['categories'] = infos['categories']
valid_dict['info'] = infos['info']
valid_dict['licenses'] = infos['licenses']
valid_dict['images'] = list()
valid_dict['annotations'] = list()
valid_dict['categories'] = infos['categories']


for fold_ind, (dev_ind, val_ind) in enumerate(stratified_group_k_fold(train_x, train_y, groups, k=5)):
    dev_y, val_y = train_y[dev_ind], train_y[val_ind]
    dev_groups, val_groups = groups[dev_ind], groups[val_ind]#골라진 id들이 들어가있음


    print("폴더 dev",fold_ind,dev_groups)
    print("폴더 val",fold_ind,val_groups)

    
    assert len(set(dev_groups) & set(val_groups)) == 0
    
    distrs.append(get_distribution(dev_y))
    index.append(f'development set - fold {fold_ind}')
    distrs.append(get_distribution(val_y))
    index.append(f'validation set - fold {fold_ind}')

#마지막 fold를 가져옴.
for img_info in infos['images']:
        image_id = img_info['id']
        if image_id in val_groups:
            valid_dict['images'].append(img_info)
        else:
            train_dict['images'].append(img_info)

    
for ann_info in infos['annotations']:
    image_id = ann_info['image_id']
    if image_id in val_groups:
        valid_dict['annotations'].append(ann_info)
    else:
        train_dict['annotations'].append(ann_info)

display('Distribution per class:')
pd.DataFrame(distrs, index=index, columns=[f'Label {l}' for l in range(np.max(train_y) + 1)])

폴더 dev 0 [    0     1     2 ... 23141 23142 23143]
폴더 val 0 [    6     9    11 ... 23125 23137 23140]
폴더 dev 1 [    0     2     3 ... 23140 23142 23143]
폴더 val 1 [    1     7     8 ... 23134 23136 23141]
폴더 dev 2 [    0     1     2 ... 23140 23141 23143]
폴더 val 2 [    3    12    14 ... 23120 23131 23142]
폴더 dev 3 [    1     2     3 ... 23140 23141 23142]
폴더 val 3 [    0    13    22 ... 23113 23138 23143]
폴더 dev 4 [    0     1     3 ... 23141 23142 23143]
폴더 val 4 [    2     4     5 ... 23132 23135 23139]


'Distribution per class:'

Unnamed: 0,Label 0,Label 1,Label 2,Label 3,Label 4,Label 5,Label 6,Label 7,Label 8,Label 9
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
development set - fold 0,17.14%,27.45%,3.87%,4.04%,4.24%,12.72%,5.46%,22.38%,0.69%,2.02%
validation set - fold 0,17.13%,27.43%,3.88%,4.06%,4.25%,12.71%,5.46%,22.36%,0.69%,2.03%
development set - fold 1,17.14%,27.45%,3.87%,4.05%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
validation set - fold 1,17.12%,27.44%,3.89%,4.04%,4.25%,12.72%,5.46%,22.37%,0.69%,2.03%
development set - fold 2,17.14%,27.45%,3.88%,4.05%,4.25%,12.71%,5.46%,22.37%,0.69%,2.02%
validation set - fold 2,17.13%,27.44%,3.87%,4.04%,4.23%,12.72%,5.47%,22.38%,0.69%,2.03%
development set - fold 3,17.13%,27.44%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
validation set - fold 3,17.15%,27.46%,3.87%,4.04%,4.24%,12.71%,5.45%,22.38%,0.69%,2.01%
development set - fold 4,17.13%,27.44%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%


In [12]:
with open("strategies_train.json", "w") as json_train:
    json.dump(train_dict, json_train)
with open("strategies_valid.json", "w") as json_valid:
    json.dump(valid_dict, json_valid)