In [3]:
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import json
from sklearn.model_selection import StratifiedGroupKFold
from copy import deepcopy

In [5]:
"""
idx를 받아 train/valid split annotation json 생성하는 함수

idx: annotations json 리스트에서의 index
id : Image/Annotation 식별 id
"""

def generate_split(train_idx, val_idx):
    train_data = deepcopy(data)
    val_data = deepcopy(data)
    
    df_imgs = pd.DataFrame(data['images'])
    df_annos = pd.DataFrame(data['annotations'])
    
    train_img_ids = df_annos.loc[train_idx]['image_id'].unique()
    val_img_ids = df_annos.loc[val_idx]['image_id'].unique()

    train_data['images'] = df_imgs[df_imgs['id'].isin(train_img_ids)].to_dict('records')
    train_data['annotations'] = df_annos.iloc[train_idx].to_dict('records')
    val_data['images'] = df_imgs[df_imgs['id'].isin(val_img_ids)].to_dict('records')
    val_data['annotations'] = df_annos.iloc[val_idx].to_dict('records')
    
    return train_data, val_data

In [None]:
annotation = '/opt/ml/input/data/train_all.json'

with open(annotation) as f:
    data = json.load(f)

# One (image_id, category_id) pair per annotated box.
var = [(ann['image_id'], ann['category_id']) for ann in data['annotations']]
X = np.ones((len(data['annotations']), 1))             # X: dummy values
y = np.array([category_id for _, category_id in var])  # y: class of each box
groups = np.array([image_id for image_id, _ in var])   # groups: image each box belongs to

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=411)

print('전체 Box 개수:', len(X))
# train_idx / val_idx are positions in the annotation list, not annotation ids.
for i, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    print(f'Fold {i} - {len(train_idx)}, {len(val_idx)}')
    train_data, val_data = generate_split(train_idx, val_idx)

    # Write the train file first, then the validation file, for this fold.
    outputs = (
        (f'/opt/ml/input/data/train-kfold-{i}.json', train_data),
        (f'/opt/ml/input/data/val-kfold-{i}.json', val_data),
    )
    for out_path, split in outputs:
        with open(out_path, 'w') as f:
            json.dump(split, f, indent=2)

전체 Box 개수: 26240
Fold 0 - 21199, 5041
Fold 1 - 20976, 5264
Fold 2 - 20887, 5353
Fold 3 - 21056, 5184
Fold 4 - 20842, 5398


In [None]:
# check distribution

def get_distribution(y):
    """Return the share of each class in ``y`` as percentage strings.

    Args:
        y: Collection of non-negative integer class labels.

    Returns:
        A list with one entry per class ``0 .. max(y)``; entry ``k`` is the
        fraction of labels equal to ``k``, formatted with two decimals and a
        percent sign. Classes that never occur are reported as ``0.00%``.
    """
    counts = Counter(y)
    total = sum(counts.values())

    shares = []
    for cls in range(np.max(y) + 1):
        shares.append(f'{counts[cls]/total:.2%}')
    return shares

# First row: class distribution over all annotations. Each fold then
# contributes one row for its train part and one for its validation part.
distrs = [get_distribution(y)]
index = ['training set']

# NOTE: a train/val group-overlap assertion used to sit here, commented out;
# StratifiedGroupKFold is documented to produce non-overlapping groups.
for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_y = y[train_idx]
    val_y = y[val_idx]

    distrs.extend([get_distribution(train_y), get_distribution(val_y)])
    index.extend([f'train - fold{fold_ind}', f'val - fold{fold_ind}'])

categories = [d['name'] for d in data['categories']]
# Column k is labelled with the k-th category name.
pd.DataFrame(
    distrs,
    index=index,
    columns=[categories[k] for k in range(np.max(y) + 1)],
)