In [1]:
import json
import os
import random
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

def split_dataset(annotation='/opt/ml/input/data/train_all.json',
                  output_dir='/opt/ml/input/data',
                  n_splits=5,
                  random_state=411):
    """Split a COCO-style annotation file into stratified, image-grouped folds.

    Every annotation is treated as one sample: its ``category_id`` is the
    stratification label and its ``image_id`` is the group, so all annotations
    of one image land on the same side of a fold. For each fold two JSON files
    are written to ``output_dir``: ``fold_{i}_train.json`` and
    ``fold_{i}_val.json``, each holding the ``images``, ``annotations`` and
    (unchanged) ``categories`` lists.

    Note: images are selected by the image ids found in the annotations, so an
    image that has no annotation is written to neither the train nor the val
    file.

    Args:
        annotation: Path of the JSON file to split. It must contain the keys
            ``images``, ``annotations`` and ``categories``.
        output_dir: Directory the fold files are written to. It is created if
            it does not exist.
        n_splits: Number of folds passed to ``StratifiedGroupKFold``.
        random_state: Seed passed to ``StratifiedGroupKFold`` (shuffling is on).

    Returns:
        None. The result is the set of files written to ``output_dir``; the
        path of each file is printed after it is written.
    """
    with open(annotation) as f:
        dataset = json.load(f)

    images = dataset['images']
    annotations = dataset['annotations']
    categories = dataset['categories']

    # Include the prefix directory in file_name (when using the CocoDataset class):
    # for image in images:
    #     image['file_name'] = '{}/{}'.format(image['file_name'][0], image['file_name'])

    # One sample per annotation. The features are irrelevant for the splitter,
    # so X is a dummy column of ones; only the labels and groups matter.
    X = np.ones((len(annotations), 1))
    y = np.array([ann['category_id'] for ann in annotations])
    groups = np.array([ann['image_id'] for ann in annotations])

    cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
        # Build the id sets once per fold. Testing `id in ndarray` scans the
        # whole array on every lookup, which is quadratic over the dataset.
        train_ids = set(groups[train_idx].tolist())
        val_ids = set(groups[val_idx].tolist())

        train_images = [x for x in images if x.get('id') in train_ids]
        val_images = [x for x in images if x.get('id') in val_ids]
        train_annotations = [x for x in annotations if x.get('image_id') in train_ids]
        val_annotations = [x for x in annotations if x.get('image_id') in val_ids]

        train_data = {
            'images': train_images,
            'annotations': train_annotations,
            'categories': categories,
        }

        val_data = {
            'images': val_images,
            'annotations': val_annotations,
            'categories': categories,
        }

        output_train_json = os.path.join(output_dir, f'fold_{fold_idx}_train.json')
        output_val_json = os.path.join(output_dir, f'fold_{fold_idx}_val.json')

        with open(output_train_json, 'w') as train_writer:
            json.dump(train_data, train_writer)
        print(f'write {output_train_json}')

        with open(output_val_json, 'w') as val_writer:
            json.dump(val_data, val_writer)
        print(f'write {output_val_json}')

In [2]:
# Write the train/val JSON pair for every fold to disk.
split_dataset()

write /opt/ml/input/data/fold_0_train.json
write /opt/ml/input/data/fold_0_val.json
write /opt/ml/input/data/fold_1_train.json
write /opt/ml/input/data/fold_1_val.json
write /opt/ml/input/data/fold_2_train.json
write /opt/ml/input/data/fold_2_val.json
write /opt/ml/input/data/fold_3_train.json
write /opt/ml/input/data/fold_3_val.json
write /opt/ml/input/data/fold_4_train.json
write /opt/ml/input/data/fold_4_val.json


In [5]:
import pandas as pd
from collections import Counter

def get_distribution(y, num_classes=None):
    """Return the share of each integer label in ``y`` as percentage strings.

    Args:
        y: Array-like of non-negative integer labels.
        num_classes: Length of the returned list. When ``None`` (the default)
            it is ``max(y) + 1``, or ``0`` for empty input. Pass the total
            number of classes to get equally long results from inputs that do
            not all contain the highest label. Labels ``>= num_classes`` still
            count towards the total but get no entry of their own.

    Returns:
        A list of ``num_classes`` strings; entry ``i`` is the fraction of
        elements equal to ``i``, formatted with two decimals (``'12.34%'``).
        Labels that never occur, and every entry for empty input, are
        ``'0.00%'``.
    """
    labels = np.asarray(y).ravel()
    total = labels.size

    if num_classes is None:
        # np.max raises on an empty array, so empty input is handled here.
        num_classes = int(labels.max()) + 1 if total else 0

    if total == 0:
        # Avoid dividing by zero when there is nothing to count.
        return ['0.00%'] * num_classes

    y_distr = Counter(labels.tolist())
    return [f'{y_distr[i] / total:.2%}' for i in range(num_classes)]

# Build one row of per-category annotation shares for the train and the val
# file of every fold, then show them as a table.
distrs, index = [], []

for fold in range(5):
    train, val = f'/opt/ml/input/data/fold_{fold}_train.json', f'/opt/ml/input/data/fold_{fold}_val.json'
    with open(train) as f1:
        train_dataset = json.load(f1)
    with open(val) as f2:
        val_dataset = json.load(f2)

    train_y = np.array([ann['category_id'] for ann in train_dataset['annotations']])
    val_y = np.array([ann['category_id'] for ann in val_dataset['annotations']])

    distrs.append(get_distribution(train_y))
    distrs.append(get_distribution(val_y))
    index.append(f'train - fold{fold}')
    index.append(f'val - fold{fold}')

# The unsplit file determines how many category columns the table has.
annotation = '/opt/ml/input/data/train_all.json'
with open(annotation) as f:
    data = json.load(f)
# Column names, indexed by category id.
categories =  ['Background', 'General trash', 'Paper', 'Paper pack', 'Metal',
                'Glass', 'Plastic', 'Styrofoam', 'Plastic bag', 'Battery', 'Clothing']
y = np.array([ann['category_id'] for ann in data['annotations']])

# Each row above is sized by the largest category id present in its own split,
# so a split that lacks the highest id produces a shorter row. Pad such rows so
# every row has exactly one cell per column instead of leaving holes.
num_classes = int(np.max(y)) + 1
distrs = [row + ['0.00%'] * (num_classes - len(row)) for row in distrs]
pd.DataFrame(distrs, index=index, columns = [categories[i] for i in range(num_classes)])

Unnamed: 0,Background,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
train - fold0,0.00%,10.75%,35.78%,2.54%,2.15%,2.37%,11.82%,4.96%,28.74%,0.27%,0.62%
val - fold0,0.00%,10.00%,34.22%,2.38%,2.12%,2.14%,11.60%,5.77%,30.75%,0.12%,0.89%
train - fold1,0.00%,10.48%,34.53%,2.36%,2.26%,2.53%,12.08%,5.03%,29.76%,0.24%,0.73%
val - fold1,0.00%,11.09%,39.27%,3.12%,1.67%,1.52%,10.58%,5.45%,26.61%,0.25%,0.44%
train - fold2,0.00%,10.37%,35.78%,2.68%,2.01%,2.22%,11.45%,5.08%,29.48%,0.27%,0.66%
val - fold2,0.00%,11.51%,34.34%,1.85%,2.67%,2.73%,13.04%,5.27%,27.74%,0.11%,0.75%
train - fold3,0.00%,10.84%,35.58%,2.60%,2.14%,2.14%,11.77%,5.25%,28.76%,0.21%,0.71%
val - fold3,0.00%,9.65%,35.11%,2.14%,2.16%,3.07%,11.81%,4.57%,30.61%,0.35%,0.54%
train - fold4,0.00%,10.57%,35.75%,2.37%,2.16%,2.37%,11.76%,5.26%,28.90%,0.21%,0.65%
val - fold4,0.00%,10.71%,34.48%,3.06%,2.07%,2.17%,11.82%,4.56%,30.01%,0.37%,0.76%
