In [1]:
import json
import os
import funcy
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

In [3]:
# Directory where the per-fold annotation files are written
save_annotation_path = '../../dataset'

# Source annotation file (COCO-style layout: info / licenses / images / categories / annotations)
annotation = '../../dataset/train.json'

# Read as UTF-8 explicitly: the platform default encoding is locale-dependent,
# while the fold files produced by this notebook are written as UTF-8.
with open(annotation, encoding='utf-8') as f:
    data = json.load(f)

# Pull out the top-level sections once the file has been closed.
info = data['info']
licences = data['licenses']  # spelling of the variable kept: later cells refer to `licences`
images = data['images']
categories = data['categories']
anns = data['annotations']

In [4]:
# Save the given sections to `file` as a JSON document
def save_coco(file, info, licenses, images, annotations, categories):
    """Serialize the five annotation sections into one JSON file.

    Args:
        file: Destination path; it is created or overwritten.
        info: Value stored under the 'info' key.
        licenses: Value stored under the 'licenses' key.
        images: Value stored under the 'images' key.
        annotations: Value stored under the 'annotations' key.
        categories: Value stored under the 'categories' key.

    The document is written as UTF-8 text with a 2-space indent, and the keys
    keep the order info, licenses, images, annotations, categories.
    """
    payload = {
        'info': info,
        'licenses': licenses,
        'images': images,
        'annotations': annotations,
        'categories': categories,
    }
    with open(file, 'wt', encoding='UTF-8') as coco:
        json.dump(payload, coco, indent=2, sort_keys=False)

def filter_annotations(annotations, images):
    """Keep only the annotations that point at one of the given images.

    Args:
        annotations: Iterable of annotation dicts with an 'image_id' entry.
        images: Iterable of image dicts with an 'id' entry.

    Returns:
        A list of the annotations (original order, same objects) whose
        int-converted 'image_id' equals the int-converted 'id' of some image.
    """
    # A set makes each membership test O(1); a list lookup here would make the
    # whole filter O(len(annotations) * len(images)).
    image_ids = {int(image['id']) for image in images}
    return [ann for ann in annotations if int(ann['image_id']) in image_ids]

def filter_images(images, annotations):
    """Keep only the images that are referenced by at least one annotation.

    Args:
        images: Iterable of image dicts with an 'id' entry.
        annotations: Iterable of annotation dicts with an 'image_id' entry.

    Returns:
        A list of the images (original order, same objects) whose
        int-converted 'id' equals the int-converted 'image_id' of some
        annotation.
    """
    # Collapse the (often repeated) image ids into a set so each lookup is
    # O(1) instead of a linear scan over every annotation.
    referenced_ids = {int(ann['image_id']) for ann in annotations}
    return [image for image in images if int(image['id']) in referenced_ids]

In [5]:
# One (image_id, category_id) pair per annotation.
var = [(ann['image_id'], ann['category_id']) for ann in anns]
X = np.ones((len(anns), 1))  # dummy features
y = np.array([v[1] for v in var])   # category_id (stratification target)
groups = np.array([v[0] for v in var])  # group (image_id)

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=411)

for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    for split_name, split_idx in (('train', train_idx), ('val', val_idx)):
        # The split yields positions into `anns`; plain lists are enough for
        # the filter helpers and for JSON serialization.
        split_anns = [anns[ann_pos] for ann_pos in split_idx]
        split_file_name = f'{split_name}_kfold_{fold_idx}.json'

        save_coco(
            os.path.join(save_annotation_path, split_file_name),
            info,
            licences,
            filter_images(images, split_anns),
            filter_annotations(split_anns, images),
            # Read the category dicts from `data` directly so the saved files
            # stay correct even if the `categories` name is rebound elsewhere
            # in the notebook before this cell is re-run.
            data['categories'],
        )
        print(f'{fold_idx} {split_name} annotation saved as {split_file_name}')
    print('')

0 train annotation saved as train_kfold_0.json
0 val annotation saved as val_kfold_0.json

1 train annotation saved as train_kfold_1.json
1 val annotation saved as val_kfold_1.json

2 train annotation saved as train_kfold_2.json
2 val annotation saved as val_kfold_2.json

3 train annotation saved as train_kfold_3.json
3 val annotation saved as val_kfold_3.json

4 train annotation saved as train_kfold_4.json
4 val annotation saved as val_kfold_4.json



In [6]:
# check distribution
import pandas as pd
from collections import Counter

def get_distribution(y, num_classes=None):
    """Return the share of each integer label in `y` as percentage strings.

    Args:
        y: Sequence or array of non-negative integer labels.
        num_classes: Number of entries to report (labels 0 .. num_classes - 1).
            Defaults to ``max(y) + 1``. Pass it explicitly when comparing
            subsets, so a subset that lacks the highest label still yields a
            row of the same length.

    Returns:
        A list of strings formatted with two decimals (e.g. '12.50%'), one per
        label. Labels that never occur are reported as '0.00%'. Empty input
        yields '0.00%' for every label ([] when `num_classes` is not given)
        instead of raising.
    """
    labels = np.asarray(y)
    if num_classes is None:
        # np.max raises on an empty array, so guard the default explicitly.
        num_classes = int(labels.max()) + 1 if labels.size else 0

    y_distr = Counter(labels.tolist())
    y_vals_sum = sum(y_distr.values())
    if y_vals_sum == 0:
        return ['0.00%'] * num_classes

    return [f'{y_distr[i]/y_vals_sum:.2%}' for i in range(num_classes)]

# First row: label distribution over all annotations.
distrs = [get_distribution(y)]
index = ['training set']

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_y, val_y = y[train_idx], y[val_idx]
    train_gr, val_gr = groups[train_idx], groups[val_idx]

    # No group (image_id) may appear in both splits of the same fold.
    assert set(train_gr).isdisjoint(val_gr), \
        f'fold {fold_ind}: a group appears in both train and val'

    distrs.append(get_distribution(train_y))
    index.append(f'train - fold{fold_ind}')
    distrs.append(get_distribution(val_y))
    index.append(f'val - fold{fold_ind}')

# Use a dedicated name instead of rebinding `categories`, which was loaded
# earlier as the list of category dicts. Column i of each row is the share of
# category_id i, so label columns by id rather than by list position.
category_name_by_id = {c['id']: c['name'] for c in data['categories']}
pd.DataFrame(
    distrs,
    index=index,
    columns=[category_name_by_id.get(i, str(i)) for i in range(np.max(y) + 1)],
)

Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
train - fold0,16.96%,27.45%,3.79%,4.13%,4.48%,12.61%,5.51%,22.28%,0.77%,2.02%
val - fold0,17.85%,27.42%,4.23%,3.70%,3.26%,13.15%,5.25%,22.77%,0.35%,2.02%
train - fold1,17.14%,27.24%,4.01%,3.98%,4.28%,12.77%,5.38%,22.32%,0.67%,2.20%
val - fold1,17.12%,28.17%,3.41%,4.26%,4.12%,12.51%,5.72%,22.57%,0.73%,1.38%
train - fold2,17.31%,27.39%,3.83%,4.08%,4.13%,12.80%,5.14%,22.68%,0.69%,1.94%
val - fold2,16.42%,27.68%,4.05%,3.88%,4.70%,12.36%,6.76%,21.12%,0.69%,2.35%
train - fold3,17.30%,27.47%,3.87%,4.06%,4.22%,12.63%,5.49%,22.39%,0.63%,1.95%
val - fold3,16.50%,27.36%,3.88%,3.99%,4.33%,13.07%,5.33%,22.30%,0.92%,2.32%
train - fold4,16.97%,27.67%,3.88%,3.97%,4.10%,12.77%,5.76%,22.20%,0.68%,2.00%
