In [1]:
import json
import random
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold

In [35]:
def random_split_dataset(root_folder, val_ratio, random_seed):
    """Randomly split ``train.json`` into a training and a validation file.

    Reads ``<root_folder>/train.json`` (a dict with the keys ``info``,
    ``licenses``, ``images``, ``annotations`` and ``categories``), shuffles
    the image ids and writes two files next to the input:

    * ``train_split.json`` -- images/annotations kept for training
    * ``val.json``         -- images/annotations held out for validation

    Annotations follow the image they reference through ``image_id``.

    Args:
        root_folder: Directory that contains ``train.json``; outputs are
            written to the same directory.
        val_ratio: Fraction of the images assigned to validation. The count
            is ``int(len(images) * val_ratio)``, i.e. truncated.
        random_seed: Seed passed to ``random.seed``. Note that this reseeds
            the module-level ``random`` generator.
    """
    # Reseed the global generator so the shuffle below is repeatable.
    random.seed(random_seed)

    with open(os.path.join(root_folder, 'train.json')) as src:
        coco = json.load(src)

    all_images = coco['images']
    all_annotations = coco['annotations']

    shuffled_ids = [img.get('id') for img in all_images]
    random.shuffle(shuffled_ids)

    # The first `cut` shuffled ids become validation, the rest training.
    cut = int(len(shuffled_ids) * val_ratio)
    val_ids = set(shuffled_ids[:cut])
    train_ids = set(shuffled_ids[cut:])

    # Partition images, keeping the order they had in the input file.
    train_images, val_images = [], []
    for img in all_images:
        img_id = img.get('id')
        if img_id in train_ids:
            train_images.append(img)
        if img_id in val_ids:
            val_images.append(img)

    # Partition annotations by the image they belong to.
    train_annotations, val_annotations = [], []
    for ann in all_annotations:
        owner_id = ann.get('image_id')
        if owner_id in train_ids:
            train_annotations.append(ann)
        if owner_id in val_ids:
            val_annotations.append(ann)

    def as_payload(image_subset, annotation_subset):
        # Metadata sections are copied unchanged into every output file.
        return {
            'info': coco['info'],
            'licenses': coco['licenses'],
            'images': image_subset,
            'annotations': annotation_subset,
            'categories': coco['categories'],
        }

    outputs = (
        ('train_split.json', as_payload(train_images, train_annotations)),
        ('val.json', as_payload(val_images, val_annotations)),
    )

    for file_name, payload in outputs:
        with open(os.path.join(root_folder, file_name), 'w') as dst:
            json.dump(payload, dst, indent=4)


In [2]:
# Hold out 20% of the images as a validation set, seeded for repeatability.
random_split_dataset(
    root_folder='../../dataset/',
    val_ratio=0.2,
    random_seed=42,
)

In [2]:
def _select_by_image_ids(data, keep_ids):
    """Return a copy of the dataset dict restricted to the images in ``keep_ids``."""
    return {
        'info': data['info'],
        'licenses': data['licenses'],
        'images': [img for img in data['images'] if img.get('id') in keep_ids],
        'categories': data['categories'],
        'annotations': [
            ann for ann in data['annotations'] if ann.get('image_id') in keep_ids
        ],
    }


def kfold_split_dataset(input_json, kfold, random_seed, output_dir='../../dataset'):
    """Write stratified, image-grouped K-fold train/val splits of a dataset.

    Every annotation is one sample: its ``category_id`` is the stratification
    label and its ``image_id`` is the group, so ``StratifiedGroupKFold`` never
    places annotations of one image on both sides of a fold. For each fold
    ``i`` two files are written into ``output_dir``:
    ``train_fold_{i}.json`` and ``val_fold_{i}.json``.

    Because the image ids come from the annotations, an image that has no
    annotation is not written to either file of any fold.

    Args:
        input_json: Path to a JSON file with the keys ``info``, ``licenses``,
            ``images``, ``categories`` and ``annotations``.
        kfold: Number of folds (``n_splits``).
        random_seed: ``random_state`` used for the shuffled split.
        output_dir: Directory receiving the fold files. Defaults to the
            location that used to be hard-coded; it is created if missing.

    Prints, per fold, the number of distinct train/val images and the number
    of train/val annotations.
    """
    with open(input_json) as f:
        data = json.load(f)

    annotations = data['annotations']

    # One row per annotation; X is a placeholder because only y/groups matter.
    groups = np.array([ann['image_id'] for ann in annotations])
    y = np.array([ann['category_id'] for ann in annotations])
    X = np.zeros((len(annotations), 1))

    fold = StratifiedGroupKFold(n_splits=kfold, shuffle=True, random_state=random_seed)

    os.makedirs(output_dir, exist_ok=True)

    for idx, (train_idx, val_idx) in enumerate(fold.split(X, y, groups)):
        # Sets of plain Python ids: O(1) membership instead of a linear scan
        # over a NumPy array for every image and every annotation.
        train_ids = set(groups[train_idx].tolist())
        val_ids = set(groups[val_idx].tolist())

        print('images_length : ', len(train_ids), len(val_ids))
        print("annotation", len(train_idx), len(val_idx))

        train_data = _select_by_image_ids(data, train_ids)
        val_data = _select_by_image_ids(data, val_ids)

        with open(os.path.join(output_dir, f'train_fold_{idx}.json'), 'w') as f:
            json.dump(train_data, f, indent=4)

        with open(os.path.join(output_dir, f'val_fold_{idx}.json'), 'w') as f:
            json.dump(val_data, f, indent=4)


In [3]:
# Build 5 stratified folds; annotations of one image always stay together.
kfold_split_dataset(
    input_json='../../dataset/train.json',
    kfold=5,
    random_seed=42,
)

images_length :  3908 975
annotation 18808 4336
images_length :  3907 976
annotation 18052 5092
images_length :  3903 980
annotation 18438 4706
images_length :  3900 983
annotation 18573 4571
images_length :  3914 969
annotation 18705 4439
