In [23]:
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

In [24]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield ``k`` train/test index splits that keep groups whole and labels balanced.

    Groups are assigned greedily, one at a time, to the fold that minimises the
    mean (over labels) of the standard deviation of each label's share across
    the ``k`` folds.  Every sample of a group ends up on the same side of each
    split.

    Args:
        X: Unused; kept so existing callers can keep passing their feature table.
        y: Non-negative integer class labels, one per sample.
        groups: Group identifier per sample, aligned with ``y``.  It is
            iterated several times, so it must not be a one-shot iterator.
        k: Number of folds.
        seed: Seed for the shuffle applied before the (stable) sort by label
            spread, i.e. it decides the order of groups with equal spread.
            ``None`` lets ``random.Random`` pick its own seed.

    Yields:
        ``(train_indices, test_indices)``: two lists of sample positions, one
        pair per fold.
    """
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # Label ids in 0..max(y) that never occur have a total count of zero.
    # Dividing by that count yields nan, which turns every fold score into nan
    # and makes the comparison below always pick fold 0.  Such labels carry no
    # information for balancing, so only labels that are present are scored.
    present_labels = [label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Tentatively add the group to `fold`, score the resulting balance,
        # then undo the addition so the caller can try the next fold.
        y_counts_per_fold[fold] += y_counts
        std_per_label = [
            np.std([y_counts_per_fold[f][label] / y_distr[label] for f in range(k)])
            for label in present_labels
        ]
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # Place the groups with the most uneven label counts first; they are the
    # hardest to balance once the folds start filling up.
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        test_groups = groups_per_fold[i]
        train_groups = all_groups - test_groups

        train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [25]:
# Load the training table.  Its class_id column provides the labels to
# stratify on and its image_id column the groups that must not be split
# across folds.
train_x = pd.read_csv('/opt/ml/detection/object-detection-level2-cv-04/dataset/train_modify.csv')
train_y = train_x['class_id'].values
groups = np.array(train_x['image_id'].values)

def get_distribution(y_vals):
    """Return the share of each label in ``y_vals`` as percentage strings.

    The result has one entry per label id from 0 up to the largest id found
    in ``y_vals``, each formatted with two decimals (e.g. ``'25.00%'``).
    Label ids that do not occur are reported as ``'0.00%'``.
    """
    label_counts = Counter(y_vals)
    total = sum(label_counts.values())
    shares = []
    for label in range(np.max(y_vals) + 1):
        shares.append(f'{label_counts[label] / total:.2%}')
    return shares

In [26]:
# A fixed seed makes the folds identical on every Restart & Run All; without
# it the splitter shuffles with a fresh seed per run and everything derived
# from the folds changes with it.
SPLIT_SEED = 42
N_FOLDS = 5

distrs = [get_distribution(train_y)]
index = ['training set']
# Per fold: sample positions and group ids of the development / validation side.
developement_set = []
validation_set = []
dev_group = []
val_group = []
fold_splits = stratified_group_k_fold(train_x, train_y, groups, k=N_FOLDS, seed=SPLIT_SEED)
for fold_ind, (dev_ind, val_ind) in enumerate(fold_splits):
    dev_y, val_y = train_y[dev_ind], train_y[val_ind]
    dev_groups, val_groups = groups[dev_ind], groups[val_ind]

    # No group may appear on both sides of a split.
    assert len(set(dev_groups) & set(val_groups)) == 0

    distrs.append(get_distribution(dev_y))
    index.append(f'development set - fold {fold_ind}')
    distrs.append(get_distribution(val_y))
    index.append(f'validation set - fold {fold_ind}')
    developement_set.append(dev_ind)
    validation_set.append(val_ind)
    dev_group.append(dev_groups)
    val_group.append(val_groups)

display('Distribution per class:')
pd.DataFrame(distrs, index=index, columns=[f'Label {l}' for l in range(np.max(train_y) + 1)])

'Distribution per class:'

Unnamed: 0,Label 0,Label 1,Label 2,Label 3,Label 4,Label 5,Label 6,Label 7,Label 8,Label 9
training set,17.06%,27.35%,3.89%,4.08%,4.21%,12.79%,5.51%,22.36%,0.69%,2.07%
development set - fold 0,17.06%,27.35%,3.89%,4.08%,4.21%,12.79%,5.51%,22.36%,0.69%,2.07%
validation set - fold 0,17.05%,27.32%,3.89%,4.09%,4.22%,12.79%,5.50%,22.34%,0.70%,2.09%
development set - fold 1,17.06%,27.35%,3.89%,4.08%,4.21%,12.79%,5.51%,22.35%,0.69%,2.07%
validation set - fold 1,17.05%,27.35%,3.90%,4.09%,4.20%,12.78%,5.51%,22.36%,0.70%,2.07%
development set - fold 2,17.06%,27.35%,3.89%,4.08%,4.21%,12.79%,5.51%,22.36%,0.69%,2.07%
validation set - fold 2,17.06%,27.36%,3.90%,4.07%,4.20%,12.79%,5.51%,22.35%,0.70%,2.07%
development set - fold 3,17.06%,27.35%,3.89%,4.08%,4.21%,12.79%,5.50%,22.35%,0.69%,2.07%
validation set - fold 3,17.07%,27.35%,3.88%,4.08%,4.21%,12.79%,5.51%,22.36%,0.68%,2.07%
development set - fold 4,17.06%,27.34%,3.89%,4.08%,4.21%,12.79%,5.51%,22.35%,0.69%,2.07%


In [27]:
# Per fold: distinct groups on the development side, on the validation side,
# and the sum of the two.
for i in range(5):
    devgroup_num, valgroup_num = (
        len(set(fold_groups)) for fold_groups in (dev_group[i], val_group[i])
    )
    print(f"k-{i}", devgroup_num, valgroup_num, devgroup_num + valgroup_num)

k-0 3896 985 4881
k-1 3901 980 4881
k-2 3903 978 4881
k-3 3913 968 4881
k-4 3911 970 4881


In [28]:
# Per fold: number of samples on the development side, on the validation
# side, and the sum of the two.
for i in range(5):
    devset_num, valset_num = map(len, (developement_set[i], validation_set[i]))
    print(f"k-{i}", devset_num, valset_num, devset_num + valset_num)

k-0 18359 4597 22956
k-1 18363 4593 22956
k-2 18365 4591 22956
k-3 18368 4588 22956
k-4 18369 4587 22956


In [29]:
import json

# Read the full annotation file once and keep handles to its 'annotations'
# and 'images' lists, which the per-fold files are built from.
train_json = "/opt/ml/detection/object-detection-level2-cv-04/dataset/train_modify.json"
with open(train_json, mode="r", encoding="utf8") as src_file:
    json_data = json.load(src_file)

base_annotations, base_images = json_data['annotations'], json_data['images']

In [30]:
# Quick check on fold 0: pick the annotations at the development positions
# and print how many there are.
annotations = [base_annotations[row] for row in developement_set[0]]
json_data.update(annotations=annotations)
print(len(annotations))

18359


In [31]:
# Look images up by their id rather than by list position: positional
# indexing silently returns the wrong record as soon as the ids are not
# exactly 0..N-1 in list order.  Records without an 'id' key fall back to
# their position, which matches the previous behaviour.
# NOTE(review): assumes the entries of base_images are dicts whose 'id'
# corresponds to the image_id values used as groups — confirm against the JSON.
images_by_id = {image.get('id', pos): image for pos, image in enumerate(base_images)}

# One training file per fold; the fold count comes from the split itself.
for i in range(len(developement_set)):
    fold_image_ids = sorted(set(dev_group[i]))
    # Build a fresh dict per fold (same key order as json_data) instead of
    # overwriting json_data, so the loaded data stays intact on re-runs.
    fold_data = {
        **json_data,
        'annotations': [base_annotations[index] for index in developement_set[i]],
        'images': [images_by_id[image_id] for image_id in fold_image_ids],
    }
    with open(f'train_{i}_modify.json', 'w', encoding='utf-8') as make_file:
        json.dump(fold_data, make_file, indent="\t")

In [32]:
# Look images up by their id rather than by list position: positional
# indexing silently returns the wrong record as soon as the ids are not
# exactly 0..N-1 in list order.  Records without an 'id' key fall back to
# their position, which matches the previous behaviour.
# NOTE(review): assumes the entries of base_images are dicts whose 'id'
# corresponds to the image_id values used as groups — confirm against the JSON.
images_by_id = {image.get('id', pos): image for pos, image in enumerate(base_images)}

# One validation file per fold; the fold count comes from the split itself.
for i in range(len(validation_set)):
    fold_image_ids = sorted(set(val_group[i]))
    # Build a fresh dict per fold (same key order as json_data) instead of
    # overwriting json_data, so the loaded data stays intact on re-runs.
    fold_data = {
        **json_data,
        'annotations': [base_annotations[index] for index in validation_set[i]],
        'images': [images_by_id[image_id] for image_id in fold_image_ids],
    }
    with open(f'valid_{i}_modify.json', 'w', encoding='utf-8') as make_file:
        json.dump(fold_data, make_file, indent="\t")