In [10]:
import json
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from collections import defaultdict

In [35]:
# Path to the COCO-style train.json annotation file.
annotation = './dataset/train.json'

# Image ids whose labels are known to be wrong; every annotation on them is dropped.
mis_label = [85, 884, 1313, 1633, 1823, 1846, 2381, 2424,
    2702, 2971, 3140, 3207, 3216, 3915, 4284, 4297, 4480, 4753, 4779]

# Optional cap on annotations per image. None (the default) disables the filter;
# set an integer N to also drop every image that carries N or more annotations.
max_annotations_per_image = None

with open(annotation) as f:
    data = json.load(f)

# Number of annotations attached to each image id.
num_annotations = defaultdict(int)
for ann in data['annotations']:
    num_annotations[ann['image_id']] += 1

# Image ids at or above the cap (stays empty while the cap is disabled).
too_much_label = [] if max_annotations_per_image is None else [
    image_id for image_id, count in num_annotations.items()
    if count >= max_annotations_per_image
]

# Both exclusion lists must be combined as a union: `mis_label or too_much_label`
# evaluates to `mis_label` alone whenever it is non-empty, which would silently
# ignore `too_much_label`.
excluded_image_ids = set(mis_label) | set(too_much_label)

# One (image_id, category_id) pair per annotation that survives the filter.
var = [
    (ann['image_id'], ann['category_id'])
    for ann in data['annotations']
    if ann['image_id'] not in excluded_image_ids
]
X = np.ones((len(var), 1))              # placeholder features: one row per kept annotation
y = np.array([v[1] for v in var])       # category_id of each kept annotation
groups = np.array([v[0] for v in var])  # image_id of each kept annotation

print("Wrong label : ", mis_label)
print("Before eliminate mislabel : ", len(data['annotations']))
print("After  eliminate mislabel : ", len(var))

# 5-fold splitter; `y` (category ids) is passed as the stratification target and
# `groups` (image ids) as the grouping key. The fixed seed makes every later
# call to cv.split(...) reproduce the same folds.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=41)

# Preview each fold: image ids on the first line, category ids on the second.
for train_idx, val_idx in cv.split(X, y, groups):
    train_image_ids, train_category_ids = groups[train_idx], y[train_idx]
    val_image_ids, val_category_ids = groups[val_idx], y[val_idx]

    print("TRAIN:", train_image_ids)
    print(" ", train_category_ids)
    print(" TEST:", val_image_ids)
    print(" ", val_category_ids)

Wrong label :  [85, 884, 1313, 1633, 1823, 1846, 2381, 2424, 2702, 2971, 3140, 3207, 3216, 3915, 4284, 4297, 4480, 4753, 4779]
Before eliminate mislabel :  23144
After  eliminate mislabel :  23072
TRAIN: [   0    1    1 ... 4881 4881 4881]
  [0 3 7 ... 7 1 7]
 TEST: [   4    4    4 ... 4882 4882 4882]
  [1 1 1 ... 0 1 1]
TRAIN: [   1    1    1 ... 4882 4882 4882]
  [3 7 4 ... 0 1 1]
 TEST: [   0   10   11 ... 4879 4879 4879]
  [0 0 0 ... 0 7 7]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   5    5    5 ... 4872 4874 4880]
  [7 0 0 ... 6 2 0]
TRAIN: [   0    2    3 ... 4882 4882 4882]
  [0 3 2 ... 0 1 1]
 TEST: [   1    1    1 ... 4881 4881 4881]
  [3 7 4 ... 7 1 7]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   2    3    3 ... 4870 4877 4877]
  [3 2 6 ... 1 7 7]


In [36]:
from collections import Counter
import pandas as pd


In [37]:
def get_distribution(y, num_classes=None):
    """Return the share of each class id in ``y`` as percentage strings.

    Args:
        y: Sequence or array of non-negative integer class ids.
        num_classes: Number of classes to report. Defaults to ``max(y) + 1``.
            Pass it explicitly when comparing subsets, so that a subset missing
            the highest class id still yields a row of the same length.

    Returns:
        A list of ``num_classes`` strings formatted like ``'12.34%'``, where
        position ``i`` is the share of class id ``i``. Classes that do not
        occur (and every class when ``y`` is empty) are reported as ``'0.00%'``.
    """
    labels = np.asarray(y).ravel()
    total = labels.size

    if num_classes is None:
        # With no labels there is no maximum to derive the class count from.
        num_classes = int(labels.max()) + 1 if total else 0

    if total == 0:
        return [f'{0.0:.2%}' for _ in range(num_classes)]

    # .tolist() yields plain Python ints, so lookups by `i` below always match.
    counts = Counter(labels.tolist())
    return [f'{counts[i] / total:.2%}' for i in range(num_classes)]

In [38]:
# Row labels and row values of the distribution table, kept as parallel lists.
# The first row is the class distribution over every kept annotation.
index = ['training set']
distrs = [get_distribution(y)]

In [39]:
# Keep only the baseline "training set" row so that re-running this cell does
# not append the per-fold rows a second time.
del distrs[1:]
del index[1:]

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_y, val_y = y[train_idx], y[val_idx]
    train_gr, val_gr = groups[train_idx], groups[val_idx]

    # An image must never contribute annotations to both sides of a fold.
    assert set(train_gr).isdisjoint(val_gr), \
        f'fold{fold_ind}: image ids shared between train and val'

    # Two rows per fold: class distribution of the train part, then the val part.
    distrs.append(get_distribution(train_y))
    distrs.append(get_distribution(val_y))
    index.append(f'train - fold{fold_ind}')
    index.append(f'val - fold{fold_ind}')

In [40]:
categories = [d['name'] for d in data['categories']]

# Column i of each distribution row is the share of category id i, so resolve
# column names by category id rather than by position in data['categories'].
# An entry without an 'id' falls back to its list position.
category_names = {d.get('id', pos): d['name'] for pos, d in enumerate(data['categories'])}

pd.DataFrame(distrs, index=index, columns=[category_names[i] for i in range(np.max(y) + 1)])

Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
training set,17.10%,27.49%,3.88%,4.04%,4.22%,12.73%,5.45%,22.38%,0.69%,2.03%
train - fold0,17.07%,27.68%,3.92%,4.20%,4.33%,12.45%,5.64%,22.10%,0.67%,1.94%
val - fold0,17.24%,26.67%,3.71%,3.31%,3.71%,13.92%,4.66%,23.59%,0.76%,2.43%
train - fold1,17.04%,27.84%,3.80%,3.86%,3.97%,12.92%,5.15%,22.60%,0.78%,2.04%
val - fold1,17.37%,26.06%,4.20%,4.75%,5.22%,11.94%,6.68%,21.49%,0.33%,1.97%
train - fold2,17.32%,27.03%,3.87%,3.94%,4.36%,13.04%,5.40%,22.28%,0.59%,2.17%
val - fold2,16.33%,29.13%,3.93%,4.37%,3.69%,11.61%,5.63%,22.76%,1.05%,1.51%
train - fold3,16.84%,27.46%,3.91%,4.06%,4.49%,12.40%,5.65%,22.56%,0.78%,1.85%
val - fold3,18.17%,27.59%,3.74%,3.94%,3.10%,14.07%,4.65%,21.67%,0.31%,2.75%
train - fold4,17.25%,27.42%,3.90%,4.11%,3.93%,12.84%,5.42%,22.37%,0.63%,2.14%


In [41]:
# Path prefix for the generated split files; "_train<k>.json" / "_val<k>.json"
# is appended to it for each fold k.
output_filename = "./dataset/K-fold"

In [42]:
# Resolve image records by their COCO id instead of assuming that an image's
# position in data["images"] equals its id.
images_by_id = {image["id"]: image for image in data["images"]}

# Paths of every split file written by this cell.
output_files = []

for idx, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    # `groups` holds one image id per annotation; np.unique collapses it to one
    # entry per image so each image record is written exactly once.
    train_image_ids = np.unique(groups[train_idx]).tolist()
    val_image_ids = np.unique(groups[val_idx]).tolist()

    # Sets give constant-time membership checks in the annotation loop below.
    train_id_set = set(train_image_ids)
    val_id_set = set(val_image_ids)

    train_images = [images_by_id[image_id].copy() for image_id in train_image_ids]
    val_images = [images_by_id[image_id].copy() for image_id in val_image_ids]

    train_annotations, val_annotations = [], []
    for ann in data["annotations"]:
        image_id = ann["image_id"]
        if image_id in val_id_set:
            val_annotations.append(ann.copy())
        elif image_id in train_id_set:
            train_annotations.append(ann.copy())
        # Otherwise the image was filtered out before splitting (for example a
        # mislabelled one), so its annotations belong to neither split.

    train_split = {
        "images": train_images,
        "annotations": train_annotations,
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "categories": data["categories"],
    }

    val_split = {
        "images": val_images,
        "annotations": val_annotations,
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "categories": data["categories"],
    }

    # Fold numbering in the file names starts at 1.
    for split_type, split in zip(["train", "val"], [train_split, val_split]):
        output_files.append(output_filename + f"_{split_type}{idx+1}.json")
        with open(output_files[-1], "w") as f:
            json.dump(split, f, indent=2)

print("Split Done !")