In [5]:
import json
import numpy as np
import os
from sklearn.model_selection import StratifiedGroupKFold

annotation = "./dataset/train.json"

# JSON text is UTF-8 by specification; naming the encoding keeps the load from
# depending on the platform's locale default.
with open(annotation, encoding="utf-8") as f:
    data = json.load(f)

# One (image_id, category_id) pair per annotation, in file order.
var = [(ann['image_id'], ann['category_id']) for ann in data['annotations']]

# X is an all-ones placeholder with one row per annotation; it carries no features.
X = np.ones((len(var), 1))
# y: class id of each annotation (stratification target).
y = np.array([category_id for _, category_id in var])
# groups: owning image id of each annotation (annotations of one image share a group).
groups = np.array([image_id for image_id, _ in var])

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=2024)

# Print, for every fold, the image ids and class ids on each side of the split.
for train_idx, val_idx in cv.split(X, y, groups):
    for label, idx in (("TRAIN:", train_idx), (" TEST:", val_idx)):
        print(label, groups[idx])
        print(" ", y[idx])

TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   7    7   18 ... 4876 4876 4876]
  [9 9 6 ... 0 0 2]
TRAIN: [   0    2    3 ... 4882 4882 4882]
  [0 3 2 ... 0 1 1]
 TEST: [   1    1    1 ... 4879 4879 4879]
  [3 7 4 ... 0 7 7]
TRAIN: [   1    1    1 ... 4881 4881 4881]
  [3 7 4 ... 7 1 7]
 TEST: [   0    3    3 ... 4882 4882 4882]
  [0 2 6 ... 0 1 1]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   5    5    5 ... 4881 4881 4881]
  [7 0 0 ... 7 1 7]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   2   12   12 ... 4868 4874 4878]
  [3 1 1 ... 2 2 0]


In [6]:
# check distribution
from collections import Counter
import pandas as pd

def get_distribution(y, num_classes=None):
    """Return the per-class share of the labels in ``y`` as percentage strings.

    Parameters
    ----------
    y : array-like of int
        One-dimensional collection of class ids; ids are used as list
        positions, so they are expected to be non-negative integers.
    num_classes : int, optional
        Number of classes to report. When omitted it falls back to
        ``max(y) + 1``, so label sets with different maxima produce lists of
        different lengths; pass it explicitly to get aligned rows.

    Returns
    -------
    list of str
        Entry ``i`` is the fraction of labels equal to ``i`` formatted with
        ``:.2%``. Classes that never occur are reported as ``'0.00%'``.
        An empty ``y`` yields ``num_classes`` zero entries (``[]`` when
        ``num_classes`` is omitted) instead of raising.
    """
    labels = np.asarray(y)

    if num_classes is None:
        # Original behavior: infer the class count from the largest label seen.
        num_classes = int(labels.max()) + 1 if labels.size else 0

    if labels.size == 0:
        # Nothing to count; avoid max() of an empty array and a zero division.
        return [f'{0:.2%}'] * num_classes

    # tolist() turns numpy scalars into plain ints so lookups by range index are exact.
    y_distr = Counter(labels.tolist())
    y_vals_sum = sum(y_distr.values())

    return [f'{y_distr[i]/y_vals_sum:.2%}' for i in range(num_classes)]

# First row: class mix over every annotation. Each fold then contributes a
# train row followed by a val row.
index = ['training set']
distrs = [get_distribution(y)]

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_gr = groups[train_idx]
    val_gr = groups[val_idx]
    # Splitting is grouped by image id, so no image may sit on both sides.
    assert set(train_gr).isdisjoint(val_gr)

    train_y = y[train_idx]
    val_y = y[val_idx]
    index += [f'train - fold{fold_ind}', f'val - fold{fold_ind}']
    distrs += [get_distribution(train_y), get_distribution(val_y)]

# Column headers: category names looked up by class id, up to the largest id in y.
categories = [d['name'] for d in data['categories']]
column_names = [categories[class_id] for class_id in range(np.max(y) + 1)]
pd.DataFrame(distrs, index=index, columns=column_names)

Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
train - fold0,17.37%,28.00%,3.90%,4.04%,3.91%,12.58%,5.20%,22.37%,0.72%,1.92%
val - fold0,16.19%,25.20%,3.76%,4.07%,5.59%,13.28%,6.51%,22.39%,0.57%,2.44%
train - fold1,16.99%,27.35%,3.97%,3.80%,4.41%,13.27%,5.59%,21.99%,0.70%,1.93%
val - fold1,17.69%,27.82%,3.53%,4.96%,3.62%,10.62%,4.96%,23.81%,0.64%,2.36%
train - fold2,17.18%,27.98%,3.74%,4.08%,4.18%,12.58%,5.48%,22.12%,0.57%,2.08%
val - fold2,16.94%,25.24%,4.43%,3.90%,4.50%,13.26%,5.38%,23.41%,1.17%,1.76%
train - fold3,17.24%,26.67%,3.94%,4.05%,4.27%,12.66%,5.69%,22.67%,0.78%,2.02%
val - fold3,16.72%,30.48%,3.64%,4.02%,4.13%,12.93%,4.53%,21.23%,0.32%,2.02%
train - fold4,16.89%,27.22%,3.84%,4.25%,4.45%,12.50%,5.34%,22.71%,0.67%,2.15%


### k-fold json 생성

In [7]:
annotation = "./dataset/train.json"

# Reload the source annotations. JSON text is UTF-8 by specification, so the
# encoding is named explicitly instead of relying on the locale default.
with open(annotation, encoding="utf-8") as f:
    data = json.load(f)

# NOTE: this cell relies on `X`, `y`, `groups` and `cv` created in the split
# cell earlier in the notebook; run that cell first.


def _coco_subset(coco, image_ids, categories):
    """Build a dict with only the images/annotations whose image id is in ``image_ids``.

    Keys are emitted in the order 'images', 'annotations', 'categories'.
    """
    return {
        'images': [img for img in coco['images'] if img['id'] in image_ids],
        'annotations': [ann for ann in coco['annotations'] if ann['image_id'] in image_ids],
        'categories': categories,
    }


# Category table written into every fold file (identical for all folds, so it
# is built once instead of once per iteration).
categories_info = [
    {"id": 0, "name": "General trash", "supercategory": "General trash"},
    {"id": 1, "name": "Paper", "supercategory": "Paper"},
    {"id": 2, "name": "Paper pack", "supercategory": "Paper pack"},
    {"id": 3, "name": "Metal", "supercategory": "Metal"},
    {"id": 4, "name": "Glass", "supercategory": "Glass"},
    {"id": 5, "name": "Plastic", "supercategory": "Plastic"},
    {"id": 6, "name": "Styrofoam", "supercategory": "Styrofoam"},
    {"id": 7, "name": "Plastic bag", "supercategory": "Plastic bag"},
    {"id": 8, "name": "Battery", "supercategory": "Battery"},
    {"id": 9, "name": "Clothing", "supercategory": "Clothing"}
]

# Output folder is keyed by the splitter's seed; exist_ok avoids the
# check-then-create race and makes the cell safe to re-run.
json_folder_path = f"./dataset/k-fold-{cv.random_state}"
os.makedirs(json_folder_path, exist_ok=True)

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    # Image ids on each side of this fold (groups holds one image id per annotation).
    train_image_ids = set(groups[train_idx])
    val_image_ids = set(groups[val_idx])

    # NOTE: `groups` is derived from annotations only, so an image without any
    # annotation is in neither id set and is left out of both files.
    train_data = _coco_subset(data, train_image_ids, categories_info)
    val_data = _coco_subset(data, val_image_ids, categories_info)

    # Save train JSON file
    train_json_path = os.path.join(json_folder_path, f"train_fold{fold_ind}.json")
    with open(train_json_path, 'w') as train_file:
        json.dump(train_data, train_file, indent=2)

    # Save validation JSON file
    val_json_path = os.path.join(json_folder_path, f"valid_fold{fold_ind}.json")
    with open(val_json_path, 'w') as val_file:
        json.dump(val_data, val_file, indent=2)