# StratifiedGroupKFold v2

쓰레기 데이터셋 Train/Valid Set 분할 코드

In [1]:
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import json
from sklearn.model_selection import StratifiedGroupKFold
from copy import deepcopy

In [2]:
def generate_split(train_idx, val_idx):
    """Build train/valid annotation dicts from annotation indices.

    Reads the module-level ``data`` dict (the loaded annotation JSON) and
    does not modify it.

    Note the difference between *idx* and *id*: an idx is a position in the
    ``data['annotations']`` list, whereas an id is the identifier stored
    inside an image/annotation record.

    Args:
        train_idx: Positions in ``data['annotations']`` for the train split.
        val_idx: Positions in ``data['annotations']`` for the valid split.

    Returns:
        A ``(train_data, val_data)`` tuple. Each element has the same
        top-level keys as ``data`` (in the same order); ``'annotations'``
        holds the selected records and ``'images'`` holds the images
        referenced by those records, in their original order.

    Raises:
        KeyError: If ``data`` has no ``'images'`` or ``'annotations'`` key.
    """
    all_images = data['images']
    all_annotations = data['annotations']

    def _subset(indices):
        # Records are selected straight from the lists instead of going
        # through a DataFrame: a DataFrame fills keys that are missing from
        # some records with NaN, which json.dump would then emit as invalid
        # JSON, and it makes label-based/positional indexing easy to mix up.
        annotations = [dict(all_annotations[i]) for i in indices]
        image_ids = {anno['image_id'] for anno in annotations}

        split = {}
        for key, value in data.items():
            if key == 'images':
                split[key] = [
                    dict(img) for img in all_images if img['id'] in image_ids
                ]
            elif key == 'annotations':
                split[key] = annotations
            else:
                # Only the (small) metadata sections are deep-copied; the
                # large lists above are rebuilt, so copying them is wasted.
                split[key] = deepcopy(value)
        return split

    return _subset(train_idx), _subset(val_idx)

In [3]:
annotation = '/opt/ml/dataset/train.json'

# JSON text is UTF-8, so do not depend on the platform's default encoding.
with open(annotation, encoding='utf-8') as f:
    data = json.load(f)

# One (image_id, category_id) pair per annotation (bounding box).
var = [(ann['image_id'], ann['category_id']) for ann in data['annotations']]
X = np.ones((len(data['annotations']), 1))  # X: dummy values
y = np.array([v[1] for v in var])           # y: class of each box
groups = np.array([v[0] for v in var])      # groups: image each box belongs to

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=411)

print('전체 Box 개수:', len(X))
# train_idx / val_idx are positions in the annotation list, NOT annotation ids.
for i, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    print(f'Fold {i} - {len(train_idx)}, {len(val_idx)}')
    train_data, val_data = generate_split(train_idx, val_idx)

    with open(f'/opt/ml/dataset/train-kfold-{i}.json', 'w', encoding='utf-8') as f:
        json.dump(train_data, f, indent=2)

    with open(f'/opt/ml/dataset/val-kfold-{i}.json', 'w', encoding='utf-8') as f:
        json.dump(val_data, f, indent=2)

전체 Box 개수: 23144
Fold 0 - 18633, 4511
Fold 1 - 18075, 5069
Fold 2 - 18631, 4513
Fold 3 - 18454, 4690
Fold 4 - 18783, 4361


# 분포 확인

In [4]:
# check distribution

def get_distribution(y, num_classes=None):
    """Return the share of each class in *y* as percentage strings.

    Args:
        y: Sequence (list or 1-D array) of non-negative integer class labels.
        num_classes: Number of classes to report. When ``None`` (default) it
            is inferred as ``max(y) + 1``. Pass it explicitly when *y* is a
            subset that may not contain the highest class id, so that every
            call returns a list of the same length.

    Returns:
        A list of ``num_classes`` strings formatted with ``.2%``, where entry
        ``i`` is the fraction of labels equal to ``i``. An empty *y* yields
        ``'0.00%'`` for every class (an empty list when ``num_classes`` is
        also ``None``).
    """
    counts = Counter(y)
    total = sum(counts.values())

    if num_classes is None:
        # np.max raises on empty input, so only infer when there are labels.
        num_classes = int(np.max(y)) + 1 if total else 0

    if not total:
        # Avoid dividing by zero when there are no labels at all.
        return ['0.00%'] * num_classes

    return [f'{counts[i] / total:.2%}' for i in range(num_classes)]

# First row: class distribution over the full annotation set.
distrs = [get_distribution(y)]
index = ['training set']

# Same splitter and random_state as used for writing the fold files, so the
# folds inspected here are regenerated from the same configuration.
for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_y, val_y = y[train_idx], y[val_idx]

    # Sanity check: no image may contribute boxes to both sides of a fold.
    shared_groups = np.intersect1d(groups[train_idx], groups[val_idx])
    if shared_groups.size:
        raise ValueError(
            f'fold {fold_ind}: {shared_groups.size} image(s) appear in both train and val'
        )

    distrs.append(get_distribution(train_y))
    distrs.append(get_distribution(val_y))
    index.append(f'train - fold{fold_ind}')
    index.append(f'val - fold{fold_ind}')

categories = [d['name'] for d in data['categories']]
# Label columns by category id rather than by position in the category list;
# a record without an 'id' key falls back to its list position.
category_names = {
    d.get('id', pos): d['name'] for pos, d in enumerate(data['categories'])
}
pd.DataFrame(
    distrs,
    index=index,
    columns=[category_names.get(i, f'class {i}') for i in range(np.max(y) + 1)],
)

Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
train - fold0,16.96%,27.45%,3.79%,4.13%,4.48%,12.61%,5.51%,22.28%,0.77%,2.02%
val - fold0,17.85%,27.42%,4.23%,3.70%,3.26%,13.15%,5.25%,22.77%,0.35%,2.02%
train - fold1,17.14%,27.24%,4.01%,3.98%,4.28%,12.77%,5.38%,22.32%,0.67%,2.20%
val - fold1,17.12%,28.17%,3.41%,4.26%,4.12%,12.51%,5.72%,22.57%,0.73%,1.38%
train - fold2,17.31%,27.39%,3.83%,4.08%,4.13%,12.80%,5.14%,22.68%,0.69%,1.94%
val - fold2,16.42%,27.68%,4.05%,3.88%,4.70%,12.36%,6.76%,21.12%,0.69%,2.35%
train - fold3,17.30%,27.47%,3.87%,4.06%,4.22%,12.63%,5.49%,22.39%,0.63%,1.95%
val - fold3,16.50%,27.36%,3.88%,3.99%,4.33%,13.07%,5.33%,22.30%,0.92%,2.32%
train - fold4,16.97%,27.67%,3.88%,3.97%,4.10%,12.77%,5.76%,22.20%,0.68%,2.00%
