In [1]:
import os
import json
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import StratifiedGroupKFold


In [2]:
# Path to the annotation JSON (dataset file path).
annotation = './dataset/train.json'

# Read the JSON file. The encoding is stated explicitly because open() would
# otherwise fall back to the platform's locale encoding, which is not
# guaranteed to be UTF-8.
with open(annotation, encoding='utf-8') as f:
    data = json.load(f)

# One (image_id, category_id) pair per annotation.
var = [(ann['image_id'], ann['category_id']) for ann in data['annotations']]
# X is a placeholder of ones (one row per annotation); it carries no feature
# information and only gives split() an array of the right length.
X = np.ones((len(data['annotations']), 1))
# Labels to stratify on: the category id of every annotation.
y = np.array([v[1] for v in var])
# Grouping key: the image id of every annotation.
groups = np.array([v[0] for v in var])
# 5 shuffled folds with a fixed seed so the split is reproducible.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=8)

# Print the image ids and category ids that land in each side of every fold.
for train_idx, val_idx in cv.split(X, y, groups):
    print("TRAIN:", groups[train_idx])
    print(" ", y[train_idx])
    print(" TEST:", groups[val_idx])
    print(" ", y[val_idx])


TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   3    3    4 ... 4875 4875 4878]
  [2 6 1 ... 2 2 0]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   8    8    8 ... 4871 4873 4880]
  [6 0 7 ... 1 0 0]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   5    5    5 ... 4881 4881 4881]
  [7 0 0 ... 7 1 7]
TRAIN: [   1    1    1 ... 4881 4881 4881]
  [3 7 4 ... 7 1 7]
 TEST: [   0    6   15 ... 4882 4882 4882]
  [0 1 6 ... 0 1 1]
TRAIN: [   0    3    3 ... 4882 4882 4882]
  [0 2 6 ... 0 1 1]
 TEST: [   1    1    1 ... 4872 4877 4877]
  [3 7 4 ... 6 7 7]


In [3]:
def get_distribution(y, n_classes=None):
    """Return the share of each class id in ``y`` as percentage strings.

    Args:
        y: 1-D sequence or array of non-negative integer class ids.
        n_classes: Number of class ids to report (ids ``0 .. n_classes - 1``).
            When omitted it is ``max(y) + 1``. Pass the dataset-wide class
            count when comparing subsets, so every result has the same
            length even if a subset lacks the highest class ids.

    Returns:
        A list with one string per reported class id, formatted like
        ``'12.34%'``. Percentages are relative to all labels in ``y``;
        classes absent from ``y`` are reported as ``'0.00%'``. An empty
        ``y`` yields ``n_classes`` entries of ``'0.00%'`` (an empty list
        when ``n_classes`` is not given).
    """
    y_distr = Counter(y)
    y_vals_sum = sum(y_distr.values())

    if n_classes is None:
        # np.max raises on empty input, so only consult it when there is data.
        n_classes = int(np.max(y)) + 1 if y_vals_sum else 0

    if y_vals_sum == 0:
        # Nothing to count: avoid dividing by zero.
        return ['0.00%'] * n_classes

    return [f'{y_distr[i]/y_vals_sum:.2%}' for i in range(n_classes)]

# First row: class distribution over every annotation in the loaded file.
distrs = [get_distribution(y)]
index = ['training set']

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    # No image id may appear on both sides of a fold.
    assert set(groups[train_idx]).isdisjoint(groups[val_idx])

    dis_train = get_distribution(y[train_idx])
    dis_val = get_distribution(y[val_idx])

    # Strip the trailing '%' to get the numeric percentages back.
    train_pct = np.array([float(pct[:-1]) for pct in dis_train])
    val_pct = np.array([float(pct[:-1]) for pct in dis_val])

    # Per-class absolute gap in percentage points, with the total appended
    # as the last element so it lands in the 'sum' column.
    dis_diff = np.abs(train_pct - val_pct)
    dis_diff = np.append(dis_diff, sum(dis_diff))

    # The train/val rows have no total, so their 'sum' cell is a placeholder.
    distrs.extend([dis_train + ['None'], dis_val + ['None'], dis_diff])
    index.extend([
        f'train - fold{fold_ind}',
        f'val - fold{fold_ind}',
        f'diff - fold{fold_ind}',
    ])

# Column headers: category names looked up by list position for each class
# id from 0 up to the largest id present in y, followed by 'sum'.
categories = [d['name'] for d in data['categories']]
columns = [categories[i] for i in range(np.max(y) + 1)] + ['sum']
pd.DataFrame(distrs, index=index, columns=columns)


Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing,sum
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%,
train - fold0,17.27%,27.43%,3.61%,4.19%,4.52%,12.31%,5.67%,22.20%,0.68%,2.12%,
val - fold0,16.61%,27.49%,4.91%,3.47%,3.18%,14.29%,4.63%,23.05%,0.72%,1.64%,
diff - fold0,0.66,0.06,1.3,0.72,1.34,1.98,1.04,0.85,0.04,0.48,8.47
train - fold1,17.26%,27.29%,3.96%,3.88%,4.44%,12.94%,5.58%,22.04%,0.66%,1.94%,
val - fold1,16.63%,28.08%,3.51%,4.72%,3.43%,11.80%,4.94%,23.73%,0.79%,2.37%,
diff - fold1,0.63,0.79,0.45,0.84,1.01,1.14,0.64,1.69,0.13,0.43,7.75
train - fold2,17.03%,27.53%,3.85%,3.90%,3.83%,12.83%,5.60%,22.55%,0.80%,2.08%,
val - fold2,17.54%,27.12%,3.98%,4.57%,5.79%,12.28%,4.93%,21.72%,0.26%,1.81%,
diff - fold2,0.51,0.41,0.13,0.67,1.96,0.55,0.67,0.83,0.54,0.27,6.54


In [4]:
# Both output dictionaries start with the dataset-wide sections copied by
# reference from the loaded file; 'images' and 'annotations' are assigned
# per fold when the split files are written.
_shared_keys = ('info', 'licenses', 'categories')

train_data = {key: data[key] for key in _shared_keys}
val_data = {key: data[key] for key in _shared_keys}

In [5]:
# Object arrays so that images/annotations can be selected with index lists.
np_img = np.array(data['images'])
np_ann = np.array(data['annotations'])

# Position of every image inside data['images'], keyed by its id. Looking
# images up by id avoids assuming that image id N is stored at list
# position N, which would silently select the wrong images otherwise.
# NOTE(review): relies on each image dict having an 'id' key that
# annotations reference through 'image_id' - confirm for this dataset.
img_pos = {img['id']: pos for pos, img in enumerate(data['images'])}

# The split files are written next to the source annotation file.
path = os.path.dirname(annotation)

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    # Only images that own at least one annotation in the respective side
    # are included; np.unique also orders them by ascending image id.
    train_img_idx = [img_pos[img_id] for img_id in np.unique(groups[train_idx])]
    val_img_idx = [img_pos[img_id] for img_id in np.unique(groups[val_idx])]

    train_data['images'] = np_img[train_img_idx].tolist()
    train_data['annotations'] = np_ann[train_idx].tolist()
    val_data['images'] = np_img[val_img_idx].tolist()
    val_data['annotations'] = np_ann[val_idx].tolist()

    file_name = os.path.join(path, f'train_{fold_ind}.json')
    with open(file_name, 'w', encoding='utf-8') as outfile:
        json.dump(train_data, outfile, indent=4)

    file_name = os.path.join(path, f'val_{fold_ind}.json')
    with open(file_name, 'w', encoding='utf-8') as outfile:
        json.dump(val_data, outfile, indent=4)