In [70]:
import os
import json
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import StratifiedGroupKFold


In [2]:
# Path to the annotation JSON that will be split into folds.
annotation = './dataset/train.json'

# Read the annotation file. JSON text is UTF-8, so the encoding is given
# explicitly instead of relying on the platform's default locale encoding.
with open(annotation, encoding='utf-8') as f:
    data = json.load(f)

# One (image_id, category_id) pair per annotation.
var = [(ann['image_id'], ann['category_id']) for ann in data['annotations']]
# All-ones placeholder feature matrix with one row per annotation; the split
# below is driven by the labels `y` and the grouping `groups`.
X = np.ones((len(data['annotations']), 1))
# Stratification target: the category of each annotation.
y = np.array([v[1] for v in var])
# Grouping key: the image each annotation belongs to.
groups = np.array([v[0] for v in var])
# Fixed random_state keeps the fold assignment reproducible across runs.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=411)

# Print the image ids and labels that land on each side of every fold.
for train_idx, val_idx in cv.split(X, y, groups):
    print("TRAIN:", groups[train_idx])
    print(" ", y[train_idx])
    print(" TEST:", groups[val_idx])
    print(" ", y[val_idx])


TRAIN: [   0    1    1 ... 4881 4881 4881]
  [0 3 7 ... 7 1 7]
 TEST: [   6   13   13 ... 4882 4882 4882]
  [1 6 7 ... 0 1 1]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   5    5    5 ... 4876 4876 4878]
  [7 0 0 ... 0 2 0]
TRAIN: [   0    3    3 ... 4882 4882 4882]
  [0 2 6 ... 0 1 1]
 TEST: [   1    1    1 ... 4877 4877 4880]
  [3 7 4 ... 7 7 0]
TRAIN: [   1    1    1 ... 4882 4882 4882]
  [3 7 4 ... 0 1 1]
 TEST: [   0    3    3 ... 4881 4881 4881]
  [0 2 6 ... 7 1 7]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   4    4    4 ... 4868 4872 4872]
  [1 1 1 ... 2 4 6]


In [89]:
def get_distribution(y):
    """Return the share of each class label in ``y`` as percentage strings.

    Args:
        y: Sequence of integer class labels.

    Returns:
        A list with one entry per label ``0 .. max(y)``. Entry ``i`` is the
        fraction of items in ``y`` equal to ``i``, formatted with two decimals
        and a trailing ``%`` (for example ``'25.00%'``). Labels in that range
        that never occur are reported as ``'0.00%'``.
    """
    label_counts = Counter(y)
    total = sum(label_counts.values())

    shares = []
    for label in range(np.max(y) + 1):
        shares.append(f'{label_counts[label]/total:.2%}')
    return shares

# First row of the table: class shares over all annotations, before splitting.
distrs = [get_distribution(y)]
index = ['training set']

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X,y, groups)):
    train_y = y[train_idx]
    val_y = y[val_idx]
    train_gr = groups[train_idx]
    val_gr = groups[val_idx]

    # A group (image id) must never appear on both sides of a fold.
    assert set(train_gr).isdisjoint(set(val_gr))

    dis_train = get_distribution(train_y)
    dis_val = get_distribution(val_y)

    # Strip the trailing '%' and compare the two sides numerically; the
    # result is the absolute gap per class, in percentage points.
    train_pct = np.array([float(share[:-1]) for share in dis_train])
    val_pct = np.array([float(share[:-1]) for share in dis_val])
    dis_diff = np.abs(train_pct - val_pct)

    # The last column holds the total gap across all classes.
    dis_diff = np.append(dis_diff, sum(dis_diff))

    # The train/val rows have no total, so they carry a 'None' placeholder.
    distrs.extend([dis_train + ['None'], dis_val + ['None'], dis_diff])
    index.extend([
        f'train - fold{fold_ind}',
        f'val - fold{fold_ind}',
        f'diff - fold{fold_ind}',
    ])


categories = [d['name'] for d in data['categories']]
column_names = [categories[i] for i in range(np.max(y) + 1)] + ['sum']
pd.DataFrame(distrs, index=index, columns=column_names)


Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing,sum
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%,
train - fold0,16.96%,27.45%,3.79%,4.13%,4.48%,12.61%,5.51%,22.28%,0.77%,2.02%,
val - fold0,17.85%,27.42%,4.23%,3.70%,3.26%,13.15%,5.25%,22.77%,0.35%,2.02%,
diff - fold0,0.89,0.03,0.44,0.43,1.22,0.54,0.26,0.49,0.42,0.0,4.72
train - fold1,17.14%,27.24%,4.01%,3.98%,4.28%,12.77%,5.38%,22.32%,0.67%,2.20%,
val - fold1,17.12%,28.17%,3.41%,4.26%,4.12%,12.51%,5.72%,22.57%,0.73%,1.38%,
diff - fold1,0.02,0.93,0.6,0.28,0.16,0.26,0.34,0.25,0.06,0.82,3.72
train - fold2,17.31%,27.39%,3.83%,4.08%,4.13%,12.80%,5.14%,22.68%,0.69%,1.94%,
val - fold2,16.42%,27.68%,4.05%,3.88%,4.70%,12.36%,6.76%,21.12%,0.69%,2.35%,
diff - fold2,0.89,0.29,0.22,0.2,0.57,0.44,1.62,1.56,0.0,0.41,6.2


In [22]:
# Each split file starts from the same metadata sections as the source
# annotation file; 'images' and 'annotations' are filled in per fold later.
train_data = {key: data[key] for key in ('info', 'licenses', 'categories')}

# Separate dict for the validation split, holding the same metadata values.
val_data = dict(train_data)

In [88]:
# Object arrays so that images and annotations can be selected with the
# integer index arrays produced by the splitter.
np_img = np.array(data['images'])
np_ann = np.array(data['annotations'])

# Fold files are written next to the source annotation file.
path = os.path.split(annotation)[0]

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X,y, groups)):
    # NOTE(review): images are picked by using each image_id as a position in
    # data['images']; this assumes image ids equal list positions - confirm.
    train_data['images'] = np_img[np.unique(groups[train_idx])].tolist()
    train_data['annotations'] = np_ann[train_idx].tolist()
    val_data['images'] = np_img[np.unique(groups[val_idx])].tolist()
    val_data['annotations'] = np_ann[val_idx].tolist()

    file_name = os.path.join(path, f'train_{fold_ind}.json')
    with open(file_name, 'w') as outfile:
        json.dump(train_data, outfile, indent=4)

    # Write the validation split from val_data. Previously train_data was
    # dumped here, so every val_*.json was a copy of the training split.
    file_name = os.path.join(path , f'val_{fold_ind}.json')
    with open(file_name, 'w') as outfile:
        json.dump(val_data, outfile, indent=4)