In [1]:
import json
import pandas as pd
from collections import defaultdict

target = '/opt/ml/segmentation/input/data/train_all.json'
category = {}
file_info = {}
make_frame = defaultdict(list)

# Load the annotation file located at `target`. The handle is only needed for
# parsing, so it is closed before any processing starts.
with open(target, 'r') as f:
    json_datas = json.load(f)  # parsed into plain Python dicts / lists

print(json_datas.keys())
# dict_keys(['info', 'licenses', 'images', 'categories', 'annotations'])

# Keep only the id and file path of every image, keyed by image id.
for item in json_datas['images']:
    file_info[item['id']] = {'id': item['id'], 'file_name': item['file_name']}

# Map category id -> category name.
for item in json_datas['categories']:
    category[item['id']] = item['name']

# Join every annotation with the image it belongs to. One output row per
# annotation: image id, image file name and the annotation's category id.
# Values are stored column-wise ('key': [item1, item2, ...]) so the dict can be
# turned into a DataFrame directly. The entries of `file_info` are only read
# here; they are not modified.
for annotation in json_datas['annotations']:
    image = file_info[annotation['image_id']]
    make_frame['id'].append(image['id'])
    make_frame['file_name'].append(image['file_name'])
    make_frame['class'].append(annotation['category_id'])

# Number of annotations; the frame built below must have exactly this many rows.
print(len(json_datas['annotations']))

# dictionary to DataFrame
df = pd.DataFrame.from_dict(make_frame)
assert len(df) == len(json_datas['annotations'])

# Optionally save as CSV without the index column:
# df.to_csv('detection_info.csv',index=False)
df

dict_keys(['info', 'licenses', 'images', 'categories', 'annotations'])
26240


Unnamed: 0,id,file_name,class
0,0,batch_01_vt/0002.jpg,8
1,0,batch_01_vt/0002.jpg,8
2,0,batch_01_vt/0002.jpg,6
3,0,batch_01_vt/0002.jpg,5
4,0,batch_01_vt/0002.jpg,5
...,...,...,...
26235,3270,batch_03/0997.jpg,2
26236,3270,batch_03/0997.jpg,2
26237,3270,batch_03/0997.jpg,2
26238,3270,batch_03/0997.jpg,2


In [2]:
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

In [3]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield ``k`` train/test splits that never split a group across folds.

    Every group (all samples sharing one value in ``groups``) is assigned to
    exactly one fold. Groups are visited in order of decreasing standard
    deviation of their per-label counts and each one is placed greedily into
    the fold that minimises the mean, over labels, of the standard deviation
    of that label's share across the folds.

    Args:
        X: Unused by this implementation.
        y: Sequence of hashable, mutually sortable class labels, one per
            sample. Labels may be any values (they need not start at 1 or be
            contiguous).
        groups: Sequence of hashable group keys aligned with ``y``.
        k: Number of folds, must be at least 1.
        seed: Seed for the shuffle applied before the (stable) sort, which
            decides the order of groups with equal spread. ``None`` gives a
            different split on every call.

    Yields:
        ``(train_indices, test_indices)`` for each fold: two lists of
        positional indices into ``groups``.

    Raises:
        ValueError: If ``k`` is smaller than 1 (raised when iteration starts).
    """
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}')

    # Map each distinct label to a dense column index. Deriving the columns
    # from the labels that actually occur (instead of `np.max(y)` and
    # `label - 1`) keeps label 0 from wrapping around to the last column and
    # guarantees every column has a non-zero total, so the share computed
    # below never divides by zero.
    label_to_col = {label: col for col, label in enumerate(sorted(set(y)))}
    labels_num = len(label_to_col)

    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()

    for label, g in zip(y, groups):
        col = label_to_col[label]
        y_counts_per_group[g][col] += 1
        y_distr[col] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Tentatively add the group's counts to `fold`, score the resulting
        # assignment, then undo the addition.
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for col in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][col] / y_distr[col] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            # Strict '<' means ties go to the lowest-numbered fold.
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for fold in range(k):
        test_groups = groups_per_fold[fold]
        train_groups = all_groups - test_groups

        train_indices = [pos for pos, g in enumerate(groups) if g in train_groups]
        test_indices = [pos for pos, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [4]:
# Inputs for the stratified group split: the annotation table itself, the
# image file name of every row (rows of one image form one group) and the
# class label of every row.
train_x = df
groups = df['file_name'].values
train_y = df['class'].values

def get_distribution(y_vals, n_labels=None):
    """Return the share of each integer label in ``y_vals`` as percent strings.

    Args:
        y_vals: Sequence of non-negative integer labels.
        n_labels: Number of entries to return (labels ``0 .. n_labels - 1``).
            Defaults to ``max(y_vals) + 1``. Pass a fixed value when comparing
            several subsets so every result has the same length even if a
            subset is missing the highest label.

    Returns:
        A list of strings formatted with two decimals and a percent sign,
        one per label. Empty input yields ``'0.00%'`` for every requested
        label (an empty list when ``n_labels`` is not given).
    """
    y_distr = Counter(y_vals)
    y_vals_sum = sum(y_distr.values())

    if n_labels is None:
        # np.max raises on empty input, so only call it when there is data.
        n_labels = np.max(y_vals) + 1 if y_vals_sum else 0

    if y_vals_sum == 0:
        return ['0.00%'] * n_labels

    return [f'{y_distr[i] / y_vals_sum:.2%}' for i in range(n_labels)]

In [5]:
# Collect the label distribution of the full set and of every fold's
# development / validation part, together with a row label for each.
distrs = [get_distribution(train_y)]
index = ['training set']

print(f"distrs = {distrs}")

# Use the same k and seed as the cell that writes the fold files, so the
# distributions shown here describe exactly the folds that get exported
# (without a seed every call produces a different split).
for fold_ind, (dev_ind, val_ind) in enumerate(stratified_group_k_fold(train_x, train_y, groups, k=5, seed=2021)):
    dev_y, val_y = train_y[dev_ind], train_y[val_ind]

    # No image may contribute rows to both sides of a fold.
    dev_groups, val_groups = groups[dev_ind], groups[val_ind]
    assert len(set(dev_groups) & set(val_groups)) == 0

    distrs.append(get_distribution(dev_y))
    index.append(f'development set - fold {fold_ind}')
    distrs.append(get_distribution(val_y))
    index.append(f'validation set - fold {fold_ind}')

distrs = [['0.00%', '10.60%', '35.48%', '2.51%', '2.14%', '2.32%', '11.78%', '5.12%', '29.13%', '0.24%', '0.67%']]


In [13]:
# Show the collected distributions as a table: one row per (sub)set, one
# column per label id from 0 up to the largest label in the training data.
print('Distribution per class:')
label_columns = [f'Label {label_id}' for label_id in range(np.max(train_y) + 1)]
pd.DataFrame(distrs, index=index, columns=label_columns)

Distribution per class:


Unnamed: 0,Label 0,Label 1,Label 2,Label 3,Label 4,Label 5,Label 6,Label 7,Label 8,Label 9,Label 10
training set,0.00%,10.60%,35.48%,2.51%,2.14%,2.32%,11.78%,5.12%,29.13%,0.24%,0.67%
development set - fold 0,0.00%,10.60%,35.49%,2.51%,2.14%,2.33%,11.78%,5.12%,29.13%,0.24%,0.67%
validation set - fold 0,0.00%,10.61%,35.47%,2.51%,2.15%,2.32%,11.77%,5.12%,29.11%,0.25%,0.69%
development set - fold 1,0.00%,10.60%,35.49%,2.51%,2.14%,2.33%,11.78%,5.12%,29.13%,0.24%,0.67%
validation set - fold 1,0.00%,10.61%,35.46%,2.51%,2.15%,2.32%,11.77%,5.12%,29.12%,0.25%,0.69%
development set - fold 2,0.00%,10.60%,35.48%,2.51%,2.14%,2.32%,11.78%,5.12%,29.13%,0.24%,0.68%
validation set - fold 2,0.00%,10.59%,35.48%,2.52%,2.13%,2.32%,11.78%,5.13%,29.13%,0.25%,0.67%
development set - fold 3,0.00%,10.60%,35.48%,2.51%,2.14%,2.32%,11.77%,5.12%,29.13%,0.24%,0.68%
validation set - fold 3,0.00%,10.60%,35.50%,2.52%,2.14%,2.33%,11.78%,5.11%,29.13%,0.23%,0.67%
development set - fold 4,0.00%,10.60%,35.48%,2.51%,2.14%,2.32%,11.77%,5.12%,29.12%,0.24%,0.68%


In [7]:
def create_json(filename, fold_index, img_idx, ann_idx,
                output_dir='/opt/ml/segmentation/input/data', data=None):
    """Write one side (e.g. train or val) of one fold as an annotation JSON file.

    The output has the same top-level keys as the input ('info', 'licenses',
    'images', 'categories', 'annotations'). 'images' and 'annotations' are
    filtered; the kept annotations get new consecutive ids starting at 0.
    The input data is not modified.

    Args:
        filename: Prefix of the output file name, e.g. 'train' or 'val'.
        fold_index: Zero-based fold number; the file name uses fold_index + 1.
        img_idx: Collection of image ``id`` values to keep.
        ann_idx: Collection of positions in ``data['annotations']`` to keep.
        output_dir: Directory the file is written to.
        data: Parsed annotation dict. Defaults to the module-level
            ``json_datas``.
    """
    if data is None:
        data = json_datas

    # Sets give constant-time membership tests; `ann_idx` typically arrives as
    # a long list.
    image_ids = set(img_idx)
    annotation_positions = set(ann_idx)

    result = {}
    result['info'] = data['info']
    result['licenses'] = data['licenses']
    # Select images by their own id rather than by their position in the list,
    # so the selection stays correct when ids are not equal to positions.
    result['images'] = [image for image in data['images'] if image['id'] in image_ids]
    result['categories'] = data['categories']

    kept_annotations = (
        annotation
        for position, annotation in enumerate(data['annotations'])
        if position in annotation_positions
    )
    # Renumber on shallow copies so the source annotations keep their ids.
    result['annotations'] = [
        dict(annotation, id=new_id) for new_id, annotation in enumerate(kept_annotations)
    ]

    with open(f'{output_dir}/{filename}_{fold_index+1}.json', 'w') as outfile:
        json.dump(result, outfile, indent=2)

In [8]:
def create_excel(filename, fold_index, ann_idx,
                 output_dir='/opt/ml/segmentation/input/data', frame=None):
    """Write the selected rows of the annotation table to a CSV file.

    Despite the function name, the output is CSV (written without the index
    column), named ``k-fold_<filename>_<fold_index + 1>.csv``.

    Args:
        filename: Part of the output file name, e.g. 'train' or 'val'.
        fold_index: Zero-based fold number; the file name uses fold_index + 1.
        ann_idx: Positional row indices to select (passed to ``.iloc``).
        output_dir: Directory the file is written to.
        frame: DataFrame to select rows from. Defaults to the module-level
            ``df``.
    """
    if frame is None:
        frame = df
    selected_rows = frame.iloc[ann_idx]
    selected_rows.to_csv(f'{output_dir}/k-fold_{filename}_{fold_index+1}.csv', index=False)

In [9]:
import json
import pandas as pd
from collections import defaultdict

target = '/opt/ml/segmentation/input/data/train_all.json'

# Re-read the annotation file so the export starts from the data on disk.
with open(target, 'r') as f:
    json_datas = json.load(f)  # parsed into plain Python dicts / lists

# The file is fully parsed at this point, so the (long-running) export runs
# after the handle has been closed.
for k_fold_index, (train_indexes, val_indexes) in enumerate(stratified_group_k_fold(train_x, train_y, groups, k=5, seed=2021)):
    print("Before create json")
    # The 'id' column of df holds image ids; collect the ids on each side.
    train_image_ids = set(df.iloc[train_indexes]['id'].values)
    val_image_ids = set(df.iloc[val_indexes]['id'].values)

    create_json('train', k_fold_index, train_image_ids, train_indexes)
    create_json('val', k_fold_index, val_image_ids, val_indexes)

    create_excel('train', k_fold_index, train_indexes)
    create_excel('val', k_fold_index, val_indexes)

    print(f"Stratified K-fold {k_fold_index+1} Done.")

Before create json
Stratified K-fold 1 Done.
Before create json
Stratified K-fold 2 Done.
Before create json
Stratified K-fold 3 Done.
Before create json
Stratified K-fold 4 Done.
Before create json
Stratified K-fold 5 Done.
