In [31]:
import json
import pandas as pd
from collections import defaultdict

target = '../dataset/train.json'
category = {}
file_info = {}
make_frame = defaultdict(list)

# Read the annotation file at `target`. JSON text is decoded explicitly as
# UTF-8 so the result does not depend on the platform's default encoding.
with open(target, 'r', encoding='utf-8') as f:
    json_datas = json.load(f)  # parsed into plain Python dicts/lists

print(json_datas.keys())
# dict_keys(['info', 'licenses', 'images', 'categories', 'annotations'])

# From the image entries keep only the id and the file path.
for item in json_datas['images']:
    file_info[item['id']] = {'id': item['id'], 'file_name': item['file_name']}

# Map category id -> category name.
for item in json_datas['categories']:
    category[item['id']] = item['name']

# Join every annotation with the image entry it refers to.
for annotation in json_datas['annotations']:
    # Work on a copy: updating the dict stored in file_info would leave the
    # lookup table polluted with box fields of whichever annotation came last.
    save_dict = dict(file_info[annotation['image_id']])
    bbox = annotation['bbox']  # read below as [x, y, w, h]

    # Attach class id, bounding box, area and the h / w aspect ratio.
    save_dict.update({
        'class': annotation['category_id'],
        'x': bbox[0],
        'y': bbox[1],
        'w': bbox[2],
        'h': bbox[3],
        'area': annotation['area'],
        # A zero-width box has no defined ratio; store NaN instead of raising.
        'ratio': bbox[3] / bbox[2] if bbox[2] else float('nan'),
    })

    # Accumulate column-wise ('key': [item1, item2, ...]) for the DataFrame.
    for k, v in save_dict.items():
        make_frame[k].append(v)

# The number of annotations should equal the number of rows built above.
print(len(json_datas['annotations']))

# dictionary to DataFrame
df = pd.DataFrame.from_dict(make_frame)
assert len(df) == len(json_datas['annotations']), 'expected one row per annotation'

# To persist the table without the index column:
# df.to_csv('detection_info.csv', index=False)
df

dict_keys(['info', 'licenses', 'images', 'categories', 'annotations'])
23144


Unnamed: 0,id,file_name,class,x,y,w,h,area,ratio
0,0,train/0000.jpg,0,197.6,193.7,547.8,469.7,257301.66,0.857430
1,1,train/0001.jpg,3,0.0,407.4,57.6,180.6,10402.56,3.135417
2,1,train/0001.jpg,7,0.0,455.6,144.6,181.6,26259.36,1.255878
3,1,train/0001.jpg,4,722.3,313.4,274.3,251.9,69096.17,0.918338
4,1,train/0001.jpg,5,353.2,671.0,233.7,103.4,24164.58,0.442448
...,...,...,...,...,...,...,...,...,...
23139,4882,train/4882.jpg,5,0.0,116.2,944.1,814.1,768591.81,0.862303
23140,4882,train/4882.jpg,7,302.1,439.3,265.2,216.1,57309.72,0.814857
23141,4882,train/4882.jpg,0,511.3,451.1,58.7,30.2,1772.74,0.514480
23142,4882,train/4882.jpg,1,255.0,421.4,271.7,195.1,53008.67,0.718071


In [32]:
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

In [33]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield ``k`` train/test index splits that never split a group.

    Groups are assigned one at a time to the fold where adding the group's
    label counts gives the lowest mean (over labels) standard deviation of
    the per-fold label share.

    Args:
        X: Not used by this implementation.
        y: Sequence of non-negative integer labels, one per sample.
        groups: Sequence of hashable group ids, aligned with ``y``.
        k: Number of folds.
        seed: Seed for the shuffle applied to the groups before assignment.
            ``None`` leaves the shuffle unseeded.

    Yields:
        ``(train_indices, test_indices)``: two lists of positional indices
        into ``y`` / ``groups``; fold ``i``'s groups form the test side.
    """
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # Label ids below the maximum may never occur in y. Dividing by their
    # zero total yields NaN, which makes every fold score NaN and sends every
    # group to fold 0. Only labels that actually occur are scored.
    present_labels = [label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Tentatively place the group in `fold`, score the result, then undo.
        y_counts_per_fold[fold] += y_counts
        std_per_label = [
            np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            for label in present_labels
        ]
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # Place groups with the most uneven label counts first; the sort is
    # stable, so the shuffle above decides the order among equal keys.
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [34]:
# The annotation table doubles as the feature frame for the splitter.
train_x = df
# Grouping key: annotations sharing a file_name must stay in the same fold.
groups = train_x['file_name'].values
# Stratification target: the class id of each annotation.
train_y = train_x['class'].values

def get_distribution(y_vals):
    """Return each label's share of ``y_vals`` as a list of percentage strings.

    The list has one entry per label id from 0 up to ``max(y_vals)``; every
    entry is formatted with two decimals, e.g. ``'25.00%'``.
    """
    counts = Counter(y_vals)
    total = sum(counts.values())
    shares = []
    for label in range(np.max(y_vals) + 1):
        shares.append(f'{counts[label] / total:.2%}')
    return shares

In [36]:
# Collect the class distribution of the full set and of every fold's
# development / validation part; `index` holds the matching row labels.
distrs = [get_distribution(train_y)]
index = ['training set']

# A fixed seed makes the split, and therefore the table built from it,
# reproducible when the cell is re-run.
for fold_ind, (dev_ind, val_ind) in enumerate(
        stratified_group_k_fold(train_x, train_y, groups, k=5, seed=42)):
    dev_y, val_y = train_y[dev_ind], train_y[val_ind]
    dev_groups, val_groups = groups[dev_ind], groups[val_ind]

    # No group may appear on both sides of a split.
    assert len(set(dev_groups) & set(val_groups)) == 0, \
        f'fold {fold_ind}: a group appears in both development and validation sets'

    distrs.append(get_distribution(dev_y))
    index.append(f'development set - fold {fold_ind}')
    distrs.append(get_distribution(val_y))
    index.append(f'validation set - fold {fold_ind}')

In [37]:
# Tabulate the collected shares: one row per entry of `index`, one column
# per label id from 0 up to the largest class id in train_y.
print('Distribution per class:')
pd.DataFrame(
    distrs,
    index=index,
    columns=[f'Label {label_id}' for label_id in range(np.max(train_y) + 1)],
)

Distribution per class:


Unnamed: 0,Label 0,Label 1,Label 2,Label 3,Label 4,Label 5,Label 6,Label 7,Label 8,Label 9
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
development set - fold 0,17.14%,27.45%,3.87%,4.04%,4.24%,12.72%,5.46%,22.38%,0.69%,2.02%
validation set - fold 0,17.13%,27.43%,3.88%,4.06%,4.25%,12.71%,5.46%,22.36%,0.69%,2.03%
development set - fold 1,17.14%,27.45%,3.87%,4.05%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
validation set - fold 1,17.12%,27.44%,3.89%,4.04%,4.25%,12.72%,5.46%,22.37%,0.69%,2.03%
development set - fold 2,17.14%,27.45%,3.88%,4.05%,4.25%,12.71%,5.46%,22.37%,0.69%,2.02%
validation set - fold 2,17.13%,27.44%,3.87%,4.04%,4.23%,12.72%,5.47%,22.38%,0.69%,2.03%
development set - fold 3,17.13%,27.44%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
validation set - fold 3,17.15%,27.46%,3.87%,4.04%,4.24%,12.71%,5.45%,22.38%,0.69%,2.01%
development set - fold 4,17.13%,27.44%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%


In [38]:
# Print the development rows of the first fold only. The seed is fixed so
# that re-running the cell shows the same split every time.
for fold_ind, (dev_ind, val_ind) in enumerate(
        stratified_group_k_fold(train_x, train_y, groups, k=5, seed=42)):
    print(df.iloc[dev_ind])
    break  # only the first fold is inspected

         id       file_name  class      x      y      w      h       area  \
0         0  train/0000.jpg      0  197.6  193.7  547.8  469.7  257301.66   
1         1  train/0001.jpg      3    0.0  407.4   57.6  180.6   10402.56   
2         1  train/0001.jpg      7    0.0  455.6  144.6  181.6   26259.36   
3         1  train/0001.jpg      4  722.3  313.4  274.3  251.9   69096.17   
4         1  train/0001.jpg      5  353.2  671.0  233.7  103.4   24164.58   
...     ...             ...    ...    ...    ...    ...    ...        ...   
23139  4882  train/4882.jpg      5    0.0  116.2  944.1  814.1  768591.81   
23140  4882  train/4882.jpg      7  302.1  439.3  265.2  216.1   57309.72   
23141  4882  train/4882.jpg      0  511.3  451.1   58.7   30.2    1772.74   
23142  4882  train/4882.jpg      1  255.0  421.4  271.7  195.1   53008.67   
23143  4882  train/4882.jpg      1  145.4  295.4  420.2  356.1  149633.22   

          ratio  
0      0.857430  
1      3.135417  
2      1.255878  
3  

In [55]:
import json
import pandas as pd
from collections import defaultdict

target = '../dataset/train.json'
output = '../dataset/output.json'

# NOTE: these containers are re-initialised here but are not used by the
# rest of this cell.
category = {}
file_info = {}
make_frame = defaultdict(list)

result = {}

# Read the annotation file at `target`; the encoding is explicit so decoding
# does not depend on the platform default.
with open(target, 'r', encoding='utf-8') as f:
    json_datas = json.load(f)  # parsed into plain Python dicts/lists

# Copy only the 'info', 'licenses' and 'categories' sections; 'images' and
# 'annotations' are not copied by this cell.
result['info'] = json_datas['info']
result['licenses'] = json_datas['licenses']
result['categories'] = json_datas['categories']

print(result)

# Write the collected sections to `output` as JSON.
with open(output, 'w', encoding='utf-8') as outfile:
    json.dump(result, outfile)

{'info': {'year': 2021, 'version': '1.0', 'description': 'Recycle Trash', 'contributor': 'Upstage', 'url': None, 'date_created': '2021-02-02 01:10:00'}}
