In [1]:
import json
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
annotation = './input/data/train_all.json'
with open(annotation) as f:
    data = json.load(f)
data.keys()

dict_keys(['info', 'licenses', 'images', 'categories', 'annotations'])

In [3]:
categories = data['categories']
anns = data['annotations']
imgs = data['images']

In [4]:
anns[0].keys()

dict_keys(['id', 'image_id', 'category_id', 'segmentation', 'area', 'bbox', 'iscrowd'])

In [5]:
import pandas as pd 

object_area_per_img = {img_id:0 for img_id in range(len(imgs))}
object_num_dict = {img_id:0 for img_id in range(len(imgs))}
for i in range(len(anns)):
    area = anns[i]['area']
    img_id = anns[i]['image_id']
    object_area_per_img[img_id] += area
    object_num_dict[img_id] += 1

In [6]:
multi_label_df = pd.DataFrame({'img_id': object_area_per_img.keys(), 'object_area':object_area_per_img.values(), 
              'object_num': object_num_dict.values()})
multi_label_df['object_mean_area'] = multi_label_df.object_area / multi_label_df.object_num
multi_label_df.describe() # 각 칼럼의 25, 50, 75%에 해당하는 값이 각 칼럼을 4개의 class로 분류하는 threshold

Unnamed: 0,img_id,object_area,object_num,object_mean_area
count,3272.0,3272.0,3272.0,3271.0
mean,1635.5,80431.278117,8.01956,23633.596734
std,944.689367,45827.645922,8.937412,23183.390923
min,0.0,0.0,0.0,545.625
25%,817.75,50254.125,1.0,7186.157576
50%,1635.5,73786.25,5.0,15008.5
75%,2453.25,102272.25,12.0,32472.989583
max,3271.0,358324.5,70.0,216136.5


In [7]:
object_area_class_list = []
object_num_class_list = []
object_mean_area_class_list = []

for i, row in multi_label_df.iterrows():
    if row['object_area'] < 50254:
        object_area_class_list.append(0)
    elif row['object_area'] < 73786:
        object_area_class_list.append(1)
    elif row['object_area'] < 102272:
        object_area_class_list.append(2)
    else:
        object_area_class_list.append(3)
        
    if row['object_num'] < 2:
        object_num_class_list.append(0)
    elif row['object_num'] < 6:
        object_num_class_list.append(1)
    elif row['object_num'] < 13:
        object_num_class_list.append(2)
    else:
        object_num_class_list.append(3)
    
    if row['object_mean_area'] < 7186:
        object_mean_area_class_list.append(0)
    elif row['object_mean_area'] < 15008:
        object_mean_area_class_list.append(1)
    elif row['object_mean_area'] < 32472:
        object_mean_area_class_list.append(2)
    else:
        object_mean_area_class_list.append(3)

multi_label_df['object_area_class'] = object_area_class_list
multi_label_df['object_num_class'] = object_num_class_list
multi_label_df['object_mean_area_class'] = object_mean_area_class_list
multi_label_df.head()

Unnamed: 0,img_id,object_area,object_num,object_mean_area,object_area_class,object_num_class,object_mean_area_class
0,0,133680.5,17,7863.558824,3,3,1
1,1,96922.0,14,6923.0,2,3,0
2,2,33044.0,1,33044.0,0,0,3
3,3,37967.0,2,18983.5,0,1,2
4,4,107148.5,2,53574.25,3,1,3


In [34]:
# !pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7


In [8]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

cv = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=41)

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(multi_label_df, multi_label_df[['object_area_class', 'object_num_class', 'object_mean_area_class']])):
    
    print("TRAIN:", [train_idx[:10]])
    print(" TEST:", [val_idx[:10]])

TRAIN: [array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 11])]
 TEST: [array([ 7, 10, 17, 22, 24, 29, 35, 48, 62, 64])]
TRAIN: [array([ 1,  2,  5,  7,  8,  9, 10, 11, 12, 13])]
 TEST: [array([ 0,  3,  4,  6, 16, 21, 25, 26, 28, 33])]
TRAIN: [array([ 0,  1,  3,  4,  5,  6,  7, 10, 12, 16])]
 TEST: [array([ 2,  8,  9, 11, 13, 14, 15, 23, 27, 30])]
TRAIN: [array([ 0,  1,  2,  3,  4,  6,  7,  8,  9, 10])]
 TEST: [array([ 5, 12, 18, 20, 31, 40, 41, 42, 46, 53])]
TRAIN: [array([ 0,  2,  3,  4,  5,  6,  7,  8,  9, 10])]
 TEST: [array([ 1, 19, 34, 37, 39, 44, 49, 54, 57, 59])]


In [9]:
from collections import Counter

def get_distribution(y):
    y_distr = Counter(y)
    y_vals_sum = sum(y_distr.values())

    return [f'{y_distr[i]/y_vals_sum:.2%}' for i in range(np.max(y) +1)]

var = [(ann['image_id'], ann['category_id']) for ann in data['annotations']]

X = np.ones((len(data['annotations']),1))
y = np.array([v[1] for v in var])  # class
groups = np.array([v[0] for v in var])  # image(group)
groups
distrs = [get_distribution(y)]
index = ['training set']
origin_ann_df = pd.DataFrame(data['annotations'])

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(multi_label_df, multi_label_df[['object_area_class', 'object_num_class', 'object_mean_area_class']])):
    # train_y, val_y = y[train_idx], y[val_idx]
    # train_gr, val_gr = groups[train_idx], groups[val_idx]
    
    train_y = origin_ann_df[origin_ann_df['image_id'].isin(train_idx)].category_id.to_list()
    val_y = origin_ann_df[origin_ann_df['image_id'].isin(val_idx)].category_id.to_list()
    # train_gr, val_gr = train_idx, val_idx
    
    
    
    # assert len(set(train_gr) & set(val_gr)) == 0 
    
    distrs.append(get_distribution(train_y))
    distrs.append(get_distribution(val_y))
    index.append(f'train - fold{fold_ind}')
    index.append(f'val - fold{fold_ind}')
    
categories = [d['name'] for d in data['categories']]
for fold in distrs:
    del fold[0]
pd.DataFrame(distrs, index=index, columns = [categories[i] for i in range(10)])

Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
training set,10.60%,35.48%,2.51%,2.14%,2.32%,11.78%,5.12%,29.13%,0.24%,0.67%
train - fold0,10.55%,34.43%,2.47%,2.08%,2.62%,11.93%,5.01%,30.00%,0.25%,0.67%
val - fold0,10.81%,39.79%,2.69%,2.40%,1.10%,11.16%,5.56%,25.58%,0.21%,0.68%
train - fold1,10.92%,35.91%,2.47%,2.10%,2.34%,11.59%,5.14%,28.73%,0.14%,0.64%
val - fold1,9.37%,33.84%,2.66%,2.29%,2.27%,12.47%,5.02%,30.65%,0.63%,0.81%
train - fold2,10.45%,35.71%,2.45%,2.20%,2.16%,11.59%,5.07%,29.44%,0.26%,0.68%
val - fold2,11.22%,34.58%,2.76%,1.90%,2.99%,12.54%,5.30%,27.88%,0.17%,0.65%
train - fold3,10.32%,35.60%,2.60%,2.17%,2.27%,12.06%,5.16%,28.90%,0.28%,0.64%
val - fold3,11.65%,35.06%,2.18%,2.03%,2.53%,10.74%,4.98%,29.96%,0.09%,0.79%
train - fold4,10.77%,35.77%,2.56%,2.15%,2.23%,11.72%,5.21%,28.57%,0.28%,0.73%


In [10]:
train_val_df = pd.DataFrame(distrs, index=index, columns = [categories[i] for i in range(10)])
fold0_gap = np.sum(np.abs(np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[1].to_list()))) - np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[2].to_list())))))
fold1_gap = np.sum(np.abs(np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[3].to_list()))) - np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[4].to_list())))))
fold2_gap = np.sum(np.abs(np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[5].to_list()))) - np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[6].to_list())))))
fold3_gap = np.sum(np.abs(np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[7].to_list()))) - np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[8].to_list())))))
fold4_gap = np.sum(np.abs(np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[9].to_list()))) - np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[10].to_list())))))
print(fold0_gap,fold1_gap,fold2_gap,fold3_gap,fold4_gap)

13.469999999999999 7.649999999999993 6.2000000000000055 5.5900000000000025 7.650000000000001


In [11]:
fold0_gap_trainingset = np.sum(np.abs(np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[0].to_list()))) - np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[2].to_list())))))
fold1_gap_trainingset = np.sum(np.abs(np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[0].to_list()))) - np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[4].to_list())))))
fold2_gap_trainingset = np.sum(np.abs(np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[0].to_list()))) - np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[6].to_list())))))
fold3_gap_trainingset = np.sum(np.abs(np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[0].to_list()))) - np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[8].to_list())))))
fold4_gap_trainingset = np.sum(np.abs(np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[0].to_list()))) - np.array(list(map(lambda x:float(x[:-1]), train_val_df.iloc[10].to_list())))))
print(fold0_gap_trainingset,fold1_gap_trainingset,fold2_gap_trainingset,fold3_gap_trainingset,fold4_gap_trainingset)

10.830000000000002 6.059999999999995 4.959999999999999 4.399999999999996 6.229999999999998


- 기존 train_all과의 분포 차이가 가장 작으면서, 나누어진 split내의 train과 val간의 분포차이가 가장 작은 split3(4번째 split) 선택

In [52]:
for fold_ind, (train_idx, val_idx) in enumerate(cv.split(multi_label_df, multi_label_df[['object_area_class', 'object_num_class', 'object_mean_area_class']])):
    
    if fold_ind==0:
        fold3_best_train = train_idx
        fold3_best_val = val_idx
    
print(fold3_best_train[:10])
print(fold3_best_val[:10])
train_idx_list = fold3_best_train
val_idx_list = fold3_best_val
print(f'train length: {len(train_idx_list)}, val length: {len(val_idx_list)}')
# train, val 각각으로 사용할 image index 추출

[ 0  1  2  3  4  5  6  8  9 11]
[ 7 10 17 22 24 29 35 48 62 64]
train length: 2617, val length: 655


In [53]:
multi_train_df = multi_label_df[multi_label_df['img_id'].isin(train_idx_list)]
multi_val_df = multi_label_df[multi_label_df['img_id'].isin(val_idx_list)]

In [54]:
multi_train_df.head()

Unnamed: 0,img_id,object_area,object_num,object_mean_area,object_area_class,object_num_class,object_mean_area_class
0,0,133680.5,17,7863.558824,3,3,1
1,1,96922.0,14,6923.0,2,3,0
2,2,33044.0,1,33044.0,0,0,3
3,3,37967.0,2,18983.5,0,1,2
4,4,107148.5,2,53574.25,3,1,3


In [55]:
# object_area_class가 잘 나누어졌는지 확인
train_bbox_num_list = multi_train_df.object_area_class.value_counts().sort_index().to_list()
train_bbox_num_ratio = [round(i/2618,3) for i in train_bbox_num_list]

val_bbox_num_list = multi_val_df.object_area_class.value_counts().sort_index().to_list()
val_bbox_num_ratio = [round(i/654,3) for i in val_bbox_num_list] 

print('train',train_bbox_num_ratio,'\n')
print('val',val_bbox_num_ratio)

train [0.249, 0.247, 0.249, 0.255] 

val [0.252, 0.263, 0.255, 0.231]


In [56]:
# object_num_class가 잘 나누어졌는지 확인
train_object_num_class_list = multi_train_df.object_num_class.value_counts().sort_index().to_list()
train_object_num_class_ratio = [round(i/2618,3) for i in train_object_num_class_list]

val_object_num_list = multi_val_df.object_num_class.value_counts().sort_index().to_list()
val_object_num_ratio = [round(i/654,3) for i in val_object_num_list] 

print('train',train_object_num_class_ratio,'\n')
print('val',val_object_num_ratio)

train [0.255, 0.285, 0.231, 0.228] 

val [0.257, 0.307, 0.222, 0.216]


In [57]:
# object_mean_area_class가 잘 나누어졌는지 확인
train_object_mean_area_class_list = multi_train_df.object_mean_area_class.value_counts().sort_index().to_list()
train_object_mean_area_class_ratio = [round(i/2618,3) for i in train_object_mean_area_class_list]

val_object_mean_area_class_list = multi_val_df.object_mean_area_class.value_counts().sort_index().to_list()
val_object_mean_area_class_ratio = [round(i/654,3) for i in val_object_mean_area_class_list] 

print('train',train_object_mean_area_class_ratio,'\n')
print('val',val_object_mean_area_class_ratio)

train [0.25, 0.248, 0.251, 0.251] 

val [0.251, 0.257, 0.248, 0.246]


In [58]:
# 모두 잘 나누어졌으므로 json파일 생성
json_file = data.copy()

In [59]:
# for i in train_idx_list:
#     print(json_file['images'][i])
#     if i in[100,101,102,103]:
#         break

In [60]:
new_data_images_train = [json_file['images'][i] for i in train_idx_list]
new_data_images_val = [json_file['images'][i] for i in val_idx_list]
print(len(new_data_images_train), len(new_data_images_val))

2617 655


In [61]:
ann_id_list = []
for ann in anns:
    ann_id_list.append(ann['id'])

In [62]:
new_ann_train = []
new_ann_val = []
for i in range(len(data['annotations'])):
    ann_img_id = json_file['annotations'][i]['image_id']
    if ann_img_id in train_idx_list:
        new_ann_train.append(json_file['annotations'][i])
    if ann_img_id in val_idx_list:
        new_ann_val.append(json_file['annotations'][i])
        
print(len(new_ann_train), len(new_ann_val))

21080 5160


- image의 id 초기화, ann의 image_id 및 id 초기화(baseline의 custom dataloader에서 error나서 초기화)

In [156]:
image_id_init_dict = {new_data_images_train[i]['id']:i for i in range(len(new_data_images_train))}
for i in range(len(new_data_images_train)):
    img = new_data_images_train[i]
    img['id'] = i
for i in range(len(new_ann_train)):
    ann = new_ann_train[i]
    ann['image_id'] = image_id_init_dict[ann['image_id']]

In [162]:
image_id_init_dict = {new_data_images_val[i]['id']:i for i in range(len(new_data_images_val))}
for i in range(len(new_data_images_val)):
    img = new_data_images_val[i]
    img['id'] = i
for i in range(len(new_ann_val)):
    ann = new_ann_val[i]
    ann['image_id'] = image_id_init_dict[ann['image_id']]

- 위 셀까지 초기화, id초기화안할거면 이 부분 건너뛰기

In [63]:
json_train = json_file.copy()
json_val = json_file.copy()

json_train['images'] = new_data_images_train
json_train['annotations'] = new_ann_train

json_val['images'] = new_data_images_val
json_val['annotations'] = new_ann_val

print(len(json_train['images']), len(json_train['annotations']))
print(len(json_val['images']), len(json_val['annotations']))

2617 21080
655 5160


In [64]:
2618 + 654, 5605+20635

(3272, 26240)

In [65]:
import json

with open('split0_train_MultiStfKFold.json', 'w') as t:
    json.dump(json_train, t)

with open('split0_val_MultiStfKFold.json', 'w') as v:
    json.dump(json_val, v)

In [87]:
## baseline의 custom dataloader에서 사용하려면 여기선 train_all을 train val로 쪼갰을 때 
##image의 id 초기화, ann의 image_id 및 id 초기화해야함

In [None]:
# 현재 사용중인 train_MultiStfKFold.json은 split3임

## IoU가 낮은 class에 대해 finetuning

- General trash, Metal, Plastic, Clothing에 해당하는 ann만 남긴 후, CropNonEmptyMaskIfExists 적용하여 배경 영향 줄이면서 finetuning 시도

In [66]:
annotation = './input/data/train_MultiStfKFold.json'
with open(annotation) as f:
    data = json.load(f)
data.keys()

dict_keys(['info', 'licenses', 'images', 'categories', 'annotations'])

In [67]:
data['categories'] # 1, 4, 6, 10 -> General trash, Metal, Plastic, Clothing 만 필터링

[{'id': 1, 'name': 'General trash', 'supercategory': 'General trash'},
 {'id': 2, 'name': 'Paper', 'supercategory': 'Paper'},
 {'id': 3, 'name': 'Paper pack', 'supercategory': 'Paper pack'},
 {'id': 4, 'name': 'Metal', 'supercategory': 'Metal'},
 {'id': 5, 'name': 'Glass', 'supercategory': 'Glass'},
 {'id': 6, 'name': 'Plastic', 'supercategory': 'Plastic'},
 {'id': 7, 'name': 'Styrofoam', 'supercategory': 'Styrofoam'},
 {'id': 8, 'name': 'Plastic bag', 'supercategory': 'Plastic bag'},
 {'id': 9, 'name': 'Battery', 'supercategory': 'Battery'},
 {'id': 10, 'name': 'Clothing', 'supercategory': 'Clothing'}]

In [72]:
tune_class_ann_list = []
tune_class_img_list = []
for ann in data['annotations']:
    if ann['category_id'] in [1, 4, 6, 10]:
        tune_class_ann_list.append(ann)
        tune_class_img_list.append(ann['image_id'])

In [77]:
tune_class_img_list = list(set(tune_class_img_list))
len(tune_class_ann_list), len(tune_class_img_list)

(5198, 1689)

In [75]:
(3090+2782+562+177) * 0.8

5288.8

In [79]:
tuning_images_train = []
for img in data['images']:
    if img['id'] in tune_class_img_list:
        tuning_images_train.append(img)
len(tuning_images_train)

1689

In [84]:
json_tuning = data.copy()

json_tuning['images'] = tuning_images_train
json_tuning['annotations'] = tune_class_ann_list

print(len(json_tuning['images']), len(json_tuning['annotations']))

1689 5198


In [85]:
with open('finetune_train_MultiStfKFold.json', 'w') as t:
    json.dump(json_tuning, t)