### StratifiedGroupKFold를 이용해 validation set 생성

- 대회가 어느 정도 막바지에 이르면, `n_splits`를 더 큰 값으로 늘려 OOF(out-of-fold) 학습을 진행해 볼 수 있을 것 같다

In [1]:
import json
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

# Read the COCO-format training annotations.
annotation = 'dataset/train.json'

with open(annotation) as f:
    data = json.load(f)

# One (image_id, category_id) pair per annotation.
var = [(ann['image_id'], ann['category_id']) for ann in data['annotations']]

# X is only a placeholder with one row per annotation; the split is driven
# by y (class label) and groups (image id).
X = np.ones((len(data['annotations']), 1))
y = np.array([category_id for _, category_id in var])  # class
groups = np.array([image_id for image_id, _ in var])  # image (group)

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=411)

# Preview which image ids / classes land on each side of every fold.
for train_idx, val_idx in cv.split(X, y, groups):
    for tag, part in (("TRAIN:", train_idx), (" TEST:", val_idx)):
        print(tag, groups[part])
        print(" ", y[part])

TRAIN: [   0    1    1 ... 4881 4881 4881]
  [0 3 7 ... 7 1 7]
 TEST: [   6   13   13 ... 4882 4882 4882]
  [1 6 7 ... 0 1 1]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   5    5    5 ... 4876 4876 4878]
  [7 0 0 ... 0 2 0]
TRAIN: [   0    3    3 ... 4882 4882 4882]
  [0 2 6 ... 0 1 1]
 TEST: [   1    1    1 ... 4877 4877 4880]
  [3 7 4 ... 7 7 0]
TRAIN: [   1    1    1 ... 4882 4882 4882]
  [3 7 4 ... 0 1 1]
 TEST: [   0    3    3 ... 4881 4881 4881]
  [0 2 6 ... 7 1 7]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   4    4    4 ... 4868 4872 4872]
  [1 1 1 ... 2 4 6]


In [58]:
# check distribution
import pandas as pd
from collections import Counter

def get_distribution(y, num_classes=None):
    """Return the share of every class label in ``y`` as percentage strings.

    Args:
        y: Sequence (list or NumPy array) of non-negative integer class labels.
        num_classes: Number of classes to report. Defaults to ``max(y) + 1``,
            which is the original behaviour. Pass it explicitly when ``y`` is a
            subset (e.g. one fold) that may not contain the highest class, so
            every row has the same number of columns.

    Returns:
        A list where entry ``i`` is the fraction of labels equal to ``i``,
        formatted like ``'17.14%'``. An empty ``y`` yields all-zero
        percentages (or an empty list when ``num_classes`` is not given).
    """
    labels = np.asarray(y).tolist()
    if num_classes is None:
        num_classes = max(labels) + 1 if labels else 0

    total = len(labels)
    if total == 0:
        # Avoid dividing by zero for an empty subset.
        return [f'{0:.2%}'] * num_classes

    y_distr = Counter(labels)
    return [f'{y_distr[i] / total:.2%}' for i in range(num_classes)]

# Row 0: class distribution over all annotations; then one (train, val) pair of rows per fold.
distrs = [get_distribution(y)]
index = ['training set']

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    # An image must never contribute annotations to both sides of a fold.
    assert set(groups[train_idx]).isdisjoint(groups[val_idx])

    distrs += [get_distribution(y[train_idx]), get_distribution(y[val_idx])]
    index += [f'train - fold{fold_ind}', f'val - fold{fold_ind}']

categories = [d['name'] for d in data['categories']]
column_names = [categories[i] for i in range(np.max(y) + 1)]
pd.DataFrame(distrs, index=index, columns=column_names)

Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
train - fold0,16.96%,27.45%,3.79%,4.13%,4.48%,12.61%,5.51%,22.28%,0.77%,2.02%
val - fold0,17.85%,27.42%,4.23%,3.70%,3.26%,13.15%,5.25%,22.77%,0.35%,2.02%
train - fold1,17.14%,27.24%,4.01%,3.98%,4.28%,12.77%,5.38%,22.32%,0.67%,2.20%
val - fold1,17.12%,28.17%,3.41%,4.26%,4.12%,12.51%,5.72%,22.57%,0.73%,1.38%
train - fold2,17.31%,27.39%,3.83%,4.08%,4.13%,12.80%,5.14%,22.68%,0.69%,1.94%
val - fold2,16.42%,27.68%,4.05%,3.88%,4.70%,12.36%,6.76%,21.12%,0.69%,2.35%
train - fold3,17.30%,27.47%,3.87%,4.06%,4.22%,12.63%,5.49%,22.39%,0.63%,1.95%
val - fold3,16.50%,27.36%,3.88%,3.99%,4.33%,13.07%,5.33%,22.30%,0.92%,2.32%
train - fold4,16.97%,27.67%,3.88%,3.97%,4.10%,12.77%,5.76%,22.20%,0.68%,2.00%


In [56]:
train_val_df = pd.DataFrame(distrs, index=index, columns = [categories[i] for i in range(np.max(y) + 1)])


def _percent_row(position):
    """Return row ``position`` of ``train_val_df`` as floats, dropping the trailing '%'."""
    return np.array([float(cell[:-1]) for cell in train_val_df.iloc[position].to_list()])


# Row 0 is the full training set; rows 2k+1 / 2k+2 hold the train / val split of fold k.
# Each gap is the summed absolute difference (in percentage points) between train and val.
fold0_gap, fold1_gap, fold2_gap, fold3_gap, fold4_gap = (
    np.sum(np.abs(_percent_row(2 * k + 1) - _percent_row(2 * k + 2))) for k in range(5)
)
print(fold0_gap,fold1_gap,fold2_gap,fold3_gap,fold4_gap)

4.719999999999998 3.7200000000000024 6.199999999999997 2.4499999999999993 6.190000000000001


In [57]:
def _percent_row(position):
    """Return row ``position`` of ``train_val_df`` as floats, dropping the trailing '%'."""
    return np.array([float(cell[:-1]) for cell in train_val_df.iloc[position].to_list()])


# Summed absolute difference between the full training set (row 0) and the
# validation split of fold k (row 2k+2).
(fold0_gap_trainingset, fold1_gap_trainingset, fold2_gap_trainingset,
 fold3_gap_trainingset, fold4_gap_trainingset) = (
    np.sum(np.abs(_percent_row(0) - _percent_row(2 * k + 2))) for k in range(5)
)
print(fold0_gap_trainingset,fold1_gap_trainingset,fold2_gap_trainingset,fold3_gap_trainingset,fold4_gap_trainingset)

3.7899999999999974 2.9000000000000017 4.98 1.9499999999999997 5.0399999999999965


- competition에서 주어진 training set과 test set이 random하게 split되었고, 두 set 간의 분포가 유사할 것이라는 가정하에, fold 내에서의 train, val의 차이가 가장 작으면서, valset이 기존 training set의 분포와 가장 비슷하기에 test set을 가장 잘 대변할 것으로 추측되는 fold3에서의 train과 val을 모델학습의 train과 val로 사용

In [64]:
import pandas as pd
from collections import Counter

# Re-run the (seeded, hence reproducible) split and keep fold index 3.
# The result holds the image id of every annotation on each side of that fold;
# these image ids are what the train / val JSON files are built from.
fold3_train_pos, fold3_val_pos = list(cv.split(X, y, groups))[3]
fold3_best_train = groups[fold3_train_pos]
fold3_best_val = groups[fold3_val_pos]

print(fold3_best_train)
print(fold3_best_val)

[   1    1    1 ... 4882 4882 4882]
[   0    3    3 ... 4881 4881 4881]


- json으로 추출

In [85]:
# Collapse the per-annotation image ids to unique image ids; both the
# 'images' and 'annotations' entries of the JSON are filtered with these.
train_idx_list = [*set(fold3_best_train)]
val_idx_list = [*set(fold3_best_val)]

print(f'train length: {len(train_idx_list)}, val length: {len(val_idx_list)}')

train length: 3902, val length: 981


In [95]:
json_file = data.copy()

# Select images by their 'id' field instead of using the id as a list position:
# positional lookup is only correct while ids are exactly 0..N-1 in file order.
train_image_ids = set(train_idx_list)
val_image_ids = set(val_idx_list)
new_data_images_train = [img for img in json_file['images'] if img['id'] in train_image_ids]
new_data_images_val = [img for img in json_file['images'] if img['id'] in val_image_ids]
print(len(new_data_images_train), len(new_data_images_val))

3902 981


In [105]:
# Route every annotation to the split that owns its image.
# Sets give O(1) membership tests; the previous list lookups rescanned the
# whole id list for each of the annotations.
train_image_ids = set(train_idx_list)
val_image_ids = set(val_idx_list)
new_ann_train = [ann for ann in json_file['annotations'] if ann['image_id'] in train_image_ids]
new_ann_val = [ann for ann in json_file['annotations'] if ann['image_id'] in val_image_ids]

print(len(new_ann_train), len(new_ann_val))

18454 4690


In [125]:
# Sanity check: the total number of annotations should equal train + val.
len(json_file['annotations'])  # 18454 + 4690

23144

In [124]:
# Shallow copies of the source JSON with only 'images' / 'annotations' swapped out.
json_train = {**json_file, 'images': new_data_images_train, 'annotations': new_ann_train}
json_val = {**json_file, 'images': new_data_images_val, 'annotations': new_ann_val}

for split_json in (json_train, json_val):
    print(len(split_json['images']), len(split_json['annotations']))

3902 18454
981 4690


In [126]:
## Write both splits to disk as JSON files

import json

for out_path, payload in (('train_StfGKFold.json', json_train),
                          ('val_StfGKFold.json', json_val)):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file)

## MultiLabel 

In [2]:
import json
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

# Read the COCO-format training annotations.
annotation = 'dataset/train.json'

with open(annotation) as f:
    data = json.load(f)

# One (image_id, category_id) pair per annotation.
var = [(ann['image_id'], ann['category_id']) for ann in data['annotations']]

X = np.ones((len(data['annotations']), 1))
y = np.array([category_id for _, category_id in var])  # class
groups = np.array([image_id for image_id, _ in var])  # image (group)

In [23]:
import pandas as pd
import numpy as np
# Annotation table restricted to the columns used for the per-image statistics.
wanted_columns = ['image_id', 'category_id', 'area', 'id']
train_df = pd.DataFrame(data['annotations'])[wanted_columns]
train_df.head()

Unnamed: 0,image_id,category_id,area,id
0,0,0,257301.66,0
1,1,3,10402.56,1
2,1,7,26259.36,2
3,1,4,69096.17,3
4,1,5,24164.58,4


In [35]:
pd.options.display.float_format = '{:.5f}'.format
# Summary of the integer-truncated bbox areas. Its 25% / 50% / 75% values are
# used to bucket bbox size into four classes (0-25, 25-50, 50-75, 75-100%).
train_df['area'].apply(int).describe()

count     23144.00000
mean      92863.48116
std      135979.14884
min           0.00000
25%        9997.00000
50%       38938.00000
75%      119122.00000
max     1048371.00000
Name: area, dtype: float64

In [137]:
# Per-image statistics used as stratification labels. Every list below is
# ordered by ascending image_id (the groupby order) and covers exactly the
# images that have annotations, instead of assuming ids 0..4882.
annotations_by_image = train_df.groupby('image_id')

# Number of bboxes per image.
bbox_num = annotations_by_image['id'].count().to_list()

# Class of the mean bbox area per image, from 0 (smallest) to 3 (largest).
# The bounds are the 25% / 50% / 75% values of the integer-truncated areas
# shown by describe().
area_bounds = [9997, 38938, 119122]
area_mean_df = annotations_by_image['area'].mean()
# side='left' keeps the "<=" semantics: a mean equal to a bound goes to the lower class.
area_class = np.searchsorted(area_bounds, area_mean_df.to_numpy(), side='left').tolist()

# Most frequent category per image; on a tie, whichever category value_counts lists first.
category_group = annotations_by_image['category_id'].value_counts()
most_class_list = [category_group[image_id].index.to_list()[0] for image_id in area_mean_df.index]

print(len(bbox_num), len(area_class), len(most_class_list))

4883 4883 4883


In [144]:
# One row per annotated image. The image ids come from the groupby index that
# produced the label lists, so the columns stay aligned even when the ids are
# not exactly 0..4882.
multi_label_df = pd.DataFrame({
    'image_id': area_mean_df.index.to_list(),
    'bbox_num': bbox_num,
    'area_class': area_class,
    'most_class_list': most_class_list,
})
multi_label_df.head()

Unnamed: 0,image_id,bbox_num,area_class,most_class_list
0,0,1,3,0
1,1,8,2,0
2,2,1,3,3
3,3,2,2,2
4,4,6,1,1


In [199]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

cv = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=41)
X = np.ones((len(bbox_num), 1))

# Stratify on the number of bboxes and the area class of each image.
strat_labels = multi_label_df[['bbox_num', 'area_class']]
for fold_ind, (train_idx, val_idx) in enumerate(cv.split(multi_label_df, strat_labels)):
    # Show only the first ten row positions of each side.
    print("TRAIN:", [train_idx[:10]])
    print(" TEST:", [val_idx[:10]])
    


TRAIN: [array([ 1,  3,  4,  5,  7,  8,  9, 10, 11, 12])]
 TEST: [array([ 0,  2,  6, 16, 19, 23, 25, 28, 36, 38])]
TRAIN: [array([ 0,  2,  3,  4,  6,  7,  8,  9, 10, 11])]
 TEST: [array([ 1,  5, 14, 20, 21, 31, 33, 34, 41, 42])]
TRAIN: [array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 10])]
 TEST: [array([ 7, 13, 18, 22, 24, 30, 49, 50, 51, 53])]
TRAIN: [array([ 0,  1,  2,  5,  6,  7,  9, 10, 12, 13])]
 TEST: [array([ 3,  4,  8, 11, 26, 27, 29, 32, 37, 40])]
TRAIN: [array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 11])]
 TEST: [array([ 9, 10, 12, 15, 17, 35, 39, 54, 58, 66])]


In [193]:
from collections import Counter

def get_distribution(y, num_classes=None):
    """Return the share of every class label in ``y`` as percentage strings.

    Args:
        y: Sequence (list or NumPy array) of non-negative integer class labels.
        num_classes: Number of classes to report. Defaults to ``max(y) + 1``,
            which is the original behaviour. Pass it explicitly when ``y`` is a
            subset (e.g. one fold) that may not contain the highest class, so
            every row has the same number of columns.

    Returns:
        A list where entry ``i`` is the fraction of labels equal to ``i``,
        formatted like ``'17.14%'``. An empty ``y`` yields all-zero
        percentages (or an empty list when ``num_classes`` is not given).
    """
    labels = np.asarray(y).tolist()
    if num_classes is None:
        num_classes = max(labels) + 1 if labels else 0

    total = len(labels)
    if total == 0:
        # Avoid dividing by zero for an empty subset.
        return [f'{0:.2%}'] * num_classes

    y_distr = Counter(labels)
    return [f'{y_distr[i] / total:.2%}' for i in range(num_classes)]

var = [(ann['image_id'], ann['category_id']) for ann in data['annotations']]

X = np.ones((len(data['annotations']),1))
y = np.array([v[1] for v in var])  # class
groups = np.array([v[0] for v in var])  # image (group)

# Row 0: class distribution over all annotations; then one (train, val) pair of rows per fold.
distrs = [get_distribution(y)]
index = ['training set']
origin_ann_df = pd.DataFrame(data['annotations'])

# split() yields row positions of multi_label_df. Translate them to image ids
# before filtering annotations, so the result does not rely on
# position == image_id.
fold_image_ids = multi_label_df['image_id'].to_numpy()
strat_labels = multi_label_df[['bbox_num', 'area_class']]

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(multi_label_df, strat_labels)):
    in_train = origin_ann_df['image_id'].isin(fold_image_ids[train_idx])
    in_val = origin_ann_df['image_id'].isin(fold_image_ids[val_idx])
    train_y = origin_ann_df.loc[in_train, 'category_id'].to_list()
    val_y = origin_ann_df.loc[in_val, 'category_id'].to_list()

    distrs.append(get_distribution(train_y))
    distrs.append(get_distribution(val_y))
    index.append(f'train - fold{fold_ind}')
    index.append(f'val - fold{fold_ind}')

categories = [d['name'] for d in data['categories']]
pd.DataFrame(distrs, index=index, columns = [categories[i] for i in range(np.max(y) + 1)])

Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
train - fold0,17.05%,27.49%,3.85%,4.12%,4.48%,12.48%,5.25%,22.60%,0.64%,2.05%
val - fold0,17.48%,27.27%,3.99%,3.75%,3.30%,13.67%,6.29%,21.47%,0.86%,1.92%
train - fold1,17.07%,27.44%,4.03%,4.18%,4.26%,12.72%,5.46%,22.09%,0.65%,2.09%
val - fold1,17.38%,27.46%,3.28%,3.50%,4.20%,12.69%,5.45%,23.48%,0.83%,1.74%
train - fold2,17.18%,27.08%,3.79%,3.83%,4.21%,13.18%,5.55%,22.56%,0.74%,1.89%
val - fold2,16.96%,28.91%,4.23%,4.91%,4.38%,10.86%,5.10%,21.63%,0.48%,2.55%
train - fold3,17.08%,27.86%,3.91%,4.18%,4.09%,12.37%,5.45%,22.14%,0.74%,2.17%
val - fold3,17.34%,25.80%,3.73%,3.49%,4.86%,14.11%,5.48%,23.30%,0.46%,1.43%
train - fold4,17.29%,27.36%,3.80%,3.91%,4.18%,12.84%,5.58%,22.48%,0.66%,1.91%


In [194]:
train_val_df = pd.DataFrame(distrs, index=index, columns = [categories[i] for i in range(np.max(y) + 1)])


def _percent_row(position):
    """Return row ``position`` of ``train_val_df`` as floats, dropping the trailing '%'."""
    return np.array([float(cell[:-1]) for cell in train_val_df.iloc[position].to_list()])


# Row 0 is the full training set; rows 2k+1 / 2k+2 hold the train / val split of fold k.
# Each gap is the summed absolute difference (in percentage points) between train and val.
fold0_gap, fold1_gap, fold2_gap, fold3_gap, fold4_gap = (
    np.sum(np.abs(_percent_row(2 * k + 1) - _percent_row(2 * k + 2))) for k in range(5)
)
print(fold0_gap,fold1_gap,fold2_gap,fold3_gap,fold4_gap)

6.050000000000001 3.7799999999999994 8.360000000000001 7.910000000000001 5.0699999999999985


In [195]:
def _percent_row(position):
    """Return row ``position`` of ``train_val_df`` as floats, dropping the trailing '%'."""
    return np.array([float(cell[:-1]) for cell in train_val_df.iloc[position].to_list()])


# Summed absolute difference between the full training set (row 0) and the
# validation split of fold k (row 2k+2).
(fold0_gap_trainingset, fold1_gap_trainingset, fold2_gap_trainingset,
 fold3_gap_trainingset, fold4_gap_trainingset) = (
    np.sum(np.abs(_percent_row(0) - _percent_row(2 * k + 2))) for k in range(5)
)
print(fold0_gap_trainingset,fold1_gap_trainingset,fold2_gap_trainingset,fold3_gap_trainingset,fold4_gap_trainingset)

4.810000000000002 3.000000000000001 6.700000000000005 6.329999999999996 4.07


- competition에서 주어진 training set과 test set이 random하게 split되었고, 두 set 간의 분포가 유사할 것이라는 가정하에, fold 내에서의 train, val의 차이가 가장 작으면서, valset이 기존 training set의 분포와 가장 비슷하기에 test set을 가장 잘 대변할 것으로 추측되는 fold1(0-based index 기준, 즉 두 번째 fold)에서의 train과 val을 모델학습의 train과 val로 사용 (아래 코드의 `fold_ind==1`에 해당하며, 변수명 `fold2_best_*`는 1-based 표기)

- 'most_class_list'를 빼고 multilabel stratified kfold를 진행했을 때, 포함시켰을 때보다 기존 training set과의 누적 분포 차이가 약 0.2% 정도 더 작았기에, 'most_class_list'는 빼고 진행

In [278]:
from collections import Counter

# Keep the fold with 0-based index 1 (the variable names say "fold2", i.e. 1-based).
# split() yields row positions of multi_label_df. They are mapped to image ids
# here, because the later cells match them against 'image_id' values.
best_fold_ind = 1
fold_image_ids = multi_label_df['image_id'].to_numpy()
all_folds = list(cv.split(multi_label_df, multi_label_df[['bbox_num', 'area_class']]))
best_train_pos, best_val_pos = all_folds[best_fold_ind]
fold2_best_train = fold_image_ids[best_train_pos]
fold2_best_val = fold_image_ids[best_val_pos]

print(fold2_best_train[:10])
print(fold2_best_val[:10])
# Image ids used for the train and val split respectively.
train_idx_list = fold2_best_train
val_idx_list = fold2_best_val
print(f'train length: {len(train_idx_list)}, val length: {len(val_idx_list)}')

[ 0  2  3  4  6  7  8  9 10 11]
[ 1  5 14 20 21 31 33 34 41 42]
train length: 3907, val length: 976


In [250]:
# Per-image label rows belonging to each side of the chosen fold.
in_train_split = multi_label_df['image_id'].isin(train_idx_list)
in_val_split = multi_label_df['image_id'].isin(val_idx_list)
multi_train_df = multi_label_df[in_train_split]
multi_val_df = multi_label_df[in_val_split]

In [277]:
def _bbox_count_histogram(bbox_counts, max_count=20):
    """Count images having 1..max_count bboxes, plus one final bucket for more than max_count.

    Args:
        bbox_counts: Series of images-per-bbox-count, indexed by the bbox count.
        max_count: Largest bbox count that keeps its own bucket.

    Returns:
        A list of ``max_count + 1`` ints. A bbox count that never occurs
        contributes 0 instead of shifting the later buckets.
    """
    histogram = bbox_counts.reindex(range(1, max_count + 1), fill_value=0).to_list()
    histogram.append(int(bbox_counts[bbox_counts.index > max_count].sum()))
    return histogram


# Entries 0..19: share of images with 1, 2, ..., 20 bboxes;
# last entry: share of images with more than 20 bboxes.
train_bbox_counts = multi_train_df.bbox_num.value_counts().sort_index()
train_bbox_num_list = train_bbox_counts.to_list()
train_bbox_num = _bbox_count_histogram(train_bbox_counts)
train_bbox_num_ratio = [round(i / len(multi_train_df), 3) for i in train_bbox_num]

val_bbox_counts = multi_val_df.bbox_num.value_counts().sort_index()
val_bbox_num_list = val_bbox_counts.to_list()
val_bbox_num = _bbox_count_histogram(val_bbox_counts)
val_bbox_num_ratio = [round(i / len(multi_val_df), 3) for i in val_bbox_num]

print('train',train_bbox_num_ratio,'\n')
print('val',val_bbox_num_ratio)

train [0.405, 0.161, 0.083, 0.053, 0.045, 0.034, 0.033, 0.021, 0.022, 0.018, 0.013, 0.013, 0.01, 0.01, 0.009, 0.011, 0.009, 0.005, 0.005, 0.004, 0.036] 

val [0.409, 0.157, 0.082, 0.052, 0.044, 0.036, 0.031, 0.022, 0.02, 0.016, 0.015, 0.016, 0.008, 0.005, 0.008, 0.007, 0.006, 0.007, 0.005, 0.006, 0.046]


In [276]:
# Share of images per area class (0 = smallest mean bbox area ... 3 = largest).
# reindex keeps all four classes in place even if one is absent from a split.
area_classes = range(4)

train_area_class_list = (
    multi_train_df.area_class.value_counts().reindex(area_classes, fill_value=0).to_list()
)
train_area_class_ratio = [round(i / len(multi_train_df), 3) for i in train_area_class_list]

val_area_class_list = (
    multi_val_df.area_class.value_counts().reindex(area_classes, fill_value=0).to_list()
)
# Fixed: this line previously read the undefined name `val_area_class_num_list`.
val_area_class_ratio = [round(i / len(multi_val_df), 3) for i in val_area_class_list]

print('train',train_area_class_ratio,'\n')
print('val',val_area_class_ratio)

train [0.005, 0.103, 0.391, 0.501] 

val [0.005, 0.115, 0.377, 0.503]


- train과 val의 box의 class 분포, 이미지 내 box의 평균 크기 분포, 이미지 내 box의 개수 분포 모두 비슷함을 알 수 있다.

- json으로 추출

In [279]:
json_file = data.copy()

# Select images by their 'id' field instead of using the id as a list position:
# positional lookup is only correct while ids are exactly 0..N-1 in file order.
train_image_ids = set(train_idx_list)
val_image_ids = set(val_idx_list)
new_data_images_train = [img for img in json_file['images'] if img['id'] in train_image_ids]
new_data_images_val = [img for img in json_file['images'] if img['id'] in val_image_ids]
print(len(new_data_images_train), len(new_data_images_val))

3907 976


In [280]:
# Route every annotation to the split that owns its image.
# Sets give O(1) membership tests; the previous lookups rescanned the whole
# id collection for each of the annotations.
train_image_ids = set(train_idx_list)
val_image_ids = set(val_idx_list)
new_ann_train = [ann for ann in json_file['annotations'] if ann['image_id'] in train_image_ids]
new_ann_val = [ann for ann in json_file['annotations'] if ann['image_id'] in val_image_ids]

print(len(new_ann_train), len(new_ann_val))

18425 4719


In [283]:
# Sanity check: the total number of annotations should equal train + val.
len(json_file['annotations'])  # 18425 + 4719

23144

In [284]:
# Shallow copies of the source JSON with only 'images' / 'annotations' swapped out.
json_train = {**json_file, 'images': new_data_images_train, 'annotations': new_ann_train}
json_val = {**json_file, 'images': new_data_images_val, 'annotations': new_ann_val}

for split_json in (json_train, json_val):
    print(len(split_json['images']), len(split_json['annotations']))

3907 18425
976 4719


In [285]:
## Write both splits to disk as JSON files

import json

for out_path, payload in (('train_MultiStfKFold.json', json_train),
                          ('val_MultiStfKFold.json', json_val)):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file)

### 하위 33% 크기의 bbox를 가지고 있는 image 추출

- 작은 bbox에 대한 mAP가 매우 낮기에, 작은 bbox에 대해 fine tuning 위해

In [2]:
import json
import numpy as np
import pandas as pd

# Training split produced by the multilabel stratified k-fold step.
annotation = 'dataset/train_MultiStfKFold.json'

with open(annotation) as src:
    multi_json = json.load(src)

In [6]:
pd.options.display.float_format = '{:.5f}'.format
df = pd.DataFrame(multi_json['annotations'])
# Very close to the area distribution of the original (train + val) data.
df['area'].describe()

count     18425.00000
mean      93327.34619
std      136631.82471
min           0.56000
25%       10063.68000
50%       39062.06000
75%      119405.10000
max     1048371.21000
Name: area, dtype: float64

In [10]:
# bbox area at the 33% position of the ascending area ranking. The row count
# is taken from the frame itself instead of the hard-coded 18425.
df.area.sort_values().reset_index().loc[int(len(df) * 0.33)]

index   12889.00000
area    16305.26000
Name: 6080, dtype: float64

In [15]:
# Annotations whose area lies below the 33% cut-off found above.
is_small_box = df['area'] < 16306
small_box_df = df[is_small_box]
# Number of images containing at least one such small bbox
# (about 37% of the train images).
small_box_df['image_id'].nunique()

1469

In [28]:
# Unique ids of the images that contain at least one small bbox.
small_box_image_id_list = list(set(small_box_df.image_id.to_list()))
# A set gives O(1) membership tests instead of rescanning the list per image.
small_box_image_ids = set(small_box_image_id_list)
small_box_images = [img for img in multi_json['images'] if img['id'] in small_box_image_ids]
len(small_box_images)

1469

In [34]:
# Keep every annotation (not only the small ones) of the selected images.
# A set gives O(1) membership tests instead of rescanning the list per annotation.
small_box_image_ids = set(small_box_image_id_list)
small_box_annotations = [
    ann for ann in multi_json['annotations'] if ann['image_id'] in small_box_image_ids
]
len(small_box_annotations)

14144

In [38]:
# Shallow copy of the split JSON, restricted to the small-bbox images and their annotations.
multi_json_smallbox_train = {
    **multi_json,
    'images': small_box_images,
    'annotations': small_box_annotations,
}

print(len(multi_json_smallbox_train['images']), len(multi_json_smallbox_train['annotations']))

1469 14144


In [39]:
## Write the JSON for images containing a bbox in the smallest third by area

import json

with open('smallbox_train_MultiStfKFold.json', 'w') as out_file:
    json.dump(multi_json_smallbox_train, out_file)

### 5fold OOF를 위한 json 생성

In [42]:
import json
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

# Read the COCO-format training annotations.
annotation = 'dataset/train.json'

with open(annotation) as f:
    data = json.load(f)

# One (image_id, category_id) pair per annotation.
var = [(ann['image_id'], ann['category_id']) for ann in data['annotations']]

X = np.ones((len(data['annotations']), 1))
y = np.array([category_id for _, category_id in var])  # class
groups = np.array([image_id for image_id, _ in var])  # image (group)

In [43]:
import pandas as pd
import numpy as np
# Annotation table restricted to the columns used for the per-image statistics.
wanted_columns = ['image_id', 'category_id', 'area', 'id']
train_df = pd.DataFrame(data['annotations'])[wanted_columns]
train_df.head()

Unnamed: 0,image_id,category_id,area,id
0,0,0,257301.66,0
1,1,3,10402.56,1
2,1,7,26259.36,2
3,1,4,69096.17,3
4,1,5,24164.58,4


In [35]:
# pd.options.display.float_format = '{:.5f}'.format
# train_df.area.apply(lambda x: int(x)).describe()  # bbox의 크기를 0~25, 25~50, 50~75, 75~100 총 4가지로 분류

count     23144.00000
mean      92863.48116
std      135979.14884
min           0.00000
25%        9997.00000
50%       38938.00000
75%      119122.00000
max     1048371.00000
Name: area, dtype: float64

In [44]:
# Per-image statistics used as stratification labels. Every list below is
# ordered by ascending image_id (the groupby order) and covers exactly the
# images that have annotations, instead of assuming ids 0..4882.
annotations_by_image = train_df.groupby('image_id')

# Number of bboxes per image.
bbox_num = annotations_by_image['id'].count().to_list()

# Class of the mean bbox area per image, from 0 (smallest) to 3 (largest).
# The bounds are the 25% / 50% / 75% values of the integer-truncated areas
# shown by describe().
area_bounds = [9997, 38938, 119122]
area_mean_df = annotations_by_image['area'].mean()
# side='left' keeps the "<=" semantics: a mean equal to a bound goes to the lower class.
area_class = np.searchsorted(area_bounds, area_mean_df.to_numpy(), side='left').tolist()

# Most frequent category per image; on a tie, whichever category value_counts lists first.
category_group = annotations_by_image['category_id'].value_counts()
most_class_list = [category_group[image_id].index.to_list()[0] for image_id in area_mean_df.index]

print(len(bbox_num), len(area_class), len(most_class_list))

4883 4883 4883


In [45]:
# One row per annotated image. The image ids come from the groupby index that
# produced the label lists, so the columns stay aligned even when the ids are
# not exactly 0..4882.
multi_label_df = pd.DataFrame({
    'image_id': area_mean_df.index.to_list(),
    'bbox_num': bbox_num,
    'area_class': area_class,
    'most_class_list': most_class_list,
})
multi_label_df.head()

Unnamed: 0,image_id,bbox_num,area_class,most_class_list
0,0,1,3,0
1,1,8,2,0
2,2,1,3,3
3,3,2,2,2
4,4,6,1,1


In [46]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

cv = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=41)
X = np.ones((len(bbox_num), 1))

# Stratify on the number of bboxes and the area class of each image.
strat_labels = multi_label_df[['bbox_num', 'area_class']]
for fold_ind, (train_idx, val_idx) in enumerate(cv.split(multi_label_df, strat_labels)):
    # Show only the first ten row positions of each side.
    print("TRAIN:", [train_idx[:10]])
    print(" TEST:", [val_idx[:10]])

TRAIN: [array([ 1,  3,  4,  5,  7,  8,  9, 10, 11, 12])]
 TEST: [array([ 0,  2,  6, 16, 19, 23, 25, 28, 36, 38])]
TRAIN: [array([ 0,  2,  3,  4,  6,  7,  8,  9, 10, 11])]
 TEST: [array([ 1,  5, 14, 20, 21, 31, 33, 34, 41, 42])]
TRAIN: [array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 10])]
 TEST: [array([ 7, 13, 18, 22, 24, 30, 49, 50, 51, 53])]
TRAIN: [array([ 0,  1,  2,  5,  6,  7,  9, 10, 12, 13])]
 TEST: [array([ 3,  4,  8, 11, 26, 27, 29, 32, 37, 40])]
TRAIN: [array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 11])]
 TEST: [array([ 9, 10, 12, 15, 17, 35, 39, 54, 58, 66])]


In [49]:
from collections import Counter
# Materialise the row positions of every fold so each one can be exported later.
fold_splits = list(cv.split(multi_label_df, multi_label_df[['bbox_num', 'area_class']]))
all_fold_train_list = [list(train_idx) for train_idx, _ in fold_splits]
all_fold_val_list = [list(val_idx) for _, val_idx in fold_splits]

print(len(all_fold_train_list))
print(len(all_fold_val_list))

5
5


In [51]:
# Write one train / val annotation pair per fold for 5-fold OOF training.
# This loop replaces five copy-pasted cells that differed only in the fold
# index and the output file prefix ('1___' .. '5___').
json_file = data.copy()

# The fold lists hold row positions of multi_label_df; translate them to image
# ids so images and annotations are selected by the same key.
fold_image_ids = multi_label_df['image_id'].to_numpy()

for fold_no, (train_idx_list, val_idx_list) in enumerate(
        zip(all_fold_train_list, all_fold_val_list), start=1):
    # Sets give O(1) membership tests while scanning images and annotations.
    train_image_ids = set(fold_image_ids[train_idx_list].tolist())
    val_image_ids = set(fold_image_ids[val_idx_list].tolist())

    new_data_images_train = [img for img in json_file['images'] if img['id'] in train_image_ids]
    new_data_images_val = [img for img in json_file['images'] if img['id'] in val_image_ids]

    new_ann_train = [ann for ann in json_file['annotations'] if ann['image_id'] in train_image_ids]
    new_ann_val = [ann for ann in json_file['annotations'] if ann['image_id'] in val_image_ids]

    # Shallow copies: only 'images' / 'annotations' differ from the source JSON.
    json_train = {**json_file, 'images': new_data_images_train, 'annotations': new_ann_train}
    json_val = {**json_file, 'images': new_data_images_val, 'annotations': new_ann_val}

    with open(f'{fold_no}___train_MultiStfKFold.json', 'w') as t:
        json.dump(json_train, t)

    with open(f'{fold_no}___val_MultiStfKFold.json', 'w') as v:
        json.dump(json_val, v)

### 가로, 세로 중, 한 쪽이 다른 쪽의 6배 이상 큰 경우는 noise로 분류
### bbox area가 1000이하인 작은 bbox는 noise로 분류
### -> 제거한 annotation json 생성

In [53]:
import json

# Read the multilabel-stratified training split.
annotation = './train_MultiStfKFold.json'

with open(annotation) as src:
    multi_json = json.load(src)
multi_json.keys()

dict_keys(['info', 'licenses', 'images', 'categories', 'annotations'])

In [54]:
# Drop tiny boxes: keep only annotations whose area exceeds 1000.
area_over1000_list = [ann for ann in multi_json['annotations'] if ann['area'] > 1000]
# Smallest surviving area, as a check on the threshold.
min(ann['area'] for ann in area_over1000_list)

1000.56

In [55]:
import pandas as pd
area_over1000_df = pd.DataFrame(area_over1000_list)
# Ratio of the third to the fourth bbox entry
# (presumably width / height in COCO [x, y, w, h] layout - confirm).
area_ratio_list = [bbox[2] / bbox[3] for bbox in area_over1000_df['bbox']]
area_over1000_df['area_ratio'] = area_ratio_list
area_over1000_df.head()

Unnamed: 0,image_id,category_id,area,bbox,iscrowd,id,area_ratio
0,0,0,257301.66,"[197.6, 193.7, 547.8, 469.7]",0,0,1.166276
1,2,3,324010.8,"[267.9, 165.2, 631.6, 513.0]",0,9,1.231189
2,3,2,59550.94,"[462.2, 369.4, 233.9, 254.6]",0,10,0.918696
3,3,6,80710.56,"[773.3, 3.0, 188.4, 428.4]",0,11,0.439776
4,4,1,14768.88,"[567.5, 462.2, 165.2, 89.4]",0,12,1.847875


In [56]:
# Keep boxes whose side ratio lies strictly between 0.17 and 6.
# NOTE(review): 0.17 is slightly stricter than 1/6 (~0.1667) - confirm this is intended.
side_ratio = area_over1000_df['area_ratio']
area_over1000_df = area_over1000_df[(side_ratio > 0.17) & (side_ratio < 6)]
area_ratio_filtered_list = area_over1000_df['id'].to_list()

In [57]:
# Fraction of bboxes removed by the two filters.
(18425-17929)/18425 # roughly 2.7% (0.0269) of the bboxes are removed

0.026919945725915877

In [58]:
# Keep the annotations whose id survived both filters.
# A set gives O(1) membership tests; scanning the id list once per annotation
# was quadratic.
kept_annotation_ids = set(area_ratio_filtered_list)
area_ratio_filtered_ann = [
    ann for ann in multi_json['annotations'] if ann['id'] in kept_annotation_ids
]
len(area_ratio_filtered_ann)

17929

In [59]:
# Images that still own at least one annotation after filtering.
new_image_list = list(set(area_over1000_df.image_id.to_list()))
# Set lookup instead of rescanning the list for every image.
images_with_boxes = set(new_image_list)
# Images that lost all of their bboxes; they are removed from the JSON as well.
no_box_image_list = [
    img['id'] for img in multi_json['images'] if img['id'] not in images_with_boxes
]
no_box_image_list

[1075, 1324, 1474, 1672, 1967, 2930, 3782]

In [60]:
# Drop the images that no longer have any bbox.
area_ratio_filtered_image = [
    img for img in multi_json['images'] if img['id'] not in no_box_image_list
]
len(area_ratio_filtered_image)  # expected to be 3900

3900

In [63]:
import json
# Shallow copy of the split JSON with the filtered images / annotations swapped in.
area_ratio_filtered_json = {
    **multi_json,
    'images': area_ratio_filtered_image,
    'annotations': area_ratio_filtered_ann,
}
with open('AreaRatioFilter_train_MultiStfKFold.json', 'w') as out_file:
    json.dump(area_ratio_filtered_json, out_file)

### pseudo labeling 된 test data + 기존 trainset

In [47]:
import pandas as pd
import numpy as np

# Predictions used for pseudo labelling; the preview below shows the columns
# PredictionString and image_id.
pseudo_df = pd.read_csv('output_best.csv')
pseudo_df.head()

Unnamed: 0,PredictionString,image_id
0,7 0.9920914173126221 208.55441284179688 48.766...,test/0000.jpg
1,5 0.9239843487739563 125.11742401123047 2.8685...,test/0001.jpg
2,1 0.9707459211349487 77.59071350097656 270.659...,test/0002.jpg
3,9 0.8658463954925537 149.75205993652344 253.91...,test/0003.jpg
4,1 0.81441330909729 191.17074584960938 237.0220...,test/0004.jpg


In [48]:
import json

# Read the test-set description.
annotation = 'dataset/test.json'

with open(annotation) as src:
    test = json.load(src)

test.keys()

dict_keys(['info', 'licenses', 'images', 'categories', 'annotations'])

In [49]:
# Training split that the pseudo-labelled test images are appended to.
annotation = 'dataset/train_MultiStfKFold.json'

with open(annotation) as src:
    train = json.load(src)

In [50]:
# Convert confident predictions into COCO-style annotation dicts.
IMAGE_ID_OFFSET = 4883         # number of train images; test image ids are shifted past them
FIRST_PSEUDO_ANN_ID = 23144    # number of annotations in train.json; pseudo ids start after them
SCORE_THRESHOLD = 0.5          # keep only predictions with a score above this value
FIELDS_PER_BOX = 6             # each prediction occupies six whitespace-separated tokens

ann_list = []
bbox_id_cumulate = FIRST_PSEUDO_ANN_ID
for _, row in pseudo_df.iterrows():
    pred_str = row['PredictionString']
    # read_csv loads an empty PredictionString as NaN (a float), which has no
    # .split(); skip such rows instead of crashing.
    if not isinstance(pred_str, str):
        continue
    pred_str_list = pred_str.split()

    # 'test/0123.jpg' -> 123, shifted so it cannot collide with a train image id.
    image_id = IMAGE_ID_OFFSET + int(row['image_id'].split('.')[0][5:])

    # Trailing tokens that do not form a complete prediction are ignored.
    usable_tokens = len(pred_str_list) // FIELDS_PER_BOX * FIELDS_PER_BOX
    for start in range(0, usable_tokens, FIELDS_PER_BOX):
        label, score, x_min, y_min, x_max, y_max = pred_str_list[start:start + FIELDS_PER_BOX]
        if float(score) <= SCORE_THRESHOLD:
            continue

        # Tokens 3-6 are treated as two corner points and converted to
        # [x, y, width, height], each rounded to one decimal.
        bbox = [round(float(x_min), 1), round(float(y_min), 1),
                round(float(x_max) - float(x_min), 1),
                round(float(y_max) - float(y_min), 1)]

        ann_list.append({
            'image_id': image_id,
            'category_id': int(label),
            'area': round(bbox[2] * bbox[3], 2),
            'bbox': bbox,
            'iscrowd': 0,
            'id': bbox_id_cumulate,
        })
        bbox_id_cumulate += 1
print(len(ann_list))

13114


In [51]:
# Unique (shifted) ids of the test images that received at least one pseudo label.
ann_df = pd.DataFrame(ann_list)
pseudo_img_id_list = list(set(ann_df['image_id'].to_list()))
print(len(pseudo_img_id_list))

4683


In [52]:
# Shift every test image id by 4883, the same offset used for the pseudo annotations.
# This mutates `test` in place, so running the cell twice shifts the ids twice.
for test_image in test['images']:
    test_image['id'] = test_image['id'] + 4883

In [53]:
# Keep only the test images that own at least one pseudo annotation.
# A set gives O(1) membership tests instead of rescanning the id list per image.
pseudo_img_ids = set(pseudo_img_id_list)
pseudo_img_list = [img for img in test['images'] if img['id'] in pseudo_img_ids]
print(len(pseudo_img_list))

4683


In [54]:
import json
# Train split plus pseudo-labelled test images, written as one COCO JSON.
pseudo_json = train.copy()
pseudo_json.update(
    images=train['images'] + pseudo_img_list,
    annotations=train['annotations'] + ann_list,
)
with open('pseudo+train_coco.json', 'w') as out_file:
    json.dump(pseudo_json, out_file)