# 앙상블
submission.csv 파일들을 앙상블 해주는 코드

In [None]:
!pip install ensemble_boxes

In [1]:
import numpy as np
import pandas as pd

from ensemble_boxes import *
from pycocotools.coco import COCO
from tqdm import tqdm

In [2]:
# 앙상블할 파일 목록
files = [
    '~/me.csv',
    '~/yera.csv',
    '~/tj.csv',
]

dfs = [pd.read_csv(file) for file in files]

# 같은 이미지 Set에 대한 예측인지 검사
for df1 in dfs:
    for df2 in dfs:
        assert all(df1['image_id'] == df2['image_id']), '다른 데이터셋에 대한 CSV 파일이 섞인 것 같음'

In [3]:
rows = []

for i in tqdm(range(len(dfs[0]))):
    boxes_list = []
    scores_list = []
    labels_list = []
    
    for df in dfs:
        preds, image_id = df.iloc[i]
        
        assert image_id == f'test/{i:04d}.jpg'
        
        if preds == float('nan'):
            boxes = []
            scores = []
            labels = []
        else:
            L = preds.split()
            data = np.array(L).reshape(-1, 6)

            labels = data[:, 0].astype(int)
            scores = data[:, 1].astype(float)
            boxes = data[:, 2:].astype(float) / 1024.
        
        boxes_list.append(boxes)
        scores_list.append(scores)
        labels_list.append(labels)
    
    # 앙상블 진행 (NMS, Soft-NMS, NMW, WBF 중 선택)
    # https://github.com/ZFTurbo/Weighted-Boxes-Fusion
    
    # 사용방법
    
    # boxes, scores, labels = nms(boxes_list, scores_list, labels_list, weights=weights, iou_thr=iou_thr)
    # boxes, scores, labels = soft_nms(boxes_list, scores_list, labels_list, weights=weights, iou_thr=iou_thr, sigma=sigma, thresh=skip_box_thr)
    # boxes, scores, labels = non_maximum_weighted(boxes_list, scores_list, labels_list, weights=weights, iou_thr=iou_thr, skip_box_thr=skip_box_thr)
    # boxes, scores, labels = weighted_boxes_fusion(boxes_list, scores_list, labels_list, weights=weights, iou_thr=iou_thr, skip_box_thr=skip_box_thr)
    
    boxes_e, scores_e, labels_e = weighted_boxes_fusion(boxes_list, scores_list, labels_list)
    
    
    PredictionString = ''
    for box, score, label in zip(boxes_e, scores_e, labels_e):
        PredictionString += f'{int(label)} {score} {box[0]*1024} {box[1]*1024} {box[2]*1024} {box[3]*1024} '
    row = pd.Series({
        'PredictionString': PredictionString,
        'image_id': image_id
    }).to_frame().T
    
    rows.append(row)

100%|██████████| 4871/4871 [00:22<00:00, 214.45it/s]


In [4]:
df_ensemble = pd.concat(rows)
df_ensemble.to_csv('ensemble.csv')

df_ensemble.head()

Unnamed: 0,PredictionString,image_id
0,7 0.9828125834465027 214.46697998046875 47.993...,test/0000.jpg
0,5 0.6562874913215637 338.6947937011719 248.465...,test/0001.jpg
0,1 0.9643809795379639 73.37809753417969 277.484...,test/0002.jpg
0,9 0.9629031419754028 132.81578063964844 268.68...,test/0003.jpg
0,1 0.9649724960327148 190.38409423828125 268.07...,test/0004.jpg


# 박스 개수 분석

In [5]:
all_files = files + ['ensemble']
all_dfs = dfs + [df_ensemble]
cnt_list = []

for file, df in tqdm(zip(all_files, all_dfs), total=len(all_files)):
    for i in range(len(df)):
        preds, image_id = df.iloc[i][['PredictionString', 'image_id']]
        idx = df.iloc[i].name

        L = preds.split()
        data = np.array(L).reshape(-1, 6)

        bboxes = data[:, 2:].astype(float)
        x, y, X, Y = bboxes.T
        w = X - x
        h = Y - y
        area = w*h
        
        # assert (x >= 0).all()
        # assert (X <= 1024).all()
        # assert (y >= 0).all()
        # assert (Y <= 1024).all()

        df.loc[idx, 'cnt'] = len(bboxes)
        df.loc[idx, 'cnt_small'] = np.where(area <= 32**2)[0].size
        df.loc[idx, 'cnt_medium'] = np.where((32**2 <= area) & (area <= 96**2))[0].size
        df.loc[idx, 'cnt_large'] = np.where(area >= 96**2)[0].size
    
    cnt_list.append(df.iloc[:, 2:].mean().to_frame(name=file).T)
    
pd.concat(cnt_list)

100%|██████████| 4/4 [00:31<00:00,  7.91s/it]


Unnamed: 0,cnt,cnt_small,cnt_medium,cnt_large
~/me.csv,11.917881,0.236707,3.316157,8.365017
~/yera.csv,18.343256,0.105317,5.191336,13.046602
~/tj.csv,38.099158,0.888113,14.347157,22.863888
ensemble,13.0,0.0,1.0,12.0
