# 앙상블
submission.csv 파일들을 앙상블 해주는 코드

In [None]:
!pip install ensemble_boxes

In [None]:
import numpy as np
import pandas as pd

from ensemble_boxes import *
from pycocotools.coco import COCO
from tqdm import tqdm

In [None]:
# Submission CSV files to ensemble.
files = [
    'fold-tj.csv',
    'fold-gh.csv',
    'fold-gun.csv',
    'fold-kh.csv',
    'fold-yr.csv',
]

dfs = [pd.read_csv(file) for file in files]

# Check that every file holds predictions for the same image set, in the
# same order (the ensemble step pairs predictions by row position).
# Comparing each file with the first one is enough, so the redundant
# all-pairs and self comparisons are skipped.  Plain lists are compared so
# that files with a different number of rows are reported through this
# message rather than failing inside the pandas Series comparison.
reference_ids = dfs[0]['image_id'].tolist()
for file, df in zip(files[1:], dfs[1:]):
    assert df['image_id'].tolist() == reference_ids, (
        f'다른 데이터셋에 대한 CSV 파일이 섞인 것 같음: {file}'
    )

In [None]:
# Box coordinates are divided by this value before fusion and multiplied
# back afterwards, so the fusion routine receives scaled coordinates.
IMAGE_SIZE = 1024

rows = []

for i in tqdm(range(len(dfs[0]))):
    boxes_list = []
    scores_list = []
    labels_list = []

    for df in dfs:
        # Select the two fields by name rather than by position so this
        # cell keeps working when the frames carry additional columns
        # (the box-count analysis adds 'cnt*' columns to these frames).
        preds, image_id = df.iloc[i][['PredictionString', 'image_id']]

        # Rows are expected to be ordered test/0000.jpg, test/0001.jpg, ...
        assert image_id == f'test/{i:04d}.jpg'

        if isinstance(preds, float):
            # A float here is the NaN pandas produces for an empty
            # PredictionString cell, i.e. no detections for this image.
            boxes = []
            scores = []
            labels = []
        else:
            # The string is a flat, space-separated sequence of 6-value
            # records: label, score, then four box coordinates.
            data = np.array(preds.split()).reshape(-1, 6)

            labels = data[:, 0].astype(int)
            scores = data[:, 1].astype(float)
            boxes = data[:, 2:].astype(float) / IMAGE_SIZE

        boxes_list.append(boxes)
        scores_list.append(scores)
        labels_list.append(labels)

    # Run the ensemble (choose among NMS, Soft-NMS, NMW, WBF).
    # https://github.com/ZFTurbo/Weighted-Boxes-Fusion
    boxes_e, scores_e, labels_e = weighted_boxes_fusion(boxes_list, scores_list, labels_list)

    # Serialise the fused boxes back into the submission format.  Every
    # record ends with a space, exactly as the inputs are laid out.
    prediction_string = ''.join(
        f'{int(label)} {score} '
        f'{box[0]*IMAGE_SIZE} {box[1]*IMAGE_SIZE} {box[2]*IMAGE_SIZE} {box[3]*IMAGE_SIZE} '
        for box, score, label in zip(boxes_e, scores_e, labels_e)
    )

    # One single-row frame per image, indexed by its image id; the rows are
    # concatenated into the final table afterwards.
    row = pd.Series({
        'PredictionString': prediction_string,
        'image_id': image_id
    }).to_frame(name=image_id).T

    rows.append(row)

In [None]:
# Stack the per-image rows into one table and save it.  The index only
# repeats the 'image_id' column, so it is not written: the output then has
# the same two columns (PredictionString, image_id) as the input files and
# can itself be fed back into this notebook.
df_ensemble = pd.concat(rows)
df_ensemble.to_csv('ensemble-5fold.csv', index=False)

df_ensemble.head()

# 박스 개수 분석

In [None]:
# Per-image statistics added to every frame: total number of boxes and the
# number of boxes in each size bucket.
COUNT_COLUMNS = ['cnt', 'cnt_small', 'cnt_medium', 'cnt_large']
# Area thresholds (in squared pixels) separating the size buckets.  Both
# bounds are inclusive, so a box whose area equals a threshold exactly is
# counted in the two neighbouring buckets.
SMALL_MAX_AREA = 32**2
LARGE_MIN_AREA = 96**2

all_files = files + ['ensemble']
all_dfs = dfs + [df_ensemble]
cnt_list = []

for file, df in tqdm(zip(all_files, all_dfs), total=len(all_files)):
    counts = []
    for preds in df['PredictionString']:
        if isinstance(preds, float):
            # NaN (empty cell in the CSV): the image has no boxes.
            counts.append((0, 0, 0, 0))
            continue

        # Flat, space-separated 6-value records; the last four values of
        # each record are the box corners x, y, X, Y.
        data = np.array(preds.split()).reshape(-1, 6)
        x, y, X, Y = data[:, 2:].astype(float).T
        area = (X - x) * (Y - y)

        counts.append((
            len(area),
            np.count_nonzero(area <= SMALL_MAX_AREA),
            np.count_nonzero((SMALL_MAX_AREA <= area) & (area <= LARGE_MIN_AREA)),
            np.count_nonzero(area >= LARGE_MIN_AREA),
        ))

    # Store the counts as whole float columns in one go instead of
    # enlarging the frame cell by cell; the reshape keeps the table
    # well-formed even when the frame has no rows.
    count_table = np.array(counts, dtype=float).reshape(-1, len(COUNT_COLUMNS))
    for column, values in zip(COUNT_COLUMNS, count_table.T):
        df[column] = values

    # One summary row per file: the mean of each count over all images.
    cnt_list.append(df[COUNT_COLUMNS].mean().to_frame(name=file).T)

pd.concat(cnt_list)