In [10]:
"""
여러 개의 submission.csv (image_id, PredictionString 형식, Pascal VOC 좌표)를
WBF / NMS / Soft-NMS / NMW 로 앙상블해 새로운 csv를 만드는 스크립트.

필수 라이브러리:
    pip install pandas numpy ensemble-boxes
"""

import pandas as pd
import numpy as np
from ensemble_boxes import (
    nms,
    soft_nms,
    non_maximum_weighted,
    weighted_boxes_fusion,
)

# ---------------------------------------------------------------------
# 0. 설정 부분: 여기만 네 환경에 맞게 수정하면 됨
# ---------------------------------------------------------------------

# (1) Input submission files (add as many as needed).
SUB_PATHS = [
    # "/data/ephemeral/home/pro-cv-objectdetection-cv-11-sj/lsj/ensemble_mj_nms.csv",
    "/data/ephemeral/home/pro-cv-objectdetection-cv-11-sj/lsj/ensemble_wbf.csv",
    "/data/ephemeral/home/pro-cv-objectdetection-cv-11-sj/submission_yolov11l_dataaug-rev3.csv",
]

# (2) Weight given to each model (must have the same length as SUB_PATHS).
WEIGHTS_ALL = [
    1.0,
    0.98,
    # 0.85,
    # 1.0,
    # 1.0,
    # 0.8,
    # 1.2,
]

# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would silently disable this sanity check.
if len(SUB_PATHS) != len(WEIGHTS_ALL):
    raise ValueError("SUB_PATHS 개수와 WEIGHTS_ALL 길이가 다릅니다!")

# (3) Image resolution used to normalize / de-normalize box coordinates
#     (must match the dataset resolution).
IMG_WIDTH = 1024
IMG_HEIGHT = 1024

# (4) Ensemble method to use: one of "wbf", "nms", "soft_nms", "nmw".
METHOD = "wbf"

# Fail fast on a typo in METHOD, before any file is loaded or processed.
_SUPPORTED_METHODS = ("wbf", "nms", "soft_nms", "nmw")
if METHOD.lower() not in _SUPPORTED_METHODS:
    raise ValueError(f"Unknown method: {METHOD}")

# (5) NMS / WBF / Soft-NMS parameters.
IOU_THR = 0.5        # IoU threshold
SKIP_BOX_THR = 0.0   # score threshold (0 keeps every box)
SIGMA = 0.5          # only used by Soft-NMS

# (6) Output file name.
OUTPUT_PATH = f"ensemble_augyolo_{METHOD}.csv"


# ---------------------------------------------------------------------
# 1. PredictionString 파싱 / 생성 함수
# ---------------------------------------------------------------------
def parse_prediction_string(pred_str, *, img_width=None, img_height=None):
    """Parse a PredictionString into normalized boxes, scores and labels.

    The input is a whitespace-separated sequence of 6-token groups:
    ``label score xmin ymin xmax ymax``. Coordinates are divided by the image
    width/height; the results are not clipped, so out-of-image pixel
    coordinates yield values outside the 0-1 range.

    Args:
        pred_str: Cell value of a PredictionString column. ``None``, a float
            NaN, and empty / whitespace-only strings mean "no detections".
        img_width: Width used to normalize x coordinates. When ``None``
            (default) the module-level ``IMG_WIDTH`` is used.
        img_height: Height used to normalize y coordinates. When ``None``
            (default) the module-level ``IMG_HEIGHT`` is used.

    Returns:
        A ``(boxes, scores, labels)`` tuple of three parallel lists:
        ``boxes`` holds ``[x1, y1, x2, y2]`` normalized coordinates,
        ``scores`` holds floats and ``labels`` holds ints.

    Raises:
        ValueError: If the number of tokens is not a multiple of 6, if a token
            cannot be converted to a number, or if the image size is not
            positive.
    """
    # Missing values: None, NaN (as produced by pandas), or blank strings.
    if pred_str is None:
        return [], [], []
    if isinstance(pred_str, float) and np.isnan(pred_str):
        return [], [], []

    pred_str = str(pred_str).strip()
    if pred_str == "":
        return [], [], []

    parts = pred_str.split()
    if len(parts) % 6 != 0:
        raise ValueError(
            f"PredictionString 길이가 6의 배수가 아님: {len(parts)} / 내용 앞부분: {pred_str[:80]}"
        )

    # Resolve the image size lazily so that explicit arguments never require
    # the module-level constants to exist.
    width = IMG_WIDTH if img_width is None else img_width
    height = IMG_HEIGHT if img_height is None else img_height
    if width <= 0 or height <= 0:
        raise ValueError(f"Image size must be positive, got {width}x{height}")

    boxes = []
    scores = []
    labels = []

    # Walk the tokens in groups of six: label, score, xmin, ymin, xmax, ymax.
    for start in range(0, len(parts), 6):
        label, score, xmin, ymin, xmax, ymax = parts[start:start + 6]

        # Labels may be serialized as "3" or "3.0"; go through float first.
        labels.append(int(float(label)))
        scores.append(float(score))
        boxes.append(
            [
                float(xmin) / width,
                float(ymin) / height,
                float(xmax) / width,
                float(ymax) / height,
            ]
        )

    return boxes, scores, labels


def to_prediction_string(
    boxes,
    scores,
    labels,
    score_thr=0.0,
    *,
    img_width=None,
    img_height=None,
):
    """Serialize normalized detections back into a PredictionString.

    Each kept detection becomes six space-separated tokens,
    ``label score xmin ymin xmax ymax``, where the coordinates are the
    normalized box values multiplied by the image width/height. The label is
    written as an integer and every other value with six decimal places.

    Args:
        boxes: Sequence of ``[x1, y1, x2, y2]`` normalized boxes.
        scores: Sequence of confidence scores, parallel to ``boxes``.
        labels: Sequence of class labels, parallel to ``boxes``.
        score_thr: Detections with ``score < score_thr`` are dropped.
        img_width: Width used to scale x coordinates. When ``None`` (default)
            the module-level ``IMG_WIDTH`` is used.
        img_height: Height used to scale y coordinates. When ``None``
            (default) the module-level ``IMG_HEIGHT`` is used.

    Returns:
        The PredictionString; an empty string when no detection is kept.
    """
    width = IMG_WIDTH if img_width is None else img_width
    height = IMG_HEIGHT if img_height is None else img_height

    parts = []

    for box, score, label in zip(boxes, scores, labels):
        if score < score_thr:
            continue

        parts.extend(
            [
                str(int(label)),              # label
                f"{float(score):.6f}",        # score
                f"{box[0] * width:.6f}",      # xmin
                f"{box[1] * height:.6f}",     # ymin
                f"{box[2] * width:.6f}",      # xmax
                f"{box[3] * height:.6f}",     # ymax
            ]
        )

    return " ".join(parts)


# ---------------------------------------------------------------------
# 2. 앙상블 방법 선택 래퍼
# ---------------------------------------------------------------------
def ensemble_with_method(
    boxes_list,
    scores_list,
    labels_list,
    weights=None,
    method="wbf",
    iou_thr=0.5,
    skip_box_thr=0.0,
    sigma=0.1,
):
    """Fuse per-model detections with the selected ensemble-boxes routine.

    Args:
        boxes_list, scores_list, labels_list: One entry per model, each entry
            holding that model's boxes / scores / labels. They are forwarded
            unchanged to the ensemble-boxes function.
        weights: Per-model weights, forwarded to every method.
        method: Case-insensitive name of the routine: "wbf", "nms",
            "soft_nms" or "nmw".
        iou_thr: IoU threshold, forwarded to every method.
        skip_box_thr: Score threshold. Passed as ``skip_box_thr`` to WBF and
            NMW and as ``thresh`` to Soft-NMS; not used by plain NMS.
        sigma: Only forwarded to Soft-NMS.

    Returns:
        The ``(boxes, scores, labels)`` triple produced by the chosen routine.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    method = method.lower()

    # Arguments that every routine receives in the same way.
    detections = (boxes_list, scores_list, labels_list)
    shared = {"weights": weights, "iou_thr": iou_thr}

    if method == "wbf":
        fused = weighted_boxes_fusion(
            *detections, skip_box_thr=skip_box_thr, **shared
        )
    elif method == "nmw":
        fused = non_maximum_weighted(
            *detections, skip_box_thr=skip_box_thr, **shared
        )
    elif method == "soft_nms":
        fused = soft_nms(
            *detections, sigma=sigma, thresh=skip_box_thr, **shared
        )
    elif method == "nms":
        fused = nms(*detections, **shared)
    else:
        raise ValueError(f"Unknown method: {method}")

    boxes, scores, labels = fused
    return boxes, scores, labels


# ---------------------------------------------------------------------
# 3. 여러 submission.csv 읽어서 image_id 기준 병합
# ---------------------------------------------------------------------
print("Loading submissions...")

# Fail early with a clear message instead of a TypeError on `len(None)` below.
if not SUB_PATHS:
    raise ValueError("SUB_PATHS is empty: at least one submission file is required")

df = None
pred_cols = []  # name of each model's PredictionString column inside `df`

for idx, path in enumerate(SUB_PATHS):
    sub = pd.read_csv(path)
    colname = f"pred{idx}"

    # Report duplicated image_id rows (debugging aid).
    dup_cnt = sub["image_id"].duplicated().sum()
    print(f"[{path}] rows={len(sub)}, duplicated image_id={dup_cnt}")

    # Keep only the two columns the ensemble needs. Any extra column (e.g. a
    # saved index) would otherwise be carried into the merge and could clash
    # with same-named columns of the other submissions.
    sub = sub[["image_id", "PredictionString"]].rename(
        columns={"PredictionString": colname}
    )

    # Duplicated ids would multiply rows in the merge (every left duplicate is
    # paired with every right duplicate); keep the first occurrence only.
    if dup_cnt:
        sub = sub.drop_duplicates(subset="image_id", keep="first")

    if df is None:
        df = sub
    else:
        # Outer join: an image missing from one submission is kept, with NaN
        # in that model's prediction column.
        df = pd.merge(df, sub, on="image_id", how="outer")

    pred_cols.append(colname)

# NaN -> empty string, so missing predictions parse as "no detections".
for c in pred_cols:
    df[c] = df[c].fillna("")

print("Total images after merge:", len(df))


# ---------------------------------------------------------------------
# 4. 이미지별로 앙상블 적용
# ---------------------------------------------------------------------
ensemble_preds = []  # one PredictionString per row of `df`, in row order

# Count rows by position rather than trusting the DataFrame index labels, so
# the progress report stays correct whatever index `df` carries.
for position, (_, row) in enumerate(df.iterrows(), start=1):
    boxes_list = []
    scores_list = []
    labels_list = []
    weights_used = []

    # Collect the detections of every model that produced at least one box
    # for this image, together with that model's weight.
    for m_idx, col in enumerate(pred_cols):
        model_boxes, model_scores, model_labels = parse_prediction_string(row[col])

        if len(model_boxes) > 0:
            boxes_list.append(model_boxes)
            scores_list.append(model_scores)
            labels_list.append(model_labels)
            weights_used.append(WEIGHTS_ALL[m_idx])

    if len(boxes_list) == 0:
        # No model produced a box for this image: empty PredictionString.
        pred_str = ""
    else:
        if len(boxes_list) == 1:
            # Only one model has boxes: use them as-is (no fusion is run and
            # the model weight is not applied).
            boxes = boxes_list[0]
            scores = scores_list[0]
            labels = labels_list[0]
        else:
            boxes, scores, labels = ensemble_with_method(
                boxes_list,
                scores_list,
                labels_list,
                weights=weights_used,
                method=METHOD,
                iou_thr=IOU_THR,
                skip_box_thr=SKIP_BOX_THR,
                sigma=SIGMA,
            )

        pred_str = to_prediction_string(
            boxes,
            scores,
            labels,
            score_thr=SKIP_BOX_THR,
        )

    ensemble_preds.append(pred_str)

    # Reported for every 500th image, including images without any box
    # (previously skipped by an early `continue`).
    if position % 500 == 0:
        print(f"Processed {position} / {len(df)} images")


# ---------------------------------------------------------------------
# 5. 최종 DataFrame 만들고 저장 (중복 image_id 제거 포함)
# ---------------------------------------------------------------------
# Output table: PredictionString first, then image_id (same column order as
# the dictionary below).
df_ens = pd.DataFrame(
    {"PredictionString": ensemble_preds, "image_id": df["image_id"]}
)

# Safety net: if an image_id appears more than once, keep its first row only.
before = len(df_ens)
is_first_occurrence = ~df_ens.duplicated(subset="image_id", keep="first")
df_ens = df_ens[is_first_occurrence]
after = len(df_ens)
print(f"Drop duplicates: {before} -> {after}")

# Sort by image_id (optional) and renumber the rows.
df_ens = df_ens.sort_values("image_id")
df_ens = df_ens.reset_index(drop=True)

df_ens.to_csv(OUTPUT_PATH, index=False)
print(f"Saved: {OUTPUT_PATH}")


Loading submissions...
[/data/ephemeral/home/pro-cv-objectdetection-cv-11-sj/lsj/ensemble_wbf.csv] rows=4871, duplicated image_id=0
[/data/ephemeral/home/pro-cv-objectdetection-cv-11-sj/submission_yolov11l_dataaug-rev3.csv] rows=4871, duplicated image_id=0
Total images after merge: 4871




Processed 500 / 4871 images
Processed 1000 / 4871 images
Processed 1500 / 4871 images
Processed 2000 / 4871 images
Processed 2500 / 4871 images
Processed 3000 / 4871 images
Processed 3500 / 4871 images
Processed 4000 / 4871 images
Processed 4500 / 4871 images
Drop duplicates: 4871 -> 4871
Saved: ensemble_augyolo_wbf.csv
