In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [2]:
def csv_ensemble(csv_paths, save_dir, height=2048, width=2048):
    """Ensemble RLE-encoded segmentation masks from several CSVs by majority vote.

    Every CSV must contain the columns ``rle``, ``class`` and ``image_name``.
    Rows are combined purely by position: row ``i`` of each CSV is voted
    together, and the ``class`` / ``image_name`` written to the output are
    taken from the first CSV. Only the first ``min(len(csv))`` rows are used.
    NOTE(review): nothing here verifies that row ``i`` describes the same
    (image, class) pair in every CSV -- confirm the inputs share row order.

    A pixel is set in the result when strictly more than half of the CSVs
    mark it. A missing (NaN) ``rle`` entry counts as an all-zero mask.

    Args:
        csv_paths: Iterable of paths to the prediction CSVs to combine.
        save_dir: Path of the output CSV *file* (despite the name). Its parent
            directory is created when the path has one.
        height: Mask height in pixels used to decode every RLE.
        width: Mask width in pixels used to decode every RLE.

    Raises:
        ValueError: If no CSV was given or a CSV lacks a required column.
    """
    def decode_rle_to_mask(rle, height, width):
        # Restore a 0/1 mask from a "start length start length ..." RLE string.
        # Start positions are 1-based indices into the flattened mask.
        tokens = rle.split()
        starts = np.asarray(tokens[0::2], dtype=int) - 1
        lengths = np.asarray(tokens[1::2], dtype=int)
        img = np.zeros(height * width, dtype=np.uint8)

        for lo, hi in zip(starts, starts + lengths):
            img[lo:hi] = 1

        return img.reshape(height, width)

    def encode_mask_to_rle(mask):
        # Encode a 0/1 mask as RLE. Padding with a zero on both ends makes
        # runs that touch the first/last pixel produce a start and an end.
        pixels = np.concatenate([[0], mask.flatten(), [0]])
        runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
        runs[1::2] -= runs[::2]  # turn run end positions into run lengths
        return ' '.join(str(x) for x in runs)

    # Load the CSVs and validate their schema.
    required_columns = ['rle', 'class', 'image_name']
    csv_data = []
    for path in csv_paths:
        data = pd.read_csv(path)

        if not all(col in data.columns for col in required_columns):
            raise ValueError(f"CSV file at {path} is missing required columns: {required_columns}")

        csv_data.append(data)

    if not csv_data:
        raise ValueError("csv_paths is empty: at least one CSV file is required")

    # Use the shortest CSV as the common row count.
    csv_column = min(len(data) for data in csv_data)
    print(f"Using {csv_column} rows based on the shortest CSV.")

    file_num = len(csv_data)
    majority = file_num // 2  # a pixel needs strictly more votes than this

    # Pull the columns out once instead of doing a row lookup per iteration.
    rle_columns = [data['rle'].tolist() for data in csv_data]
    # Keep class and image name as separate lists: joining them with "_" and
    # splitting again corrupts any value that itself contains an underscore.
    classes = [str(c) for c in csv_data[0]['class'].tolist()[:csv_column]]
    image_name = [
        os.path.basename(str(f))
        for f in csv_data[0]['image_name'].tolist()[:csv_column]
    ]

    rles = []
    for index in tqdm(range(csv_column)):
        votes = np.zeros((height, width), dtype=np.uint16)
        for column in rle_columns:
            rle = column[index]
            # A NaN / float entry means "no prediction": contributes no votes.
            if pd.isna(rle) or isinstance(rle, float):
                continue
            votes += decode_rle_to_mask(rle, height, width)

        # Majority voting, then encode the ensembled mask back to RLE.
        rles.append(encode_mask_to_rle((votes > majority).astype(np.uint8)))

    # Save the ensembled predictions.
    df = pd.DataFrame({
        "image_name": image_name,
        "class": classes,
        "rle": rles,
    })

    # dirname is '' for a bare filename, and os.makedirs('') raises.
    out_dir = os.path.dirname(save_dir)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(save_dir, index=False)
    print(f"Ensembled CSV saved at {save_dir}")

In [3]:
# Prediction CSVs to ensemble, one per model. These are absolute,
# machine-specific paths; adjust them before running elsewhere.
csv_paths = [
    '/Users/kimbohyun/Desktop/tools/hard_1/deeplabv3+++.csv',
    '/Users/kimbohyun/Desktop/tools/hard_1/hard_3.csv',
    '/Users/kimbohyun/Desktop/tools/hard_1/HRNet.csv',
]
# Keep the individual names available alongside the list.
csv_path1, csv_path2, csv_path3 = csv_paths

# Output file for the ensembled predictions (a file path, not a directory).
save_dir = './ensemble.csv'

In [4]:
# Run the ensemble over the configured CSVs and write the result to save_dir.
csv_ensemble(csv_paths=csv_paths, save_dir=save_dir)

Using 8352 rows based on the shortest CSV.


100%|██████████| 8352/8352 [03:13<00:00, 43.07it/s]


Ensembled CSV saved at ./ensemble.csv
