In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

#######################################################################
# Per-fold prediction CSVs that take part in the hard vote, listed in fold order.
# The first entry is also used as the template for the output file.
fold_csv_paths = [
    '/opt/ml/result/hardvoting_result/fold0_9732.csv',
    '/opt/ml/result/hardvoting_result/fold1_9729.csv',
    '/opt/ml/result/hardvoting_result/fold2_9718.csv',
    '/opt/ml/result/hardvoting_result/fold3_9718.csv',
    '/opt/ml/result/hardvoting_result/fold4_9723.csv',
]

# One DataFrame per fold; the individual names are kept because later cells use df1.
df_list = [pd.read_csv(csv_path) for csv_path in fold_csv_paths]
df1, df2, df3, df4, df5 = df_list
#######################################################################

In [None]:
# Encode an inference result given as a mask map into RLE. (mask map -> RLE)
def encode_mask_to_rle(mask):
    '''
    Run-length encode a binary mask.

    mask: numpy array binary mask
        1 - mask
        0 - background

    Returns a space-separated string of "start length" pairs, where start is
    the 1-based position of a run in the flattened mask. A mask with no
    foreground yields an empty string.
    '''
    flat = mask.flatten()
    # Pad both ends with background so runs touching the borders still produce
    # a start and an end transition.
    padded = np.concatenate([[0], flat, [0]])
    # 1-based positions where the value changes: even slots are run starts,
    # odd slots are the positions just past each run's end.
    boundaries = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Turn every end position into a run length.
    boundaries[1::2] -= boundaries[::2]
    return ' '.join(map(str, boundaries))

# Restore an RLE-encoded result back into a mask map. (RLE -> mask map)
def decode_rle_to_mask(rle, height, width):
    '''
    Decode a run-length string into a binary mask.

    rle: space-separated "start length" pairs with 1-based starts into the
        flattened mask. None or NaN (what pandas yields for an empty CSV
        cell) is treated as "no foreground".
    height, width: shape of the mask to rebuild.

    Returns a (height, width) uint8 numpy array with 1 inside the runs and
    0 elsewhere.
    '''
    img = np.zeros(height * width, dtype=np.uint8)

    # A missing RLE would otherwise be stringified to 'None'/'nan' and fail
    # the integer conversion below, so return the empty mask directly.
    if rle is None or (isinstance(rle, float) and np.isnan(rle)):
        return img.reshape(height, width)

    tokens = str(rle).split()
    # Even tokens are 1-based starts (shifted to 0-based), odd tokens are lengths.
    starts = np.asarray(tokens[0::2], dtype=int) - 1
    lengths = np.asarray(tokens[1::2], dtype=int)
    ends = starts + lengths

    for lo, hi in zip(starts, ends):
        img[lo:hi] = 1

    return img.reshape(height, width)

In [4]:
# Vote cut-off: a pixel is kept only when strictly more than `threshold`
# predictions mark it (e.g. with 5 predictions, more than 2 must agree).
threshold = len(df_list) // 2
# threshold = 1                 # use this line instead to set a custom threshold

def process_data(i, height=2048, width=2048):
    '''
    Hard-vote row i across every DataFrame in df_list.

    Each non-missing RLE in row i is decoded to a (height, width) mask and the
    masks are summed; a pixel is kept when its vote count is strictly greater
    than the module-level `threshold`.

    i: row label to process (the same row is read from every frame).
    height, width: mask size, defaulting to the 2048x2048 images used here.

    Returns the RLE string of the voted mask. The same string is also stored
    in df_list[0] at row i. Note that when this runs inside a multiprocessing
    worker, that write only affects the worker's own copy of the frame, so
    callers in another process must use the return value.
    '''
    votes = np.zeros((height, width), dtype=int)

    for df in df_list:
        rle = df.rle[i]

        # An empty prediction is read from CSV as NaN: it casts no votes.
        if pd.isna(rle):
            continue

        votes += decode_rle_to_mask(rle, height, width)

    voted_mask = np.where(votes > threshold, 1, 0)
    encoded = encode_mask_to_rle(voted_mask)

    # Single-step .at write instead of chained `df.rle[i] = ...`, which pandas
    # does not guarantee to reach the parent frame.
    df_list[0].at[i, 'rle'] = encoded
    return encoded

In [5]:
num_processes = cpu_count()


def _vote_row(i):
    '''
    Run process_data(i) and hand the voted RLE back to the calling process.

    process_data stores its result in df_list[0].rle[i], but inside a Pool
    worker that is the worker's private copy of the frame. The value is
    therefore read back here and returned explicitly, together with its row.
    '''
    process_data(i)
    return i, df_list[0].rle[i]


# Results are gathered in the parent process into a copy of the first frame.
# Writing only inside the workers would leave the parent's frame untouched,
# and the saved CSV would just repeat the first fold's predictions.
hardvote_df = df_list[0].copy()
n_rows = len(hardvote_df)

with Pool(num_processes) as pool, tqdm(total=n_rows) as pbar:
    # Completion order is arbitrary, so each result carries its own row label.
    for i, rle in pool.imap_unordered(_vote_row, range(n_rows)):
        hardvote_df.at[i, 'rle'] = rle
        pbar.update()

# Save the voted predictions as CSV
hardvote_df.to_csv("hardvote_result.csv", index=False)

100%|██████████| 8700/8700 [03:42<00:00, 39.17it/s]
