## parquet format 파일 불러와서 확률평균값으로 앙상블
- csv 포맷 대신 parquet 포맷을 사용하여, 파일을 읽는 시간을 단축하였습니다.
- 먼저 `make_parquet.ipynb`를 실행하여, 확률값까지 저장한 parquet 포맷의 파일들이 `ensemble_parquet` 폴더 아래에 위치해야 합니다.
- 그래도 앙상블할 모델 개수가 많아질수록 시간이 오래 걸리니, parts로 나눠서 앙상블하는 것을 추천합니다.

In [None]:
!pip install pyarrow

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as pc
# Collect the prediction files to ensemble, skipping hidden entries
# (names starting with '.'). os.listdir returns names in arbitrary order, so
# the list is sorted to make the run reproducible: the order determines the
# floating-point accumulation order and which file's metadata columns are read
# last.
submission_files = sorted(
    file_name
    for file_name in os.listdir('./ensemble_parquet')
    if not file_name.startswith('.')
)

In [None]:
# Display the collected file names to check which predictions will be ensembled.
submission_files

In [None]:
def decode_rle_to_mask(rle, prob, height, width):
    """Decode a run-length encoding and its probabilities into a 2-D map.

    Args:
        rle: Space-separated ``"start length start length ..."`` string whose
            starts are 1-based positions in the flattened image, or ``None``
            (or a float NaN, as pandas uses for missing cells) when there is
            no prediction.
        prob: Space-separated probability values, one per encoded pixel, in
            the order the pixels are visited by the runs of ``rle``. Extra
            trailing values are ignored.
        height: Number of rows of the output array.
        width: Number of columns of the output array.

    Returns:
        A float64 array of shape ``(height, width)``. Pixels covered by a run
        hold their probability; every other pixel is 0.

    Raises:
        IndexError: If ``prob`` holds fewer values than the runs cover, or a
            run addresses a pixel outside the ``height * width`` image.
    """
    img = np.zeros(height * width, dtype=np.float64)

    # No prediction for this row: return an all-zero map.
    if rle is None or (isinstance(rle, float) and np.isnan(rle)):
        return img.reshape(height, width)

    tokens = rle.split()
    lengths = np.asarray(tokens[1::2], dtype=int)
    # Drop a dangling start without a length, then convert to 0-based offsets.
    starts = np.asarray(tokens[0::2], dtype=int)[:lengths.size] - 1

    total = int(lengths.sum())
    if total == 0:
        return img.reshape(height, width)

    values = prob.split()
    if len(values) < total:
        raise IndexError(
            f"prob has {len(values)} values but rle covers {total} pixels"
        )

    # Build the flat index of every encoded pixel in one shot: for run k that
    # begins at output position offset_k, pixel j maps to start_k + (j - offset_k).
    run_offsets = np.cumsum(lengths) - lengths
    flat_idx = np.repeat(starts - run_offsets, lengths) + np.arange(total)
    img[flat_idx] = np.asarray(values[:total], dtype=np.float64)
    return img.reshape(height, width)
    
def encode_mask_to_rle(mask):
    """Run-length encode a mask as a ``"start length start length ..."`` string.

    The mask is flattened and each run of non-zero pixels is reported by its
    1-based start position and its length. Intended for binary (0/1) masks.

    Args:
        mask: Array to encode.

    Returns:
        Space-separated run description; an empty string for an all-zero mask.
    """
    flat = mask.flatten()
    # Zero padding on both ends makes runs touching the borders produce both a
    # start and an end transition.
    padded = np.concatenate([[0], flat, [0]])
    changed = padded[1:] != padded[:-1]
    boundaries = np.where(changed)[0] + 1
    # Even slots are run starts, odd slots are run ends; turn ends into lengths.
    boundaries[1::2] -= boundaries[::2]
    return ' '.join(map(str, boundaries))

In [None]:
# Output columns of the ensembled result, filled one entry per processed row.
image_name, classes, rles = [], [], []

In [None]:
# NOTE: `vote` is not used by the visible code; it is kept in case another
# cell relies on it.
vote = len(submission_files)//2
num_rows = 29*300  # presumably 29 classes x 300 images -- TODO confirm
for idx in range(num_rows):
    print(idx)
    sum_result = np.zeros((2048, 2048), dtype=np.float64)
    num_valid = 0  # number of files with a non-empty prediction for this row

    for submission in submission_files:
        # NOTE(review): every file is re-read for every row, which dominates
        # the runtime. Caching the tables outside the loop would avoid that
        # but keeps all files in memory at once -- confirm memory budget
        # before changing.
        df = pq.read_table('./ensemble_parquet/'+submission).to_pandas()
        row = df.loc[idx]
        pred = decode_rle_to_mask(row['rle'], row['prob'], height=2048, width=2048)
        if np.max(pred) != 0:  # skip files that have no prediction for this row
            # Accumulate in place instead of buffering every full-size map.
            sum_result += pred
            num_valid += 1

    # Average only over the files that contributed. When none did, the sum is
    # already all zeros, so skip the division to avoid 0/0 (NaN + warning).
    if num_valid:
        sum_result /= num_valid
    result = np.where(sum_result > 0.5, 1, 0)  # keep pixels whose mean probability exceeds 0.5
    rle = encode_mask_to_rle(result)

    # Metadata is taken from the last file read for this row.
    image_name.append(row['image_name'])
    classes.append(row['class'])
    rles.append(rle)

result_df = pd.DataFrame({
    "image_name": image_name,
    "class": classes,
    "rle": rles
})
result_df.to_csv("./ensemble.csv", index=False)

In [None]:
# Display the ensembled result table for a quick visual check.
result_df

## 나눠서 앙상블한 파일 합치기
나눠서 앙상블할 때는 prob____.ipynb를 모두 실행한 후,  
`ensemble_parts2` 폴더(아래 코드가 읽는 경로)에 저장된 파일들을 불러와서 합쳐주는 코드입니다.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as pc
# Names of the partial ensemble results, in sorted order, ignoring hidden
# entries (names starting with '.').
ensemble_files = sorted(
    name for name in os.listdir('./ensemble_parts2') if not name.startswith('.')
)

In [None]:
# Show how many part files were found.
len(ensemble_files)

In [None]:
# Display the sorted part file names to verify the concatenation order.
ensemble_files

In [None]:
# Read every part file and stack them in listing order with a fresh 0..n-1
# index. A single concat avoids re-copying the accumulated frame once per file.
if not ensemble_files:
    raise ValueError("No ensemble part files found in './ensemble_parts2'")
ensemble_df = pd.concat(
    [pd.read_csv('./ensemble_parts2/'+file) for file in ensemble_files],
    ignore_index=True,
)

In [None]:
# Display the concatenated table for a quick visual check.
ensemble_df

In [None]:
# Write the concatenated result to the working directory without the index column.
ensemble_df.to_csv("ensemble_concat.csv", index=False)