In [16]:
import os
import sys
from copy import deepcopy
from glob import glob
from tqdm import tqdm

import pandas as pd
import numpy as np
import json

In [17]:
def save_json(path: str, f: object) -> None:
    with open(path, "w", encoding='utf-8') as json_path:
        json.dump(
            f,
            json_path,
            indent=2,
            ensure_ascii=False
        )
        
def load_json(path: str) -> dict:
    with open(path, "r", encoding="utf-8") as json_file:
        output = json.load(json_file)
    return output

def get_label_table(slot_meta):
    inference_table = {s:[] for s in slot_meta}
    return inference_table

def convert_state_dict(state: list) -> dict: # [도메인-슬릇-밸류] -> {도메인-슬릇: 밸류}
    state_dict = dict()
    for s in state:
        dom, slot, val = s.split('-')
        dom_slot = '-'.join([dom, slot])
        state_dict[dom_slot] = val
    return state_dict

In [18]:
FOLD_DIR = '../output'
SLOT_META_PATH = '../input/data/train_dataset/slot_meta.json'
slot_meta = load_json(SLOT_META_PATH)

fold_outputs = []
'''
for fpath in range(1, 6):
    fold_outputs.append(load_json(f"{FOLD_DIR}/kfold-{fpath}-predictions.csv"))
'''

for i in [1, 2, 3, 4, 5]:
    fold_outputs.append(load_json(f"{FOLD_DIR}/new_processed_kfold_{i}-predictions.csv"))
for i in [1, 3]:
    fold_outputs.append(load_json(f"{FOLD_DIR}/new_processed_kfold_{i}-coco-predictions.csv"))
dialogue_ids = list(fold_outputs[0].keys())

In [19]:
# inference
output_dict = dict()

for name in tqdm(dialogue_ids):
    label_table = get_label_table(slot_meta)
    for fold in fold_outputs:
        pred_dict = convert_state_dict(fold[name])

        for dom_slot in slot_meta:
            pred_val = pred_dict.get(dom_slot, 'none')
            label_table[dom_slot].append(pred_val)
    output_dict[name] = deepcopy(label_table)

100%|██████████| 14771/14771 [00:06<00:00, 2193.67it/s]


In [20]:
# hard voting
output_hardvoted = dict()

for name in tqdm(dialogue_ids):
    hard_voted_outputs = []
    for dom_slot in slot_meta:
        hard_voted_val = (
            pd.Series(output_dict[name][dom_slot])
            .value_counts(ascending=False)
            .index[0]
            )

        if hard_voted_val != 'none':
            dom_slot_val = '-'.join([dom_slot, hard_voted_val])
            hard_voted_outputs.append(dom_slot_val)
    
    output_hardvoted[name] = deepcopy(hard_voted_outputs)

100%|██████████| 14771/14771 [05:52<00:00, 41.95it/s]


In [21]:
save_json('../output/normal5-minimal2-ensemble.csv', output_hardvoted)