In [5]:
import json

def get_preds(path):
    """Load one prediction file and return its decoded JSON content.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON document. For the n-best files loaded in this
        notebook that is a dict mapping a question id to a list of candidate
        answer dicts (``start_logit``, ``end_logit``, ``text``, ``probability``).
    """
    # The answer texts are Korean, so decode explicitly as UTF-8 instead of
    # relying on whatever the platform's default locale encoding happens to be.
    with open(path, encoding="utf-8") as f:
        return json.load(f)  # dict

# n-best prediction files that will be combined by the ensemble below.
paths = [
    "/opt/ml/outputs/preds/backup/no1_xlm_roberta/nbest_predictions.json",
    "/opt/ml/outputs/preds/backup/no4_concat_korquad1_chckpt4413_top20/nbest_predictions.json",
]

# One decoded prediction dict per entry of `paths`, in the same order.
preds = [get_preds(path) for path in paths]


In [6]:
from pprint import pprint

def pred_summary(pred: dict):
    """Print a short preview of a prediction dict followed by its sizes.

    The preview shows the first question id together with (at most) its first
    two candidate answers. Afterwards the number of questions and the number
    of candidates stored for that first question are printed.

    Args:
        pred: Mapping of question id -> list of candidate answers.
    """
    if not pred:
        # No entry to preview; report zero sizes instead of failing on an
        # unbound loop variable.
        print("질문 개수:", 0)
        print("질문 당 예측 개수:", 0)
        return

    # Only the first entry is previewed.
    id0, value_list = next(iter(pred.items()))
    # Slicing (rather than indexing [0] and [1]) tolerates entries that hold
    # fewer than two candidates.
    preview = [f"        {value}," for value in value_list[:2]]
    print_list = ["{", f"    {id0}: [", *preview, "        ...", "    ]},", "...", "}", ""]
    for elt in print_list:
        print(elt)

    print("질문 개수:", len(pred))
    print("질문 당 예측 개수:", len(value_list))
    
# Preview the predictions loaded from the first entry of `paths`.
pred_summary(preds[0])


{
    mrc-1-000653: [
        {'start_logit': -1.940676212310791, 'end_logit': -2.005676507949829, 'text': '지구', 'probability': 0.3954400420188904},
        {'start_logit': -2.849112033843994, 'end_logit': -2.2359774112701416, 'text': '니비루(행성X)', 'probability': 0.1266290843486786},
        ...
    ]},
...
}

질문 개수: 600
질문 당 예측 개수: 400


In [20]:
from collections import defaultdict
from operator import itemgetter


def _get_text_scores(ans_lists):
    text_scores = defaultdict(float)
    for ans_list in ans_lists:
        for ans in ans_list:
            text_scores[ans["text"]] += ans["probability"]
    return text_scores

def _get_best_text(text_scores):
    return max(text_scores.items(), key=itemgetter(1))[0]

def get_ensemble_result(preds):
    """Pick one answer per question by summing probabilities across models.

    Args:
        preds: Sequence of prediction dicts, each mapping a question id to a
            list of candidate answers (dicts with ``"text"`` and
            ``"probability"``).

    Returns:
        Dict mapping every question id of ``preds[0]`` (in that dict's order)
        to the answer text with the largest summed probability. An empty
        ``preds`` yields an empty dict.
    """
    result = dict()  # dict keeps input sequence order
    if not preds:
        return result

    for q_id in preds[0]:
        # Match candidates by question id, not by position: the prediction
        # dicts are not guaranteed to share key order or even the same key
        # set, and positional pairing would mix answers of different
        # questions. A model without this question simply does not vote.
        ans_lists = [pred[q_id] for pred in preds if q_id in pred]
        text_scores = _get_text_scores(ans_lists)
        result[q_id] = _get_best_text(text_scores)
    return result

# Combine all loaded prediction dicts into one question id -> answer text mapping.
ensemble_result = get_ensemble_result(preds)
# ensemble_result  # uncomment to display the full mapping

In [4]:
from utils import increment_path

def save_json(preds, path):
    """Write `preds` to `path` as indented JSON followed by a newline.

    Args:
        preds: JSON-serialisable object to store.
        path: Destination file; opened in ``"w"`` mode, so existing content
            is replaced.
    """
    # ensure_ascii=False keeps non-ASCII text (e.g. Korean answers) verbatim,
    # so the handle must be UTF-8 regardless of the platform's locale default.
    with open(path, "w", encoding="utf-8") as f:
        f.write(json.dumps(preds, indent=4, ensure_ascii=False) + "\n")

# `increment_path` comes from the project-local `utils` module; presumably it
# returns a not-yet-used output path built from these pieces — TODO confirm.
# NOTE(review): this cell's execution count (4) is lower than that of the cells
# assigning `ensemble_result` (20 and 21), so which ensemble gets written
# depends on the order the cells were run in — re-run top to bottom before saving.
out_path = increment_path("/opt/ml/outputs/preds/", infix="ensemble", name="predictions.json")
save_json(ensemble_result, out_path)

In [21]:
from collections import defaultdict
from operator import itemgetter


def _get_text_counts(ans_lists):
    text_counts = defaultdict(int)
    for ans_list in ans_lists:
        for ans in ans_list:
            text_counts[ans["text"]] += 1
    return text_counts

def _get_text_scores(ans_lists):
    text_scores = defaultdict(float)
    for ans_list in ans_lists:
        for ans in ans_list:
            text_scores[ans["text"]] += ans["probability"]
    return text_scores

def _get_best_text(text_scores):
    return max(text_scores.items(), key=itemgetter(1))[0]

def get_ensemble_result(preds, exp=1.0):
    """Pick one answer per question using count-weighted probability sums.

    Each answer text is scored as
    ``summed_probability * occurrence_count ** exp``; ``exp=0`` therefore
    reduces to plain probability summation.

    Args:
        preds: Sequence of prediction dicts, each mapping a question id to a
            list of candidate answers (dicts with ``"text"`` and
            ``"probability"``).
        exp: Exponent applied to the occurrence count of an answer text.

    Returns:
        Dict mapping every question id of ``preds[0]`` (in that dict's order)
        to the best-scoring answer text. An empty ``preds`` yields an empty
        dict.
    """
    result = dict()  # dict keeps input sequence order
    if not preds:
        return result

    for q_id in preds[0]:
        # Match candidates by question id, not by position: the prediction
        # dicts are not guaranteed to share key order or even the same key
        # set, and positional pairing would mix answers of different
        # questions. A model without this question simply does not vote.
        ans_lists = [pred[q_id] for pred in preds if q_id in pred]
        text_counts = _get_text_counts(ans_lists)
        text_scores = _get_text_scores(ans_lists)
        # Look the count up by text instead of zipping two dicts and trusting
        # that both iterate in the same order.
        text_weighted_scores = {
            text: score * (text_counts[text] ** exp)
            for text, score in text_scores.items()
        }
        result[q_id] = _get_best_text(text_weighted_scores)
    return result

# Run the count-weighted ensemble. This rebinds `ensemble_result`, which an
# earlier cell also assigns, so later cells see whichever assignment ran last.
ensemble_result = get_ensemble_result(preds, exp=1)
# ensemble_result  # uncomment to display the full mapping