In [1]:
import pandas as pd
import pickle
import numpy as np
from collections import defaultdict

In [2]:
# Load the num2label pickle; `n2l` is later indexed by a class index to get
# the predicted label name.
# NOTE(security): pickle.load can execute arbitrary code - only load a file
# that comes from a trusted source.
num2label = "../dict_num_to_label.pkl"
with open(num2label, "rb") as f:
    n2l = pickle.load(f)

In [3]:
# File paths: one prediction CSV per model, all stored under `abs_path`.
abs_path = "~/dataset/weighted_voting"
model_paths = [
    f"{abs_path}/{stem}.csv"
    for stem in (
        "google_rembert",
        "klue_bert_base",
        "klue_roberta_large",
        "kobart",
        "korquad",
        "mluke",
    )
]

In [4]:
# Relation label name -> class index. The index is the label's position in
# the list below and is used to index into a row's probability list.
labels_dict = {
    label: index
    for index, label in enumerate([
        'no_relation',
        'org:top_members/employees',
        'org:members',
        'org:product',
        'per:title',
        'org:alternate_names',
        'per:employee_of',
        'org:place_of_headquarters',
        'per:product',
        'org:number_of_employees/members',
        'per:children',
        'per:place_of_residence',
        'per:alternate_names',
        'per:other_family',
        'per:colleagues',
        'per:origin',
        'per:siblings',
        'per:spouse',
        'org:founded',
        'org:political/religious_affiliation',
        'org:member_of',
        'per:parents',
        'org:dissolved',
        'per:schools_attended',
        'per:date_of_death',
        'per:date_of_birth',
        'per:place_of_birth',
        'per:place_of_death',
        'org:founded_by',
        'per:religion',
    ])
}

In [5]:
def get_picked_probs(model_path):
    """Group the probability a model assigned to its own prediction by label.

    Reads the CSV at ``model_path`` and, for every row, takes the entry of
    the row's ``probs`` list that belongs to the row's ``pred_label``
    (looked up through ``labels_dict``).

    Args:
        model_path: path of a CSV with ``pred_label`` and ``probs`` columns,
            where ``probs`` holds a stringified list of class probabilities.

    Returns:
        ``defaultdict(list)`` mapping each predicted label to the list of
        probabilities the model gave that label when it predicted it.
    """
    predictions = pd.read_csv(model_path)
    picked_by_label = defaultdict(list)

    for label, raw_probs in zip(predictions["pred_label"], predictions["probs"]):
        # NOTE(security): eval executes whatever the CSV cell contains; only
        # use this on prediction files you produced yourself.
        class_probs = eval(raw_probs)
        picked_by_label[label].append(class_probs[labels_dict[label]])

    return picked_by_label

def get_probs_stats(model_paths):
    """Compute per-model, per-label mean and std of the picked probabilities.

    For every model CSV, ``get_picked_probs`` yields the probabilities the
    model assigned to the labels it predicted; this function reduces each
    label's list to its mean and its population standard deviation
    (``np.std`` default, ddof=0).

    Labels a model never predicted keep mean 0 and std 0. A label predicted
    exactly once also has std 0. Callers that divide by these values must
    handle the zeros.

    Args:
        model_paths: iterable of CSV paths, one per model.

    Returns:
        Tuple ``(total_probs_mean, total_probs_std)`` of arrays with one row
        per model and one column per class index of ``labels_dict``.
    """
    # Size the statistics from the label mapping instead of a hard-coded 30,
    # so the arrays cannot drift out of sync with labels_dict.
    n_labels = max(labels_dict.values()) + 1

    total_probs_mean, total_probs_std = [], []
    for model_path in model_paths:
        probs_picked_mean = np.zeros(n_labels)
        probs_picked_std = np.zeros(n_labels)

        for label, picked_probs in get_picked_probs(model_path).items():
            label_idx = labels_dict[label]
            probs_picked_mean[label_idx] = np.mean(picked_probs)
            probs_picked_std[label_idx] = np.std(picked_probs)

        total_probs_mean.append(probs_picked_mean)
        total_probs_std.append(probs_picked_std)

    return np.array(total_probs_mean), np.array(total_probs_std)

In [6]:
def weighted_voting(model_paths, save_name=False):
    """Soft-vote several models' class probabilities with per-label weights.

    The weight of model ``m`` for label ``l`` is built from the statistics
    returned by ``get_probs_stats``:

    * mean factor:  ``sqrt(mean[m, l] / max over models of mean[:, l])``
    * std factor:   ``sqrt(min over labels of std[m, :] / std[m, l])``
    * final weight: ``sqrt(mean factor * std factor)``

    Zero statistics (a label that was never predicted, or predicted only
    once) would make these ratios 0/0 and turn every output row into NaN.
    They are handled explicitly:

    * if no model has a positive mean for a label, its mean factor is 1;
    * the per-model minimum std is taken over positive stds only, and an
      entry whose std is 0 gets std factor 1 (it is not down-weighted).

    When all statistics are positive the weights equal the plain formulas.

    For every row, each model's probabilities are multiplied by its weights,
    averaged over models and renormalised (1e-7 is added to the denominator).

    Args:
        model_paths: sequence of CSV paths, one per model. Each CSV must
            have a ``probs`` column holding a stringified list of class
            probabilities; row ``i`` of every file is combined together.
        save_name: if truthy, the result is also written to
            ``~/dataset/weighted_voting/<save_name>.csv``.

    Returns:
        DataFrame with columns ``id`` (0..n-1), ``pred_label`` (looked up in
        ``n2l`` by the arg-max class index) and ``probs`` (list of floats).

    Raises:
        ValueError: if ``model_paths`` is empty or the CSVs differ in length.
    """
    if len(model_paths) == 0:
        raise ValueError("model_paths must contain at least one CSV path")

    total_probs_mean, total_probs_std = get_probs_stats(model_paths)

    # Mean factor, normalised per label (column) across models.
    col_max = total_probs_mean.max(axis=0, keepdims=True)
    has_mean = col_max > 0
    weights_mean_col = np.where(
        has_mean,
        np.sqrt(total_probs_mean / np.where(has_mean, col_max, 1.0)),
        1.0,
    )

    # Std factor, normalised per model (row) across labels.
    has_std = total_probs_std > 0
    row_min = np.where(has_std, total_probs_std, np.inf).min(axis=1, keepdims=True)
    weights_std_row = np.where(
        has_std,
        np.sqrt(row_min / np.where(has_std, total_probs_std, 1.0)),
        1.0,
    )

    weights = np.sqrt(weights_mean_col * weights_std_row)

    # Load the per-model prediction (submission) files.
    dfs = [pd.read_csv(path) for path in model_paths]
    n_rows = len(dfs[0])
    if any(len(df) != n_rows for df in dfs):
        # zip() below would silently drop the surplus rows.
        raise ValueError("all prediction CSVs must have the same number of rows")

    probs = []
    for row in zip(*[df["probs"].tolist() for df in dfs]):
        # NOTE(security): eval executes whatever the CSV cell contains; only
        # use this on prediction files you produced yourself.
        model_probs = np.array([eval(p) for p in row], dtype=float)
        combined = (model_probs * weights).sum(axis=0) / len(row)
        # .tolist() stores plain Python floats, so the CSV text does not
        # depend on how the installed numpy prints its scalars.
        probs.append((combined / (combined.sum() + 1e-7)).tolist())

    pred_label = [n2l[int(np.argmax(p))] for p in probs]

    df = pd.DataFrame(
        {
            "id": range(len(pred_label)),
            "pred_label": pred_label,
            "probs": probs,
        },
        columns=["id", "pred_label", "probs"],
    )

    if save_name:
        abs_path = '~/dataset/weighted_voting/'
        df.to_csv(abs_path + save_name + '.csv', index=False)

    return df

In [None]:
def weighted_voting_only_std(model_paths, save_name=False):
    """Soft-vote several models' class probabilities using std-based weights.

    Same procedure as ``weighted_voting`` with the mean-based factor fixed
    at 1, so the weight of model ``m`` for label ``l`` is
    ``sqrt(std factor)`` with

        ``std factor = sqrt(min over labels of std[m, :] / std[m, l])``

    where the statistics come from ``get_probs_stats``.

    A std of 0 (a label never predicted, or predicted only once) would make
    the ratio 0/0 and turn every output row into NaN. The per-model minimum
    is therefore taken over positive stds only, and an entry whose std is 0
    gets std factor 1 (it is not down-weighted).

    For every row, each model's probabilities are multiplied by its weights,
    averaged over models and renormalised (1e-7 is added to the denominator).

    Args:
        model_paths: sequence of CSV paths, one per model. Each CSV must
            have a ``probs`` column holding a stringified list of class
            probabilities; row ``i`` of every file is combined together.
        save_name: if truthy, the result is also written to
            ``~/dataset/weighted_voting/<save_name>.csv``.

    Returns:
        DataFrame with columns ``id`` (0..n-1), ``pred_label`` (looked up in
        ``n2l`` by the arg-max class index) and ``probs`` (list of floats).

    Raises:
        ValueError: if ``model_paths`` is empty or the CSVs differ in length.
    """
    if len(model_paths) == 0:
        raise ValueError("model_paths must contain at least one CSV path")

    # Only the std statistics are needed here.
    _, total_probs_std = get_probs_stats(model_paths)

    # Std factor, normalised per model (row) across labels.
    has_std = total_probs_std > 0
    row_min = np.where(has_std, total_probs_std, np.inf).min(axis=1, keepdims=True)
    weights_std_row = np.where(
        has_std,
        np.sqrt(row_min / np.where(has_std, total_probs_std, 1.0)),
        1.0,
    )

    # No mean factor in this variant (it is treated as 1), so the weight is
    # built from the std factor alone.
    weights = np.sqrt(weights_std_row)

    # Load the per-model prediction (submission) files.
    dfs = [pd.read_csv(path) for path in model_paths]
    n_rows = len(dfs[0])
    if any(len(df) != n_rows for df in dfs):
        # zip() below would silently drop the surplus rows.
        raise ValueError("all prediction CSVs must have the same number of rows")

    probs = []
    for row in zip(*[df["probs"].tolist() for df in dfs]):
        # NOTE(security): eval executes whatever the CSV cell contains; only
        # use this on prediction files you produced yourself.
        model_probs = np.array([eval(p) for p in row], dtype=float)
        combined = (model_probs * weights).sum(axis=0) / len(row)
        # .tolist() stores plain Python floats, so the CSV text does not
        # depend on how the installed numpy prints its scalars.
        probs.append((combined / (combined.sum() + 1e-7)).tolist())

    pred_label = [n2l[int(np.argmax(p))] for p in probs]

    df = pd.DataFrame(
        {
            "id": range(len(pred_label)),
            "pred_label": pred_label,
            "probs": probs,
        },
        columns=["id", "pred_label", "probs"],
    )

    if save_name:
        abs_path = '~/dataset/weighted_voting/'
        df.to_csv(abs_path + save_name + '.csv', index=False)

    return df

In [7]:
# Run the weighted ensemble over `model_paths`. Because `save_name` is given,
# weighted_voting also writes the result to
# ~/dataset/weighted_voting/<save_name>.csv.
weighted_results = weighted_voting(model_paths, save_name='weighted_ensemble_krl_kqrl_gr_ml_kbt_xrl')
weighted_results

Unnamed: 0,id,pred_label,probs
0,0,org:product,"[0.040575259909545365, 0.0027853163271796496, ..."
1,1,per:alternate_names,"[0.011574718017820545, 0.00022999549080844037,..."
2,2,no_relation,"[0.9986053460421489, 4.812570586514864e-05, 4...."
3,3,no_relation,"[0.8256819890155512, 0.00017867665431825784, 0..."
4,4,no_relation,"[0.5856840169035097, 0.41142197873283765, 0.00..."
...,...,...,...
7760,7760,org:place_of_headquarters,"[0.2129247363196598, 0.0008470555908998431, 0...."
7761,7761,no_relation,"[0.9947164993755921, 0.00019684989725659525, 0..."
7762,7762,org:top_members/employees,"[8.080596506751483e-05, 0.9993390313347464, 3...."
7763,7763,no_relation,"[0.9535685397651175, 8.953380125109465e-05, 0...."


In [8]:
# Load another ensemble result from CSV; it is compared against
# `weighted_results` in the cells below.
original_results = pd.read_csv('~/dataset/ensemble_results/ensemble_krl_kqrl_gr_ml_kbt_xrl.csv')
original_results

Unnamed: 0,id,pred_label,probs
0,0,org:product,"[0.031444637066063784, 0.002242747097625397, 0..."
1,1,per:alternate_names,"[0.012074694212060422, 0.00015735178142980052,..."
2,2,no_relation,"[0.999154289563497, 3.333234023254287e-05, 4.3..."
3,3,no_relation,"[0.9784967998663584, 0.00015599028059417228, 0..."
4,4,no_relation,"[0.6537101119029103, 0.3442419117006163, 0.000..."
...,...,...,...
7760,7760,org:place_of_headquarters,"[0.1354187680214333, 0.0006359863315689532, 0...."
7761,7761,no_relation,"[0.9952300488948822, 0.0001755451124457371, 0...."
7762,7762,org:top_members/employees,"[7.197343165898928e-05, 0.999396393696467, 3.1..."
7763,7763,no_relation,"[0.9692200521628062, 7.909001275644793e-05, 0...."


In [9]:
# Number of rows whose predicted label differs between the two results.
sum(weighted_results.pred_label != original_results.pred_label)

345

In [10]:
# Sanity check: the first row's probabilities should sum to ~1 (slightly
# below, because weighted_voting adds 1e-7 to the normalising denominator).
sum(weighted_results.iloc[0].probs)

0.9999998807099932

In [11]:
# Same check for the CSV-loaded result; `probs` comes back from read_csv as a
# string, so it is parsed with eval first (only safe for trusted files).
sum(eval(original_results.iloc[0].probs))

0.9999999622583953