In [None]:
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import yaml
import argparse
from tqdm import tqdm
import numpy as np
import pandas as pd

from pathlib import Path
from collections import Counter
from scipy import stats
import math
import os

from scipy import stats


# Files under model_result are not published, to prevent data leakage.

# True  -> ensemble the validation-set predictions (scored in the next cell).
# False -> ensemble the test-set predictions (written to a submission file).
validation = True
if validation:
    output = pd.read_csv("NLP_dataset/sample_val_submission.csv")
    csvpath = "model_result/outEDA_result"
else:
    output = pd.read_csv("NLP_dataset/sample_submission.csv")
    csvpath = "model_result/output"

# Per-model prediction frames, keyed by model name (the CSV file stem).
model_df = {}

# Only real *.csv files are loaded, so stray entries in the directory (for
# example a `.ipynb_checkpoints` folder) cannot break `read_csv`. Sorting makes
# the model order, and therefore the downstream summation order, reproducible.
for p in sorted(Path(csvpath).glob('*.csv')):
    if validation:
        eda_df = pd.read_csv(p)
        if len(eda_df) != len(output):
            # Column assignment aligns on the index; a length mismatch would
            # silently produce NaN targets instead of failing.
            raise ValueError(
                f"{p.name}: expected {len(output)} rows, found {len(eda_df)}"
            )
        df = output.copy()
        # Clamp raw predictions to the [0, 5] score range, keep one decimal.
        df['target'] = eda_df['pred'].clip(0, 5).round(1)
    else:
        df = pd.read_csv(p)
    model_df[p.stem] = df

# One group of models per integer score bucket: entry i supplies column i of
# `pred_aver`, which the weighted vote indexes by floored score.
label_models_list = [['wrMSE_1_e12', 'fine_tuning_mecab_e8', 'klue-roberta-large_last'],
                     ['fine_tuning_mecab_e8', 'klue-roberta-large_last', 'MLM2STS_last'],
                     ['klue-roberta-large_last', 'wrMSE_2_e9', 'NLI2STS_e7'],
                     ['fine_tuning_mecab_e8', 'klue-roberta-large_last', 'MLM2STS_last'],
                     ['wrMSE_4_e12', 'klue-roberta-large_last', 'NLI2STS_e7']]

# Fail early, naming every model whose prediction file was not found.
missing_models = sorted(
    {m for models in label_models_list for m in models} - set(model_df)
)
if missing_models:
    raise KeyError(
        f"No prediction file in '{csvpath}' for model(s): {missing_models}"
    )

# Average only the `target` column of each group's models. Averaging the whole
# frames would pull every other column into the mean: a non-numeric column
# raises TypeError on pandas >= 2.0, and an extra numeric one would skew it.
pred_aver = pd.concat(
    [
        pd.concat([model_df[m]['target'] for m in models], axis=1).mean(axis='columns')
        for models in label_models_list
    ],
    axis=1,
)

# Column label == position of the model group in `label_models_list`.
pred_aver.columns = list(range(len(label_models_list)))

In [None]:

# False -> plain mean of every model's `target`.
# True  -> each model votes for an integer score bucket; the row's prediction
#          is the vote-weighted mean of that row's `pred_aver` bucket columns.
weighted_vote = False

if not model_df:
    raise ValueError("model_df is empty: there are no model predictions to ensemble")

# Pull each model's `target` column out once instead of doing a scalar `.loc`
# lookup per row and per model. Rows are addressed by the labels
# 0..len(output)-1, so a frame lacking any of them still raises KeyError.
row_ids = pd.RangeIndex(len(output))
per_model_targets = [
    df.loc[row_ids, 'target'].to_numpy(dtype=float) for df in model_df.values()
]
n_models = len(per_model_targets)

if weighted_vote:
    n_bins = pred_aver.shape[1]
    bin_means = pred_aver.loc[row_ids, list(range(n_bins))].to_numpy(dtype=float)

    final_preds = []
    for row, row_targets in enumerate(zip(*per_model_targets)):
        # The 0.999 factor makes a score of exactly 5 fall into the top bucket.
        # The result is clamped to the existing columns because test-mode
        # targets are not clipped on load and could otherwise index a bucket
        # that does not exist.
        label_counter = Counter(
            min(max(math.floor(t * 0.999), 0), n_bins - 1) for t in row_targets
        )
        weighted_sum = sum(v * bin_means[row, k] for k, v in label_counter.items())
        final_preds.append(weighted_sum / n_models)
else:
    # Built-in sum adds the model columns left to right, element-wise.
    final_preds = list(sum(per_model_targets) / n_models)

# Clamp to the valid score range and keep one decimal, in both modes.
final_preds = [round(np.clip(p, 0, 5), 1) for p in final_preds]
output['target'] = final_preds

if validation:
    val_label = pd.read_csv("NLP_dataset/han_processed_dev.csv")
    print('pearson :', stats.pearsonr(final_preds, val_label['label']))
else:
    output.to_csv("output.csv", index=False)