In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from load_data import *
import pandas as pd
import torch
import torch.nn.functional as F
import pickle as pickle
import numpy as np
from tqdm import tqdm
import argparse
from omegaconf import OmegaConf
import random

In [8]:
def inference(model, tokenized_sent, device, batch_size=16):
    """
    Run the model over a dataset in mini-batches and collect its predictions.

    Args:
        model: Classification model. It is switched to eval mode and called as
            ``model(**batch)``. Its return value must either expose a
            ``.logits`` attribute or be a sequence whose second element holds
            the logits (e.g. ``(loss, logits, ...)``).
        tokenized_sent: Dataset whose items are dicts of tensors. It is wrapped
            in a ``DataLoader`` without shuffling, so results keep input order.
        device: Device every tensor of a batch is moved to before the forward pass.
        batch_size: Number of samples per forward pass (default 16).

    Returns:
        Tuple ``(predictions, probabilities)``: a list with the argmax class
        index per sample and a list with the softmax probabilities per sample.
        Both lists are empty when the dataset yields no batches.
    """
    dataloader = DataLoader(tokenized_sent, batch_size=batch_size, shuffle=False)
    model.eval()
    output_pred = []
    output_prob = []
    for data in tqdm(dataloader):
        data = {k: v.to(device) for k, v in data.items()}
        with torch.no_grad():
            outputs = model(**data)

        # Prefer the named field: positional unpacking of (loss, logits) only
        # works when the batch carried labels and the model returned a loss.
        logits = getattr(outputs, 'logits', None)
        if logits is None:
            _, logits = outputs[:2]

        output_prob.append(F.softmax(logits, dim=-1).detach().cpu().numpy())
        output_pred.append(np.argmax(logits.detach().cpu().numpy(), axis=-1))

    # np.concatenate raises on an empty list, so handle "no batches" explicitly.
    if not output_pred:
        return [], []

    return np.concatenate(output_pred).tolist(), np.concatenate(output_prob, axis=0).tolist()

def num_to_label(cfg, label):
    """
    Convert numeric class indices back to their original string labels.

    The index -> label dictionary is unpickled from the file path stored in
    ``cfg.test.num_to_label``; each entry of ``label`` is looked up in it and
    the resulting strings are returned as a list in the same order.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading, so
    the configured path should only ever point at a trusted file.
    """
    with open(cfg.test.num_to_label, 'rb') as mapping_file:
        index_to_name = pickle.load(mapping_file)

    return [index_to_name[class_idx] for class_idx in label]

def load_test_dataset(dataset_dir):
    """
    Load the test set and pull out the columns the inference pipeline needs.

    No tokenization happens here; the frame returned by ``load_data`` is passed
    through unchanged.

    Args:
        dataset_dir: Path handed to ``load_data``.

    Returns:
        Tuple ``(ids, dataset, labels, subject_types)``:
        the ``'id'`` column, the full loaded frame, the ``'label'`` column as a
        list of ints, and the ``'type'`` field of every ``'subject_entity'``
        value as a list.

    Raises:
        ValueError / SyntaxError: if a ``'subject_entity'`` cell is not a plain
            Python literal.
    """
    import ast  # local import keeps this block self-contained

    test_dataset = load_data(dataset_dir)
    test_label = [int(value) for value in test_dataset['label'].values]
    # subject_entity cells are stringified dicts read from a data file.
    # ast.literal_eval parses literals only, whereas eval would execute any
    # expression that happens to be stored in the file.
    test_sub_type = [
        ast.literal_eval(entity)['type'] for entity in test_dataset['subject_entity']
    ]

    return test_dataset['id'], test_dataset, test_label, test_sub_type

def double_check(output, sub_type, tokenizer):
    """
    Build a dataset from the rows of ``output`` with a given subject type.

    Args:
        output: DataFrame with at least ``'id'`` and ``'sub_type'`` columns.
        sub_type: Value of ``'sub_type'`` to keep (the caller passes
            ``'PER'`` or ``'ORG'``).
        tokenizer: Tokenizer forwarded to ``RE_Dataset``.

    Returns:
        Tuple ``(ids, dataset)``: the ``'id'`` column of the selected rows and
        the ``RE_Dataset`` built from them.
    """
    type_mask = output['sub_type'] == sub_type
    selected = output[type_mask]
    # Row positions 0..n-1 are passed as the label argument
    # (presumably placeholders - confirm against RE_Dataset).
    positions = [*range(len(selected))]
    dataset = RE_Dataset(selected, positions, tokenizer)
    return selected['id'], dataset

def test(cfg):
    """
    Two-stage inference: a binary relation/no_relation model first, then a
    PER-specific or ORG-specific model on the rows predicted as 'relation'.

    Args:
        cfg: Config object. Reads ``cfg.model.bin_plm``, ``cfg.model.binary_model``,
            ``cfg.model.sec_plm``, ``cfg.model.per_model``, ``cfg.model.saved_model``
            and ``cfg.data.test_data``; ``cfg`` is also forwarded to the
            ``*_num_to_label`` helpers.

    Returns:
        DataFrame with ``'id'``, ``'pred_label'``, ``'probs'`` (and ``'sub_type'``
        for the no_relation rows), sorted by the original row index.

    Raises:
        pandas.errors.MergeError: if ``'id'`` is not unique in the test set,
            because the merged rows could then no longer be aligned with the
            first-stage predictions.

    NOTE(review): ``bin_num_to_label``, ``per_num_to_label`` and
    ``org_num_to_label`` are not defined in the visible part of this file;
    presumably they come from ``from load_data import *`` - confirm.
    """
    import warnings

    ## Device
    # device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    device = torch.device('cpu')

    ## load Model & Tokenizer
    bin_tokenizer = AutoTokenizer.from_pretrained(cfg.model.bin_plm)
    bin_model = AutoModelForSequenceClassification.from_pretrained(cfg.model.binary_model)
    bin_model.to(device)

    per_tokenizer = AutoTokenizer.from_pretrained(cfg.model.sec_plm)
    per_model = AutoModelForSequenceClassification.from_pretrained(cfg.model.per_model)
    per_model.to(device)

    org_tokenizer = AutoTokenizer.from_pretrained(cfg.model.sec_plm)
    org_model = AutoModelForSequenceClassification.from_pretrained(cfg.model.saved_model)
    org_model.to(device)

    ## load test dataset
    test_dataset_dir = cfg.data.test_data
    test_id, test_dataset, test_label, test_sub_type = load_test_dataset(test_dataset_dir)
    Re_test_dataset = RE_Dataset(test_dataset, test_label, bin_tokenizer)

    ## predict answer ## DO NOT CHANGE ##
    pred_answer, output_prob = inference(bin_model, Re_test_dataset, device)  # first-stage class inference
    pred_answer = bin_num_to_label(cfg, pred_answer)  # numeric class -> string label

    ## split the first-stage predictions
    output = pd.DataFrame({'id': test_id, 'pred_label': pred_answer, 'probs': output_prob, 'sub_type': test_sub_type})
    no_rel_output = output[output['pred_label'] == 'no_relation']
    rel_output = output[output['pred_label'] == 'relation']

    # Rows routed to neither second-stage model never reach the final frame;
    # say so instead of dropping them silently.
    unrouted = ~rel_output['sub_type'].isin(['PER', 'ORG'])
    if unrouted.any():
        warnings.warn(
            f"{int(unrouted.sum())} row(s) predicted as 'relation' have a subject type "
            "other than PER/ORG and are omitted from the output"
        )

    # Attach the original test columns to the 'relation' rows so the
    # second-stage datasets can be built from them. merge() resets the index,
    # so the original row index is restored for the final sort_index();
    # validate= guarantees one merged row per prediction row.
    rel_index = rel_output.index
    rel_output = pd.merge(rel_output, test_dataset, on='id', how='left', validate='many_to_one')
    rel_output.index = rel_index

    # second-stage datasets, one per subject type
    per_id, RE_PER = double_check(rel_output, 'PER', per_tokenizer)
    org_id, RE_ORG = double_check(rel_output, 'ORG', org_tokenizer)

    per_answer, per_output_prob = inference(per_model, RE_PER, device)  # PER class inference
    per_answer = per_num_to_label(cfg, per_answer)  # numeric class -> string label

    org_answer, org_output_prob = inference(org_model, RE_ORG, device)  # ORG class inference
    org_answer = org_num_to_label(cfg, org_answer)  # numeric class -> string label

    per_output = pd.DataFrame({'id': per_id, 'pred_label': per_answer, 'probs': per_output_prob})
    org_output = pd.DataFrame({'id': org_id, 'pred_label': org_answer, 'probs': org_output_prob})

    output = pd.concat([no_rel_output, per_output, org_output])
    # sort_index returns a new frame; the result has to be kept.
    output = output.sort_index(ascending=True)

    return output
    # output.to_csv(cfg.test.output_csv, index=False)  # save the final predicted labels as a CSV file


In [40]:
# def seed_everything(seed):
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.cuda.manual_seed_all(seed)  # if use multi-GPU
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False
#     np.random.seed(seed)               # even with the seed fixed, calling the function still gave different results..?
#     random.seed(seed)
#     print('lock_all_seed')

# if __name__ == '__main__':
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--config', type=str, default='config-test')
#     args, _ = parser.parse_known_args()
#     cfg = OmegaConf.load(f'./config/{args.config}.yaml')
#     seed_everything(cfg.train.seed)
#     df = test(cfg)


In [3]:
import pandas as pd
# Load a saved prediction CSV for inspection.
# NOTE(review): absolute path - only resolves where /opt/ml/code exists.
tmp = pd.read_csv('/opt/ml/code/prediction/23 new_meta_binary.csv')

In [14]:
import ast

# 'probs' is read from CSV as text; parse each cell and store the list length.
# ast.literal_eval accepts literals only, whereas eval would execute whatever
# expression is stored in the file.
tmp['len'] = tmp['probs'].apply(lambda x: len(ast.literal_eval(x)))


In [18]:
import ast

# Sum of each row's probability list, to check that it is normalized.
# ast.literal_eval accepts literals only, whereas eval would execute whatever
# expression is stored in the file.
tmp['sum'] = tmp['probs'].apply(lambda x: sum(ast.literal_eval(x)))

In [22]:
# Distinct per-row probability sums; the recorded output stays within ~1e-7 of 1.
tmp['sum'].unique()

array([0.99999997, 0.99999998, 0.99999996, ..., 1.00000002, 1.00000009,
       1.        ])

In [23]:
import ast
import pandas as pd

# Load a second prediction file and compute the same sum/length checks.
# NOTE(review): absolute path - only resolves where /opt/ml/code exists.
tmp2 = pd.read_csv('/opt/ml/code/prediction/submission_roberta_base.csv')
# Parse every stringified list once (literal_eval: literals only, no code
# execution) and derive both columns from the parsed values.
tmp2_probs = tmp2['probs'].map(ast.literal_eval)
tmp2['sum'] = tmp2_probs.map(sum)
tmp2['len'] = tmp2_probs.map(len)

In [27]:
# Distinct per-row probability sums for the second file.
tmp2['sum'].unique()

array([1.00000002, 0.99999994, 1.00000012, ..., 0.99999994, 1.00000013,
       1.00000014])

In [28]:
import pandas as pd
# Fresh copy of the same CSV that `tmp` was read from, to be modified below.
tmp3 = pd.read_csv('/opt/ml/code/prediction/23 new_meta_binary.csv')

In [33]:
# Replace every row's probs with the same 30-element vector [1, 0, ..., 0];
# the lambda ignores the original value `x`.
# NOTE(review): this overwrites probs for ALL rows, including rows whose
# pred_label is not 'no_relation' - confirm that is the intended experiment.
tmp3['probs'] = tmp3['probs'].apply(lambda x : [1]+[0 for i in range(29)])
# Saved under a new file name ("24 ..."), leaving the source CSV untouched.
tmp3.to_csv('/opt/ml/code/prediction/24 new_meta_binary.csv',index=False)

In [36]:
# Distinct predicted labels present in the file.
tmp3['pred_label'].unique()

array(['org:members', 'per:alternate_names', 'no_relation',
       'per:employee_of', 'per:spouse', 'org:top_members/employees',
       'org:product', 'org:place_of_headquarters', 'org:member_of',
       'per:place_of_residence', 'per:title', 'per:date_of_birth',
       'per:product', 'per:colleagues', 'per:origin', 'org:founded',
       'org:alternate_names', 'per:place_of_birth',
       'org:number_of_employees/members', 'per:children', 'per:siblings',
       'per:religion', 'per:place_of_death', 'org:founded_by',
       'org:political/religious_affiliation', 'per:date_of_death',
       'per:parents', 'per:other_family', 'per:schools_attended',
       'org:dissolved'], dtype=object)

In [39]:
# Display the second prediction frame together with the derived sum/len columns.
tmp2

Unnamed: 0,id,pred_label,probs,sum,len
0,0,org:product,"[0.0002801950613502413, 7.88332472438924e-05, ...",1.0,30
1,1,per:alternate_names,"[1.4626407391915563e-05, 4.2694207991189614e-0...",1.0,30
2,2,no_relation,"[0.9999440908432007, 2.6342677301727235e-06, 2...",1.0,30
3,3,no_relation,"[0.9999125003814697, 4.017337687400868e-06, 6....",1.0,30
4,4,no_relation,"[0.9998575448989868, 5.0133887270931154e-05, 2...",1.0,30
...,...,...,...,...,...
7760,7760,org:place_of_headquarters,"[0.0006010245415382087, 5.9869475990126375e-06...",1.0,30
7761,7761,no_relation,"[0.9999493360519409, 3.763392669497989e-06, 3....",1.0,30
7762,7762,org:top_members/employees,"[1.8655678104551043e-06, 0.9999780654907227, 7...",1.0,30
7763,7763,no_relation,"[0.9998759031295776, 1.260993599316862e-06, 2....",1.0,30


In [42]:
import pandas as pd
# Reload the "24 ..." file written earlier and look at its id column.
df = pd.read_csv('/opt/ml/code/prediction/24 new_meta_binary.csv')
df['id']

0          0
1          1
2          2
3          3
4          4
        ... 
7759    7760
7760    7761
7761    7762
7762    7763
7763    7764
Name: id, Length: 7764, dtype: int64

In [44]:
# Walk the id column and report the first row whose position no longer
# matches its id, i.e. the first place an id is missing; stop there.
for position, row_id in enumerate(df['id']):
    if position != row_id:
        print('this is error', position)
        break

this is error 6820


In [46]:
# Rows predicted as 'no_relation'.
df[df['pred_label']=='no_relation']

Unnamed: 0,id,pred_label,probs
2,2,no_relation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,no_relation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,no_relation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,5,no_relation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
10,10,no_relation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
7751,7752,no_relation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7754,7755,no_relation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7756,7757,no_relation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7760,7761,no_relation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [47]:
# Load the test-set CSV to look up the row missing from the predictions.
# NOTE(review): absolute path - only resolves where /opt/ml/dataset exists.
gold = pd.read_csv('/opt/ml/dataset/test/test_data.csv')

In [50]:
# Test-set row at position 6820, the position reported by the id check above.
gold.iloc[6820]
# gold.iloc[6819 :6822]

id                                                             6820
sentence          1875년 시카고 빈민가에 교회를 설립했는데, 구두판매원출신다운 쉽고 설득력있는 무...
subject_entity    {'word': '시카고', 'start_idx': 6, 'end_idx': 8, ...
object_entity     {'word': '1875년', 'start_idx': 0, 'end_idx': 4...
label                                                           100
source                                                    wikipedia
Name: 6820, dtype: object

In [63]:
# 30-element vector with 1 in the first slot and 0 everywhere else.
prob = [1] + [0] * 29

# One-row frame for the missing id, labelled 'no_relation'; wrapping `prob`
# in a list makes it the single cell value of the 'probs' column.
missing_row = gold.iloc[6820]
row1 = pd.DataFrame({'id': missing_row['id'], 'pred_label' : 'no_relation', 'probs' : [prob]})
row1

Unnamed: 0,id,pred_label,probs
0,6820,no_relation,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [64]:
# Re-read the "23 ..." prediction file so the patch row is added to a clean copy.
tmp = pd.read_csv('/opt/ml/code/prediction/23 new_meta_binary.csv')

In [67]:
# Append the patch row, restore id order and renumber the index from 0.
df = (
    pd.concat([tmp, row1])
    .sort_values('id')
    .reset_index(drop=True)
)
df

Unnamed: 0,id,pred_label,probs
0,0,org:members,"[0, 0.00642776396125555, 0.6780948042869568, 0..."
1,1,per:alternate_names,"[0, 0, 0, 0, 0.14000281691551208, 0, 0.0008271..."
2,2,no_relation,"[0.9674349427223206, 0.03256501629948616, 0, 0..."
3,3,no_relation,"[0.8527429699897766, 0.1472570300102234, 0, 0,..."
4,4,no_relation,"[0.5491185784339905, 0.4508814513683319, 0, 0,..."
...,...,...,...
7760,7760,org:place_of_headquarters,"[0, 0.0014581563882529736, 0.00566781731322407..."
7761,7761,no_relation,"[0.5846818089485168, 0.41531816124916077, 0, 0..."
7762,7762,org:top_members/employees,"[0, 0.9987155199050903, 0.00014345147064886987..."
7763,7763,per:date_of_death,"[0, 0, 0, 0, 0.0007162904948927462, 0, 0.00076..."


In [68]:
# Save the patched predictions under a new file name, without the index column.
df.to_csv('/opt/ml/code/prediction/25 new_meta_binary_cf.csv',index=False)