In [12]:
import numpy as np
import pandas as pd
import os
from glob import glob 
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import pickle as pickle
import torch
import sklearn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments, RobertaConfig, RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer
#from load_data import *

## Data Load

In [13]:
train = pd.read_csv("../dataset/train/train.csv")

In [39]:
MODEL_NAME = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(tokenizer)

PreTrainedTokenizerFast(name_or_path='klue/bert-base', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [14]:
def preprocessing_dataset(dataset):
    subject_entity = []
    object_entity = []
    for i,j in zip(dataset['subject_entity'], dataset['object_entity']):
        i = i[1:-1].split(',')[0].split(':')[1]
        j = j[1:-1].split(',')[0].split(':')[1]

        subject_entity.append(i)
        object_entity.append(j)
    out_dataset = pd.DataFrame({'id':dataset['id'], 'sentence':dataset['sentence'],'subject_entity':subject_entity,'object_entity':object_entity,'label':dataset['label'],})
    return out_dataset

def load_data(dataset_dir):
    pd_dataset = pd.read_csv(dataset_dir)
    dataset = preprocessing_dataset(pd_dataset)
    return dataset

In [18]:
train_dataset = load_data("../dataset/train/train.csv")
train_dataset[:3]

Unnamed: 0,id,sentence,subject_entity,object_entity,label
0,0,〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey R...,'비틀즈','조지 해리슨',no_relation
1,1,호남이 기반인 바른미래당·대안신당·민주평화당이 우여곡절 끝에 합당해 민생당(가칭)으...,'민주평화당','대안신당',no_relation
2,2,K리그2에서 성적 1위를 달리고 있는 광주FC는 지난 26일 한국프로축구연맹으로부터...,'광주FC','한국프로축구연맹',org:member_of


In [24]:
def label_to_num(label):
    num_label = []
    with open('../code/dict_label_to_num.pkl', 'rb') as f:
        dict_label_to_num = pickle.load(f)
    for v in label:
        num_label.append(dict_label_to_num[v])
    return num_label

train_label = label_to_num(train_dataset['label'].values)
train_label[:10]

[0, 0, 20, 1, 0, 5, 0, 25, 7, 6]

- no_relation -> 0 ... label 을 숫자로 매칭해줌

## Data tokenized

In [None]:
def tokenized_dataset(dataset, tokenizer):
  concat_entity = []
  for e01, e02 in zip(dataset['subject_entity'], dataset['object_entity']):
    temp = ''
    temp = e01 + '[SEP]' + e02
    concat_entity.append(temp)
  tokenized_sentences = tokenizer(
      concat_entity,
      list(dataset['sentence']),
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=256,
      add_special_tokens=True,
      )
  return tokenized_sentences

tokenized_train = tokenized_dataset(train_dataset, tokenizer)

In [36]:
print(tokenized_train)

{'input_ids': tensor([[    2,    11, 29830,  ...,     0,     0,     0],
        [    2,    11,  3772,  ...,     0,     0,     0],
        [    2,    11,  4104,  ...,     0,     0,     0],
        ...,
        [    2,    11, 18272,  ...,     0,     0,     0],
        [    2,    11, 15710,  ...,     0,     0,     0],
        [    2,    11, 15437,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [44]:
# Dataset 구성을 위한 class
# make dataset for pytorch.
class RE_Dataset(torch.utils.data.Dataset):
    def __init__(self, pair_dataset, labels):
        self.pair_dataset = pair_dataset
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.pair_dataset.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)     

In [45]:
RE_train_dataset = RE_Dataset(tokenized_train, train_label)

In [47]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [49]:
model_config =  AutoConfig.from_pretrained(MODEL_NAME)
model_config.num_labels = 30
model_config

BertConfig {
  "architectures": [
    "BertForPretraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    

In [50]:
model =  AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [55]:
model.parameters
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [57]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    save_total_limit=5,              # number of total save model.
    save_steps= 500,                 # model saving step.
    num_train_epochs=20,              # total number of training epochs
    learning_rate= 5e-6, #5e-5,               # learning_rate
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps= 500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,              # log saving step.
    evaluation_strategy='steps', # evaluation strategy to adopt during training
                                # `no`: No evaluation during training.
                                # `steps`: Evaluate every `eval_steps`.
                                # `epoch`: Evaluate every end of epoch.
    eval_steps = 500,            # evaluation step.
    load_best_model_at_end = True 
)

In [60]:
def klue_re_micro_f1(preds, labels):
  label_list = ['no_relation', 'org:top_members/employees', 'org:members',
      'org:product', 'per:title', 'org:alternate_names',
      'per:employee_of', 'org:place_of_headquarters', 'per:product',
      'org:number_of_employees/members', 'per:children',
      'per:place_of_residence', 'per:alternate_names',
      'per:other_family', 'per:colleagues', 'per:origin', 'per:siblings',
      'per:spouse', 'org:founded', 'org:political/religious_affiliation',
      'org:member_of', 'per:parents', 'org:dissolved',
      'per:schools_attended', 'per:date_of_death', 'per:date_of_birth',
      'per:place_of_birth', 'per:place_of_death', 'org:founded_by',
      'per:religion']
  no_relation_label_idx = label_list.index("no_relation")
  label_indices = list(range(len(label_list)))
  label_indices.remove(no_relation_label_idx)
  return sklearn.metrics.f1_score(labels, preds, average="micro", labels=label_indices) * 100.0

def klue_re_auprc(probs, labels):
  labels = np.eye(30)[labels]

  score = np.zeros((30,))
  for c in range(30):
    targets_c = labels.take([c], axis=1).ravel()
    preds_c = probs.take([c], axis=1).ravel()
    precision, recall, _ = sklearn.metrics.precision_recall_curve(targets_c, preds_c)
    score[c] = sklearn.metrics.auc(recall, precision)
  return np.average(score) * 100.0

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  probs = pred.predictions

  # calculate accuracy using sklearn's function
  f1 = klue_re_micro_f1(preds, labels)
  auprc = klue_re_auprc(probs, labels)
  acc = accuracy_score(labels, preds) # 리더보드 평가에는 포함되지 않습니다.

  return {
      'micro f1 score': f1,
      'auprc' : auprc,
      'accuracy': acc,
  }

trainer = Trainer(
  model=model,                         # the instantiated 🤗 Transformers model to be trained
  args=training_args,                  # training arguments, defined above
  train_dataset=RE_train_dataset,         # training dataset
  eval_dataset=RE_train_dataset,             # evaluation dataset
  compute_metrics=compute_metrics         # define metrics function
)


In [None]:
# train code
# trainer.train()
# model.save_pretrained('./best_model')

## inference.py

```
if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  
  # model dir
  parser.add_argument('--model_dir', type=str, default="./best_model")
  args = parser.parse_args()
  print(args)
  main(args)
```

- parser 진행 후 main 문 실행

In [65]:
# parser.add_argument('--model_dir', type=str, default="./best_model") 
import argparse

parser = argparse.ArgumentParser()

parser.add_argument('--model_dir', type=str, default="./best_model") 

# python inference.py --model_dir=./results/checkpoint-500
# 모델 경로 설정하여 model 선택


_StoreAction(option_strings=['--model_dir'], dest='model_dir', nargs=None, const=None, default='./best_model', type=<class 'str'>, choices=None, help=None, metavar=None)

In [68]:
!python ../code/inference.py --model_dir=./results/checkpoint-40500

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Namespace(model_dir='./results/checkpoint-40500')
404 Client Error: Not Found for url: https://huggingface.co/results/checkpoint-40500/resolve/main/config.json
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py", line 524, in get_config_dict
    resolved_config_file = cached_path(
  File "/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py", line 1404, in cached_path
    output_path = get_from_cache(
  File "/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py", line 1575, in get_from_cache
    r.raise_for_status()
  File "/opt/conda/lib/python3.8/site-packages/requests/models.py", line 941, in raise_for_status
    

In [None]:

# 입력받은 인자값 저장
args = parser.parse_args()
print(args)

In [76]:
train_dataset[439:442]

Unnamed: 0,id,sentence,subject_entity,object_entity,label
439,439,뮤지컬 배우 함연지 씨가 '동상이몽'에 출연했다.,'함연지','뮤지컬 배우',per:title
440,440,21세의 앙갱 공작 루이가 지휘하는 프랑스군은 재빠르게 스페인 군의 움직임에 반응하...,'스페인','6,no_relation
441,441,1984년 5월 빌바오와의 코파 델 레이(스페인 국왕컵) 결승에서 마라도나는 집단 ...,'코파 델 레이','스페인',org:place_of_headquarters


In [77]:
train_dataset[13312:13314]

Unnamed: 0,id,sentence,subject_entity,object_entity,label
13312,13312,김영삼 의원 제명 파동은 1979년 9월 29일 민주공화당과 유신정우회에서 신민당 ...,'박정희','민주공화당',no_relation
13313,13313,"얼마 뒤 아브드 엘 크림은 아누알(Annual)의 스페인 보병대를 급습하여 단 3,...",'스페인','8,no_relation


In [81]:
train[13313:13314]

Unnamed: 0,id,sentence,subject_entity,object_entity,label,source
13313,13313,"얼마 뒤 아브드 엘 크림은 아누알(Annual)의 스페인 보병대를 급습하여 단 3,...","{'word': '스페인', 'start_idx': 73, 'end_idx': 75...","{'word': '8,000명', 'start_idx': 65, 'end_idx':...",no_relation,wikipedia


In [82]:
!pip install datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting datasets
  Downloading datasets-1.12.1-py3-none-any.whl (270 kB)
[K     |████████████████████████████████| 270 kB 14.0 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.9.0-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 90.8 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp38-cp38-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 70.2 MB/s 
[?25hCollecting tqdm>=4.62.1
  Downloading tqdm-4.62.3-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 9.4 MB/s 
Collecting dill
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 11.

In [1]:
from datasets import load_dataset
klue_re = load_dataset("load_klue_re.py")
klue_re

FileNotFoundError: Couldn't find a dataset script at /opt/ml/klue-level2-nlp-18/load_klue_re.py