# Baseline Code

In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
import pickle

## 1. Data Load & Preprocessing

In [2]:
train_dir= './dataset/train/train.csv'
test_dir= './dataset/test/test_data.csv'

In [3]:
def preprocessing_dataset(dataset):
    """ 처음 불러온 csv 파일을 원하는 형태의 DataFrame으로 변경 시켜줍니다."""
    subject_entity = [i[1:-1].split(',')[0].split(':')[1] for i in dataset['subject_entity']]
    object_entity = [j[1:-1].split(',')[0].split(':')[1] for j in dataset['object_entity']]
    out_dataset = pd.DataFrame({'id':dataset['id'], 'sentence':dataset['sentence'],'subject_entity':subject_entity,'object_entity':object_entity,'label':dataset['label'],})
    
    duplied = out_dataset[out_dataset.duplicated(subset=['sentence','subject_entity','object_entity'])]
    duplied_no_idx = duplied[duplied['label'] == 'no_relation']['id'].to_list()
    for idx in duplied_no_idx:
        out_dataset.drop(out_dataset.loc[out_dataset['id']==idx].index, inplace=True)
    out_dataset = out_dataset.drop_duplicates(subset=['sentence','subject_entity','object_entity','label'],keep='first')  
    return out_dataset


def load_data(dataset_dir):
    """ csv 파일을 경로에 맡게 불러 옵니다. """
    pd_dataset = pd.read_csv(dataset_dir)
    return preprocessing_dataset(pd_dataset)


def load_stratified_data(dataset_dir):
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    pd_dataset = pd.read_csv(dataset_dir)

    for train_index, test_index in split.split(pd_dataset, pd_dataset["label"]):
        strat_train_set = pd_dataset.loc[train_index]
        strat_dev_set = pd_dataset.loc[test_index]
    train_dataset = preprocessing_dataset(strat_train_set)  
    dev_dataset = preprocessing_dataset(strat_dev_set)  
    return train_dataset, dev_dataset

def label_to_num(label):
    num_label = []
    with open('./dict_label_to_num.pkl', 'rb') as f:
        dict_label_to_num = pickle.load(f)
    for v in label:
        num_label.append(dict_label_to_num[v])

    return num_label

In [4]:
train_dataset, dev_dataset = load_stratified_data(train_dir)
test_dataset = load_data(test_dir)

train_label = label_to_num(train_dataset['label'])
dev_label = label_to_num(dev_dataset['label'])

In [5]:
train_dataset

Unnamed: 0,id,sentence,subject_entity,object_entity,label
10706,10706,"추승우(秋承佑, 1979년 9월 24일 ~)는 전 KBO 리그 한화 이글스의 외야수...",'추승우','외야수',no_relation
26891,26891,부안군에 따르면 부안군자원봉사센터와 전라북도자원봉사센터가 공동으로 주관한 이번 행사...,'부안군','전라북도',no_relation
29437,29437,"중복인력 감축, 서울메트로, 서울특별시 도시철도공사 임원 인건비 절감으로 2027년...",'서울메트로','서울특별시',org:member_of
29780,29780,그 결과 민주정의당 대표인 노태우가 대통령 직선제 개헌을 수용하는 6·29선언이 발...,'민주정의당','노태우',org:top_members/employees
27887,27887,한편 창당에 앞서 옛 유신당의 대표였던 마쓰노 요리히사는 과거 민주당을 탈당한 전력...,'유신당','마쓰노 요리히사',org:top_members/employees
...,...,...,...,...,...
31784,31784,"모스크바 시간 기준 2015년 5월 12일, 암살당한 야당인사 보리스 넴초프가 한때...",'자유당','자유주의',org:political/religious_affiliation
22143,22143,"김범수(金範洙, 1979년 1월 26일 ~)는 대한민국의 가수이다.",'김범수','1979년 1월 26일',per:date_of_birth
8825,8825,"일반적으로 김정은의 어머니는 고용희(고영희)인 것으로 알려지고 있지만, 이복형 김정...",'고용희','김정은',per:children
20549,20549,구자욱의 타구를 잡은 1루수 제이미 로맥이 1루를 찍은 뒤 2루로 뛰던 1루 주자를...,'제이미 로맥','1루수',per:title


## 2. Tokenizier

In [6]:
from transformers import AutoTokenizer

MODEL_NAME = "klue/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [7]:
def tokenized_dataset(dataset, tokenizer):
  """ tokenizer에 따라 sentence를 tokenizing 합니다."""
  concat_entity = []
  for e01, e02 in zip(dataset['subject_entity'], dataset['object_entity']):
    temp = ''
    temp = e01 + '[SEP]' + e02
    concat_entity.append(temp)
  tokenized_sentences = tokenizer(
      concat_entity,
      list(dataset['sentence']),
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=256,
      add_special_tokens=True,
      return_token_type_ids=False,
      )
  return tokenized_sentences

In [8]:
tokenized_train = tokenized_dataset(train_dataset, tokenizer)
tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)
tokenized_test = tokenized_dataset(test_dataset, tokenizer)

In [9]:
tokenized_train

{'input_ids': tensor([[    0,    11,  1672,  ...,     1,     1,     1],
        [    0,    11, 22902,  ...,     1,     1,     1],
        [    0,    11,  3671,  ...,     1,     1,     1],
        ...,
        [    0,    11,  4571,  ...,     1,     1,     1],
        [    0,    11, 10258,  ...,     1,     1,     1],
        [    0,    11,  3629,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

## 3. Model init

In [10]:
import torch

class RE_Dataset(torch.utils.data.Dataset):
  """ Dataset 구성을 위한 class."""
  def __init__(self, pair_dataset, labels):
    self.pair_dataset = pair_dataset
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: val[idx].clone().detach() for key, val in self.pair_dataset.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

In [11]:
RE_train_dataset = RE_Dataset(tokenized_train, train_label)
RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

In [12]:

from transformers import AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model_config = AutoConfig.from_pretrained(MODEL_NAME)
model_config.num_labels = 30
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config).to(device)

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'class

## 4. Training

In [13]:
import sklearn
from sklearn.metrics import accuracy_score

def klue_re_micro_f1(preds, labels):
    """KLUE-RE micro f1 (except no_relation)"""
    label_list = ['no_relation', 'org:top_members/employees', 'org:members',
                  'org:product', 'per:title', 'org:alternate_names',
                  'per:employee_of', 'org:place_of_headquarters', 'per:product',
                  'org:number_of_employees/members', 'per:children',
                  'per:place_of_residence', 'per:alternate_names',
                  'per:other_family', 'per:colleagues', 'per:origin', 'per:siblings',
                  'per:spouse', 'org:founded', 'org:political/religious_affiliation',
                  'org:member_of', 'per:parents', 'org:dissolved',
                  'per:schools_attended', 'per:date_of_death', 'per:date_of_birth',
                  'per:place_of_birth', 'per:place_of_death', 'org:founded_by',
                  'per:religion']
    no_relation_label_idx = label_list.index("no_relation")
    label_indices = list(range(len(label_list)))
    label_indices.remove(no_relation_label_idx)
    return sklearn.metrics.f1_score(labels, preds, average="micro", labels=label_indices) * 100.0


def klue_re_auprc(probs, labels):
    """KLUE-RE AUPRC (with no_relation)"""
    labels = np.eye(30)[labels]

    score = np.zeros((30,))
    for c in range(30):
        targets_c = labels.take([c], axis=1).ravel()
        preds_c = probs.take([c], axis=1).ravel()
        precision, recall, _ = sklearn.metrics.precision_recall_curve(targets_c, preds_c)
        score[c] = sklearn.metrics.auc(recall, precision)
    return np.average(score) * 100.0


def compute_metrics(pred):
    """ validation을 위한 metrics function """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    probs = pred.predictions

    # calculate accuracy using sklearn's function
    f1 = klue_re_micro_f1(preds, labels)
    auprc = klue_re_auprc(probs, labels)
    acc = accuracy_score(labels, preds)  # 리더보드 평가에는 포함되지 않습니다.

    return {'micro f1 score': f1, 'auprc': auprc, 'accuracy': acc,}

In [14]:
import numpy as np
import random

def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

seed_everything(2001)

In [15]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    save_total_limit=5,              # number of total save model.
    save_steps=100,                 # model saving step.
    num_train_epochs=3,              # total number of training epochs
    learning_rate=5e-5,               # learning_rate
    per_device_train_batch_size=1,  # batch size per device during training
    per_device_eval_batch_size=1,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,              # log saving step.
    evaluation_strategy='steps', # evaluation strategy to adopt during training
                                # `no`: No evaluation during training.
                                # `steps`: Evaluate every `eval_steps`.
                                # `epoch`: Evaluate every end of epoch.
    eval_steps = 100,            # evaluation step.
    load_best_model_at_end = True 
)

In [16]:
trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=RE_train_dataset,         # training dataset
        eval_dataset=RE_dev_dataset,             # evaluation dataset
        compute_metrics=compute_metrics,         # define metrics function
)

trainer.train()


***** Running training *****
  Num examples = 25946
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 77838
  0%|          | 15/77838 [01:35<139:19:03,  6.44s/it]