In [10]:
import pickle
import os
import pandas as pd
from ast import literal_eval
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, Trainer
from sklearn.model_selection import StratifiedShuffleSplit
from collections import defaultdict
import random
import warnings
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [11]:
def clean_dataset(dataset):
    """Return a cleaned copy of the raw training frame.

    Steps, in order:
      1. drop rows duplicated on (sentence, subject_entity, object_entity, label);
      2. overwrite the entity annotation of known-bad rows, looked up by ``id``;
      3. drop rows whose ``id`` is listed in ``drop_ids``;
      4. renumber the index from 0.

    Args:
        dataset: DataFrame with at least the columns ``id``, ``sentence``,
            ``subject_entity``, ``object_entity`` and ``label``. The entity
            columns are overwritten with dict *strings* here; parsing them is
            left to the caller.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    kim_seongjin = "{'word': '김성진', 'start_idx': 21, 'end_idx': 23, 'type': 'PER'}"
    sidong_at_4 = "{'word': '시동', 'start_idx': 4, 'end_idx': 5, 'type': 'POH'}"

    # (id, column to overwrite, corrected entity string)
    corrections = [
        # mislabeled entities
        (32107, 'subject_entity', "{'word': '이용빈', 'start_idx': 0, 'end_idx': 2, 'type': 'PER'}"),
        (1435, 'object_entity', "{'word': '조오섭', 'start_idx': 0, 'end_idx': 2, 'type': 'PER'}"),
        (9269, 'object_entity', kim_seongjin),
        (30870, 'object_entity', kim_seongjin),
        (1334, 'subject_entity', kim_seongjin),
        (30530, 'subject_entity', kim_seongjin),
        (8477, 'object_entity', kim_seongjin),
        # wrong character indices
        (13780, 'object_entity', sidong_at_4),
        (15584, 'object_entity', sidong_at_4),
        (630, 'object_entity', "{'word': '시동', 'start_idx': 44, 'end_idx': 45, 'type': 'POH'}"),
        (25109, 'object_entity', "{'word': '은교', 'start_idx': 4, 'end_idx': 5, 'type': 'POH'}"),
        (25756, 'object_entity', "{'word': '스승의 은혜', 'start_idx': 13, 'end_idx': 18, 'type': 'POH'}"),
    ]
    # Mislabeled rows that are dropped instead of repaired (19074: '스승의 은혜').
    drop_ids = [18458, 6749, 8364, 11511, 25094, 277, 19074]

    # Explicit copy: the .loc writes below must never reach the caller's frame.
    dataset = dataset.drop_duplicates(['sentence', 'subject_entity', 'object_entity', 'label']).copy()

    for row_id, column, entity in corrections:
        dataset.loc[dataset['id'] == row_id, column] = entity

    # Vectorised membership test; always yields a boolean mask.
    dataset = dataset[~dataset['id'].isin(drop_ids)]
    return dataset.reset_index(drop=True)

In [12]:
def load_data(dataset_dir):
    """Read a dataset CSV and turn its entity columns into Python objects.

    Args:
        dataset_dir: Path of the CSV file (a string). When the path contains
            the substring ``'train'`` the frame is first passed through
            ``clean_dataset``.

    Returns:
        DataFrame whose ``subject_entity`` and ``object_entity`` columns hold
        the ``literal_eval``-parsed value of each original string cell.
    """
    frame = pd.read_csv(dataset_dir)

    is_train_file = 'train' in dataset_dir
    if is_train_file:
        frame = clean_dataset(frame)

    # Entity cells are stored as Python-literal strings in the CSV.
    for column in ('subject_entity', 'object_entity'):
        frame[column] = frame[column].map(literal_eval)

    return frame

In [13]:
def split_data(dataset, num_splits):
    """Yield stratified (train, dev) splits of ``dataset``.

    The split is stratified on the ``label`` column and seeded with
    ``random_state=42``. The dev fraction is 0.1 when a single split is
    requested and 0.2 otherwise.

    Args:
        dataset: DataFrame with a ``label`` column.
        num_splits: Number of (train, dev) pairs to generate.

    Yields:
        Tuples ``(train_dataset, dev_dataset)`` of DataFrames that keep the
        index labels of ``dataset``.
    """
    test_size = 0.1 if num_splits == 1 else 0.2
    splitter = StratifiedShuffleSplit(n_splits=num_splits, test_size=test_size, random_state=42)

    for train_index, dev_index in splitter.split(dataset, dataset["label"]):
        # split() returns *positional* indices, so rows must be taken with
        # iloc. Label-based loc only happened to work for a 0..n-1 index.
        yield dataset.iloc[train_index], dataset.iloc[dev_index]

In [14]:
def tokenized_dataset(dataset, tokenizer):
    """Tokenize every row as an (entity prompt, marked sentence) pair.

    For each row the subject span of the sentence is replaced by
    ``^#<type>#<word>^`` and the object span by ``@+<type>+<word>@``. The
    tokenizer is then called with the prompt ``"<subject marker> <object marker>"``
    as first text and the marked sentence as second text.

    Args:
        dataset: DataFrame with a ``sentence`` column and ``subject_entity`` /
            ``object_entity`` columns holding dicts with the keys ``word``,
            ``start_idx``, ``end_idx`` and ``type`` (``end_idx`` is inclusive).
        tokenizer: Callable invoked as ``tokenizer(text, text_pair, **kwargs)``.

    Returns:
        List with one tokenizer result per row, in row order.
    """
    subject_cols = dataset['subject_entity'].apply(pd.Series).add_prefix('sub_')
    object_cols = dataset['object_entity'].apply(pd.Series).add_prefix('obj_')
    expanded = pd.concat([dataset, subject_cols, object_cols], axis=1)

    encodings = []
    for row in expanded.itertuples():
        subject_marker = f'^#{row.sub_type}#{row.sub_word}^'
        object_marker = f'@+{row.obj_type}+{row.obj_word}@'
        subject_span = (row.sub_start_idx, row.sub_end_idx, subject_marker)
        object_span = (row.obj_start_idx, row.obj_end_idx, object_marker)

        # Replace the span that starts later first, so the character offsets
        # of the earlier span are still valid when it is replaced.
        if row.sub_start_idx > row.obj_start_idx:
            ordered_spans = (subject_span, object_span)
        else:
            ordered_spans = (object_span, subject_span)

        chars = list(row.sentence)
        for start, end, marker in ordered_spans:
            chars[start:end + 1] = [marker]

        encoded = tokenizer(
            subject_marker + ' ' + object_marker,
            ''.join(chars),
            return_tensors="pt",
            padding=False,
            truncation=True,
            max_length=256,
            add_special_tokens=True,
        )
        encodings.append(encoded)

    return encodings

In [15]:
def make_sampler(data, batch_size=64, max_pad_len=20):
    """Group example indices into batches of similar token length.

    Examples are bucketed by ``length // max_pad_len``. Each bucket is cut into
    chunks of ``batch_size``; incomplete chunks from all buckets are pooled and
    re-chunked, so at most one returned batch is smaller than ``batch_size``.
    The list of batches is shuffled once with ``random.shuffle`` before it is
    returned.

    Args:
        data: Sequence of items whose ``['input_ids']`` has a ``shape`` with
            the sequence length at position 1.
        batch_size: Number of indices per batch.
        max_pad_len: Width of a length bucket.

    Returns:
        List of batches, each a list of integer indices into ``data``.
    """
    buckets = defaultdict(list)
    for position, item in enumerate(data):
        bucket_id = item['input_ids'].shape[1] // max_pad_len
        buckets[bucket_id].append(position)

    batches = []
    leftovers = []
    for members in buckets.values():
        for offset in range(0, len(members), batch_size):
            chunk = members[offset:offset + batch_size]
            if len(chunk) != batch_size:
                # Only the tail of a bucket can be short; pool it for later.
                leftovers.extend(chunk)
            else:
                batches.append(chunk)

    for offset in range(0, len(leftovers), batch_size):
        batches.append(leftovers[offset:offset + batch_size])

    random.shuffle(batches)
    return batches

In [16]:
def collate_fn(batch_samples):
    """Pad a list of single-example encodings to a common length and stack them.

    Every tensor-valued entry other than ``labels`` and ``token_type_ids`` is
    right-padded along dim 1 to the longest ``input_ids`` in the batch:
    ``input_ids`` with the value 1, everything else with 0.
    ``token_type_ids`` is dropped.

    Args:
        batch_samples: Non-empty list of mappings. Each holds ``input_ids``,
            ``attention_mask``, ``sub_mask`` and ``obj_mask`` tensors of shape
            ``(1, seq_len)`` plus a ``labels`` tensor.

    Returns:
        ``defaultdict(list)`` with ``input_ids``, ``attention_mask``,
        ``sub_mask`` and ``obj_mask`` as long tensors of shape
        ``(batch, max_len)`` and ``labels`` stacked along a new first dim.

    Raises:
        RuntimeError: from ``torch.stack`` when one of the stacked keys is
            missing from the samples (nothing to stack).
    """
    max_len = max(sample['input_ids'].shape[1] for sample in batch_samples)
    batch = defaultdict(list)

    for sample in batch_samples:
        pad_len = max_len - sample['input_ids'].shape[1]
        for key, val in sample.items():
            if key == 'labels':
                batch[key].append(val)
                continue
            if key == 'token_type_ids':
                continue
            # input_ids are padded with id 1, masks with 0.
            pad_value = 1 if key == 'input_ids' else 0
            # Build the padding directly as integers on the same device,
            # instead of concatenating with a float tensor and casting back.
            padding = val.new_full((val.shape[0], pad_len), pad_value, dtype=torch.long)
            batch[key].append(torch.cat((val.to(torch.long), padding), dim=1))

    for key in ('input_ids', 'attention_mask', 'sub_mask', 'obj_mask'):
        # (batch, 1, max_len) -> (batch, max_len)
        batch[key] = torch.stack(batch[key]).squeeze(1)
    batch['labels'] = torch.stack(batch['labels'])
    return batch

In [17]:
def label_to_num(label):
    """Map string labels to their integer ids.

    The mapping is unpickled from ``dict_label_to_num.pkl`` in the current
    working directory on every call.

    Args:
        label: Iterable of label strings.

    Returns:
        List of the mapped values, in input order.

    Raises:
        KeyError: if a label is not present in the mapping.
    """
    with open('dict_label_to_num.pkl', 'rb') as handle:
        mapping = pickle.load(handle)

    return [mapping[name] for name in label]

In [18]:
class RE_Dataset(torch.utils.data.Dataset):
    """Dataset pairing pre-tokenized examples with their integer labels."""

    def __init__(self, pair_dataset, labels):
        """Store the examples and labels; both are indexed by position.

        Args:
            pair_dataset: Indexable collection of mutable mappings, one per example.
            labels: Indexable collection of label values, aligned with ``pair_dataset``.
        """
        self.labels = labels
        self.pair_dataset = pair_dataset

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` with its label attached under ``'labels'``.

        The label tensor is written into the stored mapping itself, so the
        underlying example is modified in place.
        """
        example = self.pair_dataset[idx]
        label_tensor = torch.tensor(self.labels[idx])
        example['labels'] = label_tensor
        return example

In [27]:
def make_entity_mask(tokens, sub_marker_id=65, obj_marker_id=36):
    """Attach ``sub_mask`` and ``obj_mask`` tensors to every tokenized example.

    Each entity is expected to appear twice in ``input_ids``, each time
    enclosed by a pair of marker tokens. For each entity kind the mask is 1
    from the opening marker through the closing marker (inclusive) of both
    occurrences and 0 elsewhere.

    Args:
        tokens: Iterable of mutable mappings whose ``'input_ids'`` is an
            integer tensor of shape ``(1, seq_len)``.
        sub_marker_id: Token id delimiting the subject entity. The default 65
            is presumably the id of '^' in the tokenizer used by this
            notebook — TODO confirm against the vocabulary.
        obj_marker_id: Token id delimiting the object entity. The default 36
            is presumably the id of '@' — TODO confirm.

    Returns:
        ``tokens`` itself; every element gains ``'sub_mask'`` and
        ``'obj_mask'`` with the same shape and dtype as its ``'input_ids'``.

    Raises:
        ValueError: if fewer than four marker tokens of a kind are present.
    """
    for token in tokens:
        input_ids = token['input_ids']
        # Convert once to plain ints: list.index on a list of 0-d tensors does
        # a tensor comparison per element and was rebuilt for every marker.
        id_list = input_ids.reshape(-1).tolist()

        for marker_id, name in ((sub_marker_id, 'sub_mask'), (obj_marker_id, 'obj_mask')):
            mask = torch.zeros_like(input_ids)
            search_from = 0
            # Two marked occurrences per entity, each covered marker-to-marker.
            for _ in range(2):
                start_idx = id_list.index(marker_id, search_from)
                end_idx = id_list.index(marker_id, start_idx + 1)
                mask[:, start_idx:end_idx + 1] = 1
                search_from = end_idx + 1

            token[name] = mask
    return tokens

In [28]:
# Tokenizer of the checkpoint used throughout this notebook.
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')

# load dataset
dataset = load_data('../dataset/train/train.csv')
# With num_splits=1 split_data yields a single 90/10 train/dev split, so the
# loop body runs once and its variables remain available to the later cells.
for fold, (train_dataset,dev_dataset) in enumerate(split_data(dataset, num_splits=1), 1):
    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # Length-bucketed batches of example indices (shuffled once inside make_sampler).
    train_sampler = make_sampler(tokenized_train, batch_size=32, max_pad_len=10)
    valid_sampler = make_sampler(tokenized_dev, batch_size=32, max_pad_len=20)

    # Adds 'sub_mask' / 'obj_mask' to every example; collate_fn stacks these keys.
    tokenized_train = make_entity_mask(tokenized_train)
    tokenized_dev = make_entity_mask(tokenized_dev)

    # String labels -> integer ids via dict_label_to_num.pkl.
    train_label = label_to_num(train_dataset['label'].values)
    dev_label = label_to_num(dev_dataset['label'].values)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

In [29]:
# Sanity-check loader: batches come from the pre-built bucket sampler and are
# padded/stacked by collate_fn.
dataloader = DataLoader(RE_train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn)

In [30]:
# Pull only the first batch; it stays bound to `i` for inspection in the next cell.
for i in dataloader:
    break

In [46]:
# Decode the first example of that batch to check the entity markers and padding by eye.
tokenizer.decode(i['input_ids'][0])

'[CLS] ^ # ORG # 네덜란드 ^ @ + DAT + 1712년 @ [SEP] 드냉 전투는 @ + DAT + 1712년 @ 7월 24일 스페인 왕위 계승 전쟁의 전투 중 하나로 일어 났고, 사보이의 외젠 공 ( Prince Eugene of Savoy ) 휘하의 오스트리아 - ^ # ORG # 네덜란드 ^ 군에 대하여 빌라르 원수 ( Marshal Villars ) 가 이끄는 프랑스군이 승리를 거두었다. [SEP] [PAD]'

In [2]:
import pickle
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
import sklearn
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoConfig, Trainer, TrainingArguments, set_seed
from load_data_for_R import *
from model_for_R import R_BigBird
import wandb

In [None]:
class BucketTrainer(Trainer):
    """Trainer whose dataloaders use pre-built batch samplers and ``collate_fn``.

    ``train_sampler`` and ``valid_sampler`` (lists of index batches) are not
    constructor arguments. They must be assigned as attributes on the instance
    before ``train()`` / ``evaluate()`` is called.
    """

    def get_train_dataloader(self) -> DataLoader:
        """Build the training loader from ``self.train_sampler``."""
        return DataLoader(self.train_dataset, batch_sampler=self.train_sampler, collate_fn=collate_fn)

    def get_eval_dataloader(self, eval_dataset=None) -> DataLoader:
        """Build the evaluation loader from ``self.valid_sampler``.

        Args:
            eval_dataset: Dataset to evaluate; defaults to ``self.eval_dataset``
                (same optional signature as the base ``Trainer`` method).

        NOTE(review): ``valid_sampler`` holds indices computed for one
        particular dataset, so passing a different ``eval_dataset`` is only
        valid if those indices fit it — confirm before doing so.
        """
        if eval_dataset is None:
            eval_dataset = self.eval_dataset
        return DataLoader(eval_dataset, batch_sampler=self.valid_sampler, collate_fn=collate_fn)

In [77]:
# Use the first GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# setting model hyperparameter
model_config =  AutoConfig.from_pretrained('klue/roberta-large')
# model_config.num_labels = 30

# R_BigBird is imported from model_for_R; the second argument (0.1) is
# presumably a dropout rate — TODO confirm against that module.
model = R_BigBird(model_config, 0.1)
model = model.to(device)

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 31.75 GiB total capacity; 2.30 GiB already allocated; 14.50 MiB free; 2.32 GiB reserved in total by PyTorch)

In [74]:
# NOTE(review): this replaces the encoder's embedding module with a 10-entry,
# 1-dimensional table, while the encoder layers printed below expect
# 1024-dimensional inputs. It looks like a leftover experiment; if it runs
# before training or save_pretrained, the model is presumably unusable —
# confirm and delete this cell.
model.model.embeddings = torch.nn.Embedding(10,1)

In [76]:
# Display the wrapped encoder's module tree (used to inspect the embeddings replaced above).
model.model

RobertaModel(
  (embeddings): Embedding(10, 1)
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): RobertaIntermediate(
          (dense): Linear(in_features=1024, out_features=4096, bias=True)
        )
        (output): RobertaOutput(
          (dense): Linear(in_features=4096, out_features=1024, bias=True

In [None]:
def klue_re_micro_f1(preds, labels):
    """KLUE-RE micro F1, in percent, computed over every class except no_relation.

    Args:
        preds: Predicted class indices.
        labels: Gold class indices.

    Returns:
        ``sklearn.metrics.f1_score(..., average="micro")`` restricted to the
        non-``no_relation`` indices, multiplied by 100.
    """
    label_list = ['no_relation', 'org:top_members/employees', 'org:members',
       'org:product', 'per:title', 'org:alternate_names',
       'per:employee_of', 'org:place_of_headquarters', 'per:product',
       'org:number_of_employees/members', 'per:children',
       'per:place_of_residence', 'per:alternate_names',
       'per:other_family', 'per:colleagues', 'per:origin', 'per:siblings',
       'per:spouse', 'org:founded', 'org:political/religious_affiliation',
       'org:member_of', 'per:parents', 'org:dissolved',
       'per:schools_attended', 'per:date_of_death', 'per:date_of_birth',
       'per:place_of_birth', 'per:place_of_death', 'org:founded_by',
       'per:religion']
    excluded_idx = label_list.index("no_relation")
    # Class indices that take part in the score, in ascending order.
    scored_indices = [idx for idx in range(len(label_list)) if idx != excluded_idx]
    micro_f1 = sklearn.metrics.f1_score(labels, preds, average="micro", labels=scored_indices)
    return micro_f1 * 100.0

def klue_re_auprc(probs, labels):
    """KLUE-RE AUPRC, in percent, averaged over all 30 classes (no_relation included).

    Args:
        probs: Array of per-class scores with one column per class
            (assumes a 2-D numpy array of shape (n_samples, 30) — TODO confirm).
        labels: Integer class indices, one per sample.

    Returns:
        Mean over the 30 classes of the area under each one-vs-rest
        precision-recall curve, multiplied by 100.
    """
    num_classes = 30
    one_hot = np.eye(num_classes)[labels]

    per_class_area = np.zeros((num_classes,))
    for cls in range(num_classes):
        class_targets = one_hot[:, cls]
        class_scores = probs[:, cls]
        precision, recall, _ = sklearn.metrics.precision_recall_curve(class_targets, class_scores)
        per_class_area[cls] = sklearn.metrics.auc(recall, precision)

    return np.average(per_class_area) * 100.0

def compute_metrics(pred):
    """Metrics callback used for validation.

    Args:
        pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (gold class indices).

    Returns:
        Dict with the keys ``'micro f1 score'``, ``'auprc'`` and ``'accuracy'``.

    NOTE(review): ``pred.predictions`` is handed to ``klue_re_auprc`` as-is.
    If the model outputs logits rather than probabilities, no softmax is
    applied here — confirm this is intended.
    """
    scores = pred.predictions
    gold = pred.label_ids
    predicted = scores.argmax(-1)

    micro_f1 = klue_re_micro_f1(predicted, gold)
    auprc = klue_re_auprc(scores, gold)
    # Accuracy is not part of the leaderboard evaluation.
    accuracy = accuracy_score(gold, predicted)

    return {
        'micro f1 score': micro_f1,
        'auprc' : auprc,
        'accuracy': accuracy,
    }

In [None]:
# Training configuration. BucketTrainer builds its loaders from the pre-built
# batch samplers, so the actual batches are the ones make_sampler produced
# (created with batch_size=32 in the data-preparation cell).
training_args = TrainingArguments(
      output_dir='./results',          # output directory
      save_total_limit=5,              # number of total save model.
      save_steps=500,                 # model saving step.
      num_train_epochs=3,              # total number of training epochs
      learning_rate=5e-5,               # learning_rate
      per_device_train_batch_size=32,  # batch size per device during training
      per_device_eval_batch_size=32,   # batch size for evaluation
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=500,              # log saving step.
      evaluation_strategy='steps',     # evaluate every eval_steps steps
      eval_steps=500 ,
      load_best_model_at_end = True, 
      report_to='wandb'                # send logs to Weights & Biases
    )
    
trainer = BucketTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=RE_train_dataset,         # training dataset
    eval_dataset=RE_dev_dataset,             # evaluation dataset
    compute_metrics=compute_metrics         # define metrics function
)
# BucketTrainer reads these attributes when it builds its dataloaders, so they
# must be set before trainer.train() is called.
trainer.train_sampler = train_sampler
trainer.valid_sampler = valid_sampler

In [None]:
# Start fine-tuning (the captured run below was interrupted manually).
trainer.train()

***** Running training *****
  Num examples = 29223
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2760
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mggoggori[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [28]:
# Write the config and weights to ./best_model; the inference cell loads
# ./best_model/pytorch_model.bin from there.
model.save_pretrained('./best_model')

Configuration saved in ./best_model/config.json
Model weights saved in ./best_model/pytorch_model.bin


In [53]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, BigBirdModel, AutoConfig
import torch
from torch.utils.data import DataLoader
from model_for_R import *
from tqdm import tqdm
import numpy as np
import torch.nn.functional as F


In [54]:
def inference(model, tokenized_sent, device):
    """Run ``model`` over a dataset and collect predictions and probabilities.

    Batches of 32 are built in dataset order with ``collate_fn``. The model is
    switched to eval mode and called without labels; the first element of its
    output is treated as the logits.

    Args:
        model: Model accepting ``input_ids``, ``attention_mask``, ``sub_mask``,
            ``obj_mask`` and ``labels`` keyword arguments.
        tokenized_sent: Dataset whose items ``collate_fn`` can batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(pred_indices, probabilities)``: a flat list of argmax class
        indices and a list of per-class softmax probability lists, one entry
        per example.
    """
    loader = DataLoader(tokenized_sent, batch_size=32, shuffle=False, collate_fn=collate_fn)
    model.eval()

    pred_chunks = []
    prob_chunks = []
    with torch.no_grad():
        for batch in tqdm(loader):
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                sub_mask=batch['sub_mask'].to(device),
                obj_mask=batch['obj_mask'].to(device),
                labels=None,
            )
            batch_logits = outputs[0]
            batch_probs = F.softmax(batch_logits, dim=-1).detach().cpu().numpy()
            batch_preds = np.argmax(batch_logits.detach().cpu().numpy(), axis=-1)
            pred_chunks.append(batch_preds)
            prob_chunks.append(batch_probs)

    all_preds = np.concatenate(pred_chunks).tolist()
    all_probs = np.concatenate(prob_chunks, axis=0).tolist()
    return all_preds, all_probs


def num_to_label(label):
    """Convert numeric class ids back to their original string labels.

    The mapping is unpickled from ``dict_num_to_label.pkl`` in the current
    working directory on every call.

    Args:
        label: Iterable of class ids.

    Returns:
        List of the mapped label values, in input order.

    Raises:
        KeyError: if an id is not present in the mapping.
    """
    with open('dict_num_to_label.pkl', 'rb') as handle:
        mapping = pickle.load(handle)

    return [mapping[class_id] for class_id in label]

In [55]:
# Use the first GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')
model_config =  AutoConfig.from_pretrained('klue/roberta-large')
model = R_BigBird(model_config, 0.1)
# map_location lets a checkpoint that was saved from GPU tensors load on
# whatever device is available here (including CPU-only machines).
model.load_state_dict(torch.load('./best_model/pytorch_model.bin', map_location=device))
model.to(device)

dataset = load_data('../dataset/test/test_data.csv')
tokenized_test = tokenized_dataset(dataset, tokenizer)
# collate_fn stacks 'sub_mask' / 'obj_mask' and inference() feeds them to the
# model, so the masks must be built here exactly as in the training cell.
tokenized_test = make_entity_mask(tokenized_test)
test_label = list(map(int,dataset['label'].values))
RE_dataset_test = RE_Dataset(tokenized_test, test_label)

output_pred, output_prob = inference(model, RE_dataset_test, device)
original_label = num_to_label(output_pred)

test = pd.read_csv('../dataset/test/test_data.csv')
test_id = test['id'].to_list()
output = pd.DataFrame({'id':test_id, 'pred_label':original_label, 'probs':output_prob})

# to_csv does not create missing directories.
os.makedirs('./prediction', exist_ok=True)
output.to_csv('./prediction/submission.csv', index=False)

cuda:0


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

In [63]:
# NOTE(review): debugging copy of the body of inference() that stops after the
# first batch. It rebinds the globals `dataloader`, `output_pred`,
# `output_prob` and `i` — consider removing it once inference() works.
dataloader = DataLoader(RE_dataset_test, batch_size=32, shuffle=False, collate_fn=collate_fn)
model.eval()
output_pred = []
output_prob = []
for i, data in enumerate(tqdm(dataloader)):
    with torch.no_grad():
        outputs = model(
        input_ids = data['input_ids'].to(device),
        attention_mask = data['attention_mask'].to(device),
        sub_mask = data['sub_mask'].to(device),
        obj_mask = data['obj_mask'].to(device),
        labels = None
        )

        # First output element is treated as the logits.
        logits = outputs[0]
        prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
        logits = logits.detach().cpu().numpy()
        result = np.argmax(logits, axis=-1)
        output_pred.append(result)
        output_prob.append(prob)

    # The result of this expression is discarded (inside a loop it is not displayed).
    np.concatenate(output_pred).tolist(), np.concatenate(output_prob, axis=0).tolist()

    # Only the first batch is processed.
    break

  0%|          | 0/243 [00:00<?, ?it/s]
