In [1]:
from datasets import load_dataset

# Load the train and validation splits of the KLUE relation-extraction ("re") task.
# The train split is reduced to its 'sentence' column right away; the validation
# split is kept as loaded here and is reduced to sentences in a later cell.
klue_re_dataset_train = load_dataset("klue", "re", split="train")['sentence']
klue_re_dataset_val = load_dataset("klue", "re", split="validation")

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 22.5k/22.5k [00:00<00:00, 11.1MB/s]
Downloading data: 100%|██████████| 6.65M/6.65M [00:00<00:00, 18.1MB/s]
Downloading data: 100%|██████████| 1.54M/1.54M [00:00<00:00, 6.67MB/s]
Generating train split: 100%|██████████| 32470/32470 [00:00<00:00, 449319.04 examples/s]
Generating validation split: 100%|██████████| 7765/7765 [00:00<00:00, 416597.64 examples/s]


In [2]:
# Sanity check: print the first training sentence (the name now holds the
# extracted 'sentence' column, so this is plain text).
print(klue_re_dataset_train[0])

〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey Road》에 담은 노래다.


In [3]:
# Keep only the 'sentence' column of the validation split, as was done for train.
# NOTE(review): this rebinds the name to its own column, so the cell is not
# re-runnable — a second run would index the extracted sentences with a string
# key. Confirm, and consider binding the result to a separate name.
klue_re_dataset_val= klue_re_dataset_val['sentence']

In [11]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer , AutoModelForMaskedLM, EarlyStoppingCallback
from torch.utils.data import Dataset, DataLoader, RandomSampler

import pytorch_lightning as pl
import torch

In [5]:
# Short aliases for the sentence collections; `pretrain()` reads these two names.
train_data, val_data = klue_re_dataset_train, klue_re_dataset_val

In [6]:
# Preview the first two validation sentences.
val_data[:2]

["20대 남성 A(26)씨가 아버지 치료비를 위해 B(30)씨가 모아둔 돈을 훔쳐 인터넷 방송 BJ에게 '별풍선'으로 쏜 사실이 알려졌다.",
 '그러나 심 의원은 보좌진이 접속 권한을 받아 정부 업무추진비 사용 내역 등을 다운받았음에도 정부가 허위 사실을 유포하는 등 국정감사 활동을 방해하고 있다고 반박했고, 김동연 경제부총리 겸 기획재정부 장관과 김재훈 재정정보원장, 기재부 관계자 등을 무고 등 혐의로 전날 맞고발했다.']

In [7]:
class LineByLineTextDataset(Dataset):
    """Map-style dataset holding one tokenized example per input text.

    The whole ``data`` collection is tokenized in a single tokenizer call with
    ``truncation=True`` and ``max_length=block_size``. No padding is requested
    here, so examples keep their individual lengths.

    Args:
        tokenizer: Callable invoked as ``tokenizer(data, truncation=True,
            max_length=block_size)``; its result must expose ``'input_ids'``.
        data: The texts to tokenize, passed to the tokenizer as-is.
        block_size: Maximum length forwarded to the tokenizer.

    Each item is a dict ``{"input_ids": LongTensor}``.
    """

    def __init__(self, tokenizer, data, block_size):
        token_id_rows = tokenizer(data, truncation=True, max_length=block_size)['input_ids']

        # Wrap every row of ids in the dict layout the data collator consumes.
        self.examples = []
        for token_ids in token_id_rows:
            ids_tensor = torch.tensor(token_ids, dtype=torch.long)
            self.examples.append({"input_ids": ids_tensor})

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the ``{"input_ids": tensor}`` dict stored at position ``i``."""
        return self.examples[i]

In [8]:
def prepare_dataset_for_pretraining(tokenizer, train_input, val_input, block_size=512, mlm_probability=0.15):
    """Build the train/eval datasets and the MLM data collator for pretraining.

    Args:
        tokenizer: Tokenizer handed to both datasets and to the collator.
        train_input: Training texts, forwarded to ``LineByLineTextDataset`` as ``data``.
        val_input: Validation texts, forwarded the same way.
        block_size: ``block_size`` given to both datasets. Defaults to 512,
            the value that used to be hard-coded for each dataset.
        mlm_probability: Masking probability given to the collator. Defaults
            to 0.15, the value that used to be hard-coded.

    Returns:
        A ``(train_dataset, data_collator, eval_dataset)`` tuple.
    """
    train_dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        data=train_input,
        block_size=block_size,
    )
    # Set up the MLM task. Per the original author's note, switching the
    # collator to DataCollatorForSOP would enable the SOP objective instead
    # (not verified here).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=mlm_probability
    )
    eval_dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        data=val_input,
        block_size=block_size,
    )

    return train_dataset, data_collator, eval_dataset

In [27]:
def set_trainer_for_pretraining(
        model,
        data_collator,
        dataset,
        eval_dataset,
        epoch = 10,
        batch_size = 16,
        accumalation_step = 1,):
    """Build a Hugging Face ``Trainer`` configured for MLM pretraining.

    Evaluation and checkpointing both run every 500 steps, a single checkpoint
    is retained (``save_total_limit=1``), the best model is reloaded at the end
    of training, and an ``EarlyStoppingCallback`` (patience 3, threshold 0.001)
    is attached.

    Args:
        model: Model passed to the ``Trainer``.
        data_collator: Collator passed to the ``Trainer``.
        dataset: Training dataset.
        eval_dataset: Evaluation dataset.
        epoch: Forwarded as ``num_train_epochs``.
        batch_size: Forwarded as ``per_device_train_batch_size``.
        accumalation_step: Forwarded as ``gradient_accumulation_steps`` (the
            parameter name's spelling is kept so existing callers still work).

    Returns:
        The configured ``Trainer`` instance.
    """
    # Mixed precision is only enabled when CUDA is present. A hard-coded
    # fp16=True makes the CPU fallback that pretrain() explicitly supports
    # fail when TrainingArguments is constructed.
    use_fp16 = torch.cuda.is_available()

    # set training args
    training_args = TrainingArguments(
        report_to='tensorboard',
        output_dir='./pretraining_outputs',
        overwrite_output_dir=True,
        num_train_epochs=epoch,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=accumalation_step,
        # NOTE(review): newer transformers releases rename this argument to
        # `eval_strategy` — confirm against the installed version.
        evaluation_strategy='steps',
        eval_steps=500,
        # Save on the same schedule as evaluation so that
        # load_best_model_at_end can pair checkpoints with eval results.
        save_strategy='steps',
        save_steps=500,
        save_total_limit=1,
        fp16=use_fp16,
        load_best_model_at_end=True,
        seed=42,
    )

    # set Trainer class for pre-training
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        eval_dataset=eval_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.001)],
    )

    return trainer

In [28]:
def pretrain(model_name="klue/bert-base", save_dir="./pretrained"):
    """Run MLM-based pretraining and save the trained model.

    Training and validation sentences are read from the notebook-level
    ``train_data`` and ``val_data`` variables.

    Args:
        model_name: Checkpoint name or path passed to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForMaskedLM.from_pretrained``. Defaults to
            "klue/bert-base", the checkpoint that used to be hard-coded.
        save_dir: Directory handed to ``model.save_pretrained`` once training
            finishes. Defaults to "./pretrained", the path that used to be
            hard-coded.
    """
    # fix a seed
    pl.seed_everything(seed=42)

    # set device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)

    # set model and tokenizer (both come from the same checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    model.to(device)

    # set data
    train_dataset, data_collator, eval_dataset = prepare_dataset_for_pretraining(tokenizer, train_data, val_data)

    # set trainer
    trainer = set_trainer_for_pretraining(model, data_collator, train_dataset, eval_dataset)

    # train model
    print("--- Start train ---")
    trainer.train()
    print("--- Finish train ---")
    model.save_pretrained(save_dir)

In [2]:
# Kick off MLM pretraining. The cells that define `pretrain` and its helpers must
# have been executed in the current kernel first; the NameError recorded below
# shows this cell was run in a session where `pretrain` was not defined.
pretrain()

NameError: name 'pretrain' is not defined

In [3]:
# Typed entity marker tokens, keyed as "<role>_<start|end>_<type>_marker".
# Values look like "<S:PER>" / "</S:PER>" for subjects and "<O:DAT>" / "</O:DAT>"
# for objects. Insertion order is: all subject start markers, all subject end
# markers, all object start markers, all object end markers — the order in which
# `list(TYPE_MARKERS.values())` later hands them to the tokenizer.
TYPE_MARKERS = {
    f"{role}_{edge}_{entity_type.lower()}_marker": f"<{slash}{role[0].upper()}:{entity_type}>"
    for role, entity_types in (
        ("subject", ("PER", "ORG", "LOC")),
        ("object", ("PER", "ORG", "LOC", "DAT", "POH", "NOH")),
    )
    for edge, slash in (("start", ""), ("end", "/"))
    for entity_type in entity_types
}

In [5]:
import pickle as pickle
import os
import pandas as pd
import torch
import sklearn
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments, RobertaConfig, RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer, EarlyStoppingCallback
from transformers import AutoModelForMaskedLM, AutoModel
from load_data import *
from sklearn.model_selection import StratifiedKFold
# from traindevsplit import * # train_dev_split
import numpy as np
import random

  from .autonotebook import tqdm as notebook_tqdm


In [14]:


# Load the KLUE RoBERTa-large tokenizer and register every typed entity marker
# from TYPE_MARKERS as an additional special token.
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')

new_special_tokens = [*TYPE_MARKERS.values()]

# Last expression of the cell, so the return value of add_special_tokens is displayed.
tokenizer.add_special_tokens(dict(additional_special_tokens=new_special_tokens))



18

In [17]:
      # Model hyperparameters: read the config stored next to the locally
      # pretrained checkpoint and set the classification head to 30 labels.
      model_path = './pretrained_roberta_large'
      model_config = AutoConfig.from_pretrained(f'{model_path}/config.json')
      model_config.num_labels = 30

      # Load the sequence classification model from the pretrained checkpoint.
      # Per the recorded warning, the LM-head weights are dropped and the
      # classifier weights are newly initialized.
      model = AutoModelForSequenceClassification.from_pretrained(model_path, config=model_config)
      # Resize the embedding matrix to the tokenizer's current length
      # (assumes `tokenizer` is the one extended with the marker tokens in an
      # earlier cell — confirm). Last expression, so the result is displayed.
      model.resize_token_embeddings(len(tokenizer))


      

Some weights of the model checkpoint at ./pretrained_roberta_large were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./pretrained_roberta_large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out

Embedding(32018, 1024)

In [None]:
def update_ranges_to_1(start_tokens, end_tokens, length):
    """Return a 0/1 integer mask with ones strictly between paired positions.

    Args:
        start_tokens: Iterable of start positions.
        end_tokens: Iterable of end positions, paired with ``start_tokens`` in
            order; unpaired extras on either side are ignored.
        length: Size of the returned array.

    Returns:
        A NumPy integer array of size ``length`` in which, for every
        ``(start, end)`` pair, the positions ``start + 1 .. end - 1`` are 1
        and everything else is 0.
    """
    mask = np.zeros(length, dtype=int)
    # One slice per pair; both the start and the end position themselves stay 0.
    inner_spans = (slice(begin + 1, stop) for begin, stop in zip(start_tokens, end_tokens))
    for span in inner_spans:
        mask[span] = 1
    return mask

def entity_ids_maker(data, start_id, end_id):
    """Build a per-token entity mask for every row of token ids.

    Args:
        data: Iterable of token-id sequences (one per example). All rows must
            have the same length so the masks can form one 2-D tensor.
        start_id: Container of token ids that mark the start of an entity.
        end_id: Container of token ids that mark the end of an entity.

    Returns:
        A ``torch.int`` tensor with one row per input row; tokens strictly
        between a paired start and end marker are 1, all others 0.
    """
    entity_ids = []

    # `tqdm` is expected to already be in scope — presumably via the
    # `from load_data import *` cell; TODO confirm.
    for ids in tqdm(data):
        startidx = [i for i, token_id in enumerate(ids) if token_id in start_id]
        endidx = [i for i, token_id in enumerate(ids) if token_id in end_id]

        # Emit a mask for every row, all zeros when a marker kind is missing.
        # Skipping such rows would make the result shorter than `data` and
        # shift every later mask onto the wrong example.
        entity_ids.append(update_ranges_to_1(startidx, endidx, len(ids)))

    # Go through a single ndarray: building a tensor straight from a list of
    # arrays converts element by element.
    return torch.tensor(np.array(entity_ids), dtype=torch.int)

In [None]:
def entity_ids_maker(data, start_id, end_id):
    """Build a per-token entity mask for every row of token ids.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name; whichever cell runs last is the one in effect.

    Args:
        data: The ``input_ids`` produced by the tokenizer, one sequence per
            example. All rows must have the same length so the masks can form
            one 2-D tensor.
        start_id: Container of token ids that mark the start of an entity.
        end_id: Container of token ids that mark the end of an entity.

    Returns:
        A ``torch.int`` tensor with one row per input row; tokens strictly
        between a paired start and end marker are 1, all others 0.
    """
    def update_ranges_to_1(start_tokens, end_tokens, maxlen):
        # Ones strictly between each paired (start, end) position, zeros elsewhere.
        res = np.zeros(maxlen, dtype=int)
        for start_token, end_token in zip(start_tokens, end_tokens):
            res[start_token + 1:end_token] = 1
        return res

    entity_ids = []
    # `tqdm` is expected to already be in scope — presumably via the
    # `from load_data import *` cell; TODO confirm.
    for ids in tqdm(data):
        startidx = [i for i, token_id in enumerate(ids) if token_id in start_id]
        endidx = [i for i, token_id in enumerate(ids) if token_id in end_id]

        # Emit a mask for every row, all zeros when a marker kind is missing.
        # Skipping such rows would make the result shorter than `data` and
        # shift every later mask onto the wrong example.
        entity_ids.append(update_ranges_to_1(startidx, endidx, maxlen=len(ids)))

    # `torch.tensor` is the factory that accepts `dtype`; the legacy
    # `torch.Tensor(...)` constructor rejects it with a TypeError. Converting
    # through one ndarray avoids the element-by-element list path.
    return torch.tensor(np.array(entity_ids), dtype=torch.int)

In [None]:
# Tokenize the train and dev sets and attach an 'entity_ids' entry to each encoding.
# NOTE(review): the statements below are indented under a column-0 comment, which
# looks like a fragment pasted from inside a function body — confirm it is dedented
# (or wrapped in a function) before running. `tokenized_dataset`,
# `get_entity_position_embedding`, `making_entity_pos_emb`, `train_dataset` and
# `dev_dataset` are not defined in the visible cells; presumably they come from
# `from load_data import *` or an earlier cell — verify.
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    train_ent_pos_emb = get_entity_position_embedding(tokenizer, tokenized_train['input_ids'])
    # Disabled debug check: reports "error" for any entry of train_ent_pos_emb
    # whose length is not 4.
    # print(len(tokenized_train['input_ids'][0]))
    # for i in train_ent_pos_emb:
    #     if len(i) == 4:continue
    #     else:
    #         print("error")
    tokenized_train['entity_ids'] = making_entity_pos_emb(train_ent_pos_emb)
    # Disabled alternative based on entity_ids_maker.
    # NOTE(review): this call passes two arguments, while the entity_ids_maker
    # definitions in this notebook take (data, start_id, end_id).
    # entity_ids = entity_ids_maker(train_dataset, tokenizer)
    # tokenized_train['entity_ids'] = entity_ids
    
    # Same steps for the dev set.
    tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)
    dev_ent_pos_emb = get_entity_position_embedding(tokenizer, tokenized_dev['input_ids'])
    tokenized_dev['entity_ids'] = making_entity_pos_emb(dev_ent_pos_emb)