In [5]:
from transformers import RobertaTokenizer, RobertaForMaskedLM, AutoTokenizer, AutoModelForMaskedLM
from transformers import LineByLineTextDataset
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import os

In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [1]:
import pandas as pd

train = pd.read_csv('../dataset/train/train.csv')
test = pd.read_csv('../dataset/test/test_data.csv')

merge = pd.concat([train,test])
merge

Unnamed: 0,id,sentence,subject_entity,object_entity,label,source
0,0,〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey R...,"{'word': '비틀즈', 'start_idx': 24, 'end_idx': 26...","{'word': '조지 해리슨', 'start_idx': 13, 'end_idx':...",no_relation,wikipedia
1,1,호남이 기반인 바른미래당·대안신당·민주평화당이 우여곡절 끝에 합당해 민생당(가칭)으...,"{'word': '민주평화당', 'start_idx': 19, 'end_idx': ...","{'word': '대안신당', 'start_idx': 14, 'end_idx': 1...",no_relation,wikitree
2,2,K리그2에서 성적 1위를 달리고 있는 광주FC는 지난 26일 한국프로축구연맹으로부터...,"{'word': '광주FC', 'start_idx': 21, 'end_idx': 2...","{'word': '한국프로축구연맹', 'start_idx': 34, 'end_idx...",org:member_of,wikitree
3,3,균일가 생활용품점 (주)아성다이소(대표 박정부)는 코로나19 바이러스로 어려움을 겪...,"{'word': '아성다이소', 'start_idx': 13, 'end_idx': ...","{'word': '박정부', 'start_idx': 22, 'end_idx': 24...",org:top_members/employees,wikitree
4,4,1967년 프로 야구 드래프트 1순위로 요미우리 자이언츠에게 입단하면서 등번호는 8...,"{'word': '요미우리 자이언츠', 'start_idx': 22, 'end_id...","{'word': '1967', 'start_idx': 0, 'end_idx': 3,...",no_relation,wikipedia
...,...,...,...,...,...,...
7760,7760,코로나19 방역 조치의 일환으로 국민의 움직임을 통제하려는 정부의 시도를 이탈리아 ...,"{'word': '정부', 'start_idx': 33, 'end_idx': 34,...","{'word': '이탈리아', 'start_idx': 41, 'end_idx': 4...",100,wikitree
7761,7761,선 연구원은 “위식도역류질환치료제인 케이캡이 92억원 판매되면서 2019년 연간 3...,"{'word': '종근당', 'start_idx': 133, 'end_idx': 1...","{'word': '전년', 'start_idx': 143, 'end_idx': 14...",100,wikitree
7762,7762,"한국전기안전공사(사장 조성완)는 8월 1일부로, 3급 간부직원에 대한 승진·이동 인...","{'word': '한국전기안전공사', 'start_idx': 0, 'end_idx'...","{'word': '조성완', 'start_idx': 12, 'end_idx': 14...",100,wikitree
7763,7763,1987년 B. 슈나이더(B. Schneider)에 의해 만들어졌다.,"{'word': 'B. 슈나이더', 'start_idx': 6, 'end_idx':...","{'word': '1987년', 'start_idx': 0, 'end_idx': 4...",100,wikipedia


In [3]:
merge['sentence'].to_csv('new.csv',index=False,header=True)

In [6]:
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')
model = AutoModelForMaskedLM.from_pretrained('klue/roberta-large')


In [None]:
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="new.csv",
    block_size=512,
)   

In [12]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/Relation_extraction/roberta-retrained",
    overwrite_output_dir=True,
    num_train_epochs=8,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=5,
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,    
)

loading configuration file https://huggingface.co/klue/roberta-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/571e05a2160c18c93365862223c4dae92bbd1b41464a4bd5f372ad703dba6097.ae5b7f8d8a28a3ff0b1560b4d08c6c3bd80f627288eee2024e02959dd60380d0
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.11.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}

loading file 

In [None]:
trainer.train()
trainer.save_model("/content/drive/MyDrive/Colab Notebooks/Relation_extraction/roberta-retrained_model")

***** Running training *****
  Num examples = 32471
  Num Epochs = 8
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 64944


Step,Training Loss
500,0.9103
1000,0.8174
1500,0.85
2000,0.8343
2500,0.8554
3000,0.8506
3500,0.8559
4000,0.8256
4500,0.8506
5000,0.8475


Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/Relation_extraction/roberta-retrained/checkpoint-500
Configuration saved in /content/drive/MyDrive/Colab Notebooks/Relation_extraction/roberta-retrained/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/Relation_extraction/roberta-retrained/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/Relation_extraction/roberta-retrained/checkpoint-1000
Configuration saved in /content/drive/MyDrive/Colab Notebooks/Relation_extraction/roberta-retrained/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/Relation_extraction/roberta-retrained/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/Relation_extraction/roberta-retrained/checkpoint-1500
Configuration saved in /content/drive/MyDrive/Colab Notebooks/Relation_extraction/roberta-retrained/checkpoint-1500/con