In [1]:
import os
import re
import torch
import sklearn
import copy
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pickle
import numpy as np
import collections
import wandb
import random
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, Subset, random_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import (AutoTokenizer, 
                          AutoConfig, 
                          AutoModelForMaskedLM,
                          DataCollatorForLanguageModeling,
                          Trainer, 
                          TrainingArguments)

2021-10-03 15:49:37.982862: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Raw Data

In [2]:
# Read the labelled training CSV and preview its first rows.
train_csv_path = '/opt/ml/dataset/train/train.csv'
data_df = pd.read_csv(train_csv_path)
data_df.head()

Unnamed: 0,id,sentence,subject_entity,object_entity,label,source
0,0,〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey R...,"{'word': '비틀즈', 'start_idx': 24, 'end_idx': 26...","{'word': '조지 해리슨', 'start_idx': 13, 'end_idx':...",no_relation,wikipedia
1,1,호남이 기반인 바른미래당·대안신당·민주평화당이 우여곡절 끝에 합당해 민생당(가칭)으...,"{'word': '민주평화당', 'start_idx': 19, 'end_idx': ...","{'word': '대안신당', 'start_idx': 14, 'end_idx': 1...",no_relation,wikitree
2,2,K리그2에서 성적 1위를 달리고 있는 광주FC는 지난 26일 한국프로축구연맹으로부터...,"{'word': '광주FC', 'start_idx': 21, 'end_idx': 2...","{'word': '한국프로축구연맹', 'start_idx': 34, 'end_idx...",org:member_of,wikitree
3,3,균일가 생활용품점 (주)아성다이소(대표 박정부)는 코로나19 바이러스로 어려움을 겪...,"{'word': '아성다이소', 'start_idx': 13, 'end_idx': ...","{'word': '박정부', 'start_idx': 22, 'end_idx': 24...",org:top_members/employees,wikitree
4,4,1967년 프로 야구 드래프트 1순위로 요미우리 자이언츠에게 입단하면서 등번호는 8...,"{'word': '요미우리 자이언츠', 'start_idx': 22, 'end_id...","{'word': '1967', 'start_idx': 0, 'end_idx': 3,...",no_relation,wikipedia


In [3]:
# Raw sentences as a plain Python list, plus the number of rows that were loaded.
data_sen = data_df['sentence'].tolist()
data_size = len(data_df)

## Device

In [4]:
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

## Model

In [5]:
# Name of the pretrained checkpoint; the same name is reused for the config and model below.
model_name = 'klue/roberta-large'
# Tokenizer for that checkpoint; used for length counting, pair encoding and the MLM collator.
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [6]:
# Load the checkpoint's config, build the masked-LM model from it, then move it to `device`.
model_config = AutoConfig.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name, config=model_config)
model = model.to(device)

## Dataset

In [7]:
class PretrainDataset(Dataset):
    """Sentence-pair dataset for masked-language-model pretraining.

    Sentences are ordered by (bucketed) token length, then the i-th shortest is
    paired with the i-th longest. The pairing is rebuilt `n_repeats` times with
    a fresh shuffle inside each length bucket, and every pair is tokenized as a
    two-segment input.

    Args:
        sen_data: indexable collection of sentences (strings).
        len_data: token length of each sentence, aligned with `sen_data`.
        tokenizer: callable invoked as ``tokenizer(first, second, **kwargs)``
            that returns a mapping of batched tensors.
        val_ratio: fraction of the examples that `split()` puts in validation.
        n_repeats: how many times the pairing is rebuilt and appended.

    Raises:
        AssertionError: if `sen_data` and `len_data` differ in length.
        ValueError: if `val_ratio` is outside ``[0, 1]``.
    """

    # Token lengths are integer-divided by this width to form length buckets.
    LENGTH_BUCKET_SIZE = 10

    def __init__(self, sen_data, len_data, tokenizer, val_ratio=0.1, n_repeats=5):
        super(PretrainDataset, self).__init__()
        assert len(sen_data) == len(len_data)
        if not 0.0 <= val_ratio <= 1.0:
            raise ValueError(f'val_ratio must be in [0, 1], got {val_ratio}')

        self.tokenizer = tokenizer
        # Honour the caller's value; it used to be overwritten with a literal 0.1.
        self.val_ratio = val_ratio

        self.dataset = []
        for _ in tqdm(range(n_repeats)):
            # Each pass reshuffles inside the length buckets, so the pairs differ.
            self.dataset.extend(self.build_data(sen_data, len_data))

    def pair_data(self, sen_data, len_data):
        """Pair short sentences with long ones.

        Indices are grouped by ``length // LENGTH_BUCKET_SIZE``, shuffled inside
        each group and concatenated in ascending bucket order. Position ``i`` of
        that ordering is then paired with position ``data_size - 1 - i``. When
        the number of sentences is odd, the middle one stays unpaired.

        Returns:
            ``(fir_data, sec_data)``: two lists of ``len(sen_data) // 2``
            sentences; element ``i`` of each forms one pair.
        """
        data_size = len(sen_data)

        len_group = collections.defaultdict(list)
        for idx, length in enumerate(len_data):
            len_group[length // self.LENGTH_BUCKET_SIZE].append(idx)

        ordered_idx = []
        for bucket in sorted(len_group):
            bucket_idx = len_group[bucket]
            random.shuffle(bucket_idx)
            ordered_idx.extend(bucket_idx)

        fir_data = []
        sec_data = []
        for i in range(data_size // 2):
            short_sen = sen_data[ordered_idx[i]]
            long_sen = sen_data[ordered_idx[data_size - 1 - i]]

            # Alternate which sentence comes first so neither slot is always the short one.
            if i % 2 == 0:
                fir_data.append(short_sen)
                sec_data.append(long_sen)
            else:
                fir_data.append(long_sen)
                sec_data.append(short_sen)

        return fir_data, sec_data

    def build_data(self, sen_data, len_data):
        """Build one round of tokenized sentence pairs.

        Returns:
            A list of dicts, one per pair, holding the first row of every
            tensor the tokenizer returned for that pair.
        """
        fir_data, sec_data = self.pair_data(sen_data, len_data)

        tensor_data = []
        for fir_sen, sec_sen in zip(fir_data, sec_data):
            encoded = self.tokenizer(fir_sen,
                                     sec_sen,
                                     return_tensors='pt',
                                     return_token_type_ids=False,
                                     add_special_tokens=True)
            # Each value is indexed with [0]; presumably this strips the
            # size-1 batch axis added by return_tensors='pt' (confirm).
            tensor_data.append({k: v[0] for k, v in encoded.items()})
        return tensor_data

    def __getitem__(self, idx):
        """Return the tokenized pair stored at position `idx`."""
        return self.dataset[idx]

    def __len__(self):
        """Return the total number of stored pairs across all repeats."""
        return len(self.dataset)

    def split(self):
        """Randomly split the dataset into ``(train_set, val_set)``.

        The validation size is ``int(len(self) * val_ratio)``; the remainder
        goes to training.

        NOTE(review): the split happens after all repeats are concatenated, so
        the same source sentence can appear in both subsets (in different
        pairs). Confirm that this overlap is acceptable for validation loss.
        """
        n_val = int(len(self) * self.val_ratio)
        n_train = len(self) - n_val
        train_set, val_set = random_split(self, [n_train, n_val])

        return train_set, val_set

In [8]:
# Length of Data
def count_tokens(sentence):
    """Return how many tokens the tokenizer produces for `sentence`."""
    return len(tokenizer.tokenize(sentence))

data_len = list(map(count_tokens, tqdm(data_sen)))

100%|██████████| 32470/32470 [00:08<00:00, 3951.77it/s]


In [9]:
# Dataset
# Seed both RNGs first: PretrainDataset.pair_data shuffles with Python's `random`
# and split() uses torch's random_split. Both run before the Trainer is created,
# so without this the pairing and the train/val split change on every run.
DATA_SEED = 42
random.seed(DATA_SEED)
torch.manual_seed(DATA_SEED)

dset = PretrainDataset(data_sen, data_len, tokenizer)
train_dset, val_dset = dset.split()

100%|██████████| 5/5 [00:41<00:00,  8.36s/it]


## Collator

In [10]:
# Collator handed to the Trainer below: MLM mode on, masking probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

## Training Argument

In [11]:
# Training Argument
training_args = TrainingArguments(
    # Optimisation
    num_train_epochs=5,
    learning_rate=1e-5,
    warmup_steps=4000,
    weight_decay=1e-2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and logging, both every 500 steps
    evaluation_strategy='steps',
    eval_steps=500,
    logging_steps=500,
    logging_dir='./logs',
    report_to='wandb',
    # Checkpoints
    output_dir='./results',
    save_steps=1000,
    save_total_limit=5,
)

## Trainer

In [12]:
# Wire the model, training arguments, MLM collator and both dataset splits into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dset,
    eval_dataset=val_dset,
    data_collator=data_collator,
)

## Training

In [None]:
# Read the Weights & Biases API key from the environment so it is never hard-coded.
WANDB_AUTH_KEY = os.getenv('WANDB_AUTH_KEY')
wandb.login(key=WANDB_AUTH_KEY)

wandb.init(entity="sangha0411", project="huggingface", name="pretraining")
trainer.train()
# `finish` must actually be called; a bare `wandb.finish` is a no-op expression
# and leaves the run open.
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33msangha0411[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-10-03 15:50:50.524851: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



***** Running training *****
  Num examples = 73058
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 22835
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
500,3.2937,1.830008
1000,1.8444,1.608319
1500,1.6818,1.517828
2000,1.5866,1.464435
2500,1.5711,1.432953
3000,1.5442,1.403903
3500,1.5223,1.390225


***** Running Evaluation *****
  Num examples = 8117
  Batch size = 16
***** Running Evaluation *****
  Num examples = 8117
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8117
  Batch size = 16
***** Running Evaluation *****
  Num examples = 8117
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8117
  Batch size = 16
***** Running Evaluation *****
  Num examples = 8117
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3000/config.json
Model weights saved in ./results/checkpoint-3000/pytorch_model.bin
***** Running Evaluation *****


In [None]:
# Write the current in-memory weights and config to ./best_model.
# NOTE(review): the TrainingArguments in this notebook do not enable any
# best-checkpoint selection, so this presumably saves the weights from the last
# training step rather than the lowest-validation-loss checkpoint. Confirm that
# the directory name is intended.
model.save_pretrained('./best_model')