In [1]:
import pickle as pickle
import pandas as pd
import torch
from ast import literal_eval
import torch.nn.functional as F
from torch.autograd import Variable
import torch.nn as nn
import sklearn
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoConfig, AutoModel
import torch.nn as nn
from sklearn.model_selection import StratifiedKFold
from torch.cuda.amp import GradScaler
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import wandb
import torch
from torch.cuda.amp import autocast
from torch.utils.data import DataLoader
from collections import Counter
import random
import os

In [2]:
### FOCAL LOSS ###

class FocalLoss(nn.Module):
    """Multi-class focal loss: ``-alpha_t * (1 - p_t) ** gamma * log(p_t)``.

    Args:
        gamma: Focusing exponent. With ``gamma=0`` and ``alpha=None`` the loss
            equals plain cross-entropy.
        alpha: Optional class weighting. A float/int ``a`` is expanded to the
            two-class weight vector ``[a, 1 - a]``; a list is used as the
            per-class weight vector; ``None`` disables weighting.
        size_average: If true, ``forward`` returns the mean over all scored
            elements, otherwise their sum.
    """

    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha, (float, int)):
            self.alpha = torch.Tensor([alpha, 1 - alpha])
        if isinstance(alpha, list):
            self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        """Compute the focal loss.

        Args:
            input: Logits with the class dimension at index 1. Inputs with
                more than two dimensions are flattened to ``(-1, C)``.
            target: Integer class indices, one per flattened input row.

        Returns:
            A scalar tensor (mean or sum, depending on ``size_average``).
        """
        if input.dim() > 2:
            input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1, 2)                         # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1, input.size(2))    # N,H*W,C => N*H*W,C
        target = target.view(-1, 1)

        # The class axis is passed explicitly: relying on the implicit-dim
        # choice of log_softmax is deprecated and emits a warning.
        logpt = F.log_softmax(input, dim=1)
        logpt = logpt.gather(1, target).view(-1)
        # The modulating factor is deliberately detached (as in the original
        # implementation), so gradients flow only through log(p_t).
        pt = logpt.detach().exp()

        if self.alpha is not None:
            # Match the logits' dtype/device without mutating module state.
            alpha = self.alpha.to(device=input.device, dtype=input.dtype)
            at = alpha.gather(0, target.view(-1))
            logpt = logpt * at

        loss = -1 * (1 - pt) ** self.gamma * logpt
        return loss.mean() if self.size_average else loss.sum()

### Dataloader

In [3]:
def preprocessing_dataset(dataset):
    """Reshape the freshly loaded CSV frame into the layout used downstream.

    The ``subject_entity`` and ``object_entity`` columns hold dict literals as
    strings; each is parsed and reduced to ``[start_idx, end_idx, type]`` with
    the two indices cast to ``int``. The ``id``, ``sentence`` and ``label``
    columns are carried over unchanged.
    """

    def _to_span(entity):
        # Keep only the character span and the entity type.
        return [int(entity['start_idx']), int(entity['end_idx']), entity['type']]

    subject_spans = []
    object_spans = []
    for raw_subject, raw_object in zip(dataset['subject_entity'], dataset['object_entity']):
        parsed_subject = literal_eval(raw_subject)
        parsed_object = literal_eval(raw_object)
        subject_spans.append(_to_span(parsed_subject))
        object_spans.append(_to_span(parsed_object))

    columns = {
        'id': dataset['id'],
        'sentence': dataset['sentence'],
        'subject_entity': subject_spans,
        'object_entity': object_spans,
        'label': dataset['label'],
    }
    return pd.DataFrame(columns)


In [4]:
def load_data(dataset_dir):
    """Read the CSV file at ``dataset_dir`` and return it in preprocessed form."""
    return preprocessing_dataset(pd.read_csv(dataset_dir))


def tokenized_dataset(dataset, tokenizer, max_length=256):
    """Insert typed entity markers into one example and convert it to ids.

    Entity types are replaced by words that are expected to be a single token
    for the Korean model (e.g. "PER" -> "사람"), then wrapped around the
    entities:

        subject: @*type*subject word@   (e.g. @*사람*김현수@)
        object:  &^type^object word&    (e.g. &^지명^한국&)

    The marker positions are returned as well because the approach from
    "An Improved Baseline for Sentence-level Relation Extraction" uses the
    hidden states at the entity-marker start tokens.

    Args:
        dataset: One example, indexable by 'subject_entity', 'object_entity'
            (each ``[start_idx, end_idx, type]`` with inclusive character
            indices) and 'sentence'.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``
            and ``build_inputs_with_special_tokens``.
        max_length: Maximum length of the returned ``input_ids`` including
            the special tokens added by the tokenizer (two are assumed).

    Returns:
        ``(input_ids, ss, se, es, ee)`` where ``ss``/``se`` are the token
        positions of the opening/closing subject marker and ``es``/``ee``
        those of the object marker.

    Note:
        Positions are derived by tokenizing each text segment separately,
        assuming one leading special token and four tokens per opening marker
        prefix. They are computed on the untruncated text, so for very long
        sentences they may point past the truncated ``input_ids``.
    """
    type_dict = {
        "PER": "사람",
        "LOC": "지명",
        "ORG": "기관",
        "DAT": "날짜",
        "TIM": "시간",
        "DUR": "기간",
        "MNY": "통화",
        "PNT": "비율",
        "NOH": "수량",
        "POH": "기타"
    }
    e01, e02, sent = dataset['subject_entity'], dataset['object_entity'], dataset['sentence']
    subject_start, subject_end, sub_type = e01
    object_start, object_end, obj_type = e02
    subj = sent[subject_start: subject_end + 1]
    obj = sent[object_start: object_end + 1]
    subj_marked = f'@*{type_dict[sub_type]}*' + subj + '@'
    obj_marked = f'&^{type_dict[obj_type]}^' + obj + '&'

    if subject_start < object_start:
        between = sent[subject_end + 1:object_start]
        sent_ = sent[:subject_start] + subj_marked + between + obj_marked + sent[object_end + 1:]
        ss = 1 + len(tokenizer.tokenize(sent[:subject_start]))  # +1 for the leading special token
        se = ss + 4 + len(tokenizer.tokenize(subj))             # +4 for '@', '*', type, '*'
        es = 1 + se + len(tokenizer.tokenize(between))
        ee = es + 4 + len(tokenizer.tokenize(obj))
    else:
        between = sent[object_end + 1:subject_start]
        sent_ = sent[:object_start] + obj_marked + between + subj_marked + sent[subject_end + 1:]
        es = 1 + len(tokenizer.tokenize(sent[:object_start]))   # +1 for the leading special token
        ee = es + 4 + len(tokenizer.tokenize(obj))              # +4 for '&', '^', type, '^'
        ss = 1 + ee + len(tokenizer.tokenize(between))
        se = ss + 4 + len(tokenizer.tokenize(subj))

    # Leave room for the two special tokens added below.
    senttokens = tokenizer.tokenize(sent_)[:max_length - 2]
    input_ids = tokenizer.convert_tokens_to_ids(senttokens)
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)
    return input_ids, ss, se, es, ee


def collate_fn(batch, max_len=256, pad_token_id=1):
    """Pad a list of feature dicts and stack them into tensors.

    Args:
        batch: List of dicts with keys 'input_ids' (list of ints), 'labels',
            'ss', 'se', 'es' and 'ee'.
        max_len: Length every sequence is right-padded to. The default (256)
            keeps the original fixed-length behaviour; pass ``None`` to pad
            only up to the longest sequence in the batch.
        pad_token_id: Id used for padding positions (default 1, as before).

    Returns:
        Tuple ``(input_ids, input_mask, labels, ss, se, es, ee)``.
        ``input_mask`` is a float tensor (1.0 for real tokens, 0.0 for
        padding); all other tensors are ``torch.long``.
    """
    if max_len is None:
        max_len = max(len(f["input_ids"]) for f in batch)

    input_ids, input_mask = [], []
    for f in batch:
        ids = f["input_ids"]
        n_pad = max_len - len(ids)
        input_ids.append(ids + [pad_token_id] * n_pad)
        input_mask.append([1.0] * len(ids) + [0.0] * n_pad)

    def _long_column(key):
        # Gather one scalar field across the batch as a long tensor.
        return torch.tensor([f[key] for f in batch], dtype=torch.long)

    input_ids = torch.tensor(input_ids, dtype=torch.long)
    input_mask = torch.tensor(input_mask, dtype=torch.float)
    labels = _long_column("labels")
    ss = _long_column("ss")
    se = _long_column("se")
    es = _long_column("es")
    ee = _long_column("ee")
    return (input_ids, input_mask, labels, ss, se, es, ee)


def label_to_num(label, dict_path='./code/dict_label_to_num.pkl'):
    """Map string labels to their integer ids.

    Args:
        label: Iterable of label names.
        dict_path: Path of the pickled ``{label name: id}`` dictionary.
            Defaults to the location this notebook has always used.

    Returns:
        List of ids in the same order as ``label``.

    Raises:
        KeyError: If a label is missing from the dictionary.

    NOTE: a function with the same name is defined again in the Utils cell of
    this notebook; that later definition shadows this one once it has run.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # dict_path at a trusted file.
    with open(dict_path, 'rb') as f:
        dict_label_to_num = pickle.load(f)
    return [dict_label_to_num[v] for v in label]


def processor(tokenizer, dataset, train_mode):
    """Turn a preprocessed DataFrame into a list of feature dicts.

    Each feature holds the marker-augmented ``input_ids``, the example's
    label and the four entity-marker positions. When ``train_mode`` is true
    the string labels are converted to ids with ``label_to_num``; otherwise
    the raw values of the 'label' column are used as they are.

    Typical use:
        train_dataset = processor(tokenizer, train_df, train_mode=True)
        train_dataloader = DataLoader(train_dataset, batch_size=...)
    """
    targets = dataset['label'].values
    if train_mode:
        targets = label_to_num(dataset['label'].values)

    def _build_feature(position):
        # Tokenize one row and bundle it with its label.
        token_ids, subj_open, subj_close, obj_open, obj_close = tokenized_dataset(
            dataset.iloc[position], tokenizer
        )
        return {
            'input_ids': token_ids,
            'labels': targets[position],
            'ss': subj_open,
            'se': subj_close,
            'es': obj_open,
            'ee': obj_close,
        }

    return [_build_feature(position) for position in range(len(dataset))]


### Model

In [5]:
class CustomModel(nn.Module):
    """Relation classifier built on a pretrained encoder.

    The hidden states at the subject-marker and object-marker start positions
    are concatenated and fed to a two-layer MLP; training uses focal loss
    with ``gamma=1.0``.

    Args:
        model_name: Name or path passed to ``AutoModel.from_pretrained``.
        config: Encoder config; ``hidden_size`` and ``num_labels`` are read
            from it to size the classifier.
    """

    def __init__(self, model_name, config):
        super().__init__()
        self.encoder_model = AutoModel.from_pretrained(model_name, config=config)
        self.loss_fnt = FocalLoss(gamma=1.0)
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size * 2, config.hidden_size),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            # Was a bare `hidden_size`, an undefined name that raised
            # NameError as soon as the model was constructed.
            nn.Linear(config.hidden_size, config.num_labels)
        )

    @autocast()
    def forward(self, input_ids=None,
                attention_mask=None,
                labels=None,
                ss=None,
                se=None,
                es=None,
                ee=None,):
        """Run the encoder and classify the entity pair.

        Args:
            input_ids: Token ids; the first dimension is used as the batch.
            attention_mask: Mask forwarded to the encoder.
            labels: Optional target class ids; when given, the loss is
                computed and returned first.
            ss: Per-example position of the subject start marker.
            es: Per-example position of the object start marker.
            se, ee: Accepted for interface compatibility; currently unused.

        Returns:
            ``(logits,)`` or, when ``labels`` is given, ``(loss, logits)``.
        """
        outputs = self.encoder_model(
            input_ids,
            attention_mask=attention_mask,
        )
        last_hs = outputs[0]
        # Pair every batch row with its own marker position.
        idx = torch.arange(input_ids.size(0)).to(input_ids.device)
        ss_emb = last_hs[idx, ss]
        es_emb = last_hs[idx, es]
        h = torch.cat((ss_emb, es_emb), dim=-1)
        logits = self.classifier(h)
        outputs = (logits,)
        if labels is not None:
            # The loss is computed in float32 even when autocast yields half logits.
            loss = self.loss_fnt(logits.float(), labels)
            outputs = (loss,) + outputs
        return outputs

### Utils

In [6]:
def seed_everything(seed: int=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Value used for every random number generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects subprocesses started afterwards; the running interpreter's
    # hash seed is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # cover every visible GPU
    # The attribute was previously misspelled ("deterministric"), so the flag
    # was never set; benchmark mode must also be off, because cuDNN
    # auto-tuning may select different kernels from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False



def klue_re_micro_f1(preds, labels):
    """Micro-averaged F1 (in percent) over every class except ``no_relation``."""
    label_list = ['no_relation', 'org:top_members/employees', 'org:members',
                  'org:product', 'per:title', 'org:alternate_names',
                  'per:employee_of', 'org:place_of_headquarters', 'per:product',
                  'org:number_of_employees/members', 'per:children',
                  'per:place_of_residence', 'per:alternate_names',
                  'per:other_family', 'per:colleagues', 'per:origin', 'per:siblings',
                  'per:spouse', 'org:founded', 'org:political/religious_affiliation',
                  'org:member_of', 'per:parents', 'org:dissolved',
                  'per:schools_attended', 'per:date_of_death', 'per:date_of_birth',
                  'per:place_of_birth', 'per:place_of_death', 'org:founded_by',
                  'per:religion']
    excluded_idx = label_list.index("no_relation")
    # Score every class index except the one for "no_relation".
    scored_indices = [idx for idx in range(len(label_list)) if idx != excluded_idx]
    micro_f1 = sklearn.metrics.f1_score(labels, preds, average="micro", labels=scored_indices)
    return micro_f1 * 100.0


def klue_re_auprc(probs, labels):
    """Mean per-class area under the precision-recall curve, in percent.

    All 30 classes are included (``no_relation`` too). ``labels`` are integer
    class ids that get one-hot encoded; ``probs`` supplies one score column
    per class.
    """
    num_classes = 30
    score_matrix = np.array(probs)
    one_hot = np.eye(num_classes)[labels]
    per_class_auc = np.zeros((num_classes,))
    for class_idx in range(num_classes):
        class_targets = np.take(one_hot, class_idx, axis=1).ravel()
        class_scores = np.take(score_matrix, class_idx, axis=1).ravel()
        precision, recall, _ = sklearn.metrics.precision_recall_curve(class_targets, class_scores)
        per_class_auc[class_idx] = sklearn.metrics.auc(recall, precision)
    return np.average(per_class_auc) * 100.0


def compute_metrics(keys, logitss):
    """Compute the validation metrics from gold labels and model logits.

    Args:
        keys: Gold class ids, one per example.
        logitss: Per-example class logits (nested list or array-like).

    Returns:
        Tuple ``(f1, auprc, acc)``: micro F1 without ``no_relation`` (percent),
        mean per-class AUPRC (percent) and plain accuracy. Accuracy is not
        part of the leaderboard evaluation.
    """
    labels = np.array(keys, dtype=np.int64)
    logits = torch.tensor(logitss, dtype=torch.float)
    preds = torch.argmax(logits, dim=-1).cpu().numpy()
    # AUPRC ranks examples by per-class score; raw logits are not comparable
    # across examples, so convert them to probabilities first.
    probs = torch.softmax(logits, dim=-1).cpu().numpy()

    f1 = klue_re_micro_f1(preds, labels)
    auprc = klue_re_auprc(probs, labels)
    acc = accuracy_score(labels, preds)

    return f1, auprc, acc

def label_to_num(label, dict_path='./code/dict_label_to_num.pkl'):
    """Map string labels to their integer ids.

    Args:
        label: Iterable of label names.
        dict_path: Path of the pickled ``{label name: id}`` dictionary.
            Defaults to the location this notebook has always used.

    Returns:
        List of ids in the same order as ``label``.

    Raises:
        KeyError: If a label is missing from the dictionary.

    NOTE: this repeats the ``label_to_num`` defined in the data-loading cell
    of this notebook; once both cells have run, this definition is the one
    in effect. Consider keeping a single copy.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # dict_path at a trusted file.
    with open(dict_path, 'rb') as f:
        dict_label_to_num = pickle.load(f)
    return [dict_label_to_num[v] for v in label]


def split_df(df, kfold_n):
    """Split ``df`` into ``kfold_n`` stratified folds.

    Stratification uses the 'label' column. Returns a list of
    ``(train_df, valid_df)`` pairs, each an independent copy with a fresh
    0-based index.
    """
    splitter = StratifiedKFold(n_splits=kfold_n)
    sentences = df['sentence'].values
    targets = df['label'].values
    return [
        (
            df.iloc[train_rows].copy().reset_index(drop=True),
            df.iloc[valid_rows].copy().reset_index(drop=True),
        )
        for train_rows, valid_rows in splitter.split(sentences, targets)
    ]

### Training

In [7]:
def evaluate(model, features, batch_size=5):
    """Run the model over ``features`` and report validation metrics.

    Args:
        model: Model returning logits as the first output when called
            without labels.
        features: List of feature dicts as consumed by ``collate_fn``.
        batch_size: Evaluation batch size (default 5, as before).

    Returns:
        Tuple ``(f1, auprc, acc)`` from ``compute_metrics``; the same values
        are also printed.
    """
    dataloader = DataLoader(features, batch_size=batch_size, collate_fn=collate_fn, drop_last=False)
    # Use whatever device the model lives on instead of assuming CUDA, so
    # evaluation also works after the CPU fallback in the training code.
    device = next(model.parameters()).device
    model.eval()

    keys, logitss = [], []
    with torch.no_grad():
        for batch in dataloader:
            inputs = {'input_ids': batch[0].to(device),
                      'attention_mask': batch[1].to(device),
                      'ss': batch[3].to(device),
                      'es': batch[5].to(device),
                      }
            keys += batch[2].tolist()
            # Without labels the first (and only) output is the logits.
            logit = model(**inputs)[0]
            logitss.extend(logit.tolist())

    f1, auprc, acc = compute_metrics(keys, logitss)
    output = {
        "f1": f1, "auprc": auprc, "acc" : acc
    }
    print(output)
    return f1, auprc, acc

In [9]:
def train():
    """Fine-tune ``klue/roberta-large`` for relation extraction.

    Loads ./dataset/train/train.csv, holds out the last of five stratified
    folds for validation, trains with mixed precision and gradient
    accumulation, evaluates after every epoch and saves the model with the
    best validation F1 to ./best_model/roberta_focal.pt.
    """
    # load model and tokenizer
    MODEL_NAME = "klue/roberta-large"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    dataset = load_data("./dataset/train/train.csv")

    # Make Validation set
    kfold_dataset = split_df(dataset, kfold_n=5)
    train_dataset, valid_dataset = kfold_dataset[-1]

    # tokenizing dataset & making train dataloader
    # Train on the training fold only: featurizing the full dataset here
    # would put the validation examples into the training data.
    train_features = processor(tokenizer, train_dataset, train_mode=True)
    val_features = processor(tokenizer, valid_dataset, train_mode=True)
    train_dataloader = DataLoader(train_features, batch_size=16, shuffle=True, collate_fn=collate_fn)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 30
    model = CustomModel(MODEL_NAME, config=model_config)
    model.to(device)

    ### Hyper-parameters to add in argument parser ##
    num_train_epochs = 10
    gradient_accumulation_steps = 2
    adam_epsilon = 1e-6
    max_grad_norm = 1.0
    warmup_ratio = 0.1
    learning_rate = 3e-5
    ##################################

    num_batches = len(train_dataloader)
    # One optimizer update per accumulation group, including a final partial
    # group at the end of each epoch (ceil division).
    updates_per_epoch = (num_batches + gradient_accumulation_steps - 1) // gradient_accumulation_steps
    total_steps = updates_per_epoch * num_train_epochs
    warmup_steps = int(total_steps * warmup_ratio)
    scaler = GradScaler()
    # The notebook referenced AdamP, which is never imported and therefore
    # raised NameError on a fresh kernel. AdamW with the same lr, betas and
    # weight decay is used instead.
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=learning_rate, betas=(0.9, 0.999),
                                  eps=adam_epsilon,
                                  weight_decay=1e-2)

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps
                                                )
    best_f1 = 0
    for epoch in range(int(num_train_epochs)):
        model.zero_grad()
        total_loss = 0.0
        for step, batch in enumerate(tqdm(train_dataloader)):
            model.train()
            inputs = {'input_ids': batch[0].to(device),
                      'attention_mask': batch[1].to(device),
                      'labels': batch[2].to(device),
                      'ss': batch[3].to(device),
                      'es': batch[5].to(device),
                      }
            outputs = model(**inputs)
            # Track the unscaled loss as a Python float so the autograd graph
            # of every step is not kept alive for the whole epoch.
            total_loss += outputs[0].item()
            loss = outputs[0] / gradient_accumulation_steps
            scaler.scale(loss).backward()

            # Update after every full accumulation group, and once more for a
            # trailing partial group so its gradients are not discarded.
            is_last_batch = (step + 1) == num_batches
            if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
                if max_grad_norm > 0:
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                model.zero_grad()
        print(f"average_training_loss: {total_loss/len(train_dataloader)}")

        ### Validation ###
        f1, auprc, acc = evaluate(model, val_features)
        if f1 > best_f1:
            best_f1 = f1
            # torch.save does not create missing directories.
            os.makedirs('./best_model', exist_ok=True)
            torch.save(model, './best_model/roberta_focal.pt')

In [20]:
# Seed the Python, NumPy and PyTorch random number generators with the default seed (42).
seed_everything()

In [None]:
# Run the full training loop; the model with the best validation F1 is saved to ./best_model/roberta_focal.pt.
train()