In [1]:
import pickle
import os
import random
import argparse
import numpy as np
import pandas as pd
import torch
from glob import glob
from sklearn.metrics import accuracy_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, BertForSequenceClassification, Trainer, TrainingArguments, BertConfig
from load_data import *

In [2]:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from torch.utils.data import DataLoader
from torch.optim import *
from tqdm import tqdm

In [29]:

def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
    }


In [30]:
seed_everything(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')

# load dataset
dataset = load_data("/opt/ml/input/data/train/train.tsv")
label = dataset['label'].values

dataset

Unnamed: 0,sentence,entity_01,entity_02,label
0,영국에서 사용되는 스포츠 유틸리티 자동차의 브랜드로는 랜드로버(Land Rover)...,랜드로버,자동차,17
1,"선거에서 민주당은 해산 전 의석인 230석에 한참 못 미치는 57석(지역구 27석,...",민주당,27석,0
2,유럽 축구 연맹(UEFA) 집행위원회는 2014년 1월 24일에 열린 회의를 통해 ...,유럽 축구 연맹,UEFA,6
3,"용병 공격수 챠디의 부진과 시즌 초 활약한 강수일의 침체, 시즌 중반에 영입한 세르...",강수일,공격수,2
4,람캄행 왕은 1237년에서 1247년 사이 수코타이의 왕 퍼쿤 씨 인트라팃과 쓰엉 ...,람캄행,퍼쿤 씨 인트라팃,8
...,...,...,...,...
8995,2002년 FIFA 월드컵 사우디아라비아와의 1차전에서 독일은 8-0으로 승리하였는...,사우디아라비아,2002년,0
8996,일본의 2대 메이커인 토요타와 닛산은 시장 점유율을 높이기 위한 신차 개발을 계속하...,토요타,일본,9
8997,방호의의 손자 방덕룡(方德龍)은 1588년(선조 21년) 무과에 급제하고 낙안군수로...,방덕룡,선무원종공신(宣武原從功臣),2
8998,LG전자는 올해 초 국내시장에 출시한 2020년형 ‘LG 그램’ 시리즈를 이달부터 ...,LG전자,북미,0


In [55]:
temp = dataset.iloc[4:6]
temp
temp2 = TokenDataset(temp, tokenizer)
temp2[0]

  from ipykernel import kernelapp as app


{'input_ids': tensor([     0,      6,  76253, 244160,  11199,   1065,    294,  21290,    268,
          35725, 244942, 105051,   8740, 195810,      3,      2,      2,      6,
          76253, 244160,  11199,  76826,    697,    427,  10945,   2680,   1180,
            427,  13330,   2680,  62657,   1020,  20047,  12412,    469,    367,
          76826,  91368, 244942, 105051,   8740, 195810,      3,   1291,  61286,
         136949,   7342,   1571,  62657,    367,      6, 135608,  79796, 129172,
           1083,  48495,   2211, 135647,      5,      2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor(8)}

In [35]:
import pickle
import os
import pandas as pd
import torch
from torch.utils.data import Dataset


class TokenDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        tokenized_datasets = self.tokenized_dataset(self.dataset, self.tokenizer)
        item = {key: torch.tensor(val[idx]) for key, val in tokenized_datasets.items()}
        item['labels'] = torch.tensor(list(self.dataset['label'])[idx])
        return item

    def __len__(self):
        return len(self.dataset)
        
    def tokenized_dataset(self, dataset, tokenizer):
        concat_entity = []
        for e01, e02 in zip(dataset['entity_01'], dataset['entity_02']):
            temp = ''
            temp = e01 + '[SEP]' + e02
            # temp = e01 + '</s></s>' + e02  # roberta
            concat_entity.append(temp)
        
        tokenized_sentences = tokenizer(
            concat_entity,
            list(dataset['sentence']),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=100,
            add_special_tokens=True,
        )

        return tokenized_sentences

In [36]:
max_len=128
batch_size=32
warmup_ratio=0.01
num_epochs=10
max_grad_norm=1
log_interval=50
learning_rate=5e-5

In [37]:
import torch.nn as nn

class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes=42, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_config = AutoConfig.from_pretrained('xlm-roberta-large')
model_config.num_labels = 42
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-large', config=model_config)

model.parameters
model.to(device)

train_loader = DataLoader(temp2, batch_size=1, shuffle=True)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

In [39]:
from transformers.optimization import get_cosine_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = LabelSmoothingLoss()

t_total = len(train_loader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [40]:
test = 0
test1 = 0

for batch in train_loader:
        print(batch)
        print("epoch!")
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        print("=" * 10)
        print(outputs)
        print("=" * 10)
        print(outputs[0])
        print("=" * 10)
        print(outputs[1])
        test = outputs[1]
        print("=" * 10)
        print(labels)
        test1 = labels
        loss = loss_fn(outputs, labels)

{'input_ids': tensor([[     0,    180,  20441,   1065,    294,  21290,    268, 107659,    713,
              2,      2, 170744,  20448,   1963,  28211,  16069,    769,  59066,
         153653, 109433,   2020,  21037,   4156,   8267, 162342, 160628,    132,
            670,  20441,     16,    713,  32617,   1291, 136892,  62657,   1180,
          23358,   1083,   3497,   2947,    713,   3626,  12057,  24788, 177441,
          55388,  46431,  44928,   1077, 174878,      5,      2,      1,      1,
              1,      1,      1,      1,      1,      1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([0])}
epoch!
SequenceClassifierOutput(loss=tensor(3.4021, device='cuda:0', grad_fn=<NllLossBackward>), logits=tensor([[ 0.5150,  0.7255,  0.6829,  0.3777, -0.4498, -0.1628,  0.5972,  0.4204,
    

  from ipykernel import kernelapp as app


AttributeError: 'SequenceClassifierOutput' object has no attribute 'log_softmax'

In [41]:
import torch.nn.functional as F
F.log_softmax(test, dim=-1)

tensor([[-3.4021, -3.1916, -3.2342, -3.5394, -4.3669, -4.0800, -3.3200, -3.4967,
         -3.5912, -4.5097, -3.7961, -3.8824, -3.3816, -4.2645, -3.6858, -3.6217,
         -3.5421, -4.0240, -3.8230, -3.8862, -4.2663, -4.4901, -3.7455, -3.6110,
         -3.8444, -4.0012, -3.9661, -3.7382, -3.7566, -3.4162, -3.9785, -3.3092,
         -3.7596, -3.7030, -4.1380, -3.4093, -3.6570, -4.1994, -4.3645, -3.5240,
         -4.2545, -3.6332]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)

In [42]:
class LabelSmoothingLoss(nn.Module):
	def __init__(self, classes=42, smoothing=0.0, dim=-1):
		super(LabelSmoothingLoss, self).__init__()
		self.confidence = 1.0 - smoothing
		self.smoothing = smoothing
		self.cls = classes
		self.dim = dim
		
	def forward(self, pred, target):
		pred = F.log_softmax(pred, dim=self.dim)
		# pred = pred.log_softmax(dim=self.dim)
		with torch.no_grad():
			true_dist = torch.zeros_like(pred)
			true_dist.fill_(self.smoothing / (self.cls - 1))
			true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
		return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [44]:
loss_fn = LabelSmoothingLoss()
print(loss_fn(test, test1))

tensor(3.4021, device='cuda:0', grad_fn=<MeanBackward0>)


In [46]:
outputs.logits

tensor([[ 0.5150,  0.7255,  0.6829,  0.3777, -0.4498, -0.1628,  0.5972,  0.4204,
          0.3260, -0.5926,  0.1211,  0.0347,  0.5355, -0.3474,  0.2313,  0.2954,
          0.3751, -0.1068,  0.0941,  0.0309, -0.3492, -0.5730,  0.1716,  0.3061,
          0.0727, -0.0841, -0.0490,  0.1789,  0.1605,  0.5009, -0.0613,  0.6079,
          0.1575,  0.2141, -0.2209,  0.5078,  0.2601, -0.2823, -0.4474,  0.3932,
         -0.3374,  0.2840]], device='cuda:0', grad_fn=<AddmmBackward>)

In [47]:
outputs

SequenceClassifierOutput(loss=tensor(3.4021, device='cuda:0', grad_fn=<NllLossBackward>), logits=tensor([[ 0.5150,  0.7255,  0.6829,  0.3777, -0.4498, -0.1628,  0.5972,  0.4204,
          0.3260, -0.5926,  0.1211,  0.0347,  0.5355, -0.3474,  0.2313,  0.2954,
          0.3751, -0.1068,  0.0941,  0.0309, -0.3492, -0.5730,  0.1716,  0.3061,
          0.0727, -0.0841, -0.0490,  0.1789,  0.1605,  0.5009, -0.0613,  0.6079,
          0.1575,  0.2141, -0.2209,  0.5078,  0.2601, -0.2823, -0.4474,  0.3932,
         -0.3374,  0.2840]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [48]:
outputs.logits.argmax(-1)

tensor([1], device='cuda:0')

In [49]:
from sklearn.metrics import accuracy_score

In [51]:
test1

tensor([0], device='cuda:0')

### Load model weights
- nn.Parallel 사용할 때는 유의해야함

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, XLMRobertaConfig, XLMRobertaTokenizer, XLMRobertaForSequenceClassification

In [5]:
model_name = "xlm-roberta-large"
model_config = XLMRobertaConfig.from_pretrained(model_name)
model_config.num_labels = 42
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, config=model_config)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

In [None]:
model_name = "xlm-roberta-large"
model_config = XLMRobertaConfig.from_pretrained(model_name)
model_config.num_labels = 42
model = XLMRobertaForSequenceClassification(model_config)
# model = XLMRobertaForSequenceClassification.from_pretrained(model_name, config=model_config)
model = torch.nn.DataParallel(model)
model.load_state_dict(torch.load("./checkpoints/expr/best_2.pt"))

In [None]:
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

# load dataset
train_dataset = load_data("/opt/ml/input/data/train/train2.tsv")
train_label = train_dataset['label'].values

# tokenizing dataset
tokenized_train = tokenized_dataset(train_dataset, tokenizer)
RE_train_dataset = RE_Dataset(tokenized_train, train_label)