In [29]:
import pickle as pickle
import os
import pandas as pd

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import random
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import *
from ipywidgets import IntProgress


In [30]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
seed = 210
seed_everything(210)

# 1.dataset

In [31]:
# Dataset 구성.
class RE_Dataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_datasest,labels):
        self.tokenized_dataset = tokenized_datasest
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in     self.tokenized_dataset.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

# 처음 불러온 tsv 파일을 원하는 형태의 DataFrame으로 변경 시켜줌
# 변경한 DataFrame 형태는 vaseline code description 이미지를 참고해주세요.
def preprocessing_dataset(dataset, label_type):
    label = []
    for i in dataset[8]:
        if i == 'blind':
            label.append(100)
        else:
            label.append(label_type[i])

    out_dataset = pd.DataFrame({'sentence' : dataset[1], 'entity_01' : dataset[2], 'entity_02':dataset[5], 'label' : label,})
    return out_dataset

def load_data(dataset_dir):
    # load label_type, classes
    with open('/opt/ml/input/data/label_type.pkl', 'rb') as f:
        label_type = pickle.load(f)
    # load dataset
    dataset = pd.read_csv(dataset_dir, delimiter = '\t', header = None)
    # preprocessing dataset
    dataset = preprocessing_dataset(dataset, label_type)
    return dataset

# bert input을 위한 tokenizing.
# tip! 다양한 종류의 tokenizer와 special token들을 활용하는 것으로도 새로운 시도를 해볼 수 있습니다.
# baseline code에서는 2가지 부분을 활용했습니다.
def tokenized_dataset(dataset, tokenizer):
    concat_entity = []
    for e01, e02 in zip(dataset['entity_01'], dataset['entity_02']):
        temp = ''
        temp = e01 + '[SEP]' + e02
        concat_entity.append(temp)
    tokenized_sentences = tokenizer(
        concat_entity,
        list(dataset['sentence']),
        return_tensors = 'pt',
        padding = True,
        truncation = True,
        max_length = 100,
        add_special_tokens = True,
    )
    return tokenized_sentences

In [32]:
# tokenizer가 정확히 뭐하는 앤지 한번 찾아보자
# label_type을 붙이는게 어떤 의미인가?
# load_data, RE_Dataset, tokenized_dataset을 어디다 쓰는지 알아보자.

# 2.Train

In [33]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
    }

#def train():
bert_base_name = "bert-base-multilingual-cased"
#gpt2_name ="gpt2"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_base_name)
#gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_name,pad_token="<PAD>")

train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
train_label = train_dataset['label'].values
print(f'entity_01 : {train_dataset.loc[0, "entity_01"]}')
print(f'entity_02 : {train_dataset.loc[0, "entity_02"]}')
print(f'sentence : {train_dataset.loc[0, "sentence"]}')

print(bert_tokenizer.decode(bert_tokenizer.encode('랜드로버[SEP]자동차',train_dataset.loc[0, 'sentence'])))

# tokenizing dataset
tokenized_train= tokenized_dataset(train_dataset, bert_tokenizer)


# make dataset for pytorch.
RE_train_dataset = RE_Dataset(tokenized_train, train_label)


device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

entity_01 : 랜드로버
entity_02 : 자동차
sentence : 영국에서 사용되는 스포츠 유틸리티 자동차의 브랜드로는 랜드로버(Land Rover)와 지프(Jeep)가 있으며, 이 브랜드들은 자동차의 종류를 일컫는 말로 사용되기도 한다.
[CLS] 랜드로버 [SEP] 자동차 [SEP] 영국에서 사용되는 스포츠 유틸리티 자동차의 브랜드로는 랜드로버 ( Land Rover ) 와 지프 ( Jeep ) 가 있으며, 이 브랜드들은 자동차의 종류를 일컫는 말로 사용되기도 한다. [SEP]


In [25]:
# setting gpt2 hyperparameter
gpt2_config = GPT2Config.from_pretrained(gpt2_name)
gpt2_config.num_labels = 42
# GPT2LMHeadModel은 input_ids와 labels를 함께 줄 경우 자동으로 cross entropy loss까지 계산해줍니다.
gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_name)
gpt2_model.config = gpt2_config
gpt2_model.parameters
gpt2_model.to(device)

gpt2_training_args = TrainingArguments(
    output_dir = './results/gpt2',
    save_total_limit = 4,
    save_steps = 500,
    num_train_epochs = 20,
    learning_rate = 5e-5,
    per_device_train_batch_size = 16,
    warmup_steps = 500,
    weight_decay = 0.01,
    logging_dir = './logs/gpt2',
    logging_steps = 100,
)
gpt2_trainer = Trainer(
    model = gpt2_model,
    args = gpt2_training_args,
    train_dataset=RE_train_dataset,
    compute_metrics=compute_metrics
)
gpt2_trainer.train()

RuntimeError: CUDA error: device-side assert triggered

In [34]:
# setting bert hyperparameter
bert_config = BertConfig.from_pretrained(bert_base_name)
bert_config.num_labels = 42
bert_model = BertForSequenceClassification.from_pretrained(bert_base_name, config = bert_config)
bert_model.parameters
bert_model.to(device)

bert_training_args = TrainingArguments(
    output_dir = './results/bert',
    save_total_limit = 4,
    save_steps = 500,
    num_train_epochs = 20,
    learning_rate = 5e-5,
    per_device_train_batch_size = 16,
    warmup_steps = 500,
    weight_decay = 0.01,
    logging_dir = './logs/bert',
    logging_steps = 100,
)
bert_trainer = Trainer(
    model = bert_model,
    args = bert_training_args,
    train_dataset=RE_train_dataset,
    compute_metrics=compute_metrics
)
bert_trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

RuntimeError: CUDA error: device-side assert triggered

In [None]:
# loss는 어디갔지?
# forward도 없는데 어케 학습시키는거야? optimizer는?

# 3.inference

In [None]:
def inference(model, tokenized_sent, device):
    dataloader = DataLoader(tokenized_sent, batch_size = 40, shuffle = False)
    model.eval()
    output_pred = []

    for i, data in enumerate(dataloader):
        with torch.no_grad():
            outputs = model(
                input_ids = data['input_ids'].to(device),
                attention_mask = data['attention_mask'].to(device),
                token_type_ids = data['token_type_ids'].to(device)
            )
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        result = np.argmax(logits, axis = -1)

        output_pred.append(result)
    return np.array(output_pred).flatten()

def load_test_dataset(dataset_dir, tokenizer):
    test_dataset = load_data(dataset_dir)
    test_label = test_dataset['label'].values
    # tokenizing dataset
    tokenized_test = tokenized_dataset(test_dataset, tokenizer)
    return tokenized_test, test_label

# load test dataset
test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
test_dataset = RE_Dataset(test_dataset, test_label)

# predict answer
pred_answer = inference(gpt2_model, test_dataset, device)
# make csv file with predicted answer
# 아래 directory 와 columns의 형태는 지켜주시기 바랍니다. 

output = pd.DataFrame(pred_answer, columns = ['pred'])
output.to_csv('./prediction/submission.csv', index = False)

In [None]:
# label이 특정 label만 많이 나온다,,어떡하지

In [27]:
# ERNIE, ELECTRA, GPT-2