In [1]:
import pickle as pickle
import os
import pandas as pd
import torch

# Dataset 구성.
class RE_Dataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(val[idx]) for key, val in self.tokenized_dataset.items()
        }
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


# 처음 불러온 tsv 파일을 원하는 형태의 DataFrame으로 변경 시켜줍니다.
# 변경한 DataFrame 형태는 baseline code description 이미지를 참고해주세요.
def preprocessing_dataset(dataset, label_type):
    label = []
    for i in dataset[8]:
        if i == "blind":
            label.append(100)
        else:
            label.append(label_type[i])
            
    out_dataset = pd.DataFrame(
        {
            "sentence": dataset[1],
            "entity_01": dataset[2],
            "entity_02": dataset[5],
            "label": label,
        }
    )
    return out_dataset


# tsv 파일을 불러옵니다.
def load_data(dataset_dir):
    # load label_type, classes
    with open("/opt/ml/input/data/label_type.pkl", "rb") as f:
        label_type = pickle.load(f)
    # load dataset
    dataset = pd.read_csv(dataset_dir, delimiter="\t", header=None)
    # preprecessing dataset
    dataset = preprocessing_dataset(dataset, label_type)

    return dataset


# bert input을 위한 tokenizing.
# tip! 다양한 종류의 tokenizer와 special token들을 활용하는 것으로도 새로운 시도를 해볼 수 있습니다.
# baseline code에서는 2가지 부분을 활용했습니다.
def tokenized_dataset(dataset, tokenizer):
    concat_entity = []
    for e01, e02 in zip(dataset["entity_01"], dataset["entity_02"]):
        temp = ""
        temp = e01 + "[SEP]" + e02
        concat_entity.append(temp)
        
    tokenized_sentences = tokenizer(
        concat_entity,
        list(dataset["sentence"]),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=100,
        add_special_tokens=True,
    )
    return tokenized_sentences

In [4]:
import pickle as pickle
import os
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    BertConfig,
)

# 평가를 위한 metrics function.
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}


def train():
    # load model and tokenizer
    MODEL_NAME = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    # dev_dataset = load_data("./dataset/train/dev.tsv")
    train_label = train_dataset["label"].values
    # dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    # tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    # RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # setting model hyperparameter
    bert_config = AutoConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config)
    model.parameters  # ?
    model.to(device)

    # 사용한 option 외에도 다양한 option들이 있습니다.
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요.
    training_args = TrainingArguments(
        output_dir="./results",  # output directory
        save_total_limit=3,  # number of total save model.
        save_steps=500,  # model saving step.
        num_train_epochs=10,  # total number of training epochs
        learning_rate=5e-5,  # learning_rate
        per_device_train_batch_size=16,  # batch size per device during training
        # per_device_eval_batch_size=16,   # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir="./logs",  # directory for storing logs
        logging_steps=100,  # log saving step.
        # evaluation_strategy='steps', # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        # eval_steps = 500,            # evaluation step.
    )
    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        # eval_dataset=RE_dev_dataset,             # evaluation dataset
        # compute_metrics=compute_metrics         # define metrics function
    )

    # train model
    trainer.train()

In [7]:
MODEL_NAME = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# load dataset
train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
# dev_dataset = load_data("./dataset/train/dev.tsv")
train_label = train_dataset["label"].values
# dev_label = dev_dataset['label'].values

# tokenizing dataset
tokenized_train = tokenized_dataset(train_dataset, tokenizer)
# tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

# make dataset for pytorch.
RE_train_dataset = RE_Dataset(tokenized_train, train_label)
# RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

In [10]:
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [9]:
RE_train_dataset[0]

  


{'input_ids': tensor([   101,   9167,  15001,  11261,  41605,    102,   9651,  99183,    102,
          50266,  11489,   9405,  24974,  24683,   9477,  90578,   9625, 119376,
          12692,  45725,   9651,  99183,  10459,   9376,  42771,  70186,   9167,
          15001,  11261,  41605,    113,  12001,  57836,    114,   9590,   9706,
          28396,    113,  13796,  19986,    114,   8843,  22634,    117,   9638,
           9376,  42771,  22879,   9651,  99183,  10459,   9684,  46520,  11513,
           9641, 119298,  11018,   9251,  11261,   9405,  24974, 118800,  27792,
          16139,    119,    102,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [5]:
train()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

  


Step,Training Loss
100,3.2237
200,2.2208
300,1.8408
400,1.5776
500,1.4801
600,1.2721
700,1.1563
800,1.1025


  


KeyboardInterrupt: 

In [11]:
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [20]:
train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
# dev_dataset = load_data("./dataset/train/dev.tsv")
train_label = train_dataset["label"].values
# dev_label = dev_dataset['label'].values

# tokenizing dataset
tokenized_train = tokenized_dataset(train_dataset, tokenizer)
# tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

# make dataset for pytorch.
RE_train_dataset = RE_Dataset(tokenized_train, train_label)

In [21]:
dataloader = DataLoader(RE_train_dataset, batch_size=40, shuffle=False)

In [22]:
batch = next(iter(dataloader))

  


In [24]:
batch['input_ids'][0]

tensor([   101,   9167,  15001,  11261,  41605,    102,   9651,  99183,    102,
         50266,  11489,   9405,  24974,  24683,   9477,  90578,   9625, 119376,
         12692,  45725,   9651,  99183,  10459,   9376,  42771,  70186,   9167,
         15001,  11261,  41605,    113,  12001,  57836,    114,   9590,   9706,
         28396,    113,  13796,  19986,    114,   8843,  22634,    117,   9638,
          9376,  42771,  22879,   9651,  99183,  10459,   9684,  46520,  11513,
          9641, 119298,  11018,   9251,  11261,   9405,  24974, 118800,  27792,
         16139,    119,    102,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0])

In [28]:
tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][0])
print(tokens)

['[CLS]', '랜', '##드', '##로', '##버', '[SEP]', '자', '##동차', '[SEP]', '영국', '##에서', '사', '##용', '##되는', '스', '##포츠', '유', '##틸', '##리', '##티', '자', '##동차', '##의', '브', '##랜드', '##로는', '랜', '##드', '##로', '##버', '(', 'Land', 'Rover', ')', '와', '지', '##프', '(', 'Je', '##ep', ')', '가', '있으며', ',', '이', '브', '##랜드', '##들은', '자', '##동차', '##의', '종', '##류', '##를', '일', '##컫', '##는', '말', '##로', '사', '##용', '##되', '##기도', '한다', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [27]:
batch['token_type_ids'][0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0])

In [29]:
tokens[:9]

['[CLS]', '랜', '##드', '##로', '##버', '[SEP]', '자', '##동차', '[SEP]']

In [41]:
print(tokens[9:66])

['영국', '##에서', '사', '##용', '##되는', '스', '##포츠', '유', '##틸', '##리', '##티', '자', '##동차', '##의', '브', '##랜드', '##로는', '랜', '##드', '##로', '##버', '(', 'Land', 'Rover', ')', '와', '지', '##프', '(', 'Je', '##ep', ')', '가', '있으며', ',', '이', '브', '##랜드', '##들은', '자', '##동차', '##의', '종', '##류', '##를', '일', '##컫', '##는', '말', '##로', '사', '##용', '##되', '##기도', '한다', '.', '[SEP]']


In [42]:
batch['token_type_ids'][0][66]

tensor(0)

In [48]:
batch['attention_mask'][0][65]

tensor(1)

In [49]:
model.config.

SyntaxError: invalid syntax (<ipython-input-49-e5c9403477cf>, line 1)

In [16]:
from transformers import (
    AutoTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    BertConfig,
    BertTokenizer,
)
from torch.utils.data import DataLoader
import pandas as pd
import torch
import pickle as pickle
import numpy as np
import argparse


def inference(model, tokenized_sent, device):
    dataloader = DataLoader(tokenized_sent, batch_size=40, shuffle=False)
    model.eval()
    output_pred = []

    for i, data in enumerate(dataloader):
        with torch.no_grad():
            outputs = model(
                input_ids=data["input_ids"].to(device),
                attention_mask=data["attention_mask"].to(device),
                token_type_ids=data["token_type_ids"].to(device),
            )
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        result = np.argmax(logits, axis=-1)

        output_pred.append(result)

    return np.array(output_pred).flatten()


def load_test_dataset(dataset_dir, tokenizer):
    test_dataset = load_data(dataset_dir)
    test_label = test_dataset["label"].values
    # tokenizing dataset
    tokenized_test = tokenized_dataset(test_dataset, tokenizer)
    return tokenized_test, test_label


def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # load tokenizer
    TOK_NAME = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    model = BertForSequenceClassification.from_pretrained(args.model_dir)
    model.parameters
    model.to(device)

    # load test datset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    # make csv file with predicted answer
    # 아래 directory와 columns의 형태는 지켜주시기 바랍니다.

    output = pd.DataFrame(pred_answer, columns=["pred"])
    output.to_csv("./prediction/submission.csv", index=False)

In [52]:
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [19]:
model_dir = '/opt/ml/P-Stage/2-STAGE/notebook/results/checkpoint-11000/'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained(model_dir)
model.to(device)

test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
test_dataset = RE_Dataset(test_dataset, test_label)

pred_answer = inference(model, test_dataset, device)

  


In [21]:
output = pd.DataFrame(pred_answer, columns=["pred"])
output.to_csv("./submission.csv", index=False)