## Load_data

In [None]:
# Show free disk space per mounted filesystem in human-readable units.
!df -h

In [1]:
import pickle as pickle
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

In [2]:
# Use the first CUDA device when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


In [3]:
class RE_Dataset(torch.utils.data.Dataset):
    """Pair tokenizer output with per-example labels as a map-style dataset.

    Args:
        tokenized_dataset: Mapping whose ``.items()`` yields ``(field, values)``
            pairs; ``values[idx]`` must give the entry for example ``idx``.
        labels: Indexable collection of labels. Its length is the dataset length.
    """

    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value):
        """Return ``value`` as a tensor that does not share storage with its source."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning on every
            # call; clone().detach() is the recommended equivalent copy.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per tokenizer field plus a ``'labels'`` tensor."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.tokenized_dataset.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

In [4]:
def preprocessing_dataset(dataset, label_type):
    """Build the sentence / entity / label frame from the raw headerless table.

    Args:
        dataset: Table indexed by integer column keys; columns 1, 2, 5 and 8
            are read as sentence, first entity, second entity and raw label.
        label_type: Mapping from raw label value to its integer id.

    Returns:
        DataFrame with columns ``sentence``, ``entity_01``, ``entity_02``, ``label``.
        A raw label of ``'blind'`` is encoded as 100; any other raw label is
        looked up in ``label_type``.
    """
    encoded = [100 if raw == 'blind' else label_type[raw] for raw in dataset[8]]
    columns = {
        'sentence': dataset[1],
        'entity_01': dataset[2],
        'entity_02': dataset[5],
        'label': encoded,
    }
    return pd.DataFrame(columns)

In [5]:
def load_data(dataset_dir, label_type_path='/opt/ml/input/data/label_type.pkl'):
    """Read a tab-separated dataset and return it in preprocessed form.

    Args:
        dataset_dir: Path of the headerless TSV file to read.
        label_type_path: Path of the pickled label map passed on to
            ``preprocessing_dataset``. Defaults to the location this notebook
            originally hard-coded.

    Returns:
        Whatever ``preprocessing_dataset`` returns for the loaded table.
    """
    # NOTE: pickle.load can execute arbitrary code from the file; only point
    # this at a label map you trust.
    with open(label_type_path, 'rb') as f:
        label_type = pickle.load(f)
    # The file has no header row, so columns are addressed by position.
    dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
    return preprocessing_dataset(dataset, label_type)

In [6]:
def tokenized_dataset(dataset, tokenizer):
    """Tokenize each example as the pair (entity text, sentence).

    The first segment is ``entity_01 + ' </s> ' + entity_02``; the second is
    the sentence. Only the second segment may be truncated, and the combined
    length is capped at 128.

    Args:
        dataset: Mapping/frame with ``entity_01``, ``entity_02`` and ``sentence``.
        tokenizer: Callable accepting two lists of strings plus the keyword
            options used below.

    Returns:
        The tokenizer's return value, unchanged.
    """
    entity_pairs = [
        head + ' </s> ' + tail
        for head, tail in zip(dataset['entity_01'], dataset['entity_02'])
    ]
    sentences = list(dataset['sentence'])
    return tokenizer(
        entity_pairs,
        sentences,
        return_tensors="pt",
        padding=True,
        truncation='only_second',
        max_length=128,
        add_special_tokens=True,
    )

In [7]:
# Read the training TSV into the preprocessed frame returned by load_data.
df_train = load_data('./input/data/train/train.tsv')

In [None]:
# Append a question naming both entities to every training sentence.
# This is a single column assignment on purpose: the per-row form
# `df_train['sentence'][i] = ...` is chained indexing, which triggers
# SettingWithCopyWarning, has no effect under pandas copy-on-write, and only
# works while the index is exactly 0..n-1.
# astype(str) mirrors how an f-string would render the entity values.
df_train['sentence'] = (
    df_train['sentence']
    + " [SEP] 이 문장에서 "
    + df_train['entity_01'].astype(str)
    + "과 "
    + df_train['entity_02'].astype(str)
    + "는 어떤 관계야?"
)

In [None]:
# Remove duplicates: keep the first row for each distinct sentence, then show the row count.
df_train = df_train.drop_duplicates(subset=['sentence'])
len(df_train)

In [8]:
# Keep only rows whose label id is not 41, 37, 40 or 29, then show how many remain.
# NOTE(review): the reason these four ids are excluded is not recorded here — confirm.
df_train = df_train.loc[~df_train['label'].isin((41, 37, 40, 29))]
len(df_train)

8985

## Train

In [9]:
import pickle as pickle
import os
import numpy as np
import pandas as pd
import re
import random
import torch
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, BertConfig, ElectraTokenizer
# from tokenization_kobert import KoBertTokenizer
# from load_data import *

import argparse
from importlib import import_module
from pathlib import Path
import glob

In [10]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and set the cuDNN flags below.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Python's and NumPy's global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: CPU generator, current CUDA device, then all CUDA devices.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: disable benchmark mode and request deterministic behaviour.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_everything(0)

In [11]:
def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``; the predicted
            class is the argmax of ``predictions`` over its last axis.

    Returns:
        Dict with a single key, ``'accuracy'``.
    """
    gold = pred.label_ids
    guessed = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(gold, guessed)}

In [12]:
def increment_output_dir(output_path, exist_ok=False):
    """Return an output path that does not collide with an existing one.

    Args:
        output_path: Desired directory path.
        exist_ok: When True, ``output_path`` is returned even if it exists.

    Returns:
        ``str(Path(output_path))`` when that path is free or ``exist_ok`` is
        set. Otherwise the same path with an integer appended: one more than
        the largest numeric suffix already present, or 2 if there is none.
    """
    path = Path(output_path)
    if exist_ok or not path.exists():
        return str(path)

    # Escape the path for both glob and regex so characters such as '[', '+'
    # or '.' are matched literally. Match on the full final component,
    # anchored at the end, so only "<name><digits>" counts as a numbered sibling.
    siblings = glob.glob(f"{glob.escape(str(path))}*")
    suffix_pattern = re.compile(rf"{re.escape(path.name)}(\d+)$")
    used = [int(m.group(1)) for m in map(suffix_pattern.search, siblings) if m]
    n = max(used) + 1 if used else 2
    return f"{path}{n}"

In [13]:
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, BertConfig, ElectraTokenizer, ElectraConfig, ElectraForSequenceClassification, XLMRobertaTokenizer, XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLMRobertaConfig

# Checkpoint identifier; the tokenizer is loaded from it here.
MODEL_NAME = "xlm-roberta-large"
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

In [14]:
# Hold out 15% of the rows for validation; random_state fixes the split.
train, val = train_test_split(df_train, test_size=0.15, random_state=0)

# Tokenize the two splits, train first and validation second.
tokenized_train, tokenized_val = (
    tokenized_dataset(split, tokenizer) for split in (train, val)
)

In [15]:
# Label arrays for each split.
train_label, val_label = train['label'].values, val['label'].values

# Wrap encodings and labels so the Trainer can index them per example.
train_dataset, val_dataset = (
    RE_Dataset(tokenized_train, train_label),
    RE_Dataset(tokenized_val, val_label),
)

In [16]:
# Load the checkpoint's config and set the classification head to 42 labels
# before the model is instantiated from it.
model_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
model_config.num_labels = 42
model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
# Move the model to the selected device; as the last expression, this also
# displays the model's repr in the cell output.
model.to(device)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
     

In [17]:
# Choose the output directory: './results' if free, otherwise a numbered variant.
output_dir = increment_output_dir('./results')

In [None]:
training_args = TrainingArguments(
    # Where artefacts are written.
    output_dir=output_dir,               # checkpoints / outputs
    logging_dir='./logs',                # log files
    # Optimisation settings.
    num_train_epochs=20,                 # total passes over the training set
    learning_rate=5e-5,
    warmup_steps=500,                    # warmup steps for the LR scheduler
    weight_decay=0.01,
    per_device_train_batch_size=32,      # per-device batch size while training
    per_device_eval_batch_size=32,       # per-device batch size while evaluating
    # Cadence, counted in steps.
    logging_steps=100,                   # log every 100 steps
    save_steps=500,                      # save a checkpoint every 500 steps
    save_total_limit=3,                  # cap on the number of saved checkpoints
    evaluation_strategy='steps',         # one of 'no', 'steps', 'epoch'
    eval_steps=500,                      # evaluate every 500 steps
    # NOTE(review): load_best_model_at_end is not set here — confirm whether
    # this metric is meant to drive best-checkpoint selection.
    metric_for_best_model='accuracy',
)

In [None]:
# Bind model, hyper-parameters, both datasets and the metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,   # accuracy computed at each evaluation
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Run the fine-tuning loop configured in training_args.
trainer.train()

## Inference

In [None]:
# Evaluate on the Trainer's eval_dataset and display the returned metrics.
trainer.evaluate()

In [None]:
# trainer.save_model('./results/checkpoint-4500/')
# trainer.save_state()

In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, BertConfig, BertTokenizer, ElectraTokenizer, ElectraForSequenceClassification
# from tokenization_kobert import KoBertTokenizer
from torch.utils.data import DataLoader
# from load_data import *
import pandas as pd
import torch
import pickle as pickle
import numpy as np
import argparse

# import argparse
from importlib import import_module

In [None]:
def inference(model, tokenized_sent, device, batch_size=40):
    """Predict a class index for every example in ``tokenized_sent``.

    Args:
        model: Callable model with an ``eval()`` method; called with
            ``input_ids`` and ``attention_mask`` keyword tensors. Element 0 of
            its return value is used as the logits.
        tokenized_sent: Dataset whose items are dicts containing at least
            ``'input_ids'`` and ``'attention_mask'``.
        device: Device the input tensors are moved to before the forward pass.
        batch_size: Number of examples per forward pass (default 40).

    Returns:
        1-D NumPy array of argmax class indices in dataset order. An empty
        dataset gives an empty int64 array.
    """
    dataloader = DataLoader(tokenized_sent, batch_size=batch_size, shuffle=False)
    model.eval()
    output_pred = []

    with torch.no_grad():
        for data in dataloader:
            # token_type_ids are deliberately not forwarded to the model.
            outputs = model(
                input_ids=data['input_ids'].to(device),
                attention_mask=data['attention_mask'].to(device),
            )
            logits = outputs[0].detach().cpu().numpy()
            output_pred.append(np.argmax(logits, axis=-1))

    if not output_pred:
        return np.array([], dtype=np.int64)
    # Concatenate rather than np.array(...).flatten(): the final batch is
    # usually shorter, and a ragged list cannot be flattened into one vector.
    return np.concatenate(output_pred)

In [None]:
def load_test_dataset(dataset_dir, tokenizer):
    """Load a dataset file and return its tokenized form together with its labels.

    Args:
        dataset_dir: Path handed to ``load_data``.
        tokenizer: Tokenizer handed to ``tokenized_dataset``.

    Returns:
        Tuple ``(tokenized, labels)`` where ``labels`` is the ``.values`` of
        the frame's ``label`` column.
    """
    frame = load_data(dataset_dir)
    labels = frame['label'].values
    encodings = tokenized_dataset(frame, tokenizer)
    return encodings, labels

In [None]:
# Re-create the tokenizer for inference from the named checkpoint.
TOK_NAME = "xlm-roberta-large"
tokenizer = XLMRobertaTokenizer.from_pretrained(TOK_NAME)

In [None]:
# Load fine-tuned weights from a saved checkpoint and move them to the device.
# NOTE(review): this path is hard-coded, while training writes to `output_dir`,
# which increment_output_dir may have numbered — confirm it is the intended run.
model = XLMRobertaForSequenceClassification.from_pretrained('./results/checkpoint-4500/')
model.to(device)

In [None]:
# Load the test TSV through load_data and keep its label column.
test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
test_data = load_data(test_dataset_dir)
test_label = test_data['label']

In [None]:
# Display the loaded test frame for a visual check.
test_data

In [None]:
# Append a question naming both entities to every test sentence.
# This is a single column assignment on purpose: the per-row form
# `test_data['sentence'][i] = ...` is chained indexing, which triggers
# SettingWithCopyWarning, has no effect under pandas copy-on-write, and only
# works while the index is exactly 0..n-1.
# astype(str) mirrors how an f-string would render the entity values.
test_data['sentence'] = (
    test_data['sentence']
    + " [SEP] 이 문장에서 "
    + test_data['entity_01'].astype(str)
    + "과 "
    + test_data['entity_02'].astype(str)
    + "는 어떤 관계야?"
)

In [None]:
# Tokenize the test frame and wrap it with its labels for batched inference.
tokenized_test = tokenized_dataset(test_data, tokenizer)
test_dataset = RE_Dataset(tokenized_test ,test_label)

In [None]:
# Predict a class id for every test example and store them in a one-column frame.
pred_answer = inference(model, test_dataset, device)
output = pd.DataFrame(pred_answer, columns=['pred'])
# Create the target folder first so a finished inference run is not lost to
# a missing directory when the CSV is written.
os.makedirs("./prediction", exist_ok=True)
output.to_csv("./prediction/submission.csv", index=False)

In [None]:
from IPython.display import Audio
# Auto-play a short beep fetched from the URL below (presumably a "finished" signal).
Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg", autoplay=True)

### check

In [None]:
# Reload the training TSV; this rebinds df_train, replacing the filtered frame used for training.
df_train = load_data('./input/data/train/train.tsv')

In [None]:
# Read back the predictions written to the submission CSV.
pred = pd.read_csv('./prediction/submission.csv')

In [None]:
# Element-wise match between predicted ids and the labels in df_train.
# NOTE(review): df_train comes from train.tsv, while submission.csv in this
# notebook is produced from test.tsv — confirm the two are row-aligned.
cnt = pred['pred'] == df_train['label']

In [None]:
# Fraction of rows where the prediction equals the label.
print(cnt.sum()/len(cnt))