In [None]:
from torch.utils.data import Dataset

import torch
import pandas as pd
import numpy as np

In [None]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import RobertaConfig, RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer

from sklearn.model_selection import StratifiedKFold

In [None]:
import random

def seed_everything(seed) :
    """Seed every random number generator this notebook uses.

    Args:
        seed: integer seed handed to PyTorch (CPU and CUDA), NumPy and
            Python's `random` module.

    Returns:
        None. The function only mutates global RNG / cuDNN state.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # needed when several GPUs are used
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders :
        apply_seed(seed)

    # Ask cuDNN for deterministic kernels and switch off its autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

seed_everything(42)

In [None]:
# Hugging Face model id; the same id is used further down for the config and the model weights.
MODEL_NAME = 'klue/bert-base'
# Tokenizer that matches MODEL_NAME; extra special tokens are registered on it in a later cell.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Train

In [None]:
# Training data as a DataFrame; the cross-validation cell stratifies on its 'label' column.
dataset = pd.read_csv('/opt/ml/dataset/train/train_pororo.csv')

In [None]:
# Hyper-parameters shared by every fold's Trainer.
training_args = TrainingArguments(
    # --- where artefacts go ---
    output_dir='./results',            # checkpoints are written here
    logging_dir='./logs',              # training logs are written here

    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=5e-5,
    warmup_steps=500,                  # scheduler warm-up length, in steps
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    # --- logging / evaluation / checkpointing cadence (in steps) ---
    logging_steps=100,
    evaluation_strategy='steps',       # 'no' | 'steps' | 'epoch'
    eval_steps=500,
    save_steps=500,                    # same cadence as eval_steps
    save_total_limit=5,                # keep at most this many checkpoints
    load_best_model_at_end=True,       # reload the best checkpoint when training ends
)

In [None]:
# Load the special tokens (one token per line of the text file).
MODEL_NAME = 'klue/bert-base'

special_token_list = []
with open('./dataset/pororo_special_token.txt', 'r', encoding = 'UTF-8') as f :
    for line in f :
        token = line.rstrip('\n')
        # A blank line (e.g. a trailing newline at the end of the file) would
        # otherwise be registered as an empty-string special token.
        if token :
            special_token_list.append(token)

In [None]:
# Register the custom special tokens on the tokenizer.
# The list is de-duplicated with dict.fromkeys(), which keeps file order. The
# previous list(set(...)) ordering depends on Python's per-process string hash
# randomization, so the token -> id assignment could change between kernel
# sessions and no longer match embeddings trained in an earlier session.
unique_special_tokens = list(dict.fromkeys(special_token_list))
added_token_num = tokenizer.add_special_tokens({"additional_special_tokens":unique_special_tokens})
print(added_token_num)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Sequence-classification model with a 30-way output head, placed on `device`.
model_config = AutoConfig.from_pretrained(MODEL_NAME, num_labels = 30)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    config = model_config,
)
model.to(device)

In [None]:
# Show the input embedding layer before and after growing it for the added tokens.
print(model.get_input_embeddings())
# len(tokenizer) counts the base vocabulary plus every added token. The old
# `tokenizer.vocab_size + added_token_num` goes stale when the add-tokens cell
# is re-run: add_special_tokens() then returns 0 and the resize would cut the
# embedding matrix back to the base vocabulary.
model.resize_token_embeddings(len(tokenizer))
print(model.get_input_embeddings())

In [None]:
# Final submission run (max_length = 128)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# Re-seed all RNGs so that re-running this cell starts from the same state.
seed_everything(42)

models = []
# seed_everything() has no return value, so the former
# `random_state = seed_everything(42)` passed None to StratifiedKFold.
# Pass the seed explicitly so the fold assignment is reproducible on its own.
stf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
for fold, (train_idx, dev_idx) in enumerate(stf.split(dataset, list(dataset['label']))) :
    print('Fold {}'.format(fold + 1))

    # Every fold starts from fresh pretrained weights with a 30-way head.
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 30

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config = model_config)
    model.to(device)

    # Grow the token embedding matrix to cover the added special tokens.
    # len(tokenizer) = base vocabulary + all added tokens, and stays correct
    # even when the add-tokens cell was executed more than once.
    model.resize_token_embeddings(len(tokenizer))

    # Split the DataFrame by the positional indices of this fold.
    train_dataset = dataset.iloc[train_idx]
    dev_dataset = dataset.iloc[dev_idx]

    train_label = label_to_num(train_dataset['label'].values)
    dev_label = label_to_num(dev_dataset['label'].values)

    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    trainer = Trainer(
        model=model,                       # the instantiated Transformers model to be trained
        args=training_args,                # training arguments, defined above
        train_dataset=RE_train_dataset,    # training dataset
        eval_dataset=RE_dev_dataset,       # evaluation dataset
        compute_metrics=compute_metrics    # metrics function
    )
    trainer.train()

    # Keep the trained model of this fold; a later cell writes them to disk.
    models.append(model)

In [None]:
def makedirs(path) :
    """Create directory `path`, including missing parents.

    Does nothing when `path` already exists as a directory.

    Args:
        path: directory path to create.

    Raises:
        OSError: if the directory cannot be created, e.g. `path` exists but
            is not a directory.
    """
    # Imported locally because none of the notebook's visible import cells
    # brings `os` into scope.
    import os

    os.makedirs(path, exist_ok = True)

# Write each fold's model into its own directory under ./best_model/sixth_try/.
for i, model in enumerate(models) :
    fold_dir = f'./best_model/sixth_try/fold_{i}/'
    makedirs(fold_dir)
    model.save_pretrained(fold_dir)

# Pick the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available() :
    device = torch.device('cuda:0')
else :
    device = torch.device('cpu')
print(device)

In [None]:
# --- Build the test set -------------------------------------------------
test_dataset = pd.read_csv('./dataset/test_pororo.csv')
# The label column is overwritten with a placeholder value (100); presumably
# only needed because RE_Dataset takes a label list - TODO confirm.
test_dataset['label'] = 100
test_label = list(map(int, test_dataset['label'].values))
tokenized_test = tokenized_dataset(test_dataset, tokenizer)
test_id = test_dataset['id']
Re_test_dataset = RE_Dataset(tokenized_test ,test_label)

# shuffle=False keeps predictions aligned with `test_id`.
dataloader = torch.utils.data.DataLoader(Re_test_dataset, batch_size=32, shuffle=False)

# --- Average the class probabilities of the fold models -------------------
n_folds = 5  # must match the number of fold directories saved during training
oof_pred = None
for fold in range(n_folds) :
    # NOTE(review): the models are saved under the relative path
    # './best_model/sixth_try/'; this absolute path only points at the same
    # files when the notebook runs from /opt/ml/code - confirm.
    model_name = '/opt/ml/code/best_model/sixth_try/fold_{}'.format(fold)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # len(tokenizer) = base vocabulary + added special tokens.
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)
    model.eval()

    output_pred = []
    # The batch loop uses its own variable names so it no longer overwrites
    # the fold index of the outer loop.
    for data in tqdm(dataloader) :
        with torch.no_grad() :
            outputs = model(
                input_ids=data['input_ids'].to(device),
                attention_mask=data['attention_mask'].to(device),
                token_type_ids=data['token_type_ids'].to(device)
                )
            logits = outputs[0]
            prob = torch.softmax(logits, dim = -1).cpu().numpy()
        output_pred.append(prob)
    final_prob = np.concatenate(output_pred, axis = 0)

    # Running mean over folds: each fold contributes 1 / n_folds of its probabilities.
    if oof_pred is None :
        oof_pred = final_prob / n_folds
    else :
        oof_pred += final_prob / n_folds

# --- Turn averaged probabilities into labels and write the submission -----
result = np.argmax(oof_pred, axis = -1)
pred_answer = num_to_label(result)
output_prob = oof_pred.tolist()

output = pd.DataFrame({'id':test_id,'pred_label':pred_answer,'probs':output_prob,})
output.to_csv('./prediction/submission_seventh.csv', index=False) # Save the final predicted labels as a CSV file.