In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_name = 'klue/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to('cuda')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

In [5]:
def data_split(data, test_size, random_seed):
    train_data, test_data = pd.DataFrame([]), pd.DataFrame([])
    for label in range(7):
        temp = data[data['target']==label]  # target이 label인 애들 다 뽑기
        test_temp = temp.sample(frac=test_size, random_state=random_seed)
        test_data = pd.concat([test_data, test_temp], axis=0)
        train_temp = temp.drop(test_temp.index)
        train_data = pd.concat([train_data, train_temp], axis=0)
    return train_data, test_data

data = pd.read_csv('train_20241104_filtering_mask.csv')
dataset_train, dataset_valid = data_split(data, test_size=0.01, random_seed=64)
len(dataset_train), len(dataset_valid)

(10053, 103)

In [6]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [7]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')

In [8]:
### for wandb setting
os.environ['WANDB_DISABLED'] = 'true'

In [18]:
training_args = TrainingArguments(
    output_dir="./topic_train",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=100,
    save_steps=100,
    save_total_limit=5,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=64
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [53]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [20]:
torch.cuda.empty_cache()
trainer.train()

Step,Training Loss,Validation Loss,F1
100,1.15,0.520319,0.874848
200,0.5968,0.421095,0.871867
300,0.5381,0.430454,0.882221
400,0.4499,0.421656,0.903538
500,0.3077,0.427958,0.879709
600,0.3521,0.335875,0.921573
700,0.2646,0.388086,0.920978
800,0.2724,0.38657,0.899583
900,0.2158,0.369979,0.899583


TrainOutput(global_step=945, training_loss=0.44975327385796443, metrics={'train_runtime': 820.6596, 'train_samples_per_second': 36.75, 'train_steps_per_second': 1.152, 'total_flos': 7935522551884800.0, 'train_loss': 0.44975327385796443, 'epoch': 3.0})

In [21]:
model = AutoModelForSequenceClassification.from_pretrained("topic_train/checkpoint-600", num_labels=7).to('cuda')

Test

In [47]:
dataset_test = pd.read_csv("to_relabel/train_noise_artaug.csv")

In [52]:
len(pd.read_csv("to_relabel/hard_case_generation.csv"))+len(pd.read_csv("to_relabel/title_augtitle_compare.csv"))+len(pd.read_csv("to_relabel/train_20241104_filtering_notnoisy_augtitle_drop.csv"))+len(pd.read_csv("to_relabel/train_noise_artaug.csv"))

5823

In [48]:
dataset_test.head()

Unnamed: 0,ID,text,target
0,ynat-v1_train_00000,"**KT, 3개월 단위 분할 납부 가능한 새로운 요금제 도입**",4
1,ynat-v1_train_00001,"대한축구협회 신임 회장, 선수 복지 및 유소년 축구 지원 정책 발표",3
2,ynat-v1_train_00002,"김정일 국방위원장, 자주통일 위한 새로운 길 강조하다",2
3,ynat-v1_train_00004,"러시아 해커들, 미국 대선 앞두고 사이버 감시 활동 강화 조짐",6
4,ynat-v1_train_00010,"**매력, 혁신적 신메뉴 출시로 주목받아**",5


In [49]:
model.eval()
preds = []
cor = 0
tot = 0
for idx, sample in tqdm(dataset_test.iterrows(), total=len(dataset_test), desc="Evaluating"):
    inputs = tokenizer(sample['text'], return_tensors="pt").to('cuda')
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        tot += 1
        if pred == sample['target']: cor += 1
        preds.extend(pred)
        
print(cor / tot * 100)

Evaluating: 100%|██████████| 1399/1399 [00:10<00:00, 129.16it/s]

67.40528949249463





In [50]:
dataset_test['pred_target'] = preds
dataset_test.head()

Unnamed: 0,ID,text,target,pred_target
0,ynat-v1_train_00000,"**KT, 3개월 단위 분할 납부 가능한 새로운 요금제 도입**",4,5
1,ynat-v1_train_00001,"대한축구협회 신임 회장, 선수 복지 및 유소년 축구 지원 정책 발표",3,1
2,ynat-v1_train_00002,"김정일 국방위원장, 자주통일 위한 새로운 길 강조하다",2,2
3,ynat-v1_train_00004,"러시아 해커들, 미국 대선 앞두고 사이버 감시 활동 강화 조짐",6,6
4,ynat-v1_train_00010,"**매력, 혁신적 신메뉴 출시로 주목받아**",5,0


In [51]:
dataset_test.to_csv("topic_train/artaug_relabel.csv", encoding='utf-8-sig', index=False)