# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

## Set Hyperparameters

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, 'resources/processed/v6')
OUTPUT_DIR = os.path.join(BASE_DIR, './resources/output_v6')

## Load Tokenizer and Model

In [5]:
model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Dataset

In [6]:
data = pd.read_csv(os.path.join('resources/processed/v6/train_20241104_case1_economyup.csv'))
# dataset_train, dataset_valid = train_test_split(data, test_size=0.1, random_state=SEED)
# print("Train 데이터:")
# print(len(dataset_train))
# print("\nValid 데이터:")
# print(len(dataset_valid))

In [None]:
# data = data[data['source'] == 'noisy']

In [7]:
data.head(10)

Unnamed: 0,ID,text,target,target_name,source,similarity,token_len
0,ynat-v1_train_00000,"정부, '주파수 미사용' KT에 이용기간 2년 단축 처분(종합2보)",4,IT과학,noisy,0.20599,21
1,ynat-v1_train_00000,"정부, KT 주파수 미사용에 강력한 제재 조치",4,IT과학,noisy,0.2312,11
2,ynat-v1_train_00000,"KT, 주파수 미사용으로 이용기간 2년 단축 처분",4,IT과학,noisy,0.21202,13
3,ynat-v1_train_00000,"KT, 주파수 미사용으로 인한 사용 기간의 2년 단축.",4,IT과학,noisy,0.22293,14
4,ynat-v1_train_00000,미사용 주파수에 대한 정부와 KT에 대한 강력한 제재.,4,IT과학,noisy,0.23634,14
5,ynat-v1_train_00001,찰 국 로 한 회장 송,3,사회,noisy,0.18184,6
6,ynat-v1_train_00001,경찰이 한국상공회의소 회장을 비롯해 20여 명의 '국회 불법 로비' 혐의로 구속했다.,3,사회,noisy,0.18318,25
7,ynat-v1_train_00001,"경찰, '국회 불법 로비' 한어총 회장 등 20명 송치",3,사회,noisy,0.17293,15
8,ynat-v1_train_00001,KTU 교원들이 한번 맞으면 탈락 체제 도입에 반대한다.,3,사회,noisy,0.22409,14
9,ynat-v1_train_00001,교육계에서 급식용 천막 설치에 대한 논란이 다시 불타오르고 있다.,3,사회,noisy,0.2256,18


In [8]:
dataset_train = pd.DataFrame()
dataset_valid = pd.DataFrame()

for target, group in data.groupby('target'):
    n = len(group)
    valid_size = max(int(n * 0.1), 1)  # 최소 1개는 valid로
    
    valid_indices = np.random.choice(group.index, size=valid_size, replace=False)
    train_indices = group.index.difference(valid_indices)
    
    dataset_train = pd.concat([dataset_train, group.loc[train_indices]])
    dataset_valid = pd.concat([dataset_valid, group.loc[valid_indices]])

# 결과 출력
print("Train 데이터:")
print(len(dataset_train))
print("\nValid 데이터:")
print(len(dataset_valid))

Train 데이터:
11152

Valid 데이터:
1237


In [9]:
dataset_train['target'].value_counts()

target
4    1982
6    1747
0    1644
2    1550
3    1515
1    1386
5    1328
Name: count, dtype: int64

In [10]:
dataset_valid['target'].value_counts()

target
4    220
6    194
0    182
2    172
3    168
1    154
5    147
Name: count, dtype: int64

In [11]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [12]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [14]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')


## Train Model

In [15]:
## for wandb setting
os.environ['WANDB_DISABLED'] = 'true'

In [16]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    # learning_rate= 5e-05,
    learning_rate= 3e-05,
    # learning_rate= 1e-04,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [45]:
trainer.train()

Step,Training Loss,Validation Loss,F1
100,1.0493,0.545224,0.822034
200,0.5231,0.405834,0.875747
300,0.4461,0.41348,0.874929
400,0.4188,0.381449,0.87855
500,0.4084,0.349625,0.894638
600,0.3596,0.304114,0.910338
700,0.3432,0.299436,0.910235
800,0.217,0.32296,0.90629
900,0.2051,0.33717,0.908363
1000,0.2261,0.289187,0.918957


TrainOutput(global_step=1394, training_loss=0.35690061467962936, metrics={'train_runtime': 808.5614, 'train_samples_per_second': 27.585, 'train_steps_per_second': 1.724, 'total_flos': 5868692430028800.0, 'train_loss': 0.35690061467962936, 'epoch': 2.0})

In [None]:
# f1
trainer.train()

Step,Training Loss,Validation Loss,F1
100,1.0208,0.505765,0.85545
200,0.5203,0.440108,0.862287
300,0.4665,0.429123,0.873862
400,0.444,0.399694,0.878991
500,0.4211,0.379867,0.88624
600,0.3394,0.371938,0.884975
700,0.3622,0.339983,0.896234
800,0.2278,0.376622,0.899426
900,0.243,0.34447,0.909185
1000,0.2108,0.364411,0.90443


TrainOutput(global_step=1394, training_loss=0.3609854644819175, metrics={'train_runtime': 807.9069, 'train_samples_per_second': 27.607, 'train_steps_per_second': 1.725, 'total_flos': 5868692430028800.0, 'train_loss': 0.3609854644819175, 'epoch': 2.0})

In [None]:
# f1
trainer.train()

Step,Training Loss,Validation Loss


## Evaluate Model

In [46]:
dataset_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [48]:
model.eval()
preds = []
probs = []

for idx, sample in tqdm(dataset_test.iterrows(), total=len(dataset_test), desc="Evaluating"):
    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        prob = torch.nn.Softmax(dim=1)(logits)
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        preds.extend(pred)
        probs.extend(prob.cpu().numpy())

Evaluating: 100%|██████████| 30000/30000 [03:54<00:00, 127.78it/s]


In [49]:
dataset_test['target'] = preds
dataset_test.to_csv(os.path.join(BASE_DIR, 'resources/output/output_2.csv'), index=False)

In [None]:
# output_with_prob = dataset_test.copy()
# probs = np.array(probs)
# for i in range(probs.shape[1]):
#     output_with_prob[f'prob_{i}'] = probs[:, i]
    
# output_with_prob.to_csv(os.path.join(BASE_DIR, 'resources/output/output_prob.csv'), index=False)