# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

## Set Hyperparameters

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [None]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, 'resources/processed/v7')
OUTPUT_DIR = os.path.join(BASE_DIR, './resources/output_v7')

## Load Tokenizer and Model

In [5]:
model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Dataset

In [6]:
data = pd.read_csv(os.path.join('resources/output_v7/train_20241107_1.csv'))
# dataset_train, dataset_valid = train_test_split(data, test_size=0.1, random_state=SEED)
# print("Train 데이터:")
# print(len(dataset_train))
# print("\nValid 데이터:")
# print(len(dataset_valid))

In [None]:
# data = data[data['source'] == 'noisy']

In [7]:
data.head(10)

Unnamed: 0,ID,text,target,target_name,similarity
0,ynat-v1_train_00000,"정부, '주파수 미사용' KT에 이용기간 2년 단축 처분(종합2보)",4,IT과학,0.20599
1,ynat-v1_train_00000,"정부, KT 주파수 미사용에 강력한 제재 조치",4,IT과학,0.2312
2,ynat-v1_train_00000,"KT, 주파수 미사용으로 이용기간 2년 단축 처분",4,IT과학,0.21202
3,ynat-v1_train_00000,"KT, 주파수 미사용으로 인한 사용 기간의 2년 단축.",4,IT과학,0.22293
4,ynat-v1_train_00000,미사용 주파수에 대한 정부와 KT에 대한 강력한 제재.,4,IT과학,0.23634
5,ynat-v1_train_00001,찰 국 로 한 회장 송,3,사회,0.18184
6,ynat-v1_train_00001,경찰이 한국상공회의소 회장을 비롯해 20여 명의 '국회 불법 로비' 혐의로 구속했다.,3,사회,0.18318
7,ynat-v1_train_00001,"경찰, '국회 불법 로비' 한어총 회장 등 20명 송치",3,사회,0.17293
8,ynat-v1_train_00001,KTU 교원들이 한번 맞으면 탈락 체제 도입에 반대한다.,3,사회,0.22409
9,ynat-v1_train_00001,교육계에서 급식용 천막 설치에 대한 논란이 다시 불타오르고 있다.,3,사회,0.2256


In [8]:
dataset_train = pd.DataFrame()
dataset_valid = pd.DataFrame()

for target, group in data.groupby('target'):
    n = len(group)
    valid_size = max(int(n * 0.1), 1)  # 최소 1개는 valid로
    
    np.random.seed(SEED)
    valid_indices = np.random.choice(group.index, size=valid_size, replace=False)
    train_indices = group.index.difference(valid_indices)
    
    dataset_train = pd.concat([dataset_train, group.loc[train_indices]])
    dataset_valid = pd.concat([dataset_valid, group.loc[valid_indices]])

# 결과 출력
print("Train 데이터:")
print(len(dataset_train))
print("\nValid 데이터:")
print(len(dataset_valid))

Train 데이터:
14010

Valid 데이터:
1553


In [9]:
dataset_train.head()

Unnamed: 0,ID,text,target,target_name,similarity
48,ynat-v1_train_00018,"정연희, 본사 작가.",0,생활문화,0.19117
49,ynat-v1_train_00018,"개전 연정의 새 시대, H 작가의 문학적 통찰로 재조명",0,생활문화,0.22476
62,ynat-v1_train_00024,서울에 다시 오존주의보 도심 서북 동북권 발령종합,0,생활문화,0.19536
97,ynat-v1_train_00036,크루즈 관광객용 반나절 부산 해안 트레킹 상품 개발,0,생활문화,0.2065
170,ynat-v1_train_00071,영상 냄새가 나는데 퇴근길 마주친 수상한 차 따라가봤더니,0,생활문화,0.20018


In [10]:
dataset_valid.head()

Unnamed: 0,ID,text,target,target_name,similarity
13027,ynat-v1_train_02170,"일왕·왕비, 즉위식 거행",0,생활문화,0.19169
2410,ynat-v1_train_01010,노 김 피 손민 겨 는 베 트 연 곡,0,생활문화,0.19184
13168,ynat-v1_train_00600,도자 예술의 회고적인 전시인 YGLU 전시회에서 만나게 된 혁신적인 작품들.,0,생활문화,0.20621
13945,ynat-v1_train_01034,남쪽 먼 제주 동부 풍 강풍경 대,0,생활문화,0.21113
12399,905,만화박물관 3·1운동 기념벽화 걸고 영화 '동주' 무료 특별 관람 상영,0,생활문화,0.20029


In [11]:
dataset_train['target'].value_counts()

target
4    2348
6    2175
0    2021
2    1964
3    1893
1    1881
5    1728
Name: count, dtype: int64

In [12]:
dataset_valid['target'].value_counts()

target
4    260
6    241
0    224
2    218
3    210
1    208
5    192
Name: count, dtype: int64

In [13]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [14]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [15]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [16]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')


## Train Model

In [17]:
## for wandb setting
os.environ['WANDB_DISABLED'] = 'true'

In [18]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    # learning_rate= 5e-05,
    learning_rate= 3e-05,
    # learning_rate= 1e-04,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [91]:
# f1
trainer.train()

Step,Training Loss,Validation Loss,F1
100,1.1064,0.581233,0.818006
200,0.4952,0.488878,0.856991
300,0.4457,0.507613,0.855034
400,0.404,0.472658,0.853725
500,0.4579,0.467572,0.867872
600,0.3775,0.45571,0.868934
700,0.3714,0.411424,0.886336
800,0.2084,0.426571,0.880624
900,0.2285,0.422573,0.886753
1000,0.2459,0.393742,0.897721


TrainOutput(global_step=1394, training_loss=0.3644647919806723, metrics={'train_runtime': 813.5329, 'train_samples_per_second': 27.416, 'train_steps_per_second': 1.714, 'total_flos': 5868692430028800.0, 'train_loss': 0.3644647919806723, 'epoch': 2.0})

In [21]:
# eval
trainer.train(resume_from_checkpoint=True)

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss,Validation Loss,F1
1700,0.1515,0.26635,0.933515


TrainOutput(global_step=1752, training_loss=0.012891080553673174, metrics={'train_runtime': 86.0697, 'train_samples_per_second': 325.55, 'train_steps_per_second': 20.356, 'total_flos': 7372702738944000.0, 'train_loss': 0.012891080553673174, 'epoch': 2.0})

## Evaluate Model

In [24]:
dataset_test = pd.read_csv("resources/raw_data/test.csv")

In [25]:
model.eval()
preds = []
probs = []

for idx, sample in tqdm(dataset_test.iterrows(), total=len(dataset_test), desc="Evaluating"):
    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        prob = torch.nn.Softmax(dim=1)(logits)
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        preds.extend(pred)
        probs.extend(prob.cpu().numpy())

Evaluating: 100%|██████████| 30000/30000 [03:46<00:00, 132.61it/s]


In [26]:
dataset_test['target'] = preds
dataset_test.to_csv(os.path.join(BASE_DIR, 'resources/output/output_3.csv'), index=False)

In [None]:
# output_with_prob = dataset_test.copy()
# probs = np.array(probs)
# for i in range(probs.shape[1]):
#     output_with_prob[f'prob_{i}'] = probs[:, i]
    
# output_with_prob.to_csv(os.path.join(BASE_DIR, 'resources/output/output_prob.csv'), index=False)