# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

## Set Hyperparameters

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, 'resources/pre_processed_data')
OUTPUT_DIR = os.path.join(BASE_DIR, '../output')

## Load Tokenizer and Model

In [5]:
model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Dataset

In [6]:
data = pd.read_csv(os.path.join(DATA_DIR,'train_20241104.csv'))
# dataset_train, dataset_valid = train_test_split(data, test_size=0.1, random_state=SEED)
# print("Train 데이터:")
# print(len(dataset_train))
# print("\nValid 데이터:")
# print(len(dataset_valid))

In [7]:
dataset_train = pd.DataFrame()
dataset_valid = pd.DataFrame()

for target, group in data.groupby('target'):
    n = len(group)
    valid_size = max(int(n * 0.1), 1)  # 최소 1개는 valid로
    
    valid_indices = np.random.choice(group.index, size=valid_size, replace=False)
    train_indices = group.index.difference(valid_indices)
    
    dataset_train = pd.concat([dataset_train, group.loc[train_indices]])
    dataset_valid = pd.concat([dataset_valid, group.loc[valid_indices]])

# 결과 출력
print("Train 데이터:")
print(len(dataset_train))
print("\nValid 데이터:")
print(len(dataset_valid))

Train 데이터:
10456

Valid 데이터:
1158


In [8]:
dataset_train['target'].value_counts()

target
6    1554
1    1553
2    1535
0    1494
3    1463
4    1454
5    1403
Name: count, dtype: int64

In [9]:
dataset_valid['target'].value_counts()

target
1    172
6    172
2    170
0    166
3    162
4    161
5    155
Name: count, dtype: int64

In [10]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [11]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [13]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')


## Train Model

In [14]:
## for wandb setting
os.environ['WANDB_DISABLED'] = 'true'

In [15]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [17]:
trainer.train()

Step,Training Loss,Validation Loss,F1
100,1.24,0.778846,0.755542
200,0.8357,0.683899,0.794302
300,0.7167,0.628422,0.806925
400,0.6093,0.601929,0.824906
500,0.5382,0.581376,0.827887
600,0.5098,0.562797,0.828998


TrainOutput(global_step=654, training_loss=0.7234333516625454, metrics={'train_runtime': 626.4333, 'train_samples_per_second': 33.383, 'train_steps_per_second': 1.044, 'total_flos': 5502425398886400.0, 'train_loss': 0.7234333516625454, 'epoch': 2.0})

## Train, Valid Check

In [19]:
print(dataset_train)
# print(dataset_valid)

                        ID                                  text  target  \
65     ynat-v1_train_00018                           개 전 연 정연 작가       0   
66     ynat-v1_train_00018        전쟁과 평화의 경계에서, H 작가의 작품이 던지는 질문       0   
67     ynat-v1_train_00018  작가 H의 문학적 통찰력으로 재조명된 전전쟁 연합의 새로운 시대.       0   
68     ynat-v1_train_00018     전쟁과 평화의 경계에서, H 작가들의 작품에서 제기된 질문들       0   
69     ynat-v1_train_00018                           정연희, 본사 작가.       0   
...                    ...                                   ...     ...   
11576  ynat-v1_train_02787       13일 노바 라이프미주 3개 품목 10% 할인 상품 출시       6   
11577  ynat-v1_train_02787                     이 노바 라이 미 어패 결 상품       6   
11593  ynat-v1_train_02791          거리에서 노래하던 베네수엘라 이민자에게 찾아온 기적       6   
11601  ynat-v1_train_02793         경찰 월초 유커와 日관광객위해 바가지 요금 집중 단속       6   
11603  ynat-v1_train_02795         트럼프 폭스뉴스 앵커들 충성도 점수매겨 점만점에 점도       6   

      target_name     source  
65           생활문화      noisy  
66           생활문화      no

In [20]:
print(dataset_valid)


                       ID                                            text  \
1205  ynat-v1_train_00287            프베이스볼 스타 윤호영이 무릎 수술을 위해 650만원을 지원한다.   
6877  ynat-v1_train_01617                            화천 산천어 축제 제2회 즐거운 눈썰   
3914  ynat-v1_train_00918                  미스터 션샤인, 새로운 시즌 예고...주요 인물들 복귀   
1959  ynat-v1_train_00494  대형 서점의 베스트셀러 코너에 있는 책의 다양성 문제를 제기하는 독자들의 불만이다.   
4722  ynat-v1_train_01088      메이샨은 일본 회사가 주최한 컨퍼런스에서 뛰어난 작곡으로 우수상을 수상했다.   
...                   ...                                             ...   
5516  ynat-v1_train_01271                              밀라노 지오디 광장의 야외 광고.   
3268  ynat-v1_train_00783                 북미, 치열한 경제 경쟁 속 아E존의 전략적 중요성 부각   
5831  ynat-v1_train_01346          초등학교 3부, 혁신적인 Y 교육 방식으로 학생들의 학습 효과 극대화   
9390  ynat-v1_train_02268           UOz 감독의 충격적 발언, 영화 '펄프 픽션' 캐릭터 비유로 논란   
7285  ynat-v1_train_01734                   트럼프의 루마니아 방문... 정경두 국방장관의 기도.   

      target target_name source  
1205       0        생활문화  noisy  
6877   

In [26]:

def evaluation (dataset_test):
    model.eval()
    preds = []
    probs = []
    for idx, sample in tqdm(dataset_test.iterrows(), total=len(dataset_test), desc="Evaluating"):
        inputs = tokenizer(sample['text'], padding='max_length',truncation=True, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            logits = model(**inputs).logits
            prob = torch.nn.Softmax(dim=1)(logits)
            pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
            preds.extend(pred)
            probs.extend(prob.cpu().numpy())
    dataset_test['predicted_value'] = preds
    dataset_test.to_csv(os.path.join(BASE_DIR, 'resources/analyze/valid_output.csv'), index=False)

In [25]:
evaluation(dataset_test=dataset_train)

Evaluating: 100%|██████████| 10456/10456 [02:05<00:00, 83.54it/s]


In [27]:
evaluation(dataset_test=dataset_valid)

Evaluating: 100%|██████████| 1158/1158 [00:13<00:00, 83.48it/s]


## Evaluate Model

In [18]:
dataset_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [20]:
model.eval()
preds = []
probs = []

for idx, sample in tqdm(dataset_test.iterrows(), total=len(dataset_test), desc="Evaluating"):
    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        prob = torch.nn.Softmax(dim=1)(logits)
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        preds.extend(pred)
        probs.extend(prob.cpu().numpy())

Evaluating:  82%|████████▏ | 24697/30000 [03:08<00:40, 131.56it/s]

In [22]:
dataset_test['target'] = preds
dataset_test.to_csv(os.path.join(BASE_DIR, 'resources/output/output.csv'), index=False)