# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

## Set Hyperparameters

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, './resources/processed/')
OUTPUT_DIR = os.path.join(BASE_DIR, './resources/output')
# DATA_DIR = os.path.join(BASE_DIR, './resources/raw_data/')
# OUTPUT_DIR = os.path.join(BASE_DIR, './resources/output')

## Load Tokenizer and Model

In [5]:
model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Dataset

In [29]:
# data = pd.read_csv(os.path.join(DATA_DIR, 'train_v2.csv'))
# data = pd.read_csv(os.path.join(DATA_DIR, 'train_headline_topic_inference.csv'))
data = pd.read_csv(os.path.join(DATA_DIR, 'simon_dataset_duplicate_noise_only.csv'))
# data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
dataset_train, dataset_valid = train_test_split(data, test_size=0.3, random_state=SEED)

In [33]:
# # target별로 그룹화하고 각 그룹에서 99%는 train, 1%는 valid로 분할
# dataset_train = pd.DataFrame()
# dataset_valid = pd.DataFrame()

# for target, group in data.groupby('target'):
#     n = len(group)
#     valid_size = max(int(n * 0.1), 1)  # 최소 1개는 valid로
    
#     valid_indices = np.random.choice(group.index, size=valid_size, replace=False)
#     train_indices = group.index.difference(valid_indices)
    
#     dataset_train = pd.concat([dataset_train, group.loc[train_indices]])
#     dataset_valid = pd.concat([dataset_valid, group.loc[valid_indices]])

# # 결과 출력
# print("Train 데이터:")
# print(len(dataset_train))
# print("\nValid 데이터:")
# print(len(dataset_valid))

Train 데이터:
5716

Valid 데이터:
632


In [34]:
dataset_train['target'].value_counts()

target
6    825
2    821
5    818
4    814
0    814
3    814
1    810
Name: count, dtype: int64

In [35]:
dataset_valid['target'].value_counts()

target
2    91
6    91
0    90
1    90
3    90
4    90
5    90
Name: count, dtype: int64

In [27]:
dataset_valid

Unnamed: 0.1,Unnamed: 0,ID,text,target,target_name
2143,3356,ynat-v1_train_00918,"미스터 션샤인, 팬들 기대 속 시즌2 제작 확정",0,생활문화
1780,2993,ynat-v1_train_00784,토 애 레 듀오 넷플 서 틀,0,생활문화
2028,3241,ynat-v1_train_00873,청 항 짙은 안 항공 회 지 운,0,생활문화
6333,7546,ynat-v1_train_02788,남원소식 춘향단장 모집,0,생활문화
3803,5016,ynat-v1_train_01638,"H영 극단, 호주 무대서 첫 선...다양한 언어로 감동 전달",0,생활문화
...,...,...,...,...,...
3289,4502,ynat-v1_train_01380,'렉스트 X1 출시 기념 이벤트 중)>행7재 I동 무계약,6,세계
1019,2232,ynat-v1_train_00482,"올해 노벨상 수상자들, 글로벌 무대에서 업적 발표",6,세계
163,1376,ynat-v1_train_00067,"삼성, 갤럭시 S9 퀀텀 패키지 5천대 한정 판매 개시",6,세계
142,1355,ynat-v1_train_00058,"M2 치료제 개발, 미국 임플란트 시장 본격화",6,세계


In [11]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [12]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [14]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')


## Train Model

In [15]:
### for wandb setting
os.environ['WANDB_DISABLED'] = 'true'

In [24]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    learning_rate= 5e-04,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,# 5~10
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [26]:
trainer.train()

Step,Training Loss,Validation Loss,F1
100,1.9057,1.955235,0.057595


KeyboardInterrupt: 

## Evaluate Model

In [58]:
dataset_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [59]:
model.eval()
preds = []

for idx, sample in tqdm(dataset_test.iterrows(), total=len(dataset_test), desc="Evaluating"):
    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        preds.extend(pred)

Evaluating: 100%|██████████| 30000/30000 [03:51<00:00, 129.64it/s]


In [60]:
dataset_test['target'] = preds
dataset_test.to_csv(os.path.join(BASE_DIR, 'output.csv'), index=False)