# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

## Set Hyperparameters

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, '../data/preprocessed/important_morphemes_text.csv') # processed된 파일까지 경로 
TEST_DIR = os.path.join(BASE_DIR, '../data/preprocessed/unique_ids.csv')
train_name = os.path.splitext(os.path.basename(DATA_DIR))[0]  # processed된 파일 이름 추출
OUTPUT_DIR = os.path.join(BASE_DIR, f'../output/{train_name}_plm') # 해당 파일 이름으로 output 폴더 생성

## Load Tokenizer and Model

In [5]:
model_name = 'klue/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 31.74 GiB of which 6.38 MiB is free. Process 4182598 has 31.16 GiB memory in use. Process 30667 has 582.00 MiB memory in use. Of the allocated memory 258.25 MiB is allocated by PyTorch, and 17.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Define Dataset

In [None]:
data = pd.read_csv(DATA_DIR)
dataset_train, dataset_valid = train_test_split(data, test_size=0.2, random_state=SEED)

In [None]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []; self.labels = []
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        return len(self.labels)

In [None]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [None]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')


## Train Model

In [None]:
### for wandb setting
#os.environ['WANDB_DISABLED'] = 'true'

In [None]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=10,
    eval_steps=10,
    save_steps=100,
    save_total_limit=2,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

Step,Training Loss,Validation Loss,F1
10,1.952,1.944743,0.041288
20,1.9457,1.910668,0.098039
30,1.8674,1.757157,0.358569
40,1.6652,1.582846,0.405418
50,1.5215,1.421421,0.581657
60,1.3737,1.347166,0.557421
70,1.2632,1.273501,0.604143
80,1.1576,1.21714,0.608813
90,1.0774,1.204189,0.609231
100,1.0683,1.182934,0.60932


TrainOutput(global_step=148, training_loss=1.315018144813744, metrics={'train_runtime': 169.8794, 'train_samples_per_second': 27.832, 'train_steps_per_second': 0.871, 'total_flos': 1244044916121600.0, 'train_loss': 1.315018144813744, 'epoch': 4.0})

## Evaluate Model

In [32]:
dataset_test = pd.read_csv(TEST_DIR)

In [33]:
model.eval()
preds = []

for idx, sample in tqdm(dataset_test.iterrows(), total=len(dataset_test), desc="Evaluating"):
    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        preds.extend(pred)

Evaluating: 100%|██████████| 1322/1322 [00:13<00:00, 100.93it/s]


In [34]:
dataset_test['target'] = preds
dataset_test.to_csv(os.path.join(OUTPUT_DIR, 'output.csv'), index=False) # processed된 파일 이름 폴더에 결과 저장