# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [97]:
import os
import random
from tqdm import tqdm
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

## Set Hyperparameters

In [98]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and PyTorch CUDA) with the same value.
SEED = 456
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

In [99]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

In [100]:
# Every path is derived from the directory the notebook is launched from.
BASE_DIR = os.getcwd()
DATA_DIR, OUTPUT_DIR = (
    os.path.join(BASE_DIR, relative_dir)
    for relative_dir in ('../data', '../output')
)

## Load Tokenizer and Model

In [129]:
# Checkpoint name shared by the tokenizer and the classifier.
model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model with 7 output labels, moved onto DEVICE.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=7,
)
model = model.to(DEVICE)

loading configuration file config.json from cache at /opt/ml/.cache/huggingface/hub/models--klue--bert-base/snapshots/77c8b3d707df785034b4e50f2da5d37be5f0f546/config.json
Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tra

## Define Dataset

In [130]:
# Load the training CSV. The file name is joined directly onto DATA_DIR (the
# same way test.csv is loaded further down) instead of going back through
# '../data/' a second time.
data = pd.read_csv(os.path.join(DATA_DIR, 'final_5_train.csv'))

# Hold out 30% of the rows for validation, stratified on the 'target' column
# and seeded with SEED so the split is repeatable.
dataset_train, dataset_valid = train_test_split(
    data,
    test_size=0.3,
    stratify=data['target'],
    random_state=SEED,
)

In [131]:
class BERTDataset(Dataset):
    """Dataset that tokenizes every row of a table once, at construction time.

    Args:
        data: Table-like object exposing a 'text' and a 'target' column
            (the notebook passes pandas DataFrames).
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=True, return_tensors='pt')``
            whose result can be indexed with 'input_ids' and 'attention_mask'.
        padding: Padding strategy forwarded to the tokenizer. Defaults to
            'max_length', the value that used to be hard-coded. Pass ``False``
            to leave padding to a batch collator such as
            ``DataCollatorWithPadding``.
        max_length: Optional length limit forwarded to the tokenizer. When
            ``None`` (default) the argument is not passed at all, so the
            tokenizer's own default applies.

    Each item is a dict with 'input_ids', 'attention_mask' and 'labels'.
    """

    def __init__(self, data, tokenizer, padding='max_length', max_length=None):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []
        self.labels = []

        tokenizer_kwargs = {
            'padding': padding,
            'truncation': True,
            'return_tensors': 'pt',
        }
        if max_length is not None:
            tokenizer_kwargs['max_length'] = max_length

        for text, label in zip(input_texts, targets):
            # Cast to str exactly like the inference loop does, so a
            # non-string cell (e.g. NaN from an empty CSV field) does not
            # reach the tokenizer as a float.
            tokenized_input = tokenizer(str(text), **tokenizer_kwargs)
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        """Return the idx-th example with the leading batch axis removed."""
        encoded = self.inputs[idx]
        return {
            # return_tensors='pt' yields a leading axis of size 1; drop it.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        """Number of examples stored in the dataset."""
        return len(self.labels)

In [132]:
# Tokenize the training and validation splits up front.
data_train, data_valid = (
    BERTDataset(split_frame, tokenizer)
    for split_frame in (dataset_train, dataset_valid)
)

In [133]:
# Batch collator built around the tokenizer; handed to the Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer)

## Define Metric

In [134]:
# Load the 'f1' metric through the `evaluate` library; compute_metrics calls f1.compute().
f1 = evaluate.load('f1')

def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predictions are
            reduced with an argmax over axis 1 before scoring.

    Returns:
        The result of ``f1.compute`` for the arg-maxed predictions against
        the labels, using ``average='macro'``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return f1.compute(
        predictions=predicted_classes,
        references=references,
        average='macro',
    )

## Train Model

In [24]:
### for wandb setting
# NOTE(review): WANDB_DISABLED is set to 'true' here while wandb.login()/init()
# are still called below -- confirm which behaviour is intended. The Trainer
# log also reports this variable as deprecated in favour of `report_to`.
os.environ['WANDB_DISABLED'] = 'true'

import wandb

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# SECURITY: the API key is no longer hard-coded in the notebook. It is read
# from the WANDB_API_KEY environment variable; when that is unset, key=None
# lets wandb.login() fall back to its own credential lookup.
# The key that was previously committed in this cell should be treated as
# leaked and rotated.
wandb_api_key = os.environ.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

wandb.init(project='주제 분류 프로젝트', entity= 'level2-klue-nlp-05',name='0.3.1 (baseline)')





VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/f1,▁▃▄▆▇█
eval/loss,▁▅▆███
eval/runtime,▁▃▆▃▆█
eval/samples_per_second,█▆▃▆▃▁
eval/steps_per_second,█▆▃▆▃▁
train/epoch,▁▁▂▂▄▄▅▅▆▆███
train/global_step,▁▁▂▂▄▄▅▅▆▆███
train/learning_rate,█▇▅▄▂▁
train/loss,█▄▅▁▁▁
train/total_flos,▁

0,1
eval/f1,0.85139
eval/loss,0.72968
eval/runtime,23.7195
eval/samples_per_second,88.535
eval/steps_per_second,22.134
train/epoch,2.0
train/global_step,2450.0
train/learning_rate,0.0
train/loss,0.5198
train/total_flos,2578604098560000.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112979871945249, max=1.0…

In [135]:
training_args = TrainingArguments(
    # --- output location ---
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    # --- which phases are enabled ---
    do_train=True,
    do_eval=True,
    do_predict=True,
    # --- log / evaluate / checkpoint on the same 1000-step cadence ---
    logging_strategy='steps',
    logging_steps=1000,
    evaluation_strategy='steps',
    eval_steps=1000,
    save_strategy='steps',
    save_steps=1000,
    save_total_limit=2,
    # --- optimizer and learning-rate schedule ---
    learning_rate=2e-05,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    # --- batch sizes and training length ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    # --- keep the checkpoint with the highest eval_f1 ---
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    # --- reproducibility ---
    seed=SEED,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [136]:
# Collect everything the Trainer needs, then build it in one call.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': data_train,
    'eval_dataset': data_valid,
    'data_collator': data_collator,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [137]:
# Release cached GPU memory held by PyTorch's allocator before training starts.
torch.cuda.empty_cache()

# Run fine-tuning; the returned TrainOutput is displayed as this cell's output.
trainer.train()

***** Running training *****
  Num examples = 23678
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 11840
  Number of trainable parameters = 110622727
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1
1000,0.4213,0.276734,0.944399
2000,0.2875,0.300045,0.942948
3000,0.2518,0.201204,0.959644
4000,0.25,0.231488,0.957448
5000,0.216,0.190278,0.965717
6000,0.1873,0.214772,0.963065
7000,0.0826,0.217244,0.966155
8000,0.0932,0.18152,0.970842
9000,0.075,0.228869,0.963998
10000,0.0796,0.186802,0.970042


***** Running Evaluation *****
  Num examples = 10149
  Batch size = 4
Saving model checkpoint to /data/ephemeral/level2-nlp-datacentric-nlp-05/code/../output/checkpoint-1000
Configuration saved in /data/ephemeral/level2-nlp-datacentric-nlp-05/code/../output/checkpoint-1000/config.json
Model weights saved in /data/ephemeral/level2-nlp-datacentric-nlp-05/code/../output/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10149
  Batch size = 4
Saving model checkpoint to /data/ephemeral/level2-nlp-datacentric-nlp-05/code/../output/checkpoint-2000
Configuration saved in /data/ephemeral/level2-nlp-datacentric-nlp-05/code/../output/checkpoint-2000/config.json
Model weights saved in /data/ephemeral/level2-nlp-datacentric-nlp-05/code/../output/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10149
  Batch size = 4
Saving model checkpoint to /data/ephemeral/level2-nlp-datacentric-nlp-05/code/../output/checkpoint-3000
Configuration s

TrainOutput(global_step=11840, training_loss=0.1721070763227102, metrics={'train_runtime': 3168.7045, 'train_samples_per_second': 14.945, 'train_steps_per_second': 3.737, 'total_flos': 1.24604464991232e+16, 'train_loss': 0.1721070763227102, 'epoch': 2.0})

## Evaluate Model

In [138]:
# TODO: try running cleanlab on the back-translated samples only.

# Read the test split from DATA_DIR.
test_csv_path = os.path.join(DATA_DIR, 'test.csv')
dataset_test = pd.read_csv(test_csv_path)

In [139]:
# Put the model in evaluation mode for inference.
model.eval()
preds = []  # predicted class id per test row
probs = []  # softmax probability vector per test row

# iterrows() is a generator without a length, so pass total= to give tqdm a
# real progress bar instead of a bare iteration counter.
for idx, sample in tqdm(dataset_test.iterrows(), total=len(dataset_test)):
    # truncation=True matches the training-time tokenization, so a text longer
    # than the model's maximum length is cut instead of being fed in whole.
    inputs = tokenizer(
        str(sample['text']),
        truncation=True,
        return_tensors="pt",
    ).to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        # Compute the softmax once and reuse it for both outputs.
        prob = torch.softmax(logits, dim=1)
        pred = torch.argmax(prob, dim=1)
    preds.extend(pred.cpu().numpy())
    probs.extend(prob.cpu().numpy())

47785it [07:29, 106.21it/s]


In [140]:
# Attach the predicted class ids and write the submission file next to the notebook.
# (Storing the per-class probabilities is left disabled: dataset_test['probs'] = probs)
dataset_test['target'] = preds
submission_path = os.path.join(BASE_DIR, 'output_final.csv')
dataset_test.to_csv(submission_path, index=False)