# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [35]:
import os
import random
from tqdm import tqdm
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split
from preprocessing import data_preprocess

## Set Hyperparameters

In [36]:
SEED = 456

# Seed every RNG the notebook touches (Python, NumPy, PyTorch CPU and CUDA)
# with the same value so that runs are repeatable.
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(SEED)

In [37]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

In [38]:
# All paths are anchored to the directory the notebook is run from;
# data and output live one level above it.
BASE_DIR = os.getcwd()
DATA_DIR, OUTPUT_DIR = (
    os.path.join(BASE_DIR, relative) for relative in ('../data', '../output')
)

## Load Tokenizer and Model

In [39]:
model_name = 'klue/bert-base'

# Pretrained encoder with a freshly initialised 7-way classification head,
# moved onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=7,
)
model = model.to(DEVICE)

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file config.json from cache at /opt/ml/.cache/huggingface/hub/models--klue--bert-base/snapshots/77c8b3d707df785034b4e50f2da5d37be5f0f546/config.json
Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tra

## Define Dataset

In [40]:
# Read the training CSV and run it through the project's preprocessing step.
data = data_preprocess(pd.read_csv(os.path.join(DATA_DIR, 'train.csv')))    # apply data_preprocess

# 70/30 train/validation split, stratified on the label column and seeded
# so the split is the same on every run.
dataset_train, dataset_valid = train_test_split(
    data,
    test_size=0.3,
    stratify=data['target'],
    random_state=SEED,
)

In [41]:
class BERTDataset(Dataset):
    """Torch dataset that tokenizes a text/label frame up front.

    Args:
        data: Table-like object exposing a ``'text'`` column (input strings)
            and a ``'target'`` column (integer class ids).
        tokenizer: Callable used as
            ``tokenizer(text, truncation=True, return_tensors='pt')`` and
            expected to return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors of shape ``(1, seq_len)``.

    Each item is a dict with ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'``. Sequences are truncated but deliberately NOT padded here:
    padding every sample to the tokenizer's maximum length makes each batch
    as wide as the longest possible input. Padding is left to the batch
    collator (``DataCollatorWithPadding``), which pads only up to the longest
    sequence in the batch.
    """

    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []
        self.labels = []

        for text, label in zip(input_texts, targets):
            # Truncate only; per-batch (dynamic) padding happens in the collator.
            tokenized_input = tokenizer(text, truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        """Return the idx-th sample with the leading batch dimension removed."""
        encoded = self.inputs[idx]
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.labels)

In [42]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [43]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [44]:
# F1 metric object from the `evaluate` library, loaded once and reused.
f1 = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair of (model scores, reference labels); the scores are
            reduced to class ids with an argmax along axis 1.

    Returns:
        Whatever ``f1.compute`` returns for the macro-averaged score.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return f1.compute(
        predictions=predicted_ids,
        references=references,
        average='macro',
    )

## Train Model

In [45]:
### for wandb setting
# To switch Weights & Biases logging off entirely, uncomment the next line.
#os.environ['WANDB_DISABLED'] = 'true'
import wandb

# Take the entity (user/team) from the WANDB_ENTITY environment variable when
# it is set, so "Restart & Run All" and headless runs do not stall on a prompt;
# otherwise ask interactively as before.
entity_name = os.environ.get('WANDB_ENTITY') or input("input entity name : ")
wandb.init(project='주제 분류 프로젝트', entity=entity_name)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [46]:
# NOTE(review): the saved training log in this notebook reports a per-device
# batch size of 4 (2450 optimisation steps), which does not match the value of
# 32 configured here. The stored outputs look stale; re-run to refresh them.
training_args = TrainingArguments(
    # --- output / checkpoints ---
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    save_strategy='steps',
    save_steps=100,
    save_total_limit=2,
    # --- which phases to run ---
    do_train=True,
    do_eval=True,
    do_predict=True,
    # --- logging / evaluation cadence (kept in step with checkpointing so the
    #     best checkpoint can be restored at the end) ---
    logging_strategy='steps',
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,
    # Pin the reporting targets explicitly. 'all' is what the library does
    # today by default; its own warning says the default becomes "none" in v5,
    # which would silently stop the W&B logging used in this notebook.
    report_to='all',
    # --- optimiser / schedule ---
    learning_rate=2e-05,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    # --- batch sizes / duration ---
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    # --- model selection: keep the checkpoint with the highest eval F1 ---
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [47]:
# Bundle the model, its training configuration, the tokenized train/validation
# datasets, the padding collator and the F1 metric callback into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=data_train,
    eval_dataset=data_valid,
)

In [48]:
# Run the fine-tuning loop configured in `training_args`.
trainer.train()
# Close the Weights & Biases run that was opened with wandb.init().
wandb.finish()

***** Running training *****
  Num examples = 4900
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2450
  Number of trainable parameters = 110622727
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1
100,1.5326,0.966105,0.757923
200,0.8808,0.6292,0.818208
300,0.6948,0.603933,0.824771
400,0.7339,0.581596,0.833349
500,0.7415,0.629022,0.837566
600,0.6284,0.678362,0.831726
700,0.5662,0.670042,0.840018
800,0.7135,0.643392,0.839775
900,0.6942,0.684778,0.834016
1000,0.6204,0.6799,0.845226


***** Running Evaluation *****
  Num examples = 2100
  Batch size = 4
Saving model checkpoint to /data/ephemeral/code/../output/checkpoint-100
Configuration saved in /data/ephemeral/code/../output/checkpoint-100/config.json
Model weights saved in /data/ephemeral/code/../output/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [/data/ephemeral/code/../output/checkpoint-2100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2100
  Batch size = 4
Saving model checkpoint to /data/ephemeral/code/../output/checkpoint-200
Configuration saved in /data/ephemeral/code/../output/checkpoint-200/config.json
Model weights saved in /data/ephemeral/code/../output/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [/data/ephemeral/code/../output/checkpoint-2400] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2100
  Batch size = 4
Saving model checkpoint to /data/ephemeral/code/../output/checkpoint-300
Configuration saved in /dat

VBox(children=(Label(value='0.002 MB of 0.022 MB uploaded\r'), FloatProgress(value=0.08742086904095431, max=1.…

0,1
eval/f1,▁▅▆▆▇▆▇▇▆▇▇▇▇█▇██▇▇▇████
eval/loss,█▂▁▁▂▃▃▂▃▃▂▃▃▂▄▄▄▄▄▄▄▄▄▄
eval/runtime,▂▂▁▁▃▂▁▂▂▁▁▆▁▁▂█▄▁▄▂▂▂▁▂
eval/samples_per_second,▇▇██▆▇▇▇▇██▂█▇▇▁▅█▅▇▇▇█▇
eval/steps_per_second,▇▇██▆▇▇▇▇██▂█▇▇▁▅█▅▇▇▇█▇
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,██▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▁▁
train/loss,█▄▃▃▃▃▂▃▃▃▄▃▂▃▁▂▂▂▂▂▁▂▂▃
train/total_flos,▁

0,1
eval/f1,0.85258
eval/loss,0.7293
eval/runtime,5.0882
eval/samples_per_second,412.722
eval/steps_per_second,103.181
train/epoch,2.0
train/global_step,2450.0
train/learning_rate,0.0
train/loss,0.6249
train/total_flos,95067525307800.0


## Evaluate Model

In [49]:
# Read the test CSV that predictions will be generated for.
# NOTE(review): unlike the training data, this frame is not passed through
# data_preprocess — confirm that the train/test difference is intentional.
dataset_test = pd.read_csv(
    os.path.join(DATA_DIR, 'test.csv'),
)

In [50]:
# Switch off dropout etc. for inference.
model.eval()
preds = []
# iterrows() is a generator without a length, so pass total= explicitly to get
# a real progress bar with percentage and ETA instead of a bare counter.
for _, sample in tqdm(dataset_test.iterrows(), total=len(dataset_test)):
    # truncation=True mirrors the training-time tokenization and keeps overly
    # long texts within the model's maximum input length instead of failing
    # in the forward pass.
    inputs = tokenizer(sample['text'], truncation=True, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        # Softmax is monotonic, so taking the argmax of the raw logits selects
        # the same class without the extra computation.
        pred = torch.argmax(logits, dim=1).cpu().numpy()
        preds.extend(pred)

0it [00:00, ?it/s]

47785it [06:54, 115.15it/s]


In [51]:
# Attach the predicted class ids as the `target` column and write the result
# as output.csv next to the notebook, without the index column.
dataset_test['target'] = preds
submission_path = os.path.join(BASE_DIR, 'output.csv')
dataset_test.to_csv(submission_path, index=False)