# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [1]:
import os
import random
from tqdm import tqdm
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
 

from sklearn.model_selection import train_test_split

## Set Hyperparameters

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
BASE_DIR = "/data/ephemeral/level2-nlp-datacentric-nlp-06/"
DATA_DIR = os.path.join(BASE_DIR, 'data/')
OUTPUT_DIR = os.path.join(BASE_DIR, 'output/')
LOG_DIR = os.path.join(BASE_DIR, 'logs/')
CACHE_DIR = os.path.join(BASE_DIR, 'cache/')

## Load Tokenizer and Model

In [5]:
model_name = 'klue/bert-base'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7, cache_dir=CACHE_DIR).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initial

## Define Dataset

In [6]:
train = pd.read_csv(os.path.join(DATA_DIR, 'hanspell_p2g_t5_google_train.csv'))
valid = pd.read_csv(os.path.join(DATA_DIR, 'valid_split.csv'))

In [7]:
train.shape, valid.shape

((14700, 5), (2100, 5))

In [8]:
train.isnull().sum()

ID        0
text      0
target    0
url       0
date      0
dtype: int64

In [9]:
valid.isnull().sum()

ID        0
text      0
target    0
url       0
date      0
dtype: int64

In [10]:
# # 한 번 shuffle 해주기
# train = train.sample(frac=1).reset_index(drop=True)

In [11]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []
        self.labels = []
        
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),  
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }
    
    def __len__(self):
        return len(self.labels)

In [12]:
data_train = BERTDataset(train, tokenizer)
data_valid = BERTDataset(valid, tokenizer)

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [14]:
f1 = evaluate.load('f1')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')

## Train Model

In [15]:
from konlpy.tag import Mecab
mecab = Mecab()

In [16]:
import wandb
import os

## for wandb setting
os.environ["WANDB_API_KEY"] = "6f6f03437c7053a9ea262a0afa0edb5694d48a1d"
os.environ["WANDB_PROJECT"] = "clf_HYW"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

In [17]:
wandb.init(name="backtrans_t5_google_hanspell_p2g")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhwyewon[0m ([33mbe-our-friend[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [18]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    logging_dir=LOG_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    logging_strategy='steps',
    evaluation_strategy='steps',
    save_strategy='steps',
    logging_steps=500,
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED,
    report_to="wandb",
    run_name="backtrans_t5_google_hanspell_p2g"
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)

In [20]:
trainer.train()

***** Running training *****
  Num examples = 14700
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 7350
  Number of trainable parameters = 110622727
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1
500,1.0089,0.594866,0.819423
1000,0.7874,0.665501,0.825106
1500,0.7693,0.731238,0.82136
2000,0.7634,0.667476,0.84876
2500,0.7279,0.703386,0.838379
3000,0.6626,0.708399,0.835002
3500,0.6632,0.699915,0.840929
4000,0.4997,0.807308,0.839901
4500,0.4333,0.832997,0.838726


***** Running Evaluation *****
  Num examples = 2100
  Batch size = 4
Saving model checkpoint to /data/ephemeral/level2-nlp-datacentric-nlp-06/output/checkpoint-500
Configuration saved in /data/ephemeral/level2-nlp-datacentric-nlp-06/output/checkpoint-500/config.json
Model weights saved in /data/ephemeral/level2-nlp-datacentric-nlp-06/output/checkpoint-500/pytorch_model.bin
Deleting older checkpoint [/data/ephemeral/level2-nlp-datacentric-nlp-06/output/checkpoint-2000] due to args.save_total_limit
Logging checkpoint artifacts in checkpoint-500. ...
[34m[1mwandb[0m: Adding directory to artifact (/data/ephemeral/level2-nlp-datacentric-nlp-06/output/checkpoint-500)... Done. 10.5s
***** Running Evaluation *****
  Num examples = 2100
  Batch size = 4
Saving model checkpoint to /data/ephemeral/level2-nlp-datacentric-nlp-06/output/checkpoint-1000
Configuration saved in /data/ephemeral/level2-nlp-datacentric-nlp-06/output/checkpoint-1000/config.json
Model weights saved in /data/ephemeral/le

TrainOutput(global_step=4500, training_loss=0.7017408277723525, metrics={'train_runtime': 1026.8489, 'train_samples_per_second': 28.631, 'train_steps_per_second': 7.158, 'total_flos': 4736211609600000.0, 'train_loss': 0.7017408277723525, 'epoch': 1.22})

## Evaluate Model

### validation (total)

In [21]:
model.eval()
preds = []
for idx, sample in tqdm(valid.iterrows()):
    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        preds.extend(pred)

2100it [00:20, 101.94it/s]


In [22]:
f1 = evaluate.load('f1')
labels = valid['target'].values

f1.compute(predictions=preds, references=labels, average='macro')

{'f1': 0.8487600351873285}

### inference

In [23]:
dataset_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
dataset_test.isnull().sum()

ID      0
text    0
url     0
date    0
dtype: int64

In [24]:
dataset_test.head()

Unnamed: 0,ID,text,url,date
0,ynat-v1_dev_00000,제임스 부상 레이커스 성탄매치서 골든스테이트에 완승종합,https://sports.news.naver.com/news.nhn?oid=001...,2018.12.26 15:16
1,ynat-v1_dev_00001,프랑스 극우정치인 르펜 노란 조끼 덕에 승승장구,https://news.naver.com/main/read.nhn?mode=LS2D...,2019.01.17. 오후 8:04
2,ynat-v1_dev_00002,대통령개헌안 ⑥토지공개념 명시…개발이익환수·부동산과세 강화 전망종합,https://news.naver.com/main/read.nhn?mode=LS2D...,2018.03.21. 오후 1:09
3,ynat-v1_dev_00003,의사 살해 환자 경찰서 나와 법원으로,https://news.naver.com/main/read.nhn?mode=LS2D...,2019.01.02. 오후 2:47
4,ynat-v1_dev_00004,이란 최고지도자 유럽 맹비난…핵합의 미이행 뻔뻔하고 오만,https://news.naver.com/main/read.nhn?mode=LS2D...,2019.07.16. 오후 8:11


In [25]:
model.eval()
preds = []
for idx, sample in tqdm(dataset_test.iterrows()):
    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy()
        preds.extend(pred)

47785it [07:45, 102.68it/s]


In [26]:
dataset_test['target'] = preds
dataset_test.to_csv(os.path.join(BASE_DIR, 'results/backtrans_google_hanspell_p2g_output.csv'), index=False)

In [27]:
dataset_test.head()

Unnamed: 0,ID,text,url,date,target
0,ynat-v1_dev_00000,제임스 부상 레이커스 성탄매치서 골든스테이트에 완승종합,https://sports.news.naver.com/news.nhn?oid=001...,2018.12.26 15:16,5
1,ynat-v1_dev_00001,프랑스 극우정치인 르펜 노란 조끼 덕에 승승장구,https://news.naver.com/main/read.nhn?mode=LS2D...,2019.01.17. 오후 8:04,4
2,ynat-v1_dev_00002,대통령개헌안 ⑥토지공개념 명시…개발이익환수·부동산과세 강화 전망종합,https://news.naver.com/main/read.nhn?mode=LS2D...,2018.03.21. 오후 1:09,6
3,ynat-v1_dev_00003,의사 살해 환자 경찰서 나와 법원으로,https://news.naver.com/main/read.nhn?mode=LS2D...,2019.01.02. 오후 2:47,2
4,ynat-v1_dev_00004,이란 최고지도자 유럽 맹비난…핵합의 미이행 뻔뻔하고 오만,https://news.naver.com/main/read.nhn?mode=LS2D...,2019.07.16. 오후 8:11,4
