In [60]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import evaluate
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Seed every random number generator this notebook touches.
SEED = 456
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

# Device selection: use the GPU when one is available, otherwise the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [61]:
# Relative path to the training CSV: two directory levels up, then data/train.csv.
data_path = os.path.join(os.pardir, os.pardir, 'data', 'train.csv')
print(data_path)

../../data/train.csv


In [62]:
# Checkpoint shared by the tokenizer and the classifier.
model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Classification head with 7 outputs (the `target` values displayed in this
# notebook range over 0-6), moved onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=7,
)
model = model.to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Dataset Load

In [63]:
# Read the full training CSV, then hold out 30% of the rows for validation.
# The split is seeded so the same rows land in each part on every run.
data = pd.read_csv(data_path)
dataset_train, dataset_valid = train_test_split(
    data,
    test_size=0.3,
    random_state=SEED,
)

In [82]:
# Second, independent read of the same CSV. This frame is later filtered by ID
# to recover the full rows of the high-loss samples.
raw_data = pd.read_csv(data_path)

In [64]:
# Display the loaded frame (columns shown in the output: ID, text, target).
data

Unnamed: 0,ID,text,target
0,ynat-v1_train_00000,정i :파1 미사z KT( 이용기간 2e 단] Q분종U2보,4
1,ynat-v1_train_00001,K찰.국DLwo 로L3한N% 회장 2 T0&}송=,3
2,ynat-v1_train_00002,"m 김정) 자주통일 새,?r열1나가야1보",2
3,ynat-v1_train_00003,갤노트8 주말 27만대 개통…시장은 불법 보조금 얼룩,5
4,ynat-v1_train_00004,pI美대선I앞두고 R2fr단 발] $비해 감시 강화,6
...,...,...,...
2795,ynat-v1_train_02795,트럼프 폭스뉴스 앵커들 충성도 점수매겨…10점만점에 12점도,6
2796,ynat-v1_train_02796,삼성 갤럭시S9 정식 출시 첫 주말 이통시장 잠잠,2
2797,ynat-v1_train_02797,텔레그램+한D 등h亞서 2시간H다운…C버T정gf39종!2보,4
2798,ynat-v1_train_02798,인터뷰 류현진 친구에게 안타 맞는 것 싫어해…승부는 냉정,1


In [65]:
class BERTDataset(Dataset):
    """Dataset that tokenizes every row once, at construction time.

    Each item is a dict with ``input_ids``, ``attention_mask``, ``labels`` and
    the sample's ``id``. All texts are padded with ``padding='max_length'`` and
    truncated, so every item has the same sequence length; this is what lets
    the notebook also batch this dataset with a plain ``DataLoader`` that has
    no custom collate function.

    Args:
        data: Object indexable by the column names ``'text'``, ``'target'``
            and ``'ID'`` (e.g. a pandas DataFrame); the three columns are
            iterated in parallel.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True,
            return_tensors='pt')``. Its result must be indexable by
            ``'input_ids'`` and ``'attention_mask'`` and hold tensors with a
            leading batch dimension of size 1.
        max_length: Optional fixed sequence length. When ``None`` (default)
            the argument is not passed at all, so the tokenizer call is
            exactly the historical one. When set, it is forwarded as
            ``max_length=...`` so short texts need not be padded to the
            tokenizer's maximum.
    """

    def __init__(self, data, tokenizer, max_length=None):
        # Only forward max_length when the caller asked for it, so the default
        # path issues the same tokenizer call as before.
        length_kwargs = {} if max_length is None else {'max_length': max_length}

        self.inputs = []
        self.labels = []
        self.ids = []
        # `sample_id` rather than `id`, to avoid shadowing the builtin.
        for text, label, sample_id in zip(data['text'], data['target'], data['ID']):
            tokenized_input = tokenizer(
                text,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
                **length_kwargs,
            )
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))
            self.ids.append(sample_id)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample with the tokenizer's batch dimension removed."""
        encoded = self.inputs[idx]
        return {
            # return_tensors='pt' yields tensors with a leading batch axis of size 1.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0),
            'id': self.ids[idx]
        }

    def __len__(self):
        """Number of samples (one label is stored per sample)."""
        return len(self.labels)

In [66]:
# Tokenize both splits up front (train first, then validation).
data_train, data_valid = (
    BERTDataset(split_frame, tokenizer)
    for split_frame in (dataset_train, dataset_valid)
)

In [67]:
# Batch collator handed to the Trainer.
# NOTE(review): BERTDataset already tokenizes with padding='max_length', so
# there is presumably nothing left for this collator to pad — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Model Train

In [68]:
f1 = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores and is reduced with argmax over axis 1 to get
            one predicted class per row.

    Returns:
        Whatever ``f1.compute`` returns for the predicted classes against
        ``labels`` with ``average='macro'``.
    """
    class_scores, references = eval_pred
    predicted_classes = np.argmax(class_scores, axis=1)
    return f1.compute(
        predictions=predicted_classes,
        references=references,
        average='macro',
    )


In [69]:
# Output location, run-mode flags and bookkeeping.
run_options = dict(
    output_dir="./model",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    seed=SEED,
    report_to="none",
)

# Logging, evaluation and checkpointing all use a 100-step interval.
cadence_options = dict(
    logging_strategy='steps',
    eval_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=100,
    save_steps=100,
    save_total_limit=1,
)

# Optimizer and learning-rate schedule.
optimizer_options = dict(
    learning_rate=2e-05,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
)

# Batch sizes and training length.
batching_options = dict(
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
)

# Best-checkpoint selection: higher eval F1 wins.
model_selection_options = dict(
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
)

training_args = TrainingArguments(
    **run_options,
    **cadence_options,
    **optimizer_options,
    **batching_options,
    **model_selection_options,
)

In [70]:
# Wire the model, the data splits, the collator and the F1 metric into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=data_train,
    eval_dataset=data_valid,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [71]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,F1
100,1.8727,1.775349,0.308782


TrainOutput(global_step=124, training_loss=1.845765236885317, metrics={'train_runtime': 125.4689, 'train_samples_per_second': 31.243, 'train_steps_per_second': 0.988, 'total_flos': 1031441639424000.0, 'train_loss': 1.845765236885317, 'epoch': 2.0})

# Loss 기반으로 노이즈 포함된 텍스트 탐지

In [73]:
# reduction='none' keeps one loss value per sample instead of reducing over the batch.
criterion = torch.nn.CrossEntropyLoss(reduction='none')

In [74]:
# Score every training sample with its own cross-entropy loss, in evaluation
# mode and without gradients, and record it next to the sample's ID.
model.eval()
losses = []
all_idxs = []
train_dataloader = DataLoader(data_train, batch_size=32)
with torch.no_grad():
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        # Labels are deliberately not passed to the model: only the logits are
        # needed, because the per-sample loss is computed by `criterion` below.
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

        # One loss value per sample (criterion uses reduction='none').
        loss_per_sample = criterion(logits, labels)
        losses.extend(loss_per_sample.cpu().numpy())

        # Record the sample IDs in the same order as their losses.
        all_idxs.extend(batch["id"])

# One row per training sample: its ID (column 'idx') and its loss.
loss_df = pd.DataFrame({
    'idx': all_idxs,
    'loss': losses
})

In [75]:
# Inspect the per-sample losses (columns: idx, loss).
loss_df

Unnamed: 0,idx,loss
0,ynat-v1_train_00724,1.058253
1,ynat-v1_train_01939,1.797024
2,ynat-v1_train_02720,1.696572
3,ynat-v1_train_00283,1.822225
4,ynat-v1_train_00805,1.772357
...,...,...
1955,ynat-v1_train_02543,2.461077
1956,ynat-v1_train_02090,1.278301
1957,ynat-v1_train_02649,2.283865
1958,ynat-v1_train_00613,1.447923


In [None]:
# Flag the samples whose loss is at or above the 90th percentile of all losses.
high_loss_threshold = np.percentile(loss_df['loss'], 90)
is_high_loss = loss_df['loss'] >= high_loss_threshold
high_loss_idxs = loss_df.loc[is_high_loss, 'idx'].tolist()
len(high_loss_idxs)

196

In [83]:
# Pull the full rows of the flagged samples out of the raw CSV frame by ID.
is_flagged = raw_data["ID"].isin(high_loss_idxs)
result = raw_data[is_flagged]

In [84]:
# Inspect the rows flagged as high-loss.
result

Unnamed: 0,ID,text,target
5,ynat-v1_train_00005,美성인 6명 중 1명꼴 배우자·연인 빚 떠안은 적 있다,0
11,ynat-v1_train_00011,NH투자 1월 옵션 만기일 매도 우세,1
31,ynat-v1_train_00031,세계K인무역협회 올S#$'2 무역인 1천b55w 배출한(,3
49,ynat-v1_train_00049,"인국공I{태NL년D의 bT,무jlv공u{고?진보인가",3
63,ynat-v1_train_00063,영상 선출부터 퇴장까지…나경원 원내대표 파란만장 1년,5
...,...,...,...
2674,ynat-v1_train_02674,인크루트 로봇직원 알리사 채용…사번은 24365,2
2690,ynat-v1_train_02690,패스트트랙 수사 검찰 국회 운영위 등 압색,3
2692,ynat-v1_train_02692,무~림 입국금지 ?속 트y프 `임 /던시장은 P외종8l보,6
2707,ynat-v1_train_02707,못믿을 아파트 관리비…경기 556곳서 150억 비리 적발종합,1


In [87]:
# Save the flagged rows as a comma-separated file, encoded as utf-8-sig.
result.to_csv(
    "./noise_data.csv",
    sep=",",
    encoding="utf-8-sig",
)