# 노이즈, 라벨 오류 수정

### 원본 데이터
- `python main.py --data ../datasets/v0.0.2 --model ./model --mode train`
    - 'eval_loss': 1.6945666074752808, 'eval_f1': 0.34721457497955194

## 노이즈 수정
- 번역기 이용: 성능 안좋음
- LLM 이용: Llama 3.1 8B Instruct (`meta-llama/Llama-3.1-8B-Instruct`) 이용

In [None]:
# !huggingface-cli login

In [None]:
# LlamaInference comes from a `llama` module (presumably project-local; its
# implementation is not shown in this notebook).
from llama import LlamaInference

# Model identifier and the input files used for the denoising run.
llama_model_name = "meta-llama/Llama-3.1-8B-Instruct"
data_path = "../../datasets/v0.0.2/train.csv"
prompt_path = "./prompt/prompt_ko_v1.txt"
example_path = "./prompt/few_shot_v1.json"

# Build the inference helper from the model name, the prompt file and the
# few-shot example file.
inferencer = LlamaInference(llama_model_name, prompt_path, example_path)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [4]:
import pandas as pd

data = pd.read_csv(data_path)

# Only rows with 30 <= noise_score < 70 are selected for rewriting.
# Rows scoring 70 or more are meant to be deleted, so they are not selected.
noise = data["noise_score"]
in_rewrite_band = noise.ge(30) & noise.lt(70)
data_denoise = data.loc[in_rewrite_band].reset_index(drop=True)
data_denoise.head()

Unnamed: 0,ID,text,target,noise_score,label_quality
0,ynat-v1_train_00000,정i :파1 미사z KT( 이용기간 2e 단] Q분종U2보,4,58.909091,0.215058
1,ynat-v1_train_00002,"m 김정) 자주통일 새,?r열1나가야1보",2,61.714286,0.271684
2,ynat-v1_train_00004,pI美대선I앞두고 R2fr단 발] $비해 감시 강화,6,63.0,0.275686
3,ynat-v1_train_00006,프로야구~롯TKIAs광주 경기 y천취소,1,60.0,0.392143
4,ynat-v1_train_00007,아가메즈 33득점 우리카드 KB손해보험 완파…3위 굳...,4,44.0,0.099699


In [None]:
import warnings

# Suppress every Python warning from here on.
warnings.filterwarnings('ignore')

# Create the result column up front and fill it one row at a time, printing
# the original ("원본") and rewritten ("수정") headline for each row.
data_denoise['denoised_text'] = ""
for idx, noisy_text in data_denoise['text'].items():
    denoised_text = inferencer.inference(noisy_text)
    data_denoise.loc[idx, 'denoised_text'] = denoised_text
    print(idx)
    print(f"원본: {noisy_text}")
    print(f"수정: {denoised_text}\n")

In [27]:
# Show the rows whose rewritten text is exactly "복구 불가" ("cannot be restored"),
# presumably the marker the LLM returns when it gives up — verify against the prompt.
data_denoise[data_denoise.denoised_text == "복구 불가"]

Unnamed: 0,ID,text,target,noise_score,label_quality,denoised_text
5,ynat-v1_train_00010,oi 매력 R모h츠a열#w3약 >l·주가 고Q/진,5,64.8,0.19769,복구 불가
23,ynat-v1_train_00067,x콩-P면금[T 나%\…g트=물J1h나고 지>철 %기고,6,64.615385,0.145765,복구 불가
52,ynat-v1_train_00152,김정은 $향gIx x산시 za,2,69.6,0.275045,복구 불가
54,ynat-v1_train_00168,"(말\N 여행 aiq *#의 q…e을 반3는 ?딧M""<1짝",0,63.2,0.159596,복구 불가
68,ynat-v1_train_00216,"k@ w고지;~ 반정,시/에 美X사주한 1)",6,64.8,0.231543,복구 불가
122,ynat-v1_train_00425,X극VM=에 / 문:대통~,2,68.571429,0.229332,복구 불가
146,ynat-v1_train_00499,신:p(피U 로y의 시-,0,54.0,0.320529,복구 불가
151,ynat-v1_train_00520,"콘텐W~w짜 제O u0UK수( 위로…통신업,f술Z종=",4,60.0,0.194422,복구 불가
168,ynat-v1_train_00563,"@_르담 붕괴 위VL:전…佛당국 %kn^\급,보강Q작]",6,65.0,0.307064,복구 불가
175,ynat-v1_train_00579,"英의 \렉시Q 혼N에 C z?獨(佛…*y한 입장 제Q해""[합",6,52.615385,0.329716,복구 불가


In [40]:
# Keep only the rows that were rewritten successfully. `.copy()` makes this an
# independent frame, so the column assignment below is not a chained
# assignment on a slice of `data_denoise` (pandas SettingWithCopyWarning).
data_denoise_final = data_denoise[data_denoise.denoised_text != "복구 불가"].copy()

# Replace the noisy headline with its rewritten version, then drop the helper column.
data_denoise_final['text'] = data_denoise_final['denoised_text']
data_denoise_final = data_denoise_final.drop(columns=['denoised_text'])

# Rows with noise_score < 30 are kept unchanged. Rows with noise_score >= 70
# are in neither frame, so they are left out of the new dataset.
data_clear = data[data.noise_score < 30]
denoised_final = pd.concat([data_clear, data_denoise_final], axis=0)
denoised_final.to_csv("../../datasets/v0.0.3/train.csv", index=False)

In [None]:
# Keep only the rows that were rewritten successfully. `.copy()` avoids
# assigning into a slice of `data_denoise` (pandas SettingWithCopyWarning).
data_denoise_final = data_denoise[data_denoise.denoised_text != "복구 불가"].copy()

# Replace the noisy headline with its rewritten version. The helper column
# must be dropped along the column axis and the result kept: a bare
# `drop('denoised_text')` looks for a row label and its result was discarded.
data_denoise_final['text'] = data_denoise_final['denoised_text']
data_denoise_final = data_denoise_final.drop(columns=['denoised_text'])

# Do NOT rebuild `data_denoise` from `data` here: that throws away the
# rewritten text and writes the original noisy rows to disk instead.
# Rows with noise_score < 30 are kept unchanged; rows >= 70 are left out.
data_clear = data[data.noise_score < 30]
denoised_final = pd.concat([data_clear, data_denoise_final], axis=0)
denoised_final.to_csv("../../datasets/v0.0.3/train.csv", index=False)

## label 다시 매기기

In [None]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import KFold

import torch
from torch.utils.data import Dataset, DataLoader
import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

from cleanlab.filter import find_label_issues

# Seed Python, NumPy and PyTorch (CPU and CUDA) with the same value.
SEED = 456
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The captured run log reports this variable as deprecated in favour of the
# `report_to` option; it is kept here unchanged.
os.environ['WANDB_DISABLED'] = 'true'

class BERTDataset(Dataset):
    """Pre-tokenized text classification dataset.

    Every text is tokenized once in ``__init__`` and padded AND truncated to
    ``max_length`` tokens, so all items have the same length and can be
    batched by a plain ``DataLoader`` as well as by a padding collator.

    Args:
        data: Mapping-like object (e.g. a DataFrame) with a ``'text'`` and a
            ``'target'`` entry of equal length.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            max_length=..., truncation=..., return_tensors='pt')``; its result
            must provide ``'input_ids'`` and ``'attention_mask'`` tensors with
            a leading batch dimension of 1.
        max_length: Fixed token length of every item (default 50).
    """

    def __init__(self, data, tokenizer, max_length=50):
        self.inputs = []
        self.labels = []
        for text, label in zip(data['text'], data['target']):
            tokenized_input = tokenizer(
                text,
                padding='max_length',
                max_length=max_length,
                # Without truncation, texts longer than max_length keep their
                # full length and cannot be stacked with the padded ones.
                truncation=True,
                return_tensors='pt',
            )
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        """Return one example as a dict of tensors without the batch dimension."""
        encoded = self.inputs[idx]
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

f1 = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds
            per-class scores and is reduced with argmax over axis 1.

    Returns:
        The result of ``f1.compute(..., average='macro')``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return f1.compute(
        predictions=predicted_classes,
        references=references,
        average='macro',
    )


# Train on the denoised v0.0.3 split; validate on the v0.0.2 validation split.
train_data = pd.read_csv("../../datasets/v0.0.3/train.csv")
val_data = pd.read_csv("../../datasets/v0.0.2/validation.csv")

# Prepare the datasets
train_dataset = BERTDataset(train_data, tokenizer)
val_dataset = BERTDataset(val_data, tokenizer)

# Configure the Trainer
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)
training_args = TrainingArguments(
    output_dir=f'./outputs',
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    logging_strategy='epoch',
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_steps=100,
    # eval_steps=100,
    # save_steps=100,
    save_total_limit=1,
    learning_rate= 2e-05, # may be changed
    adam_beta1 = 0.9, # must not be changed
    adam_beta2 = 0.999, # must not be changed
    adam_epsilon=1e-08, # must not be changed
    weight_decay=0.01, # must not be changed
    lr_scheduler_type='linear', # must not be changed
    per_device_train_batch_size=32, # may be changed
    per_device_eval_batch_size=32, # it is 32 for a reason
    num_train_epochs=2, # must not be changed
    # load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED, # must not be changed? avoid touching it if possible
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,F1
1,1.9522,1.772822,0.369701
2,1.8379,1.664312,0.495439


100%|██████████| 5/5 [00:00<00:00, 45.74it/s]
100%|██████████| 5/5 [00:00<00:00, 45.74it/s]


In [29]:
def _collect_probs(classifier, loader):
    """Run `classifier` over every batch of `loader` and return the softmax
    probabilities of all examples, in loader order, as a single NumPy array."""
    collected = []
    with torch.no_grad():
        for batch in tqdm(loader):
            # Labels are not a model input here; move everything else to the device.
            features = {k: v.to(DEVICE) for k, v in batch.items() if k != 'labels'}
            logits = classifier(**features).logits
            collected.extend(torch.nn.functional.softmax(logits, dim=1).cpu().numpy())
    return np.array(collected)


model.eval()

# Predicted class probabilities for the validation data
val_loader = DataLoader(val_dataset, batch_size=32)
pred_probs = _collect_probs(model, val_loader)

# Predicted class probabilities for the training data
train_loader = DataLoader(train_dataset, batch_size=32)
train_pred_probs = _collect_probs(model, train_loader)

100%|██████████| 5/5 [00:00<00:00, 42.49it/s]
100%|██████████| 54/54 [00:01<00:00, 41.41it/s]


In [17]:
# Evaluate on the validation set through the Trainer and print the 'eval_f1' entry.
eval_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"F1 Score: {eval_results['eval_f1']:.4f}")

F1 Score: 0.4954


In [31]:
# Number of training examples for which probabilities were collected.
len(train_pred_probs)

1706

In [32]:
import os

# The captured output shows huggingface/tokenizers warning that the process
# was forked after parallelism had been used, and recommending that
# TOKENIZERS_PARALLELISM be set explicitly. Set it unless the user already did.
os.environ.setdefault('TOKENIZERS_PARALLELISM', 'false')

# Use cleanlab to search for label issues; indices are returned ranked by
# self-confidence.
# NOTE(review): train_pred_probs comes from the model that was trained on this
# same training data. cleanlab's documentation asks for out-of-sample predicted
# probabilities (e.g. from K-fold cross-validation) — confirm whether in-sample
# probabilities are acceptable here.
ordered_label_issues = find_label_issues(
    labels=train_data['target'].values,
    pred_probs=train_pred_probs,
    return_indices_ranked_by='self_confidence',
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [35]:
from cleanlab.rank import get_label_quality_scores

# Compute a label quality score for every training example
label_quality_scores = get_label_quality_scores(
    labels=train_data['target'].values,
    pred_probs=train_pred_probs
)

# Store the scores in train_data as a new 'label_quality2' column
train_data['label_quality2'] = label_quality_scores

In [42]:
from cleanlab.dataset import health_summary
# Class ids 0-6, matching num_labels=7 used when the classifier was created.
class_names=[0,1,2,3,4,5,6]
# Print cleanlab's dataset health report for the training labels and probabilities.
health_summary(train_data['target'], train_pred_probs, class_names=class_names)

----------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary          |
|   for your dataset with 1,706 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.        |
----------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,0,0,176,80,0.752137,0.57971,0.247863
1,4,4,175,96,0.751073,0.623377,0.248927
2,3,3,135,136,0.581897,0.583691,0.418103
3,1,1,146,143,0.557252,0.552124,0.442748
4,6,6,137,185,0.545817,0.618729,0.454183
5,2,2,134,130,0.529644,0.522088,0.470356
6,5,5,124,257,0.514523,0.687166,0.485477



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,5,6,5,6,77,0.045135
1,3,5,3,5,68,0.039859
2,4,5,4,5,68,0.039859
3,1,5,1,5,63,0.036928
4,2,5,2,5,62,0.036342
5,1,4,1,4,53,0.031067
6,0,2,0,2,53,0.031067
7,1,6,1,6,52,0.030481
8,0,6,0,6,52,0.030481
9,2,6,2,6,52,0.030481



 * Overall, about 54% (917 of the 1,706) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.46.

Generated with <3 from Cleanlab.



{'overall_label_health_score': 0.4624853458382181,
 'joint': array([[0.03399766, 0.01641266, 0.01758499, 0.01289566, 0.01641266,
         0.02168816, 0.01817116],
        [0.00468933, 0.06799531, 0.0123095 , 0.01758499, 0.01172333,
         0.02051583, 0.01875733],
        [0.01348183, 0.00820633, 0.06975381, 0.00820633, 0.00468933,
         0.02520516, 0.01875733],
        [0.00644783, 0.01172333, 0.014068  , 0.05685815, 0.00820633,
         0.02344666, 0.01524033],
        [0.00644783, 0.01934349, 0.00937866, 0.01524033, 0.03399766,
         0.03165299, 0.02051583],
        [0.003517  , 0.01641266, 0.01113716, 0.01641266, 0.00820633,
         0.06858148, 0.01699883],
        [0.0123095 , 0.01172333, 0.01172333, 0.00937866, 0.007034  ,
         0.02813599, 0.06682298]]),
 'classes_by_label_quality':    Class Name  Class Index  Label Issues  Inverse Label Issues  Label Noise  \
 0           0            0           176                    80     0.752137   
 1           4            4  

In [43]:
# Set of flagged row indices, for fast membership tests
issue_indices = set(ordered_label_issues)

# Keep only the rows whose index was not flagged, then renumber them
has_issue = train_data.index.isin(issue_indices)
cleaned_train_data = train_data.loc[~has_issue].reset_index(drop=True)

# Print the sizes as a sanity check
print(f"Original dataset size: {len(train_data)}, Cleaned dataset size: {len(cleaned_train_data)}")

Original dataset size: 1706, Cleaned dataset size: 997


In [45]:
# Save the filtered training set as dataset version v0.0.4 (without the index column).
cleaned_train_data.to_csv("../../datasets/v0.0.4/train.csv", index=False)

### 축소된 데이터로 다시 klue/bert-base 모델 이용해 학습 및 예측
- `python main.py --data ../datasets/v0.0.4 --model ./model --mode train`

### TODO
- LLM으로 라벨 추정해서 대입 (라벨 수정)