In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from cleanlab.filter import find_label_issues
import torch
import torch.nn.functional as F
import random
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 42

def set_seed(seed: int = SEED) -> None:
    """Seed every random number generator this notebook uses.

    Args:
        seed: Value handed to Python's ``random``, NumPy, and PyTorch
            (CPU generator plus the current and all CUDA devices).
            Defaults to the notebook-wide ``SEED``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed()
# Running this cell on its own is enough to fix the seeds.

In [3]:
# Load the training CSV into a pandas DataFrame.
df_train = pd.read_csv('data/train.csv')  # adjust the path to match your environment

In [4]:
import re


In [5]:
# Noise heuristic based on the special-character ratio and on how often
# word characters and special characters sit directly next to each other.
def is_noisy_text(text, min_special_ratio=0.2, min_cross_count=3):
    """Return True when ``text`` looks corrupted by injected special characters.

    Args:
        text: The string to inspect. Anything that is not a ``str`` (for example
            ``None`` or a NaN float from a missing CSV cell) is reported as not
            noisy instead of raising ``TypeError`` inside ``re.findall``.
        min_special_ratio: Minimum share of characters that are neither word
            characters nor whitespace. Defaults to 0.2.
        min_cross_count: Minimum number of non-overlapping adjacent pairs made of
            one word character and one special character. Defaults to 3.

    Returns:
        bool: True only if both thresholds are met.
    """
    if not isinstance(text, str):
        return False

    # Share of characters that are neither word characters nor whitespace.
    special_chars = re.findall(r'[^\w\s]', text)
    special_char_ratio = len(special_chars) / len(text) if len(text) > 0 else 0

    # Number of (non-overlapping) word-char/special-char adjacencies, in either order.
    cross_count = len(re.findall(r'(\w[^\w\s])|([^\w\s]\w)', text))

    # Noisy only when specials are both frequent and interleaved with normal characters.
    return special_char_ratio >= min_special_ratio and cross_count >= min_cross_count

In [6]:
# Flag every row with the noise heuristic, then split the frame on that flag.
df_train['is_noisy'] = df_train['text'].apply(is_noisy_text)
df_noisy = df_train.loc[df_train['is_noisy']]
df_clean = df_train.loc[~df_train['is_noisy']]

In [7]:
# Report how many rows ended up in each split.
print("노이즈 샘플 수:", df_noisy.shape[0])
print("정상 샘플 수:", df_clean.shape[0])

노이즈 샘플 수: 313
정상 샘플 수: 2487


In [12]:
# Write the clean and noisy splits to CSV files (no index column, UTF-8 with BOM).
for _frame, _path in ((df_clean, 'clean_samples.csv'), (df_noisy, 'noisy_samples.csv')):
    _frame.to_csv(_path, index=False, encoding='utf-8-sig')

print("df_clean과 df_noisy를 각각 'clean_samples.csv'와 'noisy_samples.csv'로 저장했습니다.")


df_clean과 df_noisy를 각각 'clean_samples.csv'와 'noisy_samples.csv'로 저장했습니다.


In [None]:
# Extract the text and target columns as plain Python lists.
# Note: these come from the full df_train, i.e. rows flagged is_noisy are included.
texts = df_train['text'].tolist()  # input sentences, later passed to the tokenizer
labels = df_train['target'].tolist()  # given labels, later passed to cleanlab

### 사용하는 모델
- klue/bert-base
- klue/roberta-large
- FacebookAI/xlm-roberta-large
- monologg/koelectra-base-v3-discriminator

In [None]:
# Load the tokenizer and a sequence-classification model for the chosen checkpoint.
# NOTE(review): the load warning printed by this cell reports that classifier.weight and
# classifier.bias are newly initialized, so predictions from this model are not meaningful
# until it has been trained on the task.
model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7)  # set to the number of label classes


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize all texts in one call, with padding and truncation enabled, returning PyTorch tensors.
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

### 모델별 예측 소요 시간(참고용)
- klue/bert-base : 1m 14s
- klue/roberta-large : 4m 12s
- FacebookAI/xlm-roberta-large : 4m 24s
- monologg/koelectra-base-v3-discriminator : 1m 13s

In [None]:
# Run inference and turn the logits into class probabilities.
# The tokenized dataset in `inputs` is fed through the model in mini-batches so that
# peak memory stays bounded instead of growing with the number of samples.
INFERENCE_BATCH_SIZE = 32

model.eval()  # make sure dropout is off even if the model was switched to train mode earlier
batch_logits = []
with torch.no_grad():
    for start in range(0, inputs["input_ids"].shape[0], INFERENCE_BATCH_SIZE):
        # Slice every tensor of the encoding (input_ids, attention_mask, ...) the same way.
        batch = {key: tensor[start:start + INFERENCE_BATCH_SIZE] for key, tensor in inputs.items()}
        batch_logits.append(model(**batch).logits)
logits = torch.cat(batch_logits, dim=0)

train_pred_probs = F.softmax(logits, dim=1).numpy()  # predicted probabilities, one row per sample

In [8]:
# Check the shape of train_pred_probs.
train_pred_probs.shape

(2800, 7)

In [None]:
# Detect suspected label errors with cleanlab; with return_indices_ranked_by set, the result
# is a ranking of sample indices by self-confidence.
# NOTE(review): cleanlab presumably expects pred_probs from a model trained on this task
# (ideally out-of-sample predictions) — confirm before trusting the ranking.
ordered_label_issues = find_label_issues(
    labels=labels,
    pred_probs=train_pred_probs,
    return_indices_ranked_by='self_confidence'
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [19]:
# The number of entries in ordered_label_issues differs from model to model.
len(ordered_label_issues)

1534

In [None]:
# Print the first five samples flagged as suspected label errors.
head_issues = ordered_label_issues[:5]
for issue in head_issues:
    flagged_row = df_train.iloc[issue]  # look the row up once per flagged index
    print('ID:', flagged_row['ID'])
    print('input text:', flagged_row['text'])
    print('label:', flagged_row['target'])
    print("-------------------")

ID: ynat-v1_train_02415
input text: 베네수엘라 외무 유엔서 트럼프마두로 직접 담판 제안종합
label: 1
-------------------
ID: ynat-v1_train_02648
input text: ,7안컵 선V단장에 7영일$축구7회 부회장6우Bh위해 지원
label: 1
-------------------
ID: ynat-v1_train_01389
input text: 20대 여성 전주서 실종…전주 여성 살해 피의자 연관 가능성종합
label: 1
-------------------
ID: ynat-v1_train_00841
input text: 31년만의 취재 관련 언론사 압수수색…어수선한 채널A종합
label: 1
-------------------
ID: ynat-v1_train_01657
input text: KB국민은행 농M 국가대표팀과-6년간 후원 <약
label: 1
-------------------


In [12]:
from cleanlab.dataset import health_summary


In [13]:
# Class names are simply the integer label ids 0..6.
class_names = list(range(7))
health_summary(df_train['target'], train_pred_probs, class_names=class_names)

----------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary          |
|   for your dataset with 2,800 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.        |
----------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,2,2,381,88,0.981959,0.926316,0.018041
1,0,0,375,107,0.944584,0.829457,0.055416
2,1,1,385,104,0.939024,0.806202,0.060976
3,5,5,377,272,0.899761,0.866242,0.100239
4,4,4,346,338,0.852217,0.849246,0.147783
5,6,6,296,508,0.749367,0.836903,0.250633
6,3,3,223,966,0.579221,0.856383,0.420779



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,3,6,3,6,240,0.085714
1,3,4,3,4,216,0.077143
2,3,5,3,5,194,0.069286
3,2,3,2,3,184,0.065714
4,0,3,0,3,184,0.065714
5,1,3,1,3,171,0.061071
6,4,6,4,6,148,0.052857
7,5,6,5,6,131,0.046786
8,4,5,4,5,117,0.041786
9,0,6,0,6,104,0.037143



 * Overall, about 85% (2,383 of the 2,800) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.15.

Generated with <3 from Cleanlab.



{'overall_label_health_score': 0.1489285714285714,
 'joint': array([[0.00785714, 0.00607143, 0.00428571, 0.05964286, 0.015     ,
         0.0175    , 0.03142857],
        [0.00642857, 0.00892857, 0.00785714, 0.05571429, 0.02071429,
         0.0175    , 0.02928571],
        [0.00571429, 0.00642857, 0.0025    , 0.06214286, 0.01607143,
         0.01964286, 0.02607143],
        [0.00607143, 0.00535714, 0.00357143, 0.05785714, 0.02071429,
         0.01285714, 0.03107143],
        [0.00714286, 0.0075    , 0.00607143, 0.05642857, 0.02142857,
         0.01392857, 0.0325    ],
        [0.00714286, 0.00678571, 0.00535714, 0.05642857, 0.02785714,
         0.015     , 0.03107143],
        [0.00571429, 0.005     , 0.00428571, 0.05464286, 0.02035714,
         0.01571429, 0.03535714]]),
 'classes_by_label_quality':    Class Name  Class Index  Label Issues  Inverse Label Issues  Label Noise  \
 0           2            2           381                    88     0.981959   
 1           0            0  