In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from cleanlab.filter import find_label_issues
import torch
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Step 1: CSV 파일을 Pandas 데이터프레임으로 로드
df_train = pd.read_csv('data/train.csv')  # 파일 경로에 맞게 수정

In [4]:
# Step 2: 텍스트와 타겟 열 추출
texts = df_train['text'].tolist()  # 텍스트 데이터를 리스트로 변환
labels = df_train['target'].tolist()  # 라벨 데이터를 리스트로 변환

In [5]:
# Step 3: 모델과 토크나이저 로드
model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7)  # 라벨 개수에 맞게 설정


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Step 4: 텍스트 데이터 토큰화
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)


In [7]:
# Step 5: 예측 수행 및 확률 계산
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

train_pred_probs = F.softmax(logits, dim=1).numpy()  # 예측 확률

In [8]:
train_pred_probs.shape

(2800, 7)

In [9]:
# Step 6: Cleanlab으로 라벨 오류 탐지
ordered_label_issues = find_label_issues(
    labels=labels,
    pred_probs=train_pred_probs,
    return_indices_ranked_by='self_confidence'
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [11]:
# Step 7: 라벨 오류가 의심되는 샘플 출력
head_issues = ordered_label_issues[:10]
for issue in head_issues:
    print('ID:', df_train.iloc[issue]['ID'])
    print('input text:', df_train.iloc[issue]['text'])
    print('label:', df_train.iloc[issue]['target'])
    print("-------------------")

ID: ynat-v1_train_01348
input text: 금의환향한 류현진 추신수와 같은 팀 특별할 것 같다
label: 5
-------------------
ID: ynat-v1_train_01775
input text: 시즌 첫 골 손흥민 모든 상황 준비해 좋은 결과로 이어졌다
label: 5
-------------------
ID: ynat-v1_train_01702
input text: 연말부터 12시간 단위 데이터 로밍 가능
label: 5
-------------------
ID: ynat-v1_train_02354
input text: 손가락 욕설한 델리 알리 A매치 1경기 출전금지
label: 5
-------------------
ID: ynat-v1_train_02174
input text: 주상복합건물 동 사이 거리 좁아진다…건축연면적↑
label: 5
-------------------
ID: ynat-v1_train_00689
input text: 카톡방 URL 검색 노출 개인정보법 위반 소지 약해
label: 5
-------------------
ID: ynat-v1_train_01576
input text: 되살아난 트라우마 롯데 연패에 울고 판정에 울고
label: 5
-------------------
ID: ynat-v1_train_01976
input text: 오션a6지 중& 자회사 지분 2억~O 취득
label: 5
-------------------
ID: ynat-v1_train_02329
input text: 윤상현 오늘 아침 김무성 자택 찾아 사과했다속보
label: 5
-------------------
ID: ynat-v1_train_01805
input text: 하나금투 서울반도체 올해 실적 상저하고…매수
label: 5
-------------------


In [12]:
from cleanlab.dataset import health_summary


In [13]:
class_names=[0,1,2,3,4,5,6]
health_summary(df_train['target'], train_pred_probs, class_names=class_names)

----------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary          |
|   for your dataset with 2,800 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.        |
----------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,0,0,393,13,0.989924,0.764706,0.010076
1,5,5,414,28,0.988067,0.848485,0.011933
2,2,2,380,110,0.979381,0.932203,0.020619
3,4,4,388,55,0.955665,0.753425,0.044335
4,3,3,363,128,0.942857,0.853333,0.057143
5,1,1,381,145,0.929268,0.833333,0.070732
6,6,6,78,1918,0.197468,0.858166,0.802532



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,2,6,2,6,348,0.124286
1,1,6,1,6,344,0.122857
2,5,6,5,6,334,0.119286
3,3,6,3,6,332,0.118571
4,4,6,4,6,325,0.116071
5,0,6,0,6,313,0.111786
6,1,2,1,2,48,0.017143
7,1,3,1,3,44,0.015714
8,3,4,3,4,37,0.013214
9,1,5,1,5,33,0.011786



 * Overall, about 86% (2,396 of the 2,800) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.14.

Generated with <3 from Cleanlab.



{'overall_label_health_score': 0.14428571428571424,
 'joint': array([[0.00142857, 0.01107143, 0.00678571, 0.00821429, 0.0025    ,
         0.00178571, 0.11      ],
        [0.        , 0.01035714, 0.0075    , 0.00785714, 0.00321429,
         0.00214286, 0.11535714],
        [0.00071429, 0.00964286, 0.00285714, 0.00321429, 0.00285714,
         0.00178571, 0.1175    ],
        [0.00107143, 0.00785714, 0.00392857, 0.00785714, 0.00428571,
         0.00142857, 0.11107143],
        [0.00035714, 0.00607143, 0.00857143, 0.00892857, 0.00642857,
         0.00107143, 0.11357143],
        [0.00071429, 0.00964286, 0.00571429, 0.01      , 0.00428571,
         0.00178571, 0.1175    ],
        [0.00178571, 0.0075    , 0.00678571, 0.0075    , 0.0025    ,
         0.00178571, 0.11321429]]),
 'classes_by_label_quality':    Class Name  Class Index  Label Issues  Inverse Label Issues  Label Noise  \
 0           0            0           393                    13     0.989924   
 1           5            5 