In [1]:
import pandas as pd

In [2]:
# 훈련 데이터셋 선언

data_train = pd.read_csv("../data/train.csv")
print(len(data_train))

2800


In [3]:
# 텍스트 오염 데이터셋 선언

data_contaminated_texts = pd.read_csv("../data/df_contaminated_texts.csv")
data_contaminated_texts = data_contaminated_texts.drop(["special_char_count", "special_char_ratio"], axis=1)
print(len(data_contaminated_texts))


# 기존 텍스트 오염 데이터셋 타겟을 숫자형으로 복구시켜줌

def mapping_label(df):
    # 딕셔너리로 매핑 정의
    label_map = {
        0:'생활문화',1:'스포츠',2:'정치',3:'사회',4:'IT과학',5:'경제',6:'세계'
    }
    inverted_label_map = {value: key for key, value in label_map.items()}

    # map 함수를 사용하여 변환
    df['target'] = df['target'].map(inverted_label_map)
    
    return df

# 사용
data_contaminated_texts = mapping_label(data_contaminated_texts)

for i in range(10):
    print(data_contaminated_texts.iloc[i])

1595
ID              ynat-v1_train_00624
text      사건!>실"를 b$#라 #극)]체O.;월f:?
target                            0
Name: 0, dtype: object
ID                   ynat-v1_train_02413
text      전W;참c 이nd 유j]"m객ie((우려` ?<. 개조
target                                 6
Name: 1, dtype: object
ID                   ynat-v1_train_02275
text      더*} ]i대+ 김현권!v원a}_대8*vdL줄!알았d 
target                                 2
Name: 2, dtype: object
ID                   ynat-v1_train_02765
text      $G ;!p 서울 |곳 나vo객 =적"bs 귀s,d거)
target                                 3
Name: 3, dtype: object
ID                   ynat-v1_train_01203
text      nO통령z\후 p싱턴으로 ^R!與지p부 @re-#{{\
target                                 2
Name: 4, dtype: object
ID                     ynat-v1_train_02105
text      ^r^[P홀w2019년H'v요?서트vYp 세,#uNs[cJ
target                                   0
Name: 5, dtype: object
ID                  ynat-v1_train_01383
text      H@마 대통령uir6후m워싱턴_(ymy무( 개k_z정
target                                6
Name: 6,

In [4]:
# 텍스트 비오염 데이터셋 선언

data_clean_texts = pd.DataFrame(columns=data_train.columns)
data_contaminated_texts_ids = data_contaminated_texts['ID'].tolist()

for i in range(len(data_train)):
    id = data_train.iloc[i]["ID"]
    if id not in data_contaminated_texts_ids:
        data_clean_texts.loc[len(data_clean_texts)] = data_train.iloc[i]

for i in range(10):
    print(data_clean_texts.iloc[i])

ID                  ynat-v1_train_00003
text      갤노트8 주말 27만대 개통…시장은 불법 보조금 얼룩
target                                5
Name: 0, dtype: object
ID                   ynat-v1_train_00005
text      美성인 6명 중 1명꼴 배우자·연인 빚 떠안은 적 있다
target                                 0
Name: 1, dtype: object
ID                     ynat-v1_train_00007
text      아가메즈 33득점 우리카드 KB손해보험 완파…3위 굳...
target                                   4
Name: 2, dtype: object
ID                   ynat-v1_train_00008
text      朴대통령 얼마나 많이 놀라셨어요…경주 지진현장 방문종합
target                                 6
Name: 3, dtype: object
ID               ynat-v1_train_00009
text      듀얼심 아이폰 하반기 출시설 솔솔…알뜰폰 기대감
target                             4
Name: 4, dtype: object
ID         ynat-v1_train_00011
text      NH투자 1월 옵션 만기일 매도 우세
target                       1
Name: 5, dtype: object
ID             ynat-v1_train_00012
text      황총리 각 부처 비상대비태세 철저히 강구해야
target                           2
Name: 6, dtype: object
ID                   ynat-v1_train_

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F
from cleanlab.filter import find_label_issues

# KoBERT 모델 로드
num_classes = data_train["target"].nunique()
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=num_classes)

# 학습에 사용할 데이터: 정확한 라벨이 있는 데이터로 학습
texts_correct_labels = data_contaminated_texts['text'].tolist()
labels_correct_labels = data_contaminated_texts['target'].tolist()

# 텍스트 데이터 토크나이저 적용
inputs_correct_labels = tokenizer(texts_correct_labels, padding=True, truncation=True, return_tensors="pt")
labels_correct_labels = torch.tensor(labels_correct_labels)

# 모델 학습 단계
model.train()
outputs = model(**inputs_correct_labels, labels=labels_correct_labels)
loss = outputs.loss
loss.backward()
# 옵티마이저 설정 후 학습 업데이트 필요
# optimizer.step()  # 옵티마이저가 설정된 경우에만 필요

# 라벨 노이즈가 있는 데이터에 대한 예측
texts_incorrect_labels = data_clean_texts['text'].tolist()
inputs_incorrect_labels = tokenizer(texts_incorrect_labels, padding=True, truncation=True, return_tensors="pt")

model.eval()
with torch.no_grad():
    outputs_noisy = model(**inputs_incorrect_labels)
    pred_probs_incorrect_labels = F.softmax(outputs_noisy.logits, dim=-1)

# find_label_issues로 라벨 이슈 탐지
label_issues = find_label_issues(
    labels=data_clean_texts['target'].tolist(),  # 라벨을 리스트 형태로 전달
    pred_probs=pred_probs_incorrect_labels.numpy(),  # np 배열로 전달
    return_indices_ranked_by='self_confidence'
)

print("라벨 이슈 인덱스:", label_issues)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


라벨 이슈 인덱스: [ 132 1029  707  624 1000   28 1136  355 1022  733  981  483  759 1174
  621 1186  469  691 1109  625  275  856 1150 1070  931 1161   11  884
  419   32  777  103  100  127 1163  813  297  926  396  541  709  655
  728 1200  565 1180  605 1162  255  223  865  947  227   18  807 1035
  912  608  178  864  133  531 1042  485   23  154  481  896  686  742
 1004  357  921  607  737 1052  122  291  995  311  873  587  322  113
  671  609  244  778  163   45  725  841  131  299   92  251    6  243
 1055  330  372  774  825 1062  990  294  490   13  511  563  340  687
  929  513  206  267  676  648 1073   62  920  452  703  599  916  705
  862  379  377  158  853  684  793  700   97  660  189 1007  519   77
  505   56  581  306  222  610  237  555  999  560  855   27  274 1203
  724  453  577  938  474  298  939  423  309  834  214  230  883   10
  658 1189  240 1199 1076 1044  304  162  507  664  107  965  213  738
 1175  549  958   94  797  871  852  598 1159  247  721  786 1098 

In [None]:
from cleanlab.dataset import health_summary

# 예측 확률을 numpy 배열로 변환
pred_probs_incorrect_labels_np = pred_probs_incorrect_labels.numpy()

# health_summary 함수 호출
class_names = [0, 1, 2, 3, 4, 5, 6]
health_summary(data_clean_texts['target'].tolist(), pred_probs_incorrect_labels_np, class_names=class_names)

----------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary          |
|   for your dataset with 1,205 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.        |
----------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,5,5,188,0,0.994709,0.0,0.005291
1,6,6,164,9,0.993939,0.9,0.006061
2,2,2,159,0,0.99375,0.0,0.00625
3,3,3,158,0,0.993711,0.0,0.006289
4,1,1,181,5,0.989071,0.714286,0.010929
5,4,4,131,206,0.731844,0.811024,0.268156
6,0,0,31,792,0.182353,0.850698,0.817647



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,0,4,0,4,157,0.13029
1,0,5,0,5,152,0.126141
2,0,1,0,1,134,0.111203
3,0,2,0,2,127,0.105394
4,0,6,0,6,127,0.105394
5,0,3,0,3,126,0.104564
6,1,4,1,4,46,0.038174
7,4,6,4,6,39,0.032365
8,4,5,4,5,34,0.028216
9,3,4,3,4,31,0.025726



 * Overall, about 84% (1,016 of the 1,205) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.16.

Generated with <3 from Cleanlab.



{'overall_label_health_score': 0.15684647302904564,
 'joint': array([[0.1153527 , 0.00082988, 0.        , 0.        , 0.02406639,
         0.        , 0.00082988],
        [0.11037344, 0.00165975, 0.        , 0.        , 0.03817427,
         0.        , 0.00165975],
        [0.10539419, 0.00082988, 0.00082988, 0.        , 0.02489627,
         0.        , 0.00082988],
        [0.10456432, 0.00082988, 0.        , 0.00082988, 0.02572614,
         0.        , 0.        ],
        [0.10622407, 0.        , 0.        , 0.        , 0.03983402,
         0.        , 0.00248963],
        [0.12614108, 0.        , 0.        , 0.        , 0.02821577,
         0.00082988, 0.00165975],
        [0.10456432, 0.00165975, 0.        , 0.        , 0.02987552,
         0.        , 0.00082988]]),
 'classes_by_label_quality':    Class Name  Class Index  Label Issues  Inverse Label Issues  Label Noise  \
 0           5            5           188                     0     0.994709   
 1           6            6 