In [31]:
import pandas as pd

In [32]:
# 훈련 데이터셋 선언

data_train = pd.read_csv("../data/train.csv")
print(len(data_train))

2800


In [33]:
# 텍스트 오염 데이터셋 선언

data_contaminated_texts = pd.read_csv("../data/df_contaminated_texts.csv")
data_contaminated_texts = data_contaminated_texts.drop(["special_char_count", "special_char_ratio"], axis=1)
print(len(data_contaminated_texts))


# 기존 텍스트 오염 데이터셋 타겟을 숫자형으로 복구시켜줌

def mapping_label(df):
    # 딕셔너리로 매핑 정의
    label_map = {
        0:'생활문화',1:'스포츠',2:'정치',3:'사회',4:'IT과학',5:'경제',6:'세계'
    }
    inverted_label_map = {value: key for key, value in label_map.items()}

    # map 함수를 사용하여 변환
    df['target'] = df['target'].map(inverted_label_map)
    
    return df

# 사용
data_contaminated_texts = mapping_label(data_contaminated_texts)

for i in range(10):
    print(data_contaminated_texts.iloc[i])

1595
ID              ynat-v1_train_00624
text      사건!>실"를 b$#라 #극)]체O.;월f:?
target                            0
Name: 0, dtype: object
ID                   ynat-v1_train_02413
text      전W;참c 이nd 유j]"m객ie((우려` ?<. 개조
target                                 6
Name: 1, dtype: object
ID                   ynat-v1_train_02275
text      더*} ]i대+ 김현권!v원a}_대8*vdL줄!알았d 
target                                 2
Name: 2, dtype: object
ID                   ynat-v1_train_02765
text      $G ;!p 서울 |곳 나vo객 =적"bs 귀s,d거)
target                                 3
Name: 3, dtype: object
ID                   ynat-v1_train_01203
text      nO통령z\후 p싱턴으로 ^R!與지p부 @re-#{{\
target                                 2
Name: 4, dtype: object
ID                     ynat-v1_train_02105
text      ^r^[P홀w2019년H'v요?서트vYp 세,#uNs[cJ
target                                   0
Name: 5, dtype: object
ID                  ynat-v1_train_01383
text      H@마 대통령uir6후m워싱턴_(ymy무( 개k_z정
target                                6
Name: 6,

In [34]:
# 텍스트 비오염 데이터셋 선언

data_clean_texts = pd.DataFrame(columns=data_train.columns)
data_contaminated_texts_ids = data_contaminated_texts['ID'].tolist()

for i in range(len(data_train)):
    id = data_train.iloc[i]["ID"]
    if id not in data_contaminated_texts_ids:
        data_clean_texts.loc[len(data_clean_texts)] = data_train.iloc[i]

for i in range(10):
    print(data_clean_texts.iloc[i])

ID                  ynat-v1_train_00003
text      갤노트8 주말 27만대 개통…시장은 불법 보조금 얼룩
target                                5
Name: 0, dtype: object
ID                   ynat-v1_train_00005
text      美성인 6명 중 1명꼴 배우자·연인 빚 떠안은 적 있다
target                                 0
Name: 1, dtype: object
ID                     ynat-v1_train_00007
text      아가메즈 33득점 우리카드 KB손해보험 완파…3위 굳...
target                                   4
Name: 2, dtype: object
ID                   ynat-v1_train_00008
text      朴대통령 얼마나 많이 놀라셨어요…경주 지진현장 방문종합
target                                 6
Name: 3, dtype: object
ID               ynat-v1_train_00009
text      듀얼심 아이폰 하반기 출시설 솔솔…알뜰폰 기대감
target                             4
Name: 4, dtype: object
ID         ynat-v1_train_00011
text      NH투자 1월 옵션 만기일 매도 우세
target                       1
Name: 5, dtype: object
ID             ynat-v1_train_00012
text      황총리 각 부처 비상대비태세 철저히 강구해야
target                           2
Name: 6, dtype: object
ID                   ynat-v1_train_

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

num_classes = data_train["target"].nunique()

# KoBERT 모델 로드 및 학습
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=num_classes)

# 정확한 라벨이 있는 데이터로 모델 학습
texts_correct_labels = data_contaminated_texts['text'].tolist()
labels_correct_labels = data_contaminated_texts['target'].tolist()

# 텍스트 데이터를 모델 입력 형식으로 변환
inputs_clean = tokenizer(texts_correct_labels, padding=True, truncation=True, return_tensors="pt")
labels_clean = torch.tensor(labels_correct_labels)

# 모델 학습 단계
model.train()
outputs = model(**inputs_clean, labels=labels_clean)
loss = outputs.loss
loss.backward()


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch.nn.functional as F

# 노이즈가 있는 데이터에 대한 예측
texts_contaminated = data_clean_texts['text'].tolist()
inputs_contaminated = tokenizer(texts_contaminated, padding=True, truncation=True, return_tensors="pt")

model.eval()
with torch.no_grad():
    outputs_noisy = model(**inputs_contaminated)
    pred_probs_noisy = F.softmax(outputs_noisy.logits, dim=-1).numpy()

In [None]:
from cleanlab.filter import find_label_issues

label_issues = find_label_issues(
    labels=data_contaminated_texts['target'],
    pred_probs=pred_probs_noisy,
    return_indices_ranked_by='self_confidence',
)

In [43]:
from cleanlab.dataset import health_summary
class_names=[0,1,2,3,4,5,6]
health_summary(data_contaminated_texts['target'], pred_probs_noisy, class_names=class_names)

----------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary          |
|   for your dataset with 1,595 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.        |
----------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,2,2,227,0,0.995614,0.0,0.004386
1,0,0,226,14,0.995595,0.933333,0.004405
2,1,1,226,1,0.995595,0.5,0.004405
3,4,4,226,4,0.995595,0.8,0.004405
4,6,6,216,150,0.93913,0.914634,0.06087
5,5,5,214,78,0.930435,0.829787,0.069565
6,3,3,44,1132,0.19469,0.861492,0.80531



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,3,6,3,6,228,0.142947
1,3,5,3,5,198,0.124138
2,1,3,1,3,191,0.119749
3,0,3,0,3,188,0.117868
4,2,3,2,3,188,0.117868
5,3,4,3,4,183,0.114734
6,5,6,5,6,36,0.022571
7,4,6,4,6,29,0.018182
8,0,6,0,6,28,0.017555
9,2,6,2,6,23,0.01442



 * Overall, about 87% (1,383 of the 1,595) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.13.

Generated with <3 from Cleanlab.



{'overall_label_health_score': 0.13291536050156738,
 'joint': array([[0.00062696, 0.        , 0.        , 0.11661442, 0.        ,
         0.01128527, 0.0137931 ],
        [0.00062696, 0.00062696, 0.        , 0.11974922, 0.        ,
         0.00752351, 0.0137931 ],
        [0.00062696, 0.00062696, 0.00062696, 0.11786834, 0.00125392,
         0.00752351, 0.01442006],
        [0.00125392, 0.        , 0.        , 0.11410658, 0.00062696,
         0.00877743, 0.0169279 ],
        [0.00125392, 0.        , 0.        , 0.11410658, 0.00062696,
         0.00815047, 0.01818182],
        [0.00125392, 0.        , 0.        , 0.1153605 , 0.00062696,
         0.01003135, 0.0169279 ],
        [0.00376176, 0.        , 0.        , 0.12601881, 0.        ,
         0.00564263, 0.00877743]]),
 'classes_by_label_quality':    Class Name  Class Index  Label Issues  Inverse Label Issues  Label Noise  \
 0           2            2           227                     0     0.995614   
 1           0            0 