In [None]:
import random
import re
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from cleanlab.filter import find_label_issues
from transformers import AutoModelForSequenceClassification, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
SEED = 42


def set_seed(seed=None):
    """Seed the random number generators of `random`, NumPy and PyTorch.

    Args:
        seed: Integer seed to apply. When omitted (``None``), the module-level
            ``SEED`` is looked up at call time, so a bare ``set_seed()``
            behaves exactly as it did before this parameter existed.
    """
    if seed is None:
        seed = SEED
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed the current CUDA device and then every CUDA device.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed()
# Running this cell on its own is enough to fix the seeds.

In [None]:
# Load the training CSV into a pandas DataFrame.
# Later cells read the "ID", "text" and "target" columns of this frame.
df_train = pd.read_csv("data/train.csv")  # adjust to match your file path

---

## 노이즈 데이터 필터링

In [None]:
# First-pass noise heuristic based on the special-character ratio and on how
# often word characters and special characters sit directly next to each other.
def is_noisy_text(text, special_ratio_threshold=0.1, cross_threshold=3):
    """Return True when `text` looks like character-level noise.

    A text is flagged only when BOTH conditions hold:
      * the share of special characters (anything that is neither a word
        character nor whitespace) is at least `special_ratio_threshold`, and
      * the number of non-overlapping "word char next to special char" pairs
        is at least `cross_threshold`.

    Args:
        text: The string to inspect. Non-string values (e.g. the NaN that
            pandas uses for a missing cell, or None) are treated as not noisy
            instead of raising a TypeError inside `re.findall`.
        special_ratio_threshold: Minimum special-character ratio (default 0.1).
        cross_threshold: Minimum number of word/special adjacencies (default 3).

    Returns:
        bool: True if the text is considered noisy, otherwise False.
    """
    if not isinstance(text, str):
        return False

    length = len(text)
    # Ratio of special characters; an empty string has ratio 0.
    special_char_ratio = len(re.findall(r"[^\w\s]", text)) / length if length else 0

    # Count of word<->special adjacencies (matches do not overlap).
    cross_count = len(re.findall(r"(\w[^\w\s])|([^\w\s]\w)", text))

    return special_char_ratio >= special_ratio_threshold and cross_count >= cross_threshold

In [None]:
# Second-pass filter: removes noise that is still left in the "clean" samples.
def secondary_noise_filter(
    text,
    english_ratio_threshold=0.1,
    special_ratio_threshold=0.1,
    cross_threshold=3,
):
    """Return True when `text` still looks noisy after the first filter.

    A text is flagged when the ratio of ASCII letters is at least
    `english_ratio_threshold` OR the ratio of special characters is at least
    `special_ratio_threshold`, AND the number of non-overlapping
    word/special-character adjacencies is at least `cross_threshold`.
    (The default adjacency threshold is 3, which is what the code has always
    used; the earlier comment that said 2 was out of date.)

    Args:
        text: The string to inspect. Non-string values (NaN, None) are treated
            as not noisy instead of raising inside `re.findall`.
        english_ratio_threshold: Minimum share of `[a-zA-Z]` characters.
        special_ratio_threshold: Minimum share of special characters.
        cross_threshold: Minimum number of word/special adjacencies.

    Returns:
        bool: True if the text is considered noisy, otherwise False.
    """
    if not isinstance(text, str):
        return False

    length = len(text)
    english_ratio = len(re.findall(r"[a-zA-Z]", text)) / length if length else 0
    special_char_ratio = len(re.findall(r"[^\w\s]", text)) / length if length else 0
    cross_count = len(re.findall(r"(\w[^\w\s])|([^\w\s]\w)", text))

    ratio_hit = english_ratio >= english_ratio_threshold or special_char_ratio >= special_ratio_threshold
    return ratio_hit and cross_count >= cross_threshold

In [None]:
# First-pass filtering: flag every row with is_noisy_text and split the frame.
# The flag is stored on df_train as the boolean column "is_noisy_initial".
df_train["is_noisy_initial"] = df_train["text"].apply(is_noisy_text).astype(bool)

# Index with the boolean column directly (no `== True` / `== False`), and take
# explicit copies so that adding columns to either subset later does not
# trigger pandas' SettingWithCopyWarning.
df_noisy_initial = df_train[df_train["is_noisy_initial"]].copy()
df_clean_initial = df_train[~df_train["is_noisy_initial"]].copy()

In [None]:
# Save the first-pass filtering results.
# Create the output folder first so the cell also works on a fresh checkout
# where "data/filtered" does not exist yet.
Path("data/filtered").mkdir(parents=True, exist_ok=True)
df_clean_initial.to_csv("data/filtered/clean_samples_v1.csv", index=False, encoding="utf-8-sig")
df_noisy_initial.to_csv("data/filtered/noisy_samples_v1.csv", index=False, encoding="utf-8-sig")

In [None]:
# Second-pass filtering: look for remaining noise inside the first-pass clean samples.
# `.assign` builds a new frame with the flag column instead of writing into a
# slice of df_train, which avoids SettingWithCopyWarning and keeps the cell
# idempotent when it is re-run.
df_clean_initial = df_clean_initial.assign(
    is_noisy_secondary=df_clean_initial["text"].apply(secondary_noise_filter).astype(bool)
)
df_noisy_secondary = df_clean_initial[df_clean_initial["is_noisy_secondary"]].copy()
df_clean_final = df_clean_initial[~df_clean_initial["is_noisy_secondary"]].copy()

In [None]:
# Report how many rows each filtering stage produced, one line per stage.
print(
    "\n".join(
        f"{caption} {len(frame)}"
        for caption, frame in (
            ("첫 번째 필터링 후 노이즈 샘플 수:", df_noisy_initial),
            ("첫 번째 필터링 후 clean 샘플 수:", df_clean_initial),
            ("두 번째 필터링 후 추가된 노이즈 샘플 수:", df_noisy_secondary),
            ("최종 clean 샘플 수:", df_clean_final),
        )
    )
)

첫 번째 필터링 후 노이즈 샘플 수: 1060
첫 번째 필터링 후 clean 샘플 수: 1740
두 번째 필터링 후 추가된 노이즈 샘플 수: 56
최종 clean 샘플 수: 1684


In [None]:
# Save the second-pass filtering results.
# Create the output folder first so the cell also works on a fresh checkout
# where "data/filtered" does not exist yet.
Path("data/filtered").mkdir(parents=True, exist_ok=True)
df_clean_final.to_csv("data/filtered/clean_samples_v2.csv", index=False, encoding="utf-8-sig")
df_noisy_secondary.to_csv("data/filtered/noisy_samples_v2.csv", index=False, encoding="utf-8-sig")

---

## 라벨 에러 탐지

In [None]:
# Label-error detection works on the rows that passed both noise filters.
# This is a plain assignment: `df` is a second name for the same object as
# `df_clean_final`, not a copy.
df = df_clean_final

In [None]:
# Pull the "text" and "target" columns out of the frame as plain Python lists
# (texts feed the tokenizer, labels feed cleanlab).
texts, labels = (df[column].tolist() for column in ("text", "target"))

### 사용하는 모델
- klue/bert-base
- klue/roberta-large
- FacebookAI/xlm-roberta-large
- monologg/koelectra-base-v3-discriminator

In [None]:
# Load the tokenizer and the sequence-classification model for the chosen checkpoint.
# NOTE(review): the load message printed under this cell reports that
# 'classifier.bias' and 'classifier.weight' are newly initialized, and no
# fine-tuning step is visible in this notebook. The probabilities computed
# further down would then come from an untrained classification head —
# confirm whether a fine-tuned checkpoint should be loaded here instead.
model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7)  # set to the number of label classes

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize every text in a single call, with padding and truncation enabled,
# and return the encoded batch as PyTorch tensors ("pt").
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

#### transformer 계열 모델을 통한 주제 분류

In [None]:
# Run inference and compute the predicted class probabilities.
# The forward pass is done in fixed-size batches instead of pushing the whole
# dataset through the model at once, so peak memory no longer grows with the
# number of samples.
INFERENCE_BATCH_SIZE = 64
num_examples = inputs["input_ids"].shape[0]

model.eval()  # make sure dropout is disabled during inference

logit_chunks = []
with torch.no_grad():
    for start in range(0, num_examples, INFERENCE_BATCH_SIZE):
        stop = start + INFERENCE_BATCH_SIZE
        # Slice every tensor of the encoded batch along the sample dimension.
        batch = {key: tensor[start:stop] for key, tensor in inputs.items()}
        outputs = model(**batch)
        logit_chunks.append(outputs.logits)

logits = torch.cat(logit_chunks, dim=0)

train_pred_probs = F.softmax(logits, dim=1).numpy()  # predicted probabilities, one row per sample

In [None]:
# Check the shape of train_pred_probs: it must be
# (number of clean samples, number of classes).
# The check is enforced with an assertion so a mismatch stops the notebook
# instead of relying on someone reading the displayed value.
expected_shape = (len(labels), model.config.num_labels)
assert train_pred_probs.shape == expected_shape, (
    f"unexpected pred_probs shape {train_pred_probs.shape}, expected {expected_shape}"
)
train_pred_probs.shape

(1684, 7)

#### 모델이 분류한 라벨을 기준으로 라벨 에러 탐지

In [None]:
# Detect likely label errors with cleanlab; the result is an array of sample
# positions ordered by self-confidence.
# n_jobs=1 keeps cleanlab in the current process: the default forks worker
# processes, which is what triggers the repeated "huggingface/tokenizers: The
# current process just got forked" warnings shown in this cell's output.
# NOTE(review): cleanlab's documentation recommends out-of-sample predicted
# probabilities (e.g. from cross-validation) — confirm train_pred_probs qualifies.
ordered_label_issues = find_label_issues(
    labels=labels,
    pred_probs=train_pred_probs,
    return_indices_ranked_by="self_confidence",
    n_jobs=1,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
# Number of samples flagged as having a label issue
len(ordered_label_issues)

855

#### 정상 데이터 추출

In [None]:
# Take every row position of df and drop the positions flagged as label
# issues, leaving only the positions of the normal rows.
# `sorted` makes the order explicit (ascending, i.e. original row order);
# converting a set straight to a list leaves the order unspecified.
all_indices = set(range(len(df)))
normal_indices = sorted(all_indices - set(ordered_label_issues))

In [None]:
# Extract only the normal rows. normal_indices holds row positions
# (built from range(len(df))), hence positional indexing with .iloc.
df_no_issues = df.iloc[normal_indices]

In [None]:
# Inspect the rows without label issues: print their count, then display the first rows.
print(f"정상 라벨 데이터 개수: {len(df_no_issues)}")
df_no_issues.head()

정상 라벨 데이터 개수: 829


Unnamed: 0,ID,text,target,is_noisy_initial,is_noisy_secondary
4,ynat-v1_train_00004,pI美대선I앞두고 R2fr단 발] $비해 감시 강화,6,False,False
5,ynat-v1_train_00005,美성인 6명 중 1명꼴 배우자·연인 빚 떠안은 적 있다,0,False,False
6,ynat-v1_train_00006,프로야구~롯TKIAs광주 경기 y천취소,1,False,False
9,ynat-v1_train_00009,듀얼심 아이폰 하반기 출시설 솔솔…알뜰폰 기대감,4,False,False
12,ynat-v1_train_00012,황총리 각 부처 비상대비태세 철저히 강구해야,2,False,False


In [None]:
# Sort by the target column and renumber the rows from 0.
df_no_issues_sorted = df_no_issues.sort_values(by="target").reset_index(drop=True)

# Save the result.
# Create the output folder first so the cell also works on a fresh checkout
# where "data/filtered_2" does not exist yet.
Path("data/filtered_2").mkdir(parents=True, exist_ok=True)
df_no_issues_sorted.to_csv("data/filtered_2/no_label_errors_v1.csv", index=False, encoding="utf-8-sig")

#### 비정상(라벨 이슈 있는) 데이터 확인

In [None]:
# Print the samples whose labels are suspected to be wrong.
#
# ordered_label_issues holds row POSITIONS within `df` (the filtered frame
# that `labels` and `train_pred_probs` were built from), so the rows must be
# looked up in `df`. Looking them up in df_train, as this cell did before,
# shows unrelated rows because df_train still contains the filtered-out samples.

# Change the slice to print a different range of samples.
head_issues = ordered_label_issues[0:10]
for issue in head_issues:
    row = df.iloc[issue]
    print("ID:", row["ID"])
    print("input text:", row["text"])
    print("label:", row["target"])
    print("-------------------")

ID: ynat-v1_train_01459
input text: 악성"드[감$kIoTR기기 saeS #료법E3려j`
label: 4
-------------------
ID: ynat-v1_train_01606
input text: FJO로M딜라(P 인수]추진E실d9시작
label: 5
-------------------
ID: ynat-v1_train_00809
input text: &r A농축 >라늄 저장한4 U+…핵yg3 발 :
label: 6
-------------------
ID: ynat-v1_train_00494
input text: 대형서점엔=없uDtOJ네yC과 손}는 b판사들
label: 0
-------------------
ID: ynat-v1_train_00982
input text: 회전 카메라 탑재한 갤럭시A80 SKT 단독출시…59만9천500원
label: 4
-------------------
ID: ynat-v1_train_01369
input text: 화천A마토축제 성,리 폐a|13C; 찾아 2억 판_
label: 0
-------------------
ID: ynat-v1_train_00481
input text: 알뜰폰 헬로모바일 청소년 요금 반값 할인
label: 3
-------------------
ID: ynat-v1_train_01315
input text: 김ChK}X리당*C朴대통6+탄핵L 9  `장설 y속보
label: 2
-------------------
ID: ynat-v1_train_00983
input text: @U·i 9일 정상회의…북핵·무역·투자z경=협력 등 f의
label: 6
-------------------
ID: ynat-v1_train_01018
input text: 5산 초Q학생 3명-9가RE진…김:+Th접b8P치원q'-#확(
label: 3
-------------------


#### health_summary를 통한 진단

In [None]:
from cleanlab.dataset import health_summary

In [None]:
# Names for the 7 classes; here they are simply the class ids.
class_names = [0, 1, 2, 3, 4, 5, 6]

# Build cleanlab's dataset health summary.
# `labels` (the plain list taken from df["target"]) is passed instead of the
# Series itself so this call receives exactly the same labels object as
# find_label_issues, and the Series' non-contiguous index plays no part.
summary_df = health_summary(labels, train_pred_probs, class_names=class_names)

----------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary          |
|   for your dataset with 1,684 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.        |
----------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,2,2,221,54,0.977876,0.915254,0.022124
1,0,0,225,85,0.949367,0.876289,0.050633
2,1,1,219,91,0.908714,0.80531,0.091286
3,5,5,228,214,0.857143,0.849206,0.142857
4,4,4,204,243,0.836066,0.858657,0.163934
5,3,3,184,296,0.769874,0.843305,0.230126
6,6,6,143,441,0.619048,0.833648,0.380952



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,3,6,3,6,117,0.069477
1,4,6,4,6,114,0.067696
2,5,6,5,6,109,0.064727
3,3,4,3,4,100,0.059382
4,0,6,0,6,90,0.053444
5,1,6,1,6,84,0.049881
6,3,5,3,5,83,0.049287
7,4,5,4,5,79,0.046912
8,0,3,0,3,71,0.042162
9,2,6,2,6,70,0.041568



 * Overall, about 85% (1,424 of the 1,684) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.15.

Generated with <3 from Cleanlab.



In [None]:
# Despite its name, summary_df is looked up by a string key here; this entry
# is the per-class table shown in the output below.
summary_df["classes_by_label_quality"]

# Per-class label-noise levels and label quality scores

Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,2,2,221,54,0.977876,0.915254,0.022124
1,0,0,225,85,0.949367,0.876289,0.050633
2,1,1,219,91,0.908714,0.80531,0.091286
3,5,5,228,214,0.857143,0.849206,0.142857
4,4,4,204,243,0.836066,0.858657,0.163934
5,3,3,184,296,0.769874,0.843305,0.230126
6,6,6,143,441,0.619048,0.833648,0.380952
