# Cleanlab 라이브러리를 사용한 라벨 오류 제거

In [None]:
import os
import random
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification

from tokenization_kobert import KoBertTokenizer

SEED = 456

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and CUDA) with the same value so runs are repeatable.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

# Do not truncate long text columns when a DataFrame is displayed.
pd.set_option("display.max_colwidth", None)

import re 
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer

from cleanlab.classification import CleanLearning

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data and output directories are resolved relative to the working directory.
BASE_DIR = os.getcwd()
DATA_DIR, OUTPUT_DIR = (
    os.path.join(BASE_DIR, relative_dir) for relative_dir in ('../data', '../output')
)

# Crawled train / validation splits.
dataset_train, dataset_valid = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('train_crawl.csv', 'dev_crawl.csv')
)

# Input texts and labels of each split, as arrays.
raw_train_texts, train_labels = (dataset_train[column].values for column in ('text', 'target'))
raw_valid_texts, valid_labels = (dataset_valid[column].values for column in ('text', 'target'))

## klue 버전 cleanlab - ai-hub 데이터

In [None]:
from sentence_transformers import SentenceTransformer, models
from transformers import AutoTokenizer, AutoModel

# AI-Hub corpus whose labels are going to be scored.
full_hub = pd.read_csv('../notebooks/full_hub_data.csv')

MODEL_NAME = "klue/roberta-large"

# Transformer module mapping tokens to embeddings. It loads the checkpoint and
# its tokenizer once; everything below reuses that single copy.
word_embedding_model = models.Transformer(MODEL_NAME)

# Keep the `tokenizer` / `bert_model` names bound for any cell that reads them,
# but point them at the objects already loaded above instead of loading a
# second full copy of the checkpoint just to read its hidden size.
tokenizer = word_embedding_model.tokenizer
bert_model = word_embedding_model.auto_model

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Sentence embeddings and labels of the AI-Hub corpus.
# NOTE: this rebinds the notebook-wide `train_texts` / `train_labels` names.
train_texts = model.encode(full_hub['text'].values)
train_labels = full_hub['target'].values

# Score each label with cleanlab; callers decide the cutoff for relabeling.
def label_cleaning(source_df: pd.DataFrame, data: np.ndarray, labels: np.ndarray) -> pd.DataFrame:
    """Attach cleanlab's per-row label quality and predicted label to ``source_df``.

    A ``LogisticRegression`` classifier is wrapped in ``CleanLearning`` with
    5-fold cross-validation and ``find_label_issues`` is run on ``data`` and
    ``labels``. The resulting ``label_quality`` (a score between 0 and 1) and
    ``predicted_label`` columns are appended to a copy of the source rows.

    Args:
        source_df: DataFrame holding the original rows. It is not modified.
        data: Feature array fed to the logistic-regression model, one row per
            row of ``source_df``.
        labels: Original labels associated with ``data``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the columns of
        ``source_df`` followed by ``label_quality`` and ``predicted_label``.

    Raises:
        ValueError: If ``source_df``, ``data`` and ``labels`` do not all have
            the same number of rows.
    """
    n_rows = len(source_df)
    if len(data) != n_rows or len(labels) != n_rows:
        # A mismatch would otherwise be padded with NaN rows by the
        # concatenation below and silently misalign texts and scores.
        raise ValueError(
            "source_df, data and labels must have the same length "
            f"(got {n_rows}, {len(data)} and {len(labels)})"
        )

    classifier = LogisticRegression(max_iter=500)
    cv_n_folds = 5
    cl = CleanLearning(classifier, cv_n_folds=cv_n_folds)

    label_issues_df = cl.find_label_issues(X=data, labels=labels)
    quality_df = label_issues_df[['label_quality', 'predicted_label']].reset_index(drop=True)

    # Work on a re-indexed copy so both frames align positionally without
    # touching the caller's DataFrame.
    return pd.concat([source_df.reset_index(drop=True), quality_df], axis=1)

# Append cleanlab's label_quality / predicted_label columns to the AI-Hub rows.
df_hub = label_cleaning(full_hub, train_texts, train_labels)

In [41]:
# Category name for each numeric class id.
dictionary = dict(enumerate(['IT과학', '경제', '사회', '문화', '국제', '스포츠', '정치']))

# Work on a copy so the scored AI-Hub frame itself stays untouched.
df = df_hub.copy()

# Add a readable "text_<column>" counterpart for both label columns.
for _label_col in ('target', 'predicted_label'):
    df[f'text_{_label_col}'] = df[_label_col].replace(dictionary)

# Drop metadata columns that are not needed when inspecting the labels.
df.drop(columns=['publish_date', 'year', 'len'], inplace=True)

In [82]:
# Number of rows to keep per class; the list index is the class id.
label_nums = [1500, 1500, 1500, 1500, 1000, 1000, 1000]

# For every class keep the rows with the highest label_quality. The slices are
# collected first and concatenated once: growing a frame from an empty
# DataFrame inside the loop copies it on every step and degrades the column
# dtypes to object.
_top_rows_per_class = [
    df[df['target'] == class_id]
    .sort_values('label_quality', ascending=False)
    .head(keep_count)
    for class_id, keep_count in enumerate(label_nums)
]
final_df = pd.concat(_top_rows_per_class)[['text', 'target']]
final_df

Unnamed: 0,text,target
104453,"광주시, 국가 인공지능 데이터센터 서비스 이용자 모집",0
105292,"광주시, 인공지능·양자산업 육성",0
107684,2023년 빅데이터·AI 활용 '똑똑한 발전소' 나온다,0
21558,"부산시, 인공지능(AI) 기반 실시간 게임 분석기술 개발 추진",0
25912,"부산시, 인공지능 기반 실시간 게임 분석기술 개발",0
...,...,...
30476,"신영대 의원, 민주당 대선경선기획단에 인선",6
13018,"""친애하는 정성호 동지""란 추미애…野 ""소음에 온 국민 피곤""",6
377,"여야 후보들 레이스 본격화… 박-우, 나-오 ‘토론 맞대결’ 하이라이트",6
45005,"'풍요속 불안', 국민의힘 서울시장 주자 10명...안 대표와 단일화 '관심'",6


In [83]:
# final_df.to_csv('../hub_cleanlab_data.csv', index=False)

## klue 버전 cleanlab - train, valid

In [4]:
from sentence_transformers import SentenceTransformer, models
from transformers import AutoTokenizer, AutoModel
# Load the training set from train.csv.
train = pd.read_csv('../data/train.csv')

# NOTE: these rebind the notebook-wide `raw_train_texts` / `train_labels`
# names, so later cells see the values taken from `train`.
raw_train_texts = train['text'].values
train_labels = train['target'].values

MODEL_NAME = "klue/roberta-large"
# Built directly from the checkpoint name; per the log printed below,
# sentence-transformers then creates a new model with MEAN pooling.
model = SentenceTransformer(MODEL_NAME)

# Get the sentence embeddings
train_texts = model.encode(raw_train_texts)

# Append cleanlab's label_quality / predicted_label columns to the train rows.
df_train = label_cleaning(train, train_texts, train_labels)

No sentence-transformers model found with name /opt/ml/.cache/torch/sentence_transformers/klue_roberta-large. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /opt/ml/.cache/torch/sentence_transformers/klue_roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
# Take the originally separate train / validation splits and clean each one.
from sentence_transformers import SentenceTransformer, models
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "klue/roberta-large"

# Transformer module mapping tokens to embeddings. It loads the checkpoint and
# its tokenizer once; everything below reuses that single copy.
word_embedding_model = models.Transformer(MODEL_NAME)

# Keep the `tokenizer` / `bert_model` names bound, but reuse the objects
# already loaded above rather than loading the checkpoint a second time only
# to read its hidden size.
tokenizer = word_embedding_model.tokenizer
bert_model = word_embedding_model.auto_model

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Re-derive texts and labels from the split frames right here. Other cells
# rebind `raw_train_texts` / `train_labels` to different datasets, and using
# those stale values would pair `dataset_train` with embeddings and labels
# that belong to another file.
raw_train_texts = dataset_train['text'].values
raw_valid_texts = dataset_valid['text'].values
train_labels = dataset_train['target'].values
valid_labels = dataset_valid['target'].values

# Get the sentence embeddings
train_texts = model.encode(raw_train_texts)
valid_texts = model.encode(raw_valid_texts)

df_train = label_cleaning(dataset_train, train_texts, train_labels)
df_valid = label_cleaning(dataset_valid, valid_texts, valid_labels)

In [9]:
# Category name for each numeric class id.
dictionary = dict(zip(range(7), ('IT과학', '경제', '사회', '문화', '국제', '스포츠', '정치')))

# Copy of the scored train frame with readable label names added and the
# url / date columns removed.
df = df_train.copy()
df = df.assign(
    text_target=df['target'].replace(dictionary),
    text_predicted_label=df['predicted_label'].replace(dictionary),
).drop(columns=['url', 'date'])

# Show ten random rows whose label_quality is at most 0.01.
df.loc[df['label_quality'] <= 0.01].sample(10)

Unnamed: 0,ID,text,target,label_quality,predicted_label,text_target,text_predicted_label
5480,ynat-v1_train_05480,KT·SKT 5G 통신 국제 표준 규격 개발 주도종합,2,0.008461,0,사회,IT과학
30154,ynat-v1_train_30154,구글코리아 머신러닝 챌린지 2017 개최,3,0.002102,0,문화,IT과학
7624,ynat-v1_train_07624,英 총리 후보 메이·레드섬 EU탈퇴 협상 싸고 견해차 뚜렷,6,0.009837,4,정치,국제
37594,ynat-v1_train_37594,7월 분양권 거래 양극화…수도권 76% 급증·지방 36% 줄어,3,0.001852,1,문화,경제
13797,ynat-v1_train_13797,네이버TV캐스트 네이버TV로 새 단장…모바일 편의성↑,6,0.004966,0,정치,IT과학
30141,ynat-v1_train_30141,가루다항공 인도네시아·호주 항공권 최대 15% 할인,6,0.001298,4,정치,국제
36521,ynat-v1_train_36521,혈압측정에 무선이어폰 수납까지…각양각색 스마트워치 눈길,3,0.000443,0,문화,IT과학
9195,ynat-v1_train_09195,미원상사 1주당 1천원 분기배당,2,0.008016,1,사회,경제
26949,ynat-v1_train_26949,판문점 선언 함께 걷는 남북정상,5,0.000612,6,스포츠,정치
21106,ynat-v1_train_21106,애플 뮤직에 성인용 콘텐츠 추가…5천만곡으로 확대,2,0.006954,0,사회,IT과학


In [30]:
# Category name for each numeric class id.
dictionary = {
    0: 'IT과학', 1: '경제', 2: '사회', 3: '문화',
    4: '국제', 5: '스포츠', 6: '정치',
}

# Copy of the scored validation frame, extended with readable label names.
df = df_valid.copy()
for _text_col, _id_col in (
    ('text_target', 'target'),
    ('text_predicted_label', 'predicted_label'),
):
    df[_text_col] = df[_id_col].replace(dictionary)
df = df.drop(['url', 'date'], axis='columns')

# Show every row whose label_quality is at most 0.01.
_is_low_quality = df['label_quality'] <= 0.01
df[_is_low_quality]

Unnamed: 0,ID,text,target,label_quality,predicted_label,text_target,text_predicted_label
42,ynat-v1_train_21763,올림픽 평창 계촌5리 작은 마을서 지구촌 화합 파티,5,0.004693,5,문화,스포츠
92,ynat-v1_train_10074,친박계 김무성 권력자 발언에 발끈…확전은 자제,6,0.000762,6,IT과학,정치
97,ynat-v1_train_34479,한수원 안전 공익광고로 서울영상광고제 은상 수상,2,0.006309,2,문화,사회
289,ynat-v1_train_00354,인공지능에서 구글과 경쟁하려면 데이터 확보 중요,0,0.001035,0,국제,IT과학
381,ynat-v1_train_26552,단독 멈춰선 롯데…수조원대 면세점·호텔 인수도 무산,1,0.002261,1,IT과학,경제
...,...,...,...,...,...,...,...
13569,ynat-v1_train_24010,백악관 IS 시리아에서 100% 제거됐다,4,0.000833,4,경제,국제
13572,ynat-v1_train_27428,연구성과로 창업을…과기부 실험실 일자리 토크콘서트,0,0.004482,0,문화,IT과학
13618,ynat-v1_train_06491,靑 우병우 수석 휴가복귀 후 정상근무,6,0.001107,6,스포츠,정치
13642,ynat-v1_train_02464,그리스 조만간 10년물 국고채 발행…구제금융 이후 처음,4,0.000253,4,사회,국제


In [10]:
# Rows whose label_quality is at or below this cutoff take cleanlab's
# predicted label; every other row keeps its original target.
LABEL_QUALITY_CUTOFF = 0.01

# Vectorized replacement: avoids a Python-level call for every row.
df_train['target'] = df_train['predicted_label'].where(
    df_train['label_quality'] <= LABEL_QUALITY_CUTOFF, df_train['target']
)
df_valid['target'] = df_valid['predicted_label'].where(
    df_valid['label_quality'] <= LABEL_QUALITY_CUTOFF, df_valid['target']
)