# CleanLab
- 참고 : https://github.com/cleanlab/cleanlab , https://midannii.notion.site/CleanLab-c98a2be6c7be49f2a1e08b0dc99289a4?pvs=4
- 목표 : label 노이즈를 제거해보자

In [None]:
import sys
# Print the path of the Python interpreter that is running this notebook.
print(sys.executable)

In [None]:
import os
import random
from tqdm import tqdm
import numpy as np
import pandas as pd

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Project root; every other directory below is a subfolder of it.
BASE_DIR = "/data/ephemeral/level2-nlp-datacentric-nlp-06/"
DATA_DIR, OUTPUT_DIR, LOG_DIR, CACHE_DIR = (
    os.path.join(BASE_DIR, subdir)
    for subdir in ('data/', 'output/', 'logs/', 'cache/')
)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

In [None]:
# Load the sequence-classification checkpoint and its tokenizer; downloaded
# files are cached under CACHE_DIR.
model_name = 'bash1130/bert-base-finetuned-ynat'
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=7,
    cache_dir=CACHE_DIR,
)
# Move the weights to the selected device before running inference.
model = model.to(DEVICE)

## Inference

In [None]:
# Read the CSV to run inference on (`train_p2g.csv` under DATA_DIR).
train_p2g_path = os.path.join(DATA_DIR, 'train_p2g.csv')
dataset_test = pd.read_csv(train_p2g_path)

In [None]:
# Run the classifier over every text and collect its outputs.
#   preds      -> one predicted class id per row
#   pred_probs -> one softmax probability array per row; each array keeps its
#                 leading batch axis of size 1, exactly as the model returns it
model.eval()
preds = []
pred_probs = []

# No gradients are needed for inference, so disable autograd for the whole loop.
with torch.no_grad():
    # Iterating the column directly (instead of iterrows) avoids building a
    # Series per row and lets tqdm read the length, so the bar shows a total.
    for text in tqdm(dataset_test['text']):
        # truncation=True cuts an over-long text down to the tokenizer's
        # configured maximum length instead of feeding the model more tokens
        # than it accepts.
        inputs = tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
        logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=1)

        pred_probs.append(probs.cpu().numpy())
        preds.extend(torch.argmax(probs, dim=1).cpu().numpy())

In [None]:
# Attach the model outputs to the frame as two new columns.
dataset_test = dataset_test.assign(preds=preds, pred_probs=pred_probs)

# Uncomment to save the annotated frame as a CSV under BASE_DIR:
# dataset_test.to_csv(os.path.join(BASE_DIR, 'cleanlab_data.csv'), index=False)

In [None]:
# Preview the first rows, which now include the `preds` and `pred_probs` columns.
dataset_test.head()

In [None]:
from cleanlab.filter import find_label_issues
from cleanlab.dataset import health_summary

def clean_lab(dataset, class_names=None):
    """Find likely label errors in *dataset* and summarize its label health.

    Args:
        dataset: DataFrame with a 'target' column holding the given labels and
            a 'pred_probs' column holding one per-class probability array per
            row. Each array may be shaped (K,) or (1, K).
        class_names: Optional class names forwarded to ``health_summary``.
            Defaults to ``[0, 1, ..., K-1]``, where K is the number of
            probability columns.

    Returns:
        A tuple ``(ordered_label_issues, summary)``: the value returned by
        ``find_label_issues`` when ranking by self-confidence, and the value
        returned by ``health_summary``.

    Raises:
        ValueError: If the stacked probabilities are not a 2-D
            (examples x classes) matrix.
    """
    labels = dataset['target'].to_numpy()

    # Build the probability matrix once and share it between both cleanlab calls.
    pred_probs = np.asarray(dataset['pred_probs'].to_list())
    # Per-row arrays of shape (1, K) stack to (N, 1, K). Drop only that middle
    # axis; a bare squeeze() would also collapse N when the dataset has one row.
    if pred_probs.ndim == 3 and pred_probs.shape[1] == 1:
        pred_probs = pred_probs[:, 0, :]
    if pred_probs.ndim != 2:
        raise ValueError(
            "pred_probs must form an (n_examples, n_classes) matrix, "
            f"got shape {pred_probs.shape}"
        )

    if class_names is None:
        class_names = list(range(pred_probs.shape[1]))

    ordered_label_issues = find_label_issues(
        labels=labels,
        pred_probs=pred_probs,
        return_indices_ranked_by='self_confidence',
    )

    summary = health_summary(labels, pred_probs=pred_probs, class_names=class_names)

    return ordered_label_issues, summary

In [None]:
# Run clean_lab on the annotated frame: get the flagged indices and the health summary.
ordered_label_issues, summary = clean_lab(dataset_test)

In [None]:
# Show the indices flagged as label issues (requested ranked by self-confidence).
ordered_label_issues

In [None]:
# List which entries the health summary contains.
summary.keys()

In [None]:
# Inspect the 'classes_by_label_quality' entry of the health summary.
summary['classes_by_label_quality']

## label issue로 판정된 샘플을 모두 제거한 데이터셋 만들기

In [None]:
# Reload the original CSV so the output has no `preds` / `pred_probs` columns.
total_p2g_df = pd.read_csv(os.path.join(DATA_DIR, 'train_p2g.csv'))

# ordered_label_issues holds row positions, while DataFrame.drop matches index
# labels. Map positions to labels first so the right rows are removed even if
# the frame's index is not the default 0..N-1 range.
flagged_index = total_p2g_df.index[ordered_label_issues]
total_p2g_df = total_p2g_df.drop(index=flagged_index)

total_p2g_df

In [None]:
# Write the cleaned frame to DATA_DIR as `p2g_cleanlab.csv`, without the index column.
cleaned_csv_path = os.path.join(DATA_DIR, 'p2g_cleanlab.csv')
total_p2g_df.to_csv(cleaned_csv_path, index=False)