### logits 확인

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Run on the GPU when torch reports one as available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The tokenizer is read from the model directory; the classifier weights come
# from a specific checkpoint inside that directory.
tokenizer = AutoTokenizer.from_pretrained('../../model_v1.2.2')

# Original note: "model trained with v1.2.1".
# NOTE(review): the path below says v1.2.2 — confirm which data version produced this checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    '../../model_v1.2.2/checkpoint-144',
    num_labels=7,
)
model = model.to(DEVICE)

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Load the training split whose labels are going to be audited.
train = pd.read_csv('../../v1.2.2/train.csv')

# Put the model in evaluation mode before running inference.
model.eval()

# One record per training row: {'ID': <row id>, 'softmax': [<class-probability vector>]}.
preds = []
for _, data in tqdm(train.iterrows(), total=len(train)):
    # Each text is tokenized on its own and padded/truncated to 50 tokens.
    tokenized = tokenizer(data['text'], padding='max_length', max_length=50, truncation=True, return_tensors='pt').to(DEVICE)
    with torch.no_grad():
        logits = model(**tokenized).logits
        # Normalise over the class axis only. The previous hand-rolled version divided
        # by the sum over the *whole* logits array (correct only for a batch of one)
        # and could overflow in float32 for large logits.
        probs = torch.softmax(logits, dim=-1).cpu().numpy()
    # Keep the original layout: 'softmax' is a list with one probability vector per
    # input row, so later cells can keep reading `p['softmax'][0]`.
    preds.append({'ID': data['ID'], 'softmax': list(probs)})

# Show the first record as a quick sanity check.
preds[0]

  0%|          | 0/3250 [00:00<?, ?it/s]

{'ID': 'ynat-v1_train_00003',
 'softmax': [array([0.06120225, 0.17026156, 0.05120712, 0.04558142, 0.3258167 ,
         0.30573836, 0.04019262], dtype=float32)]}

In [17]:
from cleanlab.filter import find_label_issues

import os
# Presumably set to stop the tokenizers library from using parallelism (and warning
# about it) once worker processes get forked — TODO confirm this is still needed here.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Gather the single probability vector stored in each prediction record into one
# array with one row per training example.
pred_probs = np.array([np.asarray(record['softmax'][0]) for record in preds])

# NOTE(review): cleanlab's guidance is to use out-of-sample predicted probabilities;
# verify the checkpoint was not fit on these same rows, otherwise issues may be under-reported.
# The result holds row positions of suspected label errors, ranked by self-confidence.
ordered_label_issues = find_label_issues(
    labels=train['target'],
    pred_probs=pred_probs,
    return_indices_ranked_by='self_confidence',
)

In [34]:
# Report every example flagged as a label issue and overwrite its label with the
# model's most probable class.
print(f"total {len(ordered_label_issues)}")

# Column position of 'target', looked up once so rows can be updated purely by position.
target_col = train.columns.get_loc('target')

for issue in ordered_label_issues:
    row = train.iloc[issue]
    print('input text:', row['text'])
    print('label:', row['target'])
    pred = np.argmax(pred_probs[issue])
    print('pred:', pred)
    # Update exactly the flagged row. The earlier text-equality mask also rewrote
    # every other row sharing the same text (flagged or not) and rescanned the
    # whole column for each issue.
    train.iloc[issue, target_col] = pred

    print('----------------------------------------')

total 466
input text: 메시·호날두 UEFA 올해의 팀에 선정…EPL 선수 제로
label: 5
pred: 1
----------------------------------------
input text: 스피드 P7zO 호tel에 W240 코르국 사자 K전환
label: 5
pred: 1
----------------------------------------
input text: 호날두 vs 메시 266번째 엘클라시코가 온다
label: 5
pred: 1
----------------------------------------
input text: 기성용정우영 없는 중원 구자철황인범 라인 뜰까
label: 3
pred: 1
----------------------------------------
input text: MLB.com 다저스 3년 연속 WS 유력하지만 우승은 글쎄…
label: 4
pred: 1
----------------------------------------
input text: WNBA 박지수 LA 스파크스 상대로 2득점 2리바운드
label: 4
pred: 1
----------------------------------------
input text: 여자7위혁신k 출75M박 r발 작용한I
label: 2
pred: 1
----------------------------------------
input text: 홈런 2방에 벌랜더 와르르…MLB 양키스 ALCS서 기사회생
label: 0
pred: 1
----------------------------------------
input text: 칠레 청년 사자 우리서 자살 기도…구조과정서 사자 2마리 사살
label: 5
pred: 6
----------------------------------------
input text: 누워서 타는 자전거로 평창동계올림픽 홍보
label: 5
pred: 1
------------------------------------

In [None]:
# Write the current state of `train` out as the next dataset version.
train.to_csv('../../v1.2.3/train.csv', index=False)

# Spot-check one of the flagged headlines. This is deliberately the cell's last
# expression: as a non-final statement its result was computed and then discarded,
# so nothing was ever displayed.
train[train['text']=='WNBA 박지수 LA 스파크스 상대로 2득점 2리바운드']

In [22]:
from cleanlab.dataset import health_summary
# One class name per label index; 7 matches the num_labels the classifier was loaded with.
class_names = range(7)

# Summarise per-class label quality and class overlap for the current labels.
# The returned summary is the cell's last expression, so the notebook displays it.
health_summary(
    labels=train['target'],
    pred_probs=pred_probs,
    class_names=class_names,
)

----------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary          |
|   for your dataset with 3,250 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.        |
----------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,5,5,83,116,0.186937,0.243187,0.813063
1,3,3,76,16,0.185819,0.045845,0.814181
2,4,4,67,101,0.14922,0.20911,0.85078
3,0,0,69,60,0.143154,0.12685,0.856846
4,6,6,62,65,0.132479,0.138004,0.867521
5,1,1,73,45,0.123939,0.080214,0.876061
6,2,2,43,70,0.105134,0.16055,0.894866



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,4,5,4,5,94,0.028923
1,0,1,0,1,41,0.012615
2,2,6,2,6,38,0.011692
3,5,6,5,6,38,0.011692
4,3,5,3,5,28,0.008615
5,0,3,0,3,24,0.007385
6,1,2,1,2,24,0.007385
7,2,3,2,3,21,0.006462
8,1,4,1,4,21,0.006462
9,0,4,0,4,18,0.005538



 * Overall, about 10% (327 of the 3,250) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.90.

Generated with <3 from Cleanlab.



{'overall_label_health_score': 0.8993846153846153,
 'joint': array([[0.12707692, 0.00553846, 0.00338462, 0.00338462, 0.00369231,
         0.00276923, 0.00246154],
        [0.00707692, 0.15876923, 0.00646154, 0.00030769, 0.00430769,
         0.00184615, 0.00246154],
        [0.00061538, 0.00092308, 0.11261538, 0.00061538, 0.00215385,
         0.00123077, 0.00769231],
        [0.004     , 0.00092308, 0.00584615, 0.10246154, 0.00338462,
         0.00830769, 0.00092308],
        [0.00184615, 0.00215385, 0.        , 0.00030769, 0.11753846,
         0.01538462, 0.00092308],
        [0.00215385, 0.00215385, 0.00184615, 0.00030769, 0.01353846,
         0.11107692, 0.00553846],
        [0.00276923, 0.00215385, 0.004     , 0.        , 0.004     ,
         0.00615385, 0.12492308]]),
 'classes_by_label_quality':    Class Name  Class Index  Label Issues  Inverse Label Issues  Label Noise  \
 0           5            5            83                   116     0.186937   
 1           3            3  