### validation f1 확인

In [2]:
import numpy as np
import pandas as pd

score = pd.DataFrame(columns=['target', 'total', 'train', 'valid', 'f1'])
score['target'] = np.arange(7)

total = pd.read_csv('../../v1.3.1_aug3/train.csv')
val_pred = pd.read_csv('../../v1.3.1_aug3/valid_output_0.8596.csv')

val_ids = list(val_pred['ID'].values)
train = total[~total['ID'].str.contains('|'.join(val_ids))]
val_answer = total[total['ID'].str.contains('|'.join(val_ids))]

label = range(7)
for l in label:
    score.loc[score['target']==l, 'total'] = len(total[total['target']==l])
    score.loc[score['target']==l, 'train'] = len(train[train['target']==l])
    score.loc[score['target']==l, 'valid'] = len(val_pred[val_pred['target']==l])

    val_correct = val_pred[(val_pred['target']==l) & val_pred['ID'].str.contains('|'.join((val_answer[val_answer['target']==l]['ID'].values)))]
    recall = len(val_correct) / (len(val_answer[val_answer['target']==l]))
    precision = len(val_correct) / (len(val_pred[val_pred['target']==l]))

    score.loc[score['target']==l, 'f1'] = 2 * precision * recall / (precision + recall)

score

Unnamed: 0,target,total,train,valid,f1
0,0,286,227,67,0.857143
1,1,315,253,56,0.898305
2,2,259,208,60,0.828829
3,3,216,155,38,0.686869
4,4,272,223,42,0.879121
5,5,279,225,56,0.836364
6,6,308,243,68,0.917293


### logits 확인

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained('../../model_v1.3.1')
model = AutoModelForSequenceClassification.from_pretrained('../../model_v1.3.1/checkpoint-98', num_labels=7).to(DEVICE)

In [4]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

train = pd.read_csv('../../v1.3.1_aug3/train.csv')

model.eval()
preds = []
for _, data in tqdm(train.iterrows(), total=len(train)):
    tokenized = tokenizer(data['text'], padding='max_length', max_length=50, truncation=True, return_tensors='pt').to(DEVICE)
    with torch.no_grad():
        logits = model(**tokenized).logits.cpu().numpy()
        softmax = [np.exp(x)/np.sum(np.exp(logits)) for x in logits]
        preds.append({'ID': data['ID'], 'softmax': softmax})

preds[0]

  0%|          | 0/1935 [00:00<?, ?it/s]

{'ID': 'ynat-v1_train_00003',
 'softmax': [array([0.05140854, 0.05740018, 0.04042352, 0.04931035, 0.6059849 ,
         0.15501192, 0.04046061], dtype=float32)]}

In [5]:
from cleanlab.filter import find_label_issues

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

pred_probs = [np.array(p['softmax'][0]) for p in preds]
pred_probs = np.array(pred_probs)

ordered_label_issues = find_label_issues(
    labels=train['target'],
    pred_probs=pred_probs,
    return_indices_ranked_by='self_confidence'
)

In [6]:
print(f"total {len(ordered_label_issues)}")
for issue in ordered_label_issues:
    print('input text:', train.iloc[issue]['text'])
    print('label:', train.iloc[issue]['target'])
    pred = np.argmax(pred_probs[issue])
    print('pred:', pred)
    # train.loc[train['text']==train.iloc[issue]['text'], 'target'] = pred

    print('----------------------------------------')

# train.to_csv('../../v1.2.3/train.csv', index=False)

total 16
input text: H조원 실_  보…라인 D로벌 기z 사냥 7망
label: 5
pred: 1
----------------------------------------
input text: 밤낮없는 무더위…강릉 30.1도 등 강원 대부분 열대야
label: 1
pred: 0
----------------------------------------
input text: 러시!p[7A 정식 제안 온k면7푸틴+a프_만날4것
label: 6
pred: 0
----------------------------------------
input text: 명불@전 b기민…마린스키:왕9의\c격에0객석]4y
label: 0
pred: 1
----------------------------------------
input text: 차범K3>지성과 |B 나@히 손3민~지금부터Pq새 역사
label: 0
pred: 1
----------------------------------------
input text: t스 정복한 _공:능 .둑은 8 늦D&
label: 4
pred: 0
----------------------------------------
input text: 실l나는 괴t에 8름>식은!…b포 5_$=험해b니
label: 4
pred: 0
----------------------------------------
input text: 방탄소년단 LG G7 씽큐 광고 50일만에 1.5억뷰 돌파
label: 5
pred: 4
----------------------------------------
input text: 우산 챙기세요…전국 구름 많고 소나기
label: 6
pred: 0
----------------------------------------
input text: 내일날씨 가을 적시는 비…충청·남부 시간당 30㎜ 이상
label: 6
pred: 0
----------------------------------------
input text:

In [7]:
from cleanlab.dataset import health_summary
class_names = range(7)
health_summary(train['target'], pred_probs, class_names=class_names)

----------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary          |
|   for your dataset with 1,935 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.        |
----------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,6,6,5,2,0.016234,0.006557,0.983766
1,5,5,4,0,0.014337,0.0,0.985663
2,2,2,2,0,0.007722,0.0,0.992278
3,4,4,2,2,0.007353,0.007353,0.992647
4,0,0,2,8,0.006993,0.027397,0.993007
5,1,1,1,4,0.003175,0.012579,0.996825
6,3,3,0,0,0.0,0.0,1.0



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,0,6,0,6,5,0.002584
1,0,1,0,1,3,0.00155
2,0,4,0,4,2,0.001034
3,1,5,1,5,2,0.001034
4,4,5,4,5,2,0.001034
5,2,6,2,6,2,0.001034
6,0,5,0,5,0,0.0
7,0,2,0,2,0,0.0
8,0,3,0,3,0,0.0
9,1,4,1,4,0,0.0



 * Overall, about 0% (9 of the 1,935) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 1.00.

Generated with <3 from Cleanlab.



{'overall_label_health_score': 0.9953488372093023,
 'joint': array([[0.14677003, 0.00103359, 0.        , 0.        , 0.        ,
         0.        , 0.        ],
        [0.0005168 , 0.1622739 , 0.        , 0.        , 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.13281654, 0.        , 0.        ,
         0.        , 0.00103359],
        [0.        , 0.        , 0.        , 0.11162791, 0.        ,
         0.        , 0.        ],
        [0.00103359, 0.        , 0.        , 0.        , 0.13953488,
         0.        , 0.        ],
        [0.        , 0.00103359, 0.        , 0.        , 0.00103359,
         0.14211886, 0.        ],
        [0.00258398, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.15658915]]),
 'classes_by_label_quality':    Class Name  Class Index  Label Issues  Inverse Label Issues  Label Noise  \
 0           6            6             5                     2     0.016234   
 1           5            5  