In [2]:
import numpy as np
import pandas as pd

score = pd.DataFrame(columns=['target', 'total', 'train', 'valid', 'f1'])
score['target'] = np.arange(7)

total = pd.read_csv('../../data/v0.3/v3_concat.csv')
val_pred = pd.read_csv('../../data/v0.3/valid_output.csv')

val_ids = list(val_pred['ID'].values)
train = total[~total['ID'].str.contains('|'.join(val_ids))]
val_answer = total[total['ID'].str.contains('|'.join(val_ids))]

label = range(7)
for l in label:
    score.loc[score['target']==l, 'total'] = len(total[total['target']==l])
    score.loc[score['target']==l, 'train'] = len(train[train['target']==l])
    score.loc[score['target']==l, 'valid'] = len(val_pred[val_pred['target']==l])

    val_correct = val_pred[(val_pred['target']==l) & val_pred['ID'].str.contains('|'.join((val_answer[val_answer['target']==l]['ID'].values)))]
    recall = len(val_correct) / (len(val_answer[val_answer['target']==l]))
    precision = len(val_correct) / (len(val_pred[val_pred['target']==l]))

    score.loc[score['target']==l, 'f1'] = 2 * precision * recall / (precision + recall)

score

Unnamed: 0,target,total,train,valid,f1
0,0,261,197,36,0.68
1,1,527,341,105,0.639175
2,2,404,238,73,0.543933
3,3,681,533,151,0.675585
4,4,496,380,108,0.830357
5,5,433,325,91,0.703518
6,6,531,413,103,0.733032


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained('../model')
model = AutoModelForSequenceClassification.from_pretrained('../model/checkpoint-334', num_labels=7).to(DEVICE)

In [5]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

train = pd.read_csv('../../data/v0.3/v3_concat.csv')

model.eval()
preds = []
for _, data in tqdm(train.iterrows(), total=len(train)):
    tokenized = tokenizer(data['text'], padding='max_length', max_length=50, truncation=True, return_tensors='pt').to(DEVICE)
    with torch.no_grad():
        logits = model(**tokenized).logits.cpu().numpy()
        softmax = [np.exp(x)/np.sum(np.exp(logits)) for x in logits]
        preds.append({'ID': data['ID'], 'softmax': softmax})

preds[0]

  0%|          | 0/3333 [00:00<?, ?it/s]

{'ID': 'ynat-v1_train_01346',
 'softmax': [array([0.03626942, 0.04212366, 0.02697006, 0.76633376, 0.05744023,
         0.04488959, 0.02597326], dtype=float32)]}

In [6]:
from cleanlab.filter import find_label_issues

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

pred_probs = [np.array(p['softmax'][0]) for p in preds]
pred_probs = np.array(pred_probs)

ordered_label_issues = find_label_issues(
    labels=train['target'],
    pred_probs=pred_probs,
    return_indices_ranked_by='self_confidence'
)

In [10]:
print(f"total {len(ordered_label_issues)}")
for issue in ordered_label_issues:
    print('input text:', train.iloc[issue]['text'])
    print('label:', train.iloc[issue]['target'])
    pred = np.argmax(pred_probs[issue])
    print('pred:', pred)
    train.loc[train['text']==train.iloc[issue]['text'], 'target'] = pred

    print('----------------------------------------')

train.to_csv('../../data/v0.3/v3_concat_cleanlab.csv', index=False)

total 252
input text: 10대 그룹, 토지가액 73조…현대차 최고 땅값자 
label: 5
pred: 5
----------------------------------------
input text: 북V K구O표팀;F리1오픈 EM/위해 ?남
label: 1
pred: 1
----------------------------------------
input text: 과학기술정보통신부, 인공지능 윤리법안 제출 예고
label: 4
pred: 4
----------------------------------------
input text: 여자농구 KB국민은행 20182019 시즌권 11일부터 판매
label: 1
pred: 1
----------------------------------------
input text: 게시판 SKT AI 콘퍼런스 ai.x 2019 개최
label: 4
pred: 4
----------------------------------------
input text: 경기 Xa권 8개 B, Q존주m보h_T활8 자P
label: 1
pred: 1
----------------------------------------
input text: 美 미네소타r州H람M장 입구H 2격d 명 부상
label: 6
pred: 6
----------------------------------------
input text: 감사인 수 '과학 J지구 M연구 H작업>·강의
label: 3
pred: 3
----------------------------------------
input text: 아스널 스토크시티가 41승... 4위 리버풀이 1점으로.
label: 1
pred: 1
----------------------------------------
input text: 단골 고객 탄원에 난항 비행기 승객 대피시켜달라 
label: 3
pred: 3
----------------------------------------
input te