# Make a Model
- ~~seed 42, 456, 777, 109, 1004 총 5개의 seed로 앙상블 한 결과값 생성~~
- ~~roberta-large 모델로 앙상블~~
- ~~detection 의 detection~~

In [1]:
import os
import random
from tqdm import tqdm
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

In [2]:
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [3]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [4]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [5]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, './data')
OUTPUT_DIR = os.path.join(BASE_DIR, './output')

In [6]:
model_name = 'klue/bert-base'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7, cache_dir='/data/ephemeral/huggingface').to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='/data/ephemeral/huggingface')

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initial

In [9]:
data = pd.read_csv(os.path.join(DATA_DIR, 'train_aihub.csv'))
dataset_train, dataset_valid = train_test_split(data, test_size=0.2, stratify=data['target'],random_state=SEED)

In [11]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []
        self.labels = []
        
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', max_length=64, truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),
            'token_type_ids': self.inputs[idx]['token_type_ids'].squeeze(0),
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }
    
    def __len__(self):
        return len(self.labels)

In [12]:
data_train = BERTDataset(dataset_train, tokenizer)
data_valid = BERTDataset(dataset_valid, tokenizer)

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
f1 = evaluate.load('f1')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')

In [15]:
### for wandb setting
os.environ['WANDB_DISABLED'] = 'true'

In [19]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    logging_strategy='steps',
    evaluation_strategy='steps',
    save_strategy='steps',
    logging_steps=0.05,
    eval_steps=0.05,
    save_steps=0.05,
    save_total_limit=1,
    learning_rate= 2e-05,
    gradient_accumulation_steps=1,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [21]:
trainer.train()

***** Running training *****
  Num examples = 94349
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 11794
  Number of trainable parameters = 110622727


Step,Training Loss,Validation Loss,F1
1000,0.3582,0.795797,0.772574
2000,0.5561,0.610835,0.778774
3000,0.6025,0.554258,0.794589
4000,0.5849,0.547556,0.80288
5000,0.5604,0.541434,0.80353
6000,0.5429,0.540865,0.807547
7000,0.4192,0.553316,0.804258
8000,0.4036,0.565455,0.808688
9000,0.3977,0.552258,0.808941
10000,0.3982,0.546384,0.810033


***** Running Evaluation *****
  Num examples = 23588
  Batch size = 16
Saving model checkpoint to /data/ephemeral/level2_datacentric/JHW/./output/checkpoint-1000
Configuration saved in /data/ephemeral/level2_datacentric/JHW/./output/checkpoint-1000/config.json
Model weights saved in /data/ephemeral/level2_datacentric/JHW/./output/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [/data/ephemeral/level2_datacentric/JHW/output/checkpoint-900] due to args.save_total_limit
Deleting older checkpoint [/data/ephemeral/level2_datacentric/JHW/output/checkpoint-1200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 23588
  Batch size = 16
Saving model checkpoint to /data/ephemeral/level2_datacentric/JHW/./output/checkpoint-2000
Configuration saved in /data/ephemeral/level2_datacentric/JHW/./output/checkpoint-2000/config.json
Model weights saved in /data/ephemeral/level2_datacentric/JHW/./output/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [/d

TrainOutput(global_step=11794, training_loss=0.46935305725584114, metrics={'train_runtime': 1341.5966, 'train_samples_per_second': 140.652, 'train_steps_per_second': 8.791, 'total_flos': 6206344849363200.0, 'train_loss': 0.46935305725584114, 'epoch': 2.0})

# Making Probability

In [22]:
model.eval()
logits_list = []
for idx, sample in tqdm(data.iterrows()):
    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
        logits = logits.softmax(dim=-1)
        logits_list.extend(logits.cpu().numpy())

117937it [16:41, 117.70it/s]


In [23]:
probs = [list(item) for item in logits_list]

In [23]:
data['probs'] = probs
data.to_csv(f'./train_with_logits/electra_{SEED}.csv', index=False)

# Ensemble Label Prob

In [127]:
import glob

files = glob.glob('./train_with_logits/*.csv')

In [148]:
probs = [pd.read_csv(file)['probs'] for file in files]

In [150]:
probs = [np.array(prob.apply(eval).to_list()) for prob in probs]

In [159]:
probs = np.array(probs).mean(0)

# ERROR DETECT

In [24]:
probs = np.array(probs)
data = pd.read_csv('./data/train_aihub.csv')

In [25]:
id2label = {
    0:'IT과학',
    1:'경제',
    2:'사회',
    3:'생활문화',
    4:'세계',
    5:'스포츠',
    6:'정치',
}

label2id = {
    'IT과학':0,
    '경제':1,
    '사회':2,
    '생활문화':3,
    '세계':4,
    '스포츠':5,
    '정치':6,    
}

In [26]:
from cleanlab.filter import find_label_issues

ordered_label_issues = find_label_issues(
    labels=data['target'], #데이터셋 라벨
    pred_probs=probs, #정답 예측 확률
    return_indices_ranked_by='self_confidence',
)

head_issues=ordered_label_issues[:10]
for issue in head_issues:
    print('input text:',data.iloc[issue]['text'])
    print('label:',id2label[data.iloc[issue]['target']])
print('total_issues:',len(ordered_label_issues))

input text: 주말 N 여행 제주권 만설 한라산…그 순수한 아름다움 속으로
label: 세계
input text: 리듬체조 유망주 서고은, '집사부일체' 깜짝 등장
label: 스포츠
input text: '복수해라' 윤현민 "좋은 사람들과 동행...행복했던 시간" 종영 소감
label: IT과학
input text: "죄질이 좋지 않다"…자가격리 두번 이탈 20대 첫 징역형
label: 스포츠
input text: '애로부부' 박철민·유경진 부부, 싸한 분위기 "오는 길에도 싸웠다"
label: IT과학
input text: 무릎 부상 기성용 선두 첼시전 출전 명단서 제외
label: 경제
input text: [철원]철원 모든 대학생에 장학금 지원
label: 스포츠
input text: [철원]철원장학회 장학생 선발
label: 스포츠
input text: 충북경찰 ‘앞담화’ 시간…“그냥 쉬고 싶은데 휴가 사유 왜 묻냐?”
label: 경제
input text: “코로나로 우울” 마약 밀수입 제주 30대 집행유예
label: 경제
total_issues: 5694


In [27]:
from cleanlab.dataset import health_summary

class_names=[0,1,2,3,4,5,6]
summary = health_summary(data['target'], probs, class_names=class_names)

------------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary            |
|   for your dataset with 117,937 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.          |
------------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,0,0,944,754,0.166784,0.137843,0.833216
1,1,1,1125,1626,0.053646,0.075727,0.946354
2,2,2,1931,1601,0.047062,0.039336,0.952938
3,6,6,683,782,0.044928,0.051108,0.955072
4,4,4,241,363,0.037209,0.055008,0.962791
5,3,3,607,421,0.029036,0.02032,0.970964
6,5,5,165,149,0.021454,0.019414,0.978546



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,1,2,1,2,1286,0.010904
1,0,1,0,1,858,0.007275
2,2,6,2,6,839,0.007114
3,0,2,0,2,550,0.004664
4,2,3,2,3,543,0.004604
5,1,6,1,6,277,0.002349
6,2,4,2,4,212,0.001798
7,1,3,1,3,178,0.001509
8,4,6,4,6,135,0.001145
9,0,3,0,3,106,0.000899



 * Overall, about 5% (5,696 of the 117,937) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.95.

Generated with <3 from Cleanlab.



## Change Error by Threshold

In [28]:
model_logits = np.array(probs)

In [61]:
import evaluate

# Error 이슈에 대한 macro f1 
f1 = evaluate.load('f1')
f1.compute(predictions=np.argmax(model_logits[ordered_label_issues], axis=1), references=data.iloc[ordered_label_issues]['target'].values, average='macro')

{'f1': 0.05952908352452441}

In [30]:
# model이 어느 정도의 확신으로 예측했는지
# `ordered_label_issues` 는 모델의 라벨값과 가장 다른 순으로 정렬
print(model_logits[ordered_label_issues][0])
print(model_logits[ordered_label_issues][-1])

[1.2225989e-03 7.4410113e-04 8.2024140e-03 9.8873597e-01 1.7811389e-04
 5.7038205e-04 3.4642071e-04]
[0.59375054 0.38971087 0.00827854 0.00182783 0.00181179 0.00385392
 0.00076651]


In [None]:
# logi2s의 threshold를 찾기
i = 98
print('Sentence:', data.iloc[ordered_label_issues[i]]['text'])
print('Model Prob:', model_logits[ordered_label_issues[i]])
print('Model Pred:', id2label[np.argmax(model_logits[ordered_label_issues[i]])])
print('Train Label:', id2label[data.iloc[ordered_label_issues[i]]['target']])

In [59]:
# 모델이 99%로 예측하면 변경
for idx in ordered_label_issues:
    if model_logits[idx].max() >= 0.99:
        data.iloc[idx, 2] = np.argmax(model_logits[idx])

In [126]:
non_json = []

for idx in ordered_label_issues:
    try:
        json_label = title2label[data.iloc[idx]['text']]
        json_exists = True
    except:
        json_exists = False
        non_json.append(idx)
    
    if json_exists:
        data.iloc[idx, 2] = label2id[json_label]
    else:
        pass

In [62]:
data.to_csv('./data/train_aihub+LED.csv')

In [63]:
data.target.value_counts()

target
2    41124
1    20923
3    20892
6    15194
5     7709
4     6469
0     5626
Name: count, dtype: int64

In [131]:
# logits의 threshold를 찾기
i = 111
print('Sentence:', data.iloc[i]['text'])
print('Model Prob:', model_logits[i])
print('Model Pred:', id2label[np.argmax(model_logits[i])])
print('Train Label:', id2label[data.iloc[i]['target']])

Sentence: 자고나면 한 명씽 느느 녜비후보…열기 다라오르는 서부산
Model Prob: [1.9274342e-03 9.9778979e-04 2.5337322e-03 9.9153203e-01 8.4386056e-04
 1.4843867e-03 6.8072486e-04]
Model Pred: 생활문화
Train Label: 정치


In [128]:
non_json

[6458,
 111,
 797,
 685,
 2207,
 2365,
 4393,
 6717,
 5831,
 5101,
 2724,
 2096,
 1681,
 2879,
 6638,
 841,
 1382,
 4738,
 62,
 157,
 5953,
 5364,
 2316,
 4122,
 2649,
 3768,
 4329,
 3749,
 3866,
 78,
 6027,
 3327,
 5497,
 3957,
 2515,
 4163,
 5245,
 4048,
 1458,
 6546,
 1865,
 5958,
 3189,
 4369,
 2355,
 6327,
 370,
 2949,
 3406,
 4275,
 4523,
 2299,
 5107,
 2182,
 2402,
 3655,
 1607,
 4360,
 6845,
 6978,
 5535,
 405,
 4206,
 5754,
 631,
 248,
 5092,
 1957,
 6666,
 6113,
 4718,
 3523,
 3119,
 6481,
 6297,
 5887,
 6375,
 3585,
 312,
 6634,
 297,
 2921,
 3448,
 1377,
 4133,
 5352,
 2530,
 1346,
 2540,
 6650,
 4548,
 3925,
 5084,
 6829,
 2835,
 5386,
 266,
 664,
 2873,
 6519,
 4319,
 2967,
 4556,
 5738,
 5938,
 4493,
 6674,
 234,
 2227,
 6140,
 6368,
 447,
 4730,
 414,
 6114,
 4799,
 5895,
 508,
 4189,
 5240,
 2913,
 6109,
 764,
 5437,
 4906,
 708,
 4357,
 3673,
 2051,
 3505,
 1254,
 432,
 5963,
 4153,
 2572,
 5659,
 4517,
 707]

In [184]:
new_data = pd.read_csv('./data/train.csv')

for i in range(154):
    new_data.iloc[ordered_label_issues[i], 2] = np.argmax(model_logits[ordered_label_issues[i]])

In [185]:
new_data.to_csv('./data/train_LED_roberta-large.csv')