In [1]:
import cleanlab

In [1]:
import os
import random
from tqdm import tqdm
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader


from sklearn.model_selection import train_test_split, StratifiedKFold

from cleanlab.filter import find_label_issues

In [2]:
SEED = 456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, './')
OUTPUT_DIR = os.path.join(BASE_DIR, '../output/labelerrordetect')

In [5]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []
        self.labels = []
        
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
            self.inputs.append(tokenized_input)
            self.labels.append(torch.tensor(label))
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0),  
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }
    
    def __len__(self):
        return len(self.labels)

In [6]:
model_name = 'klue/bert-base'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initial

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
dataset_train, dataset_valid = train_test_split(data, test_size=0.2, stratify=data['target'],random_state=SEED)

In [12]:
from sklearn.model_selection import train_test_split

# Splitting the data for demonstration purposes
# train_data, val_data = train_test_split(dataset_train, test_size=0.2, stratify=dataset_train['target'], random_state=SEED)

# Convert to datasets that your model can understand (you've already defined BERTDataset)
train_dataset = BERTDataset(dataset_train, tokenizer)
val_dataset = BERTDataset(dataset_valid, tokenizer)

In [9]:
f1 = evaluate.load('f1')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')

In [10]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    logging_strategy='steps',
    evaluation_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED
)

## Base train

In [77]:
# Assuming you have the Trainer and TrainingArguments set up as before
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Get predictions
raw_pred, _, _ = trainer.predict(data)
predicted_probabilities = torch.nn.functional.softmax(torch.from_numpy(raw_pred), dim=-1).numpy()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,F1
100,0.2564,0.579752,0.849797
200,0.2372,0.582167,0.85147
300,0.2157,0.592979,0.852502


KeyError: 0

In [69]:
print(predicted_probabilities.shape)

(1400, 7)


In [70]:
predicted_indices = predicted_probabilities.tolist()


In [71]:
predicted_indices[:10]

[[0.004360963590443134,
  0.006389416754245758,
  0.0060189333744347095,
  0.0014049927704036236,
  0.004288696218281984,
  0.00175621104426682,
  0.9757807850837708],
 [0.7953826189041138,
  0.1586526483297348,
  0.015102075412869453,
  0.0058844927698373795,
  0.014815210364758968,
  0.005386636592447758,
  0.00477625010535121],
 [0.004212699830532074,
  0.008883902803063393,
  0.0038658350240439177,
  0.01026345044374466,
  0.9609853029251099,
  0.006625839043408632,
  0.005162952467799187],
 [0.7276847958564758,
  0.03390750288963318,
  0.20110687613487244,
  0.007409890182316303,
  0.012534319423139095,
  0.005419180728495121,
  0.011937368661165237],
 [0.9557198286056519,
  0.023168807849287987,
  0.006172679830342531,
  0.006018315441906452,
  0.004839946515858173,
  0.0014716783771291375,
  0.0026087539736181498],
 [0.004116144496947527,
  0.9716964960098267,
  0.009814871475100517,
  0.007066609803587198,
  0.0027463610749691725,
  0.0018550943350419402,
  0.002704543061554432

In [78]:
from cleanlab.filter import find_label_issues

ordered_label_issues = find_label_issues(
    labels=dataset_valid['target'], #데이터셋 라벨
    pred_probs=predicted_probabilities, #정답 예측 확률
    return_indices_ranked_by='self_confidence',
)

head_issues=ordered_label_issues[:3]
for issue in head_issues:
    print('input text:',dataset_valid.iloc[issue]['text'])
    print('label:',dataset_valid.iloc[issue]['target'])
    print('------------------')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

input text: 판문점 선언 함께 걷는 남북정상
label: 5
------------------
input text: 공시가 인상에 경기침체속 임대료도 오르나 촉각…매물 늘 듯
label: 5
------------------
input text: 시리아 대통령 내전 뒤 첫 이란 방문…수뇌부 환대종합
label: 1
------------------


In [28]:
head_issues

NameError: name 'head_issues' is not defined

In [75]:
len(ordered_label_issues)

130

1400 130 10% 20% ~280

->어떻게 할 것인가?
지울것인가?
얼마나지울것인가?

In [74]:
from cleanlab.filter import get_label_quality_scores
scores = get_label_quality_scores(
    labels=dataset_valid['target'], #데이터셋 라벨
    pred_probs=predicted_probabilities, #정답 예측 확률
    method='self_confidence',
    )
print(scores)

[0.9757808  0.7953826  0.9609853  ... 0.972451   0.94867504 0.93196356]


In [58]:
val_dataset

<__main__.BERTDataset at 0x7f53653d5be0>

In [66]:
dataset_valid_values = dataset_valid['target'].values
dataset_valid_values.shape

(1400,)

In [60]:
print(len(val_dataset), len(dataset_valid['target']))


1120 1400


## KFold Train

In [13]:
problem_list = list()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
batch_size = 32

for i, (train_idx, val_idx) in enumerate(skf.split(data, data.target)):
    data_train = BERTDataset(data.iloc[train_idx], tokenizer)
    data_val = BERTDataset(data.iloc[val_idx], tokenizer)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=data_train,
        eval_dataset=data_val,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )
    
    trainer.train()
    
    model.eval()
    
    found_list = torch.from_numpy(trainer.predict(data_val).predictions)
    found_list = torch.softmax(found_list, dim=-1).detach().numpy()
    problem_list.extend(zip(val_idx, found_list))





***** Running training *****
  Num examples = 5600
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 350
  Number of trainable parameters = 110622727
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1
100,1.043,0.639645,0.814513
200,0.5608,0.58381,0.831742
300,0.4555,0.576625,0.836553


***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /opt/ml/data/../output/labelerrordetect/checkpoint-100
Configuration saved in /opt/ml/data/../output/labelerrordetect/checkpoint-100/config.json
Model weights saved in /opt/ml/data/../output/labelerrordetect/checkpoint-100/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /opt/ml/data/../output/labelerrordetect/checkpoint-200
Configuration saved in /opt/ml/data/../output/labelerrordetect/checkpoint-200/config.json
Model weights saved in /opt/ml/data/../output/labelerrordetect/checkpoint-200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /opt/ml/data/../output/labelerrordetect/checkpoint-300
Configuration saved in /opt/ml/data/../output/labelerrordetect/checkpoint-300/config.json
Model weights saved in /opt/ml/data/../output/labelerrordetect/checkpoint-300/p

***** Running training *****
  Num examples = 5600
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 350
  Number of trainable parameters = 110622727


Step,Training Loss,Validation Loss,F1
100,0.4851,0.370083,0.885478
200,0.4403,0.369766,0.894182
300,0.3507,0.390189,0.880999


***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /opt/ml/data/../output/labelerrordetect/checkpoint-100
Configuration saved in /opt/ml/data/../output/labelerrordetect/checkpoint-100/config.json
Model weights saved in /opt/ml/data/../output/labelerrordetect/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [/opt/ml/data/../output/labelerrordetect/checkpoint-200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /opt/ml/data/../output/labelerrordetect/checkpoint-200
Configuration saved in /opt/ml/data/../output/labelerrordetect/checkpoint-200/config.json
Model weights saved in /opt/ml/data/../output/labelerrordetect/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [/opt/ml/data/../output/labelerrordetect/checkpoint-300] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to 

***** Running training *****
  Num examples = 5600
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 350
  Number of trainable parameters = 110622727


Step,Training Loss,Validation Loss,F1
100,0.3882,0.302326,0.923518
200,0.3796,0.300287,0.925404
300,0.2825,0.290133,0.927722


***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /opt/ml/data/../output/labelerrordetect/checkpoint-100
Configuration saved in /opt/ml/data/../output/labelerrordetect/checkpoint-100/config.json
Model weights saved in /opt/ml/data/../output/labelerrordetect/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [/opt/ml/data/../output/labelerrordetect/checkpoint-200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /opt/ml/data/../output/labelerrordetect/checkpoint-200
Configuration saved in /opt/ml/data/../output/labelerrordetect/checkpoint-200/config.json
Model weights saved in /opt/ml/data/../output/labelerrordetect/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [/opt/ml/data/../output/labelerrordetect/checkpoint-300] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to 

***** Running training *****
  Num examples = 5600
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 350
  Number of trainable parameters = 110622727


Step,Training Loss,Validation Loss,F1
100,0.2907,0.2463,0.938353
200,0.2568,0.228492,0.944044
300,0.2048,0.2279,0.942032


***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /opt/ml/data/../output/labelerrordetect/checkpoint-100
Configuration saved in /opt/ml/data/../output/labelerrordetect/checkpoint-100/config.json
Model weights saved in /opt/ml/data/../output/labelerrordetect/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [/opt/ml/data/../output/labelerrordetect/checkpoint-200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /opt/ml/data/../output/labelerrordetect/checkpoint-200
Configuration saved in /opt/ml/data/../output/labelerrordetect/checkpoint-200/config.json
Model weights saved in /opt/ml/data/../output/labelerrordetect/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [/opt/ml/data/../output/labelerrordetect/checkpoint-300] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to 

***** Running training *****
  Num examples = 5600
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 350
  Number of trainable parameters = 110622727


Step,Training Loss,Validation Loss,F1
100,0.2489,0.148463,0.967862
200,0.2106,0.137782,0.967131
300,0.1616,0.132907,0.96933


***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /opt/ml/data/../output/labelerrordetect/checkpoint-100
Configuration saved in /opt/ml/data/../output/labelerrordetect/checkpoint-100/config.json
Model weights saved in /opt/ml/data/../output/labelerrordetect/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [/opt/ml/data/../output/labelerrordetect/checkpoint-200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /opt/ml/data/../output/labelerrordetect/checkpoint-200
Configuration saved in /opt/ml/data/../output/labelerrordetect/checkpoint-200/config.json
Model weights saved in /opt/ml/data/../output/labelerrordetect/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [/opt/ml/data/../output/labelerrordetect/checkpoint-300] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to 

In [14]:
problem_list

[(0,
  array([0.01099389, 0.9158487 , 0.03630439, 0.01522569, 0.00720511,
         0.00718246, 0.00723976], dtype=float32)),
 (2,
  array([0.8651484 , 0.07627812, 0.02127753, 0.0156678 , 0.01096719,
         0.00553323, 0.0051278 ], dtype=float32)),
 (4,
  array([0.00712623, 0.00464951, 0.00764176, 0.00674135, 0.9603604 ,
         0.00529827, 0.00818236], dtype=float32)),
 (6,
  array([0.01073612, 0.96319115, 0.01000569, 0.00696896, 0.00333821,
         0.00284758, 0.00291237], dtype=float32)),
 (12,
  array([0.00894506, 0.00710012, 0.00649693, 0.00496125, 0.93244696,
         0.00826156, 0.03178815], dtype=float32)),
 (18,
  array([0.00821571, 0.02099655, 0.3974594 , 0.0141196 , 0.00483011,
         0.00699622, 0.5473825 ], dtype=float32)),
 (31,
  array([0.00992622, 0.00652307, 0.03450454, 0.92610496, 0.00543082,
         0.00747162, 0.0100388 ], dtype=float32)),
 (35,
  array([0.00414896, 0.00575699, 0.01446802, 0.00230293, 0.0037607 ,
         0.00407494, 0.9654875 ], dtype=float32

In [15]:
sorted_problem_list = sorted(problem_list)

In [16]:
sorted_problem_list

[(0,
  array([0.01099389, 0.9158487 , 0.03630439, 0.01522569, 0.00720511,
         0.00718246, 0.00723976], dtype=float32)),
 (1,
  array([0.43876755, 0.17487185, 0.05954035, 0.3039769 , 0.01062761,
         0.00538721, 0.00682856], dtype=float32)),
 (2,
  array([0.8651484 , 0.07627812, 0.02127753, 0.0156678 , 0.01096719,
         0.00553323, 0.0051278 ], dtype=float32)),
 (3,
  array([0.01106682, 0.00237731, 0.00323946, 0.97921693, 0.00133124,
         0.00134292, 0.00142546], dtype=float32)),
 (4,
  array([0.00712623, 0.00464951, 0.00764176, 0.00674135, 0.9603604 ,
         0.00529827, 0.00818236], dtype=float32)),
 (5,
  array([0.0020214 , 0.00391662, 0.00354655, 0.00160949, 0.00149502,
         0.98555225, 0.00185861], dtype=float32)),
 (6,
  array([0.01073612, 0.96319115, 0.01000569, 0.00696896, 0.00333821,
         0.00284758, 0.00291237], dtype=float32)),
 (7,
  array([0.00393656, 0.98176783, 0.00439526, 0.00269976, 0.00208232,
         0.00281363, 0.00230462], dtype=float32)),


In [17]:
data["prob"] = [prob for idx, prob in sorted_problem_list]
pred_probs = np.stack(data["prob"])
ordered_label_issues = find_label_issues(labels=data['target'], pred_probs=pred_probs, return_indices_ranked_by='self_confidence')
data["new_label"] = data['target']

print("ordered_label_issues", len(ordered_label_issues))
print(data.columns)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [26]:
ordered_label_issues

array([3934, 6589,  536, 1411, 6856, 6127, 6349, 6585, 1925,   76, 6808,
       5761, 1753, 5557, 6619, 1922, 6311, 3703, 3777, 1650, 4122, 4323,
       1173, 3095, 1856, 5779, 6566, 6287, 2778, 1371, 3790, 3747, 2879,
       6034, 1105, 4990, 1071, 5673,  213, 1526, 5560, 1401,   64, 2981,
       1426, 1321, 1034, 3311, 5002, 3756, 3320, 2350, 5748,  342, 4872,
       2206, 4313,  135, 2864,  477, 3396, 4266, 3558, 5540,  963, 6091,
       4477, 5052, 1410, 6458, 2351, 6582, 2089, 2279, 5206, 2367, 1806,
       4131, 6666, 5671, 4706, 6939, 3132, 2649, 3503, 5898, 2293, 3776,
       2788, 6546, 3622, 3093, 3941, 6638, 5881, 6698, 4832, 1382, 5828,
       6969, 2899, 1266, 2480, 5440, 3921, 2974, 1480, 5057, 3011, 6838,
        978, 5199, 6131, 5983, 6812, 2595, 1560, 4466, 3221, 2983, 2909,
       3942, 4778, 2562, 1146, 6600,  791,  520, 4884, 5205, 5420, 2724,
       4727, 4685, 5418, 3342, 1865, 2829,  290, 3596, 3580, 6936,  157,
       3957, 3195, 2959, 5226, 3851, 3757, 5813, 40

In [27]:
7000 * 0.2

1400.0

In [18]:
for issue_idx in ordered_label_issues:
    new_label = data.iloc[issue_idx, 5].argmax(-1)
    data.iloc[issue_idx, 2] = new_label

data.to_csv("train_noise_corrected.csv", index=False)

In [19]:
noisedel_df = pd.read_csv("train_noise_corrected.csv")

In [20]:
different_labels = noisedel_df[noisedel_df['target'] != noisedel_df['new_label']]
different_labels


Unnamed: 0,ID,text,target,url,date,prob,new_label
35,ynat-v1_train_00035,與 대구·경북 공천면접심사…현역 vs 진박 격돌,6,https://news.naver.com/main/read.nhn?mode=LS2D...,2016.02.26. 오전 5:00,[0.00414896 0.00575699 0.01446802 0.00230293 0...,2
64,ynat-v1_train_00064,나경복 개인 첫 트리플크라운…우리카드 구단 최다 타이 5연승,5,https://sports.news.naver.com/news.nhn?oid=001...,2019.11.27 20:28,[0.00334295 0.00389253 0.00430756 0.00442711 0...,0
76,ynat-v1_train_00076,눈물 쏟은 문경은 선수 때 우승하고도 안 울었는데…,5,https://sports.news.naver.com/news.nhn?oid=001...,2018.04.18 21:50,[0.00662148 0.00491064 0.01998748 0.01372119 0...,0
78,ynat-v1_train_00078,코로나 백신 노령층에 효율적이지 않을 수도 부산대 연구팀,0,https://news.naver.com/main/read.nhn?mode=LS2D...,2020.12.08. 오후 4:44,[0.91792226 0.0099129 0.04564395 0.00899207 0...,2
111,ynat-v1_train_00111,자고나면 한 명씽 느느 녜비후보…열기 다라오르는 서부산,3,https://news.naver.com/main/read.nhn?mode=LS2D...,2016.01.29. 오전 10:02,[0.0190297 0.00680997 0.02860258 0.9077913 0...,6
...,...,...,...,...,...,...,...
6877,ynat-v1_train_06877,네이버 공공기관용 클라우드 서비스 개시,0,https://news.naver.com/main/read.nhn?mode=LS2D...,2017.07.19. 오전 10:21,[0.9250926 0.05409564 0.00769059 0.00235537 0...,1
6879,ynat-v1_train_06879,경북지사·포항시장 국무총리에 실질적 지진피해 구제 요청,2,https://news.naver.com/main/read.nhn?mode=LS2D...,2020.08.11. 오전 10:53,[0.01054969 0.02327736 0.7922866 0.01676239 0...,6
6936,ynat-v1_train_06936,줄줄이 잘리는 MLB 감독들…다저스 감독도 안심 못해,5,https://sports.news.naver.com/news.nhn?oid=001...,2019.09.30 10:16,[0.00789833 0.00579681 0.00540307 0.00443842 0...,2
6939,ynat-v1_train_06939,러시아 어린이 오디션 프로그램 우승자 투표 조작으로 떠들썩,4,https://news.naver.com/main/read.nhn?mode=LS2D...,2019.05.17. 오전 1:03,[0.04126047 0.00895448 0.020823 0.28170574 0...,0


In [21]:
len(different_labels)

316

In [22]:
len(different_labels) / len(noisedel_df)

0.045142857142857144

: 