# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [15]:
try:
    from google.colab import drive
    drive.mount('/content/drive')

    %cd /content/drive/MyDrive/newstopic/code
    !pip install -r requirements.txt
except:
    pass

In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

from utils import *
import matplotlib.pyplot as plt
import seaborn as sns

## Set Hyperparameters

In [17]:
# Fix every random number generator used in this notebook to one seed so
# that runs are repeatable.
SEED = 456
for _seed_everything in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_everything(SEED)

In [18]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

In [19]:
# Data and output directories are siblings of the current working directory.
BASE_DIR = os.getcwd()
DATA_DIR, OUTPUT_DIR = (
    os.path.join(BASE_DIR, relative) for relative in ('../data', '../output')
)

## Load Tokenizer and Model

In [20]:
# Classifier weights are loaded from a local checkpoint directory, while the
# tokenizer comes from the 'klue/bert-base' identifier.
# NOTE(review): presumably the checkpoint was trained with this tokenizer's
# vocabulary -- confirm before swapping either one.
model_name = '../model/rlm/checkpoint-200'
tokenizer_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# The classification head is configured for 7 labels and moved to DEVICE.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

## Define Dataset

In [21]:
class BERTDataset(Dataset):
    """Dataset that tokenizes every (text, target) pair up front.

    `data` must support `data['text']` and `data['target']`; the two are
    walked in lockstep. Each text is tokenized on its own with
    `padding='max_length'`, `truncation=True` and `return_tensors='pt'`.
    """

    def __init__(self, data, tokenizer):
        encoded_pairs = [
            (
                tokenizer(sentence, padding='max_length', truncation=True, return_tensors='pt'),
                torch.tensor(target),
            )
            for sentence, target in zip(data['text'], data['target'])
        ]
        self.inputs = [encoding for encoding, _ in encoded_pairs]
        self.labels = [target for _, target in encoded_pairs]

    def __getitem__(self, idx):
        """Return one example with dimension 0 squeezed out of each tensor."""
        encoding = self.inputs[idx]
        target = self.labels[idx]
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': target.squeeze(0),
        }

    def __len__(self):
        """Number of stored examples."""
        return len(self.labels)

In [22]:
class MyBERTDataset(Dataset):
    """Dataset variant whose items keep a leading dimension of size 1.

    Each text is tokenized on its own with `padding=True`,
    `truncation=True` and `return_tensors='pt'`; the tokenizer output is
    returned as-is (no squeeze) and every label is stored reshaped with
    `view(1, -1)`.
    """

    def __init__(self, data, tokenizer):
        encoded_pairs = [
            (
                tokenizer(sentence, padding=True, truncation=True, return_tensors='pt'),
                torch.tensor(target).view(1, -1),
            )
            for sentence, target in zip(data['text'], data['target'])
        ]
        self.inputs = [encoding for encoding, _ in encoded_pairs]
        self.labels = [target for _, target in encoded_pairs]

    def __getitem__(self, idx):
        """Return the stored tensors for example `idx` in a fresh dict."""
        encoding = self.inputs[idx]
        target = self.labels[idx]
        return {
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'labels': target,
        }

    def __len__(self):
        """Number of stored examples."""
        return len(self.labels)

In [23]:
# Collator built from the shared tokenizer.
# NOTE(review): not referenced by any other cell visible in this notebook.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [24]:
f1 = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Return the macro-averaged F1 for an evaluation batch.

    `eval_pred` unpacks into (scores, references); the predicted class of
    each row is the argmax of the scores along axis 1.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return f1.compute(
        predictions=predicted_classes,
        references=references,
        average='macro',
    )


## Train Model

In [25]:
## Turn off Weights & Biases via its environment variable.
os.environ['WANDB_DISABLED'] = 'true'

In [95]:
import torch
import torch.nn.functional as F
from sklearn.base import BaseEstimator, ClassifierMixin

class CleanlabModel(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around a sequence-classification model.

    It exposes `fit` / `predict_proba` / `predict` so the model can be used
    as the `clf` of cleanlab's `CleanLearning`.

    Args:
        model: Model called as `model(**inputs)` whose output has `.logits`.
        optimizer: Optimizer already bound to `model`'s parameters.
        criterion: Loss called as `criterion(logits, labels)`.

    `X` is always an iterable of per-example dicts of tensors (for example
    `MyBERTDataset` items). The `'labels'` entry is the training target and
    is never forwarded to the model.
    """

    def __init__(self, model, optimizer, criterion):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.epochs = 2
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def _model_inputs(self, x):
        """Copy `x` without 'labels', moving tensors to `self.device`."""
        # A new dict is built so the caller's example is never mutated.
        # Deleting 'labels' in place broke the second epoch whenever X
        # yielded the same dict objects again.
        return {
            key: value.to(self.device) if torch.is_tensor(value) else value
            for key, value in x.items()
            if key != 'labels'
        }

    def fit(self, X, Y):
        """Train for `self.epochs` passes over `X`, one example per step.

        Args:
            X: Iterable of example dicts; each must contain 'labels'.
            Y: Ignored. The targets are read from each example's 'labels'
                entry; the parameter exists for the scikit-learn signature.

        Returns:
            self, following the scikit-learn estimator convention.
        """
        self.model.train()
        self.model.to(self.device)

        for epoch in range(self.epochs):
            for x in X:
                y = x['labels'].to(self.device)
                inputs = self._model_inputs(x)
                self.optimizer.zero_grad()
                outputs = self.model(**inputs)
                # view(-1) flattens a (1, 1) label to (1,) exactly like the
                # previous view(1), and also accepts more than one label.
                loss = self.criterion(outputs.logits, y.view(-1))
                loss.backward()
                self.optimizer.step()
        return self

    def predict_proba(self, X):
        """Return softmax class probabilities stacked over all examples.

        Examples may or may not carry a 'labels' entry; it is ignored.

        Raises:
            ValueError: from `np.concatenate` when `X` yields no examples.
        """
        self.model.eval()
        self.model.to(self.device)

        probs_list = []
        with torch.no_grad():
            for x in X:
                outputs = self.model(**self._model_inputs(x))
                probs = F.softmax(outputs.logits, dim=-1).detach().cpu().numpy()
                probs_list.append(probs)
        return np.concatenate(probs_list, axis=0)

    def predict(self, X):
        """Return the index of the most probable class for each example."""
        probs = self.predict_proba(X)
        return probs.argmax(axis=-1)

## Evaluate Model

In [None]:
### cleanlab filter
from cleanlab.filter import find_label_issues
from cleanlab.classification import CleanLearning




def label_print(model, file_name):
    """Print the top suspected label issues and a cleanlab health summary.

    The CSV at `file_name` is read (its 'text' and 'target' columns are
    used), every row is scored by `model` one example at a time, and the
    resulting softmax probabilities are handed to cleanlab.

    Args:
        model: Classifier whose forward output exposes `.logits`.
        file_name: Path of the CSV file to inspect.
    """
    model.eval()
    model.to(DEVICE)

    frame = pd.read_csv(file_name)
    examples = BERTDataset(frame, tokenizer)

    per_example_probs = []
    with torch.no_grad():
        for example in examples:
            # Give every tensor a leading batch dimension of 1 and move it
            # to the model's device.
            model_inputs = {
                key: tensor.view(1, -1).to(DEVICE)
                for key, tensor in example.items()
            }
            logits = model(**model_inputs).logits
            per_example_probs.append(torch.nn.functional.softmax(logits, dim=-1))

    pred_probs = torch.cat(per_example_probs, dim=0).detach().cpu().numpy()

    ranked_issue_indices = find_label_issues(
        labels=frame['target'],
        pred_probs=pred_probs,
        return_indices_ranked_by='self_confidence',
    )

    # Show the first three entries of the ranking.
    for position in ranked_issue_indices[:3]:
        print('input_text:', frame.iloc[position]['text'])
        print('label_text:', frame.iloc[position]['target'])
        print('-------------------------------------------------')

    from cleanlab.dataset import health_summary
    # Class names are the integer ids 0..6 (seven classes, hard-coded).
    health_summary(frame['target'], pred_probs, class_names=list(range(7)))


In [101]:
def relabel(model, file_name):
    """Fit cleanlab's CleanLearning on a CSV and return its predictions.

    Steps:
      1. Score every row of the CSV with `model` to get class probabilities.
      2. Hand those probabilities to `CleanLearning.fit`, which trains the
         wrapped model (via `CleanlabModel`) on the data.
      3. Predict a label for every row with the fitted learner.

    Args:
        model: Classifier whose forward output exposes `.logits`; it is
            fine-tuned in place by step 2.
        file_name: Path of a CSV with 'text' and 'target' columns.

    Returns:
        Tuple `(cleanlab_relabel, new_labels)`: the fitted CleanLearning
        object and its predicted label for every row.

    NOTE(review): cleanlab expects `pred_probs` to be out-of-sample. Whether
    `model` has already seen this file cannot be told from here -- confirm.
    """
    # Match label_print: score in eval mode, on the device the inputs go to.
    model.eval()
    model.to(DEVICE)

    data_train = pd.read_csv(file_name)
    dataset_train = BERTDataset(data_train, tokenizer)

    train_pred_probs = []
    with torch.no_grad():
        for batch in dataset_train:
            # Only logits are needed, so the labels are not forwarded.
            outputs = model(
                input_ids=batch['input_ids'].view(1, -1).to(DEVICE),
                attention_mask=batch['attention_mask'].view(1, -1).to(DEVICE),
            )
            train_pred_probs.append(torch.nn.functional.softmax(outputs.logits, dim=-1))

    train_pred_probs = torch.cat(train_pred_probs, dim=0).detach().cpu().numpy()

    print("relabeling start")
    # A separate name keeps the `model` parameter from being shadowed.
    wrapped_model = CleanlabModel(
        model,
        torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01),
        torch.nn.CrossEntropyLoss(),
    )
    cleanlab_relabel = CleanLearning(clf=wrapped_model, seed=SEED)

    # Tokenize once and reuse for both fit and predict; the dataset hands
    # out a fresh dict per item, so reuse is safe.
    relabel_dataset = MyBERTDataset(data_train, tokenizer)
    cleanlab_relabel.fit(relabel_dataset, data_train['target'], pred_probs=train_pred_probs)

    new_labels = cleanlab_relabel.predict(relabel_dataset)
    print("new_labels", new_labels)
    return cleanlab_relabel, new_labels

In [None]:
### cleanlab filter
from cleanlab.filter import find_label_issues

# Report suspected label issues and the health summary for the base CSV.
label_print(model, '../data/preprocess/ascii_ratio_higher_20.csv')



In [102]:
# Run the relabeling pipeline on the base CSV; keeps both the fitted
# CleanLearning object and the predicted labels.
cleanlab_relabel, new_labels = relabel(model, '../data/preprocess/ascii_ratio_higher_20.csv')

relabeling start


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

new_labels [4 3 2 ... 1 1 4]


In [105]:
# Sanity check: how many labels relabel() returned.
print(len(new_labels))

1639


In [None]:
# Load the base CSV into a DataFrame.
train_data = pd.read_csv('../data/preprocess/ascii_ratio_higher_20.csv')

In [None]:
# Persist the relabeling result: the original labels are kept in a
# "_not_relabeled" backup and the relabeled targets replace the base file.
source_path = '../data/preprocess/ascii_ratio_higher_20.csv'
backup_path = '../data/preprocess/ascii_ratio_higher_20_not_relabeled.csv'

train_data = pd.read_csv(source_path)
new_train_data = train_data.copy()
new_train_data['target'] = new_labels

# The base file is overwritten below, so on a re-run `train_data` already
# holds relabeled targets. Never clobber an existing backup with it.
if not os.path.exists(backup_path):
    # index=False: do not write the row index, which would come back as an
    # extra 'Unnamed: 0' column on the next read.
    train_data.to_csv(backup_path, index=False)
new_train_data.to_csv(source_path, index=False)

In [4]:
# Count the rows whose target differs between the relabeled file and the
# backup of the original labels.
new_data = pd.read_csv('../data/preprocess/ascii_ratio_higher_20.csv')
old_data = pd.read_csv('../data/preprocess/ascii_ratio_higher_20_not_relabeled.csv')
c = sum(
    1
    for relabeled, original in zip(new_data.target, old_data.target)
    if relabeled != original
)
print(c)

127


In [None]:
### cleanlab filter
from cleanlab.filter import find_label_issues

# Report suspected label issues and the health summary for the train split.
label_print(model, '../data/preprocess/ascii_ratio_higher_20_train.csv')


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

input_text: 네이&3美 유11인;지Nj2회_A{문_2편j호평
label_text: 4
-------------------------------------------------
input_text: =nb돌파' GO까…與3검`~혁H先처리1카드: r?~ 깬E
label_text: 3
-------------------------------------------------
input_text: 메시·호날두 UEFA 올해의 팀에 선정…EPL 선수 제로
label_text: 5
-------------------------------------------------
----------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary          |
|   for your dataset with 1,153 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.        |
----------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,3,3,38,41,0.228916,0.242604,0.771084
1,6,6,34,16,0.225166,0.120301,0.774834
2,5,5,32,38,0.198758,0.227545,0.801242
3,4,4,28,48,0.164706,0.252632,0.835294
4,0,0,28,10,0.16092,0.064103,0.83908
5,2,2,13,13,0.079268,0.079268,0.920732
6,1,1,11,18,0.065868,0.103448,0.934132



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,3,4,3,4,25,0.021683
1,4,5,4,5,20,0.017346
2,3,5,3,5,19,0.016479
3,5,6,5,6,17,0.014744
4,0,3,0,3,13,0.011275
5,1,4,1,4,10,0.008673
6,3,6,3,6,10,0.008673
7,2,6,2,6,9,0.007806
8,0,5,0,5,9,0.007806
9,4,6,4,6,9,0.007806



 * Overall, about 11% (127 of the 1,153) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.89.

Generated with <3 from Cleanlab.



In [17]:


# Same report for the "_mlm_train" CSV.
label_print(model, '../data/preprocess/ascii_ratio_higher_20_mlm_train.csv')


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

input_text: 메시 · 호날두 ·의FFA리그 올해의 축구 팀에 공식 선정 … 한국의의의 선수는 제로
label_text: 5
-------------------------------------------------
input_text: 오늘은안당 안는는 안다 … 與 [UNK] 검 [UNK] [UNK] [UNK] [UNK] 先 처리 대통령 카드 안 안 안 안 안을 [UNK]다
label_text: 3
-------------------------------------------------
input_text: 메리츠종금은 내년부터 코스피 ·에월월월 ∼월월월월의.
label_text: 1
-------------------------------------------------
----------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary          |
|   for your dataset with 1,153 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.        |
----------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,3,3,40,30,0.240964,0.192308,0.759036
1,5,5,35,34,0.217391,0.2125,0.782609
2,4,4,31,43,0.182353,0.236264,0.817647
3,6,6,25,15,0.165563,0.106383,0.834437
4,0,0,27,12,0.155172,0.075472,0.844828
5,2,2,16,20,0.097561,0.119048,0.902439
6,1,1,13,33,0.077844,0.176471,0.922156



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,4,5,4,5,27,0.023417
1,3,5,3,5,20,0.017346
2,1,4,1,4,15,0.01301
3,3,4,3,4,15,0.01301
4,0,3,0,3,14,0.012142
5,2,6,2,6,10,0.008673
6,5,6,5,6,10,0.008673
7,1,3,1,3,9,0.007806
8,0,4,0,4,9,0.007806
9,1,6,1,6,7,0.006071



 * Overall, about 11% (122 of the 1,153) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.89.

Generated with <3 from Cleanlab.



In [18]:
# Same report for the "_mlm_btm_train" CSV.
label_print(model, '../data/preprocess/ascii_ratio_higher_20_mlm_btm_train.csv')


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

input_text: 메시 · 호날두 ·의FFA리그 올해의 축구 팀에 공식 선정 .. 한국의 선수는 제로
label_text: 5
-------------------------------------------------
input_text: 경기의 첫 번째 활약은 센 투 업하다 ..
label_text: 3
-------------------------------------------------
input_text: 오늘은안당 안가는 아나.. 與 [UNK] 검 [UNK] [UNK] [UNK] [UNK] 先 처리 대통령 카드 안 안을 [UNK]다
label_text: 3
-------------------------------------------------
----------------------------------------------------------
|  Generating a Cleanlab Dataset Health Summary          |
|   for your dataset with 1,153 examples and 7 classes.  |
|  Note, Cleanlab is not a medical doctor... yet.        |
----------------------------------------------------------

Overall Class Quality and Noise across your dataset (below)
------------------------------------------------------------ 



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0,Class Name,Class Index,Label Issues,Inverse Label Issues,Label Noise,Inverse Label Noise,Label Quality Score
0,3,3,50,29,0.301205,0.2,0.698795
1,5,5,40,55,0.248447,0.3125,0.751553
2,4,4,39,56,0.229412,0.299465,0.770588
3,6,6,33,18,0.218543,0.132353,0.781457
4,0,0,38,22,0.218391,0.139241,0.781609
5,2,2,19,17,0.115854,0.104938,0.884146
6,1,1,8,30,0.047904,0.15873,0.952096



Class Overlap. In some cases, you may want to merge classes in the top rows (below)
-----------------------------------------------------------------------------------



Unnamed: 0,Class Name A,Class Name B,Class Index A,Class Index B,Num Overlapping Examples,Joint Probability
0,4,5,4,5,36,0.031223
1,3,5,3,5,23,0.019948
2,0,3,0,3,23,0.019948
3,3,4,3,4,16,0.013877
4,5,6,5,6,16,0.013877
5,1,4,1,4,14,0.012142
6,0,4,0,4,14,0.012142
7,0,5,0,5,11,0.00954
8,4,6,4,6,11,0.00954
9,2,6,2,6,8,0.006938



 * Overall, about 13% (148 of the 1,153) labels in your dataset have potential issues.
 ** The overall label health score for this dataset is: 0.87.

Generated with <3 from Cleanlab.

