## 1. Library Import

In [1]:
import os
import torch

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES']= '0'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Device:', device)  # 출력결과: cuda 
print('Count of using GPUs:', torch.cuda.device_count()) 
print('Current cuda device:', torch.cuda.current_device()) 

Device: cuda
Count of using GPUs: 1
Current cuda device: 0


In [2]:
import pandas as pd
import numpy as np
import random
import pickle
from tqdm.notebook import tqdm
tqdm.pandas()

from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction

from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.sampler import *

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# import torch
# torch.cuda.is_available()

## 2. Hyper-Parameter

In [3]:
tokenizer_path = 'beomi/kcbert-base'
weight_path = './fp_result/law_further_pretrained/checkpoint-50000'

class_n = 5
max_len = 300
epochs = 10
batch_size = 32

In [4]:
seed = 42

def set_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False 

set_seeds(seed)

## 3. Data Load

In [None]:
# law_data = pd.read_csv('./data/law_ft.csv')
# del law_data['Unnamed: 0']

# law_data

In [None]:
# law_data = law_data.sample(frac=1, random_state=42).reset_index(drop=True)

# law_data

## 4. Data Split

In [None]:
# law_data['label'].value_counts()

In [None]:
# law_ft_train = pd.DataFrame(columns = law_data.columns)
# law_ft_val = pd.DataFrame(columns = law_data.columns)
# law_ft_test = pd.DataFrame(columns = law_data.columns)

# for i in range(5):
#     temp = law_data[law_data['label']==i]
    
#     law_ft_train = pd.concat([law_ft_train, temp[:4800]])
#     law_ft_val = pd.concat([law_ft_val, temp[4800:-1600]])
#     law_ft_test = pd.concat([law_ft_test, temp[-1600:]])

In [None]:
# print(len(law_ft_train), len(law_ft_val), len(law_ft_test))

In [None]:
# law_ft_train.to_csv('./data/law_ft_train.csv')
# law_ft_val.to_csv('./data/law_ft_val.csv')
# law_ft_test.to_csv('./data/law_ft_test.csv')

In [5]:
law_ft_train = pd.read_csv('./data/law_ft_train.csv')
law_ft_val = pd.read_csv('./data/law_ft_val.csv')
law_ft_test = pd.read_csv('./data/law_ft_test.csv')

In [6]:
train_text = law_ft_train['pre_content']
train_label = law_ft_train['label']

val_text = law_ft_val['pre_content']
val_label = law_ft_val['label']

test_text = law_ft_test['pre_content']
test_label = law_ft_test['label']

## 5. Pre-processing

In [7]:
def tokenizer_(text, max_len, tokenizer):
    encoded_dict = tokenizer.encode_plus(text = text,
                                         add_special_tokens = True,
                                         max_length = max_len,
                                         padding = 'max_length',
                                         return_attention_mask = True,
                                         truncation = True)
    
    input_id = encoded_dict['input_ids']
    token_type_id = encoded_dict['token_type_ids']
    attention_mask = encoded_dict['attention_mask']
    
    return input_id, token_type_id, attention_mask

In [8]:
def preprocessing_train():
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

    input_ids = []
    token_type_ids = []
    attention_masks = []
    labels = []
    
    
    for text, label in tqdm(zip(train_text, train_label)):
        try:
            input_id, token_type_id, attention_mask = tokenizer_(text, max_len, tokenizer)
            input_ids.append(input_id)
            token_type_ids.append(token_type_id)
            attention_masks.append(attention_mask)
            labels.append(label)
        except Exception as e:
            print(e)
            pass
        
    train_input_ids = np.array(input_ids, dtype=int)
    train_attention_masks = np.array(attention_masks, dtype=int)
    train_token_type_ids = np.array(token_type_ids, dtype=int)
    train_labels = np.array(labels, dtype=int)
    
    # save
    train_data = {}

    train_data['input_ids'] = train_input_ids
    train_data['token_type_ids'] = train_token_type_ids
    train_data['attention_mask'] = train_attention_masks
    train_data['targets'] = train_labels
    
    os.makedirs('./ft_data/', exist_ok=True)
    
    with open('./ft_data/law_train_data_preprocessing.pickle', 'wb') as f:
        pickle.dump(train_data, f, pickle.HIGHEST_PROTOCOL)

In [9]:
def preprocessing_val():
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

    input_ids = []
    token_type_ids = []
    attention_masks = []
    labels = []
    
    
    for text, label in tqdm(zip(val_text, val_label)):
        try:
            input_id, token_type_id, attention_mask = tokenizer_(text, max_len, tokenizer)
            input_ids.append(input_id)
            token_type_ids.append(token_type_id)
            attention_masks.append(attention_mask)
            labels.append(label)
        except Exception as e:
            print(e)
            pass
        
    val_input_ids = np.array(input_ids, dtype=int)
    val_attention_masks = np.array(attention_masks, dtype=int)
    val_token_type_ids = np.array(token_type_ids, dtype=int)
    val_labels = np.array(labels, dtype=int)
    
    # save
    val_data = {}

    val_data['input_ids'] = val_input_ids
    val_data['token_type_ids'] = val_token_type_ids
    val_data['attention_mask'] = val_attention_masks
    val_data['targets'] = val_labels
    
    os.makedirs('./ft_data/', exist_ok=True)
    
    with open('./ft_data/law_val_data_preprocessing.pickle', 'wb') as f:
        pickle.dump(val_data, f, pickle.HIGHEST_PROTOCOL)

In [10]:
def preprocessing_test():
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

    input_ids = []
    token_type_ids = []
    attention_masks = []
    labels = []
    
    
    for text, label in tqdm(zip(test_text, test_label)):
        try:
            input_id, token_type_id, attention_mask = tokenizer_(text, max_len, tokenizer)
            input_ids.append(input_id)
            token_type_ids.append(token_type_id)
            attention_masks.append(attention_mask)
            labels.append(label)
        except Exception as e:
            print(e)
            pass
        
    test_input_ids = np.array(input_ids, dtype=int)
    test_attention_masks = np.array(attention_masks, dtype=int)
    test_token_type_ids = np.array(token_type_ids, dtype=int)
    test_labels = np.array(labels, dtype=int)
    
    # save
    test_data = {}

    test_data['input_ids'] = test_input_ids
    test_data['token_type_ids'] = test_token_type_ids
    test_data['attention_mask'] = test_attention_masks
    test_data['targets'] = test_labels
    
    os.makedirs('./ft_data/', exist_ok=True)
    
    with open('./ft_data/law_test_data_preprocessing.pickle', 'wb') as f:
        pickle.dump(test_data, f, pickle.HIGHEST_PROTOCOL)

In [11]:
preprocessing_train()
preprocessing_val()
preprocessing_test()
print('Preprocessing Clear')

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Preprocessing Clear


## 6. Train

### (1) 데이터셋 만들기

In [12]:
with open('./ft_data/law_train_data_preprocessing.pickle', 'rb') as f:
    train_dict = pickle.load(f) 

with open('./ft_data/law_val_data_preprocessing.pickle', 'rb') as f:
    val_dict = pickle.load(f)     
    
with open('./ft_data/law_test_data_preprocessing.pickle', 'rb') as f:
    test_dict = pickle.load(f)    

In [13]:
class DataSet(Dataset):
    
    def __init__(self, data, test=False):
        
        self.data = data
        self.test = test
        
    def __len__(self):
        
        return self.data['input_ids'].shape[0]
    
    def __getitem__(self,idx):
        
        ids = torch.tensor(self.data['input_ids'][idx], dtype=torch.long)
        mask = torch.tensor(self.data['attention_mask'][idx], dtype=torch.long)
        token_type_ids = torch.tensor(self.data['token_type_ids'][idx], dtype=torch.long)
         
        if self.test:
            return {
                'input_ids': ids,
                'attention_mask': mask,
                'token_type_ids': token_type_ids
            }
        
        else:
            target = torch.tensor(self.data['targets'][idx], dtype=torch.long)

            return {
                    'input_ids': ids,
                    'attention_mask': mask,
                    'token_type_ids': token_type_ids,
                    'labels': target
                }

In [14]:
train_dataset = DataSet(data=train_dict)
valid_dataset = DataSet(data=val_dict)
test_dataset = DataSet(data=test_dict)

### (2) 모델

In [15]:
model = BertForSequenceClassification.from_pretrained(weight_path, 
                                                      num_labels = class_n,
                                                      problem_type = 'single_label_classification')

Some weights of the model checkpoint at ./fp_result/law_further_pretrained/checkpoint-50000 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpo

In [16]:
args = TrainingArguments(
    'law_ft_output',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = epochs,
    load_best_model_at_end = True,
)

In [17]:
def metrics(predictions, labels):
    softmax = torch.nn.Softmax()
    probs = softmax(torch.Tensor(predictions))
    pred = np.argmax(probs, axis=1)
    
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='macro')
    precision = precision_score(y_true=labels, y_pred=pred, average='macro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='macro')
    
    metrics = {'f1': f1,
               'recall': recall,
               'precision': precision,
               'accuracy': accuracy}
    return metrics


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = metrics(predictions=preds, labels=p.label_ids)
    return result

In [18]:
#forward pass
outputs = model(input_ids = train_dataset[0]['input_ids'].unsqueeze(0),
                labels=train_dataset[0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(1.4677, grad_fn=<NllLossBackward>), logits=tensor([[ 0.1709,  0.0559, -0.0397,  0.0523, -0.1175]],
       grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [19]:
trainer = Trainer(
    model,
    args,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    compute_metrics = compute_metrics
)

### (3) 학습

In [20]:
trainer.train()

***** Running training *****
  Num examples = 24000
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 7500


Epoch,Training Loss,Validation Loss,F1,Recall,Precision,Accuracy
1,1.0498,0.951158,0.616796,0.62075,0.618353,0.62075
2,0.717,0.980583,0.618507,0.6285,0.619851,0.6285
3,0.3847,1.213342,0.630592,0.638875,0.629217,0.638875
4,0.1841,1.581396,0.622881,0.630375,0.621762,0.630375
5,0.0944,2.014997,0.624632,0.631,0.624113,0.631
6,0.0568,2.373013,0.636975,0.639625,0.635661,0.639625
7,0.0344,2.657158,0.632952,0.63575,0.63667,0.63575
8,0.0193,2.846442,0.637549,0.641125,0.640368,0.641125
9,0.01,2.932945,0.639269,0.640625,0.639323,0.640625
10,0.0043,2.968847,0.638738,0.640125,0.639183,0.640125


***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
  probs = softmax(torch.Tensor(predictions))
Saving model checkpoint to law_ft_output\checkpoint-750
Configuration saved in law_ft_output\checkpoint-750\config.json
Model weights saved in law_ft_output\checkpoint-750\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
  probs = softmax(torch.Tensor(predictions))
Saving model checkpoint to law_ft_output\checkpoint-1500
Configuration saved in law_ft_output\checkpoint-1500\config.json
Model weights saved in law_ft_output\checkpoint-1500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
  probs = softmax(torch.Tensor(predictions))
Saving model checkpoint to law_ft_output\checkpoint-2250
Configuration saved in law_ft_output\checkpoint-2250\config.json
Model weights saved in law_ft_output\checkpoint-2250\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32
  pro

TrainOutput(global_step=7500, training_loss=0.25233156305948895, metrics={'train_runtime': 10164.5547, 'train_samples_per_second': 23.611, 'train_steps_per_second': 0.738, 'total_flos': 3.7000988784e+16, 'train_loss': 0.25233156305948895, 'epoch': 10.0})

### (4) 검증

In [21]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 8000
  Batch size = 32


  probs = softmax(torch.Tensor(predictions))


{'eval_loss': 0.951158344745636,
 'eval_f1': 0.6167962100986971,
 'eval_recall': 0.6207499999999999,
 'eval_precision': 0.6183533018802114,
 'eval_accuracy': 0.62075,
 'eval_runtime': 46.2096,
 'eval_samples_per_second': 173.124,
 'eval_steps_per_second': 5.41,
 'epoch': 10.0}

## 7. 성능 평가

In [22]:
trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 8000
  Batch size = 32
  probs = softmax(torch.Tensor(predictions))


PredictionOutput(predictions=array([[ 1.8078022 ,  1.3283167 , -2.1738224 ,  1.0100536 , -1.100724  ],
       [ 2.5921829 ,  0.92069006, -2.0989711 ,  0.658214  , -1.1920718 ],
       [ 2.2641647 ,  0.67087054, -1.2642353 ,  0.55750597, -1.1980045 ],
       ...,
       [-2.1291132 ,  1.1156294 , -2.7146127 , -0.24712716,  2.8232567 ],
       [-0.7200876 ,  0.8877388 , -2.9773626 ,  1.0265913 ,  1.6065918 ],
       [-2.229726  ,  0.850789  , -1.8509374 , -1.0070518 ,  3.5884283 ]],
      dtype=float32), label_ids=array([0, 0, 0, ..., 4, 4, 4], dtype=int64), metrics={'test_loss': 0.9332849383354187, 'test_f1': 0.6283778784611795, 'test_recall': 0.633875, 'test_precision': 0.6288883261969027, 'test_accuracy': 0.633875, 'test_runtime': 46.3259, 'test_samples_per_second': 172.69, 'test_steps_per_second': 5.397})