<a href="https://colab.research.google.com/github/YoungjaeDev/HuggingFace-Tutorial/blob/master/Part3_huggingface_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### load dataset

In [2]:
!pip install -q transformers datasets accelerate

In [3]:
from datasets import load_dataset
nsmc_dataset = load_dataset('nsmc')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
nsmc_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [6]:
nsmc_dataset['train'][0]

{'id': '9976970', 'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0}

In [7]:
nsmc_dataset['train'].features

{'id': Value(dtype='string', id=None),
 'document': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None)}

In [8]:
nsmc_dataset['train'].features['label'].str2int('negative')

0

In [9]:
nsmc_dataset['train'].features['label'].str2int('positive')

1

In [10]:
nsmc_df = nsmc_dataset['train'].to_pandas()
nsmc_df

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


In [11]:
nsmc_df.groupby('label').count()

Unnamed: 0_level_0,id,document
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,75173,75173
1,74827,74827


In [12]:
nsmc_df['review_length'] = nsmc_df['document'].str.len()
nsmc_df.review_length.describe()

count    150000.000000
mean         35.203353
std          29.532097
min           0.000000
25%          16.000000
50%          27.000000
75%          42.000000
max         146.000000
Name: review_length, dtype: float64

### preprocess

In [5]:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

In [6]:
tok.tokenize('청춘 영화의 최고봉.')

['청', '##춘', '영화', '##의', '최고', '##봉', '.']

In [7]:
tok('청춘 영화의 최고봉.')

{'input_ids': [101, 9751, 97707, 42428, 10459, 83491, 118989, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
tok(['청춘 영화의 최고봉.', '청춘'], padding=True)

{'input_ids': [[101, 9751, 97707, 42428, 10459, 83491, 118989, 119, 102], [101, 9751, 97707, 102, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 0, 0, 0, 0]]}

In [9]:
def tokenizer(data):
    return tok(data['document'], max_length=64, padding='max_length', truncation=True)

In [10]:
nsmc_dataset_tokenized = nsmc_dataset.map(tokenizer)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [11]:
nsmc_dataset_tokenized['train'][0]

{'id': '9976970',
 'document': '아 더빙.. 진짜 짜증나네요 목소리',
 'label': 0,
 'input_ids': [101,
  9519,
  9074,
  119005,
  119,
  119,
  9708,
  119235,
  9715,
  119230,
  16439,
  77884,
  48549,
  9284,
  22333,
  12692,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [12]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

### model load

In [13]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased',
                                                          num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
num_train_epochs = 2
learning_rate = 2e-7
batch_size = 128

## train-pytorch

In [23]:
from torch.utils.data import DataLoader

In [24]:
tr_ds = nsmc_dataset_tokenized['train'].remove_columns(['id', 'document'])
tr_ds.set_format(type='torch')

In [25]:
tr_dl = DataLoader(tr_ds, batch_size=batch_size)
tr_dl

<torch.utils.data.dataloader.DataLoader at 0x7b96a85ce620>

In [26]:
val_ds = nsmc_dataset_tokenized['test'].remove_columns(['id', 'document'])
val_ds.set_format(type='torch')
val_dl = DataLoader(tr_ds, batch_size=batch_size)
val_dl

<torch.utils.data.dataloader.DataLoader at 0x7b979423b5b0>

In [27]:
next(iter(tr_dl))

{'label': tensor([0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
         0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
         0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
         0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
         1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
         0, 0, 1, 1, 0, 1, 0, 1]),
 'input_ids': tensor([[  101,  9519,  9074,  ...,     0,     0,     0],
         [  101,   100,   119,  ...,     0,     0,     0],
         [  101,   100,   102,  ...,     0,     0,     0],
         ...,
         [  101,  9358, 12508,  ...,     0,     0,     0],
         [  101,  9519, 25503,  ...,     0,     0,     0],
         [  101, 10150, 10954,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 

In [None]:
import numpy as np
from tqdm import tqdm
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

model.to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_train_epochs):
    train_losses = []
    train_acc = 0.0
    model.train()

    for step, batch in enumerate(tqdm(tr_dl)):
        label = batch['label'].to(device)
        input_id, token_type_ids, attention_mask = batch['input_ids'].to(device), batch['token_type_ids'].to(device), batch['attention_mask'].to(device)

        model.zero_grad()
        pred = model(input_id, token_type_ids, attention_mask)

        # logits 배치 크기 x 클래스 수, t() -> 클래스 수 x 배치사이즈
        # [1] -> 라벨이 1인 배치사이즈
        loss = criterion(torch.sigmoid(pred.logits.t()[1]), label.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        train_acc += acc(pred.logits.argmax(dim=1), label)

#         if (step+1)%100==0:
#             print("train loss: ", np.mean(train_losses))
#             print("train acc: ", train_acc/(step*batch_size))


    print("train loss: ", np.mean(train_losses))
    print("train acc: ", train_acc/len(tr_dl.dataset))

    val_losses = []
    val_acc = 0
    model.eval()


    for step, batch in enumerate(tqdm(val_dl)):
        label = batch['label'].to(device)
        input_id, token_type_ids, attention_mask = batch['input_ids'].to(device), batch['token_type_ids'].to(device), batch['attention_mask'].to(device)

        pred = model(input_id, token_type_ids, attention_mask)
        loss = criterion(torch.sigmoid(pred.logits.t()[1]), label.float())

        val_losses.append(loss.item())
        val_acc += acc(pred.logits.argmax(dim=1), label)


    print("val loss: ", np.mean(val_losses))
    print("val acc: ", val_acc/len(val_dl.dataset))

100%|████████████████████████████████████████████████████████████| 1172/1172 [04:09<00:00,  4.70it/s]


train loss:  308.22439920210593
train acc:  0.6012066666666667


100%|████████████████████████████████████████████████████████████| 1172/1172 [01:20<00:00, 14.59it/s]


val loss:  305.9489754946972
val acc:  0.6676466666666667


100%|████████████████████████████████████████████████████████████| 1172/1172 [04:09<00:00,  4.70it/s]


train loss:  305.3655588325787
train acc:  0.67424


100%|████████████████████████████████████████████████████████████| 1172/1172 [01:19<00:00, 14.75it/s]

val loss:  304.07286612084295
val acc:  0.6995666666666667





## train- trainer

In [35]:
!pip install -U accelerate



In [18]:
from transformers import TrainingArguments

logging_steps = len(nsmc_dataset['train']) // batch_size
print(f'logging_steps: {logging_steps}')
output_dir = 'trainer_test'

num_train_epochs = 20
learning_rate = 2e-7
batch_size = 128

training_args = TrainingArguments(output_dir=output_dir,
                                 num_train_epochs=num_train_epochs,
                                 learning_rate = learning_rate,
                                 per_device_train_batch_size=batch_size,
                                 per_device_eval_batch_size=batch_size,
                                 evaluation_strategy='epoch',
                                 logging_steps=logging_steps,
                                 fp16=True,
                                 push_to_hub=False)

logging_steps: 1171




In [19]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # 분류 모델의 성능을 평가하기 위해 사용
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [20]:
from transformers import Trainer
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased',
                                                          num_labels=2)

trainer = Trainer(model=model,
                 args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=nsmc_dataset_tokenized['train'],
                 eval_dataset=nsmc_dataset_tokenized['test'],
                 tokenizer=tok)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### train

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6486,0.585248,0.69758,0.692856,0.708912,0.677512
2,0.5612,0.526854,0.74024,0.728988,0.76779,0.693918
