## 3. 전체 학습

`Trainer` 클래스를 사용하지 않고 학습시키기

In [28]:
!pip install -q datasets transformers accelerate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/244.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [29]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint name used for the tokenizer here and reused when the model is created.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# GLUE / MRPC sentence-pair dataset.
raw_datasets = load_dataset('glue', 'mrpc')


def tokenize_function(example):
    """Tokenize the ``sentence1`` / ``sentence2`` fields of ``example`` as a pair.

    Truncation is enabled; padding is not applied here and is left to the
    data collator defined below.
    """
    first, second = example['sentence1'], example['sentence2']
    return tokenizer(first, second, truncation=True)


# Apply the tokenizer to every split of the dataset in batched mode.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Collator that pads each batch using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### 학습을 위한 준비

학습 루프를 작성하기 전에 객체들을 정의해야 한다

`dataloaders` : 배치 반복

모델이 필요로 하지 않는 column 제거(sentence1, sentence2, idx)

`label` 열의 이름을 `labels` 로 변경

파이썬 리스트 대신 `tensor` 반환하도록 `datasets` 의 형식을 설정

In [11]:
# Drop the columns that are not fed to the model and expose the target
# column under the name `labels`.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
# Return torch tensors instead of Python lists.
tokenized_datasets.set_format("torch")
# Last expression of the cell: shows the columns that remain.
tokenized_datasets['train'].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [12]:
# Build the DataLoaders for the train and validation splits.

from torch.utils.data import DataLoader

# Validation split: default ordering (no shuffle argument), batches of 8.
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], batch_size=8, collate_fn=data_collator
)

# Training split: shuffled, batches of 8, same collator.
train_dataloader = DataLoader(
    tokenized_datasets['train'], batch_size=8, shuffle=True, collate_fn=data_collator
)

In [13]:
# Inspect one batch: pull the first batch and show each tensor's shape.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 81]),
 'token_type_ids': torch.Size([8, 81]),
 'attention_mask': torch.Size([8, 81])}

In [14]:
# Instantiate the model
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a 2-label sequence-classification head.
# The warning printed below this cell reports that `classifier.weight` and
# `classifier.bias` are newly initialized rather than loaded from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Pass the batch to the model
# The batch inspected above contains `labels`; the output exposes both
# `loss` and `logits`, which are printed as a sanity check.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.7101, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [16]:
# Define the optimizer
# `transformers.AdamW` is deprecated and no longer importable in recent
# transformers releases, so use PyTorch's own implementation instead.
# NOTE: torch.optim.AdamW uses PyTorch's default hyperparameters
# (e.g. weight_decay, eps); pass them explicitly if a specific setting is needed.
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)



In [17]:
# Learning-rate scheduler
# Total number of training steps = number of batches per epoch * number of epochs.

from transformers import get_scheduler

num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warmup steps, spanning the whole training run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)
print(num_training_steps)

1377


### 학습 루프

In [19]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
# (The CPU branch previously called the misspelled `torch.devide`, which
# raised AttributeError on machines without CUDA.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [23]:
# Use tqdm to show a progress bar measured in training steps
from tqdm.auto import tqdm

# One tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before iterating.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model was moved to.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Update the weights, advance the LR schedule, then clear the
        # gradients so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/1377 [00:00<?, ?it/s]

In [25]:
# Evaluation loop
# Uses the evaluation metric provided by `datasets`
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favor of the separate `evaluate` package — confirm against the
# installed version before re-running this cell.

from datasets import load_metric

metric = load_metric('glue', 'mrpc')
# Switch the model to evaluation mode.
model.eval()

for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # Predicted class = index of the largest logit along the last dimension.
    predictions = torch.argmax(logits, dim=-1)
    # Accumulate this batch's predictions and labels in the metric.
    metric.add_batch(predictions=predictions, references=batch['labels'])

# Final metric over everything accumulated above (displayed as the cell output).
metric.compute()

{'accuracy': 0.8529411764705882, 'f1': 0.8969072164948454}

### Accelerate 라이브러리 / 학습 루프 가속화

`Accelerate` 라이브러리를 사용하여 분산 학습을 수행할 수 있다

In [32]:
# Training loop using Accelerate

from accelerate import Accelerator
# `transformers.AdamW` is deprecated / removed in recent releases; use PyTorch's.
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, get_scheduler

# use accelerator
accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

# use accelerator: hand the dataloaders, model and optimizer over to Accelerate
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

# Accelerate takes care of device placement, so the manual setup below
# is no longer needed:
#
#   device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#   model.to(device)

num_epochs = 3
# Computed from the prepared dataloader.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # No manual `.to(device)` here: the prepared dataloader already yields
        # batches on the device Accelerate selected.
        outputs = model(**batch)
        loss = outputs.loss
        # accelerator.backward(loss) replaces loss.backward()
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1377 [00:00<?, ?it/s]

### 총 정리

In [37]:
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    get_scheduler
)
import torch
# `transformers.AdamW` is deprecated / removed in recent releases; use PyTorch's.
from torch.optim import AdamW
from torch.utils.data import DataLoader

from tqdm.auto import tqdm


# Load the dataset. (The name was previously misspelled `raw_Datasets`, so the
# `.map` call below only worked through a variable left over from an earlier cell.)
raw_datasets = load_dataset('glue', 'mrpc')
# Checkpoint name
checkpoint = 'bert-base-uncased'
# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the sentence1/sentence2 pair of ``example`` with truncation."""
    return tokenizer(example['sentence1'], example['sentence2'], truncation=True)


# Run the tokenizer over the whole dataset
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Data collator that pads each batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Preprocessing: drop unused columns, rename the target column, return tensors
tokenized_datasets = tokenized_datasets.remove_columns(
    ['sentence1', 'sentence2', 'idx']
)
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format('torch')

# Build the dataloaders
train_dataloader = DataLoader(tokenized_datasets['train'],
                              shuffle=True,
                              batch_size=8,
                              collate_fn=data_collator)
# Evaluation does not need shuffling (consistent with the earlier dataloader cell).
eval_dataloader = DataLoader(tokenized_datasets['validation'],
                             batch_size=8,
                             collate_fn=data_collator)

# Instantiate the pretrained model with a 2-label classification head
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Number of epochs
num_epochs = 3

# Total number of training steps (epochs * batches per epoch)
num_training_steps = num_epochs * len(train_dataloader)

# Learning-rate scheduler: linear schedule, no warmup
lr_scheduler = get_scheduler('linear',
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

# Move the model to the GPU when available, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Progress bar measured in training steps
progress_bar = tqdm(range(num_training_steps))

# Training mode
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        # Loss
        loss = outputs.loss
        # Backpropagation
        loss.backward()

        # Update weights, advance the LR schedule, then reset gradients
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# Load the metric
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favor of the `evaluate` package — confirm against the installed version.
metric = load_metric('glue', 'mrpc')

# Evaluation mode
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch['labels'])

metric.compute()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1377 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}