<a href="https://colab.research.google.com/github/badoil/ML/blob/master/bert_finetuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
device

device(type='cuda')

In [5]:
!pip install datasets transformers

Successfully installed datasets-2.14.4 dill-0.3.7 huggingface-hub-0.16.4 multiprocess-0.70.15 safetensors-0.3.3 tokenizers-0.13.3 transformers-4.32.0 xxhash-3.3.0


In [6]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim     # Adam Optimizer
from datasets import load_dataset

In [7]:
dataset = load_dataset("nsmc")

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [8]:
dataset['train']['label'][:10]  # 0부정리뷰 1긍정리뷰

[0, 1, 0, 0, 1, 0, 0, 0, 1, 1]

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

In [10]:
inputs = tokenizer(dataset['train']['document'], padding=True, truncation=True, max_length=512, return_tensors="pt")    # embedding matrix 만들기 위해서 각 문장 벡터의 차원, 즉 토큰 갯수를 가장 긴 문장의 토큰 갯수에 맞춤

In [11]:
labels = torch.tensor(dataset['train']['label'])

In [12]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
inputs['input_ids'].shape

In [None]:
inputs['attention_mask']    # padding 으로 들어간 토큰들은 0으로 표현해서 없애줌

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [13]:
inputs_test = tokenizer(dataset['test']['document'], padding=True, truncation=True, max_length=512, return_tensors="pt")    # embedding matrix 만들기 위해서 각 문장 벡터의 차원, 즉 토큰 갯수를 가장 긴 문장의 토큰 갯수에 맞춤
labels_test = torch.tensor(dataset['test']['label'])

In [15]:
train_dataset = TensorDataset(inputs.input_ids, inputs.attention_mask, labels)  # pytorch dataset인 TensorDataset 으로 만들어줌

In [30]:
#train_dataset[0]    # (input_ids, attention_mask, labels) 이런식으로 한 문장의 정보를 표현
#inputs['input_ids'][0]
inputs['attention_mask'][0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [16]:
test_dataset = TensorDataset(inputs_test.input_ids, inputs_test.attention_mask, labels_test)  # pytorch dataset인 TensorDataset 으로 만들어줌

In [17]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)    # batch_size 학습효율에 영향 미침

In [None]:
len(train_loader)   # 150000 = 32 * 4688    # 한 batch 마다 업뎃되므로 이 모델은 총 4688번 업뎃됨

4688

In [18]:
test_loader = DataLoader(test_dataset, batch_size=32)

In [19]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)     # BertForSequenceClassification 이것은 feedforward fully connected layer + softmax 해서 classification 하는 클래스

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.to(device)    # 모델의 파라미터들을 지금 이 colab의 메모리 gpu에 올린다

In [21]:
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()     # multi-class classification
EPOCH = 1
PRINT_EVERY_N = 400

In [None]:
model.train()   # train 모드로 바꿈

In [None]:
for epoch in range(EPOCH):
  running_loss = 0.0
  for i, batch in enumerate(train_loader):
    optimizer.zero_grad()   # batch 마다 미분값 초기화
    input_ids, attention_mask, labels = [b.to(device) for b in batch]   # gpu에 보낸후 리스트에 담기
    outputs = model(input_ids, attention_mask=attention_mask)   # BERT에 따라서 attention 계산됨, 여기서 logit 값이 output
    loss = loss_fn(outputs.logits, labels)    # 확률값, 32개 배치사이즈의 평균 확률값이 나옴
    loss.backward()   # 역전파
    optimizer.step()  # 역전파 미분값으로 파라미터 업데이트

    running_loss += loss.item()
    if ((i % PRINT_EVERY_N == 0) and (i > 0)):
      avg_loss = running_loss / PRINT_EVERY_N
      print(f"epoch: {epoch+1}, batch: {i}, avg_loss:{avg_loss:.4f}")
      running_loss = 0.0
    print("finished")

epoch: 1, batch: 400, avg_loss:0.5361
epoch: 1, batch: 800, avg_loss:0.4481
epoch: 1, batch: 1200, avg_loss:0.4112
epoch: 1, batch: 1600, avg_loss:0.3840
epoch: 1, batch: 2000, avg_loss:0.3731
epoch: 1, batch: 2400, avg_loss:0.3693
epoch: 1, batch: 2800, avg_loss:0.3584
epoch: 1, batch: 3200, avg_loss:0.3496
epoch: 1, batch: 3600, avg_loss:0.3528
epoch: 1, batch: 4000, avg_loss:0.3479
epoch: 1, batch: 4400, avg_loss:0.3445


In [None]:
model.eval()

In [33]:
total_correct = 0
total_count = 0
with torch.no_grad():
  for batch in test_loader:
    input_ids, attention_mask, labels = [b.to(device) for b in batch]
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)
    correct = (predictions == labels).sum().item()    # boolean 값을 integer로 바꿈
    total_correct += correct
    total_count += labels.size(0)

In [35]:
# accuracy = total_correct / total_count * 100
# print(f"accuracy: {accuracy:.2f}")

accuracy: 3.13


In [None]:
def predict(sentence: str):
  #tokenizing
  inputs = tokenizer(dataset['train']['document'], padding=True, truncation=True, max_length=512, return_tensors="pt")
  input_ids = inputs['input_ids'].to(device)
  attention_mask = inputs['attention_mask'].to(device)

  # forward
  outputs = model(input_ids, attention_mask=attention_mask)
  logits = outputs.logits
  prediction = torch.argmax(logits, dim=1).item()

  return prediction