In [1]:
import random
import glob
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
# [CLS] -> BertModel -> Linear -> tanh -> Linear
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl

MODEL_NAME = "tohoku-nlp/bert-base-japanese-whole-word-masking"

In [2]:
# Load the pretrained Japanese BERT tokenizer and a 2-class sequence
# classifier built on the same checkpoint; the classifier is moved to the GPU.
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
bert_sc = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
).cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tohoku-nlp/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from transformers.tokenization_utils_base import BatchEncoding


# Three toy movie-review sentences and their labels
# (the upbeat sentences are labelled 1, the disappointed one 0).
text_list = [
    "この映画は面白かった。",
    "この映画の最後にはがっかりさせられた。",
    "この映画を見て幸せな気持ちになった。",
]
label_list = [
    1,
    0,
    1
]

# Encode: pad every sentence to the longest one in the batch and
# return PyTorch tensors.
encoding: BatchEncoding = tokenizer(
    text_list,
    padding="longest",
    return_tensors="pt",
)
# Move every tensor to the GPU (bert_sc was moved there with .cuda()).
encoding = {k: v.cuda() for k, v in encoding.items()}
labels = torch.tensor(label_list).cuda()

# Inference. Gradients are not needed, so disable autograd.
# The module is called directly instead of through .forward() so that
# nn.Module.__call__ (and any registered hooks) is used.
with torch.no_grad():
    output = bert_sc(**encoding)
# Classification scores.
# Shape is (batch size (number of sentences), number of categories).
scores = output.logits
# Predicted label = index of the highest score along the last axis.
labels_predicted = scores.argmax(-1)

In [4]:
# Display the predicted class index for each input sentence.
labels_predicted

tensor([1, 1, 1], device='cuda:0')

In [5]:
# The classifier head starts from randomly initialised parameters,
# so the accuracy measured here is expected to be poor.
num_correct = int((labels_predicted == labels).sum())
accuracy = num_correct / len(labels)
print(accuracy)

0.6666666666666666


In [6]:
# Encode the same sentences again; this time the labels are attached so
# that the model output also carries a loss.
encoding: BatchEncoding = tokenizer(text_list, padding="longest", return_tensors="pt")
encoding["labels"] = torch.tensor(label_list)
encoding = {name: tensor.cuda() for name, tensor in encoding.items()}

# Compute the loss.
output = bert_sc(**encoding)
loss = output.loss
print(loss)

tensor(0.6398, device='cuda:0', grad_fn=<NllLossBackward0>)


OSコマンドで livedoor ニュースコーパスをダウンロード・展開した後（記事ファイルは `./text/<カテゴリ名>/` 以下に置く）、以下の処理に続く。

## 6-5 BERT のファインチューニングと性能評価
- データ(符号化された文章)とラベルを抜き出し、ミニバッチにする
- データローダはデータセットからミニバッチを取り出す

In [7]:
# Toy dataset: four samples, each a dict holding a 2-element "data" tensor
# ([0, 1], [2, 3], [4, 5], [6, 7]) and a scalar "labels" tensor (0..3).
dataset_for_loader = [
    {
        "data": torch.tensor([2 * i, 2 * i + 1]),
        "labels": torch.tensor(i),
    }
    for i in range(4)
]
loader = DataLoader(dataset_for_loader, batch_size=2)

# Pull the mini-batches out of the loader one by one.
for idx, batch in enumerate(loader):
    print(f"batch {idx}", batch, sep="\n")
    # During fine-tuning, the per-mini-batch processing would go here.

batch 0
{'data': tensor([[0, 1],
        [2, 3]]), 'labels': tensor([0, 1])}
batch 1
{'data': tensor([[4, 5],
        [6, 7]]), 'labels': tensor([2, 3])}


In [8]:
# Same toy dataset, but with shuffling enabled: the sample order changes
# from one pass over the loader to the next.
loader = DataLoader(dataset_for_loader, shuffle=True, batch_size=2)
for idx, batch in enumerate(loader):
    print(f"batch {idx}", batch, sep="\n")

batch 0
{'data': tensor([[4, 5],
        [6, 7]]), 'labels': tensor([2, 3])}
batch 1
{'data': tensor([[0, 1],
        [2, 3]]), 'labels': tensor([0, 1])}


### 前処理
各データを次のキーを持つ辞書にする。

すなわち、tokenizer で符号化を行ったときに得られる形式（`input_ids`・`attention_mask`・`token_type_ids`）に、正解ラベル `labels` を加えたもの

- input_ids
- attention_mask
- token_type_ids
- labels

In [9]:
# Article categories of the corpus. The position of a category in this list
# is used as its integer class label.
category_list = [
    "dokujo-tsushin",
    "it-life-hack",
    "kaden-channel",
    "livedoor-homme",
    "movie-enter",
    "peachy",
    "smax",
    "sports-watch",
    "topic-news",
]

tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

# Build the dataset.
# Sequences are capped at 128 tokens to speed up training.
max_length = 128
dataset_for_loader = []

for label, category in enumerate(tqdm(category_list)):
    # sorted() makes the file order independent of the filesystem.
    for file in sorted(glob.glob(f"./text/{category}/{category}*")):
        # Context manager: every file handle is closed as soon as it is read,
        # instead of being left open until garbage collection.
        with open(file, encoding="utf-8") as f:
            lines = f.read().splitlines()
        # The article body starts at the 4th line.
        text = "\n".join(lines[3:])
        encoding = tokenizer(text, max_length=max_length, padding="max_length", truncation=True)
        encoding["labels"] = label
        # Tensors are created here rather than with return_tensors="pt":
        # that option would give each field a leading batch dimension
        # (shape (1, max_length)), which the DataLoader would then stack into
        # (batch, 1, max_length). The plain-int label also needs converting.
        encoding = {k: torch.tensor(v) for k, v in encoding.items()}
        dataset_for_loader.append(encoding)

100%|██████████| 9/9 [00:17<00:00,  1.99s/it]


In [10]:
from pprint import pprint
# Inspect the first encoded sample, then report how many samples were built.
pprint(dataset_for_loader[0])
print(len(dataset_for_loader))

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1]),
 'input_ids': tensor([    2,  2340, 19693, 10585, 28459,    35,  6692, 28493,    13,   501,
           62,   101,    37,     8,   569,   335,     5,    51,     7,     9,
         1040,     5,   616,     9,  2941,    18,  5602,   501,    20,    16,
         4027, 10531,   140,    36,    73, 30020, 28457, 25127,    38,  1080,
            5,    53,    28,   707,     5,    12,     9,    80,  3635,   205,
           29,  2935,   604,  5846,  6503,    11,  4722,    16,   861,    13,
            6, 12272, 24050,  2079,    11,    26,    62,    45,  

In [11]:
# Split the dataset.
# train / validation / test = 6 : 2 : 2
# A dedicated, seeded RNG makes the shuffle (and therefore the split)
# repeatable on a fresh run over the same input order, without touching
# the global `random` state.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(dataset_for_loader)
n = len(dataset_for_loader)
n_train = int(0.6*n)
n_val = int(0.2*n)

dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train: n_train + n_val]
# The test split takes whatever remains, so no sample is dropped by rounding.
dataset_test = dataset_for_loader[n_train + n_val:]

# Build the data loaders from the splits.
# Only the training loader shuffles.
dataloader_train = DataLoader(
    dataset_train,
    # 32 is said to follow the original BERT paper.
    batch_size=32,
    shuffle=True
)
dataloader_val = DataLoader(
    dataset_val,
    # No loss gradients are computed here, so a larger batch size is used.
    batch_size=256,
)
dataloader_test = DataLoader(
    dataset_test,
    batch_size=256,
)

## PyTorch Lightning
- PyTorchで書くこともできるが、Lightning はモデルやデータによらず共通の処理が内部であらかじめ実装されている

In [12]:
# Class describing how the model behaves during training / validation / test.
class BertForSequenceClassification_pl(pl.LightningModule):
    """Lightning wrapper around ``BertForSequenceClassification``.

    Args:
        model_name: Name or path of the pretrained checkpoint to load.
        num_labels: Number of target classes for the classification head.
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(
        self,
        model_name: str,
        num_labels: int,
        lr: float,
    ):
        super().__init__()
        # Records the constructor arguments so that they can be read back
        # through self.hparams (e.g. self.hparams.lr).
        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    def training_step(self, batch, batch_idx):
        """Return the loss for one training mini-batch and log it as ``train_loss``.

        ``batch`` is unpacked as keyword arguments to the model, so it must
        contain ``labels`` for the output to carry a loss.
        """
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the loss for one validation mini-batch and log it as ``val_loss``."""
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log("val_loss", val_loss)

    def test_step(self, batch, batch_idx):
        """Compute the accuracy of one test mini-batch and log it as ``accuracy``."""
        # Read the labels without popping them, so the caller's batch dict is
        # left untouched; the model receives every other entry.
        labels = batch["labels"]
        inputs = {key: value for key, value in batch.items() if key != "labels"}
        output = self.bert_sc(**inputs)
        labels_predicted = output.logits.argmax(-1)
        num_correct = (labels_predicted == labels).sum().item()
        accuracy = num_correct / labels.size(0)
        self.log("accuracy", accuracy)

    def configure_optimizers(self):
        """Return the optimizer used for training: Adam over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [14]:
# Checkpoint policy during training: keep only the single checkpoint with
# the lowest "val_loss", store weights only, and write it under model/.
checkpoint = pl.callbacks.ModelCheckpoint(
    dirpath="model/",
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    save_weights_only=True,
)

# Training configuration: run on GPU (device count chosen automatically)
# for 10 epochs, with the checkpoint callback attached.
trainer = pl.Trainer(
    max_epochs=10,
    accelerator="gpu",
    devices="auto",
    callbacks=[checkpoint],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
C:\Users\ykite\anaconda3\envs\bert_book_stockmark\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [15]:
# Build the Lightning module and fine-tune it.
# The number of classes is taken from category_list rather than a literal 9,
# so it cannot drift from the labels assigned while building the dataset.
model = BertForSequenceClassification_pl(
    model_name=MODEL_NAME,
    num_labels=len(category_list),
    lr=1e-5,
)
trainer.fit(model, dataloader_train, dataloader_val)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tohoku-nlp/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using a CUDA device ('NVIDIA GeForce RTX 4070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: C:\Users\ykite\Documents\projects\study\bert_book_stockmark\chap6\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                          | Params
----------------------------------------------------------
0 | bert_sc | BertForSequenceClassification | 110 M 


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\ykite\anaconda3\envs\bert_book_stockmark\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
C:\Users\ykite\anaconda3\envs\bert_book_stockmark\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [19]:
# Report which checkpoint was kept as the best one and its monitored score.
best_model_path = checkpoint.best_model_path
print("ベストモデルのファイル:", best_model_path)
print("ベストモデルの検証データに対する損失", checkpoint.best_model_score)

ベストモデルのファイル: C:\Users\ykite\Documents\projects\study\bert_book_stockmark\chap6\model\epoch=4-step=695.ckpt
ベストモデルの検証データに対する損失 tensor(0.4203, device='cuda:0')


In [20]:
# 最後に、ファインチューニングで得たモデルをテストデータで評価
test = trainer.test(dataloaders=dataloader_test)
print(f"Accuracy: {test[0]['accuracy']:.2f}")

Restoring states from the checkpoint path at C:\Users\ykite\Documents\projects\study\bert_book_stockmark\chap6\model\epoch=4-step=695.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at C:\Users\ykite\Documents\projects\study\bert_book_stockmark\chap6\model\epoch=4-step=695.ckpt
C:\Users\ykite\anaconda3\envs\bert_book_stockmark\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        accuracy            0.8812754154205322
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Accuracy: 0.88


In [21]:
# Rebuild the PyTorch Lightning module from the best checkpoint.
model = BertForSequenceClassification_pl.load_from_checkpoint(best_model_path)

# Export the wrapped Transformers model. It is saved as:
# ├── model_transformers
# │   ├── config.json
# │   └── model.safetensors
fine_tuned_bert = model.bert_sc
fine_tuned_bert.save_pretrained("./model_transformers")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tohoku-nlp/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Reload the exported model with plain Transformers from the directory
# written by save_pretrained; this rebinds bert_sc.
bert_sc = BertForSequenceClassification.from_pretrained("./model_transformers")

In [23]:
# Print the module tree of the reloaded model to inspect its architecture.
print(bert_sc)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,