In [2]:
#多肢選択式応答

from pprint import pprint
from datasets import load_dataset

train_dataset = load_dataset(
    "llm-book/JGLUE", name="JCommonsenseQA", split="train"
)

valid_dataset = load_dataset(
    "llm-book/JGLUE", name="JCommonsenseQA", split="validation"    
)



Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/488k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/62.3k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [6]:
#データセットの確認

pprint(train_dataset[0])
print()
pprint(train_dataset[1])
print()
pprint(train_dataset[10])

{'choice0': '世界',
 'choice1': '写真集',
 'choice2': '絵本',
 'choice3': '論文',
 'choice4': '図鑑',
 'label': 2,
 'q_id': 0,
 'question': '主に子ども向けのもので、イラストのついた物語が書かれているものはどれ？'}

{'choice0': '浮浪者',
 'choice1': '保護者',
 'choice2': 'お坊さん',
 'choice3': '宗教者',
 'choice4': '預言者',
 'label': 1,
 'q_id': 1,
 'question': '未成年者を監護・教育し，彼らを監督し，彼らの財産上の利益を守る法律上の義務をもつ人は？'}

{'choice0': '成金',
 'choice1': '関白',
 'choice2': '同僚',
 'choice3': 'クリップボード',
 'choice4': '成功者',
 'label': 4,
 'q_id': 10,
 'question': '物事を成しとげた人は？'}


In [8]:
from transformers import AutoTokenizer

model_name = "llm-book/bert-base-japanese-v3-jcommonsenseqa"

tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [57]:
#多肢選択式のデータセットに対する前処理

from transformers import BatchEncoding

num_choice = train_dataset.features["label"].num_classes

def preprocess_multi_process(example, num_choice):
    choice_list = [example[f"choice{i}"] for i in range(num_choice)]
    question_list = [example["question"]] * num_choice

    encoded_example = tokenizer(question_list, choice_list, max_length=128)

    if "label" in example:
        encoded_example["labels"] = example["label"]

    return encoded_example


encoded_train_dataset = train_dataset.map(
    lambda example: preprocess_multi_process(example, num_choice=num_choice), remove_columns=train_dataset.column_names
)
encoded_valid_dataset = valid_dataset.map(
    lambda example: preprocess_multi_process(example, num_choice=num_choice), remove_columns=train_dataset.column_names
)

    

Map:   0%|          | 0/8939 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [60]:
#tokenizerの処理を確認

example = train_dataset[0]

encoded_example = preprocess_multi_process(example, num_choice)

for i in range(5):
    print(tokenizer.convert_ids_to_tokens(encoded_example["input_ids"][i]))

['[CLS]', '主に', '子ども', '向け', 'の', 'もの', 'で', '、', 'イラスト', 'の', 'つい', 'た', '物語', 'が', '書か', 'れ', 'て', 'いる', 'もの', 'は', 'どれ', '?', '[SEP]', '世界', '[SEP]']
['[CLS]', '主に', '子ども', '向け', 'の', 'もの', 'で', '、', 'イラスト', 'の', 'つい', 'た', '物語', 'が', '書か', 'れ', 'て', 'いる', 'もの', 'は', 'どれ', '?', '[SEP]', '写真', '集', '[SEP]']
['[CLS]', '主に', '子ども', '向け', 'の', 'もの', 'で', '、', 'イラスト', 'の', 'つい', 'た', '物語', 'が', '書か', 'れ', 'て', 'いる', 'もの', 'は', 'どれ', '?', '[SEP]', '絵本', '[SEP]']
['[CLS]', '主に', '子ども', '向け', 'の', 'もの', 'で', '、', 'イラスト', 'の', 'つい', 'た', '物語', 'が', '書か', 'れ', 'て', 'いる', 'もの', 'は', 'どれ', '?', '[SEP]', '論文', '[SEP]']
['[CLS]', '主に', '子ども', '向け', 'の', 'もの', 'で', '、', 'イラスト', 'の', 'つい', 'た', '物語', 'が', '書か', 'れ', 'て', 'いる', 'もの', 'は', 'どれ', '?', '[SEP]', '図鑑', '[SEP]']


In [61]:
pprint(encoded_example)

{'attention_mask': [[1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1],
                    [1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
                     1,
               

In [62]:
#ミニバッチ化と最大長パディングを実行する関数
from transformers import BatchEncoding
import torch

def collate_multiple_choice(features):

    batch_size = len(features)
    num_choice = len(features[0]["input_ids"])

    label_name = "labels"

    flat_list = []
    for feature in features:
        flat_list += [{
           k:v[i]  for k, v in feature.items() 
           if k != "labels" 
        } for i in range(num_choice)]

    flat_batch = tokenizer.pad(flat_list, return_tensors="pt")

    batch = {k:v.view(batch_size, num_choice, -1)  for k, v in flat_batch.items()}

    if label_name in features[0].keys():
        labels = [feature[label_name] for feature in features]
        batch[label_name] = torch.tensor(labels, dtype=torch.int64)

    return batch


In [63]:
from transformers import AutoModelForMultipleChoice

transformers_model_name = "cl-tohoku/bert-base-japanese-v3"

model = AutoModelForMultipleChoice.from_pretrained(transformers_model_name, 
                                                   num_labels=train_dataset.features["label"].num_classes)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
from transformers import TrainingArguments, Trainer, BatchEncoding
import numpy as np

def calc_accuracy(eval_pred):

    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis=1)

    return {"accuracy":(predictions == labels).mean()}

training_arg = TrainingArguments(
    output_dir="./output/",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    num_train_epochs=5,
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True
)

trainer = Trainer(
    model=model,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_valid_dataset,
    data_collator=collate_multiple_choice,
    compute_metrics=calc_accuracy
)

In [65]:
trainer.train()

  0%|          | 0/3354 [00:00<?, ?it/s]

{'loss': 0.7161, 'learning_rate': 4.2620751341681574e-05, 'epoch': 0.45}
{'loss': 0.6248, 'learning_rate': 3.5166964818127615e-05, 'epoch': 0.89}
{'loss': 0.3324, 'learning_rate': 2.7713178294573645e-05, 'epoch': 1.34}
{'loss': 0.2431, 'learning_rate': 2.025939177101968e-05, 'epoch': 1.79}
{'loss': 0.1621, 'learning_rate': 1.2805605247465712e-05, 'epoch': 2.24}
{'loss': 0.081, 'learning_rate': 5.3518187239117475e-06, 'epoch': 2.68}
{'train_runtime': 273.9501, 'train_samples_per_second': 97.89, 'train_steps_per_second': 12.243, 'train_loss': 0.3308519262178497, 'epoch': 3.0}


TrainOutput(global_step=3354, training_loss=0.3308519262178497, metrics={'train_runtime': 273.9501, 'train_samples_per_second': 97.89, 'train_steps_per_second': 12.243, 'train_loss': 0.3308519262178497, 'epoch': 3.0})

In [66]:
trainer.evaluate()

  0%|          | 0/140 [00:00<?, ?it/s]

{'eval_loss': 0.7758638262748718,
 'eval_accuracy': 0.8400357462019661,
 'eval_runtime': 2.852,
 'eval_samples_per_second': 392.351,
 'eval_steps_per_second': 49.088,
 'epoch': 3.0}