In [2]:
import torch

from transformers import AutoTokenizer

# Load the tokenizer published with the 'google-bert/bert-base-chinese' checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='google-bert/bert-base-chinese')

# Bare trailing expression: the notebook renders the tokenizer's repr as the cell output.
tokenizer

BertTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## 加载数据集并处理

In [3]:
from datasets import load_dataset

# Load the ChnSentiCorp dataset (its splits are fetched by name from the hub).
dataset = load_dataset(path='lansinuote/ChnSentiCorp')


def encode_text(batch):
    """Tokenize the 'text' field of a row or a batch of rows.

    Sequences are truncated to at most 500 tokens; no padding is applied here,
    so the resulting encodings keep their individual lengths.
    """
    return tokenizer(batch['text'], truncation=True, max_length=500)


# Keep the original short name bound to the encoder in case another cell uses it.
f = encode_text

# Tokenize in batches rather than one example at a time, and drop the raw text
# column so that only the label and the encoded fields remain.
dataset = dataset.map(encode_text, batched=True, remove_columns=['text'])

# Have the dataset return PyTorch tensors when rows are accessed.
dataset.set_format('pt')

# Show the split summary together with the first encoded training example.
dataset, dataset['train'][0]

Map: 100%|██████████| 9600/9600 [00:01<00:00, 5765.01 examples/s]
Map: 100%|██████████| 1200/1200 [00:00<00:00, 5782.98 examples/s]
Map: 100%|██████████| 1200/1200 [00:00<00:00, 5770.33 examples/s]


(DatasetDict({
     train: Dataset({
         features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
         num_rows: 9600
     })
     validation: Dataset({
         features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
         num_rows: 1200
     })
     test: Dataset({
         features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
         num_rows: 1200
     })
 }),
 {'label': tensor(1),
  'input_ids': tensor([ 101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221,
          3175,  912, 8024, 3300, 4510, 1220, 2820, 3461, 4684, 2970, 1168, 6809,
          3862, 6804, 8024, 1453, 1741, 7623, 7667,  510, 7608, 2443,  510, 1555,
          1767,  510, 6631, 2356,  510, 3033,  855,  671, 2418,  936, 1059,  511,
          6983, 2421, 6163,  934,  671, 5663, 8024,  852, 6820, 5050, 3146, 3815,
           511, 3807, 3737, 1762, 1920, 1828, 4638, 2238, 7553, 8024, 1728, 3634,
          2523, 2207, 8024,  679, 6814, 1957, 1036

## 定义下游任务模型（Downstream Tasks by PyTorch）

In [12]:
#定义模型
from transformers import BertConfig, BertForSequenceClassification

# Fetch the pretrained checkpoint as a sequence-classification model.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='google-bert/bert-base-chinese',
    num_labels=2,  # binary classification
)

# Bare trailing expression: display the model's configuration as the cell output.
model.config

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_name_or_path": "google-bert/bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.42.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

## 执行训练

In [18]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Training configuration.
# NOTE: with PyTorch, `Trainer` needs `accelerate>=0.21.0` installed; without it
# this cell fails with an ImportError.
# NOTE(review): both num_train_epochs and max_steps are set; per the
# TrainingArguments documentation a positive max_steps is expected to take
# precedence, which would make num_train_epochs redundant -- confirm.
args = TrainingArguments(
    output_dir='output_dir',
    use_cpu=True,
    num_train_epochs=1,
    max_steps=300,
    eval_strategy='no',
    per_device_train_batch_size=8,
)

# Collator built from the tokenizer; it pads the variable-length encodings when
# batching (no padding was applied at tokenization time).
padding_collator = DataCollatorWithPadding(tokenizer)

# Build the trainer on the training split only (no evaluation dataset is passed).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    data_collator=padding_collator,
)

# Run training.
trainer.train()

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

## 执行测试

In [19]:
# Run the evaluation.
def test(max_batches=6):
    """Estimate classification accuracy on a sample of the test split.

    Batches of 8 examples are drawn from ``dataset['test']`` in shuffled order
    (the last incomplete batch is dropped) and scored with the global ``model``.
    Because the loader shuffles and no seed is set in this function, the
    sampled batches -- and therefore the result -- can differ between calls.

    Args:
        max_batches: Maximum number of batches to score. The default of 6
            matches the original hard-coded cut-off. Pass ``None`` to score
            every batch the loader yields.

    Returns:
        The fraction of scored examples whose predicted class equals the label,
        or NaN when no example was scored.
    """
    loader_test = torch.utils.data.DataLoader(
        dataset['test'],
        batch_size=8,
        shuffle=True,
        drop_last=True,
        collate_fn=DataCollatorWithPadding(tokenizer))

    # Evaluate in inference mode so layers such as dropout are disabled, then
    # put the model back into whatever mode it was in before the call.
    was_training = model.training
    model.eval()

    correct = 0
    total = 0
    try:
        for i, data in enumerate(loader_test):
            if max_batches is not None and i >= max_batches:
                break

            # Keep inputs on the same device as the model's parameters.
            data = data.to(model.device)

            with torch.no_grad():
                out = model(**data).logits

            # Predicted class = index of the largest logit for each example.
            out = out.argmax(dim=1)
            correct += (out == data.labels).sum().item()
            total += len(data.labels)

            # Progress line: batch index, number of batches, running accuracy.
            print(i, len(loader_test), correct / total)
    finally:
        model.train(was_training)

    # Avoid ZeroDivisionError when nothing was scored (empty loader or max_batches=0).
    return correct / total if total else float('nan')


test()

0 150 0.25
1 150 0.4375
2 150 0.4583333333333333
3 150 0.40625
4 150 0.4
5 150 0.375


0.375