# 基于Transformers的命名实体识别

命名实体识别（NER）其实可以分为两个阶段：实体边界识别和确定实体类别（区分主谓宾定冠状应该就是这么来的）。确定实体类别比较好标注，而实体边界的标注则有不同的标注体系：IOB1/IOB2/IOE1/IOE2/IOBES/BILOU等，其中最常用的是IOB2和IOBES。IOB2用B表示实体的开始，I表示实体的内部，O表示实体的外部；IOBES在此基础上还用E表示实体的结束，S表示一个词单独形成一个命名实体！有时也会用M代替I。

评价指标虽然也是accuracy/recall/precision（以及F1）这些，但是判断对错的标准不同：只有当一个完整命名实体的实体边界和实体类别都完全正确时才算预测正确；计算评价指标时分母也必须是完整命名实体的个数（precision的分母是预测出的实体个数，recall的分母是标注的实体个数），而不是token的个数！

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

## Step1 导入相关包

In [2]:
from datasets import load_dataset
# 注意这里用的是AutoModelForTokenClassification和DataCollatorForTokenClassification，表示对每个token都要进行分类！
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


## Step2 加载数据集

In [3]:
# With network access the corpus can be loaded directly via load_dataset; this is
# the classic People's Daily NER dataset.
#ner_datasets = load_dataset("peoples_daily_ner", cache_dir="./data")
# Without network access, load the dataset from a local directory instead.
from datasets import DatasetDict
ner_datasets = DatasetDict.load_from_disk("ner_data")
ner_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4637
    })
})

In [4]:
ner_datasets["train"][0] # Already pre-split into single Chinese characters, but it still has to go through the tokenizer to obtain token ids (English text would be pre-split on whitespace)
# Note that the pre-split "words" may well differ from what the tokenizer would produce on its own

{'id': '0',
 'tokens': ['海',
  '钓',
  '比',
  '赛',
  '地',
  '点',
  '在',
  '厦',
  '门',
  '与',
  '金',
  '门',
  '之',
  '间',
  '的',
  '海',
  '域',
  '。'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0]}

In [5]:
ner_datasets["train"].features # 可以这么查看命名实体的类别

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}

PER：人名；ORG：组织名；LOC：地点

In [19]:
label_list = ner_datasets["train"].features["ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

## Step3 数据集预处理

In [8]:
tokenizer = AutoTokenizer.from_pretrained("/data/PLM/chinese-macbert-base")
tokenizer

BertTokenizerFast(name_or_path='/data/PLM/chinese-macbert-base', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [9]:
# Pre-splitting and tokenizer splitting may disagree. From here on, each unit of the
# pre-split input is called a "word": one character per "word" for Chinese, one
# whitespace-separated chunk per "word" for English.

# Pre-split input must be passed with is_split_into_words=True; otherwise every "word" is treated as a separate sentence.
tokenizer(ner_datasets["train"][0]["tokens"], is_split_into_words=True)
# Unlike the tokenizer.tokenize + tokenizer.convert_tokens_to_ids combination:
# calling the tokenizer with is_split_into_words=True adds the special tokens during tokenization (not during pre-splitting),
# whereas tokenizer.tokenize + tokenizer.convert_tokens_to_ids does not add special tokens automatically.

{'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
res = tokenizer("interesting word")
# After tokenization a single "word" may map to several token ids, so the word-level labels cannot be used as-is
print(res)
res.word_ids() # word_ids tells which "word" each token id belongs to, which in turn gives the NER tag of that "word"

{'input_ids': [101, 10673, 12865, 12921, 8181, 8681, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


[None, 0, 0, 0, 0, 1, None]

In [11]:
# 借助word_ids实现标签映射
def process_function(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to the tokens.

    Intended for ``Dataset.map(..., batched=True)``: ``examples["tokens"]`` and
    ``examples["ner_tags"]`` are read as parallel nested lists, one inner list
    per sentence in the batch.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (lists of "words")
            and a ``"ner_tags"`` entry (lists of integer tag ids).

    Returns:
        The tokenizer output for the whole batch with an extra ``"labels"``
        entry holding one list of label ids per sentence.
    """
    # Sentences are already split into "words", hence is_split_into_words=True;
    # anything beyond 128 tokens is truncated.
    encoded = tokenizer(examples["tokens"], max_length=128, truncation=True, is_split_into_words=True)

    # For sentence `row`, word_ids() maps every token to the index of the "word"
    # it came from (None where no word applies, e.g. special tokens). Tokens
    # without a word get -100; every other token copies its word's tag, so all
    # tokens of one "word" share the same label.
    # NOTE(review): a word split into several tokens repeats its B- tag on each
    # of them — presumably harmless for single-character Chinese input; confirm
    # before reusing with other languages.
    encoded["labels"] = [
        [-100 if wid is None else sentence_tags[wid] for wid in encoded.word_ids(batch_index=row)]
        for row, sentence_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [12]:
tokenized_datasets = ner_datasets.map(process_function, batched=True) # batched=True: process_function receives a batch of examples per call
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20865
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2319
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4637
    })
})

In [13]:
print(tokenized_datasets["train"][0])

{'id': '0', 'tokens': ['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间', '的', '海', '域', '。'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0], 'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0, -100]}


## Step4 创建模型

In [14]:
# For every task that is not binary classification, remember to pass num_labels;
# otherwise training fails with a device-side error (author's note; presumably a
# CUDA assert caused by label ids outside the default head size — confirm).
model = AutoModelForTokenClassification.from_pretrained("/data/PLM/chinese-macbert-base", num_labels=len(label_list))
model

Some weights of BertForTokenClassification were not initialized from the model checkpoint at /data/PLM/chinese-macbert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [15]:
model.config.num_labels

7

## Step5 创建评估函数

In [16]:
seqeval = evaluate.load("/data/daiyw/Compare/evaluate/metrics/seqeval") # requires the evaluate library to be installed
seqeval # the key arguments are scheme (which tagging scheme is used) and mode (whether entities with the right type but wrong I/B tags still count as correct)

EvaluationModule(name: "seqeval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')}, usage: """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of sha

In [20]:
import numpy as np

def eval_metric(pred):
    """Compute the entity-level F1 score with seqeval.

    Args:
        pred: A pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores and ``labels`` the gold label ids, with -100 marking
            positions to ignore (presumably covering the whole evaluation set
            as collected by the Trainer — confirm).

    Returns:
        A dict with a single key ``"f1"``: seqeval's overall F1.
    """
    scores, gold_ids = pred
    # Highest-scoring class per token.
    predicted_ids = np.argmax(scores, axis=-1)

    # seqeval works on string tags, so ids are mapped back through label_list.
    # Positions whose gold label is -100 are dropped from both sequences.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        true_predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    # Strict IOB2 matching: an entity counts only on an exact match.
    result = seqeval.compute(predictions=true_predictions, references=true_labels, mode="strict", scheme="IOB2")

    # The result also contains per-entity-type scores; only the overall F1 is reported.
    return {"f1": result["overall_f1"]}
    

## Step6 配置训练参数

In [21]:
# Training configuration: evaluate and save a checkpoint once per epoch so that the
# best checkpoint, ranked by the "f1" value from the metric function, can be
# reloaded when training ends.
args = TrainingArguments(
    output_dir="./models_for_ner",  # where checkpoints are written
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    evaluation_strategy="epoch",  # run evaluation at the end of every epoch
    save_strategy="epoch",  # save on the same schedule as evaluation
    metric_for_best_model="f1",  # key of the dict returned by eval_metric
    load_best_model_at_end=True,
    logging_steps=50,
    num_train_epochs=3
)

## Step7 创建训练器

In [22]:
# Assemble the Trainer: train on the "train" split, evaluate on the "validation"
# split, and score evaluations with eval_metric.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=eval_metric,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)  # token-classification collator (presumably pads inputs and labels per batch — confirm)
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


## Step8 模型训练

In [19]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.0277,0.020275,0.940022
2,0.0127,0.019805,0.950539
3,0.0059,0.019896,0.951768


TrainOutput(global_step=981, training_loss=0.028898916280354684, metrics={'train_runtime': 436.7543, 'train_samples_per_second': 143.319, 'train_steps_per_second': 2.246, 'total_flos': 3915383795505534.0, 'train_loss': 0.028898916280354684, 'epoch': 3.0})

In [20]:
trainer.evaluate(eval_dataset=tokenized_datasets["test"]) 

{'eval_loss': 0.024578170850872993,
 'eval_f1': 0.9467907101264975,
 'eval_runtime': 12.9893,
 'eval_samples_per_second': 356.986,
 'eval_steps_per_second': 2.848,
 'epoch': 3.0}

## Step9 模型预测

In [21]:
from transformers import pipeline

In [22]:
# For pipeline inference, id2label must be set so that predicted class ids are
# reported as the actual tag names.
# NOTE: only id2label is replaced here; label2id keeps its LABEL_n keys.
model.config.id2label = dict(enumerate(label_list))
model.config

BertConfig {
  "_name_or_path": "/data/PLM/chinese-macbert-base",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transfor

In [23]:
# If the model was trained on GPU, a device must be given for inference as well. The
# task name (e.g. text-classification / token-classification) also has to be set.
ner_pipe1 = pipeline("token-classification", model=model, tokenizer=tokenizer, device=0)
# For NER, aggregation_strategy="simple" returns results merged into whole entities
# rather than one label per "word".
ner_pipe2 = pipeline("token-classification", model=model, tokenizer=tokenizer, device=0, aggregation_strategy="simple")

In [24]:
# Note: 'start' and 'end' are indices into the original sentence "小明在北京上班", so the actual entity text can be sliced out directly
res1 = ner_pipe1("小明在北京上班")
print(res1)
res2 = ner_pipe2("小明在北京上班")
print(res2)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-PER', 'score': 0.58375555, 'index': 1, 'word': '小', 'start': 0, 'end': 1}, {'entity': 'I-PER', 'score': 0.68798655, 'index': 2, 'word': '明', 'start': 1, 'end': 2}, {'entity': 'B-LOC', 'score': 0.99912554, 'index': 4, 'word': '北', 'start': 3, 'end': 4}, {'entity': 'I-LOC', 'score': 0.9989334, 'index': 5, 'word': '京', 'start': 4, 'end': 5}]
[{'entity_group': 'PER', 'score': 0.63587105, 'word': '小 明', 'start': 0, 'end': 2}, {'entity_group': 'LOC', 'score': 0.99902946, 'word': '北 京', 'start': 3, 'end': 5}]


In [25]:
# Slice the original sentence with each entity's start/end offsets and collect the
# extracted strings per entity type.
x = "小明在北京上班"
ner_result = {}
for r in res2:
    # setdefault creates the list the first time an entity type is seen.
    ner_result.setdefault(r["entity_group"], []).append(x[r["start"]: r["end"]])

ner_result

{'PER': ['小明'], 'LOC': ['北京']}