参考链接：https://github.com/dmmiller612/bert-extractive-summarizer

抽取式摘要
- 首先对句子编码
- 然后使用聚类算法，找到最接近聚类中心的句子
- 然后共指解析，找到句子中代词的含义

原始链接：https://keras.io/examples/nlp/text_extraction_with_bert/

In [1]:
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

# Fixed model input length in tokens: shorter examples are padded up to it and
# longer ones are skipped during preprocessing.
max_len = 384
# Default BERT configuration object (not referenced again in the visible code).
configuration = BertConfig()

# 数据预处理
给定上下文，每个上下文对应一些问题，问题的答案为上下文中连续的几个单词

In [2]:
# Directory passed to `from_pretrained`; `vocab.txt` is read from it as well.
model_path = "../../H/models/huggingface/tensorflow/bert-base-uncased/"
# Tokenizer loaded through `transformers`.
slow_tokenizer = BertTokenizer.from_pretrained(model_path)

# Tokenizer from the `tokenizers` package; its API differs from the one above.
# NOTE(review): the original note describes it as PreTrainedTokenizerFast-based — TODO confirm.
tokenizer = BertWordPieceTokenizer(model_path + "vocab.txt", lowercase=True)

In [3]:
# Raw text: training and evaluation JSON files (presumably SQuAD v1.1, judging
# by the file names).
train_path = "../datasets/textSum/train-v1.1.json"
eval_path = "../datasets/textSum/dev-v1.1.json"

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(train_path, encoding="utf-8") as f:
    raw_train_data = json.load(f)

with open(eval_path, encoding="utf-8") as f:
    raw_eval_data = json.load(f)

In [16]:
# Data sample: one text paragraph corresponds to several question-answer pairs.
raw_train_data['data'][0]['paragraphs'][:10]

[{'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
  'qas': [{'answers': [{'answer_start': 515,
      'text': 'Saint Bernadette Soubirous'}],
    'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
    'id': '5733be284776f41900661182'},
   {'answers': [{'answer_start': 188, 'text': 'a copper statue of 

In [21]:
context = 'In 2015-2016, Notre Dame ranked 18th overall among "national universities" in the United States in U.S.'
tokenized_context = tokenizer.encode(context)

# `tokenized_context.offsets` is a list of (start, end) character-index tuples, one per token.
# Original note: only the Rust-based fast tokenizers expose this attribute; the Python tokenizer does not (TODO confirm).

tokenized_context.offsets

[(0, 0),
 (0, 2),
 (3, 7),
 (7, 8),
 (8, 12),
 (12, 13),
 (14, 19),
 (20, 24),
 (25, 31),
 (32, 36),
 (37, 44),
 (45, 50),
 (51, 52),
 (52, 60),
 (61, 73),
 (73, 74),
 (75, 77),
 (78, 81),
 (82, 88),
 (89, 95),
 (96, 98),
 (99, 100),
 (100, 101),
 (101, 102),
 (102, 103),
 (0, 0)]

In [32]:
# Encode a sample question and print every token id next to its token string.
# The recorded output shows that [CLS] and [SEP] are added by the tokenizer.
question = 'In what US state did Kathmandu first establish an international relationship?'
question_ids = tokenizer.encode(question)
for idx in question_ids.ids:
    print(f"{idx}-->", tokenizer.id_to_token(idx))

101--> [CLS]
1999--> in
2054--> what
2149--> us
2110--> state
2106--> did
28045--> kathmandu
2034--> first
5323--> establish
2019--> an
2248--> international
3276--> relationship
1029--> ?
102--> [SEP]


In [36]:
class SquadExample:
    """One (question, context, answer) sample prepared for span-extraction QA.

    ``preprocess`` converts the character-level answer span into token-level
    start/end indices and builds fixed-length model inputs. It relies on the
    module-level ``tokenizer`` and ``max_len``. Samples that cannot be
    converted get ``skip = True`` and none of the model-input attributes.

    Attributes set by ``__init__``:
        question, context, start_char_idx, answer_text, all_answers: stored
            exactly as passed in.
        skip: ``False`` until ``preprocess`` rejects the sample.

    Attributes set by a successful ``preprocess``:
        input_ids, token_type_ids, attention_mask: lists of length ``max_len``.
        start_token_idx, end_token_idx: index of the first and last context
            token that overlaps the answer characters.
        context_token_to_char: ``(start, end)`` character offsets of every
            context token.
    """

    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        """Tokenize the sample and locate the answer span at token level.

        Sets ``self.skip = True`` and returns early when the answer would run
        past the end of the context, when no context token overlaps the
        answer, or when context plus question need more than ``max_len``
        tokens.
        """
        # Collapse every run of whitespace into a single space.
        context = " ".join(str(self.context).split())
        question = " ".join(str(self.question).split())
        answer = " ".join(str(self.answer_text).split())

        # NOTE(review): start_char_idx is applied unchanged to the
        # whitespace-normalized context; if normalization removed characters
        # before the answer, the marked span is shifted — confirm the data
        # never triggers this.
        start_char_idx = self.start_char_idx

        # Exclusive end of the answer span inside the context.
        end_char_idx = start_char_idx + len(answer)
        # The end index is exclusive, so an answer that finishes exactly at the
        # end of the context (end_char_idx == len(context)) is still valid.
        if end_char_idx > len(context):
            self.skip = True
            return

        # Mark which context *characters* belong to the answer.
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # Tokenize the context.
        tokenized_context = tokenizer.encode(context)

        # Collect the context *tokens* whose character span touches the answer.
        # Tokens with an empty span such as (0, 0) can never match.
        ans_token_idx = [
            idx
            for idx, (start, end) in enumerate(tokenized_context.offsets)
            if sum(is_char_in_ans[start:end]) > 0
        ]

        if not ans_token_idx:
            self.skip = True
            return

        # Token-level answer boundaries (both inclusive).
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize the question.
        tokenized_question = tokenizer.encode(question)

        # Model input: context tokens followed by the question tokens. The
        # question's first id is dropped; the context encoding already starts
        # with one.
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        # Pad up to max_len; the attention mask is 0 on padded positions.
        # Samples longer than max_len are discarded.
        padding_length = max_len - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return

        # Model inputs and targets for this sample.
        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        self.context_token_to_char = tokenized_context.offsets




In [37]:
# Extract the text needed for training from the raw corpus.
def create_squad_examples(raw_data):
    """Build a preprocessed ``SquadExample`` for every question in ``raw_data``.

    Walks ``raw_data["data"]`` -> ``"paragraphs"`` -> ``"qas"``. The first
    listed answer supplies the answer text and its start offset; the texts of
    all listed answers are kept alongside. Every example is returned,
    including those that ``preprocess`` flagged with ``skip``.
    """
    examples = []
    for article in raw_data["data"]:
        for paragraph in article["paragraphs"]:
            paragraph_text = paragraph["context"]
            for qa in paragraph["qas"]:
                question_text = qa["question"]

                # A question may list several answers; the first one is the target.
                answers = qa["answers"]
                first_text = answers[0]["text"]
                every_text = [answer["text"] for answer in answers]
                first_start = answers[0]["answer_start"]

                example = SquadExample(
                    question_text,
                    paragraph_text,
                    first_start,
                    first_text,
                    every_text,
                )
                example.preprocess()
                examples.append(example)
    return examples

In [38]:
# Build and preprocess one SquadExample per training question.
train_squad_example = create_squad_examples(raw_train_data)

In [39]:
# Number of training examples built, including any flagged as skipped.
len(train_squad_example)

87599

In [40]:
def create_inputs_targets(squad_examples):
    """Stack the non-skipped examples into model inputs and targets.

    Args:
        squad_examples: iterable of objects exposing ``skip`` and, when
            ``skip`` is false, ``input_ids``, ``token_type_ids``,
            ``attention_mask``, ``start_token_idx`` and ``end_token_idx``.

    Returns:
        A pair ``(x, y)``. ``x`` is ``[input_ids, token_type_ids,
        attention_mask]`` and ``y`` is ``[start_token_idx, end_token_idx]``;
        every entry is a NumPy array with one element per kept example.
    """
    keys = (
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_token_idx",
        "end_token_idx",
    )
    # Skipped examples never received the attributes read below.
    kept = [item for item in squad_examples if not item.skip]
    dataset_dict = {
        key: np.array([getattr(item, key) for item in kept]) for key in keys
    }

    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]

    # Targets are the start and end token indices; the answer is the text
    # between them.
    y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]
    return x, y

In [41]:
# Convert the training examples into model input arrays and target arrays.
x_train, y_train = create_inputs_targets(train_squad_example)
# NOTE(review): this prints the length of the full example list, which still
# contains examples flagged `skip`; the arrays above hold only the kept ones.
print(f"{len(train_squad_example)} training points created")

87599 training points created


In [42]:
# Build the evaluation examples and convert them into input and target arrays.
eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
# NOTE(review): this prints the length of the full example list, which still
# contains examples flagged `skip`; the arrays above hold only the kept ones.
print(f"{len(eval_squad_examples)} evaluation points created.")

10570 evaluation points created.


# 模型

In [43]:
def create_model():
    """Build and compile the BERT span-extraction QA model.

    The encoder is loaded from the module-level ``model_path``. Two bias-free
    ``Dense(1)`` heads score each of the ``max_len`` token positions as the
    answer start and the answer end, and a softmax over the positions turns
    the scores into probabilities.

    Returns:
        A compiled ``keras.Model`` that takes ``[input_ids, token_type_ids,
        attention_mask]`` and outputs ``[start_probs, end_probs]``.
    """
    ## BERT encoder
    encoder = TFBertModel.from_pretrained(model_path)

    ## QA Model
    input_ids = layers.Input(shape=(max_len, ), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len, ), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len, ), dtype=tf.int32)
    # First encoder output; the recorded model summary lists it as
    # (None, 384, 768), i.e. one vector per token position.
    embedding = encoder(input_ids,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_mask)[0]

    # One score per token position, flattened to (batch, max_len).
    start_logits = layers.Dense(1, name="start_logit",
                                use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    # The outputs are already probabilities, hence from_logits=False.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` is the supported argument name; the legacy `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

In [44]:
# Instantiate the compiled QA model and print its layer summary.
model = create_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 384, 768), ( 109482240   input_1[0][0]                    
______________________________________________________________________________________________

# 训练模型

## 回调函数，计算验证性能

In [45]:
def normalize_text(text):
    """Canonicalize an answer string for exact-match comparison.

    Lower-cases the text, removes every character in ``string.punctuation``,
    replaces the standalone articles "a", "an" and "the" with a space, and
    collapses whitespace runs into single spaces.
    """
    lowered = text.lower()

    # Drop punctuation characters.
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))

    # Blank out standalone articles.
    without_articles = re.sub(
        r"\b(a|an|the)\b", " ", without_punct, flags=re.UNICODE
    )

    # Squeeze whitespace runs and trim both ends.
    return " ".join(without_articles.split())

In [18]:
class ExactMatch(keras.callbacks.Callback):
    """
    Each `SquadExample` object contains the character level offsets for each token
    in its input paragraph. We use them to get back the span of text corresponding
    to the tokens between our predicted start and end tokens.
    All the ground-truth answers are also present in each `SquadExample` object.
    We calculate the percentage of data points where the span of text obtained
    from model predictions matches one of the ground-truth answers.

    The examples are read from the module-level `eval_squad_examples`; its
    non-skipped entries are matched by position with the rows of `x_eval`.
    """
    def __init__(self, x_eval, y_eval):
        # Let the Keras base class set up its own state first.
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the evaluation inputs and print the exact-match score."""
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        eval_examples_no_skip = [
            example for example in eval_squad_examples if not example.skip
        ]
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            squad_eg = eval_examples_no_skip[idx]
            offsets = squad_eg.context_token_to_char
            # Most probable start and end token positions.
            start = np.argmax(start)
            end = np.argmax(end)
            # A start outside the context tokens cannot be mapped back to text;
            # the example counts as a miss.
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            # NOTE(review): the offsets were computed on the
            # whitespace-normalized context, while `squad_eg.context` is the
            # text as originally passed in — confirm the two always line up.
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                # End falls beyond the context tokens: take the rest of the context.
                pred_ans = squad_eg.context[pred_char_start:]

            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [
                normalize_text(answer) for answer in squad_eg.all_answers
            ]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        acc = count / len(self.y_eval[0])
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")

In [None]:
# Train for one epoch; the callback prints the exact-match score at epoch end.
exact_match_callback = ExactMatch(x_eval, y_eval)
model.fit(
    x_train,
    y_train,
    epochs=1,
    verbose=2,
    batch_size=64,  # ran out of memory; reduce the batch size
    callbacks=[exact_match_callback])

> 输入由文本段落和问题拼接而成，max_len=384，每个标记对应的词向量维度又是 768，因此最终超出内存！