# T5 encoder + decoder 文本摘要

## Step1 導入相關包

In [1]:
# !pip install evaluate
# !pip install datasets
# !pip install transformers[torch]
# !pip install rouge-chinese
# !pip install sentencepiece

In [2]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

## Step2 載入數據集

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# Load the dataset stored in the local "./nlpcc_2017" directory; the cell
# output below shows it has `title` and `content` columns.
ds = Dataset.load_from_disk("./nlpcc_2017")
ds

Dataset({
    features: ['title', 'content'],
    num_rows: 5000
})

In [5]:
# Load and split in a single expression so this cell can be re-run safely:
# `ds` is rebound to a DatasetDict here, and a DatasetDict cannot be split
# again with train_test_split. test_size=100 holds out 100 rows for evaluation;
# the fixed seed keeps the split reproducible.
ds = Dataset.load_from_disk("./nlpcc_2017").train_test_split(test_size=100, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['title', 'content'],
        num_rows: 4900
    })
    test: Dataset({
        features: ['title', 'content'],
        num_rows: 100
    })
})

In [6]:
ds["train"][0]

{'title': '组图:黑河边防军人零下30℃户外训练,冰霜沾满眉毛和睫毛,防寒服上满是冰霜。',
 'content': '中国军网2014-12-1709:08:0412月16日,黑龙江省军区驻黑河某边防团机动步兵连官兵,冒着-30℃严寒气温进行体能训练,挑战极寒,锻造钢筋铁骨。该连素有“世界冠军的摇篮”之称,曾有5人24人次登上世界军事五项冠军的领奖台。(魏建顺摄)黑龙江省军区驻黑河某边防团机动步兵连官兵冒着-30℃严寒气温进行体能训练驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜官兵睫毛上都被冻上了冰霜官兵们睫毛上都被冻上了冰霜驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练'}

## Step3 資料處理

In [7]:
# Tokenizer for the "Langboat/mengzi-t5-base" checkpoint; the same checkpoint
# name is used later when the model itself is loaded.
tokenizer = AutoTokenizer.from_pretrained("Langboat/mengzi-t5-base")
tokenizer

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/725k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5TokenizerFast(name_or_path='Langboat/mengzi-t5-base', vocab_size=32128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', 

In [8]:
ds['train'][:3]

{'title': ['组图:黑河边防军人零下30℃户外训练,冰霜沾满眉毛和睫毛,防寒服上满是冰霜。',
  '云南丘北幼儿园中毒事件确认系投毒案,女嫌犯为报复将掺入"毒鼠强"的比萨卷扔进教室,导致7名幼儿误食中毒,其中2人死亡。',
  '国家版权局发布通知,责令各网络音乐服务商停止未经授权传播音乐作品,于本月底前将未经授权传播的音乐作品全部下线。'],
 'content': ['中国军网2014-12-1709:08:0412月16日,黑龙江省军区驻黑河某边防团机动步兵连官兵,冒着-30℃严寒气温进行体能训练,挑战极寒,锻造钢筋铁骨。该连素有“世界冠军的摇篮”之称,曾有5人24人次登上世界军事五项冠军的领奖台。(魏建顺摄)黑龙江省军区驻黑河某边防团机动步兵连官兵冒着-30℃严寒气温进行体能训练驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜官兵睫毛上都被冻上了冰霜官兵们睫毛上都被冻上了冰霜驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练',
  '法制网昆明4月9日电记者刘百军记者今天晚间从云南省文山州丘北县公安局新闻办公室获悉,2014年3月19日发生在文山州丘北县双龙营镇平龙村佳佳幼儿园的幼儿中毒事件系一起人为投毒案,该案已于4月8日告破。目前,犯罪嫌疑人赵建芝(女,44岁,丘北县双龙营镇上平龙村人)因涉嫌投放危险物质罪已被公安机关刑事拘留。现已查明,犯罪嫌疑人赵建芝因对佳佳幼儿园租用自己原看守、暂住的丘北公路养护段平龙道班用房作为办园用房致其被迫搬走一事怀恨在心,遂产生了对幼儿园实施投毒报复的想法。3月19日,赵建芝趁幼儿午睡时,将自己事先掺入了“毒鼠强”的一袋比萨卷(当地儿童喜欢食用的一种副食品),从幼儿园后窗扔入该园中班教室,导致7名幼儿误食中毒,其中2名幼儿经抢救无效死亡。案件发生后,国务院、公安部、云南省委、省政府、文山州委、州政府领导高度重视,要求公安机关尽快侦破案件。云南省公安厅第一时间派出刑事侦查专家组,会同文山州、丘北县公安机关全力开展案件侦查工作。经公安机关审讯,犯罪嫌疑人赵建芝对上述犯罪事实供认不讳。目前,该案正在深入审查中。',
  '各网络音乐服务商:为加强对

In [9]:
ds['train'][0]

{'title': '组图:黑河边防军人零下30℃户外训练,冰霜沾满眉毛和睫毛,防寒服上满是冰霜。',
 'content': '中国军网2014-12-1709:08:0412月16日,黑龙江省军区驻黑河某边防团机动步兵连官兵,冒着-30℃严寒气温进行体能训练,挑战极寒,锻造钢筋铁骨。该连素有“世界冠军的摇篮”之称,曾有5人24人次登上世界军事五项冠军的领奖台。(魏建顺摄)黑龙江省军区驻黑河某边防团机动步兵连官兵冒着-30℃严寒气温进行体能训练驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜官兵睫毛上都被冻上了冰霜官兵们睫毛上都被冻上了冰霜驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练'}

In [10]:
def process_func(exmaples):
    """Tokenize a batch of articles and titles into seq2seq training features.

    Args:
        exmaples: Batch mapping with a "content" list (source articles) and a
            "title" list (reference summaries).

    Returns:
        The tokenizer output for the prompted articles (truncated to 384
        tokens) with an added "labels" entry holding the title token ids
        (truncated to 64 tokens).
    """
    model_inputs = tokenizer(
        ["摘要生成: \n" + article for article in exmaples["content"]],
        max_length=384,
        truncation=True,
    )
    targets = tokenizer(exmaples["title"], max_length=64, truncation=True)
    # The target token ids are what the model is trained to generate.
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs



In [11]:
# Tokenize both splits. With batched=True, process_func receives a batch of
# rows per call (it iterates over exmaples["content"] as a list). The original
# `title`/`content` columns are kept next to the new features (see output).
tokenized_ds = ds.map(process_func, batched=True)
tokenized_ds

Map:   0%|          | 0/4900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4900
    })
    test: Dataset({
        features: ['title', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [12]:
tokenized_ds["train"][0]["input_ids"]

[7,
 17965,
 5353,
 13,
 5893,
 349,
 295,
 2927,
 13049,
 114,
 1238,
 6473,
 13,
 5844,
 13,
 6849,
 364,
 50,
 1095,
 70,
 3,
 15059,
 12850,
 3168,
 409,
 542,
 717,
 505,
 1139,
 877,
 20399,
 17187,
 375,
 14061,
 3,
 18694,
 11081,
 10788,
 722,
 24698,
 6472,
 103,
 16773,
 1332,
 3,
 2197,
 1164,
 1342,
 3,
 26664,
 12966,
 789,
 1195,
 4,
 206,
 375,
 18665,
 31,
 304,
 2808,
 5,
 24456,
 41,
 12562,
 3,
 17222,
 108,
 21,
 1367,
 14434,
 8768,
 304,
 2898,
 270,
 1221,
 2808,
 5,
 1324,
 1065,
 498,
 4,
 22,
 2962,
 606,
 1214,
 5588,
 25,
 15059,
 12850,
 3168,
 409,
 542,
 717,
 505,
 1139,
 877,
 20399,
 17187,
 375,
 14061,
 18694,
 11081,
 10788,
 722,
 24698,
 6472,
 103,
 16773,
 1332,
 3168,
 409,
 542,
 717,
 505,
 1139,
 877,
 20399,
 17187,
 375,
 14061,
 24698,
 16,
 5968,
 1332,
 3,
 1139,
 1342,
 1163,
 23,
 586,
 11,
 1301,
 3883,
 3168,
 409,
 542,
 717,
 505,
 1139,
 877,
 20399,
 17187,
 375,
 14061,
 24698,
 16,
 5968,
 1332,
 3,
 1139,
 1342,
 1163,
 23,


In [13]:
tokenizer.decode(tokenized_ds["train"][0]["input_ids"])

'摘要生成: 中国军网2014-12-1709:08:0412月16日,黑龙江省军区驻黑河某边防团机动步兵连官兵,冒着-30°C严寒气温进行体能训练,挑战极寒,锻造钢筋铁骨。该连素有“世界冠军的摇篮”之称,曾有5人24人次登上世界军事五项冠军的领奖台。(魏建顺摄)黑龙江省军区驻黑河某边防团机动步兵连官兵冒着-30°C严寒气温进行体能训练驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜官兵睫毛上都被冻上了冰霜官兵们睫毛上都被冻上了冰霜驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练</s>'

In [14]:
tokenizer.decode(tokenized_ds["train"][0]["labels"])

'组图:黑河边防军人零下30°C户外训练,冰霜沾满眉毛和睫毛,防寒服上满是冰霜。</s>'

In [15]:
# Load the pretrained seq2seq model from the same checkpoint name that the
# tokenizer was created from.
model = AutoModelForSeq2SeqLM.from_pretrained("Langboat/mengzi-t5-base")

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

KeyboardInterrupt: 

## Step5 建立評估函數

In [None]:
tokenizer.pad_token_id


0

In [None]:
import numpy as np
from rouge_chinese import Rouge

rouge = Rouge()

def compute_metric(evalPred):
    """Compute character-level ROUGE F-scores for a batch of generated summaries.

    Args:
        evalPred: Pair ``(predictions, labels)`` of token-id arrays, as handed
            to ``compute_metrics`` by the trainer.

    Returns:
        dict with the averaged F-scores under the keys "rouge-1", "rouge-2"
        and "rouge-l".
    """
    predictions, labels = evalPred
    # Some models return a tuple of outputs; the generated ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions. It is not a valid vocabulary id, so
    # it has to be swapped for the pad token before decoding. This applies to
    # the predictions as well as the labels, because the trainer may pad
    # generated sequences with -100 when gathering batches.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decode_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Space-separate every character so ROUGE is computed over characters.
    # A blank generation would make rouge raise "Hypothesis is empty."; the
    # multi-character placeholder cannot match any single-character reference
    # token, so such a sample simply scores 0 instead of aborting evaluation.
    decode_preds = [" ".join(p.strip()) or "<empty>" for p in decode_preds]
    decode_labels = [" ".join(l) for l in decode_labels]

    scores = rouge.get_scores(decode_preds, decode_labels, avg=True)
    return {
        "rouge-1": scores["rouge-1"]["f"],
        "rouge-2": scores["rouge-2"]["f"],
        "rouge-l": scores["rouge-l"]["f"],
    }


## Step6 配置訓練參數

In [None]:
args = Seq2SeqTrainingArguments(
    output_dir="./summary",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # 4 samples x 8 accumulation steps = 32 samples per optimizer step per device.
    gradient_accumulation_steps=8,
    logging_steps=8,
    # Evaluate and checkpoint on the same schedule, which is required for
    # load_best_model_at_end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Rank checkpoints by the "rouge-l" value returned from compute_metric and
    # restore the best one when training finishes.
    metric_for_best_model="rouge-l",
    load_best_model_at_end=True,
    predict_with_generate=True,
    # Generate up to the same 64-token budget used for the labels; without
    # this, evaluation falls back to the model's default generation length,
    # which may cut summaries short and depress ROUGE.
    generation_max_length=64,
)

## Step7 創建訓練器

In [None]:
# Wire the model, data, metric function and training arguments together.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    # Collator built from the tokenizer; batches the tokenized features.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=compute_metric,
)

## Step8 模型訓練

In [None]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

## Step9 模型推理

In [None]:
from transformers import pipeline

In [None]:
# Run on the first GPU when CUDA is available, otherwise fall back to CPU (-1)
# so the notebook also works on machines without a GPU.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Use exactly the same prompt prefix as process_func ("摘要生成: \n") so the
# model sees the input format it was trained on. do_sample=True means the
# generated summary can differ between runs.
pipe("摘要生成: \n" + ds["test"][-1]["content"], max_length=64, do_sample=True)

In [None]:
ds["test"][-1]["title"]