In [1]:
import torch
import datasets
import lawrouge

from typing import Dict
from datasets import load_dataset

from torch.utils.data import DataLoader

from transformers import (Seq2SeqTrainingArguments, 
                          Seq2SeqTrainer, 
                          BartForConditionalGeneration)

from transformers import BertTokenizer

In [2]:
batch_size = 32
epochs = 10
learning_rate = 1e-4

max_input_length = 512  # max input length
max_target_length = 128  # max output length

In [3]:
# load data
dataset = load_dataset('json', data_files='dataset/nlpcc2017_clean.json', field='data')

# load tokenizer -> Chinese bart uses bert's tokenizer
tokenizer = BertTokenizer.from_pretrained("pretrained_models/bart-base-chinese")

# load model
model = BartForConditionalGeneration.from_pretrained("pretrained_models/bart-base-chinese")

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['content', 'title'],
        num_rows: 49944
    })
})

In [5]:
def flatten(example):
    return {
        "document": example["content"],
        "summary": example["title"],
        "id":"0"
    }

# 将原始数据中的content和title转换为document和summary
dataset = dataset["train"].map(flatten, remove_columns=["title", "content"])

dataset

Map:   0%|          | 0/49944 [00:00<?, ? examples/s]

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 49944
})

In [6]:
train_dataset, test_dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42).values()

train_dataset, valid_dataset = train_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42).values()

datasets = datasets.DatasetDict({"train":train_dataset, "validation": valid_dataset, "test":test_dataset})

print(datasets)
print()
print(datasets["validation"][7])

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 40454
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 4495
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 4995
    })
})

{'document': 'A股市场的强势支撑新基金发行市场的持续火爆,尤其是二季度以来热度持续升温。据WIND数据显示,今年以来截止6月4日,共有8只基金的首募规模超过百亿,且全部诞生于二季度。这8只首募规模超百亿的基金包括易方达新丝路、易方达新常态、东方红中国优势、富国改革动力、中邮信息产业、景顺长城沪港深精选、工银瑞信丰盈回报、易方达并购重组,其中,易方达新常态、富国改革动力、中邮信息产业募集时间均只有一天,易方达新丝路募集时间则为2天。从基金公司角度看,上述8只基金中,有3只隶属于易方达基金。成立于5月28日的易方达新丝路首募规模达到286亿,创下本轮牛市中基金首募规模新纪录。而于5月29日结束募集的易方达并购重组发行规模则达到100.34亿。此前,易方达在4月30日成立的易方达新常态首募规模也达到146.63亿,成立一个月以来,其净值涨幅约为20%。仅凭这三只基金,易方达在二季度的规模就将增加约534亿元。WIND数据也显示,目前易方达公募资产规模为3151亿元,超过工银瑞信,仅次于天弘和华夏基金之后。东证资管发行的东方红中国优势成立于4月7日,首募规模为138.56亿元,排名第三。但成立近两个月以来其业绩表现并不出彩,截止6月4日净值涨幅仅有10.3%左右。由中邮基金经理任泽松管理的中邮信息产业在5月14日成立,首募规模达到126.02亿元。近两年,任泽松基金投资业绩出彩,他管理的中邮战略新兴产业在2013年净值增长率为80%,在同类基金中排名第一。富国改革动力、景顺长城沪港深精选、工银瑞信丰盈回报分别募得132.55亿

In [8]:
def preprocess_function(examples):
    """
    document作为输入，summary作为标签
    """
    model_inputs = tokenizer(examples["document"], max_length=max_input_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [9]:
tokenized_datasets = datasets.map(preprocess_function, batched=True, remove_columns=["document", "summary", "id"])

print(tokenized_datasets["train"][7].keys())
print()
print(tokenized_datasets["train"][7])

Map:   0%|          | 0/40454 [00:00<?, ? examples/s]



Map:   0%|          | 0/4495 [00:00<?, ? examples/s]

Map:   0%|          | 0/4995 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

{'input_ids': [101, 4941, 7178, 20466, 5967, 17342, 20448, 112, 20449, 17520, 21768, 11045, 18544, 113, 20449, 17520, 30832, 10936, 5100, 20815, 10771, 21764, 17342, 16224, 18458, 9582, 116, 21490, 10936, 116, 4941, 7730, 20815, 10771, 4907, 10185, 30901, 9051, 4941, 7730, 20815, 10771, 7385, 22859, 5369, 23195, 9485, 8318, 8505, 8909, 16022, 4915, 23660, 20849, 21995, 33868, 28097, 5085, 5722, 116, 10756, 10210, 7229, 10892, 17229, 16383, 14788, 5036, 5227, 10762, 5756, 5228, 10016, 7165, 23228, 8362, 9108, 15978, 20846, 19635, 20833, 3565, 5763, 5768, 15978, 20846, 5228, 10016, 5122, 6365, 7385, 8938, 11684, 10008, 6222, 10765, 21568, 3566, 18008, 12323, 116, 4941, 7730, 20815, 10771, 7385, 22859, 5369, 23195, 9485, 8318, 8505, 8909, 16022, 4915, 23660, 20849, 21995, 23671, 16413, 8922, 4907, 21471, 8321, 12435, 3566, 20815, 10771, 21764, 19916, 12543, 6423, 7229, 6222, 5036, 30901, 9051, 125, 11225, 30959, 10936

In [10]:
def collate_fn(features: Dict):
    batch_input_ids = torch.tensor([feature["input_ids"] for feature in features], dtype=torch.long)
    batch_attention_mask = torch.tensor([feature["attention_mask"] for feature in features], dtype=torch.long)
    batch_labels = torch.tensor([feature["labels"] for feature in features], dtype=torch.long)
    
    return {
        "input_ids": batch_input_ids,
        "attention_mask": batch_attention_mask,
        "labels": batch_labels
    }

In [11]:
# 构建DataLoader来验证collate_fn
dataloader = DataLoader(tokenized_datasets["test"], shuffle=False, batch_size=4, collate_fn=collate_fn)
batch = next(iter(dataloader))

print(batch['input_ids'].shape)
print(batch['attention_mask'].shape)
print(batch['labels'].shape)

torch.Size([4, 512])
torch.Size([4, 512])
torch.Size([4, 128])


In [12]:
output = model(**batch) # 验证前向传播
print(output)

Seq2SeqLMOutput(loss=tensor(11.2249, grad_fn=<NllLossBackward0>), logits=tensor([[[-3.9331e+00, -3.5424e+00, -3.2559e+00,  ..., -6.7162e+00,
          -2.7287e+00, -3.1338e+00],
         [-6.9967e+00, -6.6629e+00, -6.6339e+00,  ..., -6.2032e+00,
          -2.0105e+00, -3.7387e+00],
         [-6.8036e+00, -6.7613e+00, -6.2919e+00,  ..., -5.2353e+00,
          -9.0203e-01, -2.7664e+00],
         ...,
         [-3.0006e+00, -3.3891e+00, -2.7181e+00,  ..., -2.2596e+00,
          -6.0266e-02, -8.1703e-01],
         [-2.8956e+00, -3.3000e+00, -2.6745e+00,  ..., -2.1347e+00,
          -3.7229e-02, -7.9668e-01],
         [-2.9717e+00, -3.3836e+00, -2.7269e+00,  ..., -2.2174e+00,
          -5.7367e-02, -7.8783e-01]],

        [[-3.8879e+00, -3.3694e+00, -3.0851e+00,  ..., -5.5803e+00,
          -3.1251e+00, -2.9459e+00],
         [-7.0883e+00, -6.7556e+00, -6.4956e+00,  ..., -5.7501e+00,
          -2.4619e+00, -4.4487e+00],
         [-6.8258e+00, -6.7970e+00, -7.1764e+00,  ..., -6.4246e+00,
   

In [13]:
output

Seq2SeqLMOutput(loss=tensor(11.2249, grad_fn=<NllLossBackward0>), logits=tensor([[[-3.9331e+00, -3.5424e+00, -3.2559e+00,  ..., -6.7162e+00,
          -2.7287e+00, -3.1338e+00],
         [-6.9967e+00, -6.6629e+00, -6.6339e+00,  ..., -6.2032e+00,
          -2.0105e+00, -3.7387e+00],
         [-6.8036e+00, -6.7613e+00, -6.2919e+00,  ..., -5.2353e+00,
          -9.0203e-01, -2.7664e+00],
         ...,
         [-3.0006e+00, -3.3891e+00, -2.7181e+00,  ..., -2.2596e+00,
          -6.0266e-02, -8.1703e-01],
         [-2.8956e+00, -3.3000e+00, -2.6745e+00,  ..., -2.1347e+00,
          -3.7229e-02, -7.9668e-01],
         [-2.9717e+00, -3.3836e+00, -2.7269e+00,  ..., -2.2174e+00,
          -5.7367e-02, -7.8783e-01]],

        [[-3.8879e+00, -3.3694e+00, -3.0851e+00,  ..., -5.5803e+00,
          -3.1251e+00, -2.9459e+00],
         [-7.0883e+00, -6.7556e+00, -6.4956e+00,  ..., -5.7501e+00,
          -2.4619e+00, -4.4487e+00],
         [-6.8258e+00, -6.7970e+00, -7.1764e+00,  ..., -6.4246e+00,
   

In [14]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    # 将预测的 id 转换为 token
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # 将标签转换为 token
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # 去掉解码后的空格
    decoded_preds = ["".join(pred.split()) for pred in decoded_preds]
    decoded_labels = ["".join(label.split()) for label in decoded_labels]
    
    # 计算 ROUGE
    rouge = lawrouge.Rouge()
    result = rouge.get_scores(decoded_preds, decoded_labels, avg=True)
    result = {
        'rouge-1': result['rouge-1']['f'],
        'rouge-2': result['rouge-2']['f'],
        'rouge-l': result['rouge-l']['f']
    }
    
    # 将结果转换为百分比
    result = {key: value * 100 for key, value in result.items()}
    
    return result

In [15]:
# How to use the metrics function
predictions = ['我今天的午餐是牛肉饼', '我明天的晚餐是汉堡']
targets = ['我今天中午吃牛肉饼', '我明天晚上吃汉堡']

# 将示例数据进行tokenize
predictions_tokenized = tokenizer(predictions, max_length=max_target_length, padding=True, truncation=True, return_tensors='pt')
targets_tokenized = tokenizer(targets, max_length=max_target_length, padding=True, truncation=True, return_tensors='pt')

# 模拟 eval_pred 格式
eval_pred = (predictions_tokenized['input_ids'], targets_tokenized['input_ids'])

# 计算评估指标
results = compute_metrics(eval_pred)

# 输出结果
print(results)

{'rouge-1': 72.13622241177428, 'rouge-2': 43.52941126668205, 'rouge-l': 72.13622241177428}


In [16]:
# 设置训练参数
args = Seq2SeqTrainingArguments(
    output_dir="/content/results", # 模型保存路径
    num_train_epochs=epochs,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    warmup_steps=500,
    weight_decay=0.0001,
    predict_with_generate=True,
    logging_dir="/content/logs",
    logging_steps=2000,
    evaluation_strategy="steps",
    eval_steps=2000,  # 设置评估步数
    save_steps=2000,  # 设置保存步数
    save_total_limit=3,
    generation_max_length=max_target_length, # 生成的最大长度
    generation_num_beams=3, # beam search -> 1 is greedy search
    load_best_model_at_end=True,
    metric_for_best_model="rouge-1"
)

In [17]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
# 打印验证集上的结果
print(trainer.evaluate(tokenized_datasets["validation"]))
# 打印测试集上的结果
print(trainer.evaluate(tokenized_datasets["test"]))

In [None]:
# 保存最终模型
trainer.save_model("results/best")

# 加载模型并测试

In [18]:
model = BartForConditionalGeneration.from_pretrained('logs/best')
tokenizer = BertTokenizer.from_pretrained("logs/best")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'The device you are using is {device}')

model.to(device)

The device you are using is cuda


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(51271, 768, padding_idx=0)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(51271, 768, padding_idx=0)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), 

In [19]:
def predict(text_, model_, tokenizer_):
    tokenized_text = tokenizer_(text_, padding=True, truncation=True, max_length=max_target_length, return_tensors='pt')
    
    input_ids = tokenized_text['input_ids'].to(model_.device)
    attention_mask = tokenized_text['attention_mask'].to(model_.device)
    
    output_ = model_.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_target_length)
    
    result_ = tokenizer_.decode(token_ids=output_[0], skip_special_tokens=True).replace(' ', '')
    
    return result_

In [20]:
text = '中新网6月11日电据外媒11日报道,在美国,由于私人飞机的价格太贵,学习开飞机的人有所减少,美国数百个小机场已经关闭,部分机场被改为赛车场或是农田。艾奥瓦州的一位官员说,当地的一座小机场已不再使用,因此地方议会决定将其关闭,机场跑道被改成赛车场。在美国,许多小城镇自上世纪20年代以来开始兴建机场,二战后许多军用飞机的驾驶员退役,但回乡后仍希望继续飞行,因此小机场的规模得到扩大。美国有私人飞机飞行执照的人在上世纪80年代初最多,但如今已降至18.8万人,使得全国数百个小机场关闭。专家指出,数十年前,买一架新的小飞机约需1.3万美元,如今需要25万美元以上。而且飞机还要使用特种航油、买保险、维修、建机库和停机坪。在小飞机的驾驶员人数下降之际,美国商业飞行的旅客却在增多,预计今年的航空旅客将创下纪录。但如今,航空飞行的魅力对许多人而言已不如过去,商业航班经常晚点,一位老飞行员表示,美国民众乘飞机旅行的兴趣已经大大减少。'

result = predict(text, model, tokenizer)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [21]:
# ground truth: 由于私人飞机价格昂贵,学习飞行技术人数减少,美国数百个小机场已经关闭,部分机场被改为赛车场或是农田。
print(result)

数百个小机场已经关闭,部分机场被改为赛车场或是农田;由于私人飞机价格昂贵,学习开飞机的人数减少。
