In [1]:
import json
from typing import Dict, List

import datasets
import lawrouge
import torch
from datasets import DatasetDict, load_dataset
from torch.utils.data import DataLoader
from transformers import (BartForConditionalGeneration,
                          BertTokenizer,
                          Seq2SeqTrainer,
                          Seq2SeqTrainingArguments)

In [None]:
# !pip install lawrouge
# !pip install datasets
# !pip install accelerate -U
# !pip install transformers[torch]

In [3]:
def get_data(source_path, target_path, output_file, use_partial=None):
    """Pair a source file and a target file line by line and save them as JSON Lines.

    Each output line is a JSON object ``{"source": ..., "target": ...}`` built
    from the whitespace-stripped lines at the same position in both files.

    Args:
        source_path: UTF-8 text file with one source text per line.
        target_path: UTF-8 text file with one target text per line.
        output_file: Path of the JSON Lines file to write (opened in ``'w'`` mode).
        use_partial: If not ``None``, both files are cut with ``lines[:use_partial]``
            before the line counts are compared.

    Raises:
        AssertionError: If the two (possibly truncated) files differ in line
            count. Nothing is written in that case.
    """
    with open(source_path, 'r', encoding='utf-8') as src_f, open(target_path, 'r', encoding='utf-8') as tar_f:
        src_lines = src_f.readlines()
        tar_lines = tar_f.readlines()

    if use_partial is not None:
        src_lines = src_lines[:use_partial]
        tar_lines = tar_lines[:use_partial]

    # Raised explicitly instead of using `assert`, which is removed under `python -O`
    # and would then let misaligned pairs be silently truncated by zip().
    if len(src_lines) != len(tar_lines):
        raise AssertionError('Source and target files must have the same number of lines')

    # Stream records straight to disk; no intermediate list of dicts is needed.
    with open(output_file, 'w', encoding='utf-8') as f:
        for src, tar in zip(src_lines, tar_lines):
            sample = {'source': src.strip(), 'target': tar.strip()}
            # ensure_ascii=False keeps non-ASCII text readable in the output file
            f.write(json.dumps(sample, ensure_ascii=False))
            f.write('\n')

In [4]:
# Convert each LCSTS split to a JSON Lines file; only the training split is truncated.
get_data(
    source_path='dataset/LCSTS/train.src.txt',
    target_path='dataset/LCSTS/train.tgt.txt',
    output_file='dataset/LCSTS/train_data.json',
    use_partial=50000,
)
get_data(
    source_path='dataset/LCSTS/valid.src.txt',
    target_path='dataset/LCSTS/valid.tgt.txt',
    output_file='dataset/LCSTS/val_data.json',
)
get_data(
    source_path='dataset/LCSTS/test.src.txt',
    target_path='dataset/LCSTS/test.tgt.txt',
    output_file='dataset/LCSTS/test_data.json',
)

In [12]:
# Load the JSON Lines splits written earlier
train_dataset = load_dataset(path='json', data_files='dataset/LCSTS/train_data.json')
test_dataset = load_dataset(path='json', data_files='dataset/LCSTS/test_data.json')
val_dataset = load_dataset(path='json', data_files='dataset/LCSTS/val_data.json')

# Load the tokenizer; the Chinese BART checkpoint uses a BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("fnlp/bart-base-chinese")

In [13]:
def flatten(example):
    """Rename one raw record's fields to the names used by the rest of the notebook.

    ``source`` becomes ``document``, ``target`` becomes ``summary``, and the
    constant ``id`` value ``'0'`` is attached to every record.
    """
    renamed = dict(document=example['source'], summary=example['target'])
    renamed['id'] = '0'
    return renamed

# Each load_dataset result holds its records under the 'train' key; take that
# split, rename the fields with `flatten`, and drop the original columns.
train_dataset = train_dataset['train'].map(
    function=flatten,
    remove_columns=['source', 'target'],
)
test_dataset = test_dataset['train'].map(
    function=flatten,
    remove_columns=['source', 'target'],
)
val_dataset = val_dataset['train'].map(
    function=flatten,
    remove_columns=['source', 'target'],
)

Map:   0%|          | 0/10666 [00:00<?, ? examples/s]

Map:   0%|          | 0/1106 [00:00<?, ? examples/s]

In [17]:
# Build the DatasetDict through the directly imported class: the assignment
# below rebinds the name `datasets` from the imported module to the resulting
# DatasetDict, so `datasets.DatasetDict` would fail if this cell were run again.
# The variable name is kept because later cells refer to it.
datasets = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})

print(datasets)

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 1106
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 10666
    })
})


In [18]:
# Inspect one validation record to check the renamed fields
print(datasets["validation"][7])

{'document': '今天有传在北京某小区，一光头明星因吸毒被捕的消息。下午北京警方官方微博发布声明通报情况，证实该明星为李代沫。李代沫伙同另外6人，于17日晚在北京朝阳区三里屯某小区的暂住地内吸食毒品，6人全部被警方抓获，且当事人对犯案实施供认不讳。', 'summary': '北京警方确认李代沫吸毒被捕(图)', 'id': '0'}


In [19]:
# Training hyperparameters shared by the tokenization and trainer cells
batch_size = 32
epochs = 10

max_input_length = 256 # maximum tokenized input length
max_target_length = 128 # maximum tokenized output length

learning_rate = 1e-4

In [20]:
def preprocess_function(examples):
    """Tokenize a batch: ``document`` is the model input, ``summary`` the labels.

    Documents are padded/truncated to ``max_input_length`` and summaries to
    ``max_target_length`` using the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping with ``"document"`` and ``"summary"`` entries.

    Returns:
        The tokenizer output for the documents, with an added ``"labels"``
        entry holding the ``input_ids`` of the tokenized summaries.
    """
    model_inputs = tokenizer(
        examples["document"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    # NOTE(review): padded label positions keep the tokenizer's padding id, so
    # they presumably count toward the training loss. Masking them (e.g. with
    # -100) would also require mapping them back before decoding in the metric
    # function -- confirm whether this is intended.
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [21]:
# Tokenize every split in batches and drop the raw text columns
tokenized_datasets = datasets.map(
    function=preprocess_function,
    batched=True,
    remove_columns=["document", "summary", "id"],
)

print(tokenized_datasets["train"][7].keys())

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1106 [00:00<?, ? examples/s]

Map:   0%|          | 0/10666 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [22]:
# Display the tokenized splits (features and row counts)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1106
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 10666
    })
})

In [23]:
# Print one tokenized training example in full (the output includes the padding)
print(tokenized_datasets["train"][7])

{'input_ids': [101, 21490, 10936, 6188, 5066, 8938, 6411, 12688, 8500, 3565, 6188, 5066, 8938, 11658, 8403, 23137, 17598, 6427, 6399, 9191, 5080, 12495, 15206, 15350, 6507, 21545, 5144, 16423, 14502, 10892, 9206, 10765, 23397, 8909, 5241, 5175, 20447, 25818, 8344, 8939, 5028, 6188, 5066, 8938, 16306, 4896, 8510, 33345, 6432, 5080, 12495, 15206, 15350, 6507, 15134, 5144, 6544, 5834, 8335, 25818, 9053, 17202, 17205, 15206, 15350, 6507, 6436, 8351, 12688, 23236, 5232, 8344, 20179, 3566, 5080, 12495, 15206, 15350, 6507, 5122, 2483, 16306, 4905, 10892, 2484, 6350, 15206, 15350, 11658, 8403, 23137, 6067, 11541, 25818, 11274, 5965, 4906, 5493, 5959, 5028, 15245, 5040, 5965, 19631, 3566, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [32]:
def collate_fn(features: List[Dict]):
    """Stack per-example id lists into batched ``torch.long`` tensors.

    Args:
        features: One mapping per example. Each must provide ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"``; any other keys are ignored.
            Within a batch the lists under one key must share the same length,
            because they are passed to ``torch.tensor`` as a nested list.

    Returns:
        Dict with the keys ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"``, each a tensor whose first dimension is ``len(features)``.
    """
    def stack(key):
        # One row per example, in the order the examples were given
        return torch.tensor([feature[key] for feature in features], dtype=torch.long)

    return {
        "input_ids": stack("input_ids"),
        "attention_mask": stack("attention_mask"),
        "labels": stack("labels"),
    }

In [33]:
# Build a DataLoader to verify collate_fn on a small validation batch
dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=4,
    shuffle=False,
    collate_fn=collate_fn,
)
batch = next(iter(dataloader))

print(batch['input_ids'].shape)
print(batch['attention_mask'].shape)
print(batch['labels'].shape)

torch.Size([4, 256])
torch.Size([4, 256])
torch.Size([4, 128])


In [34]:
# Load the pretrained Chinese BART checkpoint for conditional generation
model = BartForConditionalGeneration.from_pretrained("fnlp/bart-base-chinese")

model.safetensors:   0%|          | 0.00/561M [00:00<?, ?B/s]

In [35]:
output = model(**batch) # sanity-check the forward pass on the collated validation batch
print(output)

Seq2SeqLMOutput(loss=tensor(13.2304, grad_fn=<NllLossBackward0>), logits=tensor([[[-3.1715, -2.9859, -2.2114,  ..., -3.7780, -2.5146, -1.5349],
         [-7.3804, -7.1388, -7.0963,  ..., -5.5368, -2.8200, -3.3821],
         [-7.9509, -8.5210, -8.5947,  ..., -7.5362, -5.7510, -6.2699],
         ...,
         [-2.6390, -3.2971, -2.9836,  ..., -1.0758, -0.9355, -0.3422],
         [-2.5106, -3.2141, -2.9439,  ..., -0.9418, -0.9477, -0.3904],
         [-2.6441, -3.3174, -3.0127,  ..., -1.0721, -0.8910, -0.3352]],

        [[-4.0979, -3.8783, -3.1265,  ..., -4.7564, -3.9721, -2.7715],
         [-6.5775, -6.4944, -6.8897,  ..., -3.4292, -1.3200, -3.5792],
         [-6.4953, -6.8784, -6.9473,  ..., -3.1584, -2.0840, -2.6658],
         ...,
         [-2.4619, -3.1553, -2.7983,  ..., -0.2950, -1.6573, -0.4402],
         [-2.4104, -3.0951, -2.8055,  ..., -0.1244, -1.6583, -0.5303],
         [-2.4324, -3.1394, -2.8088,  ..., -0.2124, -1.5722, -0.3626]],

        [[-4.2039, -3.7437, -3.4780,  ..., 

In [36]:
def compute_metrics(eval_pred):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-scores, expressed as percentages.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id batches
            (tensors, arrays or nested lists).

    Returns:
        Dict with the keys ``'rouge-1'``, ``'rouge-2'`` and ``'rouge-l'``, each
        the averaged F-score multiplied by 100.
    """
    predictions, labels = eval_pred

    pad_id = tokenizer.pad_token_id

    def restore_padding(token_ids):
        # -100 is the conventional "ignore" marker for masked labels and padded
        # evaluation arrays; it is not a vocabulary id, so map it back to the
        # padding id before decoding. This is a no-op when no -100 is present.
        ids = torch.as_tensor(token_ids)
        return ids.masked_fill(ids == -100, pad_id)

    # Convert predicted ids and label ids to text
    decoded_preds = tokenizer.batch_decode(restore_padding(predictions), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(restore_padding(labels), skip_special_tokens=True)

    # Remove the whitespace left between tokens by decoding
    decoded_preds = ["".join(pred.split()) for pred in decoded_preds]
    decoded_labels = ["".join(label.split()) for label in decoded_labels]

    # Compute ROUGE, averaged over the batch
    rouge = lawrouge.Rouge()
    scores = rouge.get_scores(decoded_preds, decoded_labels, avg=True)

    # Keep only the F-scores and convert them to percentages
    return {
        'rouge-1': scores['rouge-1']['f'] * 100,
        'rouge-2': scores['rouge-2']['f'] * 100,
        'rouge-l': scores['rouge-l']['f'] * 100,
    }

In [37]:
# Example data -> given as lists, because the function uses batch_decode
predictions = ['我今天的午餐是牛肉饼', '我明天的晚餐是汉堡']
targets = ['我今天中午吃牛肉饼', '我明天晚上吃汉堡']

# Tokenize the example data
predictions_tokenized = tokenizer(predictions, max_length=max_target_length, padding=True, truncation=True, return_tensors='pt')
targets_tokenized = tokenizer(targets, max_length=max_target_length, padding=True, truncation=True, return_tensors='pt')

# Mimic the eval_pred format: (prediction ids, label ids)
eval_pred = (predictions_tokenized['input_ids'], targets_tokenized['input_ids'])

# Compute the evaluation metrics
results = compute_metrics(eval_pred)

# Print the result
print(results)

{'rouge-1': 72.13622241177428, 'rouge-2': 43.52941126668205, 'rouge-l': 72.13622241177428}


In [38]:
# Training configuration
args = Seq2SeqTrainingArguments(
    # Output locations
    output_dir="/content/results",  # where the model is saved
    logging_dir="/content/logs",
    save_total_limit=3,
    # Run both training and evaluation
    do_train=True,
    do_eval=True,
    # Optimisation
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    warmup_steps=500,
    weight_decay=0.0001,
    # Logging, evaluation and saving share the same step interval
    logging_steps=2000,
    evaluation_strategy="steps",
    eval_steps=2000,  # evaluation interval in steps
    save_steps=2000,  # checkpoint interval in steps
    # Evaluate by generating summaries
    predict_with_generate=True,
    generation_max_length=max_target_length,  # maximum generated length
    generation_num_beams=3,  # beam search -> 1 is greedy search
    # Best-checkpoint selection
    load_best_model_at_end=True,
    metric_for_best_model="rouge-1",
)

In [39]:
# Wire the model, arguments, tokenized splits, collator and metric into the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration above
trainer.train()

In [None]:
# Print the results on the validation set (metric keys use the default "eval" prefix)
print(trainer.evaluate(tokenized_datasets["validation"]))

# Print the results on the test set; a separate prefix keeps these metrics from
# being reported under the same "eval_" keys as the validation metrics
print(trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="test"))

In [None]:
# Save the final model
trainer.save_model("results/best")