<a href="https://colab.research.google.com/github/adnil8130/T5LittleGenQA/blob/main/T5%E7%94%9F%E6%88%90%E5%BC%8F%E9%97%AE%E7%AD%94.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 目标
训练一个生成式问答模型，base模型采用Google T5-Base("uer/t5-base-chinese-cluecorpussmall")

预训练模型地址：https://huggingface.co/uer/t5-base-chinese-cluecorpussmall

模型的评价指标采用BLEU-1，BLEU-2，BLEU-3，BLEU-4。

# 数据集
数据集：链接：https://pan.quark.cn/s/6d4a98cd65f2    

提取码：bzne

数据的格式如下：
```
{"context": "违规分为:一般违规扣分、严重违规扣分、出售假冒商品违规扣分,淘宝网每年12月31日24:00点会对符合条件的扣分做清零处理,详情如下:|温馨提醒:由于出售假冒商品24≤N<48分,当年的24分不清零,所以会存在第一年和第二年的不同计分情况。", "answer": "12月31日24:00", "question": "淘宝扣分什么时候清零", "id": 203}
```

In [89]:
import transformers
import numpy as np
import torch
transformers.__version__

'4.48.3'

# 1. 准备数据

In [90]:
from torch.utils.data import Dataset, random_split
import json

In [91]:
! pwd

/content


In [92]:
max_context_len = 0
max_question_len = 0
max_answer_len = 0
context = ""
questiont = ""
answer = ""

train_data_address = '/content/drive/MyDrive/train.json'
dev_data_address = '/content/drive/MyDrive/dev.json'

with open(train_data_address, 'rt', encoding='utf-8') as f:
  for idx, line in enumerate(f):
    sample = json.loads(line.strip())
    if len(sample["context"]) > max_question_len:
        max_context_len = len(sample["context"])
        context = sample["context"]
    if len(sample["question"]) > max_question_len:
        max_question_len = len(sample["question"])
        question = sample["question"]
    if len(sample["answer"]) > max_answer_len:
        max_answer_len = len(sample["answer"])
        answer = sample["answer"]

with open(dev_data_address, 'rt', encoding='utf-8') as f:
  for idx, line in enumerate(f):
    sample = json.loads(line.strip())
    if len(sample["context"]) > max_question_len:
        max_context_len = len(sample["context"])
        context = sample["context"]
    if len(sample["question"]) > max_question_len:
        max_question_len = len(sample["question"])
        question = sample["question"]
    if len(sample["answer"]) > max_answer_len:
        max_answer_len = len(sample["answer"])
        answer = sample["answer"]

print("最长context", max_context_len, context)
print("最长question", max_question_len, question)
print("最长answer", max_answer_len, answer)

最长context 107 2015下半年教师资格证考试时间为11月1日，考生可于2015年10月26日—10月31日登录报名系统，根据提示下载pdf准考证文件。下载后，仔细核对个人信息，并直接打印成准考，按准考证上的要求到指定地点参加考试。
最长question 42 痞子猪身上是什么字母? (问题由猫小逗提供)【答题格式为da+答案,例如答案是爱消除
最长answer 110 如果下雨的时候你拖着行李箱子站在屋檐下面那么其实我没有足够的时间找一个好一点的理由抛弃家里面的狗坐上K667次列车到你在的地方找个商店买一把伞然后给我妹妹弹吉他因为她要参加比赛所以我回不去了我也不会给你说我泡面的碗还没洗


## 构建数据集

In [93]:
train_ratio = 0.9
max_dataset_size = 22000
train_set_size = 20000
valid_set_size = 2000

class GenQA(Dataset):
  def __init__(self, data_file):
    self.data = self.load_data(data_file)

  def load_data(self, data_file):
    Data = {}
    with open(data_file, 'rt', encoding='utf-8') as f:
      for idx, line in enumerate(f):
        if idx >= max_dataset_size:
            break
        sample = json.loads(line.strip())
        Data[idx] = sample
    return Data

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    return self.data[idx]

data = GenQA(train_data_address)
data_size = len(data)

train_size = int(train_ratio * data_size)
valid_size = data_size - train_size
train_data, valid_data = random_split(data, [train_size, valid_size])
test_data = GenQA(dev_data_address)

In [94]:
print(f'train set size: {len(train_data)}')
print(f'valid set size: {len(valid_data)}')
print(f'test set size: {len(test_data)}')
print(next(iter(train_data)))

train set size: 13068
valid set size: 1452
test set size: 984
{'context': '阿瓦隆（Avalon）是亚瑟王传说中的精灵国度。亚瑟在与莫德雷德的激战中死亡（一说重伤），一艘船将他带到了阿瓦隆岛。http://baike.baidu.com/subview/527381/9410559.htm?fromtitle=Avalon&fromid=9953433&type=search', 'answer': '阿瓦隆岛', 'question': '亚瑟王长眠于何处', 'id': 3103}


## 数据预处理

In [95]:
from transformers import AutoTokenizer
model_checkpoint = 'uer/t5-base-chinese-cluecorpussmall'
# model_checkpoint = 'uer/t5-small-chinese-cluecorpussmall'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,return_token_type_ids=False)

In [96]:
context = train_data[0]["context"]
question = train_data[0]["question"]
answer = train_data[0]["answer"]

inputs = tokenizer(context, question)
targets = tokenizer(answer)

In [97]:
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))

['[CLS]', '阿', '瓦', '隆', '（', '[UNK]', '）', '是', '亚', '瑟', '王', '传', '说', '中', '的', '精', '灵', '国', '度', '。', '亚', '瑟', '在', '与', '莫', '德', '雷', '德', '的', '激', '战', '中', '死', '亡', '（', '一', '说', '重', '伤', '）', '，', '一', '艘', '船', '将', '他', '带', '到', '了', '阿', '瓦', '隆', '岛', '。', 'http', ':', '/', '/', 'ba', '##ike', '.', 'baidu', '.', 'com', '/', 'su', '##b', '##view', '/', '52', '##73', '##81', '/', '94', '##10', '##55', '##9', '.', 'htm', '?', 'from', '##ti', '##tle', '=', '[UNK]', '&', 'from', '##id', '=', '99', '##53', '##43', '##3', '&', 'type', '=', 'search', '[SEP]', '亚', '瑟', '王', '长', '眠', '于', '何', '处', '[SEP]']
['[CLS]', '阿', '瓦', '隆', '岛', '[SEP]']


In [98]:
import torch

max_input_length = 256
max_target_length = 256
sample_cnt = 4

inputs = [train_data[s_idx]["context"] + train_data[s_idx]["question"] for s_idx in range(sample_cnt)]
targets = [train_data[s_idx]["answer"] for s_idx in range(sample_cnt)]

model_inputs = tokenizer(
    inputs,
    padding=True,
    max_length=max_input_length,
    truncation=True,
    return_tensors="pt",
    return_token_type_ids=False
)
labels = tokenizer(
    text_target=targets,
    padding=True,
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
    return_token_type_ids=False
)["input_ids"]

end_token_index = torch.where(labels == 102)[-1]
for idx, end_idx in enumerate(end_token_index):
    labels[idx][end_idx+1:] = -100

print('batch_X shape:', {k: v.shape for k, v in model_inputs.items()})
print('batch_y shape:', labels.shape)
print(model_inputs)
print(labels)

batch_X shape: {'input_ids': torch.Size([4, 256]), 'attention_mask': torch.Size([4, 256])}
batch_y shape: torch.Size([4, 13])
{'input_ids': tensor([[ 101, 7350, 4482,  ...,    0,    0,    0],
        [ 101, 7674, 1044,  ...,    0,    0,    0],
        [ 101,  782, 4638,  ...,  119, 8115,  102],
        [ 101, 2456, 6392,  ..., 6821, 7027,  102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}
tensor([[ 101, 7350, 4482, 7384, 2270,  102, -100, -100, -100, -100, -100, -100,
         -100],
        [ 101, 8115,  119,  123, 1330, 5101,  190, 8311,  119,  123, 1330, 5101,
          102],
        [ 101,  121,  119, 9048,  102, -100, -100, -100, -100, -100, -100, -100,
         -100],
        [ 101,  124, 1039,  102, -100, -100, -100, -100, -100, -100, -100, -100,
         -100]])


In [111]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM
import gc


max_length = 256
train_batch_size = 8
test_batch_size = 32

device = 'cuda' if torch.cuda.is_available() else xm.xla_device()
print(f'Using {device} device')

def clean_cuda(device):
    if device == 'cuda':
        # 清理无用变量
        gc.collect()
        torch.cuda.empty_cache()

        # 查看清理后剩余显存
        print(f"释放后可用显存: {torch.cuda.mem_get_info()[0]/1024**3:.2f} GB")

clean_cuda(device)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

def collote_fn(batch_samples):
    batch_inputs, batch_targets = [], []
    for sample in batch_samples:
        batch_inputs.append(sample["context"] + sample["question"])
        batch_targets.append(sample['answer'])
    batch_data = tokenizer(
        batch_inputs,
        text_target=batch_targets,
        padding=True,
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
        return_token_type_ids=False
    )
    batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(batch_data['labels'])
    end_token_index = torch.where(batch_data['labels'] == 102)[-1]
    for idx, end_idx in enumerate(end_token_index):
        batch_data['labels'][idx][end_idx+1:] = -100
    return batch_data

train_dataloader = DataLoader(train_data, batch_size=train_batch_size, shuffle=True, collate_fn=collote_fn)
valid_dataloader = DataLoader(valid_data, batch_size=test_batch_size, shuffle=False, collate_fn=collote_fn)

Using cuda device
释放后可用显存: 11.75 GB


In [100]:
batch = next(iter(train_dataloader))
print(batch.keys())
print('batch shape:', {k: v.shape for k, v in batch.items()})
print(batch)

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])
batch shape: {'input_ids': torch.Size([32, 256]), 'attention_mask': torch.Size([32, 256]), 'labels': torch.Size([32, 10]), 'decoder_input_ids': torch.Size([32, 10])}
{'input_ids': tensor([[ 101, 2791, 4371,  ...,  671, 3613,  102],
        [ 101, 1765, 4026,  ...,  782, 4638,  102],
        [ 101, 4343, 1762,  ..., 6121,  749,  102],
        ...,
        [ 101, 2418, 6421,  ..., 8024,  872,  102],
        [ 101, 5283, 7188,  ...,    0,    0,    0],
        [ 101, 3336, 4502,  ..., 1914, 7415,  102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[  101,   671,   702,   102,  -100,  -100,  -100,  -100,  -100,  -100],
        [  101,  7478,  2382,  5708,   102,  -100,  -100,  -100,  -100,  -100],
        [  1

# 2. 模型训练

## 优化模型参数

In [None]:
loss_record_step = []

In [124]:
from tqdm.auto import tqdm

def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f'loss: {0:>7f}')
    finish_batch_num = (epoch-1) * len(dataloader)

    model.train()
    loss_record_step = []
    for batch, batch_data in enumerate(dataloader, start=1):
        batch_data = batch_data.to(device)
        # print(batch_data)
        outputs = model(**batch_data)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        loss_record_step.append(loss.item())
        progress_bar.set_description(f'loss: {total_loss/(finish_batch_num + batch):>7f}')
        progress_bar.update(1)
    return total_loss, loss_record_step

## 评估指标

In [102]:
! pip install sacrebleu



In [103]:
from sacrebleu.metrics import BLEU

predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
bad_predictions_1 = ["This This This This"]
bad_predictions_2 = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]

bleu1 = BLEU(max_ngram_order=1)
print("==========BLEU-1==========")
print(bleu1.corpus_score(predictions, references).score)
print(bleu1.corpus_score(bad_predictions_1, references).score)
print(bleu1.corpus_score(bad_predictions_2, references).score)

bleu2 = BLEU(max_ngram_order=2)
print("==========BLEU-2==========")
print(bleu2.corpus_score(predictions, references).score)
print(bleu2.corpus_score(bad_predictions_1, references).score)
print(bleu2.corpus_score(bad_predictions_2, references).score)

bleu3 = BLEU(max_ngram_order=3)
print("==========BLEU-3==========")
print(bleu3.corpus_score(predictions, references).score)
print(bleu3.corpus_score(bad_predictions_1, references).score)
print(bleu3.corpus_score(bad_predictions_2, references).score)

bleu4 = BLEU(max_ngram_order=4)
print("==========BLEU-4==========")
print(bleu4.corpus_score(predictions, references).score)
print(bleu4.corpus_score(bad_predictions_1, references).score)
print(bleu4.corpus_score(bad_predictions_2, references).score)

84.33740467435464
2.634980614046608
0.40867714384640685
65.05696445772017
2.1514526621798953
0.40867714384640685
53.804523766396244
1.8269935164445736
0.0
46.750469682990165
1.683602693167689
0.0


In [104]:
def test_loop(dataloader, model):
    preds, labels = [], []

    model.eval()
    for batch_data in tqdm(dataloader):
        batch_data = batch_data.to(device)
        with torch.no_grad():
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_length,
            ).cpu().numpy()
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        preds += [pred.strip() for pred in decoded_preds]
        labels += [[label.strip()] for label in decoded_labels]
    return bleu1.corpus_score(preds, labels).score, bleu2.corpus_score(preds, labels).score, bleu3.corpus_score(preds, labels).score, bleu4.corpus_score(preds, labels).score

## 保存模型

In [105]:
def test_loop(dataloader, model):
    preds, labels = [], []

    model.eval()
    for batch_data in tqdm(dataloader):
        batch_data = batch_data.to(device)
        with torch.no_grad():
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_length,
            ).cpu().numpy()
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        preds += [pred.strip() for pred in decoded_preds]
        labels += [[label.strip()] for label in decoded_labels]
    return bleu1.corpus_score(preds, labels).score, bleu2.corpus_score(preds, labels).score, bleu3.corpus_score(preds, labels).score, bleu4.corpus_score(preds, labels).score

In [106]:
# test_data = GenQA(dev_data_address)
# test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collote_fn)

# test_loop(test_dataloader, model)

  0%|          | 0/31 [00:00<?, ?it/s]

(0.0, 0.0, 0.0, 0.0)

In [8]:
import matplotlib.pyplot as plt
import numpy as np

def save_data_and_plot(data, txt_file_path, img_file_path):
    # ===================== 写入文件部分 =====================
    # 追加写入数据（自动创建文件）
    with open(txt_file_path, 'a') as f:
        # 将数字转为字符串并换行写入
        f.write('\n'.join(map(str, original_data)))
        f.write('\n')  # 添加换行符分隔不同写入批次

    # ===================== 读取文件部分 =====================
    # 从文件读取所有数字
    loaded_data = []
    try:
        with open(txt_file_path, 'r') as f:
            for line in f:
                # 去除空白字符并尝试转换为浮点数
                cleaned_line = line.strip()
                if cleaned_line:
                    loaded_data.append(float(cleaned_line))
    except FileNotFoundError:
        print("错误：文件不存在")
        exit()

    # ===================== 绘图部分 =====================
    plt.figure(figsize=(10, 6))
    plt.plot(loaded_data,
            color='green',
            linestyle='--',
            marker='s',
            markersize=8,
            linewidth=2)

    # 图表装饰
    plt.title("数值变化曲线", fontsize=14, pad=20)
    plt.xlabel("数据索引", fontsize=12, labelpad=10)
    plt.ylabel("测量值", fontsize=12, labelpad=10)
    plt.grid(True, alpha=0.4, linestyle=':')

    # 自动调整坐标轴范围
    plt.xlim(0, len(loaded_data)-1)
    plt.ylim(min(loaded_data)-1, max(loaded_data)+1)

    # 保存和显示
    plt.savefig(img_file_path, dpi=300, bbox_inches='tight')
    plt.close()  # 关闭图表释放内存

    print("操作结果：")
    print(f"- 数据已保存至 {txt_file_path}")
    print(f"- 生成曲线图：{img_file_path}")
    print(f"- 加载数据量：{len(loaded_data)} 条")

In [107]:
loss_record = []

In [None]:
from transformers import AdamW, get_scheduler

learning_rate = 2e-5
epoch_num = 50

optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)

total_loss = 0.
best_bleu1 = 0.
best_bleu2 = 0.
best_bleu3 = 0.
best_bleu4 = 0.
best_bleu_weighted_add = 0.
txt_file_path = '/content/drive/MyDrive/lossdata.txt'
img_file_path = '/content/drive/MyDrive/lossdata.png'

for t in range(epoch_num):
    clean_cuda(device)
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")

    total_loss, loss_record_step = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)
    save_data_and_plot(loss_record_step, txt_file_path, img_file_path)
    clean_cuda(device)
    valid_bleu1, valid_bleu2, valid_bleu3, valid_bleu4 = test_loop(valid_dataloader, model)
    print(f"BLEU1: {valid_bleu1:>0.2f}\n")
    if valid_bleu1 > best_bleu1:
        best_bleu1 = valid_bleu1
    print(f"BLEU2: {valid_bleu2:>0.2f}\n")
    if valid_bleu2 > best_bleu2:
        best_bleu2 = valid_bleu2
    print(f"BLEU3: {valid_bleu3:>0.2f}\n")
    if valid_bleu3 > best_bleu3:
        best_bleu3 = valid_bleu3
    print(f"BLEU4: {valid_bleu4:>0.2f}\n")
    if valid_bleu4 > best_bleu4:
        best_bleu4 = valid_bleu4

    valid_bleu = 0.1 * valid_bleu1 + 0.2 * valid_bleu2 + 0.3 * valid_bleu3 + 0.4 * valid_bleu4
    if valid_bleu > best_bleu_weighted_add or epoch_num % 2 == 0:
        best_bleu_weighted_add = valid_bleu
        print('saving new weights...\n')
        torch.save(model.state_dict(), f'/content/drive/MyDrive/epoch_{t+1}_loss_{loss_record_step[-1]:0.7f}_valid_bleu_{valid_bleu:0.2f}_model_weights.bin')
    clean_cuda(device)
print("Done!")

释放后可用显存: 8.85 GB
Epoch 1/50
-------------------------------


  0%|          | 0/1634 [00:00<?, ?it/s]

  0%|          | 0/182 [00:00<?, ?it/s]

In [None]:
test_data = GenQA(dev_data_address)
test_dataloader = DataLoader(test_data, batch_size=test_batch_size, shuffle=False, collate_fn=collote_fn)

import json

model.load_state_dict(torch.load('epoch_1_valid_bleu_53.38_model_weights.bin'))

model.eval()
with torch.no_grad():
    print('evaluating on test set...')
    sources, preds, labels = [], [], []
    for batch_data in tqdm(test_dataloader):
        batch_data = batch_data.to(device)
        generated_tokens = model.generate(
            batch_data["input_ids"],
            attention_mask=batch_data["attention_mask"],
            max_length=max_length,
        ).cpu().numpy()
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_sources = tokenizer.batch_decode(
            batch_data["input_ids"].cpu().numpy(),
            skip_special_tokens=True,
            use_source_tokenizer=True
        )
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        print("input:", decoded_sources)
        print("input:", decode_preds)
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        sources += [source.strip() for source in decoded_sources]
        preds += [pred.strip() for pred in decoded_preds]
        labels += [[label.strip()] for label in decoded_labels]
    bleu_score = bleu.corpus_score(preds, labels).score
    print(f"Test BLEU: {bleu_score:>0.2f}\n")
    results = []
    print('saving predicted results...')
    for source, pred, label in zip(sources, preds, labels):
        results.append({
            "sentence": source,
            "prediction": pred,
            "translation": label[0]
        })
    with open('test_data_pred.json', 'wt', encoding='utf-8') as f:
        for exapmle_result in results:
            f.write(json.dumps(exapmle_result, ensure_ascii=False) + '\n')