## 生成式问答
> pretrained_model: `mengzi-t5-base`

> dataset: `DuReaderQG`

In [1]:
import json
import torch
import wandb
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
from torch.optim import AdamW
from transformers import get_scheduler
from sacrebleu import BLEU
from tqdm.auto import tqdm
import numpy as np
wandb.init(
    project='llm-learning',
    name='1_QA',
    mode='offline'
)
train_file = 'DuReaderQG/train.json'
valid_file = 'DuReaderQG/dev.json'
model_checkpoint = "./mengzi-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
device = 'cuda' if torch.cuda.is_available() else 'cpu'


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


### 创建dataset

In [2]:
class QAdata(Dataset):
    def __init__(self, file_name):
        super().__init__()
        self.data = self.load_data(file_name)

    def load_data(self, file_name):
        data = {}
        with open(file_name, 'r') as f:
            for idx, line in enumerate(f):
                data[idx] = json.loads(line)
        return data
    
    def __getitem__(self, idx):
        return self.data[idx]
    
    def __len__(self):
        return len(self.data)
    
train_dataset = QAdata(train_file)
valid_dataset = QAdata(valid_file)

### 创建dataloader

In [3]:
def collate_fn(batch_data):
    contexts, questions, answers = [], [], []
    for sample in batch_data:
        contexts.append(sample['context'])
        questions.append(sample['question'])
        answers.append(sample['answer'])
    inputs = tokenizer(
        contexts,
        questions,
        text_target=answers,
        max_length=256,
        truncation='only_first',
        padding=True,
        stride=50,
        return_tensors='pt'
    )
    end_token_idx = torch.where(inputs['labels'] == tokenizer.eos_token_id)[1]
    for idx, end_idx in enumerate(end_token_idx):
        inputs['labels'][idx][end_idx + 1:] = -100 # 将无需预测的部分转换为-100，避免影响计算交叉熵损失
    return inputs
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

     

### 训练和评估函数

In [4]:

def train_loop(dataloader,  model, optimizer, lr_scheduler, total_loss, epoch):
    process_bar = tqdm(range(len(dataloader)))
    process_bar.set_description(f'Loss: {0:>7f}')
    finished_batch_num = (epoch - 1) * len(dataloader)
    model.train()
    for idx, batch in enumerate(dataloader, start=1):
        batch = batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        wandb.log({"train/loss_step": loss.item()}, step = finished_batch_num + idx)
        total_loss += loss.item()
        process_bar.set_description(f"Loss : {total_loss / (finished_batch_num + idx):>7f}")
        process_bar.update(1)
    
    return total_loss


def test_loop(dataloader, model, bleu):
    preds, labels = [], []
    model.eval()
    for batch in tqdm(dataloader):
        batch.to(device)
        with torch.no_grad():
            generations = model.generate(**batch).cpu().numpy()
        decoded_generations = [' '.join(tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)) for ids in generations]

        label_ids = batch['labels'].cpu().numpy()
        label_ids = np.where(label_ids!=-100, label_ids, tokenizer.pad_token_type_id)
        decoded_labels = [' '.join(tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)) for ids in label_ids]

        preds += [pred.strip() for pred in decoded_generations]
        labels += [[label.strip()] for label in decoded_labels]
    return bleu.corpus_score(hypotheses=preds, references=labels)

### 训练主函数

In [5]:
lr = 3e-5
epoch_num = 3
optimizer = AdamW(model.parameters(), lr=lr)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader)
)
total_loss = 0.
bleu = BLEU()
best_bleu = 0.
model.to(device)
for ep in range(epoch_num):
    print(f"Epoch: {ep + 1}/{epoch_num}\n----------------")
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, total_loss, ep + 1)
    bleu_scores = test_loop(valid_dataloader, model, bleu) 
    wandb.log({
        "eval/bleu_score": bleu_scores.score,
            "eval/bleu-1": bleu_scores.precisions[0],
            "eval/bleu-2": bleu_scores.precisions[1],
            "eval/bleu-3": bleu_scores.precisions[2],
            "eval/bleu-4": bleu_scores.precisions[3],
            "epoch": ep
    })
    print(f"Epoch {ep} | BLEU Score: {bleu_scores.score:.2f}")
    print(', '.join([f"BLEU-{idx + 1}: {bleu_scores.precisions[idx]:.2f}" for idx in range(4)]))
    torch.save(
        model.state_dict(), 
        f'epoch_{ep+1}_valid_bleu_{bleu_scores.score:0.2f}_model_weights.bin'
    )
print('Done!')


Epoch: 1/3
----------------


  0%|          | 0/908 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

Epoch 0 | BLEU Score: 100.00
BLEU-1: 100.00, BLEU-2: 100.00, BLEU-3: 100.00, BLEU-4: 100.00
Epoch: 2/3
----------------


  0%|          | 0/908 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

Epoch 1 | BLEU Score: 100.00
BLEU-1: 100.00, BLEU-2: 100.00, BLEU-3: 100.00, BLEU-4: 100.00
Epoch: 3/3
----------------


  0%|          | 0/908 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

Epoch 2 | BLEU Score: 100.00
BLEU-1: 100.00, BLEU-2: 100.00, BLEU-3: 100.00, BLEU-4: 100.00
Done!


### 生成结果采样

In [7]:
model.load_state_dict(torch.load('epoch_3_valid_bleu_100.00_model_weights.bin', weights_only=False))
results = []
inputs, gens = [], []
num_test = 10

with torch.no_grad():
    for batch in valid_dataloader:
        batch.to(device)
        gen_ids = model.generate(**batch).cpu().numpy()
        gen_tokens = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
        input_ids = batch['input_ids'].cpu().numpy()
        input_tokens = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
        inputs += [s.strip() for s in input_tokens]
        gens += [s.strip() for s in gen_tokens]
        num_test -= 1
        if num_test == 0:
            break

for q, a in zip(inputs, gens):
    results.append({
        "question": q,
        "answer": a
    })
print(results[1])
with open("preds.json", "wt", encoding='utf-8') as f:
    for example in results:
        f.write(json.dumps(example, ensure_ascii=False) + '\n')
        

{'question': '年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。2017年银行贷款基准利率', 'answer': '4.35%'}
