# Load the dataset

In [14]:
from datasets import load_dataset

# Location of the raw JSON files and the file name of each split.
data_dir = "./Data"
train_name = "train.json"
test_name = "test.json"
valid_name = "valid.json"

# Split name -> file name; `load_dataset` returns one split per key.
data_files = {
    "train": train_name,
    "test": test_name,
    "valid": valid_name,
}
dataset = load_dataset("json", data_dir=data_dir, data_files=data_files)

# Report the number of examples in every split.
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")
print(f"Valid dataset size: {len(dataset['valid'])}")

Train dataset size: 80870
Test dataset size: 10110
Valid dataset size: 10108


# Preparation before training

Before training the LLM, we need to preprocess the dataset.
GQA is a text-generation task.
We need to know the token lengths of the input and output texts so that the dataset can be padded and truncated to sensible sizes for efficient batch processing.

* We utilize the t5-large model here.

In [15]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer from the same checkpoint that is fine-tuned and evaluated
# below (t5-large), so token ids are guaranteed to match the model's vocabulary.
# The previous path pointed at a different checkpoint (flan-t5-xxl).
model_path = "/root/autodl-fs/t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [16]:
from datasets import concatenate_datasets
import numpy as np
from tqdm import tqdm


def _tokenize_text_column(column):
    """Tokenize one text column of the combined train+test data.

    Truncation is enabled and the raw "question"/"answer" columns are removed,
    so the result only carries the tokenizer's output columns.
    """
    combined = concatenate_datasets([dataset["train"], dataset["test"]])
    return combined.map(
        lambda x: tokenizer(x[column], truncation=True),
        batched=True,
        remove_columns=["question", "answer"],
    )


# Source side: the maximum input length after tokenization. Longer sequences
# are truncated and shorter ones padded during preprocessing.
tokenized_inputs = _tokenize_text_column("question")
input_lengths = [len(ids) for ids in tokenized_inputs["input_ids"]]

# Use the 85th percentile of the observed lengths rather than the maximum,
# so that a few very long questions do not inflate the padded length.
max_source_length = int(np.percentile(input_lengths, 85))
print(f"MAX source length: {max_source_length}")

# Target side: the maximum answer length after tokenization, handled the same way.
tokenized_targets = _tokenize_text_column("answer")
target_lengths = [len(ids) for ids in tokenized_targets["input_ids"]]

# Use the 90th percentile of the observed answer lengths.
max_target_length = int(np.percentile(target_lengths, 90))
print(f"MAX target length: {max_target_length}")

MAX source length: 34
MAX target length: 90


## Preprocess all splits and save the processed dataset to disk

In [17]:
def preprocess_function(samples, padding="max_length"):
    """Tokenize a batch of question/answer pairs into model inputs and labels.

    Args:
        samples: batch with "question" and "answer" entries (lists of strings).
        padding: padding strategy forwarded to the tokenizer for both the
            questions and the answers.

    Returns:
        The tokenizer output for the questions, with an added "labels" entry
        holding the token ids of the answers.
    """
    # Questions are used as-is (no task prefix is prepended).
    questions = list(samples["question"])
    model_inputs = tokenizer(questions, max_length=max_source_length, padding=padding, truncation=True)

    # Answers are tokenized as targets through the `text_target` keyword.
    labels = tokenizer(text_target=samples["answer"], max_length=max_target_length, padding=padding, truncation=True)
    label_ids = labels["input_ids"]

    # With fixed-length padding, swap the pad token id for -100 in the labels
    # so that padded positions are ignored by the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# <font color=red>(Save to Disk)</font>

In [5]:
# Tokenize every split and drop the raw columns.
raw_columns = ["question", "answer", "question_type", "id"]
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=raw_columns)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# Persist the train and test splits for later reloading
# (the "valid" split is not written to disk).
for split_name, target_dir in (("train", "Data/Train"), ("test", "Data/test")):
    tokenized_dataset[split_name].save_to_disk(target_dir)

Map:   0%|          | 0/10110 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/80870 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10110 [00:00<?, ? examples/s]

# <font color=red size=10>(Load Dataset from the Disk)</font>

In [18]:
from datasets import DatasetDict, load_from_disk

# Reuse the train/test splits that were tokenized and saved to disk instead of
# re-tokenizing the whole dataset and then overwriting the result.
# Only the "valid" split was never saved, so it is the only one tokenized here.
tokenized_dataset = DatasetDict(
    {
        # Load the training split
        "train": load_from_disk("Data/Train"),
        # Load the test split
        "test": load_from_disk("Data/test"),
        "valid": dataset["valid"].map(
            preprocess_function,
            batched=True,
            remove_columns=["question", "answer", "question_type", "id"],
        ),
    }
)

# Print the available splits to verify the load
print(tokenized_dataset.keys())

dict_keys(['train', 'test', 'valid'])


# Fine-tuning the T5-large model with LoRA and bnb-int8

In [19]:
from transformers import AutoModelForSeq2SeqLM

# Local checkpoint of the base model that will be fine-tuned.
model_id = "/root/autodl-fs/t5-large"

# Load the weights in 8-bit; `device_map="auto"` leaves device placement to the library.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    device_map="auto",
    load_in_8bit=True,
)



In [20]:
import contextlib
import io
import sys  # not needed by this cell any more; kept in case other cells rely on it

import wandb
from peft import LoraConfig, TaskType, get_peft_model

# `prepare_model_for_int8_training` is deprecated in newer PEFT releases (and
# absent from recent ones) in favour of `prepare_model_for_kbit_training`.
# Prefer the new helper when available and fall back on older PEFT versions;
# the old name is kept so the rest of the notebook is unaffected.
try:
    from peft import prepare_model_for_kbit_training as prepare_model_for_int8_training
except ImportError:
    from peft import prepare_model_for_int8_training


# ---- LoRA hyper-parameters (also sent to wandb as part of the run config) ----
config1 = {
    "r": 10,
    "lora_alpha": 36,
    "target_modules": ["q", "v"],
    "lora_dropout": 0.05,
    "bias": "lora_only",
    "task_type": TaskType.SEQ_2_SEQ_LM,
}

# define LoRA Config (every key of config1 is a LoraConfig argument)
lora_config = LoraConfig(**config1)

# prepare int8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adapter
model = get_peft_model(model, lora_config)

# `print_trainable_parameters()` writes its summary to stdout. Capture that text
# so it can be stored in the wandb run config later; `redirect_stdout` restores
# the real stdout even if the call raises.
with io.StringIO() as output, contextlib.redirect_stdout(output):
    model.print_trainable_parameters()
    output_str = output.getvalue()

# Show the captured summary
print("1:", output_str)

# wandb.init(
#     project = "LLM_LoRA_FineTuning",
    
#     config = {
#         "config": config1,
#         "Dataset": "qa_Tools_and_Home_Improvement",
#         "Tuning-method": "LoRA",
#         "Trainable params": str(output_str)
#     }
# )




1: trainable params: 2,949,120 || all params: 740,617,216 || trainable%: 0.39819760279512595



接下来需要创建一个 `DataCollator`，负责对输入和标签进行填充，我们使用 🤗 Transformers 库中的 `DataCollatorForSeq2Seq` 来完成这一环节。

In [21]:
from transformers import DataCollatorForSeq2Seq

# Padded label positions must not contribute to the loss. In Transformers the
# convention is to mark such positions with -100: the loss skips them and is
# computed only over the real target tokens.
label_pad_token_id = -100

# Data collator: pads inputs and labels when batches are assembled.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

自定义回调函数来记录训练损失

In [22]:
from transformers import TrainerCallback, TrainerState, TrainerControl

class TrainLogCallback(TrainerCallback):
    """Trainer callback that collects training losses and forwards them to wandb."""

    def __init__(self):
        # Every training loss seen by `on_log`, in the order it was logged.
        self.train_losses = []

    def on_log(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        """Record the most recent logged loss and report it on logging-step boundaries.

        Log entries without a 'loss' key are ignored. When the current global
        step is a multiple of `args.logging_steps`, the step is printed and the
        loss is sent to wandb.
        """
        latest_entry = state.log_history[-1]
        if 'loss' not in latest_entry:
            return

        loss_value = latest_entry['loss']
        self.train_losses.append(loss_value)

        if state.global_step % args.logging_steps != 0:
            return

        print(f"Logging step {state.global_step} at epoch {state.epoch}")
        wandb.log({"logging_step": state.global_step, "train_loss": loss_value})

In [24]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

'''==============================================='''
output_dir="t5-large3"

# Training hyper-parameters; every key is a Seq2SeqTrainingArguments argument.
config2 = {
    "output_dir": output_dir,
    "auto_find_batch_size": True,
    "learning_rate": 5e-4,  # higher learning rate
    "num_train_epochs":5,
    "logging_dir":f"{output_dir}/logs",
    "logging_strategy":"steps",
    "logging_steps":200,
    "save_strategy": "no",
    "report_to": "tensorboard"
}
'''================================================='''

# Custom callback that records the training loss
log_step_callback = TrainLogCallback()

# Define training args directly from the config dict
training_args = Seq2SeqTrainingArguments(**config2)

# Start the wandb run and attach both hyper-parameter dicts to it
wandb_run_config = {
    "config1": config1,
    "config2": config2,
    "Dataset": "qa_Tools_and_Home_Improvement",
    "Tuning-method": "LoRA",
    "Trainable params": str(output_str)
}
wandb.init(project="LLM_LoRA_FineTuning", config=wandb_run_config)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
    callbacks=[log_step_callback],
)

# silence the warnings. Please re-enable for inference!
model.config.use_cache = False

0,1
logging_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_loss,▅▂▃▃▂▄█▂▁▁▃▃▄▄▃▃▄▃▅▅▅▅▄▄▅▅▆▇█▆▆▆▆▅▅█▇▇▆▆

0,1
logging_step,10400.0
train_loss,3.3955


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112511116597388, max=1.0…

# train

In [25]:
import traceback

# Run the training loop. A failure is reported (message + full traceback) but
# not re-raised, so the notebook keeps running after an error.
try:
    trainer.train()
except Exception as e:
    print(f"An error occurred: {e}")
    traceback.print_exc()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
200,3.4062
400,3.2302
600,3.2448
800,3.2384
1000,3.1847
1200,3.1968
1400,3.2047
1600,3.1934
1800,3.1672
2000,3.1585


Logging step 200 at epoch 0.019784350578692253
Logging step 400 at epoch 0.039568701157384506
Logging step 600 at epoch 0.059353051736076766
Logging step 800 at epoch 0.07913740231476901
Logging step 1000 at epoch 0.09892175289346128
Logging step 1200 at epoch 0.11870610347215353
Logging step 1400 at epoch 0.13849045405084578
Logging step 1600 at epoch 0.15827480462953802
Logging step 1800 at epoch 0.1780591552082303
Logging step 2000 at epoch 0.19784350578692256
Logging step 2200 at epoch 0.2176278563656148
Logging step 2400 at epoch 0.23741220694430706
Logging step 2600 at epoch 0.2571965575229993
Logging step 2800 at epoch 0.27698090810169157
Logging step 3000 at epoch 0.29676525868038384
Logging step 3200 at epoch 0.31654960925907605
Logging step 3400 at epoch 0.3363339598377683
Logging step 3600 at epoch 0.3561183104164606
Logging step 3800 at epoch 0.37590266099515285
Logging step 4000 at epoch 0.3956870115738451
Logging step 4200 at epoch 0.4154713621525373
Logging step 4400 at 

# Save the model

In [26]:
# Save the fine-tuned LoRA (PEFT) model and the tokenizer into the same directory.
'''========================================'''
peft_model_id="LoRA_results_t5_large_4"
'''========================================'''

# `trainer.model` is the PEFT-wrapped model handed to the trainer above.
trainer.model.save_pretrained(peft_model_id)
# Last expression of the cell: its return value (the written file paths) is shown as the cell output.
tokenizer.save_pretrained(peft_model_id)
# To also save the base model, call:
# trainer.model.base_model.save_pretrained(peft_model_id)



('LoRA_results_t5_large_4/tokenizer_config.json',
 'LoRA_results_t5_large_4/special_tokens_map.json',
 'LoRA_results_t5_large_4/tokenizer.json')

***
***

# <center><font color=Green size=20>Evaluate (Example Answers)</font></center>

In [27]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Directory holding the saved LoRA adapter (and tokenizer files).
peft_model_id = "LoRA_results_t5_large_4"

# Load the PEFT config of the saved adapter; it records the base model path.
config = PeftConfig.from_pretrained(peft_model_id)

# Fall back to the local t5-large checkpoint when the adapter config does not
# record a base model. (The previous `if ...: ;` form is not valid Python.)
if not config.base_model_name_or_path:
    config.base_model_name_or_path = "/root/autodl-fs/t5-large"

# Load the base model in 8-bit on device 0, together with its tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,
    device_map={"": 0},
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Attach the LoRA adapter weights and switch to evaluation mode.
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"": 0})
model.eval()

print("Peft model loaded")



Peft model loaded


In [29]:
from datasets import load_dataset
from random import randrange
import torch
import json

# Reload the raw JSON splits from the local data directory.
data_files = {"train": "train.json", "test":"test.json", "val":"valid.json"}
dataset = load_dataset('json', data_dir="./Data/", data_files=data_files)

# Draw one random example from the test split.
test_split = dataset["test"]
sample = test_split[randrange(len(test_split))]

# Tokenize the question and move the ids to the GPU.
encoded_question = tokenizer(sample["question"], return_tensors="pt", truncation=True)
input_ids = encoded_question.input_ids.cuda()
print(f"input question: {sample['question']}\n{'---'* 20}")

# Sample an answer with nucleus sampling; the result is returned as a dict-like
# object so the generated sequences are read from `.sequences`.
generation_kwargs = dict(
    max_new_tokens=128,
    output_hidden_states=True,
    output_scores=True,
    return_dict_in_generate=True,
    do_sample=True,
    top_p=0.9,
)
output = model.generate(input_ids=input_ids, **generation_kwargs)

# Decode every generated sequence (special tokens are kept in the text).
decoded_output = list(map(tokenizer.decode, output.sequences))
print(decoded_output)
#####################################


input question: does this lamp need wiring from a boz or comes with a plug
------------------------------------------------------------
['<pad> No it comes with a plug. The switch is the same size and shape as the Bizo that is shown in the pictures.</s>']


# <center><font color=Green size=20>Evaluate (Metrics)</font></center>

In [None]:
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm

# ROUGE metric, loaded from a local metric script path rather than by metric name.
metric = evaluate.load("./evaluate-main/metrics/rouge")

'''========================================================Evaluate PEFT model (function)==============================================================='''
def evaluate_peft_model(sample, max_target_length=512):
    """Generate an answer for one tokenized sample and decode it with its reference.

    Args:
        sample: mapping with "input_ids" and "labels" for a single example.
        max_target_length: upper bound on the number of newly generated tokens.

    Returns:
        A `(prediction, reference)` pair of decoded strings with special tokens removed.
    """
    # Add a batch dimension, move to the GPU and sample an answer (nucleus sampling).
    batch_input_ids = sample["input_ids"].unsqueeze(0).cuda()
    generated = model.generate(
        input_ids=batch_input_ids,
        do_sample=True,
        top_p=0.9,
        max_new_tokens=max_target_length,
    )
    prediction = tokenizer.decode(generated[0].detach().cpu().numpy(), skip_special_tokens=True)

    # -100 marks ignored label positions and cannot be decoded, so swap it for
    # the pad token id before decoding the reference answer.
    label_ids = sample['labels']
    decodable_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    reference = tokenizer.decode(decodable_ids, skip_special_tokens=True)

    return prediction, reference
''' ========================================================================================================================================='''
# load test dataset from disk
# Load the tokenized test split from disk, returning torch tensors.
test_dataset = load_from_disk("./Data/test/").with_format("torch")
# Small 5-example subset; swap it into the loop below for a quick check.
examples = test_dataset.select(range(5))

# Run predictions over the full test split (this can take ~45 minutes).
predictions, references = [], []

# for sample in tqdm(examples):
for sample in tqdm(test_dataset):
    p, l = evaluate_peft_model(sample)
    predictions.append(p)
    references.append(l)

# compute metric
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# Print results as percentages with two decimals. The format spec must be
# `.2f`: `2f` only sets a minimum width of 2 and prints six decimals.
print(f"Rogue1: {rogue['rouge1']* 100:.2f}%")
print(f"rouge2: {rogue['rouge2']* 100:.2f}%")
print(f"rougeL: {rogue['rougeL']* 100:.2f}%")
print(f"rougeLsum: {rogue['rougeLsum']* 100:.2f}%")

# write to wandb log
import wandb

'''========================================================='''
project_name = "LLM_LoRA_FineTuning"

'''========================================================='''
run_id = "tnnnrg0n"  # unique identifier of the already existing wandb run to update
'''========================================================='''

# Test metrics, keyed by the names under which they are stored in wandb.
test_metrics = {
    "test_Rogue1": rogue['rouge1'],
    "test_Rogue2": rogue['rouge2'],
    "test_RogueL": rogue['rougeL'],
    "test_RogueLsum": rogue['rougeLsum'],
}

# Write the metrics into the config of the existing run through the public API.
api = wandb.Api()
run = api.run(f"llm_learner/{project_name}/{run_id}")
for metric_name, metric_value in test_metrics.items():
    run.config[metric_name] = metric_value
run.update()

# Resume the same run and additionally log the metrics as run history.
wandb.init(project=project_name, id=run_id, resume="allow")
wandb.log(test_metrics)
wandb.finish()