# 环境配置

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U peft
!pip install -q -U trl
!pip install -q -U tensorboardX
!pip install -q wandb

[0m

In [None]:
import os

# Configure the Hub mirror through `os.environ`, and do it before the Hugging Face
# libraries are imported so that anything reading the variable at import time sees it.
# The previous `!export HF_ENDPOINT=...` ran in a throw-away subshell, so the value
# never reached this kernel process.
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
# NOTE(review): "1" looks like a placeholder rather than a real token — confirm it is
# intentional. Real credentials should come from the environment, not the notebook.
os.environ['HF_TOKEN']="1"

from enum import Enum
from functools import partial
import pandas as pd
import torch
import json

from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, TaskType

# Fix the global random seed through transformers' `set_seed` helper.
seed = 42
set_seed(seed)

In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Local checkpoint directory and local JSONL training file.
model_path = "autodl-tmp/models/Qwen2.5-7B-Instruct"
dataset_path = "autodl-tmp/datas/lora_finetune_dataset_functioncall_sample600.jsonl"

# The tokenizer is read from the local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Chat template, assembled piece by piece. It rejects a leading system message, wraps
# every message as "<start_of_turn>{role}\n{trimmed content}<end_of_turn><eos>\n", and
# opens a model turn when a generation prompt is requested.
tokenizer.chat_template = "".join([
    "{{ bos_token }}",
    "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}",
    "{% for message in messages %}",
    "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn><eos>\n' }}",
    "{% endfor %}",
    "{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
])

# Read the JSONL file and expose each conversation under the "messages" column.
dataset = load_dataset('json', data_files=dataset_path).rename_column("conversation", "messages")

# Show the first row so the message structure can be checked by eye.
print("Dataset sample structure:")
sample = dataset["train"][0]
print(type(sample["messages"]))
if isinstance(sample["messages"], list):
    print(sample["messages"])
else:
    print("Not a list")

def preprocess(sample):
    """Render one conversation into a single training string.

    The chat template rejects a leading ``system`` message. When the conversation starts
    with one, its content — followed by a fixed instruction asking the model to plan
    inside ``<think>`` tags — is prepended to the message that follows it, and the
    system message itself is dropped.

    The input ``sample`` is never modified, so calling this function repeatedly on the
    same object always yields the same text.

    Args:
        sample: A dataset row whose ``"messages"`` entry is a list of
            ``{"role": ..., "content": ...}`` dicts.

    Returns:
        ``{"text": <templated conversation>}``. If ``"messages"`` is not a list, the
        text is the literal ``"Error: unexpected message format"``.
    """
    messages = sample["messages"]

    if not isinstance(messages, list):
        # NOTE(review): this placeholder string becomes a training example; consider
        # filtering such rows out of the dataset instead.
        return {"text": "Error: unexpected message format"}

    if messages and isinstance(messages[0], dict) and "role" in messages[0]:
        if messages[0]["role"] == "system":
            system_message_content = messages[0]["content"]
            # Slicing makes a new list, so the caller's list is left as it was.
            remaining = messages[1:]
            if remaining:
                # Build a fresh dict rather than assigning into the caller's message;
                # assigning in place made a second call prepend the system text again.
                remaining[0] = {
                    **remaining[0],
                    "content": (
                        system_message_content +
                        "Also, before making a call to a function take the time to plan the function to take. "
                        "Make that thinking process between <think>{your thoughts}</think>\n\n" +
                        remaining[0]["content"]
                    ),
                }
            messages = remaining

    return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}

# Replace the "messages" column with the templated "text" column.
dataset = dataset.map(preprocess, remove_columns=["messages"])
# Hold out 5% for evaluation. The explicit seed keeps split membership identical across
# kernel restarts and independent of the order in which cells were executed.
dataset = dataset["train"].train_test_split(test_size=0.05, seed=42)
print(dataset)

Dataset sample structure:
<class 'list'>
[{'role': 'system', 'content': '你现在是一位在上海陆家嘴区域活动的人，请以第一人称视角回忆你的活动轨迹。下面<tools></tools>标签中提供了一些可以辅助你回忆细节的工具函数：<tools>[  {    "type": "function",     "function": {      "name": "query_date",       "description": "生成一个工作日日期，帮助你回忆具体是哪一天",       "parameters": {        "type": "object",         "properties": {}      }    }  },   {    "type": "function",     "function": {      "name": "get_lujiazui_transition_info",       "description": "帮助你回忆进出陆家嘴的信息，包括从哪里来、什么时候到达、离开时间和去向",       "parameters": {        "type": "object",         "properties": {          "person_type_inljz": {            "type": "string",             "description": "你在陆家嘴活动的人群画像，仅限于以下五种：\'工作人群\',\'陆家嘴本地居民\',\'旅游人群\',\'商务人群\',\'到访商业消费人群\'"          }        },         "required": ["person_type_inljz"]      }    }  },   {    "type": "function",     "function": {      "name": "query_location_info",       "description": "帮助你回忆在陆家嘴进行活动的具体地点",       "parameters": {        "type": "object",    

In [7]:
# Print one templated training example for a visual check (row 85 appears to be an arbitrary pick).
print(dataset["train"][85]["text"])

<start_of_turn>human
你现在是一位在上海陆家嘴区域活动的人，请以第一人称视角回忆你的活动轨迹。下面<tools></tools>标签中提供了一些可以辅助你回忆细节的工具函数：<tools>[  {    "type": "function",     "function": {      "name": "query_date",       "description": "生成一个工作日日期，帮助你回忆具体是哪一天",       "parameters": {        "type": "object",         "properties": {}      }    }  },   {    "type": "function",     "function": {      "name": "get_lujiazui_transition_info",       "description": "帮助你回忆进出陆家嘴的信息，包括从哪里来、什么时候到达、离开时间和去向",       "parameters": {        "type": "object",         "properties": {          "person_type_inljz": {            "type": "string",             "description": "你在陆家嘴活动的人群画像，仅限于以下五种：'工作人群','陆家嘴本地居民','旅游人群','商务人群','到访商业消费人群'"          }        },         "required": ["person_type_inljz"]      }    }  },   {    "type": "function",     "function": {      "name": "query_location_info",       "description": "帮助你回忆在陆家嘴进行活动的具体地点",       "parameters": {        "type": "object",         "properties": {          "activity_type": {            "t

In [8]:
# Print the same templated example again (this cell repeats the one before it).
print(dataset["train"][85]["text"])

<start_of_turn>human
你现在是一位在上海陆家嘴区域活动的人，请以第一人称视角回忆你的活动轨迹。下面<tools></tools>标签中提供了一些可以辅助你回忆细节的工具函数：<tools>[  {    "type": "function",     "function": {      "name": "query_date",       "description": "生成一个工作日日期，帮助你回忆具体是哪一天",       "parameters": {        "type": "object",         "properties": {}      }    }  },   {    "type": "function",     "function": {      "name": "get_lujiazui_transition_info",       "description": "帮助你回忆进出陆家嘴的信息，包括从哪里来、什么时候到达、离开时间和去向",       "parameters": {        "type": "object",         "properties": {          "person_type_inljz": {            "type": "string",             "description": "你在陆家嘴活动的人群画像，仅限于以下五种：'工作人群','陆家嘴本地居民','旅游人群','商务人群','到访商业消费人群'"          }        },         "required": ["person_type_inljz"]      }    }  },   {    "type": "function",     "function": {      "name": "query_location_info",       "description": "帮助你回忆在陆家嘴进行活动的具体地点",       "parameters": {        "type": "object",         "properties": {          "activity_type": {            "t

In [11]:
from enum import Enum
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import gc

# Use local paths: the model checkpoint directory and the JSONL training file.
model_path = "autodl-tmp/models/Qwen2.5-7B-Instruct"
dataset_path = "autodl-tmp/datas/lora_finetune_dataset_functioncall_sample600.jsonl"

class ChatmlSpecialTokens(str, Enum):
    """Marker strings used by the chat format; every member is also a plain ``str``."""

    # Opening / closing tag pairs.
    tools = "<tools>"
    eotools = "</tools>"
    think = "<think>"
    eothink = "</think>"
    tool_call = "<tool_call>"
    eotool_call = "</tool_call>"
    tool_response = "<tool_response>"
    eotool_response = "</tool_response>"
    answer = "<answer>"
    eoanswer = "</answer>"
    # Turn delimiters.
    start_of_turn = "<start_of_turn>"
    end_of_turn = "<end_of_turn>"
    # Padding and end-of-sequence markers.
    pad_token = "<pad>"
    eos_token = "<eos>"

    @classmethod
    def list(cls):
        """Return the value of every member, in declaration order."""
        values = []
        for member in cls:
            values.append(member.value)
        return values

# Load the local tokenizer, using "<pad>" as the pad token and registering every chat
# marker as an additional special token.
# NOTE(review): "<eos>" is only registered as an additional special token here; the
# tokenizer's own `eos_token` is not changed — confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    pad_token=ChatmlSpecialTokens.pad_token.value,
    additional_special_tokens=ChatmlSpecialTokens.list()
)

# Chat template: no system role allowed; each message is rendered as
# "<start_of_turn>{role}\n{trimmed content}<end_of_turn><eos>\n"; a model turn is opened
# when a generation prompt is requested.
tokenizer.chat_template = "".join((
    "{{ bos_token }}",
    "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}",
    "{% for message in messages %}",
    "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn><eos>\n' }}",
    "{% endfor %}",
    "{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
))

# Check 1: is every marker reported among the tokenizer's special tokens?
print("特殊标记检查:")
for token in ChatmlSpecialTokens.list():
    if token in tokenizer.all_special_tokens:
        status = '在特殊标记列表中'
    else:
        status = '不在特殊标记列表中'
    print(f"{token}: {status}")

# Check 2: a special marker is expected to tokenize to exactly one piece.
print("\n分词测试:")
for token in ChatmlSpecialTokens.list():
    pieces = tokenizer.tokenize(token)
    piece_ids = tokenizer.encode(token, add_special_tokens=False)
    print(f"{token} -> {pieces} (token_ids: {piece_ids})")
    print("✓ 正确: 被识别为单个token" if len(pieces) == 1 else "✗ 错误: 被分解为多个token")

# Read the local JSONL file; conversations are exposed under the "messages" column.
dataset = load_dataset('json', data_files=dataset_path).rename_column("conversation", "messages")

# Print the first row's message container type and, when it is a list, its content.
print("\nDataset sample structure:")
sample = dataset["train"][0]
print(type(sample["messages"]))
if isinstance(sample["messages"], list):
    print(sample["messages"])
else:
    print("Not a list")

def preprocess(sample):
    """Turn one conversation row into ``{"text": <templated string>}``.

    A leading ``system`` message is not accepted by the chat template, so its content
    plus a fixed "plan inside <think> tags" instruction is merged into the front of the
    following message and the system message is removed.

    The row passed in is left untouched; repeated calls on the same row give the same
    result.

    Args:
        sample: Row with a ``"messages"`` list of ``{"role", "content"}`` dicts.

    Returns:
        Dict with a single ``"text"`` key. When ``"messages"`` is not a list the text is
        the literal ``"Error: unexpected message format"``.
    """
    messages = sample["messages"]

    if not isinstance(messages, list):
        # NOTE(review): this placeholder string becomes a training example; consider
        # filtering such rows out of the dataset instead.
        return {"text": "Error: unexpected message format"}

    if messages and isinstance(messages[0], dict) and "role" in messages[0]:
        if messages[0]["role"] == "system":
            system_message_content = messages[0]["content"]
            # New list object: the caller's list keeps its system message.
            remaining = messages[1:]
            if remaining:
                # Copy-and-override instead of in-place assignment, which used to make
                # a second call prepend the system text a second time.
                remaining[0] = {
                    **remaining[0],
                    "content": (
                        system_message_content +
                        "Also, before making a call to a function take the time to plan the function to take. "
                        "Make that thinking process between <think>{your thoughts}</think>\n\n" +
                        remaining[0]["content"]
                    ),
                }
            messages = remaining

    return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}

# Replace the "messages" column with the templated "text" column.
dataset = dataset.map(preprocess, remove_columns=["messages"])
# Hold out 10% for evaluation. The explicit seed keeps split membership identical across
# kernel restarts and independent of the order in which cells were executed.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
print(dataset)

# Round-trip a sentence containing several special markers through encode/decode and
# report whether the text comes back unchanged.
test_text = (
    "<start_of_turn>human\n你好<end_of_turn><eos>\n"
    "<start_of_turn>model\n"
    "<think>用户打招呼，我应该回应</think>"
    "<tool_call>{'name': 'test'}</tool_call>"
    "<answer>你好！有什么我可以帮助你的吗？</answer>"
    "<end_of_turn><eos>"
)
encoded = tokenizer.encode(test_text)
decoded = tokenizer.decode(encoded)
print("\n测试文本编码再解码:")
print(f"原始文本: {test_text}")
print(f"解码后文本: {decoded}")
if test_text == decoded:
    print("文本是否保持一致: 是")
else:
    print("文本是否保持一致: 否")

# Load the model only after dataset preprocessing has finished.
# First drop any model a previous run of this notebook left behind, so the old and the
# new copy never have to coexist in memory while the checkpoint is being loaded.
model = None
# Collect garbage before emptying the CUDA cache: blocks still held by uncollected
# objects cannot be released by `empty_cache()`.
gc.collect()
torch.cuda.empty_cache()

# Use device_map="auto" for better memory management.
print("\n开始加载模型...")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    attn_implementation='eager',
    torch_dtype=torch.bfloat16,
    device_map="auto",
    offload_folder="offload",  # allow offloading to disk when necessary
)

# Resize the embedding matrix to the tokenizer's current vocabulary size.
# NOTE(review): if the checkpoint's embedding matrix is larger than len(tokenizer),
# this call shrinks it — confirm that is intended.
model.resize_token_embeddings(len(tokenizer))

# Report GPU memory after loading.
if torch.cuda.is_available():
    print("\nGPU内存使用情况:")
    print(f"已分配内存: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"缓存内存: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

特殊标记检查:
<tools>: 在特殊标记列表中
</tools>: 在特殊标记列表中
<think>: 在特殊标记列表中
</think>: 在特殊标记列表中
<tool_call>: 在特殊标记列表中
</tool_call>: 在特殊标记列表中
<tool_response>: 在特殊标记列表中
</tool_response>: 在特殊标记列表中
<answer>: 在特殊标记列表中
</answer>: 在特殊标记列表中
<start_of_turn>: 在特殊标记列表中
<end_of_turn>: 在特殊标记列表中
<pad>: 在特殊标记列表中
<eos>: 在特殊标记列表中

分词测试:
<tools> -> ['<tools>'] (token_ids: [151666])
✓ 正确: 被识别为单个token
</tools> -> ['</tools>'] (token_ids: [151667])
✓ 正确: 被识别为单个token
<think> -> ['<think>'] (token_ids: [151668])
✓ 正确: 被识别为单个token
</think> -> ['</think>'] (token_ids: [151669])
✓ 正确: 被识别为单个token
<tool_call> -> ['<tool_call>'] (token_ids: [151657])
✓ 正确: 被识别为单个token
</tool_call> -> ['</tool_call>'] (token_ids: [151658])
✓ 正确: 被识别为单个token
<tool_response> -> ['<tool_response>'] (token_ids: [151670])
✓ 正确: 被识别为单个token
</tool_response> -> ['</tool_response>'] (token_ids: [151671])
✓ 正确: 被识别为单个token
<answer> -> ['<answer>'] (token_ids: [151672])
✓ 正确: 被识别为单个token
</answer> -> ['</answer>'] (token_ids: [151673])
✓ 正确: 被识别为单个token


Map:   0%|          | 0/600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 540
    })
    test: Dataset({
        features: ['text'],
        num_rows: 60
    })
})

测试文本编码再解码:
原始文本: <start_of_turn>human
你好<end_of_turn><eos>
<start_of_turn>model
<think>用户打招呼，我应该回应</think><tool_call>{'name': 'test'}</tool_call><answer>你好！有什么我可以帮助你的吗？</answer><end_of_turn><eos>
解码后文本: <start_of_turn>human
你好<end_of_turn><eos>
<start_of_turn>model
<think>用户打招呼，我应该回应</think><tool_call>{'name': 'test'}</tool_call><answer>你好！有什么我可以帮助你的吗？</answer><end_of_turn><eos>
文本是否保持一致: 是

开始加载模型...




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


GPU内存使用情况:
已分配内存: 22.99 GB
缓存内存: 23.07 GB


# 开始训练

## 1.导入必要的库

In [12]:
from peft import LoraConfig, TaskType, get_peft_model
import torch
from trl import SFTTrainer, SFTConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import gc
import os

# Seed PyTorch's random number generator.
# NOTE(review): the first import cell seeds with 42 through `set_seed`; confirm which
# seed is the intended one.
torch.manual_seed(2423)

<torch._C.Generator at 0x7f34b5652150>

## 2.设置路径和基础配置

In [13]:
# Local model / dataset locations and the directory that receives training output.
model_path = "autodl-tmp/models/Qwen2.5-7B-Instruct"
dataset_path = "autodl-tmp/datas/lora_finetune_dataset_functioncall_sample600.jsonl"
username = "zyss1"
base_log_dir = "autodl-tmp/log"
output_dir = os.path.join(base_log_dir, "qwen-2.5-fn_call")
os.makedirs(output_dir, exist_ok=True)
print(f"模型输出将保存在: {output_dir}")

# Collect garbage before emptying the CUDA cache: blocks still held by uncollected
# objects cannot be released by `empty_cache()`.
gc.collect()
torch.cuda.empty_cache()

模型输出将保存在: autodl-tmp/log/qwen-2.5-fn_call


9945

## 3.定义特殊标记和准备Tokenizer

In [14]:
from enum import Enum

class ChatmlSpecialTokens(str, Enum):
    """String-valued enum of the markers the chat format is built from."""

    # Tool definitions
    tools = "<tools>"
    eotools = "</tools>"
    # Reasoning
    think = "<think>"
    eothink = "</think>"
    # Tool invocation and its result
    tool_call = "<tool_call>"
    eotool_call = "</tool_call>"
    tool_response = "<tool_response>"
    eotool_response = "</tool_response>"
    # Final answer
    answer = "<answer>"
    eoanswer = "</answer>"
    # Turn boundaries
    start_of_turn = "<start_of_turn>"
    end_of_turn = "<end_of_turn>"
    # Padding / end of sequence
    pad_token = "<pad>"
    eos_token = "<eos>"

    @classmethod
    def list(cls):
        """Return all marker strings as a list, in the order they are declared."""
        return [*map(lambda member: member.value, cls)]

# Tokenizer from the local checkpoint, with "<pad>" as pad token and all chat markers
# registered as additional special tokens.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    pad_token=ChatmlSpecialTokens.pad_token.value,
    additional_special_tokens=ChatmlSpecialTokens.list()
)

# Chat template pieces, in order: optional BOS, rejection of a leading system message,
# one "<start_of_turn>{role}\n{content}<end_of_turn><eos>\n" block per message, and an
# opened model turn when a generation prompt is requested.
tokenizer.chat_template = "".join([
    "{{ bos_token }}",
    "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}",
    "{% for message in messages %}",
    "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn><eos>\n' }}",
    "{% endfor %}",
    "{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
])

# Report, for each chat marker, whether the tokenizer lists it as a special token.
print("特殊标记检查:")
for token in ChatmlSpecialTokens.list():
    if token in tokenizer.all_special_tokens:
        status = '在特殊标记列表中'
    else:
        status = '不在特殊标记列表中'
    print(f"{token}: {status}")

特殊标记检查:
<tools>: 在特殊标记列表中
</tools>: 在特殊标记列表中
<think>: 在特殊标记列表中
</think>: 在特殊标记列表中
<tool_call>: 在特殊标记列表中
</tool_call>: 在特殊标记列表中
<tool_response>: 在特殊标记列表中
</tool_response>: 在特殊标记列表中
<answer>: 在特殊标记列表中
</answer>: 在特殊标记列表中
<start_of_turn>: 在特殊标记列表中
<end_of_turn>: 在特殊标记列表中
<pad>: 在特殊标记列表中
<eos>: 在特殊标记列表中


## 4.加载数据集并预处理

In [16]:
# Load the JSONL file and rename the "conversation" column to "messages".
dataset = load_dataset('json', data_files=dataset_path).rename_column("conversation", "messages")

# Show the type of the first row's messages and, if it is a list, the messages themselves.
print("\nDataset sample structure:")
sample = dataset["train"][0]
print(type(sample["messages"]))
if isinstance(sample["messages"], list):
    print(sample["messages"])
else:
    print("Not a list")

def preprocess(sample):
    """Convert a conversation row into the single ``"text"`` string used for training.

    If the conversation opens with a ``system`` message, that message's content and a
    fixed instruction to reason inside ``<think>`` tags are prepended to the next
    message, and the system message is dropped (the chat template refuses system turns).

    ``sample`` itself is not modified, which keeps the function idempotent when it is
    called more than once on the same row.

    Args:
        sample: Row containing ``"messages"``, a list of ``{"role", "content"}`` dicts.

    Returns:
        ``{"text": ...}`` with the templated conversation, or with the literal
        ``"Error: unexpected message format"`` when ``"messages"`` is not a list.
    """
    messages = sample["messages"]

    if not isinstance(messages, list):
        # NOTE(review): this placeholder string becomes a training example; consider
        # filtering such rows out of the dataset instead.
        return {"text": "Error: unexpected message format"}

    if messages and isinstance(messages[0], dict) and "role" in messages[0]:
        if messages[0]["role"] == "system":
            system_message_content = messages[0]["content"]
            # The slice is a separate list; the input list is not altered.
            remaining = messages[1:]
            if remaining:
                # Replace the first remaining message with a merged copy instead of
                # writing into the caller's dict (which made re-runs stack the prefix).
                remaining[0] = {
                    **remaining[0],
                    "content": (
                        system_message_content +
                        "Also, before making a call to a function take the time to plan the function to take. "
                        "Make that thinking process between <think>{your thoughts}</think>\n\n" +
                        remaining[0]["content"]
                    ),
                }
            messages = remaining

    return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}

# Replace the "messages" column with the templated "text" column.
dataset = dataset.map(preprocess, remove_columns=["messages"])
# Hold out 5% for evaluation. The explicit seed keeps split membership identical across
# kernel restarts and independent of the order in which cells were executed.
dataset = dataset["train"].train_test_split(test_size=0.05, seed=42)
print(dataset)


Dataset sample structure:
<class 'list'>
[{'role': 'system', 'content': '你现在是一位在上海陆家嘴区域活动的人，请以第一人称视角回忆你的活动轨迹。下面<tools></tools>标签中提供了一些可以辅助你回忆细节的工具函数：<tools>[  {    "type": "function",     "function": {      "name": "query_date",       "description": "生成一个工作日日期，帮助你回忆具体是哪一天",       "parameters": {        "type": "object",         "properties": {}      }    }  },   {    "type": "function",     "function": {      "name": "get_lujiazui_transition_info",       "description": "帮助你回忆进出陆家嘴的信息，包括从哪里来、什么时候到达、离开时间和去向",       "parameters": {        "type": "object",         "properties": {          "person_type_inljz": {            "type": "string",             "description": "你在陆家嘴活动的人群画像，仅限于以下五种：\'工作人群\',\'陆家嘴本地居民\',\'旅游人群\',\'商务人群\',\'到访商业消费人群\'"          }        },         "required": ["person_type_inljz"]      }    }  },   {    "type": "function",     "function": {      "name": "query_location_info",       "description": "帮助你回忆在陆家嘴进行活动的具体地点",       "parameters": {        "type": "object",   

## 5.加载模型

In [17]:
# Drop any model an earlier cell (or an earlier run of this cell) left behind, so the
# old and the new copy never have to coexist in memory during loading.
model = None
# Collect garbage before emptying the CUDA cache: blocks still held by uncollected
# objects cannot be released by `empty_cache()`.
gc.collect()
torch.cuda.empty_cache()

print("\n开始加载模型...")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    attn_implementation='eager',
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Resize the embedding matrix to the tokenizer's current vocabulary size.
# NOTE(review): if the checkpoint's embedding matrix is larger than len(tokenizer),
# this call shrinks it — confirm that is intended.
model.resize_token_embeddings(len(tokenizer))

# Report GPU memory after loading.
if torch.cuda.is_available():
    print("\nGPU内存使用情况:")
    print(f"已分配内存: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"缓存内存: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


开始加载模型...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


GPU内存使用情况:
已分配内存: 18.59 GB
缓存内存: 19.79 GB


## 6.配置LoRA参数

In [18]:
# LoRA hyper-parameters
# r: dimension of the low-rank matrices; smaller means higher compression and less GPU memory needed for training
rank_dimension = 16  
# lora_alpha: scaling factor of the LoRA layers; higher means a stronger adjustment
lora_alpha = 32  # reduced from 64 to 32 for better stability
# lora_dropout: dropout probability of the LoRA layers, to prevent overfitting
lora_dropout = 0.05  

# LoRA configuration for function calling with Qwen2.5
peft_config = LoraConfig(
    r=rank_dimension,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    # Target modules - make sure these layer names exist in the Qwen2.5 model
    target_modules=[
        "q_proj",     # attention query projection
        "k_proj",     # attention key projection
        "v_proj",     # attention value projection
        "o_proj",     # attention output projection
        "gate_proj",  # MLP gate projection
        "up_proj",    # MLP up projection
        "down_proj",  # MLP down projection
        "lm_head",    # language-model output head
    ],
    bias="none",     # do not train bias terms, saving parameters
    task_type=TaskType.CAUSAL_LM,  # task type: causal language model
)

# Apply LoRA to the model
# NOTE(review): this rebinds `model` to the wrapped model, so re-running the cell wraps
# an already wrapped model — reload the base model first when re-running.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # print the share of trainable parameters

trainable params: 42,854,352 || all params: 7,655,696,848 || trainable%: 0.5598


## 7.配置训练参数

In [29]:
per_device_train_batch_size = 1  # training batch size per GPU
per_device_eval_batch_size = 1   # evaluation batch size per GPU
gradient_accumulation_steps = 4  # gradient accumulation steps; raising it is equivalent to a larger batch
logging_steps = 5               # log every this many steps
learning_rate = 1e-4            # initial learning rate

max_grad_norm = 1.0             # gradient clipping value, to prevent exploding gradients
num_train_epochs = 1            # number of training epochs
warmup_ratio = 0.1              # warm-up ratio; the learning rate ramps up from 0 to the configured value
lr_scheduler_type = "cosine"    # learning-rate scheduler type
max_seq_length = 4096           # maximum sequence length; adjust to your data

# NOTE(review): a later cell rebinds `training_arguments` to a plain TrainingArguments
# object, so this SFTConfig (including packing / max_seq_length) is not what the trainer
# in that cell receives — confirm which configuration is intended.
training_arguments = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    save_strategy="no",          # do not save intermediate checkpoints
    eval_strategy="epoch",       # evaluate once at the end of every epoch
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    weight_decay=0.1,            # L2 regularisation strength
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",     # record metrics with tensorboard
    bf16=True,                   # train with bfloat16 mixed precision
    hub_private_repo=False,      # option for uploading to the Hub
    push_to_hub=False,           # whether to push the model to the Hugging Face Hub
    num_train_epochs=num_train_epochs,
    # Enable gradient checkpointing to save GPU memory
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    packing=True,                # pack sequences to improve training efficiency
    # NOTE(review): the name of this length argument may differ between TRL releases —
    # check it against the installed version.
    max_seq_length=max_seq_length,
)

## 8.创建SFT训练器并开始训练

In [None]:
import logging
import time
import os
import gc
import torch
from datetime import datetime
from transformers.trainer_callback import TrainerCallback
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
from transformers import AutoTokenizer
from trl import SFTTrainer
from peft import LoraConfig

# Enable offline mode.
# NOTE(review): this is set after the Hugging Face libraries were imported; if they read
# the variable at import time it has no effect here. The model load in this cell also
# passes local_files_only=True.
os.environ["HF_HUB_OFFLINE"] = "1"

# Force a memory clean-up. Release the model / trainer left over from earlier cells
# first — as long as those names are bound, their GPU memory cannot be reclaimed — then
# collect garbage, and only then empty the CUDA cache.
model = None
trainer = None
gc.collect()
torch.cuda.empty_cache()

# Environment variable intended to tune CUDA memory allocation.
# NOTE(review): allocator settings are presumably read when CUDA is first used; earlier
# cells already allocated GPU memory, so confirm this takes effect or move it to the top
# of the notebook.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:64'

# Output directory and model path.
output_dir = "./qwen_output"
model_name = "/root/autodl-tmp/models/Qwen2.5-7B-Instruct"

# Create the log directory.
log_dir = os.path.join(output_dir, "logs")
os.makedirs(log_dir, exist_ok=True)

# Set up logging: one timestamped file per run, mirrored to the notebook output.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"training_log_{timestamp}.txt")

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so the Chinese log messages are written correctly whatever
        # the locale's default encoding is.
        logging.FileHandler(log_file, encoding="utf-8"),
        logging.StreamHandler()
    ],
    # Without force=True, basicConfig does nothing once the root logger has handlers,
    # so re-running this cell would keep writing to the previous run's log file.
    force=True,
)
logger = logging.getLogger(__name__)
logger.info(f"日志将保存在: {log_file}")

# 4-bit quantisation configuration (NF4 with double quantisation, float16 compute dtype)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load the model
# NOTE(review): unlike the earlier bf16 load, this model is not passed through
# `resize_token_embeddings` — confirm the added special-token ids fit in its embedding
# matrix.
logger.info("开始加载模型...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    device_map="auto",
    local_files_only=True,
    # NOTE(review): this allows code shipped with the checkpoint to run — confirm it is needed.
    trust_remote_code=True
)
logger.info("模型加载完成")

# LoRA configuration; this rebinds `peft_config` and targets only the attention projections
peft_config = LoraConfig(
    r=10,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

# Define the training arguments.
from transformers import TrainingArguments

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    learning_rate=2e-5,
    gradient_accumulation_steps=10,
    gradient_checkpointing=True,
    per_device_eval_batch_size=1,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=20,
    save_steps=100,
    logging_strategy="steps",
    logging_dir=log_dir,
    logging_first_step=True,
    report_to=["tensorboard"],
    save_strategy="steps",
    # Reloading the best checkpoint needs matching save / eval schedules: both are
    # step-based here with an interval of 100.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # `eval_strategy` replaces the deprecated `evaluation_strategy` spelling and matches
    # the name already used by the SFTConfig cell in this notebook.
    eval_strategy="steps",
    eval_steps=100,
    warmup_ratio=0.1,
    weight_decay=0.01,
)

# Custom progress-reporting callback.
class DetailedProgressCallback(TrainerCallback):
    """Log progress, timing, latest loss and GPU memory at every logging step.

    Step durations are measured between ``on_step_begin`` and ``on_step_end``. The
    remaining-time estimate is the number of remaining steps multiplied by the mean
    duration of the most recent ``RECENT_STEPS`` steps. Evaluation losses are logged
    from ``on_evaluate``.
    """

    # How many of the most recent step durations feed the moving average.
    RECENT_STEPS = 20

    def __init__(self):
        self.start_time = time.time()
        # Set here so `on_step_end` never hits a missing attribute if it is called
        # before any `on_step_begin`.
        self.step_start_time = self.start_time
        self.step_times = []

    def on_step_begin(self, args, state, control, **kwargs):
        """Start timing the step; empty the CUDA cache every 10 steps."""
        self.step_start_time = time.time()
        if state.global_step % 10 == 0:
            torch.cuda.empty_cache()

    def on_step_end(self, args, state, control, **kwargs):
        """Record the step duration and, on logging steps, emit a progress line."""
        self.step_times.append(time.time() - self.step_start_time)
        # Only the last RECENT_STEPS durations are ever read, so older ones are
        # discarded instead of accumulating for the whole run.
        del self.step_times[:-self.RECENT_STEPS]
        avg_step_time = sum(self.step_times) / len(self.step_times)

        # Only log details on logging steps. Use the `args` handed to the callback
        # rather than a notebook-level global that may have been rebound since.
        if state.global_step % args.logging_steps == 0:
            elapsed = time.time() - self.start_time
            elapsed_str = time.strftime("%H:%M:%S", time.gmtime(elapsed))

            total_steps = state.max_steps
            remaining_steps = total_steps - state.global_step
            estimated_remaining_time = remaining_steps * avg_step_time
            remaining_str = time.strftime("%H:%M:%S", time.gmtime(estimated_remaining_time))

            # The newest history entry may be an evaluation record without "loss",
            # so search backwards for the most recent training loss.
            loss = next(
                (entry["loss"] for entry in reversed(state.log_history) if "loss" in entry),
                "N/A",
            )
            # Guard against a zero step count so the percentage cannot divide by zero.
            percent_done = state.global_step / total_steps * 100 if total_steps else 0.0

            memory_allocated = torch.cuda.memory_allocated() / 1024**3
            memory_reserved = torch.cuda.memory_reserved() / 1024**3

            logger.info(
                f"步骤: {state.global_step}/{total_steps} ({percent_done:.2f}%) | "
                f"损失: {loss} | "
                f"平均步骤时间: {avg_step_time:.2f}s | "
                f"已运行时间: {elapsed_str} | "
                f"预计剩余时间: {remaining_str} | "
                f"GPU内存: {memory_allocated:.2f}GB/{memory_reserved:.2f}GB"
            )

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Log the validation loss when evaluation metrics are provided."""
        if metrics:
            eval_loss = metrics.get("eval_loss", "N/A")
            logger.info(f"【评估】当前步骤: {state.global_step} | 验证损失: {eval_loss}")

# Cap the dataset size: at most 1000 training rows and 200 test rows, keeping the first
# rows of a split whenever it exceeds its cap.
for split_name, split_label, max_rows in (("train", "训练", 1000), ("test", "测试", 200)):
    current_rows = len(dataset[split_name])
    if current_rows > max_rows:
        logger.info(f"缩减{split_label}集大小: {current_rows} -> {max_rows}")
        dataset[split_name] = dataset[split_name].select(range(max_rows))

# Create the SFT trainer.
# The tokenizer is passed as `processing_class`, the current name of this argument;
# the older `tokenizer=` keyword is deprecated in TRL.
# NOTE(review): requires a TRL release that accepts `processing_class` — confirm against
# the installed version.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    peft_config=peft_config,
    callbacks=[DetailedProgressCallback()]
)

# Log GPU memory usage before training starts.
if torch.cuda.is_available():
    gib = 1024**3
    logger.info("\n训练前GPU内存使用情况:")
    logger.info(f"已分配内存: {torch.cuda.memory_allocated() / gib:.2f} GB")
    logger.info(f"缓存内存: {torch.cuda.memory_reserved() / gib:.2f} GB")

# Log the run configuration, one record per line.
run_summary = (
    f"\n=== 开始训练 | 时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===",
    f"训练集大小: {len(dataset['train'])} 样本",
    f"验证集大小: {len(dataset['test'])} 样本",
    f"批次大小: {training_arguments.per_device_train_batch_size} * {training_arguments.gradient_accumulation_steps} (梯度累积)",
    f"训练轮数: {training_arguments.num_train_epochs}",
    f"学习率: {training_arguments.learning_rate}",
    f"LoRA秩维度: {peft_config.r}",
    f"LoRA缩放因子: {peft_config.lora_alpha}",
    "使用4位量化: 是",
    f"使用梯度检查点: {training_arguments.gradient_checkpointing}",
    f"使用半精度训练: {training_arguments.fp16}",
)
for summary_line in run_summary:
    logger.info(summary_line)

# Start training
train_result = trainer.train()

# Training finished: log the detailed results (runtime is divided by 60 and reported as minutes)
logger.info(f"\n=== 训练完成 | 时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===")
logger.info(f"总训练时间: {train_result.metrics.get('train_runtime', 0)/60:.2f}分钟")
logger.info(f"训练损失: {train_result.training_loss:.4f}")
logger.info(f"训练步数: {train_result.global_step}")
logger.info(f"训练速度: {train_result.metrics.get('train_samples_per_second', 0):.2f} 样本/秒")
logger.info(f"训练吞吐量: {train_result.metrics.get('train_steps_per_second', 0):.2f} 步/秒")

# Save the trained model together with the tokenizer into <output_dir>/final_model
peft_model_path = os.path.join(output_dir, "final_model")
trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)
logger.info(f"最终模型已保存到: {peft_model_path}")

# Final evaluation: log every metric returned by the trainer
logger.info("\n执行最终模型评估...")
eval_results = trainer.evaluate()
for key, value in eval_results.items():
    logger.info(f"{key}: {value}")

logger.info("\n训练流程完成!")

2025-03-20 23:20:41,595 - __main__ - INFO - 日志将保存在: ./qwen_output/logs/training_log_20250320_232041.txt
2025-03-20 23:20:41,598 - __main__ - INFO - 开始加载模型...
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
2025-03-20 23:20:41,720 - accelerate.utils.modeling - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-03-20 23:20:48,547 - __main__ - INFO - 模型加载完成
  trainer = SFTTrainer(


Truncating train dataset:   0%|          | 0/570 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/30 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
2025-03-20 23:20:50,785 - __main__ - INFO - 
训练前GPU内存使用情况:
2025-03-20 23:20:50,787 - __main__ - INFO - 已分配内存: 21.69 GB
2025-03-20 23:20:50,788 - __main__ - INFO - 缓存内存: 22.83 GB
2025-03-20 23:20:50,788 - __main__ - INFO - 
=== 开始训练 | 时间: 2025-03-20 23:20:50 ===
2025-03-20 23:20:50,789 - __main__ - INFO - 训练集大小: 570 样本
2025-03-20 23:20:50,790 - __main__ - INFO - 验证集大小: 30 样本
2025-03-20 23:20:50,791 - __main__ - INFO - 批次大小: 1 * 10 (梯度累积)
2025-03-20 23:20:50,792 - __main__ - INFO - 训练轮数: 3
2025-03-20 23:20:50,792 - __main__ - INFO - 学习率: 2e-05
2025-03-20 23:20:50,793 - __main__ - INFO - LoRA秩维度: 10
2025-03-20 23:20:50,794 - __main__ - INFO - LoRA缩放因子: 32
2025-03-20 23:20:50,795 - __main__ - INFO - 使用4位量化: 是
2025-03-20 23:20:50,796 - _

Step,Training Loss,Validation Loss


2025-03-20 23:23:08,158 - __main__ - INFO - 步骤: 20/171 (11.70%) | 损失: 2.6866 | 平均步骤时间: 6.83s | 已运行时间: 00:02:19 | 预计剩余时间: 00:17:11 | GPU内存: 7.30GB/11.14GB
