1.准备数据集

In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
import random

def read_json(file_path):
    """读取 JSON 文件"""
    with open(file_path, 'r') as f:
        return json.load(f)

# 定义模式列表
modes = ['absolute', 'obb_rel', 'obb_rot']
# modes=['absolute']
# 假设 train.json 在当前目录，可根据实际路径调整
train_json_path = 'train.json'
test_json_path='test.json'
# 读取训练数据 JSON
train_data_json = read_json(train_json_path)
test_data_json=read_json(test_json_path)

for mode in modes:
    all_train_entries = []
    all_test_entries=[]
    # 遍历每个物体 ID 和对应的信息
    for obj_id, obj_info in train_data_json.items():
        mode_info = obj_info.get(mode, {})
        bbox_code = mode_info.get('bbox_code', '')
        label_code = mode_info.get('label_code', '')
        # 确保 bbox_code 和 label_code 存在
        if bbox_code and label_code:
            all_train_entries.append((bbox_code, label_code))
    
    # 随机打乱数据
    random.shuffle(all_train_entries)
    train_num = int(len(all_train_entries))
    
    # 遍历每个物体 ID 和对应的信息
    for obj_id, obj_info in test_data_json.items():
        mode_info = obj_info.get(mode, {})
        bbox_code = mode_info.get('bbox_code', '')
        label_code = mode_info.get('label_code', '')
        # 确保 bbox_code 和 label_code 存在
        if bbox_code and label_code:
            all_test_entries.append((bbox_code, label_code))
    
    # 随机打乱数据
    random.shuffle(all_test_entries)
    test_num = int(len(all_test_entries))


    # 写入训练集 JSONL 文件
    with open(f'{mode}_train_data.jsonl', 'w') as f:
        for bbox, label in all_train_entries[:train_num]:
            entry = {
                'context': '',
                'question': bbox,
                'answer': label
            }
            f.write(json.dumps(entry) + '\n')
    


    # 写入验证集 JSONL 文件
    with open(f'{mode}_test_data.jsonl', 'w') as f:
        for bbox, label in all_test_entries[:test_num]:
            entry = {
                'context': '',
                'question': bbox,
                'answer': label
            }
            f.write(json.dumps(entry) + '\n')
    
    print(f"Mode: {mode}, Train entries: {len(all_train_entries[:train_num])}, Test entries: {len(all_test_entries[:test_num])}")


Mode: absolute, Train entries: 1871, Test entries: 229
Mode: obb_rel, Train entries: 1871, Test entries: 229
Mode: obb_rot, Train entries: 1871, Test entries: 229


2.加载数据和模型

In [3]:
from datetime import datetime
import os
import sys
import torch
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
)
from transformers import (AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM,
                          TrainingArguments, Trainer, DataCollatorForSeq2Seq)
 
# 加载自己的数据集
from datasets import load_dataset
 
train_dataset = load_dataset('json', data_files='obb_rel_train_data.jsonl', split='train')
test_dataset = load_dataset('json', data_files='obb_rel_test_data.jsonl', split='train')
 
# 读取模型
base_model = '/data/winter25/zhouzy/ZZY/real2code/Scaled_Dataset/model'
 
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True
)
 
tokenizer = AutoTokenizer.from_pretrained(base_model)

Generating train split: 1871 examples [00:00, 143915.03 examples/s]
Generating train split: 229 examples [00:00, 59647.00 examples/s]
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.81s/it]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


先统计一下model自身参数

In [5]:
#统计一下参数，比对一下MODEL
from collections import defaultdict
# model.print_trainable_parameters()
print(model.named_modules)

<bound method Module.named_modules of LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32016, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

3.训前效果

In [5]:
tokenizer.pad_token = tokenizer.eos_token

prompt = """[INST]  You are an AI assistant trained to understand 3D scenes and object relationships. Given the following Oriented Bounding Box (OBB) information, your task is to generate a list of child joints that describes the articulations between object parts.

OBB Information:
{}

Generate a number of root_geom,which means the base object,relative to OBB ID
- root_geom: Integer relative to/ selected from  input OBB ID
Generate a list of child joints. Each joint should be described by a dictionary with the following keys:
- box: The ID of the child bounding box
- type: The joint type ('hinge' for revolute joints, 'slide' for prismatic joints)
- idx: The rotation axis index (0 for x-axis, 1 for y-axis, 2 for z-axis)
- edge: Edge coordinates on the OBB, for example [1, -1]
- sign: Direction of the joint (+1 or -1)

IMPORTANT: Your response must contain ONLY the root_geom number and child_joints list, exactly as shown below, with NO additional text before or after！！！:

root_geom= 
child_joints = [
    dict(box=[child OBB ID], type=[joint type], idx=[rotation axis index], edge=[edge coordinates], sign=[direction]),
    # Additional joints as needed
]


Generate the geom_number and child_joints list: [/INST]
"""
prompts = [prompt.format(train_dataset[i]['question']) for i in [0,]]
 
model_input = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")
 
 
model.eval()
with torch.no_grad():
    outputs = model.generate(**model_input, max_new_tokens=300)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
 
print(outputs)
print("ZeroShot:")
print(train_dataset[0]["answer"])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


["[INST]  You are an AI assistant trained to understand 3D scenes and object relationships. Given the following Oriented Bounding Box (OBB) information, your task is to generate a list of child joints that describes the articulations between object parts.\n\nOBB Information:\nbboxes={\n0:{'center':[-0.02,0.05,0.02],'R':[[0.9,0.3,0.0],[-0.3,0.9,0.0],[0.0,0.0,1.0]],'extent':[1.35,1.35,1.42]},\n1:{'center':[0.01,-0.5,-0.18],'R':[[1.0,0.0,0.0],[0.0,1.0,0.0],[0.0,0.0,1.0]],'extent':[0.49,1.79,0.99]},\n}\n\nGenerate a number of root_geom,which means the base object,relative to OBB ID\n- root_geom: Integer relative to/ selected from  input OBB ID\nGenerate a list of child joints. Each joint should be described by a dictionary with the following keys:\n- box: The ID of the child bounding box\n- type: The joint type ('hinge' for revolute joints, 'slide' for prismatic joints)\n- idx: The rotation axis index (0 for x-axis, 1 for y-axis, 2 for z-axis)\n- edge: Edge coordinates on the OBB, for exam

In [6]:
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"
 
 
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=800,
        padding=False,
        return_tensors=None,
    )
 
    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()
 
    return result
 
 
def generate_and_tokenize_prompt(data_point):
    full_prompt =f"""You are an AI assistant trained to understand 3D scenes and object relationships. Given the following Oriented Bounding Box (OBB) information, your task is to generate a list of child joints that describes the articulations between object parts.

OBB Information:
### Input:
{data_point["question"]}

Generate a number of root_geom,which means the base object,relative to OBB ID
- root_geom: Integer relative to/ selected from  input OBB ID
Generate a list of child joints. Each joint should be described by a dictionary with the following keys:
- box: The ID of the child bounding box
- type: The joint type ('hinge' for revolute joints, 'slide' for prismatic joints)
- idx: The rotation axis index (0 for x-axis, 1 for y-axis, 2 for z-axis)
- edge: Edge coordinates on the OBB, for example [1, -1]
- sign: Direction of the joint (+1 or -1)

IMPORTANT: Your response must contain ONLY the root_geom number and child_joints list, exactly as shown below, with no additional text before or after:

root_geom=[root_geom_number] 
child_joints = [
    dict(box=[child OBB ID], type=[joint type], idx=[rotation axis index], edge=[edge coordinates], sign=[direction]),
    # Additional joints as needed
]


Generate the geom_number and child_joints list:

### Response:
{data_point["answer"]}
"""
    return tokenize(full_prompt)
 
 
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_test_dataset = test_dataset.map(generate_and_tokenize_prompt)
 
 
model.train() # put model back into training mode
model = prepare_model_for_kbit_training(model)
 
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)


model = get_peft_model(model, config)
model.print_trainable_parameters()
print(model.named_modules)
# print("\n================= 模型可训练参数（LoRA 适配器） =================")
# print("以下参数包含 'lora' 关键词，表明已应用 LoRA 适配器：")
# for name, param in model.named_parameters():
#     if param.requires_grad and "lora" in name:
#         print(f"✅ 可训练参数: {name}")

# print("\n================= 所有可训练参数数量 =================")
# trainable_params = sum(p.requires_grad for p in model.parameters())
# all_params = sum(p.numel() for p in model.parameters())
# print(f"可训练参数数量: {trainable_params}")
# print(f"模型总参数数量: {all_params}")
# print(f"可训练参数占比: {trainable_params / all_params:.4%}")



# keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
if torch.cuda.device_count() > 1:
    model.is_parallelizable = True
    model.model_parallel = True
 
 
 
batch_size = 32
per_device_train_batch_size = 32
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = "code-llama-ft"
 
training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        max_steps=400,
        learning_rate=3e-4,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        eval_strategy="steps", # if val_set_size > 0 else "no",
        save_strategy="steps",
        eval_steps=20,
        save_steps=20,
        output_dir=output_dir,
        load_best_model_at_end=False,
        group_by_length=True, # group sequences of roughly the same length together to speed up training
        report_to="none", # if use_wandb else "none", wandb
        run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}", # if use_wandb else None,
    )
 
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

Map:   0%|          | 0/1871 [00:00<?, ? examples/s]

Map: 100%|██████████| 1871/1871 [00:02<00:00, 779.66 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 769.71 examples/s] 


trainable params: 16,777,216 || all params: 6,755,323,904 || trainable%: 0.2484
<bound method Module.named_modules of PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32016, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedd

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
model.config.use_cache = False
 
old_state_dict = model.state_dict
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)
if torch.__version__ >= "2" and sys.platform != "win32":
    print("compiling the model")
    model = torch.compile(model)
trainer.train()

In [4]:
import torch
from peft import PeftModel,PeftConfig
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

base_model="/data/winter25/zhouzy/ZZY/real2code/Scaled_Dataset/model"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(base_model)
from datasets import load_dataset
 
train_dataset = load_dataset('json', data_files='obb_rel_train_data.jsonl', split='train')
test_dataset = load_dataset('json', data_files='obb_rel_test_data.jsonl', split='train')
 
 
output_dir = "/data/winter25/zhouzy/ZZY/real2code/Scaled_Dataset/P-dataset/code-llama-ft/checkpoint-400"
model = PeftModel.from_pretrained(model,output_dir,output_loading_info=True)
# print(model.named_modules)
 
eval_data=test_dataset[80]["question"]
eval_prompt = f"""You are an AI assistant trained to understand 3D scenes and object relationships. Given the following Oriented Bounding Box (OBB) information, your task is to generate a list of child joints that describes the articulations between object parts.

OBB Information:
### Input:
{eval_data}

Generate a number of root_geom,which means the base object,relative to OBB ID
- root_geom: Integer relative to/ selected from  input OBB ID
Generate a list of child joints. Each joint should be described by a dictionary with the following keys:
- box: The ID of the child bounding box
- type: The joint type ('hinge' for revolute joints, 'slide' for prismatic joints)
- idx: The rotation axis index (0 for x-axis, 1 for y-axis, 2 for z-axis)
- edge: Edge coordinates on the OBB, for example [1, -1]
- sign: Direction of the joint (+1 or -1)

IMPORTANT: Your response must contain ONLY the root_geom number and child_joints list, exactly as shown below, with no additional text before or after:

root_geom=[root_geom_number] 
child_joints = [
    dict(box=[child OBB ID], type=[joint type], idx=[rotation axis index], edge=[edge coordinates], sign=[direction]),
    # Additional joints as needed
]


Generate the geom_number and child_joints list:

Generate the child_joints list:

### Response:
"""
 
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
 
model.eval()
with torch.no_grad():
    outputs = model.generate(**model_input, max_new_tokens=100)[0]
print("训后回答")
print(tokenizer.decode(outputs, skip_special_tokens=True))
print("对应label")
print(test_dataset[80]["answer"])


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards: 100%|██████████| 2/2 [00:30<00:00, 15.41s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


训后回答
You are an AI assistant trained to understand 3D scenes and object relationships. Given the following Oriented Bounding Box (OBB) information, your task is to generate a list of child joints that describes the articulations between object parts.

OBB Information:
### Input:
bboxes={
0:{'center':[-0.27,0.24,0.23],'R':[[1.0,0.0,0.0],[0.0,1.0,0.0],[0.0,0.0,1.0]],'extent':[0.53,1.08,0.05]},
1:{'center':[-0.01,-0.59,0.23],'R':[[1.0,0.0,0.0],[0.0,1.0,0.0],[0.0,0.0,1.0]],'extent':[1.08,0.53,0.05]},
2:{'center':[0.27,0.24,0.23],'R':[[1.0,0.0,0.0],[0.0,1.0,0.0],[0.0,0.0,1.0]],'extent':[0.53,1.08,0.05]},
3:{'center':[-0.0,0.07,-0.07],'R':[[1.0,0.0,0.0],[0.0,1.0,0.0],[0.0,0.0,1.0]],'extent':[1.09,1.63,0.45]},
}

Generate a number of root_geom,which means the base object,relative to OBB ID
- root_geom: Integer relative to/ selected from  input OBB ID
Generate a list of child joints. Each joint should be described by a dictionary with the following keys:
- box: The ID of the child bounding box

打印参数查看无法正确读入的原因