In [1]:
import sys
import os

# 添加项目根目录到 sys.path
# 获取当前工作目录
current_dir = get_ipython().run_line_magic('pwd', '')
# 假设项目根目录是当前工作目录的上两级目录
project_root = os.path.abspath(os.path.join(current_dir, '..'))

sys.path.append(project_root)
print(project_root)
print(sys.path)
import os
from collections import deque
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import draccus
import torch
import torch.distributed as dist
import tqdm
from accelerate import PartialState
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
from transformers import AutoConfig, AutoImageProcessor
from transformers.modeling_outputs import CausalLMOutputWithPast

import wandb
from prismatic.models.backbones.llm.prompting import PurePromptBuilder, VicunaV15ChatPromptBuilder
from prismatic.util.data_utils import PaddedCollatorForActionPrediction
from prismatic.vla.action_tokenizer import ActionTokenizer
from prismatic.vla.datasets import RLDSBatchTransform, RLDSDataset
from prismatic.vla.datasets.rlds.utils.data_utils import save_dataset_statistics

from prismatic.extern.hf.configuration_prismatic import OpenVLAConfig
from prismatic.extern.hf.modeling_prismatic import OpenVLAForActionPrediction
from prismatic.extern.hf.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
import argparse


/root/autodl-tmp/openvla
['/root/autodl-tmp/openvla/myScripts', '/root/miniconda3/envs/openvla/lib/python310.zip', '/root/miniconda3/envs/openvla/lib/python3.10', '/root/miniconda3/envs/openvla/lib/python3.10/lib-dynload', '', '/root/miniconda3/envs/openvla/lib/python3.10/site-packages', '__editable__.openvla-0.0.3.finder.__path_hook__', '/root/autodl-tmp/LIBERO', '/root/autodl-tmp/openvla']


2025-04-28 10:52:06.765628: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-28 10:52:06.824582: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-28 10:52:06.824610: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-28 10:52:06.826555: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-28 10:52:06.841985: I tensorflow/core/platform/cpu_feature_guar

In [2]:
class FinetuneConfig:
    def __init__(
        self,
        vla_path: str = "openvla/openvla-7b",
        data_root_dir: Path = Path("datasets/open-x-embodiment"),
        dataset_name: str = "droid_wipe",
        run_root_dir: Path = Path("runs"),
        adapter_tmp_dir: Path = Path("adapter-tmp"),
        batch_size: int = 2,
        max_steps: int = 400,
        save_steps: int = 200,
        learning_rate: float = 5e-4,
        grad_accumulation_steps: int = 1,
        image_aug: bool = True,
        shuffle_buffer_size: int = 100_000,
        save_latest_checkpoint_only: bool = False,
        use_lora: bool = True,
        lora_rank: int = 32,
        lora_dropout: float = 0.0,
        use_quantization: bool = True,
        wandb_project: str = "openvla",
        wandb_entity: str = "stanford-voltron",
        run_id_note: Optional[str] = None,
    ):
    # fmt: on
        self.vla_path = vla_path
        self.data_root_dir = data_root_dir
        self.dataset_name = dataset_name
        self.run_root_dir = run_root_dir
        self.adapter_tmp_dir = adapter_tmp_dir
        self.batch_size = batch_size
        self.max_steps = max_steps
        self.save_steps = save_steps
        self.learning_rate = learning_rate
        self.grad_accumulation_steps = grad_accumulation_steps
        self.image_aug = image_aug
        self.shuffle_buffer_size = shuffle_buffer_size
        self.save_latest_checkpoint_only = save_latest_checkpoint_only
        self.use_lora = use_lora
        self.lora_rank = lora_rank
        self.lora_dropout = lora_dropout
        self.use_quantization = use_quantization
        self.wandb_project = wandb_project
        self.wandb_entity = wandb_entity
        self.run_id_note = run_id_note



In [8]:
cfg = FinetuneConfig(
    vla_path=Path(r"/root/autodl-tmp/openvla/myScripts/runs/huggingface_models+austin_buds_dataset_converted_externally_to_rlds+b2+lr-0.0005+lora-r32+dropout-0.0"),
    data_root_dir=Path(r"/root/autodl-tmp/modified_libero_rlds/libero_spatial_no_noops"),
    dataset_name='1.0.0',
    batch_size=2,
)
print("Fine-tuning with the following configuration:")
for key, value in cfg.__dict__.items():
    print(f"{key}: {value}")
# 你的 fine-tuning 逻辑...
# 实例化 FinetuneConfig
# [Validate] Ensure GPU Available & Set Device / Distributed Context
assert torch.cuda.is_available(), "Fine-tuning assumes at least one GPU is available!"
distributed_state = PartialState()
torch.cuda.set_device(device_id := distributed_state.local_process_index)  #device_id 通过 distributed_state.local_process_index 获得
torch.cuda.empty_cache()
# Quantization Config =>> only if LoRA fine-tuning
quantization_config = None

Fine-tuning with the following configuration:
vla_path: C:\Users\YangJC\.cache\huggingface\hub\models--openvla--openvla-7b\snapshots\openvla
data_root_dir: C:\Users\YangJC\Desktop\tensorflow_datasets
dataset_name: austin_buds_dataset_converted_externally_to_rlds
run_root_dir: runs
adapter_tmp_dir: adapter-tmp
batch_size: 1
max_steps: 400
save_steps: 200
learning_rate: 0.0005
grad_accumulation_steps: 1
image_aug: True
shuffle_buffer_size: 100000
save_latest_checkpoint_only: False
use_lora: True
lora_rank: 32
lora_dropout: 0.0
use_quantization: True
wandb_project: openvla
wandb_entity: stanford-voltron
run_id_note: None


In [20]:
# AutoConfig.register("openvla", OpenVLAConfig)  
# AutoImageProcessor.register(OpenVLAConfig, PrismaticImageProcessor)
# AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
# AutoModelForVision2Seq.register(OpenVLAConfig, OpenVLAForActionPrediction)

In [16]:
# Load OpenVLA Processor and Model using HF AutoClasses
processor = AutoProcessor.from_pretrained(cfg.vla_path, trust_remote_code=True) # 加载处理器（Processor）  AutoProcessor
vla = AutoModelForVision2Seq.from_pretrained(     # 加载模型（Model） AutoModelForVision2Seq
    cfg.vla_path,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,  #如果启用了量化，则传递quantization_config
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  2.82it/s]


In [17]:
# 检查模型的第一个参数所在的设备
print("Model device:", next(vla.parameters()).device)


Model device: cpu


In [18]:
processor

PrismaticProcessor:
- image_processor: PrismaticImageProcessor {
  "auto_map": {
    "AutoImageProcessor": "processing_prismatic.PrismaticImageProcessor",
    "AutoProcessor": "processing_prismatic.PrismaticProcessor"
  },
  "image_processor_type": "PrismaticImageProcessor",
  "image_resize_strategy": "resize-naive",
  "input_sizes": [
    [
      3,
      224,
      224
    ],
    [
      3,
      224,
      224
    ]
  ],
  "interpolations": [
    "bicubic",
    "bicubic"
  ],
  "means": [
    [
      0.485,
      0.456,
      0.406
    ],
    [
      0.5,
      0.5,
      0.5
    ]
  ],
  "processor_class": "PrismaticProcessor",
  "stds": [
    [
      0.229,
      0.224,
      0.225
    ],
    [
      0.5,
      0.5,
      0.5
    ]
  ],
  "tvf_crop_params": [
    {
      "output_size": [
        224,
        224
      ]
    },
    {
      "output_size": [
        224,
        224
      ]
    }
  ],
  "tvf_do_letterbox": false,
  "tvf_letterbox_fill": null,
  "tvf_normalize_params"

In [19]:
# # Device Placement =>> note that BitsAndBytes automatically handles for quantized training
# if cfg.use_quantization:
#     vla = prepare_model_for_kbit_training(vla)
# else:
#     vla = vla.to(device_id)

In [12]:
{device_id}

{0}

In [28]:
from typing import List, Union

import numpy as np

from transformers import PreTrainedTokenizerBase
class ActionTokenizer:
    def __init__(
        self, tokenizer: PreTrainedTokenizerBase, bins: int = 256, min_action: int = -1, max_action: int = 1
    ) -> None:
        """
        Discretizes continuous robot actions into N bins per dimension and maps to the least used tokens.

        NOTE =>> by default, assumes a BPE-style tokenizer akin to the LlamaTokenizer, where *the least used tokens*
                 appear at the end of the vocabulary!

        :param tokenizer: Base LLM/VLM tokenizer to extend.
        :param bins: Number of bins for each continuous value; we'll adopt a uniform binning strategy.
        :param min_action: Minimum action value (for clipping, setting lower bound on bin interval).
        :param max_action: Maximum action value (for clipping, setting upper bound on bin interval).
        """
        self.tokenizer, self.n_bins, self.min_action, self.max_action = tokenizer, bins, min_action, max_action

        # Create Uniform Bins + Compute Bin Centers
        self.bins = np.linspace(min_action, max_action, self.n_bins) #创建均匀分布的离散化区间（bins）
        self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2.0     #创建区间中心（bin_centers）每个区间的中心点

        # [Contract] Set "action_token_begin_idx" based on `self.tokenizer.vocab_size - (self.n_bins + 1)`
        #   =>> Assumes we're always overwriting the final `n_bins` tokens of the vocabulary!
        self.action_token_begin_idx: int = int(self.tokenizer.vocab_size - (self.n_bins + 1)) #计算动作对应的分词器词汇表的起始索引，离散化后的动作映射到词汇表的最后n_bins个位置

    def __call__(self, action: np.ndarray) -> Union[str, List[str]]: #将输入的连续动作值离散化，并将离散化后的动作索引映射到分词器的词汇表中
        """Clip & bin actions to *the last `n_bins` tokens* of the vocabulary (e.g., tokenizer.vocab[-256:])."""
        action = np.clip(action, a_min=float(self.min_action), a_max=float(self.max_action))
        discretized_action = np.digitize(action, self.bins)   # 离散化动作为整数索引 action 落在哪个区间（bin）内
 
        # Handle single element vs. batch
        if len(discretized_action.shape) == 1:
            return self.tokenizer.decode(list(self.tokenizer.vocab_size - discretized_action))# 映射到词表decoded_action对应的字符串
        else:
            return self.tokenizer.batch_decode((self.tokenizer.vocab_size - discretized_action).tolist())

    def decode_token_ids_to_actions(self, action_token_ids: np.ndarray) -> np.ndarray:
        """
        Returns continuous actions for discrete action token IDs.

        NOTE =>> Because of the way the actions are discretized w.r.t. the bins (and not the bin centers), the
                 digitization returns bin indices between [1, # bins], inclusive, when there are actually only
                 (# bins - 1) bin intervals.

                 Therefore, if the digitization returns the last possible index, we map this to the last bin interval.

        EXAMPLE =>> Let's say self._bins has 256 values. Then self._bin_centers has 255 values. Digitization returns
                    indices between [1, 256]. We subtract 1 from all indices so that they are between [0, 255]. There
                    is still one index (i==255) that would cause an out-of-bounds error if used to index into
                    self._bin_centers. Therefore, if i==255, we subtract 1 from it so that it just becomes the index of
                    the last bin center. We implement this simply via clipping between [0, 255 - 1].
        """
        discretized_actions = self.tokenizer.vocab_size - action_token_ids
        discretized_actions = np.clip(discretized_actions - 1, a_min=0, a_max=self.bin_centers.shape[0] - 1)

        return self.bin_centers[discretized_actions]

    @property
    def vocab_size(self) -> int:
        return self.n_bins

In [36]:
action_tokenizer = ActionTokenizer(processor.tokenizer)
action_tokenizer

<__main__.ActionTokenizer at 0x1c3ce637400>

In [47]:
action = np.array([0.5])
print(action_tokenizer(action))
action = np.array([-0.8])
print(action_tokenizer(action))
action = np.array([0.2])
print(action_tokenizer(action))
action = np.array([0.5, -0.8, 0.2])
print(action_tokenizer(action))

എ
候
麻
എ候麻
