In [None]:
%connect_info

In [1]:
import os
from collections import deque
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import draccus
import torch
import torch.distributed as dist
import tqdm
from accelerate import PartialState
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
from transformers import AutoConfig, AutoImageProcessor
from transformers.modeling_outputs import CausalLMOutputWithPast

import wandb
from prismatic.models.backbones.llm.prompting import PurePromptBuilder, VicunaV15ChatPromptBuilder
from prismatic.util.data_utils import PaddedCollatorForActionPrediction
from prismatic.vla.action_tokenizer import ActionTokenizer
from prismatic.vla.datasets import RLDSBatchTransform, RLDSDataset
from prismatic.vla.datasets.rlds.utils.data_utils import save_dataset_statistics

from prismatic.extern.hf.configuration_prismatic import OpenVLAConfig
from prismatic.extern.hf.modeling_prismatic import OpenVLAForActionPrediction
from prismatic.extern.hf.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor


  from .autonotebook import tqdm as notebook_tqdm
2025-05-19 18:42:06.855894: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-19 18:42:06.855985: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-19 18:42:06.857157: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-19 18:42:06.863034: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# NOTE(review): `cfg` is not defined anywhere in the visible notebook; this cell
# only runs if a config object exposing `use_quantization`, `use_lora` and
# `vla_path` already exists in the kernel — confirm where it is meant to come from.

# Optional 4-bit NF4 quantization (bfloat16 compute); rejected unless LoRA is enabled.
quantization_config = None
if cfg.use_quantization:
    assert cfg.use_lora, "Quantized training only supported for LoRA fine-tuning!"
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4"
    )

# Register OpenVLA model to HF Auto Classes (not needed if the model is on HF Hub)
AutoConfig.register("openvla", OpenVLAConfig)
AutoImageProcessor.register(OpenVLAConfig, PrismaticImageProcessor)
AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
AutoModelForVision2Seq.register(OpenVLAConfig, OpenVLAForActionPrediction)

# Load the OpenVLA processor through the HF Auto classes (the model is loaded at the end of the cell)
processor = AutoProcessor.from_pretrained(cfg.vla_path, trust_remote_code=True)

# Create Action Tokenizer on top of the processor's text tokenizer
action_tokenizer = ActionTokenizer(processor.tokenizer)

# Batch transform built from the action tokenizer, the text tokenizer and the
# processor's image transform. Paths containing "v01" select the Vicuna v1.5 chat
# prompt builder; every other path uses the pure prompt builder.
batch_transform = RLDSBatchTransform(
    action_tokenizer,
    processor.tokenizer,
    image_transform=processor.image_processor.apply_transform,
    prompt_builder_fn=PurePromptBuilder if "v01" not in cfg.vla_path else VicunaV15ChatPromptBuilder,
)

# Load the base VLA in bfloat16; `quantization_config` is None unless quantization was requested above.
vla = AutoModelForVision2Seq.from_pretrained(
    cfg.vla_path,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
print("base vla from pretrained")


In [None]:
# Install minimal dependencies (`torch`, `transformers`, `timm`, `tokenizers`, ...)
# > pip install -r https://raw.githubusercontent.com/openvla/openvla/main/requirements-min.txt
from transformers import AutoModelForVision2Seq, AutoProcessor
from PIL import Image

import torch

# Load Processor & VLA
# Alternative checkpoint locations tried previously (kept for reference):
# model_path = r"/root/autodl-tmp/openvla/myScripts/autodl-tmp/huggingface_models"
# model_path = r"/root/autodl-tmp/openvla/myScripts/runs/huggingface_models+austin_buds_dataset_converted_externally_to_rlds+b2+lr-0.0005+lora-r32+dropout-0.0"
# Shell commands, presumably the ones used to fetch the base checkpoint via the HF mirror:
#! export HF_ENDPOINT=https://hf-mirror.com
#! huggingface-cli download --resume-download openvla/openvla-7b --local-dir openvla7b_huggingfacemodel
# model_path = r"/home/chuangzhi/zhq/yjc/openvla7b_huggingfacemodel"
# Active checkpoint: a local run directory (absolute, machine-specific path).
model_path = r"/home/chuangzhi/zhq/yjc/runs/openvla7b_huggingfacemodel+libero_spatial_no_noops+b2+lr-0.0005+lora-r32+dropout-0.0+example_dataset+b1+lr-0.0005+lora-r32+dropout-0.0"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
# Load the model in bfloat16 and move it to the first GPU.
vla = AutoModelForVision2Seq.from_pretrained(
    model_path, 
    attn_implementation="flash_attention_2",  # [Optional] Requires `flash_attn`
    torch_dtype=torch.bfloat16, 
    low_cpu_mem_usage=True, 
    trust_remote_code=True
).to("cuda:0")

# Grab image input & format prompt
# image: Image.Image = get_from_camera(...)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
# Grab image input & format prompt
image_path = r'/root/autodl-tmp/openvla/myScripts/1.jpg' # replace with the path to your image
image = Image.open(image_path)  # load the image

# NOTE(review): this is a plain string (not an f-string), so the literal text
# "{<INSTRUCTION>}" is passed to the processor as-is — substitute a real task
# instruction before relying on the predicted action.
prompt = "In: What action should the robot take to {<INSTRUCTION>}?\nOut:"

# Run the processor on the prompt and image, then move the resulting batch to
# cuda:0, requesting bfloat16 as the dtype.
inputs = processor(prompt, image).to("cuda:0", dtype=torch.bfloat16)


In [14]:
inputs.keys()


dict_keys(['input_ids', 'attention_mask', 'pixel_values'])

In [15]:
# Report where each entry of `inputs` lives; anything that is not a tensor is labelled instead.
devices = dict(
    (name, entry.device if isinstance(entry, torch.Tensor) else "Not a Tensor")
    for name, entry in inputs.items()
)
print(devices)

{'input_ids': device(type='cuda', index=0), 'attention_mask': device(type='cuda', index=0), 'pixel_values': device(type='cuda', index=0)}


In [16]:
# inputs['input_ids'].device  # `.device` is an attribute, not a method; calling it raises the TypeError below

TypeError: 'torch.device' object is not callable

In [6]:
inputs['attention_mask'].shape

torch.Size([1, 22])

In [7]:
inputs['pixel_values'].shape

torch.Size([1, 6, 224, 224])

In [8]:
inputs['pixel_values'][ :, :2,:4,:4]

tensor([[[[-0.1118,  0.2139,  0.3867,  0.5742],
          [-0.0089,  0.3008,  0.4551,  0.6250],
          [ 0.1455,  0.4023,  0.5391,  0.6953],
          [ 0.3008,  0.4883,  0.6250,  0.7617]],

         [[-0.1582,  0.0869,  0.2969,  0.4023],
          [-0.0359,  0.1748,  0.3496,  0.4727],
          [ 0.0869,  0.2793,  0.4375,  0.5430],
          [ 0.2100,  0.2969,  0.4727,  0.5586]]]], device='cuda:0',
       dtype=torch.bfloat16)

In [9]:
# Predict an action without sampling (do_sample=False). `unnorm_key` names the
# statistics entry used to un-normalize the prediction.
# NOTE(review): "bridge_orig" must be present in the loaded checkpoint's norm stats — confirm for fine-tuned checkpoints.
action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)
print(action)
# # Execute...
# robot.act(action, ...)

[ 0.00538307  0.00766987 -0.00335915 -0.02568265  0.04945926  0.10147657
  0.        ]


In [10]:
action.shape

(7,)

In [1]:
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers import AutoConfig, AutoImageProcessor
from prismatic.models.backbones.llm.prompting import PurePromptBuilder, VicunaV15ChatPromptBuilder
from prismatic.vla.action_tokenizer import ActionTokenizer
from typing import Type, Any
from prismatic.extern.hf.configuration_prismatic import OpenVLAConfig
from prismatic.extern.hf.modeling_prismatic import OpenVLAForActionPrediction
from prismatic.extern.hf.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
from prismatic.models.backbones.llm.prompting import PurePromptBuilder, VicunaV15ChatPromptBuilder
from prismatic.util.data_utils import PaddedCollatorForActionPrediction
from prismatic.vla.action_tokenizer import ActionTokenizer
from prismatic.vla.datasets import RLDSBatchTransform, RLDSDataset
from prismatic.vla.datasets.rlds.utils.data_utils import save_dataset_statistics
# Register the OpenVLA config, image processor, processor and model with the HF Auto classes
AutoConfig.register("openvla", OpenVLAConfig)
AutoImageProcessor.register(OpenVLAConfig, PrismaticImageProcessor)
AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
AutoModelForVision2Seq.register(OpenVLAConfig, OpenVLAForActionPrediction)


  from .autonotebook import tqdm as notebook_tqdm
2025-05-22 16:55:33.800944: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-22 16:55:33.801044: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-22 16:55:33.802274: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-22 16:55:33.808140: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from typing import Dict, List, Optional, Union, Any
import json
import os

class OpenVLAInference:
    """Convenience wrapper for running a single (instruction, image) pair through an OpenVLA checkpoint.

    The instance keeps a few intermediate results as attributes so they can be
    inspected from the notebook after a call:

    * ``prompt_builder`` / ``turn_count`` -- set by :meth:`_build_prompt`
    * ``input_ids`` -- tokenizer-only ids of the prompt, set by :meth:`preprocess_inputs`
    * ``output`` -- full token sequence returned by ``model.generate``
    * ``action`` -- result of ``model.predict_action``
    """

    def __init__(self, model_path, device="cuda", unnorm_key="example_dataset"):
        """
        Load the processor, model and action tokenizer.

        Args:
            model_path: Checkpoint location (local directory or HuggingFace Hub id);
                ``str`` or ``os.PathLike``.
            device: Requested device (``"cuda"``, ``"cuda:N"`` or ``"cpu"``). Falls back
                to ``"cpu"`` when CUDA is unavailable or a non-CUDA device is requested.
            unnorm_key: Key into the normalization statistics, forwarded to
                ``predict_action``. Defaults to the previously hard-coded
                ``"example_dataset"``.

        Raises:
            AssertionError: if ``unnorm_key`` is missing from the normalization statistics.
        """
        # Accept pathlib.Path / torch.device as well as plain strings.
        model_path = os.fspath(model_path)
        device = str(device)
        self.device = device if torch.cuda.is_available() and device.startswith("cuda") else "cpu"
        self.unnorm_key = unnorm_key

        # Prefer the statistics file stored next to the checkpoint. When it is present,
        # validate the key *before* the expensive model load so a typo fails fast.
        norm_stats = None
        dataset_statistics_path = os.path.join(model_path, "dataset_statistics.json")
        if os.path.isfile(dataset_statistics_path):
            with open(dataset_statistics_path, "r") as f:
                norm_stats = json.load(f)
            self._require_unnorm_key(norm_stats)

        # Load processor and model.
        self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForVision2Seq.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True
        ).to(self.device)

        # No statistics file (e.g. a Hub id): fall back to whatever statistics the
        # loaded model already carries instead of crashing on a missing attribute.
        if norm_stats is None:
            norm_stats = getattr(self.model, "norm_stats", None) or {}
            self._require_unnorm_key(norm_stats)
        self.norm_stats = norm_stats
        self.model.norm_stats = self.norm_stats

        # Action tokenizer built on the processor's text tokenizer.
        self.action_tokenizer = ActionTokenizer(self.processor.tokenizer)

        # Pick the prompt builder from the checkpoint name.
        self.prompt_builder_cls = (
            PurePromptBuilder if "v01" not in model_path
            else VicunaV15ChatPromptBuilder
        )

        # Inference only.
        self.model.eval()

    def _require_unnorm_key(self, norm_stats) -> None:
        """Assert that ``self.unnorm_key`` is a key of ``norm_stats``."""
        assert self.unnorm_key in norm_stats, f"Action un-norm key {self.unnorm_key} not found in VLA `norm_stats`!"

    def _build_prompt(self, text_instruction: str) -> str:
        """
        Build the inference prompt for one instruction.

        Args:
            text_instruction: Task instruction (e.g. "wipe the table"); it is lower-cased
                before being inserted into the question template.

        Returns:
            The prompt string produced by the prompt builder after a single human turn.
        """
        # A fresh builder per call so turns never accumulate across prompts.
        self.prompt_builder = self.prompt_builder_cls("openvla")
        conversation = [
            {"from": "human", "value":f"What action should the robot take to {text_instruction.lower()}?"},
        ]
        # Only the human turn is added; the model is expected to produce the reply.
        for turn in conversation:
            self.prompt_builder.add_turn(turn["from"], turn["value"])
        self.turn_count = self.prompt_builder.turn_count
        return self.prompt_builder.get_prompt()

    def preprocess_inputs(self, text_prompt: str, image: Any) -> Dict[str, torch.Tensor]:
        """
        Turn an instruction and an image into model inputs on ``self.device``.

        Args:
            text_prompt: Task instruction.
            image: A PIL image, or a string path that is opened and converted to RGB.

        Returns:
            Dict of processor outputs moved to ``self.device``; ``pixel_values`` is cast
            to bfloat16.
        """
        # Load the image when a path was given.
        if isinstance(image, str):
            image = Image.open(image).convert("RGB")

        prompt_text = self._build_prompt(text_prompt)
        # Tokenizer-only ids of the prompt, kept on the instance for inspection.
        self.input_ids = self.processor.tokenizer(prompt_text, add_special_tokens=True).input_ids
        self.input_ids = torch.tensor(self.input_ids)

        inputs = self.processor(
            text=prompt_text,
            images=image,
            return_tensors="pt",
            truncation=True
        )

        # Move everything to the target device; only the image tensor changes dtype.
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

        return inputs

    def generate_actions(self, text_prompt: str, image: Any,
                       max_new_tokens: int = 512,
                       temperature: float = 0) -> Any:
        """
        Generate action tokens for one instruction/image pair and decode them.

        Runs both ``model.generate`` (result stored in ``self.output``) and
        ``model.predict_action`` (result stored in ``self.action``).

        Args:
            text_prompt: Task instruction.
            image: PIL image or image path.
            max_new_tokens: Upper bound on tokens produced by ``model.generate``.
            temperature: Sampling temperature; ``0`` (or less) means greedy decoding.

        Returns:
            The value of ``action_tokenizer.decode_token_ids_to_actions`` applied to the
            generated action tokens (a single array-like, not a tuple).
        """
        inputs = self.preprocess_inputs(text_prompt, image)

        # Only forward `temperature` when sampling: a zero temperature is meaningless
        # for greedy decoding and makes `generate` complain.
        do_sample = temperature > 0
        sampling_kwargs = {"do_sample": do_sample}
        if do_sample:
            sampling_kwargs["temperature"] = temperature

        tokenizer = self.processor.tokenizer
        # Autocast must target the device type actually in use ("cuda" or "cpu").
        autocast_device = torch.device(self.device).type
        with torch.no_grad(), torch.autocast(autocast_device, dtype=torch.bfloat16):
            self.output = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                **sampling_kwargs,
            )
            self.action = self.model.predict_action(**inputs,
                                               unnorm_key=self.unnorm_key,
                                               **sampling_kwargs,
                                               )

        # Look only at newly generated tokens so that rare prompt tokens whose ids
        # happen to fall in the action range are never decoded as actions.
        prompt_len = inputs["input_ids"].shape[1]
        generated = self.output[:, prompt_len:]
        mask = generated > self.action_tokenizer.action_token_begin_idx
        # The pad id can sit above the action range; never treat padding as an action.
        if tokenizer.pad_token_id is not None:
            mask &= generated != tokenizer.pad_token_id

        action_tokens = generated[mask].cpu().numpy()
        action_sequence = self.action_tokenizer.decode_token_ids_to_actions(action_tokens)

        return action_sequence

    def __call__(self, text_prompt: str, image: Any,
                max_new_tokens: int = 512,
                temperature: float = 0) -> Any:
        """Shorthand for :meth:`generate_actions`."""
        return self.generate_actions(text_prompt, image, max_new_tokens, temperature)



In [8]:
# # Usage example
# !CUDA_VISIBLE_DEVICES=1
vla_path = r"/home/chuangzhi/zhq/yjc/runs/openvla7b_huggingfacemodel+libero_spatial_no_noops+b2+lr-0.0005+lora-r32+dropout-0.0+example_dataset+b1+lr-0.0005+lora-r32+dropout-0.0"
# vla_path = r"/home/chuangzhi/zhq/yjc/runs/openvla7b_huggingfacemodel+libero_spatial_no_noops+b2+lr-0.0005+lora-r32+dropout-0.0"
# Initialize the inference wrapper - replace with your model path (a local path or a HuggingFace Hub path)
vla_inference = OpenVLAInference(vla_path,device="cuda:4")
# Show the device the wrapper actually selected (it falls back to "cpu" when CUDA is unavailable)
vla_inference.device

Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  4.40it/s]


'cuda:4'

In [9]:
vla_inference.norm_stats.keys()

dict_keys(['example_dataset'])

In [10]:
# Example inputs
text_instruction = "Move the drinking glass to the basket."  # note: lowercase is expected, to match training; `_build_prompt` lower-cases the instruction
image_path = "/home/chuangzhi/zhq/yjc/myScripts/0.png"  # path to your image

# Generate the action
action_sequence = vla_inference(text_instruction, image_path)

# Print the results: decoded action sequence, its length, and the `predict_action` result kept on the wrapper
print("Generated Action Sequence:", action_sequence)
print(f'"action" 序列长度为:{len(action_sequence)}')
print(vla_inference.action)

self.prompt_builder.turn_count 0
self.prompt_builder.turn_count 1




AttributeError: 'OpenVLAInference' object has no attribute 'self'

In [None]:
# Example inputs
text_instruction = "Move the drinking glass to the basket."  # note: lowercase is expected, to match training; `_build_prompt` lower-cases the instruction
image_path = "/home/chuangzhi/zhq/yjc/myScripts/0.png"  # path to your image

# Generate the action
action_sequence = vla_inference(text_instruction, image_path)

# Print the results: decoded action sequence and its length
print("Generated Action Sequence:", action_sequence)
print(f'"action" 序列长度为:{len(action_sequence)}')

self.prompt_builder.turn_count 0
self.prompt_builder.turn_count 1
Generated Action Sequence: [0.90980392 0.24313725 0.         0.09411765 0.10980392 0.19607843
 0.99607843]
"action" 序列长度为:7


In [35]:
vla_inference.prompt_builder.turn_count

1

In [36]:
vla_inference.turn_count

1

In [37]:
vla_inference.processor.tokenizer.pad_token_id, vla_inference.processor.tokenizer.eos_token_id

(32000, 2)

In [6]:
img = Image.open(image_path).convert("RGB")

In [24]:
inputs = vla_inference.preprocess_inputs(text_instruction, img)
inputs

prompt_builder.turn_count 0
prompt_builder.turn_count 1


{'input_ids': tensor([[    1,   512, 29901,  1724,  3158,   881,   278, 19964,  2125,   304,
           4337,   278, 13748,   292, 12917,   304,   278, 25972, 29889, 29973,
             13,  3744, 29901, 29871]], device='cuda:2'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
        device='cuda:2'),
 'pixel_values': tensor([[[[-1.6406, -1.7578, -2.0000,  ..., -0.1631, -0.1631, -0.1465],
           [-1.6562, -1.6875, -1.9688,  ..., -0.1631, -0.1465, -0.1465],
           [-1.7109, -1.6562, -1.8984,  ..., -0.1807, -0.1807, -0.1631],
           ...,
           [ 0.6094,  0.5898,  0.6250,  ...,  0.7109,  0.7109,  0.7109],
           [ 0.6602,  0.6250,  0.6758,  ...,  0.7461,  0.7461,  0.7109],
           [ 0.6602,  0.6445,  0.6602,  ...,  0.7109,  0.7305,  0.6953]],
 
          [[-1.5234, -1.6641, -1.9141,  ...,  0.1572,  0.1396,  0.1221],
           [-1.5469, -1.5938, -1.8750,  ...,  0.1572,  0.1748,  0.1748],
           [-1.6484, -

In [60]:
vla_inference.input_ids

tensor([    1,   512, 29901,  1724,  3158,   881,   278, 19964,  2125,   304,
         4337,   278, 13748,   292, 12917,   304,   278, 25972, 29889, 29973,
           13,  3744, 29901])

In [40]:
inputs.keys(),inputs['attention_mask']

(dict_keys(['input_ids', 'attention_mask', 'pixel_values']),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
        device='cuda:4'))

In [20]:
vla_inference._build_prompt(text_instruction)

'In: What action should the robot take to move the drinking glass to the basket?\nOut:'

In [42]:
vla_inference._build_prompt(text_instruction)

self.prompt_builder.turn_count 0
self.prompt_builder.turn_count 1


'In: What action should the robot take to move the drinking glass to the basket.?\nOut: '

In [21]:
vla_inference.action_tokenizer.action_token_begin_idx

31743

NameError: name 'output' is not defined

In [48]:
# Call the wrapped model's `generate` directly, outside the wrapper, with sampling
# enabled at temperature 0.1, gradients disabled and bfloat16 autocast on CUDA.
# Relies on `inputs` produced earlier by `vla_inference.preprocess_inputs`.
with torch.no_grad(), torch.autocast("cuda", dtype=torch.bfloat16):
    output = vla_inference.model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.1,
        do_sample=True,
        pad_token_id=vla_inference.processor.tokenizer.pad_token_id,
        eos_token_id=vla_inference.processor.tokenizer.eos_token_id,
    )
# Display the returned token ids
output

tensor([[    1,   512, 29901,  1724,  3158,   881,   278, 19964,  2125,   304,
          4337,   278, 13748,   292, 12917,   304,   278, 25972, 29889, 29973,
            13,  3744, 29901, 31756, 31841, 31872, 31860, 31880, 31867, 31872,
         31744,     2]], device='cuda:0')

In [None]:
# Flag every token id strictly above the action tokenizer's begin index ...
mask = torch.gt(output, vla_inference.action_tokenizer.action_token_begin_idx)

# ... and show the selected ids as a flat NumPy array.
torch.masked_select(output, mask).cpu().numpy()



In [18]:
len(action_sequence)

8