In [1]:
!pip install torch onnx onnxruntime lerobot num2words

Collecting onnx
  Downloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.0 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting lerobot
  Downloading lerobot-0.3.3-py3-none-any.whl.metadata (26 kB)
Collecting num2words
  Downloading num2words-0.5.14-py3-none-any.whl.metadata (13 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting datasets<=3.6.0,>=2.19.0 (from lerobot)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting av>=14.2.0 (from lerobot)
  Downloading av-15.1.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting jsonlines>=4.0.0 (from lerobot)
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting pynput>=1.7.7 (from lerobot)
  Downloading pynput-1.8.1-py2.py3-none-any.whl.metadata (32 kB)
Collecting pyserial>=3.

In [2]:
import torch
from torch import nn
from lerobot.policies.smolvla.smolvlm_with_expert import SmolVLMWithExpertModel

class SmolVLMForONNXExport(nn.Module):
    """Adapter giving ``SmolVLMWithExpertModel`` a flat, tensor-only call signature.

    The wrapped model is called with a list of embeddings and a per-layer cache
    dictionary. This wrapper instead accepts positional tensors and returns a
    flat tuple of tensors, converting to and from the nested form internally.
    """

    def __init__(self, model_id, attention_mode="self_attn", **kwargs):
        """Build the wrapped model and record its layer counts.

        Args:
            model_id: Identifier passed straight to ``SmolVLMWithExpertModel``.
            attention_mode: Attention mode forwarded to the wrapped model.
            **kwargs: Extra keyword arguments forwarded to the wrapped model.
        """
        super().__init__()
        # Instantiate the real model
        self.model = SmolVLMWithExpertModel(model_id, attention_mode=attention_mode, **kwargs)

        # Bake in the configuration for the static graph
        self.attention_mode = attention_mode
        self.num_vlm_layers = self.model.num_vlm_layers
        self.num_expert_layers = self.model.num_expert_layers

    def forward(
        self,
        vlm_embeds: torch.Tensor,
        expert_embeds: torch.Tensor,
        attention_mask: torch.Tensor,
        position_ids: torch.LongTensor,
        # Flattened past_key_values: one tensor for keys, one for values per layer
        *past_key_values_flat: torch.Tensor,
    ):
        """Run the wrapped model using flat inputs and return flat outputs.

        Args:
            vlm_embeds: First entry of the ``inputs_embeds`` list given to the model.
            expert_embeds: Second entry of the ``inputs_embeds`` list.
            attention_mask: Forwarded unchanged to the wrapped model.
            position_ids: Forwarded unchanged to the wrapped model.
            *past_key_values_flat: Optional cache tensors ordered as
                ``(key_layer0, val_layer0, key_layer1, val_layer1, ...)``.
                When none are given the model is called with
                ``past_key_values=None`` and ``fill_kv_cache=True``.

        Returns:
            A tuple ``(vlm_output_embeds, expert_output_embeds, key_0, value_0, ...)``
            holding the two output embeddings followed by the key/value tensors
            of every layer index below ``num_vlm_layers`` present in the
            returned cache.

        Raises:
            ValueError: If an odd number of cache tensors is supplied, since
                they cannot be paired into (key, value) entries.
        """
        # Fail early with a clear message instead of an IndexError while pairing.
        if len(past_key_values_flat) % 2 != 0:
            raise ValueError(
                "past_key_values_flat must contain an even number of tensors "
                "(key, value per layer); got "
                f"{len(past_key_values_flat)}"
            )

        inputs_embeds = [vlm_embeds, expert_embeds]

        if past_key_values_flat:
            # Rebuild the per-layer dictionary: even slots are keys, odd slots are values.
            layer_keys = past_key_values_flat[0::2]
            layer_values = past_key_values_flat[1::2]
            past_key_values = {
                layer_idx: {"key_states": key_states, "value_states": value_states}
                for layer_idx, (key_states, value_states) in enumerate(zip(layer_keys, layer_values))
            }
        else:
            past_key_values = None

        # Without an incoming cache the model is asked to populate a fresh one.
        fill_kv_cache = past_key_values is None

        final_embeds_list, new_past_key_values = self.model.forward(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=True,  # Always assume cache is used for ONNX generation models
            fill_kv_cache=fill_kv_cache,
        )

        # The output must be a flat tuple of tensors
        vlm_output_embeds = final_embeds_list[0]
        expert_output_embeds = final_embeds_list[1]

        present_key_values_flat = []
        for layer_idx in range(self.num_vlm_layers):
            # Layers absent from the returned cache are skipped rather than padded.
            if layer_idx in new_past_key_values:
                layer_cache = new_past_key_values[layer_idx]
                present_key_values_flat.append(layer_cache["key_states"])
                present_key_values_flat.append(layer_cache["value_states"])

        return (vlm_output_embeds, expert_output_embeds, *present_key_values_flat)

In [3]:
import torch
# from onnx_export_wrapper import SmolVLMForONNXExport


# Wrap the pretrained checkpoint, switch it to inference mode, and move it to the GPU.
onnx_model = SmolVLMForONNXExport("HuggingFaceTB/SmolVLM2-500M-Video-Instruct")
onnx_model.eval()
onnx_model.to("cuda:0")

# Two embedding outputs first, then one key/value name pair per VLM layer.
num_layers = onnx_model.model.num_vlm_layers
output_names = ["vlm_output_embeds", "expert_output_embeds"]
for i in range(num_layers):
    output_names += [f"present_key_{i}", f"present_value_{i}"]

# Sizes of the example inputs handed to the exporter.
batch_size = 1
vlm_seq_len = 256
expert_seq_len = 16
total_seq_len = vlm_seq_len + expert_seq_len

# Example inputs; the mask and position ids span the VLM and expert tokens together.
embed_kwargs = {"device": "cuda:0", "dtype": torch.bfloat16}
dummy_vlm_embeds = torch.randn(batch_size, vlm_seq_len, 960, **embed_kwargs)
dummy_expert_embeds = torch.randn(batch_size, expert_seq_len, 480, **embed_kwargs)
dummy_attn_mask = torch.ones(
    (batch_size, total_seq_len, total_seq_len), dtype=torch.bool, device="cuda:0"
)
dummy_pos_ids = torch.arange(total_seq_len, device="cuda:0")[None, :]

dummy_inputs = (dummy_vlm_embeds, dummy_expert_embeds, dummy_attn_mask, dummy_pos_ids)

# Only the four named inputs are supplied, i.e. no past key/value tensors are passed.
torch.onnx.export(
    onnx_model,
    dummy_inputs,
    "smolvlm_expert.onnx",
    input_names=["vlm_embeds", "expert_embeds", "attention_mask", "position_ids"],
    output_names=output_names,
    opset_version=17,
)


Loading  HuggingFaceTB/SmolVLM2-500M-Video-Instruct weights ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/868 [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

  if seq_len < position_ids.shape[1]:


In [5]:
import os, onnx

# Re-save the exported graph with every tensor moved into a single external weights file.
out_name = None  # optional override for the output file name
src_name = "smolvlm_expert.onnx"
out_dir = "splited"
weights_name = "weights.bin"
os.makedirs(out_dir, exist_ok=True)
base = out_name or (os.path.splitext(os.path.basename(src_name))[0] + "_ext.onnx")
out_path = os.path.join(out_dir, base)

# Load first so the source model is fully in memory before anything on disk is touched.
model = onnx.load(src_name)

# ONNX's external-data writer appends to an existing file instead of truncating it,
# so a leftover weights file from an earlier run would keep growing on every re-run.
weights_path = os.path.join(out_dir, weights_name)
if os.path.exists(weights_path):
    os.remove(weights_path)

onnx.save_model(
    model,
    out_path,
    save_as_external_data=True,
    all_tensors_to_one_file=True,
    location=weights_name,  # resolved relative to the directory of out_path
    size_threshold=0,  # move every tensor out, regardless of size
    convert_attribute=True,
)