diff --git a/narf/include/onnxutils.hpp b/narf/include/onnxutils.hpp
index d3fd9b6..8e531f7 100644
--- a/narf/include/onnxutils.hpp
+++ b/narf/include/onnxutils.hpp
@@ -1,73 +1,66 @@
 #ifndef NARF_ONNXUTILS_H
 #define NARF_ONNXUTILS_H
 
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
 #include <tbb/task_arena.h>
 
+#include <eigen3/unsupported/Eigen/CXX11/Tensor>
+
 #include "onnxruntime/onnxruntime_cxx_api.h"
-#include "traits.hpp"
 
 namespace narf {
 
 
+  // ONNX Runtime wrapper with persistent per-slot input/output buffers
+  // pre-bound through Ort::IoBinding. Each ``operator()`` call copies
+  // the caller's Eigen tensors into the persistent ORT-owned buffers
+  // via ``Eigen::TensorMap<…, RowMajor>``, runs the session against
+  // the binding, and copies the outputs back out the same way.
+  //
+  // The TensorMap-based copy enforces ORT's row-major contract
+  // regardless of the layout the caller passed in: assigning a
+  // (possibly ColMajor) Eigen tensor into a RowMajor TensorMap
+  // transparently reindexes element-by-element, so passing a ColMajor
+  // multi-axis tensor produces correct numerical results instead of a
+  // silent layout scramble.
+  //
+  // Construction has two flavours:
+  //   * static-shape:  shapes are queried from the session metadata.
+  //                    Throws if any input or output shape contains a
+  //                    dynamic axis (-1).
+  //   * explicit-shape: shapes are supplied by the caller and used to
+  //                    allocate the persistent buffers. Use this when
+  //                    the model has dynamic axes that the caller is
+  //                    pinning to concrete sizes at construction.
   class onnx_helper {
   public:
 
-    onnx_helper(const std::string &model_path, unsigned int nslots = 1) : meminfo_(Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault)) {
-
-      Ort::SessionOptions options;
-      options.SetIntraOpNumThreads(1);
-      options.SetInterOpNumThreads(1);
-
-      for (unsigned int islot = 0; islot < nslots; ++islot) {
-        auto &session = sessions_.emplace_back(env_, model_path.c_str(), options);
-
-        // Ort::Allocator allocator(session, meminfo_);
-        Ort::Allocator &allocator = allocators_.emplace_back(session, meminfo_);
-
-        auto const nin = session.GetInputCount();
-        auto const nout = session.GetOutputCount();
-
-        if (islot == 0) {
-
-          for (std::size_t iin = 0; iin < nin; ++iin) {
-            auto const &namestring = inputNameStrings_.emplace_back(session.GetInputNameAllocated(iin, allocator));
-            inputNames_.emplace_back(namestring.get());
-
-            inputShapes_.emplace_back(session.GetInputTypeInfo(iin).GetTensorTypeAndShapeInfo().GetShape());
-          }
-
-          for (std::size_t iout = 0; iout < nout; ++iout) {
-            auto const &namestring = outputNameStrings_.emplace_back(session.GetOutputNameAllocated(iout, allocator));
-            outputNames_.emplace_back(namestring.get());
-
-            outputShapes_.emplace_back(session.GetOutputTypeInfo(iout).GetTensorTypeAndShapeInfo().GetShape());
-          }
-
-        }
-
-        auto &inputValues = inputValues_.emplace_back();
-        auto &outputValues = outputValues_.emplace_back();
-
-        for (std::size_t iin = 0; iin < nin; ++iin) {
-          inputValues.emplace_back(Ort::Value::CreateTensor(allocator,
-                                                     inputShapes_[iin].data(),
-                                                     inputShapes_[iin].size(),
-                                                     session.GetInputTypeInfo(iin).GetTensorTypeAndShapeInfo().GetElementType()));
-
-        }
-
-        for (std::size_t iout = 0; iout < nout; ++iout) {
-          outputValues.emplace_back(Ort::Value::CreateTensor(allocator,
-                                                     outputShapes_[iout].data(),
-                                                     outputShapes_[iout].size(),
-                                                     session.GetOutputTypeInfo(iout).GetTensorTypeAndShapeInfo().GetElementType()));
+    // Static-shape constructor. Reads each input/output's shape from
+    // ``session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo()`` and
+    // throws if any contains a dynamic axis (-1).
+    onnx_helper(const std::string &model_path, unsigned int nslots = 1) {
+      init_(model_path, /*input_shapes_explicit=*/nullptr,
+            /*output_shapes_explicit=*/nullptr, nslots);
+    }
 
-        }
-      }
+    // Explicit-shape constructor. ``input_shapes`` and ``output_shapes``
+    // must each be ``session.GetInputCount()`` / ``GetOutputCount()``
+    // long, with concrete (non-negative) dim values. Use this when
+    // the model declares one or more dynamic axes; element types are
+    // still discovered from the session.
+    onnx_helper(const std::string &model_path,
+                const std::vector<std::vector<int64_t>> &input_shapes,
+                const std::vector<std::vector<int64_t>> &output_shapes,
+                unsigned int nslots = 1) {
+      init_(model_path, &input_shapes, &output_shapes, nslots);
     }
 
     template<typename input_t, typename output_t>
-    void operator() (input_t &inputs, output_t &outputs) {
+    void operator() (const input_t &inputs, output_t &outputs) {
       // Slot indexed by the TBB task arena thread index rather than
       // RDataFrame's per-graph slot. RDataFrame slots are not unique
       // across multiple RDataFrame loops scheduled by ``RunGraphs`` in
@@ -81,143 +74,193 @@ namespace narf {
       Ort::Session &session = sessions_[tbb_slot];
       auto &inputValues = inputValues_[tbb_slot];
       auto &outputValues = outputValues_[tbb_slot];
+      auto &binding = bindings_[tbb_slot];
 
-      using Eigen::TensorMap;
-      using Eigen::Tensor;
-
-      auto fill_inputs = [&inputValues](auto const&... tensors) {
-        auto fill_input = [](Ort::Value &value, auto const &tensor) {
-          using scalar_t = typename std::decay_t<decltype(tensor)>::Scalar;
-
-          const scalar_t *src = tensor.data();
-          scalar_t *dest = value.GetTensorMutableData<scalar_t>();
-
-          std::copy(src, src + tensor.size(), dest);
-        };
-
+      auto fill_inputs = [this, &inputValues](auto const&... tensors) {
         std::size_t i = 0;
-        (fill_input(inputValues[i++], tensors), ...);
+        auto one = [&](auto const &t) { fill_into_ort_(inputValues[i++], t); };
+        (one(tensors), ...);
       };
-
       std::apply(fill_inputs, inputs);
 
-      session.Run(Ort::RunOptions{nullptr},
-              inputNames_.data(),
-              inputValues.data(),
-              inputNames_.size(),
-              outputNames_.data(),
-              outputValues.data(),
-              outputNames_.size());
-
-      auto fill_outputs = [&outputValues](auto&... tensors) {
-        auto fill_output = [](Ort::Value &value, auto &tensor) {
-          using scalar_t = typename std::decay_t<decltype(tensor)>::Scalar;
-
-          const scalar_t *src = value.GetTensorMutableData<scalar_t>();
-          scalar_t *dest = tensor.data();
-
-          std::copy(src, src + tensor.size(), dest);
-        };
+      session.Run(Ort::RunOptions{nullptr}, binding);
 
+      auto fill_outputs = [this, &outputValues](auto&... tensors) {
         std::size_t i = 0;
-        (fill_output(outputValues[i++], tensors), ...);
+        auto one = [&](auto &t) { fill_from_ort_(outputValues[i++], t); };
+        (one(tensors), ...);
       };
-
       std::apply(fill_outputs, outputs);
-
-
     }
 
   private:
+    // ``env_`` must outlive every session built from it (sessions
+    // don't hold their own ref). The default-options allocator is an
+    // ``Unowned<OrtAllocator>`` wrapper around an ORT-library-owned
+    // singleton — destroying the wrapper does not free the
+    // underlying allocator, so we can construct it as a local in
+    // ``init_`` and let it go; the ``Ort::Value::~Value`` calls of
+    // our member vectors will free their buffers against the same
+    // singleton at class-destruction time.
     Ort::Env env_;
-    Ort::MemoryInfo meminfo_;
     std::vector<Ort::Session> sessions_;
-    std::vector<Ort::Allocator> allocators_;
-    std::vector<Ort::AllocatedStringPtr> inputNameStrings_;
-    std::vector<Ort::AllocatedStringPtr> outputNameStrings_;
-    std::vector<const char*> inputNames_;
-    std::vector<const char*> outputNames_;
-    std::vector<std::vector<int64_t>> inputShapes_;
-    std::vector<std::vector<int64_t>> outputShapes_;
     std::vector<std::vector<Ort::Value>> inputValues_;
     std::vector<std::vector<Ort::Value>> outputValues_;
-  };
-
-  class onnx_helper_alloc {
-  public:
-
-    onnx_helper_alloc(const std::string &model_path, unsigned int nslots = 1) : meminfo_(Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault)) {
+    std::vector<Ort::IoBinding> bindings_;
 
+    void init_(const std::string &model_path,
+               const std::vector<std::vector<int64_t>> *input_shapes_explicit,
+               const std::vector<std::vector<int64_t>> *output_shapes_explicit,
+               unsigned int nslots) {
       Ort::SessionOptions options;
       options.SetIntraOpNumThreads(1);
       options.SetInterOpNumThreads(1);
 
-      for (unsigned int islot = 0; islot < nslots; ++islot) {
-        sessions_.emplace_back(env_, model_path.c_str(), options);
-      }
-
-      Ort::Session &session0 = sessions_.front();
-      Ort::Allocator allocator(session0, meminfo_);
-
-      auto const nin = session0.GetInputCount();
-      auto const nout = session0.GetOutputCount();
+      // Global ORT-owned default CPU allocator. ``Unowned`` wrapper,
+      // so this local can die at end of ``init_`` — the OrtValues
+      // we hand to ``CreateTensor`` below hold a pointer to the
+      // underlying singleton, not to this wrapper.
+      Ort::AllocatorWithDefaultOptions allocator;
+
+      // Locals: filled on the first slot iteration and reused for
+      // the rest. AllocatedStringPtrs free themselves through the
+      // allocator at end of scope; the underlying allocator is the
+      // ORT-library-owned singleton, alive for the program's lifetime.
+      std::vector<Ort::AllocatedStringPtr> inputNameStrings;
+      std::vector<Ort::AllocatedStringPtr> outputNameStrings;
+      std::vector<const char*> inputNames;
+      std::vector<const char*> outputNames;
+      std::vector<std::vector<int64_t>> inputShapes;
+      std::vector<std::vector<int64_t>> outputShapes;
+      std::vector<ONNXTensorElementDataType> inputElementTypes;
+      std::vector<ONNXTensorElementDataType> outputElementTypes;
 
-      for (std::size_t i = 0; i < nin; ++i) {
-        inputNames_.emplace_back(session0.GetInputNameAllocated(i, allocator).release());
-      }
-
-      for (std::size_t i = 0; i < nout; ++i) {
-        outputNames_.emplace_back(session0.GetOutputNameAllocated(i, allocator).release());
-      }
-    }
-
-    template<typename input_t, typename output_t>
-    void operator() (input_t &inputs, output_t &outputs) {
-      // Slot indexed by the TBB task arena thread index; see
-      // onnx_helper::operator() for rationale.
-      auto const tbb_slot = std::max(tbb::v1::this_task_arena::current_thread_index(), 0);
-
-      if (static_cast<std::size_t>(tbb_slot) >= sessions_.size()) {
-        throw std::runtime_error("Not enough sessions allocated for number of tbb threads");
-      }
+      for (unsigned int islot = 0; islot < nslots; ++islot) {
+        auto &session = sessions_.emplace_back(env_, model_path.c_str(), options);
 
-      auto get_tensors_with_shapes = [](auto&... tensors) {
-        auto get_shape = [](const auto &tensor) {
-          return narf::tensor_traits<std::decay_t<decltype(tensor)>>::get_sizes();
-        };
+        auto const nin = session.GetInputCount();
+        auto const nout = session.GetOutputCount();
 
-        return std::make_tuple(std::make_pair(std::ref(tensors), get_shape(tensors))...);
-      };
+        if (islot == 0) {
+          if (input_shapes_explicit && input_shapes_explicit->size() != nin) {
+            throw std::invalid_argument(
+                "onnx_helper: input_shapes size does not match model input count");
+          }
+          if (output_shapes_explicit && output_shapes_explicit->size() != nout) {
+            throw std::invalid_argument(
+                "onnx_helper: output_shapes size does not match model output count");
+          }
 
-      auto get_values = [this](auto&... tensors_with_shapes) {
-        std::array<Ort::Value, sizeof...(tensors_with_shapes)> values = { Ort::Value::CreateTensor<typename std::decay_t<decltype(tensors_with_shapes.first)>::Scalar>(meminfo_, tensors_with_shapes.first.data(), tensors_with_shapes.first.size(), tensors_with_shapes.second.data(), tensors_with_shapes.second.size())... };
+          // ``TypeInfo`` owns the underlying ``OrtTypeInfo``; the
+          // ``ConstTensorTypeAndShapeInfo`` we get from it is a
+          // non-owning view that must not outlive the parent.
+          for (std::size_t iin = 0; iin < nin; ++iin) {
+            auto const &namestring = inputNameStrings.emplace_back(
+                session.GetInputNameAllocated(iin, allocator));
+            inputNames.emplace_back(namestring.get());
+
+            auto type_info = session.GetInputTypeInfo(iin);
+            auto info = type_info.GetTensorTypeAndShapeInfo();
+            inputElementTypes.emplace_back(info.GetElementType());
+
+            std::vector<int64_t> shape = input_shapes_explicit
+                ? (*input_shapes_explicit)[iin]
+                : info.GetShape();
+            for (auto d : shape) {
+              if (d < 0) {
+                throw std::runtime_error(
+                    std::string("onnx_helper: input '") + inputNames.back() +
+                    "' has a dynamic axis; use the explicit-shape constructor");
+              }
+            }
+            inputShapes.emplace_back(std::move(shape));
+          }
 
-        return values;
-      };
+          for (std::size_t iout = 0; iout < nout; ++iout) {
+            auto const &namestring = outputNameStrings.emplace_back(
+                session.GetOutputNameAllocated(iout, allocator));
+            outputNames.emplace_back(namestring.get());
+
+            auto type_info = session.GetOutputTypeInfo(iout);
+            auto info = type_info.GetTensorTypeAndShapeInfo();
+            outputElementTypes.emplace_back(info.GetElementType());
+
+            std::vector<int64_t> shape = output_shapes_explicit
+                ? (*output_shapes_explicit)[iout]
+                : info.GetShape();
+            for (auto d : shape) {
+              if (d < 0) {
+                throw std::runtime_error(
+                    std::string("onnx_helper: output '") + outputNames.back() +
+                    "' has a dynamic axis; use the explicit-shape constructor");
+              }
+            }
+            outputShapes.emplace_back(std::move(shape));
+          }
+        }
 
-      auto inputs_with_shapes = std::apply(get_tensors_with_shapes, inputs);
-      auto outputs_with_shapes = std::apply(get_tensors_with_shapes, outputs);
+        auto &inputValues = inputValues_.emplace_back();
+        auto &outputValues = outputValues_.emplace_back();
 
-      auto inputValues = std::apply(get_values, inputs_with_shapes);
-      auto outputValues = std::apply(get_values, outputs_with_shapes);
+        for (std::size_t iin = 0; iin < nin; ++iin) {
+          inputValues.emplace_back(Ort::Value::CreateTensor(
+              allocator,
+              inputShapes[iin].data(),
+              inputShapes[iin].size(),
+              inputElementTypes[iin]));
+        }
+        for (std::size_t iout = 0; iout < nout; ++iout) {
+          outputValues.emplace_back(Ort::Value::CreateTensor(
+              allocator,
+              outputShapes[iout].data(),
+              outputShapes[iout].size(),
+              outputElementTypes[iout]));
+        }
 
-      Ort::Session &session = sessions_[tbb_slot];
+        // Pre-bind every input/output Ort::Value to this slot's
+        // session-local IoBinding. Subsequent ``session.Run`` calls
+        // re-use the same buffers — no per-call Ort::Value
+        // construction, no per-call name array setup, no copies on
+        // the ORT side.
+        auto &binding = bindings_.emplace_back(session);
+        for (std::size_t iin = 0; iin < nin; ++iin) {
+          binding.BindInput(inputNames[iin], inputValues[iin]);
+        }
+        for (std::size_t iout = 0; iout < nout; ++iout) {
+          binding.BindOutput(outputNames[iout], outputValues[iout]);
+        }
+      }
+    }
 
-      session.Run(Ort::RunOptions{nullptr},
-              inputNames_.data(),
-              inputValues.data(),
-              inputNames_.size(),
-              outputNames_.data(),
-              outputValues.data(),
-              outputNames_.size());
+    // Caller-Eigen-tensor -> ORT buffer copy via a RowMajor TensorMap
+    // assignment. The shape comes directly from the caller's tensor;
+    // the ORT-side buffer was allocated to ``inputShapes_[i]`` at
+    // construction and the caller is responsible for matching it.
+    // Works for any caller layout (ColMajor or RowMajor): Eigen's
+    // cross-layout assignment iterates in index space and respects
+    // both source and destination strides, so the result is shape-
+    // correct regardless of source layout.
+    template <typename CallerTensor>
+    void fill_into_ort_(Ort::Value &value, const CallerTensor &caller) const {
+      using D = std::decay_t<CallerTensor>;
+      using scalar_t = typename D::Scalar;
+      constexpr int rank = D::NumDimensions;
+      Eigen::TensorMap<Eigen::Tensor<scalar_t, rank, Eigen::RowMajor>> ort_view(
+          value.template GetTensorMutableData<scalar_t>(), caller.dimensions());
+      ort_view = caller;
     }
 
-  private:
-    Ort::Env env_;
-    Ort::MemoryInfo meminfo_;
-    std::vector<Ort::Session> sessions_;
-    std::vector<const char*> inputNames_;
-    std::vector<const char*> outputNames_;
+    // ORT buffer -> caller-Eigen-tensor copy via a RowMajor TensorMap
+    // assignment. Same layout-agnostic behaviour as ``fill_into_ort_``.
+    template <typename CallerTensor>
+    void fill_from_ort_(const Ort::Value &value, CallerTensor &caller) const {
+      using D = std::decay_t<CallerTensor>;
+      using scalar_t = typename D::Scalar;
+      constexpr int rank = D::NumDimensions;
+      Eigen::TensorMap<const Eigen::Tensor<scalar_t, rank, Eigen::RowMajor>> ort_view(
+          value.template GetTensorData<scalar_t>(), caller.dimensions());
+      caller = ort_view;
+    }
   };
 
 }