
Commit 94ba839

Untested: (1) restore hosted data initialization; (2) params initialized by arbitrary tensor expressions; (3) fix backprop for params
(1) restores the initialization functionality, but now from an ndarray and mostly without copying. (2) allows wrapping e.g. a random-sampling tensor expression as a param. (3) prevents backprop from descending into the initialization code of params; that code doesn't disappear and can still be run manually.
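
As a hedged sketch of what (2) enables, using only names introduced in the diff below (Asgns.Range_over_offsets stands in for e.g. a random-sampling expression; untested, like the rest of the commit):

  (* A param whose initialization is an arbitrary fetch op rather than a constant. *)
  let w = Tensor.param ~t:(Tensor.fetch_param_init Asgns.Range_over_offsets) "w"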
1 parent b01c287 commit 94ba839

File tree: 6 files changed, +128 −50 lines

bin/hello_world_op.ml

Lines changed: 1 addition & 1 deletion
@@ -189,7 +189,7 @@ let%track2_sexp _Big_matrix (() : unit) : unit =
   let ctx = Backend.make_context stream in
   Rand.init 0;
   (* Hey is inferred to be a matrix. *)
-  let hey = Tensor.param ~value:0.5 "hey" in
+  let hey = TDSL.param ~value:0.5 "hey" in
   let zero_to_twenty = TDSL.range 20 in
   let%op yd = (hey * zero_to_twenty) + zero_to_twenty in
   Train.forward_and_forget backend ctx yd;

lib/operation.ml

Lines changed: 66 additions & 21 deletions
@@ -37,15 +37,6 @@ module Initial_NTDSL = struct
   module O = struct end
 end

-module Initial_TDSL = struct
-  let term = Tensor.term ~grad_spec:If_needed
-  let number = Tensor.number ~grad_spec:If_needed
-  let ndarray = Tensor.ndarray ~grad_spec:If_needed
-  let param = Tensor.param
-
-  module O = struct end
-end
-
 let add ?(label = []) =
   let module NTDSL = Initial_NTDSL in
   let%cd op_asn ~v ~t1 ~t2 ~projections = v =: v1 + v2 in
@@ -452,27 +443,77 @@ module NDO = struct
   let ( <> ) = ne ~grad_spec:Prohibit_grad
 end

+(** The input [i] dimensions default to empty. The batch and output dimensions will be inferred
+    if omitted. Note: the data should have no padding; if padding is inferred, the data will be
+    copied; otherwise, the resulting tensor value shares host memory with the ndarray. *)
+let reshape ~l ?b ?(i = []) ?o ndarray =
+  Tensor.term ~label:[ l ] ?batch_dims:b ~input_dims:i ?output_dims:o ~init_data:(Reshape ndarray)
+    ()
+
+(** The dimensions are taken from the provided ndarray, but the split into axis kinds still needs
+    to be inferred (or provided). Assumes no padding. See also: {!reshape} and {!TDSL.wrap_param}. *)
+let wrap ~l ?b ?(i = []) ?o ndarray =
+  Tensor.term ~label:[ l ] ?batch_dims:b ~input_dims:i ?output_dims:o
+    ~init_data:(Keep_shape_no_padding ndarray) ()
+
+(** Assumes the ndarray is padded as given. This means the dimensions of the ndarray will differ
+    from the dimensions of the tensor by the padding. See also: {!TDSL.wrap}. *)
+let wrap_padded ~l ?b ?(i = []) ?o ~padding ~padded_value ndarray =
+  Tensor.term ~label:[ l ] ?batch_dims:b ~input_dims:i ?output_dims:o
+    ~init_data:(Padded { data = ndarray; padding; padded_value })
+    ()
+
+(** The output dimensions are taken from the provided ndarray, assuming precisely the first axis
+    is a batch axis; assumes no input axes, and the batch dimensions are inferred. Assumes the
+    data has no padding; data is copied if padding is inferred. See also: {!reshape} and {!wrap}. *)
+let rebatch ~l ndarray =
+  let output_dims = Ir.Ndarray.dims ndarray |> Array.to_list |> List.tl_exn in
+  if List.is_empty output_dims then invalid_arg "rebatch: ndarray has just one axis";
+  Tensor.term ~label:[ l ] ~input_dims:[] ~output_dims ~init_data:(Reshape ndarray) ()
+
 module TDSL = struct
-  include Initial_TDSL
   module O = DO

+  let term = Tensor.term ~grad_spec:If_needed
+  let number = Tensor.number ~grad_spec:If_needed
+  let ndarray = Tensor.ndarray ~grad_spec:If_needed
+
+  let param ?value ?values =
+    let t =
+      match (value, values) with
+      | Some _, Some _ -> invalid_arg "TDSL.param: both value and values are set"
+      | Some value, None -> Tensor.fetch_param_init (Asgns.Constant value)
+      | None, Some values -> Tensor.fetch_param_init (Asgns.Constant_fill values)
+      | None, None -> !Tensor.default_param_init
+    in
+    Tensor.param ~t
+
   let einsum = einsum ~grad_spec:If_needed
   let outer_sum = outer_sum ~grad_spec:If_needed
   let einsum1 = einsum1 ~grad_spec:If_needed
   let range = range ~grad_spec:If_needed
   let range_of_shape = range_of_shape ~grad_spec:If_needed
   let stop_gradient = stop_gradient
-
-  (** The input [i] dimensions default to empty. The batch dimensions will be inferred if omitted.
-  *)
-  let init_const ~l ?b ?(i = []) ~o values =
-    Tensor.term ~label:[ l ] ~grad_spec:Prohibit_grad ?batch_dims:b ~input_dims:i ~output_dims:o
-      ~fetch_op:(Asgns.Constant_fill values) ()
-
-  (** It's like `Tensor.param` but without shape inference. *)
-  let init_param ~l ?(b = []) ?(i = []) ?(o = []) values =
-    Tensor.term ~label:[ l ] ~grad_spec:Require_grad ~batch_dims:b ~input_dims:i ~output_dims:o
-      ~fetch_op:(Asgns.Constant_fill values) ()
+  let reshape = reshape ~grad_spec:If_needed
+  let wrap = wrap ~grad_spec:If_needed
+  let wrap_padded = wrap_padded ~grad_spec:If_needed
+  let rebatch = rebatch ~grad_spec:If_needed
+
+  (** The input and output dimensions will be inferred if omitted. See {!reshape}. *)
+  let reshape_param ~l ?i ?o ndarray =
+    let t =
+      Tensor.term ~grad_spec:Require_grad ~batch_dims:[] ~batch_axes:[] ~init_data:(Reshape ndarray)
+        ?fetch_op:None
+    in
+    Tensor.param ?input_dims:i ?output_dims:o ~t l
+
+  (** See {!wrap}. *)
+  let wrap_param ~l ?i ?o ndarray =
+    let t =
+      Tensor.term ~grad_spec:Require_grad ~batch_dims:[] ~batch_axes:[]
+        ~init_data:(Keep_shape_no_padding ndarray) ?fetch_op:None
+    in
+    Tensor.param ?input_dims:i ?output_dims:o ~t l
 end

 module NTDSL = struct
@@ -485,6 +526,10 @@ module NTDSL = struct
   let term = Tensor.term ~grad_spec:Prohibit_grad
   let range = range ~grad_spec:Prohibit_grad
   let range_of_shape = range_of_shape ~grad_spec:Prohibit_grad
+  let reshape = reshape ~grad_spec:Prohibit_grad
+  let wrap = wrap ~grad_spec:Prohibit_grad
+  let wrap_padded = wrap_padded ~grad_spec:Prohibit_grad
+  let rebatch = rebatch ~grad_spec:Prohibit_grad

   let counter ?(label = []) =
     let module NTDSL = Initial_NTDSL in
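
For orientation, a minimal usage sketch of the new hosted-data entry points, assuming nd : Ir.Ndarray.t holds unpadded host data with a suitable number of elements (untested; names come from the diff above):

  let demo (nd : Ir.Ndarray.t) =
    (* Shares host memory with [nd]; the split into axis kinds is inferred. *)
    let xs = TDSL.wrap ~l:"xs" nd in
    (* Fixes the output dims; the data is copied only if padding is inferred. *)
    let w = TDSL.reshape ~l:"w" ~o:[ 2; 10 ] nd in
    (* Treats [nd]'s first axis as a batch axis. *)
    let batch = TDSL.rebatch ~l:"batch" nd in
    (xs, w, batch)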

lib/ppx_op.ml

Lines changed: 2 additions & 2 deletions
@@ -21,8 +21,8 @@ let make_p ~has_config ~loc =

 let make_vb ?value ~has_config ~loc ~str_loc ~ident string =
   let pat = Ast_helper.Pat.var ~loc { loc = str_loc; txt = ident } in
-  let value = match value with Some c -> [%expr Some [| [%e c] |]] | None -> [%expr None] in
-  let v = [%expr [%e make_p ~has_config ~loc] ?values:[%e value] [%e string]] in
+  let value = match value with Some c -> [%expr Some [%e c]] | None -> [%expr None] in
+  let v = [%expr [%e make_p ~has_config ~loc] ?value:[%e value] [%e string]] in
   let vb = Ast_helper.Vb.mk ~loc pat v in
   (pat, vb)
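
To make the effect concrete (a hedged sketch: the %op surface syntax that triggers make_vb is not shown in this diff, and make_p is assumed to resolve to TDSL.param), the generated binding for a param "w" with inline initial value 0.5 changes roughly as:

  (* before this commit: scalar init boxed into a one-element array *)
  let w = TDSL.param ?values:(Some [| 0.5 |]) "w"
  (* after this commit: scalar init passed through the new ?value argument *)
  let w = TDSL.param ?value:(Some 0.5) "w"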

lib/tensor.ml

Lines changed: 19 additions & 17 deletions
@@ -205,9 +205,8 @@ let raw_unop ~initialize_neutral ~accum ~(t : t) ~(lhs_is_grad : bool) ~op ~(t1
 type grad_spec = Require_grad | Prohibit_grad | If_needed [@@deriving sexp, equal, variants]

 let op ~(label : string list) ?(ternary_op = Shape.Pointwise_tern)
-    ?(compose_op = Shape.Pointwise_bin) ?(transpose_op = Shape.Pointwise_un)
-    ?init_data ?fetch_op ~op_asn ~grad_asn
-    ?(grad_spec = If_needed) make_shape (orig_ts : t list) : t =
+    ?(compose_op = Shape.Pointwise_bin) ?(transpose_op = Shape.Pointwise_un) ?init_data ?fetch_op
+    ~op_asn ~grad_asn ?(grad_spec = If_needed) make_shape (orig_ts : t list) : t =
   (* The code needs to be included in the order it was computed due to potential non-tree DAGs. *)
   let ordered_ts = List.dedup_and_sort orig_ts ~compare:(fun t1 t2 -> Int.ascending t1.id t2.id) in
   let id = session_state.next_id in

@@ -222,7 +221,7 @@ let op ~(label : string list) ?(ternary_op = Shape.Pointwise_tern)
     |> Option.value ~default)
   in
   let terminal_logic () =
-    match fetch_op, init_data with
+    match (fetch_op, init_data) with
     | None, None -> Shape.Terminal (`Fetch (Asgns.Constant 0.0))
     | Some fetch_op, _ -> Shape.Terminal (`Fetch fetch_op)
     | None, Some init_data -> Shape.Terminal (`Data init_data)

@@ -319,7 +318,8 @@ let op ~(label : string list) ?(ternary_op = Shape.Pointwise_tern)
         diff.backprop)
   in
   let bcks =
-    List.filter_map ordered_ts ~f:(fun ti -> if is_bck_root ti then bprop ti else None)
+    List.filter_map ordered_ts ~f:(fun ti ->
+        if is_bck_root ti && not (Set.mem t.params ti) then bprop ti else None)
   in
   let backprop = Asgns.sequence @@ (grad_asn ~t ~g ~projections :: bcks) in
   let backprop =

@@ -375,7 +375,8 @@ let term ~label ~grad_spec ?batch_dims ?input_dims ?output_dims ?batch_axes ?inp
     Shape.make ?batch_dims ?input_dims ?output_dims ?batch_axes ?input_axes ?output_axes ?deduced ()
   in
   (* Note: fetch_op in op is used only for shape inference. *)
-  op ~label ?compose_op:None ?transpose_op:None ?init_data ?fetch_op ~op_asn ~grad_asn ~grad_spec make_shape []
+  op ~label ?compose_op:None ?transpose_op:None ?init_data ?fetch_op ~op_asn ~grad_asn ~grad_spec
+    make_shape []

 let float_to_label v = Float.to_string v

@@ -438,18 +439,19 @@ let ndarray ?(label = []) ?(grad_spec = Prohibit_grad) ?batch_dims ?input_dims ?
     Tn.update_prec ~only_if:is_up_to_fp16 t.value single);
   t

-let param ?(more_label = []) ?input_dims ?output_dims ?input_axes ?output_axes ?deduced ?value
-    ?values label =
-  let fetch_op =
-    match (values, value) with
-    | Some values, None -> Asgns.Constant_fill values
-    | None, Some value -> Asgns.Constant value
-    | None, None -> Asgns.Range_over_offsets
-    | Some _, Some _ -> invalid_arg "Tensor.param: both values and value are set"
-  in
+let fetch_param_init fetch_op =
+  term ~grad_spec:Require_grad ~batch_dims:[] ~batch_axes:[] ?init_data:None ~fetch_op
+
+let default_param_init = ref @@ fetch_param_init (Asgns.Constant 0.0)
+
+let param ?(more_label = []) ?input_dims ?output_dims ?input_axes ?output_axes ?deduced ?t label =
   let t =
-    term ~label:(label :: more_label) ~grad_spec:Require_grad ~batch_dims:[] ?input_dims
-      ?output_dims ?input_axes ?output_axes ?deduced ~fetch_op ()
+    match t with
+    | Some t ->
+        t ~label:(label :: more_label) ?input_dims ?output_dims ?input_axes ?output_axes ?deduced ()
+    | None ->
+        !default_param_init ~label:(label :: more_label) ?input_dims ?output_dims ?input_axes
+          ?output_axes ?deduced ()
   in
   let v = t.value in
   (* It is convenient to use the param syntax for volatiles (mutable embedded_nodes). *)
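
A hedged sketch of how the two new hooks compose, using only definitions visible in this file (untested; the Asgns module path is assumed to be in scope as in lib/tensor.ml):

  (* Globally change the initialization used by params created without ~t;
     Range_over_offsets was the old fallback when neither ?value nor ?values was given. *)
  let () = Tensor.default_param_init := Tensor.fetch_param_init Asgns.Range_over_offsets

  (* Per-param override, equivalent to the old ~value:0.5 behavior. *)
  let hey = Tensor.param ~t:(Tensor.fetch_param_init (Asgns.Constant 0.5)) "hey"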

lib/tensor.mli

Lines changed: 39 additions & 8 deletions
@@ -217,23 +217,54 @@ val ndarray :
     given values must fill the tensor's [value] node precisely; otherwise, the values will be looped
     over to populate the [value] node. *)

+val default_param_init :
+  (label:string list ->
+  ?input_dims:int list ->
+  ?output_dims:int list ->
+  ?input_axes:(string * int) list ->
+  ?output_axes:(string * int) list ->
+  ?deduced:Shape.deduce_within_shape ->
+  unit ->
+  t)
+  ref
+(** The default initialization operation for {!param} calls that do not pass a [t]. *)
+
+val fetch_param_init :
+  fetch_op ->
+  label:string list ->
+  ?input_dims:int list ->
+  ?output_dims:int list ->
+  ?input_axes:(string * int) list ->
+  ?output_axes:(string * int) list ->
+  ?deduced:Shape.deduce_within_shape ->
+  unit ->
+  t
+(** Helper for {!param} wrappers, or to set {!default_param_init}. *)
+
 val param :
   ?more_label:string list ->
   ?input_dims:int list ->
   ?output_dims:int list ->
   ?input_axes:(string * int) list ->
   ?output_axes:(string * int) list ->
   ?deduced:Shape.deduce_within_shape ->
-  ?value:float ->
-  ?values:float array ->
+  ?t:
+    (label:string list ->
+    ?input_dims:int list ->
+    ?output_dims:int list ->
+    ?input_axes:(string * int) list ->
+    ?output_axes:(string * int) list ->
+    ?deduced:Shape.deduce_within_shape ->
+    unit ->
+    t) ->
   string ->
   t
-(* A tensor with no batch axes; input and output axes are by default inferred. [grad_spec] is set to
-   [Require_grad]. The resulting tensor's label is the passed string, appended by [more_label] if
-   any. If [value] is provided, the tensor is initialized to the given value. If [values] is
-   provided, the tensor is initialized to the given values. At most one of [value] or [values] can
-   be provided. Note: [values] will be looped over if necessary, but shape inference will try
-   incorporating the number of values as tensor size. *)
+(** For proper parameters, [t] should produce a tensor with no batch axes; input and output axes
+    should by default be inferred; and [grad_spec] should be [Require_grad]. [t]'s label is the
+    passed string, appended with [more_label] if any; other arguments are forwarded to [t]. If [t]
+    is not provided, {!default_param_init} is used. This function returns [t]'s result with the
+    field {!field:params} replaced by a singleton set containing that result, and it also updates
+    the memory modes. *)

 val consume_forward_code : t -> comp
 (** A forward root is a tensor that is not (currently) used to compute another tensor.
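
As a hedged illustration of the new [?t] hook from the signature above (a sketch mirroring TDSL.wrap_param in lib/operation.ml; untested):

  (* A param initialized from host data: [t] is a closure over Tensor.term with
     Require_grad, no batch axes, and the new init_data path. *)
  let param_of_nd name nd =
    Tensor.param
      ~t:(Tensor.term ~grad_spec:Require_grad ~batch_dims:[] ~batch_axes:[]
            ~init_data:(Keep_shape_no_padding nd) ?fetch_op:None)
      name

In effect this matches TDSL.wrap_param, and per (3) in the commit message, initialization code attached this way is kept out of the param's backprop.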

test/hello_world_op.ml

Lines changed: 1 addition & 1 deletion
@@ -510,7 +510,7 @@ let%expect_test "Big matrix" =
   let ctx = Backend.make_context stream in
   Rand.init 0;
   (* Hey is inferred to be a matrix. *)
-  let hey = Tensor.param ~value:0.5 "hey" in
+  let hey = TDSL.param ~value:0.5 "hey" in
   let zero_to_twenty = TDSL.range 20 in
   let y = TDSL.O.((hey * zero_to_twenty) + zero_to_twenty) in
   Train.forward_and_forget backend ctx y;
