
Commit a75dce1

Support Tensor.params field via the Tensor.param function; a couple early missing-init fixes
More missing-init fixes after %cd syntax is updated to allow inline bindings for non-assignment expressions.
1 parent d2f4435

File tree: 5 files changed, +40 -19 lines changed
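
The commit makes parameter initialization explicit: Tensor.param now records the parameter in its params field and removes it from the forward roots, and the initialization code of a tensor's parameters is collected with Tensor.init_params and compiled as its own routine. A condensed sketch of the resulting workflow, distilled from the test changes below (Backend, ctx, bindings, scalar_loss and learning_rate are assumed to be set up as in those tests):

  let init_params = Tensor.init_params scalar_loss in
  let update = Train.grad_update scalar_loss in
  let sgd = Train.sgd_update ~learning_rate ~weight_decay scalar_loss in
  (* Compile the initialization on its own, then compile the training step
     against the context it produces. *)
  let init = Train.to_routine (module Backend) ctx bindings init_params in
  let sgd_routine =
    Train.to_routine (module Backend) init.context bindings (Asgns.sequence [ update; sgd ])
  in
  (* Run the parameter initialization once, before the training loop. *)
  Train.run init;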

lib/tensor.ml

Lines changed: 25 additions & 9 deletions
@@ -116,10 +116,24 @@ let iter_embedded ~f t =
   Set.iter ~f t.forward.embedded_nodes;
   Option.iter t.diff ~f:(fun diff -> Set.iter ~f diff.backprop.embedded_nodes)
 
-let init_params _t =
-  (* Based on the interface documentation, this should collect forward code of t.params *)
-  (* For now, return empty since the 'params' field is missing from the current implementation *)
-  Asgns.empty_comp
+let rec init_params t =
+  let open Asgns in
+  let rem_embedded = ref @@ Set.empty (module Tn) in
+  let asgns =
+    Block_comment
+      ( "init params for " ^ Tn.debug_name t.value,
+        sequential
+        @@ Set.fold t.params ~init:[] ~f:(fun acc param ->
+               if Set.is_empty param.params then param.forward.asgns :: acc
+               else
+                 let asgns = init_params param in
+                 rem_embedded := Set.union !rem_embedded asgns.embedded_nodes;
+                 Seq (asgns.asgns, param.forward.asgns) :: acc) )
+  in
+  let embedded_nodes =
+    Set.fold ~init:!rem_embedded t.params ~f:(fun acc p -> Set.add acc p.value)
+  in
+  { asgns; embedded_nodes }
 
 let initial_default_prec =
   Ir.Ops.prec_of_string (Utils.get_global_arg ~default:"single" ~arg_name:"default_prec")
@@ -299,7 +313,8 @@ let op ~(label : string list) ?(ternary_op = Shape.Pointwise_tern)
       session_state.backprop_roots <- Map.remove session_state.backprop_roots ti.id);
   (* The order is not relevant, we keep the same order as in backprop for readability. *)
   let diff = Some { grad = g; zero_grads; backprop } in
-  let tensor = { params = Set.empty (module T); forward; diff; id; value = v; shape; children } in
+  let params = Set.union_list (module T) @@ List.map ordered_ts ~f:(fun ti -> ti.params) in
+  let tensor = { params; forward; diff; id; value = v; shape; children } in
   session_state.forward_roots <- Map.add_exn session_state.forward_roots ~key:id ~data:tensor;
   session_state.backprop_roots <- Map.add_exn session_state.backprop_roots ~key:id ~data:tensor;
   tensor
@@ -409,10 +424,10 @@ let ndarray ?(label = []) ?(grad_spec = Prohibit_grad) ?batch_dims ?input_dims ?
     Tn.update_prec ~only_if:is_up_to_fp16 t.value single);
   t
 
-let param ?(more_label = []) ?input_dims ?output_dims ?input_axes ?output_axes ?deduced ?value ?values
-    label =
+let param ?(more_label = []) ?input_dims ?output_dims ?input_axes ?output_axes ?deduced ?value
+    ?values label =
   let fetch_op_fn ~v:_ =
-    match values, value with
+    match (values, value) with
     | Some values, None -> Asgns.Constant_fill values
     | None, Some value -> Asgns.Constant value
     | None, None -> Asgns.Range_over_offsets
@@ -429,7 +444,8 @@ let param ?(more_label = []) ?input_dims ?output_dims ?input_axes ?output_axes ?
      update computations. *)
   let g = (Option.value_exn ~here:[%here] t.diff).grad in
   Tn.update_memory_mode g Never_virtual 26;
-  t
+  remove_fwd_root t;
+  { t with params = Set.singleton (module T) t }
 
 let debug_name t = Tn.debug_name t.value
 let debug_grad t = Tn.debug_name (Option.value_exn t.diff).grad
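
An informal sketch of how the lib/tensor.ml pieces combine (w, b and loss are placeholder names, not identifiers from this commit): a tensor returned by param carries itself as its only params entry, op unions the params of its subtensors, and init_params walks the resulting set.

  (* Assuming w and b were created with [param] and loss is built from them,
     [op] gives loss.params = {w; b}, and [init_params loss] roughly yields
     (the order within [sequential] follows the set, not the program text):
       { asgns =
           Block_comment
             ( "init params for " ^ Tn.debug_name loss.value,
               sequential [ w.forward.asgns; b.forward.asgns ] );
         embedded_nodes = {w.value; b.value} }
     A parameter that itself has parameters gets its own init_params result
     sequenced in front of its forward code. *)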

lib/tensor.mli

Lines changed: 2 additions & 2 deletions
@@ -47,8 +47,8 @@ and comparator_witness
 val comparator : (t, comparator_witness) Base.Comparator.t
 
 val init_params : t -> comp
-(** [init_params t] simply collects the {!field:forward} code of [t.params] into a single sequence.
-*)
+(** [init_params t] collects into a single sequence the {!field:forward} code of [t.params], and
+    transitively the initializations of the parameters of the parameters. *)
 
 val is_fwd_root : t -> bool
 val remove_fwd_root : t -> unit

lib/train.ml

Lines changed: 4 additions & 2 deletions
@@ -128,8 +128,8 @@ let grad_update ?(disable_rootness_check = false) ?(setup_for_parallel = false)
 
 (** See: https://github.com/tinygrad/tinygrad/blob/master/tinygrad/nn/optim.py *)
 let sgd_one ~learning_rate ?(momentum = 0.0) ?(weight_decay = 0.0) ?(nesterov = false) p =
-  if not @@ Set.mem p.Tensor.params p then
-    raise @@ Tensor.Session_error ("Train.sgd_one: not a parameter", Some p);
+  if Option.is_none p.Tensor.diff then
+    raise @@ Tensor.Session_error ("Train.sgd_one: not differentiable", Some p);
   [%cd
     ~~(p "param sgd step";
        "sgd_delta" =: p.grad + (!.weight_decay *. p);
@@ -254,6 +254,8 @@ let%track3_sexp parallel_update (type buffer_ptr dev runner event optimize_ctx)
       Array.for_all grad_updates ~f:(fun upd ->
           [%equal: Idx.static_symbol list] bindings @@ List.map ~f:fst upd.bindings))];
   let all_params : Tensor.t array = Set.to_array loss.Tensor.params in
+  if Array.is_empty all_params then
+    raise @@ Tensor.Session_error ("Train.parallel_update: no parameters", Some loss);
   let _occupancies_debug : bool array array = occupancies_dst_src in
   let ctxs = [%debug_notrace Array.map grad_updates ~f:(fun upd -> upd.context)] in
   let occupancy_dst ~dst_n = Array.exists ~f:Fn.id occupancies_dst_src.(dst_n) in

test/einsum/moons_demo_variant.ml

Lines changed: 5 additions & 5 deletions
@@ -54,15 +54,18 @@ let () =
      computation. *)
   let weight_decay = 0.0001 in
   let%op scalar_loss = (margin_loss ++ "...|... => 0") /. !..batch_size in
+  let init_params = Tensor.init_params scalar_loss in
   let update = Train.grad_update scalar_loss in
   let%op learning_rate = 0.1 *. ((2 *. !..steps) - !@step_n) /. !..steps in
   Train.set_hosted learning_rate.value;
   let sgd = Train.sgd_update ~learning_rate ~weight_decay scalar_loss in
+  let init = Train.to_routine (module Backend) ctx bindings init_params in
   let sgd_routine =
-    Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update; sgd ])
+    Train.to_routine (module Backend) init.context bindings (Asgns.sequence [ update; sgd ])
   in
   let step_ref = IDX.find_exn sgd_routine.bindings step_n in
   step_ref := 0;
+  Train.run init;
   for _epoch = 1 to epochs do
     Train.sequential_loop sgd_routine.bindings ~f:(fun () ->
         Train.run sgd_routine;
@@ -77,8 +80,7 @@ let () =
   let points = Tn.points_2d ~xdim:0 ~ydim:1 moons_flat.value in
   let classes = Tn.points_1d ~xdim:0 moons_classes.value in
   let points1, points2 = Array.partitioni_tf points ~f:Float.(fun i _ -> classes.(i) > 0.) in
-  let%op mlp_result = mlp "point" in
-  Train.set_on_host mlp_result.value;
+  let%cd mlp_result = mlp "point" in
   let result_routine =
     Train.to_routine
       (module Backend)
@@ -89,8 +91,6 @@ let () =
   in
   let callback (x, y) =
     Tn.set_values point.value [| x; y |];
-    (* For the gccjit backend, point is only on host, not on device. For cuda, this will be
-       needed. *)
     Train.run result_routine;
     Float.(mlp_result.@[0] >= 0.)
   in

test/micrograd_demo_logging.ml

Lines changed: 4 additions & 1 deletion
@@ -30,9 +30,12 @@ let () =
   let%op g = f /. 2 in
   let%op g = g + (10. /. f) in
   List.iter ~f:(Option.iter ~f:(fun diff -> Train.set_hosted diff.Tensor.grad)) [ a.diff; b.diff ];
+  let init_params = Tensor.init_params g in
   let update = Train.grad_update g in
-  let step = Train.to_routine (module Backend) ctx IDX.empty update in
+  let init = Train.to_routine (module Backend) ctx IDX.empty init_params in
+  let step = Train.to_routine (module Backend) init.context IDX.empty update in
   Utils.capture_stdout_logs @@ fun () ->
+  Train.run init;
   Train.run step;
   Tensor.print ~with_code:false ~with_grad:false `Default g;
   Tensor.print ~with_code:false ~with_grad:true `Default a;
