Commit 80b7d04

Don't include zero_grads in consume_backprop_code, to avoid forcing callers to handle zero_grads and backprop together
1 parent 24f71f9 commit 80b7d04

File tree

lib/tensor.ml
lib/tensor.mli
lib/train.ml

3 files changed: +8 -29 lines

lib/tensor.ml

Lines changed: 1 addition & 1 deletion
@@ -577,7 +577,7 @@ let consume_backprop_code t =
             found potentially unsafe roots: %{String.concat ~sep:", " @@ List.map ~f:debug_name unsafe_roots}|}],
           Some t );
   remove_bprop_root t;
-  (diff.zero_grads, diff.backprop)
+  diff.backprop

 let random_seed = ref None

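Note: a minimal sketch of the call-site adaptation implied by the new return type; the helper name consume_zero_grads_and_backprop is hypothetical, and only constructs visible in this diff are assumed.

(* Hypothetical helper: recover the old pairing by fetching zero_grads from the
   diff record, for callers that still want both pieces together. *)
let consume_zero_grads_and_backprop t =
  let bprop = Tensor.consume_backprop_code t in
  let zero_grads = (Option.value_exn ~here:[%here] t.Tensor.diff).zero_grads in
  (zero_grads, bprop)
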
lib/tensor.mli

Lines changed: 5 additions & 8 deletions
@@ -8,15 +8,11 @@ type tn_set = Set.M(Ir.Tnode).t
 type asgns = Ir.Assignments.t
 type comp = Ir.Assignments.comp
 type fetch_op = Ir.Assignments.fetch_op
-type projections = {
-  projections_debug : string;
-  projections : Ir.Indexing.projections Lazy.t;
-}
+type projections = { projections_debug : string; projections : Ir.Indexing.projections Lazy.t }

 type diff = {
   grad : tn;
-  zero_grads : asgns;
-      (** Prepares for backpropagation. Always compile as: [Seq (zero_grads, backprop)]. *)
+  zero_grads : asgns;  (** Prepares for backpropagation. Beware of the "missing zero_grads" bug. *)
   backprop : comp;
       (** Backpropagates for the tensor and its descendants; which typically means adding partial
           gradients to the gradient tensor of the subtensors, then for sub-subtensors etc. *)
@@ -215,11 +211,12 @@ val consume_forward_code : t -> comp
     [consume_forward_code t] ensures [t] is a forward root, removes it from forward roots, and
     checks that there are no other forward roots for tensors with children. *)

-val consume_backprop_code : t -> asgns * comp
+val consume_backprop_code : t -> comp
 (** A backprop root is a tensor with a gradient that is not (currently) receiving gradients from
     another tensor. I.e. it is not currently used to compute a tensor with a gradient.
     [consume_backprop_code t] ensures [t] is a backprop root, removes it from backprop roots, and
-    checks that there are no other backprop roots for tensors with children. *)
+    checks that there are no other backprop roots for tensors with children. It returns the backprop
+    code -- note this does not include the zero_grads code. *)

 val iter_embedded : f:(tn -> unit) -> t -> unit
 (** [iter_embedded t] iterates over all descendant nodes that are embedded, i.e. are members of
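Note: the updated doc comments shift responsibility for running zero_grads onto the caller. Below is a minimal sketch of the intended ordering, assuming the [%cd] syntax and the Asgns alias used in lib/train.ml; the name fwd_bprop_sketch is hypothetical, and grad_update in lib/train.ml (next file) does the same with extra root checks.

(* Hypothetical sketch: zero gradients strictly before backpropagating, since
   consume_backprop_code no longer hands back the zeroing code. *)
let fwd_bprop_sketch loss =
  let fwd = Tensor.consume_forward_code loss in
  let bprop = Tensor.consume_backprop_code loss in
  let zero_grads = (Option.value_exn ~here:[%here] loss.Tensor.diff).zero_grads in
  [%cd
    ~~(loss "gradient update";
       ~~(loss "fwd";
          fwd);
       ~~(loss "zero grads";
          Asgns.to_comp zero_grads);
       loss.grad =: 1;
       ~~(loss "bprop";
          bprop))]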

lib/train.ml

Lines changed: 2 additions & 20 deletions
@@ -79,25 +79,6 @@ let forward t =
   let label = Tn.debug_name t.value in
   { fwd with asgns = Asgns.Block_comment (label ^ " fwd", fwd.asgns) }

-let diff_or_error t provenance =
-  Option.value_or_thunk t.Tensor.diff ~default:(fun () ->
-      raise @@ Tensor.Session_error (provenance ^ ": tensor is not differentiable", Some t))
-
-let grad_update_nochecks loss =
-  let diff = diff_or_error loss "Train.grad_update_nochecks" in
-  let fwd_bprop =
-    [%cd
-      ~~(loss "gradient update";
-         ~~(loss "fwd";
-            loss.forward);
-         ~~(loss "zero grads";
-            Asgns.to_comp diff.zero_grads);
-         loss.grad =: 1;
-         ~~(loss "bprop";
-            diff.backprop))]
-  in
-  fwd_bprop
-
 (** Returns the tensor's forward, zeroing gradients, and backprop code wrapped with label-derived
     comments. Sets the tensor's value as "fully on host". If [setup_for_parallel] is true (false by
     default), sets the parameters and their gradients as "non-local" (on-device). *)
@@ -107,7 +88,8 @@ let grad_update ?(setup_for_parallel = false) loss =
   Set.iter loss.Tensor.params ~f:(fun p ->
       set_materialized (Option.value_exn ~here:[%here] p.diff).grad);
   let fwd = Tensor.consume_forward_code loss in
-  let zero_grads, bprop = Tensor.consume_backprop_code loss in
+  let bprop = Tensor.consume_backprop_code loss in
+  let zero_grads = (Option.value_exn ~here:[%here] loss.diff).zero_grads in
   (* Note: the %cd syntax for [loss.grad] does not modify roots. *)
   [%cd
     ~~(loss "gradient update for" loss;

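Note: with grad_update_nochecks removed and zero_grads no longer bundled into consume_backprop_code, a caller that wants to schedule zeroing independently of backprop (the motivation stated in the commit message) can now build the two computations separately. A rough, hypothetical sketch under the same assumptions as the note above; the name sketch_separate_phases is not part of this change.

(* Hypothetical: expose zeroing and backprop as two separately schedulable
   computations; when and how often each is run is left to the caller. *)
let sketch_separate_phases loss =
  let bprop = Tensor.consume_backprop_code loss in
  let diff = Option.value_exn ~here:[%here] loss.diff in
  let zeroing =
    [%cd
      ~~(loss "zero grads";
         Asgns.to_comp diff.zero_grads)]
  in
  let backward =
    [%cd
      ~~(loss "bprop";
         loss.grad =: 1;
         bprop)]
  in
  (zeroing, backward)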