
Commit 4aef0cc

Rename non_embedded/embedded distinction to inputs/outputs,
defensively fix (make more precise) handling of grad nodes when computing inputs/outputs.
1 parent 5eab2ea commit 4aef0cc
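In caller terms the change looks roughly like the sketch below; the backend module, context ctx, compiled code, and tensor t are hypothetical placeholders, and only the switch from a Tnode.t list with iter_embedded_arrays to a Set.M(Tnode).t with input_nodes / iter_outputs comes from this commit:

    (* Before: from_prior_context was a Tnode.t list, and host transfers iterated
       the "embedded arrays" of the tensor. *)
    let _before ctx code t ~nodes =
      let routine = Backend.link ~from_prior_context:nodes ctx code in
      Tensor.iter_embedded_arrays t ~f:(fun a -> ignore (Backend.from_host routine.context a : bool));
      routine

    (* After: from_prior_context is a Set.M(Tnode).t of the tensor's input nodes, and
       host transfers iterate over its output nodes. *)
    let _after ctx code t =
      let routine = Backend.link ~from_prior_context:(Tensor.input_nodes t) ctx code in
      Tensor.iter_outputs t ~f:(fun a -> ignore (Backend.from_host routine.context a : bool));
      routine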

File tree: 6 files changed, +80 -50 lines changed


arrayjit/lib/backends.ml

Lines changed: 19 additions & 12 deletions
@@ -53,7 +53,7 @@ module type No_device_backend = sig
       [occupancy] returns true are included. *)

   val link :
-    ?from_prior_context:Tnode.t list ->
+    ?from_prior_context:Set.M(Tnode).t ->
     merge_buffer:(buffer_ptr * Tnode.t) option ref ->
     context ->
     code ->
@@ -64,7 +64,7 @@ module type No_device_backend = sig
       context, they must be part of the given context. *)

   val link_batch :
-    ?from_prior_context:Tnode.t list ->
+    ?from_prior_context:Set.M(Tnode).t ->
     merge_buffer:(buffer_ptr * Tnode.t) option ref ->
     context ->
     code_batch ->
@@ -90,13 +90,19 @@ end
 module type Backend = sig
   include No_device_backend

-  val link : ?from_prior_context:Tnode.t list -> context -> code -> routine
-  (** Returns the routine for the code's procedure, in a new context derived from the given context. *)
+  val link : ?from_prior_context:Set.M(Tnode).t -> context -> code -> routine
+  (** Returns the routine for the code's procedure, in a new context derived from the given context.
+
+      The [from_prior_context] nodes must not be added to the resulting context -- if needed in
+      context, they must be part of the given context. *)

   val link_batch :
-    ?from_prior_context:Tnode.t list -> context -> code_batch -> context * routine option array
+    ?from_prior_context:Set.M(Tnode).t -> context -> code_batch -> context * routine option array
   (** Returns the routines for the procedures included in the code batch. The returned context is
-      downstream of all the returned routines. *)
+      downstream of all the returned routines.
+
+      The [from_prior_context] nodes must not be added to the resulting context -- if needed in
+      context, they must be part of the given context. *)

   type event
   (** An event tracks if a device finished computing past a particular point in its schedue. These
@@ -805,7 +811,7 @@ end
 let verify_prior_context ~ctx_arrays ~is_in_context ~prior_context ~from_prior_context traced_stores
     =
   let olds = ctx_arrays prior_context in
-  List.iter from_prior_context ~f:(fun tn ->
+  Set.iter from_prior_context ~f:(fun tn ->
       let node = Array.find_map traced_stores ~f:(fun store -> Hashtbl.find store tn) in
       if
         Option.value_map node ~default:false ~f:(fun node ->
@@ -875,7 +881,8 @@ module Simple_no_device_backend (Backend : Simple_backend) : No_device_backend =
     if shared then Compiled (lowereds, compile_batch ~names ~opt_ctx_arrays:None bindings lowereds)
     else Postponed { lowereds; bindings; names }

-  let link ?(from_prior_context = []) ~merge_buffer (prior_context : context) (code : code) =
+  let link ?(from_prior_context = Set.empty (module Tnode)) ~merge_buffer (prior_context : context)
+      (code : code) =
     Backend.(
       verify_prior_context ~ctx_arrays ~is_in_context ~prior_context ~from_prior_context
         [| get_traced_store code |]);
@@ -890,8 +897,8 @@ module Simple_no_device_backend (Backend : Simple_backend) : No_device_backend =
     in
     { context; schedule; bindings; name }

-  let link_batch ?(from_prior_context = []) ~merge_buffer (prior_context : context)
-      (code_batch : code_batch) =
+  let link_batch ?(from_prior_context = Set.empty (module Tnode)) ~merge_buffer
+      (prior_context : context) (code_batch : code_batch) =
     Backend.(
       verify_prior_context ~ctx_arrays ~is_in_context ~prior_context ~from_prior_context
       @@ get_traced_stores code_batch);
@@ -975,13 +982,13 @@ module Cuda_backend : Backend = struct
           Option.(join @@ map lowered ~f:(fun optim -> optim.Low_level.merge_node)));
     }

-  let link ?(from_prior_context = []) context code =
+  let link ?(from_prior_context = Set.empty (module Tnode)) context code =
     verify_prior_context ~ctx_arrays ~is_in_context ~prior_context:context.ctx ~from_prior_context
       [| code.traced_store |];
     let ctx, bindings, schedule = link context.ctx code.code in
     { context = { ctx; expected_merge_node = code.expected_merge_node }; schedule; bindings; name }

-  let link_batch ?(from_prior_context = []) context code_batch =
+  let link_batch ?(from_prior_context = Set.empty (module Tnode)) context code_batch =
     verify_prior_context ~ctx_arrays ~is_in_context ~prior_context:context.ctx ~from_prior_context
       code_batch.traced_stores;
     let ctx, bindings, schedules = link_batch context.ctx code_batch.code_batch in
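Since from_prior_context is now a set rather than a list, callers build it with Base's set API over the Tnode comparator. A minimal sketch; the bindings below are illustrative and not part of the commit:

    (* The new default, as used by the link implementations above. *)
    let no_prior : Set.M(Tnode).t = Set.empty (module Tnode)

    (* Growing and consuming such a set; Set.iter replaces the former List.iter
       inside verify_prior_context. *)
    let with_node set tn : Set.M(Tnode).t = Set.add set tn
    let check_all (from_prior_context : Set.M(Tnode).t) ~f = Set.iter from_prior_context ~f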

bin/compilation_speed.ml

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ let benchmark_overhead backend () =
   let init_assign_x = link ctx @@ compile ~name:"init_assign_x" IDX.empty mock_update_x in
   let f_routine = link init_assign_x.context @@ compile IDX.empty update_f.fwd_bprop in
   Tensor.print_tree ~with_grad:true ~with_backend_info:true ~depth:9 f;
-  Tensor.iter_embedded_arrays f ~f:(fun a -> ignore (from_host f_routine.context a : bool));
+  Tensor.iter_outputs f ~f:(fun a -> ignore (from_host f_routine.context a : bool));

   let xs = Array.init n_data ~f:Float.(fun i -> of_int i - (of_int n_data /. 2.)) in
   let open Operation.At in

bin/zero2hero_1of7.ml

Lines changed: 5 additions & 5 deletions
@@ -163,9 +163,9 @@ let _suspended () =
   let device = new_virtual_device @@ get_device ~ordinal:0 in
   let update = Train.grad_update l in
   let routine = link (init device) @@ compile IDX.empty @@ update.fwd_bprop in
-  Tensor.iter_embedded_arrays l ~f:(fun a -> ignore (from_host routine.context a : bool));
+  Tensor.iter_outputs l ~f:(fun a -> ignore (from_host routine.context a : bool));
   Train.run routine;
-  Tensor.iter_embedded_arrays l ~f:(fun a -> ignore (to_host routine.context a : bool));
+  Tensor.iter_outputs l ~f:(fun a -> ignore (to_host routine.context a : bool));
   await device;
   Stdio.print_endline
     {|
@@ -177,7 +177,7 @@ let _suspended () =
     link routine.context @@ compile IDX.empty @@ Train.sgd_update ~learning_rate update
   in
   (* learning_rate is virtual so this will not print anything. *)
-  Tensor.iter_embedded_arrays learning_rate ~f:(fun a ->
+  Tensor.iter_outputs learning_rate ~f:(fun a ->
       ignore (from_host routine.context a : bool));
   Stdio.print_endline
     {|
@@ -187,7 +187,7 @@ let _suspended () =
   List.iter [ a.value; b.value; c.value; f.value ] ~f:(fun a ->
       assert (from_host routine.context a));
   Train.run routine;
-  Tensor.iter_embedded_arrays l ~f:(fun a -> ignore (to_host routine.context a : bool));
+  Tensor.iter_outputs l ~f:(fun a -> ignore (to_host routine.context a : bool));
   await device;
   Stdio.print_endline
     {|
@@ -198,7 +198,7 @@ let _suspended () =
   let update = Train.grad_update l in
   let routine = link routine.context @@ compile IDX.empty update.fwd_bprop in
   Train.run routine;
-  Tensor.iter_embedded_arrays l ~f:(fun a -> ignore (to_host routine.context a : bool));
+  Tensor.iter_outputs l ~f:(fun a -> ignore (to_host routine.context a : bool));
   await device;
   Stdio.print_endline
     {|

lib/tensor.ml

Lines changed: 30 additions & 17 deletions
@@ -6,6 +6,7 @@ module Idx = Arrayjit.Indexing
 module Debug_runtime = Arrayjit.Utils.Debug_runtime

 type tn = Tn.t
+type tn_set = Set.M(Arrayjit.Tnode).t
 type asgns = Asgns.t
 type init_op = Arrayjit.Ops.init_op
 type fetch_op = Asgns.fetch_op
@@ -23,9 +24,10 @@ type t = {
   forward : Asgns.t;
   diff : diff option;
   id : int;
-  value : Tn.t;
+  value : tn;
   shape : Shape.t;
   children : subtensor list;
+  non_embedded : tn_set;
 }

 and subtensor = { subtensor : t; embedded : bool }
@@ -147,12 +149,14 @@ let op ~(label : string list) ?(compose_op = Shape.Pointwise_bin)
     ?(transpose_op = Shape.Pointwise_un) ?(init_op = default_init_op) ~op_asn ~grad_asn
     ?(grad_spec = If_needed) make_shape (orig_ts : t list) : t =
   let ordered_ts = List.dedup_and_sort orig_ts ~compare:(fun t1 t2 -> Int.ascending t1.id t2.id) in
+  let non_embedded = ref @@ Set.empty (module Tn) in
   let children =
     List.folding_map orig_ts
       ~init:(Set.empty (module Int))
       ~f:(fun used ti ->
-        ( Set.add used ti.id,
-          { subtensor = ti; embedded = is_fwd_root ti && not (Set.mem used ti.id) } ))
+        let root = is_fwd_root ti in
+        if not root then non_embedded := Set.add !non_embedded ti.value;
+        (Set.add used ti.id, { subtensor = ti; embedded = root && not (Set.mem used ti.id) }))
   in
   let id = session_state.next_id in
   session_state.next_id <- session_state.next_id + 1;
@@ -187,7 +191,9 @@ let op ~(label : string list) ?(compose_op = Shape.Pointwise_bin)
      || Fn.non is_require_grad grad_spec
         && List.for_all orig_ts ~f:(fun ti -> Option.is_none ti.diff)
   then (
-    let tensor = { forward; diff = None; id; value = v; shape; children } in
+    let tensor =
+      { forward; diff = None; id; value = v; shape; children; non_embedded = !non_embedded }
+    in
     session_state.forward_roots <- Map.add_exn session_state.forward_roots ~key:id ~data:tensor;
     tensor)
   else
@@ -216,7 +222,11 @@ let op ~(label : string list) ?(compose_op = Shape.Pointwise_bin)
      that all ancestors of a node are backpropagated before the node is backpropagated, even for
      non-tree DAGs. *)
   let backprop =
-    let bprop = dcode ~f:(fun diff -> diff.backprop) in
+    let bprop =
+      dcode ~f:(fun diff ->
+          non_embedded := Set.add !non_embedded diff.grad;
+          diff.backprop)
+    in
     let bcks =
       List.map ordered_ts ~f:(fun ti -> if is_bck_root ti then bprop ti else Asgns.Noop)
     in
@@ -226,7 +236,7 @@ let op ~(label : string list) ?(compose_op = Shape.Pointwise_bin)
       session_state.backprop_roots <- Map.remove session_state.backprop_roots ti.id);
   (* The order is not relevant, we keep the same order as in backprop for readability. *)
   let diff = Some { grad = g; zero_grads; backprop } in
-  let tensor = { forward; diff; id; value = v; shape; children } in
+  let tensor = { forward; diff; id; value = v; shape; children; non_embedded = !non_embedded } in
   session_state.forward_roots <- Map.add_exn session_state.forward_roots ~key:id ~data:tensor;
   session_state.backprop_roots <- Map.add_exn session_state.backprop_roots ~key:id ~data:tensor;
   tensor
@@ -350,30 +360,33 @@ let param ?(more_label = []) ?input_dims ?output_dims ?input_axes ?output_axes ?
   Tn.update_memory_mode g Never_virtual 26;
   t

-let rec iter_embedded_arrays ~f t =
-  f t.value;
-  Option.iter t.diff ~f:(fun diff -> f diff.grad);
-  List.iter ~f:(fun ch -> if ch.embedded then iter_embedded_arrays ~f ch.subtensor) t.children
-
-let rec non_and_embedded_nodes t =
+let rec inputs_and_outputs t =
+  (* TODO: consider either caching here, or as a field of t. *)
+  let opt_grad t = Option.value_map ~default:[] ~f:(fun diff -> [ diff.grad ]) t.diff in
+  let dir_outputs t =
+    Set.of_list (module Tn)
+    @@ List.filter ~f:(fun tn -> not @@ Set.mem t.non_embedded tn)
+    @@ (t.value :: opt_grad t)
+  in
+  let open Arrayjit.Utils.Set_O in
   let non_embedded, embedded =
     List.fold t.children
-      ~init:(Set.empty (module Self), Set.empty (module Self))
+      ~init:(t.non_embedded, Set.of_list (module Tn) (t.value :: opt_grad t))
       ~f:(fun (non_embedded, embedded) ch ->
-        if ch.embedded then (non_embedded, Set.add embedded ch.subtensor)
-        else (Set.add non_embedded ch.subtensor, embedded))
+        (ch.subtensor.non_embedded + non_embedded, dir_outputs ch.subtensor + embedded))
   in
-  let open Arrayjit.Utils.Set_O in
   let non_embedded, embedded =
     List.fold t.children ~init:(non_embedded, embedded)
      ~f:(fun ((non_embedded, embedded) as accu) ch ->
        if ch.embedded then
-          let more_non, more = non_and_embedded_nodes ch.subtensor in
+          let more_non, more = inputs_and_outputs ch.subtensor in
          (non_embedded + more_non, embedded + more)
        else accu)
  in
  (non_embedded - embedded, embedded)

+let iter_outputs ~f t = Set.iter ~f @@ snd @@ inputs_and_outputs t
+let input_nodes t = fst @@ inputs_and_outputs t
 let debug_name t = Tn.debug_name t.value
 let debug_grad t = Tn.debug_name (Option.value_exn t.diff).grad
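The inputs/outputs split above reduces to set algebra: union the non_embedded nodes and the embedded value/grad nodes across the DAG, then subtract. A self-contained illustration of that final subtraction, with strings standing in for tensor nodes (the node names are invented):

    open Base

    (* Nodes consumed from other computations (the gathered non_embedded sets). *)
    let gathered_inputs = Set.of_list (module String) [ "w.value"; "w.grad"; "x.value" ]

    (* Nodes whose forward/backprop computation is embedded here (the gathered outputs). *)
    let gathered_outputs = Set.of_list (module String) [ "loss.value"; "loss.grad"; "w.grad" ]

    (* Mirrors (non_embedded - embedded, embedded): anything also produced here is not an input. *)
    let inputs = Set.diff gathered_inputs gathered_outputs

    (* Prints "w.value" and "x.value", each on its own line. *)
    let () = Set.iter inputs ~f:Stdio.print_endline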

lib/tensor.mli

Lines changed: 18 additions & 4 deletions
@@ -3,6 +3,7 @@
 open Base

 type tn = Arrayjit.Tnode.t
+type tn_set = Set.M(Arrayjit.Tnode).t
 type asgns = Arrayjit.Assignments.t
 type init_op = Arrayjit.Ops.init_op
 type fetch_op = Arrayjit.Assignments.fetch_op
@@ -26,11 +27,19 @@ type t = {
       (** The eventual shape of [t.value] and [t.diff.grad], incorporating the current state of
           shape inference. *)
   children : subtensor list;
+  non_embedded : tn_set;
+      (** These tensor nodes ([value], resp. [grad] of {!diff}) of the children which are not
+          computed by [forward], resp. [backprop] of {!diff}. *)
 }
 [@@deriving sexp_of]
 (** Information needed for compositional code generation. *)

-and subtensor = { subtensor : t; embedded : bool }
+and subtensor = {
+  subtensor : t;
+  embedded : bool;
+      (** A tensor can be an [embedded] child at most once -- that's where its [forward] computation
+          ends up when used as part of a bigger computation. *)
+}

 type comparator_witness

@@ -174,9 +183,6 @@ val param :
     [Require_grad]. The resulting tensor's label is the passed string, appended by [more_label] if
     any. *)

-val iter_embedded_arrays : f:(tn -> unit) -> t -> unit
-val non_and_embedded_nodes : t -> (t, comparator_witness) Set.t * (t, comparator_witness) Set.t
-
 val consume_forward_code : t -> asgns
 (** A forward root is a tensor that is not (currently) used to compute another tensor.
     [consume_forward_code t] ensures [t] is a forward root, removes it from forward roots, and
@@ -188,6 +194,14 @@ val consume_backprop_code : t -> asgns * asgns
     [consume_backprop_code t] ensures [t] is a backprop root, removes it from backprop roots, and
     checks that there are no other backprop roots for tensors with children. *)

+val input_nodes : t -> tn_set
+(** The nodes of descendant tensors whose computation is not embedded by the given tensor. They are
+    "inputs" coming from other computations. *)
+
+val iter_outputs : f:(tn -> unit) -> t -> unit
+(** [iter_outputs t] iterates over all descendant nodes that are embedded, i.e. are not members
+    of [input_nodes t]. *)
+
 val unsafe_reinitialize : unit -> unit
 (** Bring global state to its initialization values. This invalidates any previously defined tensors
     and tensor nodes. Also reinitializes the modules: {!Shape}, {!Arrayjit.Tnode},
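A usage sketch of the two new interface entries; the report wrapper and the printing are made up, while Tnode.debug_name is the accessor already used by lib/tensor.ml above:

    let report (t : Tensor.t) =
      Set.iter (Tensor.input_nodes t) ~f:(fun tn ->
          Stdio.print_endline ("input:  " ^ Arrayjit.Tnode.debug_name tn));
      Tensor.iter_outputs t ~f:(fun tn ->
          Stdio.print_endline ("output: " ^ Arrayjit.Tnode.debug_name tn))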

lib/train.ml

Lines changed: 7 additions & 11 deletions
@@ -262,22 +262,18 @@ let%track3_sexp round_robin_dry_run ~num_devices jitbs ~dry_sync : unit =
 let set_virtual (a : Tn.t) = Tn.update_memory_mode a Virtual 29

 let every_non_literal_on_host =
-  Tensor.iter_embedded_arrays ~f:(fun a ->
+  Tensor.iter_outputs ~f:(fun a ->
       if Tn.mode_is_unspecified a && not (Tn.known_constant a) then set_hosted a)

 let%debug2_sexp all_host_to_device (type context)
     (module Backend : Backend_type with type context = context) context =
   let f tn = ignore (Backend.from_host context tn : bool) in
-  Tensor.iter_embedded_arrays ~f
+  Tensor.iter_outputs ~f

 let%debug2_sexp all_device_to_host (type context)
     (module Backend : Backend_type with type context = context) context =
   let f tn = ignore (Backend.to_host context tn : bool) in
-  Tensor.iter_embedded_arrays ~f
-
-let needs_prior_context t =
-  Tensor.non_and_embedded_nodes t |> fst |> Set.to_list
-  |> List.concat_map ~f:(fun t -> t.value :: Option.(to_list @@ map t.diff ~f:(fun d -> d.grad)))
+  Tensor.iter_outputs ~f

 (** Executes the jitted code and copies arrays embedded in the given tenosor from and to host,
     synchronizes before copying to host. If [looping] is provided, loops over bindings and executes
@@ -352,7 +348,7 @@ let%track3_sexp parallel_update (type context)
   (* We can cache scheduling, because merging and copying does not depend on static indexing. *)
   let loss_merge =
     Backend.(
-      link ~from_prior_context:(needs_prior_context updaten.loss) sgd_update.context
+      link ~from_prior_context:(Tensor.input_nodes updaten.loss) sgd_update.context
       @@ compile Idx.Empty
           [%cd
             ~~("merging" updaten.loss;
@@ -459,7 +455,7 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init
   set_hosted learning_rate.value;
   let sgd = sgd_update ~learning_rate ~weight_decay update in
   let grad_update = Backend.compile ~shared:true bindings update.fwd_bprop in
-  let from_prior_context = needs_prior_context update.loss in
+  let from_prior_context = Tensor.input_nodes update.loss in
   let grad_updates =
     Array.map prior_contexts ~f:(fun ctx -> Backend.link ~from_prior_context ctx grad_update)
   in
@@ -511,7 +507,7 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init
   (* By using sgd_update.context, maybe we don't need to copy the parameters back to the host. *)
   let routine =
     Backend.(
-      link ~from_prior_context:(needs_prior_context model_result) sgd_update.context
+      link ~from_prior_context:(Tensor.input_nodes model_result) sgd_update.context
       @@ compile IDX.empty
       @@ Block_comment ("infer " ^ Tn.debug_name model_result.value, infer_fwd))
   in
@@ -533,7 +529,7 @@ let%track3_sexp forward_and_ctx ?(disable_rootness_check = false) (type context)
     (module Backend : Backend_type with type context = context) ctx ?(bindings = IDX.empty) t =
   let routine =
     Backend.(
-      link ~from_prior_context:(needs_prior_context t) ctx
+      link ~from_prior_context:(Tensor.input_nodes t) ctx
       @@ compile bindings @@ forward ~disable_rootness_check t)
   in
   if not disable_rootness_check then Tensor.remove_bprop_root t;
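For reference, the removed helper next to its replacement; the first body is copied from the deletion above and would no longer compile, since non_and_embedded_nodes is gone from the interface:

    (* Before (removed above): flatten the input tensors into a list of value/grad nodes. *)
    let _before t =
      Tensor.non_and_embedded_nodes t |> fst |> Set.to_list
      |> List.concat_map ~f:(fun t -> t.value :: Option.(to_list @@ map t.diff ~f:(fun d -> d.grad)))

    (* After: Tensor.input_nodes yields the Set.M(Tnode).t directly, with grad nodes already
       tracked at tensor-construction time (the "defensive" fix from the commit message). *)
    let _after t = Tensor.input_nodes t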
