
Commit a834849

Verifying context by tracking embedded_nodes;
big change, needs some cleanup but mostly done.
1 parent dfc1858 commit a834849

19 files changed: +375 -229 lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
 ### Changed

 - Migrated to cudajit 0.5.
+- Verifying that code is linked with the right contexts, by tracking `embedded_nodes` with assignments.
 - TODO: Built per-tensor-node device-to-device synchronization into device-to-device copying functions, removed obsolete blocking synchronizations.

 ## [0.4.1] -- 2024-09-17
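
To make the changelog entry concrete, the tracking happens through a record that pairs assignments with the set of nodes they introduce. A sketch of the type added in arrayjit/lib/assignments.ml below (abbreviations as in that file):

```ocaml
(* Nodes used by [asgns] but absent from [embedded_nodes] must already live in
   the contexts that the computation gets linked with. *)
type comp = { asgns : t; embedded_nodes : Set.M(Tn).t }
```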

README.md

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ OCANNL is sponsored by [Ahrefs](https://ocaml.org/success-stories/peta-byte-scal
 * Differentiable computations, centered around the [`%op`](lib/ppx_op.ml) syntax extension.
   * `%op` stands for "operation", it's meant to express tensors: `Tensor.t`, and tensor functions.
 * Plain computations, centered around the [`%cd`](lib/ppx_cd.ml) syntax extension. It integrates the `arrayjit` backend library with shape inference.
-  * `%cd` stands for "code", it's meant to express assignments: `Assignments.t`.
+  * `%cd` stands for "code", it's meant to express assignment computations: `Assignments.comp`.
 * The support for mixed-precision computations is upcoming.
   * E.g. higher-precision network components, or gradients at a higher precision than values.
   * Currently (v0.3), you can select the precision, and individual computation nodes track their precision, but mixing precisions might break things.
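
A minimal sketch of what the `%cd` change looks like at a use site (taken from the demo updates in this commit; it assumes the surrounding `Backend`, `routine`, `IDX`, and `mlp_result` bindings from those demos): the `%cd` payload now builds an `Assignments.comp`, which `Train.to_routine` consumes directly, and `~~(...)` attaches the block comment that `Block_comment` used to provide:

```ocaml
(* The [%cd ...] payload elaborates to an [Assignments.comp];
   [~~ ("moons" "infer"; ...)] names the block for printouts and logs. *)
let result_routine =
  Train.to_routine
    (module Backend)
    routine.context IDX.empty
    [%cd
      ~~("moons" "infer";
         mlp_result.forward)]
```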

arrayjit/lib/assignments.ml

Lines changed: 55 additions & 31 deletions
@@ -44,7 +44,18 @@ and t =
   | Fetch of { array : Tn.t; fetch_op : fetch_op; dims : int array Lazy.t }
 [@@deriving sexp_of]

-type comp = {asgns: t; }
+type comp = {
+  asgns : t;
+  embedded_nodes : Set.M(Tn).t;
+      (** The nodes in {!field-asgns} that are not in [embedded_nodes] need to already be in
+          contexts linked with the {!comp}. *)
+}
+[@@deriving sexp_of]
+(** Computations based on assignments. Note: the [arrayjit] library makes use of, but does not
+    produce nor verify the {!field-embedded_nodes} associated to some given {!field-asgns}. *)
+
+let to_comp asgns = { asgns; embedded_nodes = Set.empty (module Tnode) }
+let empty_comp = to_comp Noop

 let get_name_exn asgns =
   let punct_or_sp = Str.regexp "[-@*/:.;, ]" in
@@ -63,39 +74,54 @@ let get_name_exn asgns =
   let result = loop asgns in
   if String.is_empty result then invalid_arg "Assignments.get_name: no comments in code" else result

-(** Returns nodes that are inputs to the computation in a narrow sense: nodes that were potentially
-    computed by assignments executed before. *)
-let input_or_recurrent_nodes asgns =
+let is_total ~initialize_neutral ~projections =
+  initialize_neutral && Indexing.is_bijective projections
+
+(** Returns the left-hand-side nodes of total assignments. NOTE: [output_nodes] forces the
+    computation of the assignments' projections, so should only be called after shape inference. *)
+let output_nodes asgns =
   let open Utils.Set_O in
   let empty = Set.empty (module Tn) in
-  let single = function
-    | Node tn ->
-        if Tn.known_constant tn || Tn.known_volatile tn || Tn.known_not_materialized tn then
-          Set.empty (module Tn)
-        else Set.singleton (module Tn) tn
-    | Merge_buffer _ -> Set.empty (module Tn)
-  in
-  let maybe have lhs = if have then Set.singleton (module Tn) lhs else empty in
   let rec loop = function
     | Noop -> empty
-    | Seq (t1, t2) -> loop t1 + (loop t2 - assigned t1)
+    | Seq (t1, t2) -> loop t1 + loop t2
     | Block_comment (_, t) -> loop t
-    | Accum_binop { initialize_neutral; lhs; rhs1; rhs2; _ } ->
-        maybe (not initialize_neutral) lhs + single rhs1 + single rhs2
-    | Accum_unop { initialize_neutral; lhs; rhs; _ } ->
-        maybe (not initialize_neutral) lhs + single rhs
+    | Accum_unop { lhs; initialize_neutral; projections; _ }
+    | Accum_binop { lhs; initialize_neutral; projections; _ } ->
+        if is_total ~initialize_neutral ~projections:(Lazy.force projections) then
+          Set.singleton (module Tn) lhs
+        else empty
     | Fetch _ -> empty
-  and assigned = function
-    | Noop -> Set.empty (module Tn)
-    | Seq (t1, t2) -> assigned t1 + assigned t2
-    | Block_comment (_, t) -> assigned t
-    | Accum_binop { initialize_neutral; lhs; _ } -> maybe initialize_neutral lhs
-    | Accum_unop { initialize_neutral; lhs; _ } -> maybe initialize_neutral lhs
-    | Fetch { array; _ } -> Set.singleton (module Tn) array
   in
   loop asgns

-let sequential l = Option.value ~default:Noop @@ List.reduce l ~f:(fun st sts -> Seq (st, sts))
+(** Returns materialized nodes in the sense of {!Tnode.is_in_context_force}. NOTE: it ideally should
+    be called after compilation. *)
+let context_nodes asgns =
+  let open Utils.Set_O in
+  let empty = Set.empty (module Tn) in
+  let one tn = if Tnode.is_in_context_force tn 34 then Set.singleton (module Tn) tn else empty in
+  let of_node = function Node rhs -> one rhs | Merge_buffer _ -> empty in
+  let rec loop = function
+    | Noop -> empty
+    | Seq (t1, t2) -> loop t1 + loop t2
+    | Block_comment (_, t) -> loop t
+    | Accum_unop { lhs; rhs; _ } -> Set.union (one lhs) (of_node rhs)
+    | Accum_binop { lhs; rhs1; rhs2; _ } ->
+        Set.union_list (module Tn) [ one lhs; of_node rhs1; of_node rhs2 ]
+    | Fetch { array; _ } -> one array
+  in
+  loop asgns
+
+let sequential l =
+  Option.value ~default:Noop @@ List.reduce l ~f:(fun sts another_st -> Seq (sts, another_st))
+
+let sequence l =
+  Option.value ~default:{ asgns = Noop; embedded_nodes = Set.empty (module Tn) }
+  @@ List.reduce l
+       ~f:(fun
+            { asgns = sts; embedded_nodes = embs } { asgns = another_st; embedded_nodes = emb } ->
+          { asgns = Seq (sts, another_st); embedded_nodes = Set.union embs emb })

 let%diagn1_sexp to_low_level code =
   let open Indexing in
@@ -145,7 +171,6 @@ let%diagn1_sexp to_low_level code =
           derive_index ~product_syms:projections.product_iterators
             ~projection:projections.project_rhs.(1)
       in
-      let is_assignment = initialize_neutral && Indexing.is_bijective projections in
       let basecase rev_iters =
         let product = Array.of_list_rev_map rev_iters ~f:(fun s -> Indexing.Iterator s) in
         let rhs1_idcs = rhs1_idx ~product in
@@ -156,7 +181,7 @@
        let rhs1_ll = get rhs1 rhs1_idcs in
        let rhs2_ll = get rhs2 rhs2_idcs in
        let rhs2 = binop ~op ~rhs1:rhs1_ll ~rhs2:rhs2_ll in
-        if is_assignment then set lhs lhs_idcs rhs2
+        if is_total ~initialize_neutral ~projections then set lhs lhs_idcs rhs2
        else set lhs lhs_idcs @@ binop ~op:accum ~rhs1:lhs_ll ~rhs2
       in
       let rec for_loop rev_iters = function
@@ -178,7 +203,7 @@
            [%log "projections=", (projections : projections)];
            raise e
       in
-      if initialize_neutral && not is_assignment then
+      if initialize_neutral && not (is_total ~initialize_neutral ~projections) then
        let dims = lazy projections.lhs_dims in
        let fetch_op = Constant (Ops.neutral_elem accum) in
        Low_level.Seq (loop (Fetch { array = lhs; fetch_op; dims }), for_loops)
@@ -193,15 +218,14 @@
          derive_index ~product_syms:projections.product_iterators
            ~projection:projections.project_rhs.(0)
       in
-      let is_assignment = initialize_neutral && Indexing.is_bijective projections in
       let basecase rev_iters =
        let product = Array.of_list_rev_map rev_iters ~f:(fun s -> Indexing.Iterator s) in
        let lhs_idcs = lhs_idx ~product in
        let open Low_level in
        let lhs_ll = get (Node lhs) lhs_idcs in
        let rhs_ll = get rhs @@ rhs_idx ~product in
        let rhs2 = unop ~op ~rhs:rhs_ll in
-        if is_assignment then set lhs lhs_idcs rhs2
+        if is_total ~initialize_neutral ~projections then set lhs lhs_idcs rhs2
        else set lhs lhs_idcs @@ binop ~op:accum ~rhs1:lhs_ll ~rhs2
       in
       let rec for_loop rev_iters = function
@@ -218,7 +242,7 @@
          }
       in
       let for_loops = for_loop [] (Array.to_list projections.product_space) in
-      if initialize_neutral && not is_assignment then
+      if initialize_neutral && not (is_total ~initialize_neutral ~projections) then
        let dims = lazy projections.lhs_dims in
        let fetch_op = Constant (Ops.neutral_elem accum) in
        Low_level.Seq (loop (Fetch { array = lhs; fetch_op; dims }), for_loops)
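
A minimal sketch of how the new pieces compose (assuming two existing assignment values `fwd : t` and `bprop : t`; the caller remains responsible for the embedded-node sets, since per the doc comment above `arrayjit` neither produces nor verifies them):

```ocaml
(* Wrap plain assignments into computations; [to_comp] records no embedded nodes. *)
let fwd_comp = to_comp fwd
let bprop_comp = to_comp bprop

(* [sequence] is to [comp] what [sequential] is to [t]: it sequences the
   assignments and takes the union of the embedded-node sets. *)
let step : comp = sequence [ fwd_comp; bprop_comp ]
```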

arrayjit/lib/backends.ml

Lines changed: 19 additions & 14 deletions
@@ -35,7 +35,7 @@ module type No_device_backend = sig
   val expected_merge_node : code -> Tnode.t option
   val expected_merge_nodes : code_batch -> Tnode.t option array

-  val compile : ?shared:bool -> ?name:string -> Indexing.unit_bindings -> Assignments.t -> code
+  val compile : ?shared:bool -> ?name:string -> Indexing.unit_bindings -> Assignments.comp -> code
   (** If [~shared:true] (default [false]), the backend should prefer to do more compile work in a
       device-agnostic way. If [~shared:false], the backend can opt to postpone compiling altogether
       until [link] is called, to benefit from more optimizations. *)
@@ -45,7 +45,7 @@ module type No_device_backend = sig
     ?names:string array ->
     ?occupancy:(name:string -> src_n:int -> bool) ->
     Indexing.unit_bindings ->
-    Assignments.t array ->
+    Assignments.comp array ->
     code_batch
   (** Unlike the [~shared] parameter, [compile_batch] vs. [compile] is mostly about improving the
       compile time and debugging convenience by generating fewer files -- ideally does not affect
@@ -871,18 +871,20 @@ module Simple_no_device_backend (Backend : Simple_backend) : No_device_backend =
     | Compiled (lowereds, _) ->
         Array.filter_map lowereds ~f:(Option.map ~f:(fun l -> l.Low_level.traced_store))

-  let compile ?(shared = false) ?name bindings asgns : code =
-    let name, lowered = lower_assignments ?name bindings asgns in
+  let compile ?(shared = false) ?name bindings comp : code =
+    let name, lowered = lower_assignments ?name bindings comp.Assignments.asgns in
     if shared then Compiled (lowered, Backend.compile ~name ~opt_ctx_arrays:None bindings lowered)
     else Postponed { lowered; bindings; name }

-  let compile_batch ?(shared = false) ?names ?occupancy bindings asgns_l : code_batch =
-    let names, lowereds = lower_batch_assignments ?names ?occupancy bindings asgns_l in
+  let compile_batch ?(shared = false) ?names ?occupancy bindings comp_l : code_batch =
+    let names, lowereds =
+      lower_batch_assignments ?names ?occupancy bindings
+      @@ Array.map comp_l ~f:(fun c -> c.Assignments.asgns)
+    in
     if shared then Compiled (lowereds, compile_batch ~names ~opt_ctx_arrays:None bindings lowereds)
     else Postponed { lowereds; bindings; names }

-  let link ~from_prior_context ~merge_buffer (prior_context : context)
-      (code : code) =
+  let link ~from_prior_context ~merge_buffer (prior_context : context) (code : code) =
     Backend.(
       verify_prior_context ~ctx_arrays ~is_in_context ~prior_context ~from_prior_context
         [| get_traced_store code |]);
@@ -897,8 +899,8 @@ module Simple_no_device_backend (Backend : Simple_backend) : No_device_backend =
     in
     { context; schedule; bindings; name }

-  let link_batch ~from_prior_context ~merge_buffer
-      (prior_context : context) (code_batch : code_batch) =
+  let link_batch ~from_prior_context ~merge_buffer (prior_context : context)
+      (code_batch : code_batch) =
     Backend.(
       verify_prior_context ~ctx_arrays ~is_in_context ~prior_context ~from_prior_context
       @@ get_traced_stores code_batch);
@@ -963,16 +965,19 @@ module Cuda_backend : Backend = struct
   let work_for context = work_for context.ctx
   let will_wait_for context = will_wait_for context.ctx

-  let compile ?shared:_ ?name bindings asgns : code =
-    let name, lowered = lower_assignments ?name bindings asgns in
+  let compile ?shared:_ ?name bindings comp : code =
+    let name, lowered = lower_assignments ?name bindings comp.Assignments.asgns in
     {
       traced_store = lowered.traced_store;
       code = compile ~name bindings lowered;
       expected_merge_node = lowered.Low_level.merge_node;
     }

-  let compile_batch ?shared:_ ?names ?occupancy bindings asgns_l =
-    let names, lowereds = lower_batch_assignments ?names ?occupancy bindings asgns_l in
+  let compile_batch ?shared:_ ?names ?occupancy bindings comp_l =
+    let names, lowereds =
+      lower_batch_assignments ?names ?occupancy bindings
+      @@ Array.map comp_l ~f:(fun c -> c.Assignments.asgns)
+    in
     {
       traced_stores =
         Array.filter_map lowereds ~f:(Option.map ~f:(fun l -> l.Low_level.traced_store));
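
A hedged sketch of the updated calling convention (assuming some `Backend : No_device_backend`, `bindings : Indexing.unit_bindings`, and plain assignments `asgns : Assignments.t`; the binding names are illustrative): callers now pass an `Assignments.comp`, and the backend itself projects out `.asgns` before lowering, as in the diffs above:

```ocaml
(* Single computation: wrap the assignments, then compile. *)
let code = Backend.compile ~shared:true bindings (Assignments.to_comp asgns)

(* Batched variant: each array element is a [comp]. *)
let code_batch = Backend.compile_batch bindings [| Assignments.to_comp asgns |]
```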

arrayjit/lib/cc_backend.ml

Lines changed: 1 addition & 7 deletions
@@ -67,13 +67,7 @@ type procedure = {
 [@@deriving sexp_of]

 let expected_merge_node proc = proc.lowered.merge_node
-
-let is_in_context node =
-  Tnode.default_to_most_local node.Low_level.tn 33;
-  match node.tn.memory_mode with
-  | Some (Hosted (Constant | Volatile), _) -> false
-  | Some ((Virtual | Local), _) -> false
-  | _ -> true
+let is_in_context node = Tnode.is_in_context_force node.Low_level.tn 33

 let header_sep =
   let open Re in

arrayjit/lib/tnode.ml

Lines changed: 7 additions & 0 deletions
@@ -156,6 +156,13 @@ let is_materialized_force tn provenance =
   | Some ((On_device | Hosted _ | Materialized), _) -> true
   | Some ((Never_virtual | Device_only | Effectively_constant), _) -> assert false

+let is_in_context_force tn provenance =
+  default_to_most_local tn provenance;
+  match tn.memory_mode with
+  | Some (Hosted (Constant | Volatile), _) -> false
+  | Some ((Virtual | Local), _) -> false
+  | _ -> true
+
 let known_not_materialized tn =
   match tn.memory_mode with Some ((Virtual | Local), _) -> true | _ -> false

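
For illustration (the predicate is the one added above; the wrapper name and the provenance value 42 are hypothetical): `is_in_context_force` first forces a concrete memory mode via `default_to_most_local`, then reports whether the node should be backed by a per-context array, excluding hosted constants/volatiles and virtual/local nodes:

```ocaml
(* Hypothetical helper; the integer appears to be a provenance tag recording which
   call site forced the memory-mode decision (this commit's call sites use 33 and 34). *)
let needs_context_array tn = is_in_context_force tn 42
```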

arrayjit/lib/writing_a_backend.md

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ TODO: update regarding events and device-to-device synchronization.

 ## Design around compiling and running code, backend interfaces

-Currently, OCANNL integrates new backends via code in [Backends](backends.ml), so it's the "sink" of backend module dependencies; [Backend_utils](backend_utils.ml) is the "source". `Backend_utils.Types` introduces the context-specific `routine` type, for code executable on a backend. The interface `Backends.No_device_backend` has `compile` functions that take `Assignments.t` as input, to allow full flexibility in backend implementations. There is a helper `Backends.lower_assignments` that wraps `Assignments.lower` and `Low_level.optimize_proc`, since currently all backends use the optimized C-like representation `Low_level.t`. The user-facing interface `Backends.Backend` builds on top of `No_device_backend` providing multi-device functionality. The functor `Multicore_backend` converts a `No_device_backend` targetting the CPU into a `Backend` whose devices are parallel threads (and ultimately the CPU cores).
+Currently, OCANNL integrates new backends via code in [Backends](backends.ml), so it's the "sink" of backend module dependencies; [Backend_utils](backend_utils.ml) is the "source". `Backend_utils.Types` introduces the context-specific `routine` type, for code executable on a backend. The interface `Backends.No_device_backend` has `compile` functions that take `Assignments.comp` as input, to allow full flexibility in backend implementations. There is a helper `Backends.lower_assignments` that wraps `Assignments.lower` and `Low_level.optimize_proc`, since currently all backends use the optimized C-like representation `Low_level.t`. The user-facing interface `Backends.Backend` builds on top of `No_device_backend` providing multi-device functionality. The functor `Multicore_backend` converts a `No_device_backend` targetting the CPU into a `Backend` whose devices are parallel threads (and ultimately the CPU cores).

 ```ocaml
 type lowered_bindings = (static_symbol, int ref) List.Assoc.t (* in indexing.ml *)

bin/micrograd_demo.ml

Lines changed: 10 additions & 3 deletions
@@ -1,6 +1,7 @@
 open Base
 open Ocannl
 module Tn = Arrayjit.Tnode
+module Asgns = Arrayjit.Assignments
 module IDX = Train.IDX
 module TDSL = Operation.TDSL
 module NTDSL = Operation.NTDSL
@@ -81,7 +82,9 @@ let experiment seed ~no_batch_shape_inference ~use_builtin_weight_decay () =
   let module Backend = (val Arrayjit.Backends.fresh_backend ()) in
   let device = Backend.(new_virtual_device @@ get_device ~ordinal:0) in
   let ctx = Backend.init device in
-  let routine = Train.to_routine (module Backend) ctx bindings (Seq (update.fwd_bprop, sgd)) in
+  let routine =
+    Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update.fwd_bprop; sgd ])
+  in
   Train.all_host_to_device (module Backend) routine.context scalar_loss;
   Train.all_host_to_device (module Backend) routine.context learning_rate;
   (* Stdio.print_endline "\n******** scalar_loss **********"; Tensor.print_tree ~with_id:true
@@ -122,8 +125,12 @@
   Train.set_on_host Changed_on_devices mlp_result.value;
   (* By using jitted.context here, we don't need to copy the parameters back to the host. *)
   let result_routine =
-    Train.to_routine (module Backend) routine.context IDX.empty
-    @@ Block_comment ("moons infer", mlp_result.forward)
+    Train.to_routine
+      (module Backend)
+      routine.context IDX.empty
+      [%cd
+        ~~("moons" "infer";
+           mlp_result.forward)]
   in
   Stdio.print_endline "\n******** mlp_result **********";
   Tensor.print_tree ~with_id:true ~with_grad:false ~depth:9 mlp_result;

bin/moons_demo.ml

Lines changed: 10 additions & 3 deletions
@@ -6,6 +6,7 @@ module TDSL = Operation.TDSL
 module NTDSL = Operation.NTDSL
 module CDSL = Train.CDSL
 module Utils = Arrayjit.Utils
+module Asgns = Arrayjit.Assignments
 module Rand = Arrayjit.Rand.Lib
 module Debug_runtime = Utils.Debug_runtime

@@ -59,7 +60,9 @@ let demo () =
   let module Backend = (val Arrayjit.Backends.fresh_backend ~backend_name:"cuda" ()) in
   let device = Backend.(new_virtual_device @@ get_device ~ordinal:0) in
   let ctx = Backend.init device in
-  let routine = Train.to_routine (module Backend) ctx bindings (Seq (update.fwd_bprop, sgd)) in
+  let routine =
+    Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update.fwd_bprop; sgd ])
+  in

   let points = Tensor.value_2d_points ~xdim:0 ~ydim:1 moons_flat in
   let classes = Tensor.value_1d_points ~xdim:0 moons_classes in
@@ -102,8 +105,12 @@
   let%op mlp_result = mlp "point" in
   Train.set_on_host Changed_on_devices mlp_result.value;
   let result_routine =
-    Train.to_routine (module Backend) routine.context IDX.empty
-    @@ Block_comment ("moons infer", mlp_result.forward)
+    Train.to_routine
+      (module Backend)
+      routine.context IDX.empty
+      [%cd
+        ~~("moons" "infer";
+           mlp_result.forward)]
   in
   let callback (x, y) =
     Tensor.set_values point [| x; y |];

bin/zero2hero_1of7.ml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ let _suspended () =
   Stdio.printf "\n%!";
   Tensor.print_tree ~with_id:true ~with_grad:true ~depth:9 v;
   Stdlib.Format.printf "\nHigh-level code:\n%!";
-  Stdlib.Format.printf "%a\n%!" (Arrayjit.Assignments.fprint_hum ()) code.fwd_bprop
+  Stdlib.Format.printf "%a\n%!" (Arrayjit.Assignments.fprint_hum ()) code.fwd_bprop.asgns

 let _suspended () =
   Rand.init 0;
