
Commit 7510e76

Fix: host_read_by_devices -> devices_not_lagging_host with the corresponding change of semantics
This fixes changes being overwritten by transferring a stale version from the host: previously, a device that had itself computed (or scheduled computing) the latest value of a node was never tagged, so an automatic `from_host` could clobber the freshly computed data with the stale host copy.
1 parent: 38eebf6

File tree: 3 files changed, +20 −18 lines

arrayjit/lib/anatomy_of_a_backend.md

Lines changed: 5 additions & 5 deletions
@@ -204,22 +204,22 @@ Unless disabled via setting `automatic_host_transfers` to false, `arrayjit` auto
 
 - `prepare_read` for synchronization and `to_host` transfers right before a host array is read,
 - `prepare_write` for synchronization right before a host array is written to,
-- `host_read_by_devices` for tracking which devices have scheduled transferring the data already.
+- `devices_not_lagging_host` for tracking which devices have scheduled transferring the data already, or don't need transferring because they computed or scheduled computing the data themselves.
 
-Since currently the tagging is per-device, for per-stream tensor nodes might need supplementary `from_host` (or `device_to_device`) calls in rare situations.
+Since currently the tagging is per-device, for per-stream, tensor nodes might need supplementary `from_host` (or `device_to_device`) calls in rare situations.
 
 There are three code components to the automation.
 
 - Within `Tnode`:
-  - The helper function `do_read` unconditionally invokes synchronization code, and if `automatic_host_transfers` invokes data transfer code, as stored in the `prepare_read` field of a node; then clears the field.
+  - The helper function `do_read` unconditionally invokes synchronization code, and if `automatic_host_transfers` it invokes data transfer code, as stored in the `prepare_read` field of a node; then clears the field.
   - The helper function `do_write` unconditionally invokes synchronization code as stored in the `prepare_write` field of a node, then clears the field.
   - `do_read` is invoked from `points_1d`, `points_2d`, `get_value`, `get_values` of `Tnode`; and also from `to_dag` and `print` of `Tensor`.
   - `do_write` is invoked from `set_value`, `set_values`.
   - `Tnode` exposes `prepare_read` and `prepare_write` for updating the fields: only the new data transfer is preserved, but the synchronization codes are combined.
 - Within `Backends.Add_buffer_retrieval_and_syncing`:
   - The `update_writer_event` helper adds the after-modification event to synchronization and sets data transfer to `to_host` from the stream, using `prepare_read`. This happens for `device_to_device` and `sync_routine` (after scheduling the routine) scheduling calls, and independently of `automatic_host_transfers`.
-  - Moreover, `sync_routine`, before scheduling the routine and only if `automatic_host_transfers`, directly schedules `from_host` for input nodes that are not tagged with the device (via `host_read_by_devices`). Note that input nodes are the "read only" and "read before write" nodes that are not constants.
+  - Moreover, `sync_routine`, before scheduling the routine and only if `automatic_host_transfers`, directly schedules `from_host` for input nodes that are not tagged with the device (via `devices_not_lagging_host`). Note that input nodes are the "read only" and "read before write" nodes that are not constants.
 - Within `Backends.Raise_backend.alloc_if_needed`:
-  - If `automatic_host_transfers` and the node allocated for the context is a constant, `alloc_if_needed` directly schedules `from_host` for the node regardless of whether it is tagged with the device (via `host_read_by_devices`); it does add the device tag to the node (if missing).
+  - If `automatic_host_transfers` and the node allocated for the context is a constant, `alloc_if_needed` directly schedules `from_host` for the node regardless of whether it is tagged with the device (via `devices_not_lagging_host`); it does add the device tag to the node (if missing).
 
 **Note:** we do **not** invoke `Tnode.do_read` from within `Backends.Add_buffer_retrieval_and_syncing.from_host`, since to adequately handle such transfers one should deliberately use `device_to_device` functions. This can lead to confusing behavior, in particular observing (or not) a tensor node (on host) can change later computations by inserting (or not) an additional `to_host` before a `from_host`. This aspect of the design might change in the future.
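
To make the `Tnode` helpers described above concrete, here is a minimal sketch of `do_read` and `do_write`, consistent with this documentation and with the tnode.ml hunk further down. The `sync` field and the `do_write` body match the diff; the `transfer` field name and the exact shape of the `prepare` record are assumptions for illustration:

```ocaml
(* Sketch only: the real [prepare] record lives in tnode.ml and may differ.
   [sync] waits on pending events; [transfer] schedules the host transfer. *)
type prepare = { sync : unit -> unit; transfer : unit -> unit }

(* Before a host array is read: always synchronize, but copy device -> host
   only under [automatic_host_transfers]; then clear the field. *)
let do_read tn =
  Option.iter tn.prepare_read ~f:(fun p ->
      p.sync ();
      if Utils.settings.automatic_host_transfers then p.transfer ());
  tn.prepare_read <- None

(* Before a host array is written: synchronize, clear the field, and mark
   every device as lagging the host by emptying the tag set. *)
let do_write tn =
  Option.iter ~f:(fun p -> p.sync ()) tn.prepare_write;
  tn.prepare_write <- None;
  Hash_set.clear tn.devices_not_lagging_host
```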

arrayjit/lib/backends.ml

Lines changed: 3 additions & 4 deletions
@@ -81,6 +81,7 @@ module Add_buffer_retrieval_and_syncing (Backend : No_buffer_retrieval_or_syncin
         Hashtbl.update s.device.shared_writer_streams tn ~f:(fun l ->
             (s, e) :: Option.value ~default:[] l)
       else Hashtbl.remove s.device.shared_writer_streams tn;
+      Hash_set.add tn.devices_not_lagging_host ctx.stream.device.device_id;
       Hashtbl.update s.updating_for tn ~f:(fun _ -> e)
   | Merge_buffer tn ->
       (* Note: the previous event does not need to be done! *)
@@ -94,7 +95,6 @@ module Add_buffer_retrieval_and_syncing (Backend : No_buffer_retrieval_or_syncin
         (* Stdio.printf "copying: %s from_host\n" (Tn.debug_name tn); *)
         Backend.from_host ~dst_ptr:dst ~dst:ctx hosted;
         update_writer_event ~from:`Host ctx @@ Node tn;
-        Hash_set.add tn.host_read_by_devices ctx.stream.device.device_id;
         true
     | _ -> false
 
@@ -109,7 +109,6 @@ module Add_buffer_retrieval_and_syncing (Backend : No_buffer_retrieval_or_syncin
         (* Stdio.printf "copying: %s from_host\n" (Tn.debug_name tn); *)
         Backend.from_host ~dst_ptr:dst ~dst:ctx hosted;
         update_writer_event ~from:`Host ctx @@ Node tn;
-        Hash_set.add tn.host_read_by_devices ctx.stream.device.device_id;
         { ctx with ctx_arrays = Map.add_exn ctx.ctx_arrays ~key:tn ~data:dst }
     | _, Some _ ->
         raise
@@ -211,7 +210,7 @@ module Add_buffer_retrieval_and_syncing (Backend : No_buffer_retrieval_or_syncin
     assert (Domain.is_main_domain ());
     if Utils.settings.automatic_host_transfers then
       Set.iter hosted_inputs ~f:(fun tn ->
-          if not (Hash_set.mem tn.host_read_by_devices s.device.device_id) then
+          if not (Hash_set.mem tn.devices_not_lagging_host s.device.device_id) then
             assert (from_host r.context tn));
     Set.iter r.inputs ~f:(fun tn ->
         if Tn.potentially_cross_stream tn then
@@ -480,7 +479,7 @@ module Raise_backend (Device : Lowered_backend) : Backend = struct
           match key.array with
          | (lazy (Some hosted)) ->
              Device.from_host ~dst_ptr ~dst:parent_context hosted;
-              Hash_set.add key.host_read_by_devices stream.device.device_id
+              Hash_set.add key.devices_not_lagging_host stream.device.device_id
          | _ -> ());
      dst_ptr
    in
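
The hunks above are the heart of the fix: `update_writer_event` now tags the device of any stream that writes `tn` (first hunk), whether by computing it or by transferring it, and `sync_routine` consults that tag before forcing a `from_host` (fourth hunk). Below is a self-contained model of the new semantics, runnable with Base; the type and helper names are illustrative, not the arrayjit API:

```ocaml
open Base

(* Toy stand-in for a tensor node; mirrors the field from tnode.ml. *)
type toy_node = { devices_not_lagging_host : Hash_set.M(Int).t }

let create () = { devices_not_lagging_host = Hash_set.create (module Int) }

(* update_writer_event analogue: a device that computed (or scheduled
   computing) the node is, by definition, not lagging the host. *)
let computed_on tn ~device_id = Hash_set.add tn.devices_not_lagging_host device_id

(* sync_routine analogue: schedule from_host only for lagging devices. *)
let needs_from_host tn ~device_id =
  not (Hash_set.mem tn.devices_not_lagging_host device_id)

(* do_write analogue: a host-side write invalidates every device tag. *)
let host_write tn = Hash_set.clear tn.devices_not_lagging_host

let () =
  let tn = create () in
  computed_on tn ~device_id:0;
  (* The fix: device 0 just computed tn, so no from_host is scheduled that
     would overwrite the fresh data with the stale host copy. Under the old
     host_read_by_devices semantics, only from_host itself tagged the
     device, so the stale transfer fired here. *)
  assert (not (needs_from_host tn ~device_id:0));
  host_write tn;
  (* After the host copy changes, every device must re-fetch. *)
  assert (needs_from_host tn ~device_id:0)
```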

arrayjit/lib/tnode.ml

Lines changed: 12 additions & 9 deletions
@@ -87,8 +87,10 @@ type t = {
   mutable code_name : string option;
   mutable prepare_read : prepare option;
   mutable prepare_write : prepare option;
-  mutable host_read_by_devices : Hash_set.M(Int).t;
-      (** The unique ids of devices that read the most recent modification of the host array. *)
+  mutable devices_not_lagging_host : Hash_set.M(Int).t;
+      (** The unique ids of devices that either read the most recent modification of the host
+          buffer, or computed the most recent modification of the node themselves, whether or not it
+          has been transferred to the host yet. *)
 }
 [@@deriving sexp_of]
 
@@ -458,7 +460,7 @@ let has a = match a.array with (lazy (Some _)) -> true | _ -> false
 
 let dims_to_string ?(with_axis_numbers = false) arr =
   let dims_s =
-    if Lazy.is_val arr.dims then
+    if Lazy.is_val arr.dims then
       let padding = Option.map ~f:fst (Lazy.force arr.padding) in
       Nd.int_dims_to_string ~with_axis_numbers ?padding @@ Lazy.force arr.dims
     else "<not-in-yet>"
@@ -576,7 +578,7 @@ let create ?default_prec ~id ~label ~dims ~padding () =
       code_name = None;
       prepare_read = None;
       prepare_write = None;
-      host_read_by_devices = Hash_set.create (module Int);
+      devices_not_lagging_host = Hash_set.create (module Int);
     }
   in
   (* Note: if tensor nodes get non-trivial finalizers, remember to either add an is_finalized flag
@@ -604,7 +606,7 @@ let create_from_padded ~id ~label ~ndarray ~padding () =
       code_name = None;
       prepare_read = None;
       prepare_write = None;
-      host_read_by_devices = Hash_set.create (module Int);
+      devices_not_lagging_host = Hash_set.create (module Int);
     }
   in
   Registry.add registry tn;
@@ -678,7 +680,7 @@ let create_with_reshape ~id ~label ~base_ndarray ~dims ~padding ~from_padded ()
       code_name = None;
       prepare_read = None;
       prepare_write = None;
-      host_read_by_devices = Hash_set.create (module Int);
+      devices_not_lagging_host = Hash_set.create (module Int);
     }
   in
   Registry.add registry tn;
@@ -703,7 +705,7 @@ let find =
       code_name = None;
       prepare_read = None;
       prepare_write = None;
-      host_read_by_devices = Hash_set.create (module Int);
+      devices_not_lagging_host = Hash_set.create (module Int);
     }
   in
   fun ~id -> Registry.find_opt registry { mock with id }
@@ -721,7 +723,7 @@ let do_read tn =
 let do_write tn =
   Option.iter ~f:(fun p -> p.sync ()) tn.prepare_write;
   tn.prepare_write <- None;
-  Hash_set.clear tn.host_read_by_devices
+  Hash_set.clear tn.devices_not_lagging_host
 
 let points_1d ?from_axis ~xdim tn =
   do_read tn;
@@ -732,7 +734,8 @@ let points_1d ?from_axis ~xdim tn =
 let points_2d ?from_axis ~xdim ~ydim tn =
   do_read tn;
   let padding = Option.map ~f:fst (Lazy.force tn.padding) in
-  Option.value_map ~default:[||] ~f:(fun arr -> Nd.retrieve_2d_points ?from_axis ?padding ~xdim ~ydim arr)
+  Option.value_map ~default:[||] ~f:(fun arr ->
+      Nd.retrieve_2d_points ?from_axis ?padding ~xdim ~ydim arr)
   @@ Lazy.force tn.array
 
 let set_value tn =
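
One more piece worth illustrating: the documentation above says that when `prepare_read` is updated, "only the new data transfer is preserved, but the synchronization codes are combined". A hedged sketch of such a combinator, reusing the hypothetical `prepare` record from the first sketch; the function name and signature are assumptions, not the actual `Tnode` interface:

```ocaml
(* Sketch: chain the old sync thunk before the new one, keep only the
   newest transfer thunk. *)
let update_prepare_read tn ~sync ~transfer =
  let sync =
    match tn.prepare_read with
    | None -> sync
    | Some old ->
        fun () ->
          old.sync ();
          sync ()
  in
  tn.prepare_read <- Some { sync; transfer }
```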
