Commit 112d458

Specification of device-to-device synchronization via API and doc-comments

1 parent 8ad8054 commit 112d458

File tree

3 files changed: +106 -31 lines changed

CHANGES.md

Lines changed: 1 addition & 2 deletions

@@ -2,8 +2,7 @@
 
 ### Added
 
-- TODO: (Virtual) device-to-device synchronization functionality.
-- TODO: lazy per-tensor-node synchronization functionality.
+- TODO: (Virtual) device-to-device synchronization functionality, with lazy per-tensor-node synchronization.
 
 ### Changed
 

arrayjit/lib/backends.ml

Lines changed: 102 additions & 29 deletions
@@ -98,39 +98,64 @@ module type Backend = sig
   (** Returns the routines for the procedures included in the code batch. The returned context is
       downstream of all the returned routines. *)
 
-  val from_host : context -> Tnode.t -> bool
-  (** If the array is both hosted and in-context, schedules a copy from host to context and returns
-      true, otherwise returns false. NOTE: when run for a device, it's the caller's responsibility
-      to synchronize the device before the host's data is overwritten. *)
+  type event
+  (** An event tracks whether a device finished computing past a particular point in its schedule.
+      These values are used internally for scheduling across devices of the backend, and can also
+      be used for explicit scheduling. *)
 
-  val to_host : context -> Tnode.t -> bool
-  (** If the array is both hosted and in-context, schedules a copy from context to host and returns
-      true, otherwise returns false. NOTE: when run for a device, it's the caller's responsibility
-      to synchronize the device before the host's data is read. *)
+  val await_ev : event -> unit
+  (** Blocks till the event completes, if it's not done already. *)
 
-  val device_to_device :
-    Tnode.t -> into_merge_buffer:merge_buffer_use -> dst:context -> src:context -> bool
-  (** If the node is absent from the [src] context and either it is present in the [dst] context or
-      [~into_merge_buffer] is different from [No]: raises an error.
+  val is_done : event -> bool
+  (** Whether the event completed. *)
+
+  val work_for : context -> Tnode.t -> event option
+  (** If the tensor node is in the context, returns the event indicating whether the currently
+      running or scheduled computations modifying that node on the context's device have completed.
+
+      NOTE: if work tracking was not yet registered for [tn], [work_for ctx tn] registers work
+      tracking for [tn] and returns the event tracking all computations currently scheduled on
+      [ctx]'s device. *)
 
-      If [~into_merge_buffer:No]: If the node is present in the [dst] context, schedules a copy of
-      the tensor node from the device of [src] to the device of [dst] and returns true, otherwise
-      returns false.
+  val will_wait_for : context -> event -> unit
+  (** Schedules waiting for the given event on the context's device.
 
-      If [~into_merge_buffer] is different from [No]: schedules the following task and returns true.
+      NOTE: it should rarely be necessary to call [will_wait_for] explicitly, because it is
+      typically called internally when needed. The one exception is {!device_to_device} with
+      [into_merge_buffer=Streaming]. *)
 
-      The merge-buffer task sets on [dst] the merge buffer source to the given node. If
-      [~into_merge_buffer:Streaming], remembers the buffer pointer of the source node to use for
-      streaming, without blocking. If [~into_merge_buffer:Copy], copies from [src] to the merge
-      buffer of [dst]'s device.
+  val from_host : context -> Tnode.t -> bool
+  (** If the tensor node is both hosted and in-context, schedules a copy from host to context and
+      returns true, otherwise returns false. NOTE: it's the caller's responsibility to synchronize
+      the device (via [await ctx.device] or [await_ev (work_for ctx tn)]) before the host's data
+      is overwritten. *)
 
-      If the [dst] context resulted from a compilation with [Streaming] or [Copy] specific merge
-      buffer code, the [device_to_device] call should fail immediately if there's a mismatch with
-      [~into_merge_buffer].
+  val to_host : context -> Tnode.t -> bool
+  (** If the tensor node is both hosted and in-context, schedules a copy from context to host and
+      returns true, otherwise returns false. NOTE: it's the caller's responsibility to synchronize
+      the device (via [await ctx.device] or [await_ev (work_for ctx tn)]) before the host's data
+      is read. *)
 
-      NOTE: it's the caller's responsibility to synchronize the [src] device, if needed, {i before}
-      calling [device_to_device], and if [~into_merge_buffer:Streaming], the [dst] device
-      {i afterward}, before any computations on the [src] device overwrite the node. *)
+  val device_to_device :
+    Tnode.t -> into_merge_buffer:merge_buffer_use -> dst:context -> src:context -> bool
+  (** [device_to_device tn ~into_merge_buffer ~dst ~src] proceeds as follows:
+      - If the node is absent from the [src] context, and either it is present in the [dst]
+        context or [into_merge_buffer] is different from [No]: raises an error.
+      - If the node is absent from [dst] and [into_merge_buffer=No]: returns false.
+      - Executes [will_wait_for dst (work_for src tn)].
+      - If [into_merge_buffer=No]: schedules a copy of the tensor node from the device of [src]
+        to the device of [dst].
+      - If [into_merge_buffer] is different from [No]: sets on [dst] the merge buffer source to
+        the given node. If [into_merge_buffer=Streaming], remembers the buffer pointer of the
+        source node to use for streaming, without blocking. If [into_merge_buffer=Copy],
+        schedules copying from [src] to the merge buffer of [dst]'s device.
+      - If the [dst] context resulted from a compilation with [Streaming] or [Copy] specific
+        merge buffer code, the [device_to_device] call should fail immediately if there's a
+        mismatch with [into_merge_buffer].
+
+      NOTE: if [into_merge_buffer=Streaming], after scheduling the work on [dst] that uses the
+      merge buffer, but before scheduling work on [src] that modifies [tn], execute
+      [will_wait_for src (all_work (get_ctx_device dst))]. *)
 
   type physical_device
   type device
@@ -141,6 +166,10 @@ module type Backend = sig
   val await : device -> unit
   (** Blocks till the device becomes idle, i.e. synchronizes the device. *)
 
+  val all_work : device -> event
+  (** Returns the event indicating whether all currently running or scheduled computations on the
+      device have completed. *)
+
   val is_idle : device -> bool
   (** Whether the device is currently waiting for work. *)

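As a reading aid for the signature entries above, here is a minimal, fully synchronous sketch of the event API, in the spirit of the `Sync_backend` instantiation later in this commit. The `context` and `device` records, and the use of strings in place of `Tnode.t`, are hypothetical stand-ins, not the library's types:

```ocaml
(* Toy synchronous model of the event API: every event is already complete,
   because all "scheduled" work runs immediately on the main thread. *)
type event = unit

let await_ev () = ()   (* blocks till the event completes: trivially done *)
let is_done () = true  (* a synchronous event is always complete *)

type device = { ordinal : int }
type context = { device : device }

let work_for (_ctx : context) (_tn : string) : event option = Some ()
let will_wait_for (_ctx : context) (_ev : event) = ()
let all_work (_dev : device) : event = ()

let () =
  let ctx = { device = { ordinal = 0 } } in
  (* Caller-side synchronization before reading host data, as the [to_host]
     doc-comment prescribes: *)
  (match work_for ctx "tn1" with Some e -> await_ev e | None -> ());
  will_wait_for ctx (all_work ctx.device);
  assert (is_done (all_work ctx.device));
  print_endline "synchronized"
```

A real backend would back `event` with driver-level completion markers; the point here is only the call pattern `await_ev (work_for ctx tn)` versus the coarser `await device`.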
@@ -173,6 +202,25 @@ module Multicore_backend (Backend : No_device_backend) : Backend = struct
   let sexp_of_task_queue q =
     Sexp.(List [ Atom "task_queue_of_size"; Atom (Int.to_string @@ Queue.size q) ])
 
+  type event = Not_implemented_yet  (** TODO: NOT IMPLEMENTED YET *)
+
+  (** TODO: Blocks till the event completes, if it's not done already. *)
+  let await_ev Not_implemented_yet = ()
+
+  (** TODO: Whether the event completed. *)
+  let is_done Not_implemented_yet = true
+
+  (** TODO: If the tensor node is in the context, returns the event indicating whether the
+      currently running or scheduled computations modifying that node on the context's device
+      have completed.
+
+      NOTE: if work tracking was not yet registered for [tn], [work_for ctx tn] registers work
+      tracking for [tn] and returns the event tracking all computations currently scheduled on
+      [ctx]'s device. *)
+  let work_for _ctx _tn = Some Not_implemented_yet
+
+  (** TODO: Schedules waiting for the given event on the context's device. *)
+  let will_wait_for _ctx Not_implemented_yet = ()
+
   type device_state = {
     mutable keep_spinning : bool;
     mutable device_error : exn option;
@@ -222,6 +270,10 @@ module Multicore_backend (Backend : No_device_backend) : Backend = struct
         Option.iter d.device_error ~f:(fun e ->
             Exn.reraise e @@ name ^ " device " ^ Int.to_string device.ordinal))
 
+  (** TODO: Returns the event indicating whether all currently running or scheduled computations
+      on the device have completed. *)
+  let all_work _device = Not_implemented_yet
+
   let%track3_l_sexp schedule_task device task =
     assert (Domain.is_main_domain ());
     [%log_result "schedule_task", Tnode.describe task, "device", (device.ordinal : int)];
@@ -456,7 +508,7 @@ module Multicore_backend (Backend : No_device_backend) : Backend = struct
 
   let num_physical_devices () = Domain.recommended_domain_count () - 1
   let suggested_num_virtual_devices _device = 1
-  let devices = Array.create ~len:(num_physical_devices ()) None
+  let devices : physical_device option array = Array.create ~len:(num_physical_devices ()) None
 
   let%track2_sexp unsafe_cleanup () =
     assert (Domain.is_main_domain ());
@@ -497,8 +549,14 @@ let sync_suggested_num_virtual_devices = ref 1
 
 (** A minimalistic wrapper creating backends where all calls run synchronously on the main thread.
     There is only one physical device, but an arbitrary number of virtual devices. *)
-module Sync_backend (Backend : No_device_backend) (* : Backend *) = struct
+module Sync_backend (Backend : No_device_backend) : Backend = struct
   type buffer_ptr = Backend.buffer_ptr [@@deriving sexp_of]
+  type event = unit
+
+  let await_ev () = ()
+  let is_done () = true
+  let work_for _context _tn = Some ()
+  let will_wait_for _context () = ()
 
   type device = {
     subordinal : int;
@@ -516,10 +574,11 @@ module Sync_backend (Backend : No_device_backend) (* : Backend *) = struct
 
   let expected_merge_node (code : code) = Backend.expected_merge_node code
   let expected_merge_nodes (codes : code_batch) = Backend.expected_merge_nodes codes
+  let all_work _device = ()
   let is_idle _device = true
   let name = "sync " ^ Backend.name
   let await _device = ()
-  let global_run_no = ref 0
+  (* let global_run_no = ref 0 *)
 
   type context = { device : device; ctx : Backend.context; expected_merge_node : Tnode.t option }
   [@@deriving sexp_of]
@@ -934,6 +993,20 @@ module Cuda_backend : Backend = struct
             name;
           })) )
 
+  type event = Cudajit.Event.t
+
+  (* TODO: NOT IMPLEMENTED YET *)
+  let work_for _ctx _tn = Some (Cudajit.Event.create ())
+
+  let is_done event = Cudajit.Event.query event
+
+  (* TODO: NOT IMPLEMENTED YET *)
+  let will_wait_for _context _event = ()
+  (* Cudajit.Event.wait (get_ctx_device context.ctx).Cuda_backend.stream event *)
+
+  let await_ev event = Cudajit.Event.synchronize event
+
+  (* TODO: NOT IMPLEMENTED YET *)
+  let all_work _device = Cudajit.Event.create ()
+
   let init device = { ctx = init device; expected_merge_node = None }
   let get_ctx_device context = get_ctx_device context.ctx
   let finalize context = finalize context.ctx
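The `Streaming` ordering requirement from the `device_to_device` doc-comment can be made concrete with a toy model that uses completion flags in place of real driver events (a real backend would use e.g. CUDA events); all names below are illustrative, not the library's API:

```ocaml
(* Toy events: a mutable completion flag per synchronization point. *)
type event = { mutable fired : bool }

let record () = { fired = false }
let complete e = e.fired <- true
let is_done e = e.fired

let () =
  (* 1. device_to_device makes dst wait for src's pending writes to tn. *)
  let src_writes_tn = record () in
  complete src_writes_tn;          (* src finishes writing tn *)
  assert (is_done src_writes_tn);  (* dst may now read tn via src's buffer *)
  (* 2. dst's consumers stream from src's buffer; dst's work completes. *)
  let dst_all_work = record () in
  complete dst_all_work;
  (* 3. Only now may src schedule work that overwrites tn, per the NOTE:
     will_wait_for src (all_work (get_ctx_device dst)). *)
  assert (is_done dst_all_work);
  print_endline "streaming order respected"
```

The asymmetry is the point: with `Copy` the data is duplicated into `dst`'s merge buffer and `src` is free immediately, while with `Streaming` `src`'s buffer stays live until `dst`'s consumers finish, so the caller must insert the reverse wait.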

arrayjit/lib/writing_a_backend.md

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@
 
 <!-- /TOC -->
 
+NOTE: these are outdated.
+TODO: update regarding events and device-to-device synchronization.
+
 ## Design around compiling and running code, backend interfaces
 
Currently, OCANNL integrates new backends via code in [Backends](backends.ml), so it's the "sink" of backend module dependencies; [Backend_utils](backend_utils.ml) is the "source". `Backend_utils.Types` introduces the context-specific `routine` type, for code executable on a backend. The interface `Backends.No_device_backend` has `compile` functions that take `Assignments.t` as input, to allow full flexibility in backend implementations. There is a helper `Backends.lower_assignments` that wraps `Assignments.lower` and `Low_level.optimize_proc`, since currently all backends use the optimized C-like representation `Low_level.t`. The user-facing interface `Backends.Backend` builds on top of `No_device_backend`, providing multi-device functionality. The functor `Multicore_backend` converts a `No_device_backend` targeting the CPU into a `Backend` whose devices are parallel threads (and ultimately the CPU cores).
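The functor pattern described above can be sketched with toy signatures; these are simplified stand-ins for `No_device_backend` and `Backend`, not the real interfaces:

```ocaml
(* A device-less base backend: just runs code, no notion of devices. *)
module type No_device_backend = sig
  val name : string
  val run : string -> unit
end

(* The multi-device interface layered on top of it. *)
module type Backend = sig
  include No_device_backend
  type device
  val get_device : ordinal:int -> device
  val run_on : device -> string -> unit
end

(* The functor lifting a base backend into a multi-device one. *)
module Multicore (B : No_device_backend) : Backend = struct
  let name = "multicore " ^ B.name
  let run = B.run

  type device = { ordinal : int }

  let get_device ~ordinal = { ordinal }

  (* A real implementation would enqueue the task on the device's worker
     domain; this sketch runs it synchronously. *)
  let run_on _device code = B.run code
end

module Toy = Multicore (struct
  let name = "toy"
  let run code = print_endline ("running: " ^ code)
end)

let () =
  let d = Toy.get_device ~ordinal:0 in
  Toy.run_on d "kernel";
  print_endline Toy.name
```

This mirrors how `Multicore_backend` (and `Sync_backend`) wrap a CPU-targeting `No_device_backend` without that base backend knowing anything about devices or scheduling.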
