Skip to content

Commit 0f6feaf

Browse files
committed
In progress: factor out alloc_if_needed
1 parent c42347d commit 0f6feaf

File tree

10 files changed

+58
-65
lines changed

10 files changed

+58
-65
lines changed

arrayjit/lib/assignments.ml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,10 +79,10 @@ let is_total ~initialize_neutral ~projections =
7979

8080
(** Returns materialized nodes in the sense of {!Tnode.is_in_context_force}. NOTE: it ideally should
8181
be called after compilation. *)
82-
let context_nodes asgns =
82+
let context_nodes ~unified_memory asgns =
8383
let open Utils.Set_O in
8484
let empty = Set.empty (module Tn) in
85-
let one tn = if Tnode.is_in_context_force tn 34 then Set.singleton (module Tn) tn else empty in
85+
let one tn = if Tnode.is_in_context_force ~unified_memory tn 34 then Set.singleton (module Tn) tn else empty in
8686
let of_node = function Node rhs -> one rhs | Merge_buffer _ -> empty in
8787
let rec loop = function
8888
| Noop -> empty

arrayjit/lib/backend_impl.ml

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ end
133133
module type Backend_impl_common = sig
134134
include Buffer
135135

136-
val is_in_context : Low_level.traced_array -> bool
136+
val unified_memory : bool
137137
(** If true, the backend's memory is unified with host memory: nodes hosted as constants or
138138
139139
    volatiles need not be stored in the contexts linked with code that uses them. *)
@@ -305,3 +305,36 @@ struct
305305
let alloc_zero_init_array prec ~dims _stream = Backend.alloc_zero_init_array prec ~dims ()
306306
let free_buffer = Option.map Backend.free_buffer ~f:(fun memfree _stream ptr -> memfree () ptr)
307307
end
308+
309+
let%track3_sexp alloc_if_needed (type buffer_ptr) ~unified_memory ctx stream ~key ~data:node ctx_arrays =
310+
if Tnode.is_in_context_force ~unified_memory key 33 && not (Map.mem ctx_arrays key) then (
311+
[%log2 Tn.debug_name key, "read_only", (node.read_only : bool)];
312+
[%log3 (key : Tn.t)];
313+
let default () : buffer_ptr =
314+
set_ctx ctx;
315+
(* FIXME(review): [set_ctx] and [Cu.Deviceptr.mem_alloc] are CUDA-specific, but this now lives in backend-generic backend_impl.ml with a locally abstract [buffer_ptr] — pass the allocator in as an argument instead. *)
Cu.Deviceptr.mem_alloc ~size_in_bytes:(Tn.size_in_bytes key)
316+
in
317+
let add_new () = Map.add_exn ctx_arrays ~key ~data:(default ()) in
318+
let device = stream.device in
319+
if node.read_only then
320+
if Tn.known_non_cross_stream key then add_new ()
321+
else (
322+
if Hashtbl.mem device.cross_stream_candidates key then
323+
Tn.update_memory_sharing key Tn.Shared_cross_stream 40;
324+
let data = Hashtbl.find_or_add device.cross_stream_candidates key ~default in
325+
Map.add_exn ctx_arrays ~key ~data)
326+
else if Tn.known_shared_cross_stream key then (
327+
if Hashtbl.mem device.owner_streams key then
328+
if not (stream.stream_id = Hashtbl.find_exn device.owner_streams key) then
329+
raise
330+
@@ Utils.User_error
331+
("Cuda_backend.alloc_if_needed: node " ^ Tn.debug_name key
332+
^ " assumed to be cross-stream-shared but then written to on multiple devices")
333+
else Hashtbl.add_exn device.owner_streams ~key ~data:stream.stream_id;
334+
let data = Hashtbl.find_exn device.cross_stream_candidates key in
335+
Map.add_exn ctx_arrays ~key ~data)
336+
else (
337+
Tn.update_memory_sharing key Tn.Per_stream 41;
338+
Hashtbl.remove device.cross_stream_candidates key;
339+
add_new ()))
340+
else ctx_arrays

arrayjit/lib/backends.ml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -115,18 +115,20 @@ let lower_batch_assignments ?names ?occupancy bindings asgns_l =
115115
Some (Assignments.lower ~unoptim_ll_source ~ll_source ~cd_source ~name bound asgns) )
116116
else (None, None))
117117

118-
let verify_prior_context ~is_in_context ~ctx_arrays ~from_prior_context traced_stores =
118+
let verify_prior_context ~unified_memory ~ctx_arrays ~from_prior_context traced_stores =
119119
Set.iter from_prior_context ~f:(fun tn ->
120120
let node = Array.find_map traced_stores ~f:(fun store -> Hashtbl.find store tn) in
121121
if
122122
Option.value_map node ~default:false ~f:(fun node ->
123-
is_in_context node && not (Option.is_some @@ Map.find ctx_arrays tn))
123+
Tnode.is_in_context_force ~unified_memory node.Low_level.tn 33 && not (Option.is_some @@ Map.find ctx_arrays tn))
124124
then raise @@ Utils.User_error ("The linked context lacks node " ^ Tnode.debug_name tn))
125125

126126
let from_prior_context_batch comps =
127127
Array.filter_map comps ~f:(fun comp ->
128128
Option.map comp ~f:(fun comp ->
129-
Set.diff (Assignments.context_nodes comp.Assignments.asgns) comp.embedded_nodes))
129+
Set.diff
130+
(Assignments.context_nodes ~unified_memory comp.Assignments.asgns)
131+
comp.embedded_nodes))
130132
|> Array.fold ~init:(Set.empty (module Tnode)) ~f:Set.union
131133

132134
(** Adds a scheduler and brings a lowered no-device backend on par with lowered device backends. *)
@@ -296,7 +298,7 @@ module Raise_backend (Device : Lowered_backend) : Backend = struct
296298
}
297299

298300
let link context (code : code) =
299-
verify_prior_context ~is_in_context ~ctx_arrays:context.ctx_arrays
301+
verify_prior_context ~unified_memory ~ctx_arrays:context.ctx_arrays
300302
~from_prior_context:code.from_prior_context [| code.lowered.traced_store |];
301303
let inputs, outputs = Low_level.input_and_output_nodes code.lowered in
302304
let ctx_arrays, bindings, schedule = link context code.code in
@@ -310,7 +312,7 @@ module Raise_backend (Device : Lowered_backend) : Backend = struct
310312
{ context; schedule; bindings; name = code.name; inputs; outputs }
311313

312314
let link_batch context code_batch =
313-
verify_prior_context ~is_in_context ~ctx_arrays:context.ctx_arrays
315+
verify_prior_context ~unified_memory ~ctx_arrays:context.ctx_arrays
314316
~from_prior_context:code_batch.from_prior_context
315317
@@ Array.filter_map code_batch.lowereds ~f:(Option.map ~f:(fun l -> l.Low_level.traced_store));
316318
let _ctx_arrays, bindings, schedules = link_batch context code_batch.code_batch in

arrayjit/lib/c_syntax.ml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ module C_syntax (B : sig
1717

1818
val opt_ctx_arrays : buffer_ptr Map.M(Tnode).t option
1919
val hardcoded_context_ptr : (buffer_ptr -> Ops.prec -> string) option
20-
val is_in_context : Low_level.traced_array -> bool
20+
val unified_memory : bool
2121
val host_ptrs_for_readonly : bool
2222
val logs_to_stdout : bool
2323
val main_kernel_prefix : string
@@ -68,7 +68,7 @@ struct
6868
Hashtbl.iter l.Low_level.traced_store ~f:(fun (node : Low_level.traced_array) ->
6969
let tn = node.tn in
7070
if not @@ Hash_set.mem is_global tn then
71-
let in_ctx : bool = B.is_in_context node in
71+
let in_ctx : bool = Tn.is_in_context_force ~unified_memory:B.unified_memory tn 33 in
7272
let ctx_ptr = B.hardcoded_context_ptr in
7373
let mem : (Tn.memory_mode * int) option = tn.memory_mode in
7474
match
@@ -296,14 +296,14 @@ struct
296296
(* A rough approximation to the type Gccjit_backend.mem_properties. *)
297297
let backend_info =
298298
Sexp.Atom
299-
(if B.is_in_context node then "From_context"
299+
(if Tn.is_in_context_force ~unified_memory:B.unified_memory tn 33 then "From_context"
300300
else if Hash_set.mem is_global tn then "Constant_from_host"
301301
else if Tn.is_virtual_force tn 3331 then "Virtual"
302302
else "Local_only")
303303
in
304304
if not @@ Utils.sexp_mem ~elem:backend_info tn.backend_info then
305305
tn.backend_info <- Utils.sexp_append ~elem:backend_info tn.backend_info;
306-
if B.is_in_context node && not (Hash_set.mem is_global tn) then
306+
if Tn.is_in_context_force ~unified_memory:B.unified_memory tn 33 && not (Hash_set.mem is_global tn) then
307307
(B.typ_of_prec (Lazy.force tn.Tn.prec) ^ " *" ^ get_ident tn, Param_ptr tn) :: params
308308
else params)
309309
in
@@ -369,7 +369,7 @@ struct
369369
params);
370370
fprintf ppf "/* Local declarations and initialization. */@ ";
371371
Hashtbl.iteri traced_store ~f:(fun ~key:tn ~data:node ->
372-
if not (Tn.is_virtual_force tn 333 || B.is_in_context node || Hash_set.mem is_global tn)
372+
if not (Tn.is_virtual_force tn 333 || B.unified_memory node || Hash_set.mem is_global tn)
373373
then
374374
fprintf ppf "%s %s[%d]%s;@ "
375375
(B.typ_of_prec @@ Lazy.force tn.prec)

arrayjit/lib/cc_backend.ml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ type procedure = {
3434
}
3535
[@@deriving sexp_of]
3636

37-
let is_in_context node = Tnode.is_in_context_force node.Low_level.tn 33
37+
let unified_memory = true
3838

3939
let get_global_run_id =
4040
let next_id = ref 0 in
@@ -82,7 +82,7 @@ struct
8282
let for_lowereds = Input.for_lowereds
8383
let opt_ctx_arrays = Input.opt_ctx_arrays
8484
let hardcoded_context_ptr = c_ptr_to_string
85-
let is_in_context = is_in_context
85+
let unified_memory = unified_memory
8686
let host_ptrs_for_readonly = true
8787
let logs_to_stdout = false
8888
let main_kernel_prefix = ""

arrayjit/lib/cuda_backend.cudajit.ml

Lines changed: 3 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ module Backend_buffer = struct
3030
end)
3131
end
3232

33+
let unified_memory = false
34+
3335
module Device_config = struct
3436
include Backend_buffer
3537

@@ -261,11 +263,6 @@ let%diagn2_sexp cuda_to_ptx ~name cu_src =
261263
Stdio.Out_channel.close oc);
262264
ptx
263265

264-
let is_in_context node =
265-
(* FIXME: shouldn't we use Tnode.is_in_context_force? *)
266-
Tnode.default_to_most_local node.Low_level.tn 33;
267-
match node.tn.memory_mode with Some ((Virtual | Local), _) -> false | _ -> true
268-
269266
module C_syntax_config (Input : sig
270267
val for_lowereds : Low_level.optimized array
271268
end) =
@@ -276,7 +273,7 @@ struct
276273

277274
let opt_ctx_arrays = None
278275
let hardcoded_context_ptr = None
279-
let is_in_context = is_in_context
276+
let unified_memory = unified_memory
280277
let host_ptrs_for_readonly = false
281278
(* GPUs cannot access host memory pointers directly. *)
282279

@@ -449,39 +446,6 @@ let link_proc ~prior_context ~name ~(params : (string * param_source) list) ~ctx
449446
work;
450447
}
451448

452-
let%track3_sexp alloc_if_needed ctx stream ~key ~data:node ctx_arrays =
453-
if is_in_context node && not (Map.mem ctx_arrays key) then (
454-
[%log2 Tn.debug_name key, "read_only", (node.read_only : bool)];
455-
[%log3 (key : Tn.t)];
456-
let default () : buffer_ptr =
457-
set_ctx ctx;
458-
Cu.Deviceptr.mem_alloc ~size_in_bytes:(Tn.size_in_bytes key)
459-
in
460-
let add_new () = Map.add_exn ctx_arrays ~key ~data:(default ()) in
461-
let device = stream.device in
462-
if node.read_only then
463-
if Tn.known_non_cross_stream key then add_new ()
464-
else (
465-
if Hashtbl.mem device.cross_stream_candidates key then
466-
Tn.update_memory_sharing key Tn.Shared_cross_stream 40;
467-
let data = Hashtbl.find_or_add device.cross_stream_candidates key ~default in
468-
Map.add_exn ctx_arrays ~key ~data)
469-
else if Tn.known_shared_cross_stream key then (
470-
if Hashtbl.mem device.owner_streams key then
471-
if not (stream.stream_id = Hashtbl.find_exn device.owner_streams key) then
472-
raise
473-
@@ Utils.User_error
474-
("Cuda_backend.alloc_if_needed: node " ^ Tn.debug_name key
475-
^ " assumed to be cross-stream-shared but then written to on multiple devices")
476-
else Hashtbl.add_exn device.owner_streams ~key ~data:stream.stream_id;
477-
let data = Hashtbl.find_exn device.cross_stream_candidates key in
478-
Map.add_exn ctx_arrays ~key ~data)
479-
else (
480-
Tn.update_memory_sharing key Tn.Per_stream 41;
481-
Hashtbl.remove device.cross_stream_candidates key;
482-
add_new ()))
483-
else ctx_arrays
484-
485449
let run_options () =
486450
if Utils.with_runtime_debug () then
487451
Cu.Module.[ GENERATE_DEBUG_INFO true; GENERATE_LINE_INFO true ]

arrayjit/lib/cuda_backend.missing.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ let compile ?shared:_ ~name:_ bindings _optimized = bindings
1919
let compile_batch ?shared:_ ~names:_ (bindings : Indexing.unit_bindings) optimized : code_batch =
2020
Array.map optimized ~f:(fun _ -> bindings)
2121

22-
let is_in_context _traced_array = false
22+
let unified_memory = false
2323
let ctx_arrays Unimplemented_ctx = Map.empty (module Tnode)
2424

2525
let link (Unimplemented_ctx : context) (code : code) =

arrayjit/lib/gcc_backend.gccjit.ml

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -74,13 +74,7 @@ type procedure = {
7474
}
7575
[@@deriving sexp_of]
7676

77-
let is_in_context node =
78-
(* FIXME: shouldn't we use Tnode.is_in_context_force? *)
79-
Tnode.default_to_most_local node.Low_level.tn 33;
80-
match node.tn.memory_mode with
81-
| Some (Hosted (Constant | Volatile), _) -> false
82-
| Some ((Virtual | Local), _) -> false
83-
| _ -> true
77+
let unified_memory = true
8478

8579
type gccjit_param = Gccjit.param
8680

arrayjit/lib/gcc_backend.missing.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ include Backend_impl.No_device_buffer_and_copying ()
55
let expected_merge_node Unimplemented_proc =
66
failwith "gcc backend missing: install the optional dependency gccjit"
77

8-
let is_in_context _node = failwith "gcc backend missing: install the optional dependency gccjit"
8+
let unified_memory = true
99

1010
let to_buffer _tn ~dst:_ ~src:_ =
1111
failwith "gcc backend missing: install the optional dependency gccjit"

arrayjit/lib/tnode.ml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,10 @@ let is_materialized_force tn provenance =
183183
| Some ((On_device _ | Hosted _ | Materialized), _) -> true
184184
| Some ((Never_virtual | Device_only | Effectively_constant), _) -> assert false
185185

186-
let is_in_context_force tn provenance =
186+
let is_in_context_force ~unified_memory tn provenance =
187187
default_to_most_local tn provenance;
188188
match tn.memory_mode with
189-
| Some (Hosted (Constant | Volatile), _) -> false
189+
| Some (Hosted (Constant | Volatile), _) when unified_memory -> false
190190
| Some ((Virtual | Local), _) -> false
191191
| _ -> true
192192

0 commit comments

Comments
 (0)