Better names for builtin files, for CUDA handle large builtins via function pointers

lukstafi · lukstafi · commit 655d5bb3026a · 2025-07-24T15:23:40.000+02:00
Signed-off-by: Lukasz Stafiniak &lt;lukstafi@gmail.com&gt;
diff --git a/arrayjit/lib/builtins.c b/arrayjit/lib/builtins.c
diff --git a/arrayjit/lib/builtins.msl b/arrayjit/lib/builtins.msl
diff --git a/arrayjit/lib/builtins_large.cu b/arrayjit/lib/builtins_large.cu
@@ -43,7 +43,7 @@ __device__ __forceinline__ void threefry_round(uint4 &x, unsigned int r0, unsign
 }
 
 /* Threefry4x32 implementation - 20 rounds */
-__device__ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter) {
+__device__ uint4x32_t arrayjit_threefry4x32_impl(uint4x32_t key, uint4x32_t counter) {
     uint4 x = make_uint4(counter.v[0], counter.v[1], counter.v[2], counter.v[3]);
     uint4 k = make_uint4(key.v[0], key.v[1], key.v[2], key.v[3]);
     
@@ -107,68 +107,4 @@ __device__ uint4x32_t arrayjit_threefry4x32(uint4x32_t key, uint4x32_t counter)
     return result;
 }
 
-/* Conversion functions from uint4x32 to various precisions uniformly */
-
-/* Convert to float in [0, 1) using CUDA intrinsics */
-__device__ __forceinline__ float uint32_to_single_uniform(uint32_t x) {
-    /* Use __uint2float_rn for correct rounding */
-    return __uint2float_rn(x >> 8) * (1.0f / 16777216.0f);
-}
-
-/* Convert to double in [0, 1) */
-__device__ __forceinline__ double uint32_to_double_uniform(uint32_t x) {
-    return __uint2double_rn(x) * (1.0 / 4294967296.0);
-}
-
-/* Uint4x32 to float32 uniform */
-__device__ float uint4x32_to_single_uniform(uint4x32_t x) {
-    return uint32_to_single_uniform(x.v[0]);
-}
-
-/* Uint4x32 to float64 uniform */
-__device__ double uint4x32_to_double_uniform(uint4x32_t x) {
-    uint64_t combined = __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
-    return __longlong_as_double(combined) * (1.0 / 18446744073709551616.0);
-}
-
-/* Uint4x32 to int32 uniform */
-__device__ int32_t uint4x32_to_int32_uniform(uint4x32_t x) {
-    return (int32_t)x.v[0];
-}
-
-/* Uint4x32 to int64 uniform */
-__device__ int64_t uint4x32_to_i64_uniform(uint4x32_t x) {
-    return __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
-}
-
-/* Uint4x32 to uint32 uniform */
-__device__ uint32_t uint4x32_to_u32_uniform(uint4x32_t x) {
-    return x.v[0];
-}
-
-/* Uint4x32 to uint64 uniform */
-__device__ uint64_t uint4x32_to_u64_uniform(uint4x32_t x) {
-    return (uint64_t)__double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
-}
-
-/* Uint4x32 to int8 uniform */
-__device__ int8_t uint4x32_to_i8_uniform(uint4x32_t x) {
-    return (int8_t)(x.v[0] & 0xFF);
-}
-
-/* Uint4x32 to uint8 uniform */
-__device__ uint8_t uint4x32_to_u8_uniform(uint4x32_t x) {
-    return (uint8_t)(x.v[0] & 0xFF);
-}
-
-/* Uint4x32 to bfloat16 uniform */
-__device__ uint16_t uint4x32_to_bfloat16_uniform(uint4x32_t x) {
-    float f = uint32_to_single_uniform(x.v[0]);
-    return (uint16_t)(__float_as_uint(f) >> 16);
-}
-
-/* Uint4x32 to float16 uniform using CUDA half intrinsics */
-__device__ __half uint4x32_to_half_uniform(uint4x32_t x) {
-    float f = uint32_to_single_uniform(x.v[0]);
-    return __float2half(f);
-}
+__device__ uint4x32_t ( *arrayjit_threefry4x32)(uint4x32_t key, uint4x32_t counter) = arrayjit_threefry4x32_impl;
diff --git a/arrayjit/lib/builtins_small.cu b/arrayjit/lib/builtins_small.cu
@@ -0,0 +1,70 @@
+
+typedef struct {
+    uint32_t v[4];
+} uint4x32_t;
+
+/* Conversion functions from uint4x32 to various precisions uniformly */
+
+/* Convert to float in [0, 1) using CUDA intrinsics */
+__device__ __forceinline__ float uint32_to_single_uniform(uint32_t x) {
+  /* Use __uint2float_rn for correct rounding */
+  return __uint2float_rn(x >> 8) * (1.0f / 16777216.0f);
+}
+
+/* Convert to double in [0, 1) */
+__device__ __forceinline__ double uint32_to_double_uniform(uint32_t x) {
+  return __uint2double_rn(x) * (1.0 / 4294967296.0);
+}
+
+/* Uint4x32 to float32 uniform */
+__device__ float uint4x32_to_single_uniform(uint4x32_t x) {
+  return uint32_to_single_uniform(x.v[0]);
+}
+
+/* Uint4x32 to float64 uniform */
+__device__ double uint4x32_to_double_uniform(uint4x32_t x) {
+  uint64_t combined = __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
+  return __longlong_as_double(combined) * (1.0 / 18446744073709551616.0);
+}
+
+/* Uint4x32 to int32 uniform */
+__device__ int32_t uint4x32_to_int32_uniform(uint4x32_t x) {
+  return (int32_t)x.v[0];
+}
+
+/* Uint4x32 to int64 uniform */
+__device__ int64_t uint4x32_to_i64_uniform(uint4x32_t x) {
+  return __double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
+}
+
+/* Uint4x32 to uint32 uniform */
+__device__ uint32_t uint4x32_to_u32_uniform(uint4x32_t x) {
+  return x.v[0];
+}
+
+/* Uint4x32 to uint64 uniform */
+__device__ uint64_t uint4x32_to_u64_uniform(uint4x32_t x) {
+  return (uint64_t)__double_as_longlong(__hiloint2double(x.v[1], x.v[0]));
+}
+
+/* Uint4x32 to int8 uniform */
+__device__ int8_t uint4x32_to_i8_uniform(uint4x32_t x) {
+  return (int8_t)(x.v[0] & 0xFF);
+}
+
+/* Uint4x32 to uint8 uniform */
+__device__ uint8_t uint4x32_to_u8_uniform(uint4x32_t x) {
+  return (uint8_t)(x.v[0] & 0xFF);
+}
+
+/* Uint4x32 to bfloat16 uniform */
+__device__ uint16_t uint4x32_to_bfloat16_uniform(uint4x32_t x) {
+  float f = uint32_to_single_uniform(x.v[0]);
+  return (uint16_t)(__float_as_uint(f) >> 16);
+}
+
+/* Uint4x32 to float16 uniform using CUDA half intrinsics */
+__device__ __half uint4x32_to_half_uniform(uint4x32_t x) {
+  float f = uint32_to_single_uniform(x.v[0]);
+  return __float2half(f);
+}
diff --git a/arrayjit/lib/cc_backend.ml b/arrayjit/lib/cc_backend.ml
@@ -13,7 +13,7 @@ open Backend_intf
 let name = "cc"
 
 (* Header declarations for arrayjit builtins *)
-let arrayjit_builtins_header = {|
+let builtins_header = {|
 /* ArrayJIT builtins declarations */
 #include <stdint.h>
 
@@ -184,7 +184,7 @@ let%diagn_sexp compile ~(name : string) bindings (lowered : Low_level.optimized)
   let build_file = Utils.open_build_file ~base_name:name ~extension:".c" in
   let declarations_doc = Syntax.print_declarations () in
   let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
-  let header_doc = PPrint.string arrayjit_builtins_header in
+  let header_doc = PPrint.string builtins_header in
   let final_doc = PPrint.(header_doc ^^ declarations_doc ^^ proc_doc) in
   (* Use ribbon = 1.0 for usual code formatting, width 110 *)
   PPrint.ToChannel.pretty 1.0 110 build_file.oc final_doc;
@@ -214,7 +214,7 @@ let%diagn_sexp compile_batch ~names bindings (lowereds : Low_level.optimized opt
             Syntax.compile_proc ~name idx_params lowered))
   in
   let all_proc_docs = List.filter_map (Array.to_list params_and_docs) ~f:(Option.map ~f:snd) in
-  let header_doc = PPrint.string arrayjit_builtins_header in
+  let header_doc = PPrint.string builtins_header in
   let final_doc = PPrint.(header_doc ^^ declarations_doc ^^ separate hardline all_proc_docs) in
   PPrint.ToChannel.pretty 1.0 110 build_file.oc final_doc;
   build_file.finalize ();
diff --git a/arrayjit/lib/cuda_backend.ml b/arrayjit/lib/cuda_backend.ml
@@ -606,8 +606,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       | Tanh_approx, Single_prec _ -> func "__tanhf"
       | Tanh_approx, _ -> func "tanh"
       | Not, _ -> f "(" " == 0.0 ? 1.0 : 0.0)"
-      | Uint4x32_to_prec_uniform, _ ->
-          func ("uint4x32_to_" ^ Ops.prec_string prec ^ "_uniform")
+      | Uint4x32_to_prec_uniform, _ -> func ("uint4x32_to_" ^ Ops.prec_string prec ^ "_uniform")
 
     let ternop_syntax prec v =
       let open PPrint in
@@ -657,6 +656,24 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       ^^ rparen ^^ semi
   end
 
+  let builtins_large_header =
+    {|
+  __device__ uint4x32_t ( *arrayjit_threefry4x32)(uint4x32_t key, uint4x32_t counter) = nullptr;
+  |}
+
+  let prepend_builtins b =
+    if Utils.debug_log_from_routines () then
+      Buffer.add_string b "__device__ int printf (const char * format, ... );\n";
+    Buffer.add_string b "\n\n";
+    let builtins_path =
+      Stdlib.Filename.concat (Stdlib.Filename.dirname Stdlib.__FILE__) "builtins_small.cu"
+    in
+    let builtins_content = Stdio.In_channel.read_all builtins_path in
+    Buffer.add_string b builtins_content;
+    (* Needs to be after the small builtins, because uses uint4x32_t. *)
+    Buffer.add_string b builtins_large_header;
+    Buffer.add_string b "\n\n"
+
   let%diagn2_sexp compile ~name bindings ({ Low_level.traced_store; _ } as lowered) =
     (* TODO: The following link seems to claim it's better to expand into loops than use memset.
        https://stackoverflow.com/questions/23712558/how-do-i-best-initialize-a-local-memory-array-to-0 *)
@@ -665,8 +682,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     end)) in
     let idx_params = Indexing.bound_symbols bindings in
     let b = Buffer.create 4096 in
-    if Utils.debug_log_from_routines () then
-      Buffer.add_string b "__device__ int printf (const char * format, ... );\n";
+    prepend_builtins b;
     let declarations_doc = Syntax.print_declarations () in
     let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
     let final_doc = PPrint.(declarations_doc ^^ proc_doc) in
@@ -680,16 +696,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     end)) in
     let idx_params = Indexing.bound_symbols bindings in
     let b = Buffer.create 4096 in
-    (* Read and prepend the CUDA builtins file *)
-    let builtins_path =
-      Stdlib.Filename.concat (Stdlib.Filename.dirname Stdlib.__FILE__) "arrayjit_builtins.cu"
-    in
-    (try
-       let builtins_content = Stdio.In_channel.read_all builtins_path in
-       Buffer.add_string b builtins_content;
-       Buffer.add_string b "\n\n"
-     with _ -> ());
-    (* Silently skip if file not found *)
+    prepend_builtins b;
     let declarations_doc = Syntax.print_declarations () in
     let params_and_docs =
       Array.map2_exn names lowereds
@@ -787,10 +794,29 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       Cu.Module.[ GENERATE_DEBUG_INFO true; GENERATE_LINE_INFO true ]
     else []
 
+  let set_ptr_in_kernel kernel_module src name =
+    let dst, _ = Cuda.Module.get_global kernel_module ~name in
+    (* Copy the helper function address to the kernel's function pointer variable *)
+    Cuda.Deviceptr.memcpy_D_to_D ~dst ~src ~size_in_bytes:8 (* pointer size *) ()
+
+  let set_builtins_in_kernel =
+    assert !initialized;
+    let builtins_path =
+      Stdlib.Filename.concat (Stdlib.Filename.dirname Stdlib.__FILE__) "builtins_large.cu"
+    in
+    let cu_src = Stdio.In_channel.read_all builtins_path in
+    let code = cuda_to_ptx ~name:"builtins_large" cu_src in
+    (* set_ctx ctx; *)
+    let run_module = Cu.Module.load_data_ex code (run_options ()) in
+    let threefry4x32_ptr, _ = Cu.Module.get_global run_module ~name:"arrayjit_threefry4x32" in
+    fun kernel_module ->
+      set_ptr_in_kernel kernel_module threefry4x32_ptr "arrayjit_threefry4x32"
+
   let%track3_sexp link prior_context (code : code) ctx_arrays =
     let ctx = ctx_of prior_context in
     set_ctx ctx;
     let run_module = Cu.Module.load_data_ex code.ptx (run_options ()) in
+    set_builtins_in_kernel run_module;
     let idx_params = Indexing.bound_symbols code.bindings in
     let lowered_bindings : Indexing.lowered_bindings =
       List.map idx_params ~f:(fun s -> (s, ref 0))
@@ -809,6 +835,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     let ctx = ctx_of prior_context in
     set_ctx ctx;
     let run_module = Cu.Module.load_data_ex code_batch.ptx (run_options ()) in
+    set_builtins_in_kernel run_module;
     let procs =
       Array.mapi code_batch.params_and_names ~f:(fun i pns ->
           Option.value ~default:None
diff --git a/arrayjit/lib/dune b/arrayjit/lib/dune
@@ -35,7 +35,7 @@
   ppx_minidebug.runtime)
  (foreign_stubs
   (language c)
-  (names arrayjit_builtins))
+  (names builtins))
  (preprocess
   (pps
    ppx_compare