Fix: CUDA syntax binops were missing outer parentheses

lukstafi · lukstafi · commit 32a9731d4ac6 · 2025-05-21T20:59:35.000+02:00
Signed-off-by: Lukasz Stafiniak <lukstafi@gmail.com>
diff --git a/arrayjit/lib/cuda_backend.ml b/arrayjit/lib/cuda_backend.ml
@@ -67,7 +67,11 @@ module Alloc_buffer = struct
       (if Array.length dims = 0 then 0 else Array.reduce_exn dims ~f:( * )) * Ops.prec_in_bytes prec
     in
     set_ctx stream.device.dev.primary_context;
-    Cu.Deviceptr.mem_alloc ~size_in_bytes
+    let ptr = Cu.Deviceptr.mem_alloc ~size_in_bytes in
+    (* TODO: consider using memset_d8 to zero-initialize the memory. *)
+    (* if size_in_bytes > 0 then
+      Cu.Stream.memset_d8 ptr Unsigned.UChar.zero ~length:size_in_bytes stream.runner; *)
+    ptr
 
   let free_buffer = Some (fun _stream ptr -> Cu.Deviceptr.mem_free ptr)
 end
@@ -283,10 +287,10 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       | Void_prec -> "void"
 
     let binop_syntax prec v =
+      (* TODO: consider using binop_syntax inherited from Pure_C_config and overriding only
+         where different. *)
       let open PPrint in
-      let f op_str v1 v2 =
-        group (lparen ^^ v1 ^^ space ^^ string op_str ^^ space ^^ v2 ^^ rparen)
-      in
+      let f op_str v1 v2 = group @@ parens (v1 ^^ space ^^ string op_str ^^ space ^^ v2) in
       let func fn v1 v2 = group (string fn ^^ parens (separate comma [ v1; v2 ])) in
       match (v, prec) with
       | Ops.Arg1, _ -> invalid_arg "Cuda_backend.binop_syntax: Arg1 is not an operator"
@@ -307,44 +311,50 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       | ToPowOf, Byte_prec _ ->
           invalid_arg "Cuda_backend.binop_syntax: ToPowOf not supported for byte/integer precisions"
       | Relu_gate, Byte_prec _ ->
-          fun v1 v2 -> group (parens (v1 ^^ string " > 0") ^^ string " ? " ^^ v2 ^^ string " : 0")
+          fun v1 v2 ->
+            group @@ parens (parens (v1 ^^ string " > 0") ^^ string " ? " ^^ v2 ^^ string " : 0")
       | Relu_gate, Half_prec _ ->
           fun v1 v2 ->
             group
-              (parens
-                 (string "__hgt(" ^^ v1 ^^ comma
-                 ^^ string " __ushort_as_half((unsigned short)0x0000U))")
-              ^^ string " ? " ^^ v2
-              ^^ string " : __ushort_as_half((unsigned short)0x0000U)")
+            @@ parens
+                 (parens
+                    (string "__hgt(" ^^ v1 ^^ comma
+                    ^^ string " __ushort_as_half((unsigned short)0x0000U))")
+                 ^^ string " ? " ^^ v2
+                 ^^ string " : __ushort_as_half((unsigned short)0x0000U)")
       | Relu_gate, _ ->
           fun v1 v2 ->
-            group (parens (v1 ^^ string " > 0.0") ^^ string " ? " ^^ v2 ^^ string " : 0.0")
+            group @@ parens (parens (v1 ^^ string " > 0.0") ^^ string " ? " ^^ v2 ^^ string " : 0.0")
       | Satur01_gate, Byte_prec _ ->
           fun v1 v2 ->
-            parens
-              (parens
-                 (string "(float)" ^^ v1 ^^ string " > 0.0f && (float)" ^^ v1 ^^ string " < 1.0f")
-              ^^ string " ? " ^^ v2 ^^ string " : (unsigned char)0")
+            group
+            @@ parens
+                 (parens
+                    (string "(float)" ^^ v1 ^^ string " > 0.0f && (float)" ^^ v1 ^^ string " < 1.0f")
+                 ^^ string " ? " ^^ v2 ^^ string " : (unsigned char)0")
       | Satur01_gate, Half_prec _ ->
           fun v1 v2 ->
-            parens
-              (parens
-                 (string "__hgt(" ^^ v1 ^^ comma
-                 ^^ string " __ushort_as_half((unsigned short)0x0000U)) && __hlt("
-                 ^^ v1 ^^ comma
-                 ^^ string " __ushort_as_half((unsigned short)0x3C00U)))")
-              ^^ string " ? " ^^ v2
-              ^^ string " : __ushort_as_half((unsigned short)0x0000U)")
+            group
+            @@ parens
+                 (parens
+                    (string "__hgt(" ^^ v1 ^^ comma
+                    ^^ string " __ushort_as_half((unsigned short)0x0000U)) && __hlt("
+                    ^^ v1 ^^ comma
+                    ^^ string " __ushort_as_half((unsigned short)0x3C00U)))")
+                 ^^ string " ? " ^^ v2
+                 ^^ string " : __ushort_as_half((unsigned short)0x0000U)")
       | Satur01_gate, Single_prec _ ->
           fun v1 v2 ->
-            parens
-              (parens (v1 ^^ string " > 0.0f && " ^^ v1 ^^ string " < 1.0f")
-              ^^ string " ? " ^^ v2 ^^ string " : 0.0f")
+            group
+            @@ parens
+                 (parens (v1 ^^ string " > 0.0f && " ^^ v1 ^^ string " < 1.0f")
+                 ^^ string " ? " ^^ v2 ^^ string " : 0.0f")
       | Satur01_gate, Double_prec _ ->
           fun v1 v2 ->
-            parens
-              (parens (v1 ^^ string " > 0.0 && " ^^ v1 ^^ string " < 1.0")
-              ^^ string " ? " ^^ v2 ^^ string " : 0.0")
+            group
+            @@ parens
+                 (parens (v1 ^^ string " > 0.0 && " ^^ v1 ^^ string " < 1.0")
+                 ^^ string " ? " ^^ v2 ^^ string " : 0.0")
       | Max, Byte_prec _ -> func "max"
       | Max, Half_prec _ -> func "__hmax"
       | Max, Double_prec _ -> func "fmax"
@@ -403,13 +413,16 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       | Recip, Byte_prec _ ->
           invalid_arg "Cuda_backend.unop_syntax: Recip not supported for byte/integer precisions"
       | Recip, Half_prec _ -> func "hrcp"
-      | Recip, _ -> f "(1.0 / (" "))"
+      | Recip, Single_prec _ -> f "(1.0f / (" "))"
+      | Recip, Double_prec _ -> f "(1.0 / (" "))"
+      | Recip, _ -> f "(1 / (" "))"
       | Recip_sqrt, Byte_prec _ ->
           invalid_arg
             "Cuda_backend.unop_syntax: Recip_sqrt not supported for byte/integer precisions"
       | Recip_sqrt, Half_prec _ -> func "hrsqrt"
       | Recip_sqrt, Double_prec _ -> f "(1.0 / sqrt(" "))"
-      | Recip_sqrt, _ -> f "(1.0 / sqrtf(" "))"
+      | Recip_sqrt, Single_prec _ -> f "(1.0f / sqrtf(" "))"
+      | Recip_sqrt, _ -> f "(1 / sqrtf(" "))"
       | Neg, _ -> f "(-(" "))"
       | Tanh_approx, Byte_prec _ ->
           invalid_arg
diff --git a/bin/micrograd_basic.ml b/bin/micrograd_basic.ml
@@ -8,7 +8,7 @@ module Rand = Ir.Rand.Lib
 
 let _get_local_debug_runtime = Utils.get_local_debug_runtime
 
-let%diagn_sexp () =
+let%diagn_sexp _suspended() =
   let module Backend = (val Backends.fresh_backend ~backend_name:"multicore_cc" ()) in
   let stream = Backend.(new_stream @@ get_device ~ordinal:0) in
   let ctx = Backend.make_context stream in
@@ -33,7 +33,7 @@ let%diagn_sexp () =
   Tensor.print ~with_code:false ~with_grad:true `Default @@ a;
   Tensor.print ~with_code:false ~with_grad:true `Default @@ b
 
-let%diagn_sexp _suspended () : unit =
+let%diagn_sexp  () : unit =
   let module Backend = (val Backends.fresh_backend ()) in
   let stream = Backend.(new_stream @@ get_device ~ordinal:0) in
   let ctx = Backend.make_context stream in
@@ -53,6 +53,7 @@ let%diagn_sexp _suspended () : unit =
   (* Train.every_non_literal_on_host g; *)
   let update = Train.grad_update g in
   let routine = Train.to_routine (module Backend) ctx IDX.empty update.fwd_bprop in
+  Utils.capture_stdout_logs @@ fun () ->
   Train.run routine;
   (* Tensor.print_tree ~with_grad:true ~depth:9 g; *)
   Tensor.print ~with_code:false ~with_grad:false `Default @@ g;
diff --git a/test/micrograd_demo_logging-cuda-0-0.log.expected b/test/micrograd_demo_logging-cuda-0-0.log.expected
@@ -60,13 +60,13 @@ b_grad[0]{=MAYBE UNINITIALIZED} = -333201e-3 = fmaf(a[0]{=-4000e-3},n14_d_grad[0
 # b.grad[0] := fma((3 * (b[0] * b[0])), n14_d.grad[0], b.grad[0]);
 b_grad[0]{=MAYBE UNINITIALIZED} = 666402e-3 = fmaf(((float)(3) * (b[0]{=2000e-3} * b[0]{=2000e-3})),n14_d_grad[0]{=83300e-3},b_grad[0]{=-333201e-3})
 # b.grad[0] := (b.grad[0] + relu_gate(n31[0], n40_d.grad[0]));
-b_grad[0]{=MAYBE UNINITIALIZED} = 27766e-3 = (b_grad[0]{=666402e-3} + (n31[0]{=-2000e-3} > 0.0) ? n40_d_grad[0]{=27766e-3} : 0.0)
+b_grad[0]{=MAYBE UNINITIALIZED} = 666402e-3 = (b_grad[0]{=666402e-3} + ((n31[0]{=-2000e-3} > 0.0) ? n40_d_grad[0]{=27766e-3} : 0.0))
 # a.grad[0] := (a.grad[0] + relu_gate(n31[0], n40_d.grad[0]));
-a_grad[0]{=MAYBE UNINITIALIZED} = 27766e-3 = (a_grad[0]{=166600e-3} + (n31[0]{=-2000e-3} > 0.0) ? n40_d_grad[0]{=27766e-3} : 0.0)
+a_grad[0]{=MAYBE UNINITIALIZED} = 166600e-3 = (a_grad[0]{=166600e-3} + ((n31[0]{=-2000e-3} > 0.0) ? n40_d_grad[0]{=27766e-3} : 0.0))
 # b.grad[0] :=$  (b.grad[0] + relu_gate(n42[0], (-1 * ((2 * e[0]) * f.grad[0]))));
-b_grad[0]{=MAYBE UNINITIALIZED} = 6941e-3 = (b_grad[0]{=27766e-3} + (n42[0]{=6000e-3} > 0.0) ? ((float)(-1) * (((float)(2) * e[0]{=-7000e-3}) * f_grad[0]{=495e-3})) : 0.0)
+b_grad[0]{=MAYBE UNINITIALIZED} = 673344e-3 = (b_grad[0]{=666402e-3} + ((n42[0]{=6000e-3} > 0.0) ? ((float)(-1) * (((float)(2) * e[0]{=-7000e-3}) * f_grad[0]{=495e-3})) : 0.0))
 # a.grad[0] :=$  (a.grad[0] - relu_gate(n42[0], (-1 * ((2 * e[0]) * f.grad[0]))));
-a_grad[0]{=MAYBE UNINITIALIZED} = 6941e-3 = (a_grad[0]{=27766e-3} - (n42[0]{=6000e-3} > 0.0) ? ((float)(-1) * (((float)(2) * e[0]{=-7000e-3}) * f_grad[0]{=495e-3})) : 0.0)
+a_grad[0]{=MAYBE UNINITIALIZED} = 159658e-3 = (a_grad[0]{=166600e-3} - ((n42[0]{=6000e-3} > 0.0) ? ((float)(-1) * (((float)(2) * e[0]{=-7000e-3}) * f_grad[0]{=495e-3})) : 0.0))
 # n19_c.grad[0] := fma((2 * e[0]), f.grad[0], n19_c.grad[0]);
 n19_c_grad[0]{=MAYBE UNINITIALIZED} = -6941e-3 = fmaf(((float)(2) * e[0]{=-7000e-3}),f_grad[0]{=495e-3},n19_c_grad[0]{=0e-3})
 # n19_c.grad[0] := fma((2 * e[0]), f.grad[0], n19_c.grad[0]);
@@ -76,10 +76,10 @@ n4_c_grad[0]{=MAYBE UNINITIALIZED} = -13883e-3 = (n4_c_grad[0]{=0e-3} + n19_c_gr
 # n4_c.grad[0] := (n4_c.grad[0] + n19_c.grad[0]);
 n4_c_grad[0]{=MAYBE UNINITIALIZED} = -27766e-3 = (n4_c_grad[0]{=-13883e-3} + n19_c_grad[0]{=-13883e-3})
 # a.grad[0] := (a.grad[0] + n4_c.grad[0]);
-a_grad[0]{=MAYBE UNINITIALIZED} = -20825e-3 = (a_grad[0]{=6941e-3} + n4_c_grad[0]{=-27766e-3})
+a_grad[0]{=MAYBE UNINITIALIZED} = 131892e-3 = (a_grad[0]{=159658e-3} + n4_c_grad[0]{=-27766e-3})
 # b.grad[0] := (b.grad[0] + n4_c.grad[0]);
-b_grad[0]{=MAYBE UNINITIALIZED} = -20825e-3 = (b_grad[0]{=6941e-3} + n4_c_grad[0]{=-27766e-3})
+b_grad[0]{=MAYBE UNINITIALIZED} = 645577e-3 = (b_grad[0]{=673344e-3} + n4_c_grad[0]{=-27766e-3})
 # a.grad[0] := fma(-1, ((2 * e[0]) * f.grad[0]), a.grad[0]);
-a_grad[0]{=MAYBE UNINITIALIZED} = -13883e-3 = fmaf((float)(-1),(((float)(2) * e[0]{=-7000e-3}) * f_grad[0]{=495e-3}),a_grad[0]{=-20825e-3})
+a_grad[0]{=MAYBE UNINITIALIZED} = 138833e-3 = fmaf((float)(-1),(((float)(2) * e[0]{=-7000e-3}) * f_grad[0]{=495e-3}),a_grad[0]{=131892e-3})
 COMMENT: end
 COMMENT: end