First pass on a slew of bugs uncovered by the Metal backend

lukstafi · lukstafi · commit fb4b363045dd · 2025-08-02T15:12:44.000+02:00
It is very strange to be getting session-level bugs in one of the backends but not the other.
diff --git a/arrayjit/lib/c_syntax.ml b/arrayjit/lib/c_syntax.ml
@@ -274,6 +274,12 @@ struct
 
   let binop_syntax prec op v1 v2 =
     match op with
+    | Ops.Threefry4x32 -> (
+        match prec with
+        | Ops.Uint4x32_prec _ ->
+            let open PPrint in
+            group (string "arrayjit_threefry4x32(" ^^ v1 ^^ string ", " ^^ v2 ^^ string ")")
+        | _ -> invalid_arg "Pure_C_config.binop_syntax: Threefry4x32 on non-uint4x32 precision")
     | Ops.Satur01_gate -> (
         match prec with
         | Ops.Byte_prec _ | Ops.Uint16_prec _ | Ops.Int32_prec _ | Ops.Uint4x32_prec _ ->
diff --git a/arrayjit/lib/cuda_backend.ml b/arrayjit/lib/cuda_backend.ml
@@ -588,7 +588,13 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       | Cmpeq, _ -> f "=="
       | Or, _ -> f "||"
       | And, _ -> f "&&"
-      | Threefry4x32, _ -> func "arrayjit_threefry4x32"
+      | Threefry4x32, _ ->
+          (* Threefry4x32 must output to uint4x32 precision *)
+          (match prec with
+          | Ops.Uint4x32_prec _ -> func "arrayjit_threefry4x32"
+          | _ -> raise @@ Utils.User_error 
+              (Printf.sprintf "CUDA backend: Threefry4x32 requires target precision to be uint4x32, but got %s"
+                 (Ops.prec_string prec)))
 
     let unop_syntax prec v =
       let open PPrint in
diff --git a/arrayjit/lib/low_level.ml b/arrayjit/lib/low_level.ml
@@ -531,11 +531,17 @@ let inline_computation ~id computations_table traced static_indices call_args =
     @@ List.map ~f:(fun s -> s.Indexing.static_symbol) static_indices
   in
   let make_subst i lhs_ind =
-    let rhs_ind = call_args.(i) in
-    match lhs_ind with
-    | Indexing.Iterator lhs_s when not (Set.mem static_indices lhs_s) -> Some (lhs_s, rhs_ind)
-    | _ when Indexing.equal_axis_index lhs_ind rhs_ind -> None
-    | _ -> raise @@ Non_virtual 13
+    if i >= Array.length call_args then
+      failwith
+        [%string
+          "make_subst: call_args too short, maybe stale optimization context? Tnode: \
+           %{Tn.debug_name traced.tn} #%{traced.tn.Tn.id#Int} i: %{i#Int}"]
+    else
+      let rhs_ind = call_args.(i) in
+      match lhs_ind with
+      | Indexing.Iterator lhs_s when not (Set.mem static_indices lhs_s) -> Some (lhs_s, rhs_ind)
+      | _ when Indexing.equal_axis_index lhs_ind rhs_ind -> None
+      | _ -> raise @@ Non_virtual 13
   in
   (* In the order of computation. *)
   let loop_proc (def_args, def) : t option =
diff --git a/arrayjit/lib/metal_backend.ml b/arrayjit/lib/metal_backend.ml
@@ -448,13 +448,13 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       | Ops.Bfloat16_prec _ -> "bfloat" (* Metal supports bfloat16 natively *)
       | Ops.Fp8_prec _ -> invalid_arg "Metal backend does not support FP8 precision"
       | Ops.Single_prec _ -> "float"
-      | Ops.Double_prec _ -> "double"
+      | Ops.Double_prec _ -> raise @@ Utils.User_error "Metal backend does not support double precision"
       | Ops.Void_prec -> "void"
 
     let vec_typ_of_prec ~length prec =
       match (prec, length) with
       | Ops.Single_prec _, 4 -> "float4_t"
-      | Ops.Double_prec _, 2 -> "float2_t" (* Metal uses float2 since it lacks double *)
+      | Ops.Double_prec _, 2 -> raise @@ Utils.User_error "Metal backend does not support double precision"
       | Ops.Int32_prec _, 4 -> "int32x4_t"
       | (Ops.Byte_prec _ | Ops.Fp8_prec _), 16 -> "int8x16_t"
       | (Ops.Uint16_prec _ | Ops.Bfloat16_prec _), 8 -> "uint16x8_t"
@@ -472,7 +472,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       | Ops.Bfloat16_prec _ -> "bf" (* TODO: Verify actual Metal suffix for bfloat16 *)
       | Ops.Fp8_prec _ -> invalid_arg "Metal backend does not support FP8 precision"
       | Ops.Single_prec _ -> "f"
-      | Ops.Double_prec _ -> ""
+      | Ops.Double_prec _ -> raise @@ Utils.User_error "Metal backend does not support double precision"
       | Ops.Void_prec -> ""
 
     let ternop_syntax _prec op =
@@ -514,12 +514,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
                  ^^ space ^^ string "?" ^^ space ^^ v2 ^^ space ^^ string ":" ^^ space
                  ^^ string "0.0f"))
       | Relu_gate, Ops.Double_prec _ ->
-          fun v1 v2 ->
-            group
-              (parens
-                 (group (parens (v1 ^^ string " > 0.0"))
-                 ^^ space ^^ string "?" ^^ space ^^ v2 ^^ space ^^ string ":" ^^ space
-                 ^^ string "0.0"))
+          raise @@ Utils.User_error "Metal backend does not support double precision"
       | Relu_gate, _ (* Byte_prec, Void_prec *) ->
           fun v1 v2 ->
             group
@@ -537,7 +532,13 @@ end) : Ir.Backend_impl.Lowered_backend = struct
                  ^^ space ^^ string "?" ^^ space ^^ v2 ^^ space ^^ string ":" ^^ space
                  ^^ string ("0.0" ^ s)))
       | ToPowOf, _ -> func "pow"
-      | Threefry4x32, _ -> func "arrayjit_threefry4x32"
+      | Threefry4x32, _ ->
+          (* Threefry4x32 must output to uint4x32 precision *)
+          (match prec with
+          | Ops.Uint4x32_prec _ -> func "arrayjit_threefry4x32"
+          | _ -> raise @@ Utils.User_error 
+              (Printf.sprintf "Metal backend: Threefry4x32 requires target precision to be uint4x32, but got %s"
+                 (Ops.prec_string prec)))
       | Arg1, _ | Arg2, _ -> invalid_arg "Metal C_syntax_config: Arg1/Arg2 not operators"
 
     let unop_syntax prec op =
@@ -554,7 +555,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       | Sqrt, _ -> func_doc "sqrt"
       | Relu, Ops.Half_prec _ -> fun v -> func_doc "max" (separate comma_sep [ string "0.0h"; v ])
       | Relu, Ops.Single_prec _ -> fun v -> func_doc "max" (separate comma_sep [ string "0.0f"; v ])
-      | Relu, Ops.Double_prec _ -> fun v -> func_doc "max" (separate comma_sep [ string "0.0"; v ])
+      | Relu, Ops.Double_prec _ -> raise @@ Utils.User_error "Metal backend does not support double precision"
       | Relu, _ (* Byte_prec, Void_prec *) ->
           fun v -> func_doc "max" (separate comma_sep [ string "0"; v ])
       | Satur01, p ->
diff --git a/arrayjit/lib/ops.ml b/arrayjit/lib/ops.ml
@@ -485,7 +485,7 @@ let binop_c_syntax prec v =
   | Or, _ -> ("(", " ||", ")")
   | And, _ -> ("(", " &&", ")")
   | Threefry4x32, _ ->
-      (* This corresponds to the pure C implementation in arrayjit_stubs.c. *)
+      (* This corresponds to the pure C implementation in builtins.c. *)
       ("arrayjit_threefry4x32(", ",", ")")
 
 let is_assign_op = function
diff --git a/test/einsum/moons_demo_variant.expected b/test/einsum/moons_demo_variant.expected
@@ -1,8 +1,8 @@
 Retrieving commandline, environment, or config file variable ocannl_log_level
 Found 0, in the config file
 Tnode: collecting accessible arrays...
-n0 moons_flat as moons_flat: Host-const/37; double prec 40x10x2; mem in bytes: 6_400
-n1 moons_classes as moons_classes: Host-const/37; double prec 40x10x1; mem in bytes: 3_200
+n0 moons_flat as moons_flat: Host-const/37; single prec 40x10x2; mem in bytes: 3_200
+n1 moons_classes as moons_classes: Host-const/37; single prec 40x10x1; mem in bytes: 1_600
 n2 range_over_offsets as range_over_offsets: Virt/15; single prec 4; mem in bytes: <not-in-yet>
 n3 !@self_id as n3: Virt/40; single prec 1; mem in bytes: <not-in-yet>
 n4 threefry4x32 as threefry4x32: Virt/15; single prec 4; mem in bytes: <not-in-yet>
@@ -28,42 +28,42 @@ n23 !@self_id as n23: Virt/40; single prec 1; mem in bytes: <not-in-yet>
 n24 threefry4x32 as threefry4x32: Virt/15; single prec 4; mem in bytes: <not-in-yet>
 n25 w3 as w3: Host&stream/412410; single prec 1x16; mem in bytes: <not-in-yet>
 n26 grad_w3 as w3.grad: Local/26046; single prec 1x16; mem in bytes: <not-in-yet>
-n27 @|_moons_input as moons_input: Local/1046; double prec 10x2; mem in bytes: <not-in-yet>
-n30 @|_moons_class as moons_class: Local/1046; double prec 10x1; mem in bytes: <not-in-yet>
-n33 * as n33: Local/1046; double prec 10x16; mem in bytes: <not-in-yet>
+n27 @|_moons_input as moons_input: Local/1046; single prec 10x2; mem in bytes: <not-in-yet>
+n30 @|_moons_class as moons_class: Local/1046; single prec 10x1; mem in bytes: <not-in-yet>
+n33 * as n33: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
 n34 grad_* as n33.grad: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
-n35 + as n35: Local/1046; double prec 10x16; mem in bytes: <not-in-yet>
+n35 + as n35: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
 n36 grad_+ as n35.grad: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
-n37 relu as relu: Local/1046; double prec 10x16; mem in bytes: <not-in-yet>
+n37 relu as relu: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
 n38 grad_relu as relu.grad: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
-n39 * as n39: Local/1046; double prec 10x16; mem in bytes: <not-in-yet>
+n39 * as n39: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
 n40 grad_* as n39.grad: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
-n41 + as n41: Local/1046; double prec 10x16; mem in bytes: <not-in-yet>
+n41 + as n41: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
 n42 grad_+ as n41.grad: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
-n43 relu as relu: Local/1046; double prec 10x16; mem in bytes: <not-in-yet>
+n43 relu as relu: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
 n44 grad_relu as relu.grad: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
-n45 * as n45: Local/1046; double prec 10x1; mem in bytes: <not-in-yet>
+n45 * as n45: Local/1046; single prec 10x1; mem in bytes: <not-in-yet>
 n46 grad_* as n45.grad: Local/1046; single prec 10x1; mem in bytes: <not-in-yet>
 n47 0.5 as n47: Virt/40; single prec 1; mem in bytes: <not-in-yet>
-n48 +_mlp_@|_moons_input as mlp_moons_input: Virt/15; double prec 10x1; mem in bytes: <not-in-yet>
+n48 +_mlp_@|_moons_input as mlp_moons_input: Virt/15; single prec 10x1; mem in bytes: <not-in-yet>
 n49 grad_+_mlp_@|_moons_input as mlp_moons_input.grad: Local/1046; single prec 10x1; mem in bytes: <not-in-yet>
-n50 *. as n50: Virt/15; double prec 10x1; mem in bytes: <not-in-yet>
+n50 *. as n50: Virt/15; single prec 10x1; mem in bytes: <not-in-yet>
 n51 grad_*. as n50.grad: Local/1046; single prec 10x1; mem in bytes: <not-in-yet>
 n52 1 as 1: Virt/40; single prec 1; mem in bytes: <not-in-yet>
-n53 - as n53: Local/1046; double prec 10x1; mem in bytes: <not-in-yet>
+n53 - as n53: Local/1046; single prec 10x1; mem in bytes: <not-in-yet>
 n54 grad_- as n53.grad: Local/1046; single prec 10x1; mem in bytes: <not-in-yet>
-n55 relu_margin_loss as relu_margin_loss: Virt/15; double prec 10x1; mem in bytes: <not-in-yet>
+n55 relu_margin_loss as relu_margin_loss: Virt/15; single prec 10x1; mem in bytes: <not-in-yet>
 n56 grad_relu_margin_loss as relu_margin_loss.grad: Local/1046; single prec 10x1; mem in bytes: <not-in-yet>
 n57 10 as 10: Virt/40; single prec 1; mem in bytes: <not-in-yet>
-n58 => as n58: Local/1046; double prec 1; mem in bytes: <not-in-yet>
+n58 => as n58: Local/1046; single prec 1; mem in bytes: <not-in-yet>
 n59 grad_=> as n58.grad: Virt/40; single prec 1; mem in bytes: <not-in-yet>
-n60 /._scalar_loss as scalar_loss: Host&stream/412410; double prec 1; mem in bytes: 8
+n60 /._scalar_loss as scalar_loss: Host&stream/412410; single prec 1; mem in bytes: 4
 n61 grad_/._scalar_loss as scalar_loss.grad: Virt/40; single prec 1; mem in bytes: <not-in-yet>
 n62 2 as 2: Virt/40; single prec 1; mem in bytes: <not-in-yet>
 n63 **. as n63: Virt/40; single prec 1; mem in bytes: <not-in-yet>
 n64 -1 as n64: Virt/40; single prec 1; mem in bytes: <not-in-yet>
-n65 *. as n65: Virt/152; double prec 1; mem in bytes: <not-in-yet>
-n66 /. as n66: Host&stream/412410; double prec 1; mem in bytes: <not-in-yet>
+n65 *. as n65: Virt/152; single prec 1; mem in bytes: <not-in-yet>
+n66 /. as n66: Host&stream/412410; single prec 1; mem in bytes: <not-in-yet>
 n67 1 as 1: Virt/40; single prec 1; mem in bytes: <not-in-yet>
 n68 80 as 80: Virt/40; single prec 1; mem in bytes: <not-in-yet>
 n69 !@ as n69: Virt/152; single prec 1; mem in bytes: <not-in-yet>
diff --git a/test/einsum/moons_demo_variant.ml b/test/einsum/moons_demo_variant.ml
@@ -20,9 +20,11 @@ let () =
   let epochs = 2 in
   let steps = epochs * 2 * len / batch_size in
   let moons_config = Datasets.Half_moons.Config.{ noise_range = 0.1; seed = Some 5 } in
-  let moons_coordinates, moons_labels = Datasets.Half_moons.generate ~config:moons_config ~len () in
-  let moons_flat_ndarray = Ir.Ndarray.as_array Ir.Ops.Double moons_coordinates in
-  let moons_classes_ndarray = Ir.Ndarray.as_array Ir.Ops.Double moons_labels in
+  let moons_coordinates, moons_labels =
+    Datasets.Half_moons.generate_single_prec ~config:moons_config ~len ()
+  in
+  let moons_flat_ndarray = Ir.Ndarray.as_array Ir.Ops.Single moons_coordinates in
+  let moons_classes_ndarray = Ir.Ndarray.as_array Ir.Ops.Single moons_labels in
   let batch_n, bindings = IDX.get_static_symbol ~static_range:n_batches IDX.empty in
   let step_n, bindings = IDX.get_static_symbol bindings in
   let moons_flat = TDSL.rebatch ~l:"moons_flat" moons_flat_ndarray () in
diff --git a/test/operations/dune b/test/operations/dune
@@ -80,6 +80,13 @@
  (preprocess
   (pps ppx_here ppx_ocannl)))
 
+(test
+ (name test_threefry_precision)
+ (modules test_threefry_precision)
+ (libraries base ocannl)
+ (preprocess
+  (pps ppx_here ppx_ocannl)))
+
 (library
  (name operations_tutorials)
  (package neural_nets_lib)
diff --git a/test/operations/test_threefry_precision.ml b/test/operations/test_threefry_precision.ml
@@ -0,0 +1,32 @@
+open Base
+open Ocannl
+
+let () =
+  Utils.settings.output_debug_files_in_build_directory <- true;
+  Utils.settings.log_level <- 1;
+  let module TDSL = Operation.TDSL in
+  
+  (* Create a simple Threefry4x32 operation *)
+  let key = TDSL.number ~label:["key"] 42.0 in
+  let counter = TDSL.number ~label:["counter"] 1.0 in
+  let rng_result = TDSL.threefry4x32 ~label:["rng_result"] key counter () in
+  
+  (* Print the precision of the result *)
+  Stdlib.Printf.printf "Threefry4x32 result precision: %s\n" 
+    (Ir.Ops.prec_string (Lazy.force rng_result.value.prec));
+  
+  (* Try to use it in a computation - this should trigger the error *)
+  let uniform_result = TDSL.uint4x32_to_prec_uniform ~label:["uniform"] rng_result () in
+  Stdlib.Printf.printf "Uniform result precision: %s\n" 
+    (Ir.Ops.prec_string (Lazy.force uniform_result.value.prec));
+  let module Backend = (val Backends.fresh_backend ()) in
+  try
+    let _ctx = Train.forward_once (module Backend) uniform_result in
+    Stdlib.Printf.printf "Compilation successful!\n";
+    (* Also check the actual value precision in the context *)
+    let tn = rng_result.value in
+    Stdlib.Printf.printf "Actual tensor precision in context: %s\n" 
+      (Ir.Ops.prec_string (Lazy.force tn.prec))
+  with
+  | Utils.User_error msg -> Stdlib.Printf.printf "Error: %s\n" msg
+  | e -> Stdlib.Printf.printf "Unexpected error: %s\n" (Exn.to_string e)
diff --git a/test/training/moons_demo_parallel.ml b/test/training/moons_demo_parallel.ml
@@ -22,9 +22,11 @@ let main () =
   let epochs = 60 in
   (* let epochs = 1 in *)
   let moons_config = Datasets.Half_moons.Config.{ noise_range = 0.1; seed = Some seed } in
-  let moons_coordinates, moons_labels = Datasets.Half_moons.generate ~config:moons_config ~len () in
-  let moons_flat_ndarray = Ir.Ndarray.as_array Ir.Ops.Double moons_coordinates in
-  let moons_classes_ndarray = Ir.Ndarray.as_array Ir.Ops.Double moons_labels in
+  let moons_coordinates, moons_labels =
+    Datasets.Half_moons.generate_single_prec ~config:moons_config ~len ()
+  in
+  let moons_flat_ndarray = Ir.Ndarray.as_array Ir.Ops.Single moons_coordinates in
+  let moons_classes_ndarray = Ir.Ndarray.as_array Ir.Ops.Single moons_labels in
   let moons_flat = TDSL.rebatch ~l:"moons_flat" moons_flat_ndarray () in
   let moons_classes = TDSL.rebatch ~l:"moons_classes" moons_classes_ndarray () in
   let%op mlp x = "w3" * relu ("b2" hid_dim + ("w2" * relu ("b1" hid_dim + ("w1" * x)))) in