Commit 2d00d55 (parent d304b2c)

Third pass on bidirectional precision inference: include top-down tensors with user-specified precision in bottom-up propagation.

There'll be a fourth pass, so that instead of forcing default precisions from below, propagation starts with unspecified precisions.

9 files changed: +138 −37 lines

arrayjit/lib/tnode.ml

Lines changed: 3 additions & 0 deletions
```diff
@@ -436,6 +436,9 @@ let update_infer_prec tn delayed_prec =
       tn.delayed_prec_unsafe <-
         Default_spec (lazy (Ops.promote_prec (Lazy.force old_prec) (Lazy.force delayed_prec)))
 
+let get_specified_prec tn =
+  match tn.delayed_prec_unsafe with Specified prec -> Some prec | _ -> None
+
 let exceeds_fp16_cutoff tn c =
   match Utils.settings.check_half_prec_constants_cutoff with
   | None -> false
```

lib/precision_inference.md

Lines changed: 37 additions & 0 deletions
New file:

# Bidirectional precision inference

OCANNL features a rudimentary bidirectional precision inference. It is much less powerful than the constraint-based shape and projections inference, but it is somewhat prominent because it contributes the `top_down_prec` flag to the central `Tensor.t` type. The core algorithm is just a couple dozen lines in the `Tensor.op` function. First, the bottom-up pass:

```ocaml
let default_prec_for default get =
  if top_down_prec then
    (* For top-down precision, don't promote from inputs *)
    lazy default
  else
    (* For bottom-up precision, promote from non-top-down subtensors, and from
       top-down subtensors take only their user-specified precisions *)
    let lazy_v_precs =
      List.filter_map ordered_ts ~f:(fun ti ->
          Option.map (get ti) ~f:(fun v ->
              if ti.top_down_prec then lazy (Tn.get_specified_prec v)
              else lazy (Some (Lazy.force v.prec))))
    in
    lazy
      (List.filter_map lazy_v_precs ~f:Lazy.force
      |> List.reduce ~f:Ir.Ops.promote_prec
      |> Option.value ~default)
in
```

and later the top-down pass, shown here for the value node `v`:

```ocaml
let update_infer_prec tn prec =
  (* Instead of just checking prec, we cross-check with dims (needed for code
     generation), to catch prec-forcing bugs. *)
  if not (Lazy.is_val tn.Tn.dims) then Tn.update_infer_prec tn prec
in
(* Apply delayed top-down precision updates to parameter subtensors *)
List.iter top_down_ts ~f:(fun ti -> update_infer_prec ti.value v.Tn.prec);
```

Tensors that choose `top_down_prec=true` "detach" themselves from their defining tensor expression as far as precision goes. By default, tensors are `top_down_prec=false`; the exceptions are parameter tensors (created via `Tensor.param`) and results of the operation `uint4x32_to_prec_uniform`.
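
For intuition, here is a minimal self-contained model of the bottom-up rule. This is a sketch, not OCANNL's API: the `prec` variants, the ranking inside `promote_prec`, and the `subtensor` record are illustrative stand-ins for `Ir.Ops` and `Tensor.t`, with the laziness dropped.

```ocaml
(* Toy precision lattice; Ir.Ops.promote_prec plays this role in OCANNL.
   The ranking below is an assumption for illustration only. *)
type prec = Half | Bfloat16 | Single | Double

let promote_prec a b =
  let rank = function Half -> 0 | Bfloat16 -> 1 | Single -> 2 | Double -> 3 in
  if rank a >= rank b then a else b

(* Simplified stand-in for a subtensor's precision-relevant state. *)
type subtensor = { top_down_prec : bool; specified : prec option; prec : prec }

(* Bottom-up default: ordinary subtensors always contribute their precision;
   top-down subtensors contribute only a user-specified precision (this
   commit's change). The default applies only when nothing contributes. *)
let bottom_up_default ~default subtensors =
  match
    List.filter_map
      (fun t -> if t.top_down_prec then t.specified else Some t.prec)
      subtensors
  with
  | [] -> default
  | p :: ps -> List.fold_left promote_prec p ps

let () =
  (* Mirrors test/operations/top_down_prec.ml below: a and b are params, so
     top_down_prec = true; only b has a user-specified precision (half). *)
  let a = { top_down_prec = true; specified = None; prec = Single } in
  let b = { top_down_prec = true; specified = Some Half; prec = Half } in
  assert (bottom_up_default ~default:Single [ a; b ] = Half)
```

Under this rule, the test below (`d = (a + b) *. c` with `b` forced to `half`) infers `half` for the intermediate `a + b`, matching the expected low-level code further down.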

lib/tensor.ml

Lines changed: 13 additions & 20 deletions
```diff
@@ -254,20 +254,25 @@ let%track7_sexp op ~(label : string list) ?(ternary_op = Shape.Pointwise_tern)
   let _session_state_next_id : int = session_state.next_id in
   let shape = make_shape ~debug_name:(Tn.get_debug_name ~id ~label ()) ~id in
   (* Split subtensors by whether they use top-down precision inference *)
-  let top_down_ts, bottom_up_ts = List.partition_tf ordered_ts ~f:(fun t -> t.top_down_prec) in
-  let default_prec =
+  let top_down_ts = List.filter ordered_ts ~f:(fun t -> t.top_down_prec) in
+  let default_prec_for default get =
     if top_down_prec then
       (* For top-down precision, don't promote from inputs *)
-      lazy !default_value_prec
+      lazy default
     else
       (* For bottom-up precision, only promote from non-top-down subtensors *)
-      let lazy_v_precs = List.map bottom_up_ts ~f:(fun ti -> ti.value.prec) in
-      let default = !default_value_prec in
+      let lazy_v_precs =
+        List.filter_map ordered_ts ~f:(fun ti ->
+            Option.map (get ti) ~f:(fun v ->
+                if ti.top_down_prec then lazy (Tn.get_specified_prec v)
+                else lazy (Some (Lazy.force v.prec))))
+      in
       lazy
-        (List.map lazy_v_precs ~f:Lazy.force
+        (List.filter_map lazy_v_precs ~f:Lazy.force
         |> List.reduce ~f:Ir.Ops.promote_prec
         |> Option.value ~default)
   in
+  let default_prec = default_prec_for !default_value_prec (fun t -> Some t.value) in
   let terminal_logic () =
     let open Shape in
     match terminal_op with
@@ -357,20 +362,8 @@ let%track7_sexp op ~(label : string list) ?(ternary_op = Shape.Pointwise_tern)
       session_state.forward_roots <- Map.add_exn session_state.forward_roots ~key:id ~data:t;
       t)
     else
-      let default_prec =
-        if top_down_prec then
-          (* For top-down precision, don't promote from inputs *)
-          lazy !default_grad_prec
-        else
-          (* For bottom-up precision, only promote from non-top-down subtensors *)
-          let f ti = Option.map ti.diff ~f:(fun d -> d.grad.Tn.prec) in
-          let lazy_g_precs = List.filter_map bottom_up_ts ~f in
-          let default = !default_grad_prec in
-          lazy
-            (List.map lazy_g_precs ~f:Lazy.force
-            |> List.reduce ~f:Ir.Ops.promote_prec
-            |> Option.value ~default)
-      in
+      let get ti = Option.map ti.diff ~f:(fun d -> d.grad) in
+      let default_prec = default_prec_for !default_grad_prec get in
       let grad_id = session_state.next_id in
       session_state.next_id <- session_state.next_id + 1;
       let g =
```

test/operations/dune

Lines changed: 40 additions & 9 deletions
```diff
@@ -66,28 +66,59 @@
  (preprocess
   (pps ppx_here ppx_ocannl)))
 
+(test
+ (name top_down_prec)
+ (modules top_down_prec)
+ (libraries base ocannl)
+ (preprocess
+  (pps ppx_here ppx_ocannl)))
+
 (rule
  (alias runtest)
  (target
   (dir build_files))
  (action
-  (run
-   %{dep:threefry4x32_demo.exe}
-   "--ocannl_output_prec_in_ll_files=true"
-   "--ocannl_output_debug_files_in_build_directory=true")))
+  (progn
+   (run
+    %{dep:threefry4x32_demo.exe}
+    "--ocannl_output_prec_in_ll_files=true"
+    "--ocannl_output_debug_files_in_build_directory=true"
+    "--ocannl_clean_up_artifacts_on_startup=false")
+   (run
+    %{dep:top_down_prec.exe}
+    "--ocannl_output_prec_in_ll_files=true"
+    "--ocannl_output_debug_files_in_build_directory=true"
+    "--ocannl_clean_up_artifacts_on_startup=false"))))
+
+(rule
+ (deps "build_files/n3_fwd-unoptimized.ll")
+ (target "n3_fwd_with_prec-unoptimized.ll.actual")
+ (action
+  (copy
+   "build_files/n3_fwd-unoptimized.ll"
+   "n3_fwd_with_prec-unoptimized.ll.actual")))
+
+(rule
+ (alias runtest)
+ (action
+  (diff
+   "n3_fwd_with_prec-unoptimized.ll.expected"
+   "n3_fwd_with_prec-unoptimized.ll.actual")))
 
 (rule
- (deps "build_files/n3_fwd.ll")
- (target "n3_fwd_with_prec.ll.actual")
+ (deps "build_files/d_fwd-unoptimized.ll")
+ (target "top_down_prec-unoptimized.ll.actual")
  (action
-  (copy "build_files/n3_fwd.ll" "n3_fwd_with_prec.ll.actual")))
+  (copy
+   "build_files/d_fwd-unoptimized.ll"
+   "top_down_prec-unoptimized.ll.actual")))
 
 (rule
  (alias runtest)
  (action
   (diff
-   "n3_fwd_with_prec.ll.expected"
-   "n3_fwd_with_prec.ll.actual")))
+   "top_down_prec-unoptimized.ll.expected"
+   "top_down_prec-unoptimized.ll.actual")))
 
 (test
  (name test_vec_simple)
```
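
A side note on the test wiring: dune's `(diff expected actual)` action fails `dune runtest` when the two files diverge, and `dune promote` copies the freshly generated `.actual` files over the checked-in `.expected` ones, which is the standard way to update these snapshots.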

test/operations/n3_fwd_with_prec-unoptimized.ll.expected

Lines changed: 11 additions & 0 deletions

New file (the name follows from the dune copy/diff rules above):

```

n3_fwd (): /* n3 fwd */
random_seed<uint4x32>[] := 42;
for i2 = 0 to 5 { n1<uint4x32>[i2] := i2; }
for i4 = 0 to 5 {
  threefry4x32<uint4x32>[i4] := (random_seed<uint4x32>[] ^^^^ n1<uint4x32>[i4]);
}
for i6 = 0 to 5 {
  n3<half>[8*i6]<8> := uint4x32_to_prec_uniform(threefry4x32<uint4x32>[i6], <8>);
}
/* end */
```

test/operations/n3_fwd_with_prec.ll.expected

Lines changed: 0 additions & 8 deletions
This file was deleted.

test/operations/top_down_prec-unoptimized.ll.expected

Lines changed: 5 additions & 0 deletions

New file (the name follows from the dune copy/diff rules above):

```

d_fwd (): /* d fwd */
n6<half>[0] := (a<single>[0] + b<half>[0]);
d<bfloat16>[0] := (n6<half>[0] * c<single>[0]);
/* end */
```
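
These precisions illustrate the bottom-up rule above: `a`, `b`, `c` are parameters, hence `top_down_prec=true`, and only `b` has a user-specified precision (forced to `half` in the test below), so `n6 = a + b` is inferred as `half`; `d` is user-specified as `bfloat16` and is left untouched by inference.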

test/operations/top_down_prec.expected

Lines changed: 11 additions & 0 deletions

New file (name assumed: dune compares a test's stdout against `<test name>.expected`):

```
Retrieving commandline, environment, or config file variable ocannl_log_level
Found 0, in the config file
┌────────────────────┐
│[8]: *._d shape 0:1 │
│┌┬──────┐           │
│││axis 0│           │
│├┼──────┤           │
│││ 8.00 │           │
│└┴──────┘           │
└────────────────────┘
grad_*._d <not-hosted>
```

test/operations/top_down_prec.ml

Lines changed: 18 additions & 0 deletions
New file:

```ocaml
(* A simple test of top-down precision inference: precisions are forced on the
   parameter b and on the result d, and the generated low-level code is
   compared against top_down_prec-unoptimized.ll.expected. *)

open Base
module Tensor = Ocannl.Tensor
module Train = Ocannl.Train
module TDSL = Ocannl.Operation.TDSL
module Tn = Ir.Tnode

let () =
  Tensor.unsafe_reinitialize ();
  let module Backend = (val Backends.fresh_backend ()) in
  let%op d = ("a" [2] + "b" [2]) *. "c" [2] in
  Tn.update_prec b.value Ir.Ops.half;
  Tn.update_prec d.value Ir.Ops.bfloat16;
  (* Compile and run *)
  Ocannl.Train.set_hosted d.value;
  ignore (Ocannl.Train.forward_once (module Backend) d);
  Train.printf d
```
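
For the record, the `%op` literals `"a" [2]`, `"b" [2]`, `"c" [2]` appear to bind parameters initialized to 2 (an inference from the output, not checked against the `%op` docs), so the printed value is (2 + 2) * 2 = 8.00, matching `top_down_prec.expected` above; the inferred precisions themselves are checked by the `.ll.expected` diff rule in the dune file.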
