Commit 0a45e7b (parent: 2d00d55)

Fourth pass on bidirectional precision inference: don't force precisions from below from defaults when inferring

File tree: 7 files changed, +49 −45 lines

arrayjit/lib/tnode.ml

Lines changed: 16 additions & 19 deletions
@@ -62,7 +62,7 @@ type memory_mode =
     optional [array] of {!t}). *)
 [@@deriving sexp, compare, equal]
 
-type delayed_prec = Not_specified | Default_spec of Ops.prec Lazy.t | Specified of Ops.prec
+type delayed_prec = Default of Ops.prec | Inferred of Ops.prec Lazy.t | Specified of Ops.prec
 [@@deriving sexp, equal]
 
 type prepare = { is_done : unit -> bool; sync : unit -> unit; transfer : unit -> unit }
@@ -378,7 +378,8 @@ let update_prec ?only_if tn prec =
     | Some cond -> (
         match tn.delayed_prec_unsafe with
         | Specified old_prec -> cond old_prec
-        | Default_spec old_prec when Lazy.is_val old_prec -> cond @@ Lazy.force old_prec
+        | Default old_prec -> cond old_prec
+        | Inferred old_prec when Lazy.is_val old_prec -> cond @@ Lazy.force old_prec
         | _ -> true)
   in
   if do_update then
@@ -409,12 +410,14 @@ let update_prec ?only_if tn prec =
             ", but the precision is already set to ";
             Ops.prec_string (Lazy.force tn.prec);
           ])
-    | Default_spec old_prec, Some cond when not @@ Lazy.is_val old_prec ->
+    | Inferred old_prec, Some cond ->
         tn.delayed_prec_unsafe <-
-          Default_spec
+          Inferred
            (lazy
              (let old = Lazy.force old_prec in
               if cond old then prec else old))
+    | Default old_prec, Some cond ->
+        tn.delayed_prec_unsafe <- (if cond old_prec then Specified prec else Default old_prec)
     | _ -> tn.delayed_prec_unsafe <- Specified prec
 
 let update_infer_prec tn delayed_prec =
@@ -430,11 +433,11 @@ let update_infer_prec tn delayed_prec =
   else
     match tn.delayed_prec_unsafe with
     | Specified _ -> () (* User-specified precision has higher priority *)
-    | Not_specified -> tn.delayed_prec_unsafe <- Default_spec delayed_prec
-    | Default_spec old_prec ->
-        (* Combine with existing default precision via promotion *)
+    | Default _ -> tn.delayed_prec_unsafe <- Inferred delayed_prec
+    | Inferred old_prec ->
+        (* Combine with existing inferred precision via promotion *)
         tn.delayed_prec_unsafe <-
-          Default_spec (lazy (Ops.promote_prec (Lazy.force old_prec) (Lazy.force delayed_prec)))
+          Inferred (lazy (Ops.promote_prec (Lazy.force old_prec) (Lazy.force delayed_prec)))
 
 let get_specified_prec tn =
   match tn.delayed_prec_unsafe with Specified prec -> Some prec | _ -> None
@@ -450,9 +453,8 @@ let exceeds_fp16_cutoff tn c =
     if Lazy.is_val tn.prec then Lazy.force tn.prec
     else
       match tn.delayed_prec_unsafe with
-      | Specified prec -> prec
-      | Default_spec prec -> Lazy.force prec
-      | Not_specified -> Lazy.force tn.prec
+      | Specified prec | Default prec -> prec
+      | Inferred prec -> Lazy.force prec
   in
   Ops.is_up_to_fp16 prec
 
@@ -582,7 +584,7 @@ end)
 
 let registry = Registry.create 16
 
-let create ?default_prec ~id ~label ~dims ~padding () =
+let create delayed_prec ~id ~label ~dims ~padding () =
   let debug = "Host array for " ^ get_debug_name ~id ~label () in
   let rec array =
     lazy
@@ -594,17 +596,12 @@ let create ?default_prec ~id ~label ~dims ~padding () =
   and prec =
     lazy
       (match tn.delayed_prec_unsafe with
-      | Specified prec | Default_spec (lazy prec) -> prec
-      | Not_specified ->
-          raise @@ Utils.User_error "Tnode.update_prec: precision is not specified yet")
+      | Default prec | Specified prec | Inferred (lazy prec) -> prec)
   and size_in_bytes = lazy (num_elems tn * Ops.prec_in_bytes (Lazy.force tn.prec))
   and tn =
-    let delayed_prec_unsafe =
-      match default_prec with None -> Not_specified | Some prec -> Default_spec prec
-    in
     {
       array;
-      delayed_prec_unsafe;
+      delayed_prec_unsafe = delayed_prec;
      prec;
      dims;
      padding;
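
Taken together, these tnode.ml changes replace the overloaded `Not_specified`/`Default_spec` pair with three distinct states. Below is a minimal, self-contained sketch of the resulting semantics, not the actual OCANNL API: the `prec` stub, its promotion order, and the simplified signatures are illustrative assumptions.

```ocaml
(* Stand-in types: the real Ops.prec has more cases and a richer
   promotion lattice; the order single > bfloat16 > half is assumed
   here purely for illustration. *)
type prec = Half | Bfloat16 | Single

let promote_prec a b =
  match (a, b) with
  | Single, _ | _, Single -> Single
  | Bfloat16, _ | _, Bfloat16 -> Bfloat16
  | Half, Half -> Half

type delayed_prec =
  | Default of prec          (* a fallback; never forced onto inference *)
  | Inferred of prec Lazy.t  (* accumulated inference results *)
  | Specified of prec        (* user-set; wins over inference *)

(* The commit's headline behavior: inference replaces a [Default]
   outright instead of promoting into it, so defaults no longer leak
   upward through the expression tree. *)
let update_infer_prec state inferred =
  match state with
  | Specified _ -> state
  | Default _ -> Inferred inferred
  | Inferred old ->
      Inferred (lazy (promote_prec (Lazy.force old) (Lazy.force inferred)))

(* Conditional user updates can now test a [Default] eagerly, while an
   [Inferred] decision must stay lazy until the old value is forced. *)
let update_prec_only_if state cond prec =
  match state with
  | Default old -> if cond old then Specified prec else Default old
  | Inferred old ->
      Inferred (lazy (let o = Lazy.force old in if cond o then prec else o))
  | Specified _ -> state (* simplified: the real code may error or skip *)

(* Resolution is total: every state yields a precision, which is why the
   [Not_specified] error path in [Tnode.create] could be deleted. *)
let resolve = function
  | Default p | Specified p -> p
  | Inferred p -> Lazy.force p
```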

lib/precision_inference.md

Lines changed: 5 additions & 2 deletions
@@ -1,6 +1,10 @@
 # Bidirectional precision inference
 
-OCANNL features a rudimentary bidirectional precision inference. It is much much less powerful than the constraints-based shape and projections inference. It is somewhat prominent because it contributes the `top_down_prec` flag to the central `Tensor.t` type. The core algorithm is just a couple dozen lines in the `Tensor.op` function, first the bottom-up pass:
+OCANNL features a rudimentary bidirectional precision inference. It is much less powerful than the constraints-based shape and projections inference. It is somewhat prominent because it contributes the `top_down_prec` flag to the central `Tensor.t` type.
+
+Tensors that choose `top_down_prec=true` "detach" themselves from their defining tensor expression as far as precision goes. By default tensors are `top_down_prec=false`, except for all the parameter tensors (created via `Tensor.param`) and the results of the operation `uint4x32_to_prec_uniform`. When a tensor's precision is set by the user via `Tnode.update_prec`, this setting takes precedence over any inferences. When a `top_down_prec=true` tensor has its precision set by the user, it contributes this precision to the bottom-up inference (together with all `top_down_prec=false` subtensors).
+
+The core algorithm is just a couple dozen lines in the `Tensor.op` function, first the bottom-up pass:
 
 ```ocaml
 let default_prec_for default get =
@@ -34,4 +38,3 @@ and later the top-down pass, here from the value node `v`:
     List.iter top_down_ts ~f:(fun ti -> update_infer_prec ti.value v.Tn.prec);
 ```
 
-Tensors that choose `top_down_prec=true` "detach" themselves from their defining tensor expression as far as precision goes. By default tensors are `top_down_prec=false`, except for all the parameter tensors (created via `Tensor.param`), and results of the operation `uint4x32_to_prec_uniform`.
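
For a concrete reading of these rules, here is the scenario exercised by `test/operations/top_down_prec.ml` from this same commit, condensed (the full test diff appears further down):

```ocaml
(* Condensed from test/operations/top_down_prec.ml in this commit. *)
let%op d = ("a" [2] + "b" [2]) *. "c" [2] in
Tn.update_prec b.value Ir.Ops.half;
Tn.update_prec d.value Ir.Ops.bfloat16;
(* Bottom-up: [a] joins [b] at half inside the addition.
   Top-down: the parameter [c] has top_down_prec=true, so it receives
   bfloat16 from [d]. The updated d_fwd expected output below records
   exactly these precisions. *)
```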

lib/tensor.ml

Lines changed: 11 additions & 10 deletions
@@ -255,10 +255,10 @@ let%track7_sexp op ~(label : string list) ?(ternary_op = Shape.Pointwise_tern)
   let shape = make_shape ~debug_name:(Tn.get_debug_name ~id ~label ()) ~id in
   (* Split subtensors by whether they use top-down precision inference *)
   let top_down_ts = List.filter ordered_ts ~f:(fun t -> t.top_down_prec) in
-  let default_prec_for default get =
+  let delayed_prec_for default get =
     if top_down_prec then
       (* For top-down precision, don't promote from inputs *)
-      lazy default
+      Tn.Default default
     else
       (* For bottom-up precision, only promote from non-top-down subtensors *)
       let lazy_v_precs =
@@ -267,12 +267,13 @@ let%track7_sexp op ~(label : string list) ?(ternary_op = Shape.Pointwise_tern)
             if ti.top_down_prec then lazy (Tn.get_specified_prec v)
             else lazy (Some (Lazy.force v.prec))))
       in
-      lazy
-        (List.filter_map lazy_v_precs ~f:Lazy.force
-        |> List.reduce ~f:Ir.Ops.promote_prec
-        |> Option.value ~default)
+      Tn.Inferred
+        (lazy
+          (List.filter_map lazy_v_precs ~f:Lazy.force
+          |> List.reduce ~f:Ir.Ops.promote_prec
+          |> Option.value ~default))
   in
-  let default_prec = default_prec_for !default_value_prec (fun t -> Some t.value) in
+  let delayed_prec = delayed_prec_for !default_value_prec (fun t -> Some t.value) in
   let terminal_logic () =
     let open Shape in
     match terminal_op with
@@ -291,7 +292,7 @@ let%track7_sexp op ~(label : string list) ?(ternary_op = Shape.Pointwise_tern)
     | Some (Shape.Data (Asgns.Padded { data; padding = padding_spec; padded_value })) ->
         let padding = Some (padding_spec, padded_value) in
         Tn.create_from_padded ~id ~label ~ndarray:data ~padding ()
-    | Some (Shape.Fetch _) | None -> Tn.create ~default_prec ~id ~label ~dims ~padding ()
+    | Some (Shape.Fetch _) | None -> Tn.create delayed_prec ~id ~label ~dims ~padding ()
   in
   let update_infer_prec tn prec =
     (* Instead of just checking prec, we cross-check with dims (needed for code generation), to
@@ -363,11 +364,11 @@ let%track7_sexp op ~(label : string list) ?(ternary_op = Shape.Pointwise_tern)
       t)
   else
     let get ti = Option.map ti.diff ~f:(fun d -> d.grad) in
-    let default_prec = default_prec_for !default_grad_prec get in
+    let delayed_prec = delayed_prec_for !default_grad_prec get in
     let grad_id = session_state.next_id in
     session_state.next_id <- session_state.next_id + 1;
     let g =
-      Tn.create ~default_prec ~id:grad_id ~label:("grad" :: label) ~dims
+      Tn.create delayed_prec ~id:grad_id ~label:("grad" :: label) ~dims
        ~padding:(lazy (Shape.to_padding shape))
        ()
    in
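
Viewed in isolation, the renamed `delayed_prec_for` now commits to a `Tn.delayed_prec` constructor up front instead of returning a bare lazy precision. A standalone sketch of its two branches, reusing the stand-in `prec`, `promote_prec`, `Default`, and `Inferred` from the sketch after the tnode.ml diff (stdlib style rather than the Base combinators of the original):

```ocaml
(* Sketch of [delayed_prec_for]'s two branches. [sub_precs] stands for the
   lazily computed precisions of the subtensors; [None] entries are
   top-down subtensors without a user-specified precision. *)
let delayed_prec_for ~top_down_prec ~default
    (sub_precs : prec option Lazy.t list) : delayed_prec =
  if top_down_prec then
    (* Top-down tensors keep the default as a mere fallback, to be
       replaced by whatever the parent pushes down later. *)
    Default default
  else
    (* Bottom-up tensors promote across subtensor precisions, lazily,
       since subtensor precisions may not have been decided yet. *)
    Inferred
      (lazy
        (match List.filter_map Lazy.force sub_precs with
         | [] -> default
         | p :: rest -> List.fold_left promote_prec p rest))
```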

test/operations/dune

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@
    %{dep:threefry4x32_demo.exe}
    "--ocannl_output_prec_in_ll_files=true"
    "--ocannl_output_debug_files_in_build_directory=true"
-   "--ocannl_clean_up_artifacts_on_startup=false")
+   "--ocannl_clean_up_artifacts_on_startup=true")
   (run
    %{dep:top_down_prec.exe}
    "--ocannl_output_prec_in_ll_files=true"
Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 
 d_fwd (): /* d fwd */
-  n6<half>[0] := (a<single>[0] + b<half>[0]);
-  d<bfloat16>[0] := (n6<half>[0] * c<single>[0]);
+  n6<half>[0] := (a<half>[0] + b<half>[0]);
+  d<bfloat16>[0] := (n6<half>[0] * c<bfloat16>[0]);
 /* end */
Lines changed: 12 additions & 9 deletions
@@ -1,11 +1,14 @@
 Retrieving commandline, environment, or config file variable ocannl_log_level
 Found 0, in the config file
-┌────────────────────┐
-│[8]: *._d shape 0:1 │
-│┌┬──────┐           │
-│││axis 0│           │
-│├┼──────┤           │
-│││ 8.00 │           │
-│└┴──────┘           │
-└────────────────────┘
-grad_*._d <not-hosted>
+#8 *._d
+ 8.00
+#9 grad_*._d Virt/30
+ <void>
+#6 + Virt/152        │#4 c non-emb
+ <void>              │ 2.00
+#7 grad_+ Virt/30    │#5 grad_c Local/26030
+ <void>              │ <void>
+#0 a non-emb         │#2 b non-emb         │
+ 2.00                │ 2.00                │
+#1 grad_a Local/26030│#3 grad_b Local/26030│
+ <void>              │ <void>              │

test/operations/top_down_prec.ml

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@ let () =
   let%op d = ("a" [2] + "b" [2]) *. "c" [2] in
   Tn.update_prec b.value Ir.Ops.half;
   Tn.update_prec d.value Ir.Ops.bfloat16;
-  (* Compile and run *)
+  (* Even when the default precision is single, c is bfloat16 and a is half. *)
   Ocannl.Train.set_hosted d.value;
   ignore (Ocannl.Train.forward_once (module Backend) d);
-  Train.printf d
+  Train.printf_tree d
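
The switch from `Train.printf` to `Train.printf_tree` is what turns the boxed single-tensor printout into the whole-tree dump in the expected log above. A possible hardening of this test, sketched under assumptions and not part of the commit: it presumes `Ir.Ops.prec_string` renders precisions as `"half"`/`"bfloat16"` (as the `<half>`/`<bfloat16>` annotations in the generated code suggest) and that `a` and `c` are still in scope.

```ocaml
(* Hypothetical follow-up check, not in the commit: assert inferred
   precisions directly instead of relying on printed output. *)
let assert_prec tn expected =
  (* [prec] on a tnode is its final, lazily resolved precision. *)
  assert (String.equal (Ir.Ops.prec_string (Lazy.force tn.Tn.prec)) expected)
in
assert_prec a.value "half";
assert_prec c.value "bfloat16"
```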
