Untested: mixed-precision conversions in C_syntax; related cleanup

lukstafi · lukstafi · commit 3ea5f5831ea8 · 2024-09-12T16:35:30.000+02:00
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,8 +2,10 @@
 
 ### Added
 
-- The previously-mocked support for half precision.
+- Implemented the previously-mocked support for half precision (FP16).
   - We work around the missing Ctypes coverage by not using `Ctypes.bigarray_start`.
+  - We check FP16 constants for overflow.
+  - We output half-precision-specific code from the CUDA backend.
 
 ### Changed
 
@@ -18,6 +20,8 @@
   - `debug_log_from_routines` should only happen when `log_level > 1`.
 - Bugs in `Multicore_backend`: `await` was not checking queue emptiness, `worker`'s `Condition.broadcast` was non-atomically guarded (doesn't need to be), possible deadloop due to the lockfree queue -- now replaced with `saturn_lockfree`.
 - Reduced busy-waiting inside `c_compile_and_load`, propagating compilation errors now instead of infinite loop on error.
+- Fixed loss of significant digits for small numbers when outputting files.
+- Added missing mixed-precision conversions in the `C_syntax` backend builder.
 
 ## [0.4.0] -- 2024-09-04
 
diff --git a/arrayjit/lib/backend_utils.ml b/arrayjit/lib/backend_utils.ml
@@ -66,6 +66,7 @@ struct
 
   let pp_index_axis ppf = function
     | Indexing.Iterator it -> pp_index ppf it
+    | Fixed_idx i when i < 0 -> Stdlib.Format.fprintf ppf "(%d)" i
     | Fixed_idx i -> Stdlib.Format.fprintf ppf "%d" i
 
   let pp_array_offset ppf (idcs, dims) =
@@ -223,33 +224,38 @@ struct
       | Binop (_, v1, v2) -> pp_top_locals ppf v1 + pp_top_locals ppf v2
       | Unop (_, v) -> pp_top_locals ppf v
     and pp_float prec ppf value =
-      let num_typ = B.typ_of_prec prec in
       let loop = pp_float prec in
       match value with
       | Local_scope { id; _ } ->
           (* Embedding of Local_scope is done by pp_top_locals. *)
           loop ppf @@ Get_local id
       | Get_local id ->
-          let get_typ = B.typ_of_prec id.tn.prec in
-          if not @@ String.equal num_typ get_typ then fprintf ppf "(%s)" num_typ;
-          fprintf ppf "v%d" id.scope_id
+          let prefix, postfix = B.convert_precision ~from:id.tn.prec ~to_:prec in
+          fprintf ppf "%sv%d%s" prefix id.scope_id postfix
       | Get_global (Ops.Merge_buffer { source_node_id }, Some idcs) ->
           let tn = Option.value_exn ~here:[%here] @@ Tn.find ~id:source_node_id in
-          fprintf ppf "@[<2>((%s*)merge_buffer)[%a@;<0 -2>]@]" (B.typ_of_prec prec) pp_array_offset
+          let prefix, postfix = B.convert_precision ~from:tn.prec ~to_:prec in
+          fprintf ppf "@[<2>%smerge_buffer[%a@;<0 -2>]%s@]" prefix pp_array_offset
             (idcs, Lazy.force tn.dims)
+            postfix
       | Get_global _ -> failwith "C_syntax: Get_global / FFI NOT IMPLEMENTED YET"
       | Get (tn, idcs) ->
-          (* FIXME: implement type casts here and in other places to support mixed precision. *)
           Hash_set.add visited tn;
           let ident = get_ident tn in
-          fprintf ppf "@[<2>%s[%a@;<0 -2>]@]" ident pp_array_offset (idcs, Lazy.force tn.dims)
+          let prefix, postfix = B.convert_precision ~from:tn.prec ~to_:prec in
+          fprintf ppf "@[<2>%s%s[%a@;<0 -2>]%s@]" prefix ident pp_array_offset
+            (idcs, Lazy.force tn.dims)
+            postfix
       | Constant c ->
           let prefix, postfix = B.convert_precision ~from:Ops.double ~to_:prec in
+          let prefix, postfix =
+            if String.is_empty prefix && Float.(c < 0.0) then ("(", ")" ^ postfix)
+            else (prefix, postfix)
+          in
           fprintf ppf "%s%.16g%s" prefix c postfix
       | Embed_index idx ->
-          if not @@ List.exists ~f:(String.equal num_typ) [ "int"; "size_t" ] then
-            fprintf ppf "(%s)" num_typ;
-          pp_index_axis ppf idx
+          let prefix, postfix = B.convert_precision ~from:Ops.double ~to_:prec in
+          fprintf ppf "%s%a%s" prefix pp_index_axis idx postfix
       | Binop (Arg1, v1, _v2) -> loop ppf v1
       | Binop (Arg2, _v1, v2) -> loop ppf v2
       | Binop (op, v1, v2) ->
@@ -259,31 +265,39 @@ struct
           let prefix, postfix = B.unop_syntax prec op in
           fprintf ppf "@[<1>%s%a@]%s" prefix loop v postfix
     and debug_float prec (value : Low_level.float_t) : string * 'a list =
-      let num_typ = B.typ_of_prec prec in
       let loop = debug_float prec in
       match value with
       | Local_scope { id; _ } ->
           (* Not printing the inlined definition: (1) code complexity; (2) don't overload the debug
              logs. *)
           loop @@ Get_local id
       | Get_local id ->
-          let get_typ = B.typ_of_prec id.tn.prec in
-          let v =
-            (if not @@ String.equal num_typ get_typ then "(" ^ num_typ ^ ")" else "")
-            ^ "v" ^ Int.to_string id.scope_id
-          in
+          let prefix, postfix = B.convert_precision ~from:id.tn.prec ~to_:prec in
+          let v = String.concat [ prefix; "v"; Int.to_string id.scope_id; postfix ] in
           (v ^ "{=%g}", [ `Value v ])
       | Get_global (Ops.Merge_buffer { source_node_id }, Some idcs) ->
           let tn = Option.value_exn ~here:[%here] @@ Tn.find ~id:source_node_id in
+          let prefix, postfix = B.convert_precision ~from:tn.prec ~to_:prec in
           let dims = Lazy.force tn.dims in
-          let v = sprintf "@[<2>merge_buffer[%s@;<0 -2>]@]" (array_offset_to_string (idcs, dims)) in
-          ("merge_buffer[%u]{=%g}", [ `Accessor (idcs, dims); `Value v ])
+          let v =
+            sprintf "@[<2>%smerge_buffer[%s@;<0 -2>]%s@]" prefix
+              (array_offset_to_string (idcs, dims))
+              postfix
+          in
+          ( String.concat [ prefix; "merge_buffer[%u]"; postfix; "{=%g}" ],
+            [ `Accessor (idcs, dims); `Value v ] )
       | Get_global _ -> failwith "Exec_as_cuda: Get_global / FFI NOT IMPLEMENTED YET"
       | Get (tn, idcs) ->
           let dims = Lazy.force tn.dims in
           let ident = get_ident tn in
-          let v = sprintf "@[<2>%s[%s@;<0 -2>]@]" ident (array_offset_to_string (idcs, dims)) in
-          (ident ^ "[%u]{=%g}", [ `Accessor (idcs, dims); `Value v ])
+          let prefix, postfix = B.convert_precision ~from:tn.prec ~to_:prec in
+          let v =
+            sprintf "@[<2>%s%s[%s@;<0 -2>]%s@]" prefix ident
+              (array_offset_to_string (idcs, dims))
+              postfix
+          in
+          ( String.concat [ prefix; ident; "[%u]"; postfix; "{=%g}" ],
+            [ `Accessor (idcs, dims); `Value v ] )
       | Constant c ->
           let prefix, postfix = B.convert_precision ~from:Ops.double ~to_:prec in
           (prefix ^ Float.to_string c ^ postfix, [])
diff --git a/arrayjit/lib/cuda_backend.cudajit.ml b/arrayjit/lib/cuda_backend.cudajit.ml
@@ -376,7 +376,7 @@ struct
     | Half_prec _, Half_prec _
     | Byte_prec _, Byte_prec _
     | Void_prec, Void_prec ->
-        ("(", ")")
+        ("", "")
     | Double_prec _, Half_prec _ -> ("__double2half(", ")")
     | Single_prec _, Half_prec _ -> ("__float2half(", ")")
     | Byte_prec _, Half_prec _ -> ("__ushort2half_rn((unsigned short int)", ")")
diff --git a/arrayjit/lib/low_level.ml b/arrayjit/lib/low_level.ml
@@ -739,17 +739,14 @@ let simplify_llc llc =
   let check_constant =
     match Utils.settings.check_half_prec_constants_cutoff with
     | None -> fun _prec _c -> ()
-    | Some cutoff -> (
+    | Some cutoff ->
         fun tn c ->
-          match tn.Tn.prec with
-          | Ops.Half_prec _ ->
-              if Float.(abs c >= cutoff) then
-                raise
-                @@ Utils.User_error
-                     ("Constant " ^ Float.to_string c
-                    ^ " is too big for FP16 aka. half precision, risk of overflow; increase \
-                       precision of tensor node " ^ Tn.debug_name tn)
-          | _ -> ())
+          if Ops.is_fp16 tn.Tn.prec && Float.(abs c >= cutoff) then
+            raise
+            @@ Utils.User_error
+                 ("Constant " ^ Float.to_string c
+                ^ " is too big for FP16 aka. half precision, risk of overflow; increase precision \
+                   of tensor node " ^ Tn.debug_name tn)
   in
   let rec check_proc llc =
     let loop = check_proc in
diff --git a/arrayjit/lib/ops.ml b/arrayjit/lib/ops.ml
@@ -28,6 +28,7 @@ let byte = Byte_prec Byte
 let half = Half_prec Half
 let single = Single_prec Single
 let double = Double_prec Double
+let is_fp16 = function Half_prec _ -> true | _ -> false
 
 let sexp_of_prec = function
   | Void_prec -> Sexp.Atom "Void_prec"
@@ -226,7 +227,7 @@ let c_convert_precision ~from ~to_ =
   | Half_prec _, Half_prec _
   | Byte_prec _, Byte_prec _
   | Void_prec, Void_prec ->
-      ("(", ")")
+      ("", "")
   | _ -> ("(" ^ c_typ_of_prec to_ ^ ")(", ")")
 
 (** {2 *** Global references ***} *)