ahrefs
diff --git a/arrayjit/lib/c_syntax.ml b/arrayjit/lib/c_syntax.ml
Lines changed: 18 additions & 3 deletions
diff --git a/arrayjit/lib/cc_backend.ml b/arrayjit/lib/cc_backend.ml
Lines changed: 86 additions & 11 deletions
diff --git a/arrayjit/lib/cuda_backend.ml b/arrayjit/lib/cuda_backend.ml
Lines changed: 20 additions & 23 deletions
diff --git a/arrayjit/lib/metal_backend.ml b/arrayjit/lib/metal_backend.ml
Lines changed: 17 additions & 10 deletions
@@ -175,7 +175,7 @@ struct
     match op with
     | Ops.Satur01_gate -> (
         match prec with
-        | Ops.Byte_prec _ ->
+        | Ops.Byte_prec _ | Ops.Uint16_prec _ | Ops.Int32_prec _ | Ops.Fp8_prec _ ->
             let open PPrint in
             group
               (parens
@@ -185,10 +185,25 @@ struct
                       ^^ string " < 1.0f"))
                  ^^ ifflat
                       (space ^^ string "?" ^^ space ^^ v2 ^^ space ^^ string ":" ^^ space
-                     ^^ string "(unsigned char)0")
+                     ^^ string "(" ^^ string (typ_of_prec prec) ^^ string ")0")
                       (nest 2
                          (break 1 ^^ string "?" ^^ space ^^ v2 ^^ break 1 ^^ string ":" ^^ space
-                        ^^ string "(unsigned char)0"))))
+                        ^^ string "(" ^^ string (typ_of_prec prec) ^^ string ")0"))))
+        | Ops.Bfloat16_prec _ ->
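+            (* For CC backend, compare as float. NOTE(review): the emitted (float) is a numeric cast of the stored value, not a bfloat16 decode - confirm this is intended *)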
+            let open PPrint in
+            group
+              (parens
+                 (group
+                    (parens
+                       (string "(float)" ^^ v1 ^^ string " > 0.0f && (float)" ^^ v1
+                      ^^ string " < 1.0f"))
+                 ^^ ifflat
+                      (space ^^ string "?" ^^ space ^^ v2 ^^ space ^^ string ":" ^^ space
+                     ^^ string "(unsigned short)0")
+                      (nest 2
+                         (break 1 ^^ string "?" ^^ space ^^ v2 ^^ break 1 ^^ string ":" ^^ space
+                        ^^ string "(unsigned short)0"))))
         | Ops.Half_prec _ ->
             let open PPrint in
             group
 
@@ -83,16 +83,30 @@ let%track7_sexp c_compile_and_load ~f_name =
   Stdlib.Gc.finalise finalize result;
   result
 
-let%diagn_sexp compile ~(name : string) bindings (lowered : Low_level.optimized) : procedure =
-  let module Syntax = C_syntax.C_syntax (C_syntax.Pure_C_config (struct
+module CC_syntax_config (Procs : sig
+  val procs : Low_level.optimized array
+end) =
+struct (* C-syntax configuration shared by [compile] and [compile_batch], parameterized by the lowered procedures. *)
+  include C_syntax.Pure_C_config (struct
     type nonrec buffer_ptr = buffer_ptr
 
     let use_host_memory = use_host_memory
-    let procs = [| lowered |]
+    let procs = Procs.procs
 
     let full_printf_support =
       not @@ Bool.of_string
       @@ Utils.get_global_arg ~default:"false" ~arg_name:"prefer_backend_uniformity"
+  end)
+
+  (* Meant as overrides. NOTE(review): the custom definitions at the end of this file are commented out, so these presumably just rebind the included values - confirm *)
+  let typ_of_prec = typ_of_prec
+  let extra_declarations = extra_declarations  (* Intended to carry the bfloat16/fp8 conversion helpers *)
+  let convert_precision = convert_precision
+end
+
+let%diagn_sexp compile ~(name : string) bindings (lowered : Low_level.optimized) : procedure =
+  let module Syntax = C_syntax.C_syntax (CC_syntax_config (struct
+    let procs = [| lowered |]
   end)) in
   (* FIXME: do we really want all of them, or only the used ones? *)
   let idx_params = Indexing.bound_symbols bindings in
@@ -110,15 +124,8 @@ let%diagn_sexp compile ~(name : string) bindings (lowered : Low_level.optimized)
 
 let%diagn_sexp compile_batch ~names bindings (lowereds : Low_level.optimized option array) :
     procedure option array =
-  let module Syntax = C_syntax.C_syntax (C_syntax.Pure_C_config (struct
-    type nonrec buffer_ptr = buffer_ptr
-
-    let use_host_memory = use_host_memory
+  let module Syntax = C_syntax.C_syntax (CC_syntax_config (struct
     let procs = Array.filter_opt lowereds
-
-    let full_printf_support =
-      not @@ Bool.of_string
-      @@ Utils.get_global_arg ~default:"false" ~arg_name:"prefer_backend_uniformity"
   end)) in
   (* FIXME: do we really want all of them, or only the used ones? *)
   let idx_params = Indexing.bound_symbols bindings in
@@ -203,3 +210,71 @@ let%track3_sexp link_compiled ~merge_buffer ~runner_label ctx_arrays (code : pro
         description = "executes " ^ code.name ^ " on " ^ runner_label;
         work;
       } )
+(*
+let typ_of_prec = function
+  | Ops.Byte_prec _ | Ops.Fp8_prec _ -> "unsigned char"  (* Fp8: stored as uint8, emulated as float *)
+  | Ops.Uint16_prec _ | Ops.Bfloat16_prec _ -> "unsigned short"  (* Bfloat16: stored as uint16, emulated as float *)
+  | Ops.Int32_prec _ -> "int"
+  | Ops.Half_prec _ -> "_Float16"
+  | Ops.Single_prec _ -> "float"
+  | Ops.Double_prec _ -> "double"
+  | Ops.Void_prec -> "void"
+
+(* C helper functions for bfloat16 and fp8 conversions; bit reinterpretation goes through a union to avoid strict-aliasing UB *)
+let extra_declarations =
+  [
+    "/* Emulation functions for special float types */";
+    "static inline float bfloat16_to_float(unsigned short bf16) {";
+    "    union { unsigned int raw; float value; } pun = { ((unsigned int)bf16) << 16 };";
+    "    return pun.value;";
+    "}";
+    "";
+    "static inline unsigned short float_to_bfloat16(float f) {";
+    "    union { float value; unsigned int raw; } pun = { f };";
+    "    if ((pun.raw & 0x7FFFFFFFu) > 0x7F800000u) return (unsigned short)((pun.raw >> 16) | 0x0040u);";  (* NaN stays a quiet NaN; the rounding add below could carry it into the sign bit *)
+    "    return (unsigned short)((pun.raw + 0x7FFFu + ((pun.raw >> 16) & 1u)) >> 16);";  (* round to nearest, ties to even *)
+    "}";
+    "";
+    "/* Simplified FP8 E5M2 format emulation */";
+    "static inline float fp8_to_float(unsigned char fp8) {";
+    "    if (fp8 == 0) return 0.0f;";
+    "    unsigned int sign = (fp8 >> 7) & 1;";
+    "    unsigned int exp = (fp8 >> 2) & 0x1F;";
+    "    unsigned int mant = fp8 & 0x3;";
+    "    float result = (1.0f + mant * 0.25f) * powf(2.0f, (float)exp - 15.0f);";
+    "    return sign ? -result : result;";
+    "}";
+    "";
+    "static inline unsigned char float_to_fp8(float f) {";
+    "    if (f == 0.0f) return 0;";
+    "    unsigned int sign = (f < 0) ? 1 : 0;";
+    "    f = fabsf(f);";
+    "    int exp = (int)floorf(log2f(f)) + 15;";
+    "    if (exp < 0) return 0;";
+    "    if (exp > 31) return sign ? 0xFF : 0x7F;";
+    "    float mant = f / powf(2.0f, (float)exp - 15.0f) - 1.0f;";
+    "    unsigned int mant_bits = (unsigned int)(mant * 4.0f + 0.5f);";
+    "    if (mant_bits > 3) mant_bits = 3;";
+    "    return (unsigned char)((sign << 7) | ((exp & 0x1F) << 2) | (mant_bits & 0x3));";
+    "}";
+  ]
+
+let convert_precision ~from ~to_ =
+  match (from, to_) with
+  | p1, p2 when Ops.equal_prec p1 p2 -> ("", "")
+  | Ops.Bfloat16_prec _, Ops.Fp8_prec _ -> ("float_to_fp8(bfloat16_to_float(", "))")  (* both sides are emulated: decode, then encode *)
+  | Ops.Fp8_prec _, Ops.Bfloat16_prec _ -> ("float_to_bfloat16(fp8_to_float(", "))")
+  | Ops.Bfloat16_prec _, Ops.Single_prec _ -> ("bfloat16_to_float(", ")")
+  | Ops.Bfloat16_prec _, Ops.Double_prec _ -> ("(double)bfloat16_to_float(", ")")
+  | Ops.Single_prec _, Ops.Bfloat16_prec _ -> ("float_to_bfloat16(", ")")
+  | Ops.Double_prec _, Ops.Bfloat16_prec _ -> ("float_to_bfloat16((float)", ")")
+  | Ops.Fp8_prec _, Ops.Single_prec _ -> ("fp8_to_float(", ")")
+  | Ops.Fp8_prec _, Ops.Double_prec _ -> ("(double)fp8_to_float(", ")")
+  | Ops.Single_prec _, Ops.Fp8_prec _ -> ("float_to_fp8(", ")")
+  | Ops.Double_prec _, Ops.Fp8_prec _ -> ("float_to_fp8((float)", ")")
+  | Ops.Bfloat16_prec _, _ -> ("(float)bfloat16_to_float(", ")")  (* Convert via float *)
+  | _, Ops.Bfloat16_prec _ -> ("float_to_bfloat16((float)", ")")
+  | Ops.Fp8_prec _, _ -> ("(float)fp8_to_float(", ")")  (* Convert via float *)
+  | _, Ops.Fp8_prec _ -> ("float_to_fp8((float)", ")")
+  | _ -> Ops.c_convert_precision ~from ~to_
+*)
@@ -281,10 +281,14 @@ end) : Ir.Backend_impl.Lowered_backend = struct
 
     let typ_of_prec = function
       | Ops.Byte_prec _ -> "unsigned char"
-      | Half_prec _ -> "__half"
-      | Single_prec _ -> "float"
-      | Double_prec _ -> "double"
-      | Void_prec -> "void"
+      | Ops.Uint16_prec _ -> "unsigned short"
+      | Ops.Int32_prec _ -> "int"
+      | Ops.Half_prec _ -> "__half"
+      | Ops.Bfloat16_prec _ -> "__nv_bfloat16"  (* CUDA bfloat16 type; NOTE(review): presumably requires cuda_bf16.h in the kernel preamble - confirm *)
+      | Ops.Fp8_prec _ -> "__nv_fp8_e5m2"  (* CUDA FP8 type (E5M2 format); NOTE(review): presumably requires cuda_fp8.h - confirm *)
+      | Ops.Single_prec _ -> "float"
+      | Ops.Double_prec _ -> "double"
+      | Ops.Void_prec -> "void"
 
     let binop_syntax prec v =
       (* TODO: consider using binop_syntax inherited from Pure_C_config and overriding only where
@@ -317,9 +321,14 @@ end) : Ir.Backend_impl.Lowered_backend = struct
               (string "hexp2(hlog2(" ^^ v1 ^^ string "),"
               ^^ ifflat (space ^^ v2) (nest 2 (break 1 ^^ v2))
               ^^ string ")")
-      | ToPowOf, Byte_prec _ ->
-          invalid_arg "Cuda_backend.binop_syntax: ToPowOf not supported for byte/integer precisions"
-      | Relu_gate, Byte_prec _ ->
+      | ToPowOf, (Byte_prec _ | Uint16_prec _ | Int32_prec _ | Fp8_prec _) ->
+          invalid_arg "Cuda_backend.binop_syntax: ToPowOf not supported for integer precisions"
+      | ToPowOf, Bfloat16_prec _ ->
+          fun v1 v2 ->
+            group
+              (string "__float2bfloat16(powf(__bfloat162float(" ^^ v1 ^^ string "), __bfloat162float("
+              ^^ v2 ^^ string ")))")
+      | Relu_gate, (Byte_prec _ | Uint16_prec _ | Int32_prec _ | Fp8_prec _) ->
           fun v1 v2 ->
             group
               (parens
@@ -330,31 +339,19 @@ end) : Ir.Backend_impl.Lowered_backend = struct
                       (nest 2
                          (break 1 ^^ string "?" ^^ space ^^ v2 ^^ break 1 ^^ string ":" ^^ space
                         ^^ string "0"))))
-      | Relu_gate, Half_prec _ ->
+      | Relu_gate, Bfloat16_prec _ ->
           fun v1 v2 ->
             group
               (parens
                  (group
                     (parens
-                       (string "__hgt(" ^^ v1 ^^ comma
-                       ^^ string " __ushort_as_half((unsigned short)0x0000U))"))
-                 ^^ ifflat
-                      (space ^^ string "?" ^^ space ^^ v2 ^^ space ^^ string ":" ^^ space
-                      ^^ string "__ushort_as_half((unsigned short)0x0000U)")
-                      (nest 2
-                         (break 1 ^^ string "?" ^^ space ^^ v2 ^^ break 1 ^^ string ":" ^^ space
-                         ^^ string "__ushort_as_half((unsigned short)0x0000U)"))))
-      | Relu_gate, _ ->
-          fun v1 v2 ->
-            group
-              (parens
-                 (group (parens (v1 ^^ string " > 0.0"))
+                       (string "__bfloat162float(" ^^ v1 ^^ string ") > 0.0f"))
                  ^^ ifflat
                       (space ^^ string "?" ^^ space ^^ v2 ^^ space ^^ string ":" ^^ space
-                     ^^ string "0.0")
+                      ^^ string "__float2bfloat16(0.0f)")
                       (nest 2
                          (break 1 ^^ string "?" ^^ space ^^ v2 ^^ break 1 ^^ string ":" ^^ space
-                        ^^ string "0.0"))))
+                         ^^ string "__float2bfloat16(0.0f)"))))
       | Satur01_gate, Byte_prec _ ->
           fun v1 v2 ->
             group
 
@@ -163,7 +163,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
            queue_desc / queue itself. *)
         let created_q = Me.CommandQueue.on_device_with_descriptor metal_device queue_desc in
         (* Store the log_entries_ref for later retrieval, associated with the stream_id which will
-           be assigned by make_stream shortly. We\'ll add it after make_stream. *)
+           be assigned by make_stream shortly. We'll add it after make_stream. *)
         (created_q, Some log_entries_ref))
       else (Me.CommandQueue.on_device metal_device, None)
     in
@@ -440,18 +440,25 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     let extra_declarations = [ "using namespace metal;" ]
 
     let typ_of_prec = function
-      | Ops.Byte_prec _ -> "uint8_t"
-      | Half_prec _ -> "half"
-      | Single_prec _ -> "float"
-      | Double_prec _ -> "double"
-      | Void_prec -> "void"
+      | Ops.Byte_prec _ -> "uchar"
+      | Ops.Uint16_prec _ -> "ushort"
+      | Ops.Int32_prec _ -> "int"
+      | Ops.Half_prec _ -> "half"
+      | Ops.Bfloat16_prec _ -> "bfloat"  (* Metal supports bfloat16 natively; NOTE(review): presumably only from MSL 3.1 on - confirm the minimum language version *)
+      | Ops.Fp8_prec _ -> invalid_arg "Metal backend does not support FP8 precision"  (* raises Invalid_argument rather than returning a type name *)
+      | Ops.Single_prec _ -> "float"
+      | Ops.Double_prec _ -> "double"  (* NOTE(review): MSL has no double type as far as I know - confirm this branch is never emitted *)
+      | Ops.Void_prec -> "void"
 
     let metal_prec_suffix_float = function
-      (* Suffix for float literals like 0.0, 1.0 *)
+      | Ops.Byte_prec _ -> ""  (* Suffix for float literals like 0.0, 1.0; the integer precisions get none *)
+      | Ops.Uint16_prec _ -> ""
+      | Ops.Int32_prec _ -> ""
       | Ops.Half_prec _ -> "h"
+      | Ops.Bfloat16_prec _ -> "bf"  (* TODO: Verify actual Metal suffix for bfloat16; NOTE(review): the MSL spec appears to list bf/BF - confirm *)
+      | Ops.Fp8_prec _ -> invalid_arg "Metal backend does not support FP8 precision"  (* raises Invalid_argument, unlike the other arms *)
       | Ops.Single_prec _ -> "f"
-      | Ops.Double_prec _ -> "" (* No suffix for double literals *)
-      | Ops.Byte_prec _ -> ""
+      | Ops.Double_prec _ -> ""  (* No suffix for double literals *)
       | Ops.Void_prec -> ""
 
     let ternop_syntax _prec op =
@@ -661,7 +668,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
 
     let work () : unit =
       [%log3_result "Launching", func_name, "on", runner_label];
-      (* Unlike CUDA, we don\'t use Utils.add_log_processor here. Logs are captured by the LogState
+      (* Unlike CUDA, we don't use Utils.add_log_processor here. Logs are captured by the LogState
          handler installed on the CommandQueue. They will be processed by Utils.log_trace_tree in
          `await`. *)
       try