Claude's second pass at adding BF16, FP8: conversion functions

lukstafi · lukstafi · commit 97f756c4f7ee · 2025-05-30T16:11:50.000+02:00
Now struggling with a build bug causing arrayjit/test to read its parent ocannl_config.
diff --git a/.gitignore b/.gitignore
@@ -32,6 +32,7 @@ _opam/
 
 # Local configuration
 ocannl_config
+!arrayjit/test/ocannl_config
 !test/ocannl_config
 !test/config/ocannl_config
 !test_ppx/ocannl_config
diff --git a/arrayjit/lib/arrayjit_stubs.c b/arrayjit/lib/arrayjit_stubs.c
@@ -0,0 +1,127 @@
+#include <caml/alloc.h>
+#include <caml/memory.h>
+#include <caml/mlvalues.h>
+#include <math.h>
+#include <stdint.h>
+
+/* BFloat16 to Float conversion.
+
+   v_bf16: an OCaml int; only its low 16 bits are used, interpreted as a
+           bfloat16 bit pattern (1 sign bit, 8 exponent bits, 7 mantissa bits).
+   Returns a freshly allocated OCaml float holding the same value. */
+CAMLprim value arrayjit_bfloat16_to_float(value v_bf16)
+{
+  CAMLparam1(v_bf16);
+  uint16_t bf16 = (uint16_t)Int_val(v_bf16);
+
+  /* A bfloat16 is the upper half of a 32-bit float, so widening is a 16-bit
+     left shift. The bits are reinterpreted through a union: dereferencing a
+     uint32_t object through a float* breaks C's strict-aliasing rule and is
+     undefined behaviour that optimizing compilers may miscompile. */
+  union { uint32_t bits; float value; } pun;
+  pun.bits = ((uint32_t)bf16) << 16;
+
+  CAMLreturn(caml_copy_double((double)pun.value));
+}
+
+/* Float to BFloat16 conversion.
+
+   v_float: an OCaml float; it is first narrowed to a 32-bit float.
+   Returns an OCaml int in [0, 0xFFFF] holding the bfloat16 bit pattern.
+   Finite values and infinities are rounded to nearest, ties to even; values
+   too large for bfloat16 round to infinity. NaN inputs always yield a NaN. */
+CAMLprim value arrayjit_float_to_bfloat16(value v_float)
+{
+  CAMLparam1(v_float);
+
+  /* Reinterpret the float's bits through a union; a uint32_t* cast of a float
+     object breaks C's strict-aliasing rule (undefined behaviour). */
+  union { float value; uint32_t bits; } pun;
+  pun.value = (float)Double_val(v_float);
+  uint32_t f32 = pun.bits;
+  uint16_t bf16;
+
+  if ((f32 & 0x7FFFFFFFu) > 0x7F800000u) {
+    /* NaN: the rounding add below must not be applied. It would turn a NaN
+       whose payload lives only in the low 16 bits into an infinity, and wrap
+       0xFFFFFFFF around to zero. Truncate instead and force the top mantissa
+       bit so the result is still a NaN. */
+    bf16 = (uint16_t)((f32 >> 16) | 0x0040u);
+  } else {
+    /* Round to nearest even: add half an ULP of the kept part, plus one more
+       when the kept part is odd so that exact ties go to the even neighbour. */
+    uint32_t rounded = f32 + 0x7FFFu + ((f32 >> 16) & 1u);
+    bf16 = (uint16_t)(rounded >> 16);
+  }
+
+  CAMLreturn(Val_int(bf16));
+}
+
+/* Decode an FP8 value in E5M2 layout (1 sign bit, 5 exponent bits with bias
+   15, 2 mantissa bits) into an OCaml float.
+
+   v_fp8: an OCaml int; only its low 8 bits are used.
+   Returns a freshly allocated OCaml float. */
+CAMLprim value arrayjit_fp8_to_float(value v_fp8)
+{
+  CAMLparam1(v_fp8);
+  const unsigned bits = (unsigned)Int_val(v_fp8) & 0xFFu;
+  const int negative = (bits & 0x80u) != 0;
+  const int biased_exp = (int)((bits >> 2) & 0x1Fu);
+  const unsigned frac = bits & 0x3u;
+  double decoded;
+
+  if (bits == 0) {
+    /* Positive zero. */
+    decoded = 0.0;
+  } else if (biased_exp == 0x1F) {
+    /* All-ones exponent: infinity when the mantissa is zero, NaN otherwise. */
+    if (frac != 0) {
+      decoded = (double)NAN;
+    } else {
+      float inf = negative ? -INFINITY : INFINITY;
+      decoded = (double)inf;
+    }
+  } else {
+    float magnitude;
+    if (biased_exp == 0) {
+      /* Subnormal: no implicit leading one, fixed exponent of -14. */
+      magnitude = ldexpf((float)frac / 4.0f, -14);
+    } else {
+      /* Normal: implicit leading one, exponent unbiased by 15. */
+      magnitude = (1.0f + (float)frac * 0.25f) * ldexpf(1.0f, biased_exp - 15);
+    }
+    if (negative) magnitude = -magnitude;
+    decoded = (double)magnitude;
+  }
+
+  CAMLreturn(caml_copy_double(decoded));
+}
+
+/* Float to FP8 E5M2 conversion (1 sign bit, 5 exponent bits with bias 15,
+   2 mantissa bits).
+
+   v_float: an OCaml float; it is first narrowed to a 32-bit float.
+   Returns an OCaml int in [0, 0xFF] holding the FP8 bit pattern.
+
+   - NaN encodes as 0x7F.
+   - Zeros and infinities keep their sign.
+   - Everything else is rounded to nearest, ties to even, including into the
+     subnormal range (multiples of 2^-16 below 2^-14).
+   - A mantissa that rounds up past 1.75 carries into the exponent, so values
+     at or above 61440 become infinity. */
+CAMLprim value arrayjit_float_to_fp8(value v_float)
+{
+  CAMLparam1(v_float);
+  float f = (float)Double_val(v_float);
+
+  if (isnan(f)) {
+    CAMLreturn(Val_int(0x7F));  /* NaN: exp=0x1F, mant!=0 */
+  }
+
+  /* signbit (rather than f < 0) so that -0.0 keeps its sign. */
+  uint32_t sign = signbit(f) ? 1 : 0;
+  f = fabsf(f);
+
+  if (f == 0.0f) {
+    CAMLreturn(Val_int(sign << 7));
+  }
+  if (isinf(f)) {
+    CAMLreturn(Val_int((sign << 7) | 0x7C));  /* Infinity: exp=0x1F, mant=0 */
+  }
+
+  /* f = mant_f * 2^exp_val with mant_f in [0.5, 1), i.e.
+     f = (2 * mant_f) * 2^(exp_val - 1) with 2 * mant_f in [1, 2).
+     The biased exponent is therefore (exp_val - 1) + 15. */
+  int exp_val;
+  float mant_f = frexpf(f, &exp_val);
+  int exp = exp_val + 14;
+
+  /* Mantissa expressed in units of the last kept bit, before rounding. */
+  float scaled;
+  if (exp <= 0) {
+    /* Subnormal range: value = mant * 2^-16 with a zero exponent field. */
+    exp = 0;
+    scaled = ldexpf(f, 16);
+  } else {
+    /* Normal range: the fraction is 2 * mant_f - 1 in [0, 1) and there are
+       4 mantissa steps per unit, hence (mant_f - 0.5) * 8 in [0, 4). */
+    scaled = (mant_f - 0.5f) * 8.0f;
+  }
+
+  /* Round to nearest, ties to even, without depending on the FP environment. */
+  uint32_t mant_bits = (uint32_t)scaled;
+  float remainder = scaled - (float)mant_bits;
+  if (remainder > 0.5f || (remainder == 0.5f && (mant_bits & 1u))) {
+    mant_bits++;
+  }
+
+  /* Adding (not OR-ing) lets mant_bits == 4 carry into the exponent field:
+     the largest subnormal rounds up to the smallest normal, and a normal
+     rounds up to the next binade. Anything reaching the all-ones exponent
+     has overflowed and becomes infinity. */
+  uint32_t magnitude = ((uint32_t)exp << 2) + mant_bits;
+  if (magnitude > 0x7C) magnitude = 0x7C;
+
+  CAMLreturn(Val_int((sign << 7) | magnitude));
+}
diff --git a/arrayjit/lib/cc_backend.ml b/arrayjit/lib/cc_backend.ml
@@ -209,72 +209,4 @@ let%track3_sexp link_compiled ~merge_buffer ~runner_label ctx_arrays (code : pro
         context_lifetime = (ctx_arrays, code);
         description = "executes " ^ code.name ^ " on " ^ runner_label;
         work;
-      } )
-(*
-let typ_of_prec = function
-  | Ops.Byte_prec _ -> "unsigned char"
-  | Ops.Uint16_prec _ -> "unsigned short"
-  | Ops.Int32_prec _ -> "int"
-  | Ops.Half_prec _ -> "_Float16"
-  | Ops.Bfloat16_prec _ -> "unsigned short"  (* Stored as uint16, emulated as float *)
-  | Ops.Fp8_prec _ -> "unsigned char"  (* Stored as uint8, emulated as float *)
-  | Ops.Single_prec _ -> "float"
-  | Ops.Double_prec _ -> "double"
-  | Ops.Void_prec -> "void"
-
-(* Helper functions for bfloat16 and fp8 conversions *)
-let extra_declarations =
-  [
-    "/* Emulation functions for special float types */";
-    "static inline float bfloat16_to_float(unsigned short bf16) {";
-    "    unsigned int f32 = ((unsigned int)bf16) << 16;";
-    "    return *(float*)&f32;";
-    "}";
-    "";
-    "static inline unsigned short float_to_bfloat16(float f) {";
-    "    unsigned int f32 = *(unsigned int*)&f;";
-    "    unsigned int rounded = f32 + 0x7FFF + ((f32 >> 16) & 1);";
-    "    return (unsigned short)(rounded >> 16);";
-    "}";
-    "";
-    "/* Simplified FP8 E5M2 format emulation */";
-    "static inline float fp8_to_float(unsigned char fp8) {";
-    "    if (fp8 == 0) return 0.0f;";
-    "    unsigned int sign = (fp8 >> 7) & 1;";
-    "    unsigned int exp = (fp8 >> 2) & 0x1F;";
-    "    unsigned int mant = fp8 & 0x3;";
-    "    float result = (1.0f + mant * 0.25f) * powf(2.0f, (float)exp - 15.0f);";
-    "    return sign ? -result : result;";
-    "}";
-    "";
-    "static inline unsigned char float_to_fp8(float f) {";
-    "    if (f == 0.0f) return 0;";
-    "    unsigned int sign = (f < 0) ? 1 : 0;";
-    "    f = fabsf(f);";
-    "    int exp = (int)floorf(log2f(f)) + 15;";
-    "    if (exp < 0) return 0;";
-    "    if (exp > 31) return sign ? 0xFF : 0x7F;";
-    "    float mant = f / powf(2.0f, (float)exp - 15.0f) - 1.0f;";
-    "    unsigned int mant_bits = (unsigned int)(mant * 4.0f + 0.5f);";
-    "    if (mant_bits > 3) mant_bits = 3;";
-    "    return (unsigned char)((sign << 7) | ((exp & 0x1F) << 2) | (mant_bits & 0x3));";
-    "}";
-  ]
-
-let convert_precision ~from ~to_ =
-  match (from, to_) with
-  | p1, p2 when Ops.equal_prec p1 p2 -> ("", "")
-  | Ops.Bfloat16_prec _, Ops.Single_prec _ -> ("bfloat16_to_float(", ")")
-  | Ops.Bfloat16_prec _, Ops.Double_prec _ -> ("(double)bfloat16_to_float(", ")")
-  | Ops.Single_prec _, Ops.Bfloat16_prec _ -> ("float_to_bfloat16(", ")")
-  | Ops.Double_prec _, Ops.Bfloat16_prec _ -> ("float_to_bfloat16((float)", ")")
-  | Ops.Fp8_prec _, Ops.Single_prec _ -> ("fp8_to_float(", ")")
-  | Ops.Fp8_prec _, Ops.Double_prec _ -> ("(double)fp8_to_float(", ")")
-  | Ops.Single_prec _, Ops.Fp8_prec _ -> ("float_to_fp8(", ")")
-  | Ops.Double_prec _, Ops.Fp8_prec _ -> ("float_to_fp8((float)", ")")
-  | Ops.Bfloat16_prec _, _ -> ("(float)bfloat16_to_float(", ")")  (* Convert via float *)
-  | _, Ops.Bfloat16_prec _ -> ("float_to_bfloat16((float)", ")")
-  | Ops.Fp8_prec _, _ -> ("(float)fp8_to_float(", ")")  (* Convert via float *)
-  | _, Ops.Fp8_prec _ -> ("float_to_fp8((float)", ")")
-  | _ -> Ops.c_convert_precision ~from ~to_
-*)
+      } )
diff --git a/arrayjit/lib/dune b/arrayjit/lib/dune
@@ -33,6 +33,9 @@
   saturn_lockfree
   utils
   ppx_minidebug.runtime)
+ (foreign_stubs
+  (language c)
+  (names arrayjit_stubs))
  (preprocess
   (pps
    ppx_compare
diff --git a/arrayjit/lib/ndarray.ml b/arrayjit/lib/ndarray.ml
@@ -1,6 +1,15 @@
 open Base
+
+module Lazy = Utils.Lazy
+
 (** N-dimensional arrays: a precision-handling wrapper for [Bigarray.Genarray] and its utilities. *)
 
+(* External conversion functions for special float types.
+   In each of them the [int] is the raw bit pattern of the narrow format, not a numeric value. *)
+
+(** [bfloat16_to_float bits] decodes the bfloat16 bit pattern held in the low 16 bits of [bits]. *)
+external bfloat16_to_float : int -> float = "arrayjit_bfloat16_to_float"
+
+(** [float_to_bfloat16 v] encodes [v] as a bfloat16 bit pattern. *)
+external float_to_bfloat16 : float -> int = "arrayjit_float_to_bfloat16"
+
+(** [fp8_to_float bits] decodes the FP8 (E5M2) bit pattern held in the low 8 bits of [bits]. *)
+external fp8_to_float : int -> float = "arrayjit_fp8_to_float"
+
+(** [float_to_fp8 v] encodes [v] as an FP8 (E5M2) bit pattern. *)
+external float_to_fp8 : float -> int = "arrayjit_float_to_fp8"
+
 let _get_local_debug_runtime = Utils.get_local_debug_runtime
 
 [%%global_debug_log_level 9]
@@ -160,15 +169,15 @@ let create_bigarray (type ocaml elt_t) (prec : (ocaml, elt_t) Ops.precision) ~di
       init_bigarray_of_prec prec dims ~f:(fun idcs -> Float.of_int @@ indices_to_offset ~dims ~idcs)
   | Ops.Half, Standard_uniform ->
       init_bigarray_of_prec prec dims ~f:(fun _ -> Rand.Lib.float_range 0.0 1.0)
-  | Ops.Bfloat16, Constant_fill { values; strict } -> constant_fill_f Int.of_float values strict  (* TODO: proper bfloat16 conversion *)
+  | Ops.Bfloat16, Constant_fill { values; strict } -> constant_fill_f float_to_bfloat16 values strict
   | Ops.Bfloat16, Range_over_offsets ->
-      init_bigarray_of_prec prec dims ~f:(fun idcs -> indices_to_offset ~dims ~idcs)  (* TODO: proper bfloat16 conversion *)
-  | Ops.Bfloat16, Standard_uniform -> init_bigarray_of_prec prec dims ~f:(fun _ -> Random.int 65536)  (* TODO: proper bfloat16 conversion *)
-  | Ops.Fp8, Constant_fill { values; strict } -> constant_fill_f (Fn.compose Char.of_int_exn Int.of_float) values strict  (* TODO: proper fp8 conversion *)
+      init_bigarray_of_prec prec dims ~f:(fun idcs -> float_to_bfloat16 @@ Float.of_int @@ indices_to_offset ~dims ~idcs)
+  | Ops.Bfloat16, Standard_uniform -> init_bigarray_of_prec prec dims ~f:(fun _ -> float_to_bfloat16 @@ Rand.Lib.float_range 0.0 1.0)
+  | Ops.Fp8, Constant_fill { values; strict } -> constant_fill_f (Fn.compose Char.of_int_exn float_to_fp8) values strict
   | Ops.Fp8, Range_over_offsets ->
       init_bigarray_of_prec prec dims ~f:(fun idcs ->
-          Char.of_int_exn @@ indices_to_offset ~dims ~idcs)
-  | Ops.Fp8, Standard_uniform -> init_bigarray_of_prec prec dims ~f:(fun _ -> Rand.Lib.char ())
+          Char.of_int_exn @@ float_to_fp8 @@ Float.of_int @@ indices_to_offset ~dims ~idcs)
+  | Ops.Fp8, Standard_uniform -> init_bigarray_of_prec prec dims ~f:(fun _ -> Char.of_int_exn @@ float_to_fp8 @@ Rand.Lib.float_range 0.0 1.0)
   | Ops.Single, Constant_fill { values; strict } -> constant_fill_float values strict
   | Ops.Single, Range_over_offsets ->
       init_bigarray_of_prec prec dims ~f:(fun idcs -> Float.of_int @@ indices_to_offset ~dims ~idcs)
@@ -255,8 +264,8 @@ let set_from_float arr idx v =
   | Uint16_nd arr -> A.set arr idx @@ Int.of_float v
   | Int32_nd arr -> A.set arr idx @@ Int32.of_float v
   | Half_nd arr -> A.set arr idx v
-  | Bfloat16_nd arr -> A.set arr idx @@ Int.of_float v  (* TODO: proper bfloat16 conversion *)
-  | Fp8_nd arr -> A.set arr idx @@ Char.of_int_exn @@ Int.of_float v  (* TODO: proper fp8 conversion *)
+  | Bfloat16_nd arr -> A.set arr idx @@ float_to_bfloat16 v
+  | Fp8_nd arr -> A.set arr idx @@ Char.of_int_exn @@ float_to_fp8 v
   | Single_nd arr -> A.set arr idx v
   | Double_nd arr -> A.set arr idx v
 
@@ -266,8 +275,8 @@ let fill_from_float arr v =
   | Uint16_nd arr -> A.fill arr @@ Int.of_float v
   | Int32_nd arr -> A.fill arr @@ Int32.of_float v
   | Half_nd arr -> A.fill arr v
-  | Bfloat16_nd arr -> A.fill arr @@ Int.of_float v  (* TODO: proper bfloat16 conversion *)
-  | Fp8_nd arr -> A.fill arr @@ Char.of_int_exn @@ Int.of_float v  (* TODO: proper fp8 conversion *)
+  | Bfloat16_nd arr -> A.fill arr @@ float_to_bfloat16 v
+  | Fp8_nd arr -> A.fill arr @@ Char.of_int_exn @@ float_to_fp8 v
   | Single_nd arr -> A.fill arr v
   | Double_nd arr -> A.fill arr v
 
@@ -319,14 +328,15 @@ let reset_bigarray (init_op : Ops.init_op) (type o b) (prec : (o, b) Ops.precisi
   | Ops.Half, Range_over_offsets ->
       set_bigarray arr ~f:(fun idcs -> Float.of_int @@ indices_to_offset ~dims ~idcs)
   | Ops.Half, Standard_uniform -> set_bigarray arr ~f:(fun _ -> Rand.Lib.float_range 0.0 1.0)
-  | Ops.Bfloat16, Constant_fill { values; strict } -> constant_set_f Int.of_float values strict  (* TODO: proper bfloat16 conversion *)
+  | Ops.Bfloat16, Constant_fill { values; strict } -> constant_set_f float_to_bfloat16 values strict
   | Ops.Bfloat16, Range_over_offsets ->
-      set_bigarray arr ~f:(fun idcs -> indices_to_offset ~dims ~idcs)  (* TODO: proper bfloat16 conversion *)
-  | Ops.Bfloat16, Standard_uniform -> set_bigarray arr ~f:(fun _ -> Random.int 65536)  (* TODO: proper bfloat16 conversion *)
-  | Ops.Fp8, Constant_fill { values; strict } -> constant_set_f (Fn.compose Char.of_int_exn Int.of_float) values strict  (* TODO: proper fp8 conversion *)
+      set_bigarray arr ~f:(fun idcs -> float_to_bfloat16 @@ Float.of_int @@ indices_to_offset ~dims ~idcs)
+  | Ops.Bfloat16, Standard_uniform -> set_bigarray arr ~f:(fun _ -> float_to_bfloat16 @@ Rand.Lib.float_range 0.0 1.0)
+  | Ops.Fp8, Constant_fill { values; strict } -> constant_set_f (Fn.compose Char.of_int_exn float_to_fp8) values strict
   | Ops.Fp8, Range_over_offsets ->
-      set_bigarray arr ~f:(fun idcs -> Char.of_int_exn @@ indices_to_offset ~dims ~idcs)
-  | Ops.Fp8, Standard_uniform -> set_bigarray arr ~f:(fun _ -> Rand.Lib.char ())
+      set_bigarray arr ~f:(fun idcs ->
+          Char.of_int_exn @@ float_to_fp8 @@ Float.of_int @@ indices_to_offset ~dims ~idcs)
+  | Ops.Fp8, Standard_uniform -> set_bigarray arr ~f:(fun _ -> Char.of_int_exn @@ float_to_fp8 @@ Rand.Lib.float_range 0.0 1.0)
   | Ops.Single, Constant_fill { values; strict } -> constant_set_float values strict
   | Ops.Single, Range_over_offsets ->
       set_bigarray arr ~f:(fun idcs -> Float.of_int @@ indices_to_offset ~dims ~idcs)
@@ -363,8 +373,8 @@ let fold_as_float ~init ~f arr =
   | Uint16_nd arr -> fold_bigarray ~init ~f:(fun accu idx v -> f accu idx @@ Float.of_int v) arr
   | Int32_nd arr -> fold_bigarray ~init ~f:(fun accu idx v -> f accu idx @@ Int32.to_float v) arr
   | Half_nd arr -> fold_bigarray ~init ~f arr
-  | Bfloat16_nd arr -> fold_bigarray ~init ~f:(fun accu idx v -> f accu idx @@ Float.of_int v) arr  (* TODO: proper bfloat16 conversion *)
-  | Fp8_nd arr -> fold_bigarray ~init ~f:(fun accu idx c -> f accu idx @@ Float.of_int @@ Char.to_int c) arr  (* TODO: proper fp8 conversion *)
+  | Bfloat16_nd arr -> fold_bigarray ~init ~f:(fun accu idx v -> f accu idx @@ bfloat16_to_float v) arr
+  | Fp8_nd arr -> fold_bigarray ~init ~f:(fun accu idx c -> f accu idx @@ fp8_to_float @@ Char.to_int c) arr
   | Single_nd arr -> fold_bigarray ~init ~f arr
   | Double_nd arr -> fold_bigarray ~init ~f arr
 
@@ -380,8 +390,8 @@ let get_as_float arr idx =
   | Uint16_nd arr -> Float.of_int @@ A.get arr idx
   | Int32_nd arr -> Int32.to_float @@ A.get arr idx
   | Half_nd arr -> A.get arr idx
-  | Bfloat16_nd arr -> Float.of_int @@ A.get arr idx  (* TODO: proper bfloat16 conversion *)
-  | Fp8_nd arr -> Float.of_int @@ Char.to_int @@ A.get arr idx  (* TODO: proper fp8 conversion *)
+  | Bfloat16_nd arr -> bfloat16_to_float @@ A.get arr idx
+  | Fp8_nd arr -> fp8_to_float @@ Char.to_int @@ A.get arr idx
   | Single_nd arr -> A.get arr idx
   | Double_nd arr -> A.get arr idx
 
diff --git a/arrayjit/test/dune b/arrayjit/test/dune
@@ -0,0 +1,18 @@
+; Test executable for the numerical type conversions.
+(executable
+ (name test_numerical_types)
+ (modules test_numerical_types)
+ (libraries base stdio arrayjit.ir)
+ (preprocess
+  (pps ppx_jane)))
+
+; Run the test and capture its stdout.
+; ocannl_config is declared as a dependency so that dune copies this
+; directory's config file next to the executable in the build tree. Without
+; it the file is absent there when the test runs.
+; NOTE(review): presumably the config loader then falls back to a parent
+; directory's ocannl_config; confirm against the loader.
+(rule
+ (target test_numerical_types.output)
+ (deps
+  (:exe test_numerical_types.exe)
+  ocannl_config)
+ (action
+  (with-stdout-to
+   %{target}
+   (run %{exe}))))
+
+; Compare the captured output with the expected output under the runtest alias.
+(rule
+ (alias runtest)
+ (action
+  (diff test_numerical_types.expected test_numerical_types.output)))
diff --git a/arrayjit/test/ocannl_config b/arrayjit/test/ocannl_config
@@ -0,0 +1,6 @@
+randomness_lib=for_tests
+log_main_domain_to_stdout=true
+backend=multicore_cc
+log_level=0
+print_decimals_precision=2
+prefer_backend_uniformity=true
diff --git a/arrayjit/test/test_numerical_types.expected b/arrayjit/test/test_numerical_types.expected
diff --git a/arrayjit/test/test_numerical_types.ml b/arrayjit/test/test_numerical_types.ml