Follow-up: cleaner c_syntax.ml and backend headers, by Claude Opus

lukstafi · lukstafi · commit c9c9a8edfe3e · 2025-08-18T15:32:07.000+02:00
Prompt:
> Let's restore (or establish) a proper division of responsibilities between the files
  / modules c_syntax.ml and cc_backend.ml. C_syntax provides functionality for
  backends implementing code generation for C-like languages. C_syntax.Pure_C_config
  should provide defaults that have grounding in the C language standard, that are
  likely to be available in C compilers across many architectures, and that have some
  chance to be shared by more than one backend (out of cc, metal, cuda, and future
  ones: amd-hip, etc.). Outside of that, the defaults should serve illustrative purposes
  and be compact.

  Summary by Claude:
  I've successfully refactored the code to eliminate extra_declarations and
  print_declarations as you suggested. The changes include:

  1. Removed from c_syntax.ml:
    - includes field from C_syntax_config interface
    - extra_declarations field from C_syntax_config interface
    - pp_include helper function
    - print_declarations function
    - the default includes and extra_declarations values from Pure_C_config
  2. Updated cc_backend.ml:
    - Moved all headers and precision conversion declarations into the builtins_header
   raw string
    - Removed the redundant CC_syntax_config overrides that weren't actually changing
  anything
    - Kept the precision-specific operator syntax overrides that the cc backend needs
  3. Updated metal_backend.ml:
    - Removed includes and extra_declarations
    - Metal headers are already in Builtins_metal.source
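  4. Updated cuda_backend.ml:
    - Removed extra_declarations
    - Builtins are now written to the buffer before the generated code, instead of
      being spliced in front of it afterwards
    - CUDA headers are already in Builtins_cuda_small.source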

  The refactoring makes the code cleaner and more straightforward: each backend now
  manages its complete preamble (headers + declarations) as a single cohesive string,
  rather than having it split across multiple fields and functions. This is much more
  maintainable and easier to understand.
diff --git a/arrayjit/lib/c_syntax.ml b/arrayjit/lib/c_syntax.ml
@@ -27,8 +27,6 @@ module type C_syntax_config = sig
   val buffer_suffix : pos:int -> string
   val arg_int_prefix : string
   val extra_args : string list
-  val includes : string list
-  val extra_declarations : string list
   val typ_of_prec : Ops.prec -> string
   val vec_typ_of_prec : length:int -> Ops.prec -> string
   val ident_blacklist : string list
@@ -94,8 +92,6 @@ struct
   let buffer_suffix = fun ~pos:_ -> ""
   let arg_int_prefix = "const int "
   let extra_args = []
-  let includes = [ "<stdio.h>"; "<stdlib.h>"; "<string.h>"; "<math.h>" ]
-  let extra_declarations = []
 
   let typ_of_prec = Ops.c_typ_of_prec
   let vec_typ_of_prec = Ops.c_vec_typ_of_prec
@@ -236,8 +232,6 @@ module C_syntax (B : C_syntax_config) = struct
     @@ Array.map B.procs ~f:(fun l -> l.llc)
 
   let in_ctx tn = B.(Tn.is_in_context_force ~use_host_memory tn 46)
-  let pp_include s = PPrint.(string "#include " ^^ string s)
-
   open Indexing
   open Doc_helpers
 
@@ -262,12 +256,6 @@ module C_syntax (B : C_syntax_config) = struct
 
   let array_offset_to_string (idcs, dims) = doc_to_string @@ pp_array_offset (idcs, dims)
 
-  let print_declarations () =
-    let open PPrint in
-    let includes = separate hardline (List.map B.includes ~f:pp_include) in
-    let extras = separate hardline (List.map B.extra_declarations ~f:string) in
-    includes ^^ hardline ^^ extras ^^ hardline
-
   let pp_local_defs (local_defs : (int * PPrint.document) list) =
     let open PPrint in
     List.dedup_and_sort local_defs ~compare:(fun (a, _) (b, _) -> Int.compare a b)
diff --git a/arrayjit/lib/cc_backend.ml b/arrayjit/lib/cc_backend.ml
@@ -14,10 +14,14 @@ open Backend_intf
 
 let name = "cc"
 
-(* Header declarations for arrayjit builtins *)
+(* Complete header with includes and declarations for arrayjit builtins *)
 let builtins_header =
   {|
-/* ArrayJIT builtins declarations */
+/* Standard C library headers */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
 #include <stdint.h>
 
 typedef struct {
@@ -78,6 +82,81 @@ extern uint4x32_t bfloat16_to_uint4x32(uint16_t x);
 extern uint4x32_t half_to_uint4x32(uint16_t x);
 extern uint4x32_t fp8_to_uint4x32(uint8_t x);
 
+/* BFloat16 conversion functions */
+static inline float bfloat16_to_single(unsigned short bf16) {
+  unsigned int f32 = ((unsigned int)bf16) << 16;
+  return *((float*)&f32);
+}
+
+static inline unsigned short single_to_bfloat16(float f) {
+  unsigned int f32 = *((unsigned int*)&f);
+  unsigned int rounded = f32 + 0x7FFF + ((f32 >> 16) & 1);
+  return (unsigned short)(rounded >> 16);
+}
+
+/* Half (Float16) support with zero-overhead abstraction */
+#ifdef __FLT16_MAX__
+  #define HAS_NATIVE_FLOAT16 1
+  #define HALF_T _Float16
+  #define HALF_TO_FP(x) (x)  /* Identity - already floating point */
+  #define FP_TO_HALF(x) (x)  /* Identity - already half precision */
+  #define HALF_TO_FLOAT(x) ((float)(x))
+  #define FLOAT_TO_HALF(x) ((_Float16)(x))
+#else
+  #define HAS_NATIVE_FLOAT16 0
+  #define HALF_T unsigned short
+  #define HALF_TO_FP(x) half_to_single(x)  /* Convert to float for computation */
+  #define FP_TO_HALF(x) single_to_half(x)  /* Convert back from float */
+  #define HALF_TO_FLOAT(x) half_to_single(x)
+  #define FLOAT_TO_HALF(x) single_to_half(x)
+  /* Conversion functions for emulation - provided by builtins.c */
+  extern float half_to_single(unsigned short h);
+  extern unsigned short single_to_half(float f);
+#endif
+
+/* FP8 E5M2 conversion functions */
+static inline float fp8_to_single(unsigned char fp8) {
+  if (fp8 == 0) return 0.0f;
+  unsigned int sign = (fp8 >> 7) & 1;
+  unsigned int exp = (fp8 >> 2) & 0x1F;
+  unsigned int mant = fp8 & 0x3;
+  if (exp == 0x1F) {
+    if (mant == 0) return sign ? -INFINITY : INFINITY;
+    else return NAN;
+  }
+  if (exp == 0) {
+    float result = ldexpf((float)mant / 4.0f, -14);
+    if (sign) result = -result;
+    return result;
+  }
+  float result = (1.0f + (float)mant * 0.25f) * ldexpf(1.0f, (int)exp - 15);
+  if (sign) result = -result;
+  return result;
+}
+
+static inline unsigned char single_to_fp8(float f) {
+  if (f == 0.0f) return 0;
+  unsigned int sign = (f < 0) ? 1 : 0;
+  f = fabsf(f);
+  if (isinf(f)) return (sign << 7) | 0x7C;
+  if (isnan(f)) return (sign << 7) | 0x7F;
+  int exp_val;
+  float mant_f = frexpf(f, &exp_val);
+  int exp = exp_val + 14;
+  if (exp < 0) return sign << 7;
+  if (exp > 30) return (sign << 7) | 0x7C;
+  if (exp == 0) {
+    float denorm_mant = f * ldexpf(1.0f, 14) * 4.0f;
+    unsigned int mant_bits = (unsigned int)(denorm_mant + 0.5f);
+    if (mant_bits > 3) mant_bits = 3;
+    return (sign << 7) | mant_bits;
+  }
+  mant_f = (mant_f - 0.5f) * 4.0f;
+  unsigned int mant_bits = (unsigned int)(mant_f + 0.5f);
+  if (mant_bits > 3) mant_bits = 3;
+  return (unsigned char)((sign << 7) | ((exp & 0x1F) << 2) | (mant_bits & 0x3));
+}
+
 |}
 
 let optimization_level () =
@@ -215,85 +294,6 @@ struct
       not @@ Utils.get_global_flag ~default:false ~arg_name:"prefer_backend_uniformity"
   end)
 
-  (* Add declarations for precision conversions that standard C compilers can use *)
-  let extra_declarations =
-    [
-      (* BFloat16 conversion functions *)
-      "static inline float bfloat16_to_single(unsigned short bf16) {";
-      "  unsigned int f32 = ((unsigned int)bf16) << 16;";
-      "  return *((float*)&f32);";
-      "}";
-      "";
-      "static inline unsigned short single_to_bfloat16(float f) {";
-      "  unsigned int f32 = *((unsigned int*)&f);";
-      "  unsigned int rounded = f32 + 0x7FFF + ((f32 >> 16) & 1);";
-      "  return (unsigned short)(rounded >> 16);";
-      "}";
-      "";
-      (* Half (Float16) support with zero-overhead abstraction *)
-      "#ifdef __FLT16_MAX__";
-      "  #define HAS_NATIVE_FLOAT16 1";
-      "  #define HALF_T _Float16";
-      "  #define HALF_TO_FP(x) (x)  /* Identity - already floating point */";
-      "  #define FP_TO_HALF(x) (x)  /* Identity - already half precision */";
-      "  #define HALF_TO_FLOAT(x) ((float)(x))";
-      "  #define FLOAT_TO_HALF(x) ((_Float16)(x))";
-      "#else";
-      "  #define HAS_NATIVE_FLOAT16 0";
-      "  #define HALF_T unsigned short";
-      "  #define HALF_TO_FP(x) half_to_single(x)  /* Convert to float for computation */";
-      "  #define FP_TO_HALF(x) single_to_half(x)  /* Convert back from float */";
-      "  #define HALF_TO_FLOAT(x) half_to_single(x)";
-      "  #define FLOAT_TO_HALF(x) single_to_half(x)";
-      "  /* Conversion functions for emulation - provided by builtins.c */";
-      "  extern float half_to_single(unsigned short h);";
-      "  extern unsigned short single_to_half(float f);";
-      "#endif";
-      "";
-      (* FP8 E5M2 conversion functions *)
-      "static inline float fp8_to_single(unsigned char fp8) {";
-      "  if (fp8 == 0) return 0.0f;";
-      "  unsigned int sign = (fp8 >> 7) & 1;";
-      "  unsigned int exp = (fp8 >> 2) & 0x1F;";
-      "  unsigned int mant = fp8 & 0x3;";
-      "  if (exp == 0x1F) {";
-      "    if (mant == 0) return sign ? -INFINITY : INFINITY;";
-      "    else return NAN;";
-      "  }";
-      "  if (exp == 0) {";
-      "    float result = ldexpf((float)mant / 4.0f, -14);";
-      "    if (sign) result = -result;";
-      "    return result;";
-      "  }";
-      "  float result = (1.0f + (float)mant * 0.25f) * ldexpf(1.0f, (int)exp - 15);";
-      "  if (sign) result = -result;";
-      "  return result;";
-      "}";
-      "";
-      "static inline unsigned char single_to_fp8(float f) {";
-      "  if (f == 0.0f) return 0;";
-      "  unsigned int sign = (f < 0) ? 1 : 0;";
-      "  f = fabsf(f);";
-      "  if (isinf(f)) return (sign << 7) | 0x7C;";
-      "  if (isnan(f)) return (sign << 7) | 0x7F;";
-      "  int exp_val;";
-      "  float mant_f = frexpf(f, &exp_val);";
-      "  int exp = exp_val + 14;";
-      "  if (exp < 0) return sign << 7;";
-      "  if (exp > 30) return (sign << 7) | 0x7C;";
-      "  if (exp == 0) {";
-      "    float denorm_mant = f * ldexpf(1.0f, 14) * 4.0f;";
-      "    unsigned int mant_bits = (unsigned int)(denorm_mant + 0.5f);";
-      "    if (mant_bits > 3) mant_bits = 3;";
-      "    return (sign << 7) | mant_bits;";
-      "  }";
-      "  mant_f = (mant_f - 0.5f) * 4.0f;";
-      "  unsigned int mant_bits = (unsigned int)(mant_f + 0.5f);";
-      "  if (mant_bits > 3) mant_bits = 3;";
-      "  return (unsigned char)((sign << 7) | ((exp & 0x1F) << 2) | (mant_bits & 0x3));";
-      "}";
-    ]
-
   (* Override operation syntax to handle special precision types *)
   let ternop_syntax prec op v1 v2 v3 =
     match prec with
@@ -448,10 +448,9 @@ let%diagn_sexp compile ~(name : string) bindings (lowered : Low_level.optimized)
   (* FIXME: do we really want all of them, or only the used ones? *)
   let idx_params = Indexing.bound_symbols bindings in
   let build_file = Utils.open_build_file ~base_name:name ~extension:".c" in
-  let declarations_doc = Syntax.print_declarations () in
   let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
   let header_doc = PPrint.string builtins_header in
-  let final_doc = PPrint.(header_doc ^^ declarations_doc ^^ proc_doc) in
+  let final_doc = PPrint.(header_doc ^^ proc_doc) in
   (* Use ribbon = 1.0 for usual code formatting, width 110 *)
   PPrint.ToChannel.pretty 1.0 110 build_file.oc final_doc;
   build_file.finalize ();
@@ -473,15 +472,14 @@ let%diagn_sexp compile_batch ~names bindings (lowereds : Low_level.optimized opt
       @@ common_prefix (Array.to_list @@ Array.concat_map ~f:Option.to_array names))
   in
   let build_file = Utils.open_build_file ~base_name ~extension:".c" in
-  let declarations_doc = Syntax.print_declarations () in
   let params_and_docs =
     Array.map2_exn names lowereds ~f:(fun name_opt lowered_opt ->
         Option.map2 name_opt lowered_opt ~f:(fun name lowered ->
             Syntax.compile_proc ~name idx_params lowered))
   in
   let all_proc_docs = List.filter_map (Array.to_list params_and_docs) ~f:(Option.map ~f:snd) in
   let header_doc = PPrint.string builtins_header in
-  let final_doc = PPrint.(header_doc ^^ declarations_doc ^^ separate hardline all_proc_docs) in
+  let final_doc = PPrint.(header_doc ^^ separate hardline all_proc_docs) in
   PPrint.ToChannel.pretty 1.0 110 build_file.oc final_doc;
   build_file.finalize ();
   let result_library = c_compile_and_load ~f_path:build_file.f_path in
diff --git a/arrayjit/lib/cuda_backend.ml b/arrayjit/lib/cuda_backend.ml
@@ -682,7 +682,6 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       | FMA, Ops.Single_prec _ -> func "fmaf"
       | FMA, _ -> func "fma"
 
-    let extra_declarations = []
 
     let convert_precision ~from ~to_ =
       match (from, to_) with
@@ -761,15 +760,10 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     end)) in
     let idx_params = Indexing.bound_symbols bindings in
     let b = Buffer.create 4096 in
-    let declarations_doc = Syntax.print_declarations () in
-    let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
-    let final_doc = PPrint.(declarations_doc ^^ proc_doc) in
-    PPrint.ToBuffer.pretty 1.0 110 b final_doc;
-    (* Prepend builtins after syntax generation to preserve include order *)
-    let full_source = Buffer.contents b in
-    Buffer.clear b;
+    (* Prepend builtins first *)
     prepend_builtins b;
-    Buffer.add_string b full_source;
+    let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
+    PPrint.ToBuffer.pretty 1.0 110 b proc_doc;
     let ptx = cuda_to_ptx ~name (Buffer.contents b) in
     { traced_store; ptx; params; bindings; name }
 
@@ -779,7 +773,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     end)) in
     let idx_params = Indexing.bound_symbols bindings in
     let b = Buffer.create 4096 in
-    let declarations_doc = Syntax.print_declarations () in
+    (* Prepend builtins first *)
+    prepend_builtins b;
     let params_and_docs =
       Array.map2_exn names lowereds
         ~f:
@@ -788,13 +783,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct
                ((params, name), doc)))
     in
     let all_proc_docs = List.filter_map (Array.to_list params_and_docs) ~f:(Option.map ~f:snd) in
-    let final_doc = PPrint.(declarations_doc ^^ separate hardline all_proc_docs) in
+    let final_doc = PPrint.(separate hardline all_proc_docs) in
     PPrint.ToBuffer.pretty 1.0 110 b final_doc;
-    (* Prepend builtins after syntax generation to preserve include order *)
-    let full_source = Buffer.contents b in
-    Buffer.clear b;
-    prepend_builtins b;
-    Buffer.add_string b full_source;
 
     let name : string =
       String.(
diff --git a/arrayjit/lib/metal_backend.ml b/arrayjit/lib/metal_backend.ml
@@ -433,11 +433,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
         "uint3 gid [[threadgroup_position_in_grid]]"; "uint3 lid [[thread_position_in_threadgroup]]";
       ]
 
-    let includes =
-      [ "<metal_stdlib>"; "<metal_math>"; "<metal_logging>"; "<metal_compute>"; "<metal_atomic>" ]
-
     let metal_log_object_name = "os_log_default"
-    let extra_declarations = [ "using namespace metal;" ]
 
     let typ_of_prec = function
       | Ops.Byte_prec _ -> "uchar"
@@ -658,10 +654,9 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     let b = Buffer.create 4096 in
     Buffer.add_string b Builtins_metal.source;
     Buffer.add_string b "\n";
-    let declarations_doc = Syntax.print_declarations () in
     (* Add Metal address space qualifiers *)
     let params, proc_doc = Syntax.compile_proc ~name idx_params lowered in
-    let final_doc = PPrint.(declarations_doc ^^ proc_doc) in
+    let final_doc = proc_doc in
     PPrint.ToBuffer.pretty 1.0 110 b final_doc;
     let source = Buffer.contents b in
     {
@@ -681,7 +676,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct
     let idx_params = Indexing.bound_symbols bindings in
     let b = Buffer.create 4096 in
     (* Read and prepend the Metal builtins file *)
-    let declarations_doc = Syntax.print_declarations () in
+    Buffer.add_string b Builtins_metal.source;
+    Buffer.add_string b "\n";
     let funcs_and_docs =
       Array.map2_exn names lowereds
         ~f:
@@ -690,7 +686,7 @@ end) : Ir.Backend_impl.Lowered_backend = struct
                ((name, params), doc)))
     in
     let all_proc_docs = List.filter_map (Array.to_list funcs_and_docs) ~f:(Option.map ~f:snd) in
-    let final_doc = PPrint.(declarations_doc ^^ separate hardline all_proc_docs) in
+    let final_doc = PPrint.(separate hardline all_proc_docs) in
     PPrint.ToBuffer.pretty 1.0 110 b final_doc;
     let source = Buffer.contents b in
     let traced_stores = Array.map lowereds ~f:(Option.map ~f:(fun l -> l.Low_level.traced_store)) in