Commit ad9a53e
Big refactoring: Uint4x32_to_prec_uniform moves from a fetch op to a proper unary op (Ops), with dedicated shape and projections inference support to follow; the remaining dedicated_access fetch ops are removed, with Merge_buffer migrating to a stand-alone Get_merge_buffer variant of float_t (Low_level); and the new terminal_type (Shape) brings better consistency.
- Introduced a new `uint4x32_t` structure and a stub for the `arrayjit_threefry4x32` function.
- Updated the `float_t` type to include `Get_merge_buffer` and removed the `dedicated_access` type.
- Added the `Uint4x32_to_prec_uniform` operation in `ops.ml` and updated related type definitions in `shape.ml` and `shape.mli`.
- Modified tensor operation signatures to include a new `terminal_op` parameter for better expressivity.
1 parent 04aebe8 commit ad9a53e

File tree: 7 files changed, +46 −39 lines


arrayjit/lib/arrayjit_stubs.c

Lines changed: 17 additions & 1 deletion
@@ -113,6 +113,22 @@ static inline uint8_t float_to_fp8(float f)
   return (uint8_t)((sign << 7) | ((exp & 0x1F) << 2) | (mant_bits & 0x3));
 }
 
+typedef struct {
+  uint32_t v[4];
+} uint4x32_t;
+
+/* Threefry4x32 implementation (C function) */
+uint4x32_t arrayjit_threefry4x32(uint4x32_t v1, uint4x32_t v2)
+{
+  /* FIXME: NOT IMPLEMENTED YET */
+  uint4x32_t result;
+  result.v[0] = 0;
+  result.v[1] = 0;
+  result.v[2] = 0;
+  result.v[3] = 0;
+  return result;
+}
+
 /* OCaml wrapper functions */
 
 /* BFloat16 to Float conversion (OCaml wrapper) */
@@ -188,7 +204,7 @@ CAMLprim value arrayjit_copy_with_padding(value v_source, value v_target,
     source_total *= source_dims_ba[i];
   }
 
-  /* FIXME: Simple memcpy for now - must be optimized later for proper padding */
+  /* FIXME: Simple memcpy for now - must implement proper padding */
   memcpy(target_data, source_data, source_total * elem_size);
 
   CAMLreturn(Val_unit);
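
The committed `arrayjit_threefry4x32` is deliberately a zero-returning stub. For orientation, here is a standalone OCaml sketch of the Threefry-4x32-20 algorithm the stub is presumably meant to compute eventually, following the public Random123 description; the rotation constants, the key schedule, and the assumption that the two arguments act as key and counter are recalled from that reference and should be verified against it before use.

let threefry4x32_20 (key : int32 array) (ctr : int32 array) : int32 array =
  let ( +% ) = Int32.add and ( ^% ) = Int32.logxor in
  let rotl x r = Int32.logor (Int32.shift_left x r) (Int32.shift_right_logical x (32 - r)) in
  (* Per-round rotation amounts, cycled mod 8; the first of each pair mixes lanes (0,1), the second (2,3). *)
  let rot = [| (10, 26); (11, 21); (13, 27); (23, 5); (6, 20); (17, 11); (25, 10); (18, 20) |] in
  (* Extended key schedule: the fifth word is the Skein parity constant XORed with all key words. *)
  let ks = Array.append key [| Array.fold_left ( ^% ) 0x1BD11BDAl key |] in
  (* Initial key injection into the counter words. *)
  let x = Array.mapi (fun i c -> c +% ks.(i)) ctr in
  for r = 0 to 19 do
    let ra, rb = rot.(r mod 8) in
    x.(0) <- x.(0) +% x.(1);
    x.(1) <- rotl x.(1) ra ^% x.(0);
    x.(2) <- x.(2) +% x.(3);
    x.(3) <- rotl x.(3) rb ^% x.(2);
    (* Word permutation (0 3 2 1): swap lanes 1 and 3 between rounds. *)
    let t = x.(1) in x.(1) <- x.(3); x.(3) <- t;
    if (r + 1) mod 4 = 0 then begin
      (* Key injection every four rounds; the injection counter is also added to lane 3. *)
      let s = (r + 1) / 4 in
      for i = 0 to 3 do x.(i) <- x.(i) +% ks.((s + i) mod 5) done;
      x.(3) <- x.(3) +% Int32.of_int s
    end
  done;
  x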

arrayjit/lib/low_level.ml

Lines changed: 1 addition & 12 deletions
@@ -8,17 +8,6 @@ let _get_local_debug_runtime = Utils.get_local_debug_runtime
 [%%global_debug_log_level 9]
 [%%global_debug_log_level_from_env_var "OCANNL_LOG_LEVEL"]
 
-type dedicated_access =
-  | C_function of string
-  | External_unsafe of { ptr : Ops.voidptr; prec : Ops.prec; dims : int array Lazy.t }
-  | Merge_buffer of { source : Tnode.t }
-  | Uint4x32_to_prec_uniform of {
-      source : Tnode.t;
-      target_prec : Ops.prec;
-      target_dims : int array Lazy.t;
-    }
-[@@deriving sexp_of, equal, compare]
-
 module Scope_id = struct
   type t = { tn : Tn.t; scope_id : int } [@@deriving sexp_of, equal, hash, compare]
 
@@ -53,8 +42,8 @@ type t =
 and float_t =
   | Local_scope of { id : scope_id; body : t; orig_indices : Indexing.axis_index array }
   | Get_local of scope_id
-  | Access of dedicated_access * Indexing.axis_index array option
   | Get of Tn.t * Indexing.axis_index array
+  | Get_merge_buffer of Tn.t * Indexing.axis_index array
   | Ternop of Ops.ternop * float_t * float_t * float_t
   | Binop of Ops.binop * float_t * float_t
   | Unop of Ops.unop * float_t
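
For downstream code, the practical effect is that what was written roughly as `Access (Merge_buffer { source }, Some idcs)` becomes the first-class `Get_merge_buffer (source, idcs)`, mirroring the existing `Get` case. A minimal hypothetical sketch, not part of this commit, assuming the module paths used inside arrayjit and that `Ops.binop` has an `Add` constructor, of building a `float_t` expression that adds the merge buffer's contribution to a destination cell:

let merge_add (dst : Tnode.t) (source : Tnode.t) (idcs : Indexing.axis_index array) :
    Low_level.float_t =
  let open Low_level in
  (* Read the destination cell and the device's merge buffer (associated with [source])
     at the same indices, and add them. *)
  Binop (Ops.Add, Get (dst, idcs), Get_merge_buffer (source, idcs))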

arrayjit/lib/low_level.mli

Lines changed: 1 addition & 20 deletions
@@ -4,25 +4,6 @@ open Base
 
 (** {2 Global references} *)
 
-(** A dedicated access that might need to be implemented differently for each backend. *)
-type dedicated_access =
-  | C_function of string (** Calls a no-argument or indices-arguments C function. *)
-  | External_unsafe of { ptr : Ops.voidptr; prec : Ops.prec; dims : int array Lazy.t }
-  | Merge_buffer of { source : Tnode.t }
-      (** Each device has at most one merge buffer, which is re-used, and re-allocated as needed, by
-          merge operations. The merge buffer is associated with the source node of the device's most
-          recent [device_to_device ~into_merge_buffer:true] operation. *)
-  | Uint4x32_to_prec_uniform of {
-      source : Tnode.t;
-      target_prec : Ops.prec;
-      target_dims : int array Lazy.t;
-    }
-      (** Converts the given Uint4x32 to the given precision in a bit-efficient manner. For random
-          bits, the result is uniform over the range of the precision for integer precisions, and
-          over the range \[0.0, 1.0) for floating point precisions. When used in an access pattern,
-          the indices are converted to a byte offset depending on the given precision. *)
-[@@deriving sexp_of, equal, compare]
-
 module Scope_id : sig
   type t = { tn : Tnode.t; scope_id : int } [@@deriving sexp_of, equal, hash, compare]
   type comparator_witness
@@ -50,8 +31,8 @@ type t =
 and float_t =
   | Local_scope of { id : scope_id; body : t; orig_indices : Indexing.axis_index array }
   | Get_local of scope_id
-  | Access of dedicated_access * Indexing.axis_index array option
   | Get of Tnode.t * Indexing.axis_index array
+  | Get_merge_buffer of Tnode.t * Indexing.axis_index array
   | Ternop of Ops.ternop * float_t * float_t * float_t
   | Binop of Ops.binop * float_t * float_t
   | Unop of Ops.unop * float_t

arrayjit/lib/ops.ml

Lines changed: 7 additions & 0 deletions
@@ -295,6 +295,13 @@ type unop =
   | Neg
   | Tanh_approx
   | Not (** 0. -> 1. | _ -> 0. *)
+  | Uint4x32_to_prec_uniform of prec
+      (** Converts the given Uint4x32 to the given precision in a bit-efficient manner. For random
+          bits, the result is uniform over the range of the precision for integer precisions, and
+          over the range \[0.0, 1.0) for floating point precisions. When used in an access pattern,
+          the indices are converted to a byte offset depending on the given precision. NOTE: this
+          operation, unlike any others, impacts projections and shape inference (one input cell
+          corresponds to a few output cells). *)
 [@@deriving sexp, compare, equal]
 
 type ternop =
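
To make the NOTE concrete: each input cell holds 128 random bits (a Uint4x32), so a bit-efficient conversion presumably emits 16 / (bytes per element of the target precision) output cells per input cell, which is why this unop alone has to influence shape and projections inference. A rough standalone sketch of that bookkeeping, using illustrative byte widths rather than the project's `prec` API:

(* Illustrative only: output cells produced per 16-byte Uint4x32 input cell. *)
let cells_per_uint4x32 ~target_bytes = 16 / target_bytes

let () =
  (* e.g. 4 single-precision floats, or 8 half-precision floats, per Uint4x32 cell *)
  assert (cells_per_uint4x32 ~target_bytes:4 = 4);
  assert (cells_per_uint4x32 ~target_bytes:2 = 8)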

lib/shape.ml

Lines changed: 6 additions & 0 deletions
@@ -94,8 +94,14 @@ type transpose_type =
   | Pointwise_un
   | Permute of string
   | Batch_slice of Idx.static_symbol
+  | Uint4x32_to_prec of Ir.Ops.prec Lazy.t
 [@@deriving equal, sexp]
 
+type terminal_type =
+  | Data of Ir.Assignments.init_data
+  | Fetch of Ir.Assignments.fetch_op
+[@@deriving equal, sexp_of]
+
 type ternary_type = Pointwise_tern | Compose_accumulate [@@deriving sexp, equal]
 
 let identifier ~multichar =

lib/shape.mli

Lines changed: 9 additions & 1 deletion
@@ -101,6 +101,10 @@ type transpose_type =
   | Pointwise_un (** Preserves the shape. *)
   | Permute of string (** The unary "einsum" syntax: RHS1=>LHS. *)
   | Batch_slice of Ir.Indexing.static_symbol (** Removes the leftmost batch axis. *)
+  | Uint4x32_to_prec of Ir.Ops.prec Lazy.t
+      (** Converts precision in a bit-efficient way, with a corresponding conversion in total number
+          of elements. Currently, assumes the incoming tensor (RHS) has just a single axis to not
+          force unnecessary minimum sizes on output axes. *)
 [@@deriving equal, sexp]
 
 (** If you miss expressivity here, leave a note on
@@ -110,6 +114,10 @@ type ternary_type =
   | Compose_accumulate (** As in the operation [FMA]. *)
 [@@deriving equal, sexp]
 
+(** Extracts any available shape information from the initialization or fetch. *)
+type terminal_type = Data of Ir.Assignments.init_data | Fetch of Ir.Assignments.fetch_op
+[@@deriving equal, sexp_of]
+
 val make :
   ?batch_dims:int list ->
   ?input_dims:int list ->
@@ -148,7 +156,7 @@ type logic =
       (** Permutes the axes of a shape. One case of [Transpose] is to swap inputs with outputs of
          [s1], hence the name. *)
   | Broadcast_tern of ternary_type * t * t * t (** Matches the shapes for a ternary operation. *)
-  | Terminal of [ `Data of Ir.Assignments.init_data | `Fetch of Ir.Assignments.fetch_op ]
+  | Terminal of terminal_type
      (** Extracts any available shape information from the initialization. *)
 [@@deriving equal, sexp_of]
 
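
With `terminal_type` now a named type, the `Terminal` case of `logic` carries `Data ...` or `Fetch ...` rather than the previous inline polymorphic variants, so call sites move from `Terminal (`Fetch fetch_op)` to `Terminal (Fetch fetch_op)`. A small hypothetical illustration, not part of this commit, of constructing such a value from outside the module:

let terminal_of_fetch (fetch_op : Ir.Assignments.fetch_op) : Shape.logic =
  Shape.Terminal (Shape.Fetch fetch_op)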

lib/tensor.mli

Lines changed: 5 additions & 5 deletions
@@ -129,19 +129,19 @@ val op :
   ?ternary_op:Shape.ternary_type ->
   ?compose_op:Shape.compose_type ->
   ?transpose_op:Shape.transpose_type ->
-  ?init_data:Ir.Assignments.init_data ->
-  ?fetch_op:fetch_op ->
+  ?terminal_op:Shape.terminal_type ->
   op_asn:(v:tn -> projections:projections Lazy.t -> comp) ->
   grad_asn:(t:t -> g:tn -> projections:projections Lazy.t -> comp) ->
   ?grad_spec:grad_spec ->
   (debug_name:string -> id:int -> Shape.t) ->
   t list ->
   t
-(** At most one of [?ternary_op] or [?compose_op] or [?transpose_op] or [?init_data] or [?fetch_op]
-    should be provided, except when the operation takes more than three arguments which uses both
+(** At most one of [?ternary_op] or [?compose_op] or [?transpose_op] or [?terminal_op] should be
+    provided, except when the operation takes more than three arguments which uses both
     [?compose_op] or [?transpose_op]. The defaults are pointwise operations. The [grad_asn] function
     receives the non-differentiable variant of the tensor as an argument, which can be used to
-    access the tensor's value in a tensor expression. *)
+    access the tensor's value in a tensor expression. The [terminal_op] is used to specify the
+    terminal operation of the tensor. *)
 
 val binop :
   label:string list ->
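
A hypothetical call site, not part of this commit, showing the effect of the signature change: what was previously passed as `?init_data` or `?fetch_op` is now wrapped in `Shape.terminal_type` and passed as the single `?terminal_op`. The `make_shape`, `op_asn` and `grad_asn` arguments are placeholders, and any required parameters of `op` outside this diff hunk are omitted.

let fetched ~fetch_op ~op_asn ~grad_asn make_shape inputs =
  Tensor.op ~terminal_op:(Shape.Fetch fetch_op) ~op_asn ~grad_asn make_shape inputs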
