Refactoring progress: implement code expansion for the new fetch ops Constant_fill and Range_over_offsets

lukstafi · lukstafi · commit 874fa318a9f3 · 2025-06-26T13:32:58.000+02:00
diff --git a/arrayjit/lib/assignments.ml b/arrayjit/lib/assignments.ml
@@ -14,9 +14,11 @@ type buffer = Node of Tn.t | Merge_buffer of Tn.t [@@deriving sexp_of, equal]
 (** Resets an array by performing the specified computation or data fetching. *)
 type fetch_op =
   | Constant of float
-  | Constant_fill of { values : float array; strict : bool }
-      (** Fills in the numbers where the rightmost axis is contiguous. If [strict=false], loops over
-          the provided values. *)
+  | Constant_fill of float array
+      (** Fills in the numbers where the rightmost axis is contiguous. Does not loop over the
+          provided values; shape inference will require the assigned tensor to have the same number
+          of elements. This unrolls all assignments and should be used only for small arrays.
+          Consider using {!Tnode.set_values} instead for larger arrays. *)
   | Range_over_offsets
       (** Fills in the offset number of each cell, i.e. how many cells away it is from the
           beginning, in the logical representation of the tensor node. (The actual in-memory
@@ -141,8 +143,7 @@ let%diagn2_sexp to_low_level code =
     assert (Array.length idcs = Array.length (Lazy.force tn.Tn.dims));
     match buffer with
     | Node tn -> Low_level.Get (tn, idcs)
-    | Merge_buffer tn ->
-        Low_level.Access (Low_level.Merge_buffer { source = tn }, Some idcs)
+    | Merge_buffer tn -> Low_level.Access (Low_level.Merge_buffer { source = tn }, Some idcs)
   in
   let set tn idcs llv =
     if not (Array.length idcs = Array.length (Lazy.force tn.Tn.dims)) then
@@ -239,18 +240,13 @@ let%diagn2_sexp to_low_level code =
     | Fetch { array; fetch_op = Access global; dims } ->
         Low_level.loop_over_dims (Lazy.force dims) ~body:(fun idcs ->
             set array idcs @@ Access (global, Some idcs))
-    | Fetch { array; fetch_op = Range_over_offsets; dims } ->
-        Low_level.loop_over_dims (Lazy.force dims) ~body:(fun idcs ->
-            let offset = Array.foldi idcs ~init:0 ~f:(fun _i acc idx ->
-                match idx with
-                | Fixed_idx j -> acc + j
-                | Iterator _ -> acc  (* Will be computed dynamically *)
-                | Affine _ -> acc    (* Will be computed dynamically *)) in
-            set array idcs @@ Constant (Float.of_int offset))
-    | Fetch { array; fetch_op = Constant_fill { values; strict }; dims } ->
-        Low_level.loop_over_dims (Lazy.force dims) ~body:(fun idcs ->
-            let value = if strict then values.(0) else values.(0) in  (* TODO: implement proper indexing *)
-            set array idcs @@ Constant value)
+    | Fetch { array; fetch_op = Range_over_offsets; dims = (lazy dims) } ->
+        Low_level.loop_over_dims dims ~body:(fun idcs ->
+            let offset = Indexing.reflect_projection ~dims ~projection:idcs in
+            set array idcs @@ Embed_index offset)
+    | Fetch { array; fetch_op = Constant_fill values; dims = (lazy dims) } ->
+        Low_level.unroll_dims dims ~body:(fun idcs ~offset ->
+            set array idcs @@ Constant values.(offset))
   in
   loop code
 
@@ -315,13 +311,14 @@ let to_doc ?name ?static_indices () c =
   let doc_of_fetch_op (op : fetch_op) =
     match op with
     | Constant f -> string (Float.to_string f)
-    | Constant_fill { values; strict } ->
-        let values_str = String.concat ~sep:", " (Array.to_list (Array.map values ~f:Float.to_string)) in
-        string ("constant_fill([" ^ values_str ^ "], strict=" ^ Bool.to_string strict ^ ")")
+    | Constant_fill values ->
+        let values_str =
+          String.concat ~sep:", " (Array.to_list (Array.map values ~f:Float.to_string))
+        in
+        string ("constant_fill([" ^ values_str ^ "])")
     | Range_over_offsets -> string "range_over_offsets"
     | Access (Low_level.C_function c) -> string (c ^ "()")
-    | Access (Low_level.Merge_buffer { source }) ->
-        string (ident source ^ ".merge")
+    | Access (Low_level.Merge_buffer { source }) -> string (ident source ^ ".merge")
     | Access (Low_level.External_unsafe { ptr; prec; dims = _ }) ->
         string (Ops.ptr_to_string_hum ptr prec)
     | Access (Low_level.File_mapped (file, file_prec)) ->
diff --git a/arrayjit/lib/indexing.ml b/arrayjit/lib/indexing.ml
@@ -198,32 +198,18 @@ let identity_projections ?debug_info ?derived_for ~lhs_dims () =
     debug_info;
   }
 
-let derive_index ~product_syms ~(projection : axis_index array) =
-  let sym_to_i =
-    Array.mapi product_syms ~f:(fun i s -> (s, i))
-    |> Array.to_list
-    |> Map.of_alist_exn (module Symbol)
-  in
-  let positions =
-    Array.map projection ~f:(function
-      | Iterator s when Map.mem sym_to_i s -> Either.First (Map.find_exn sym_to_i s)
-      | Fixed_idx _ as it -> Second it
-      | Affine _ as it -> Second it
-      | Iterator _ as it -> Second it)
-  in
-  fun ~product ->
-    Array.map positions ~f:(function
-      | First p -> product.(p)
-      | Second (Fixed_idx i) -> i
-      | Second (Iterator s) ->
-          (* This shouldn't happen if sym_to_i is complete *)
-          failwith ("derive_index: unresolved iterator " ^ symbol_ident s)
-      | Second (Affine { symbols; offset }) ->
-          List.fold symbols ~init:offset ~f:(fun acc (coeff, s) ->
-              match Map.find sym_to_i s with
-              | Some idx -> acc + (coeff * product.(idx))
-              | None ->
-                  failwith ("derive_index: unresolved symbol in affine index " ^ symbol_ident s)))
+(** [reflect_projection ~dims ~projection] collapses the per-axis indices in [projection] into a
+    single [Affine] index giving the position of the addressed cell when the rightmost axis is
+    the fastest-varying one. Raises (via [Array.zip_exn]) when [dims] and [projection] differ in
+    length. *)
+let reflect_projection ~(dims : int array) ~(projection : axis_index array) =
+  (* One step of the right-to-left sweep: [stride] is the product of the dimensions already
+     visited, [terms] the symbol coefficients collected so far, [const] the constant part. *)
+  let step (dim, idx) (stride, terms, const) =
+    let outer_stride = stride * dim in
+    match idx with
+    | Fixed_idx i -> (outer_stride, terms, const + (i * stride))
+    | Iterator sym -> (outer_stride, (stride, sym) :: terms, const)
+    | Affine { symbols = inner; offset = inner_const } ->
+        (* An affine axis index contributes each of its terms scaled by the axis stride. *)
+        let scaled = List.map inner ~f:(fun (coeff, sym) -> (coeff * stride, sym)) in
+        (outer_stride, scaled @ terms, const + (inner_const * stride))
+  in
+  let _, symbols, offset =
+    Array.fold_right (Array.zip_exn dims projection) ~init:(1, [], 0) ~f:step
+  in
+  Affine { symbols; offset }
 
 module Pp_helpers = struct
   open PPrint
diff --git a/arrayjit/lib/low_level.ml b/arrayjit/lib/low_level.ml
@@ -295,7 +295,7 @@ let%diagn2_sexp check_and_store_virtual traced static_indices top_llc =
                function
                | Fixed_idx _ -> None
                | Iterator s -> Option.some_if (not @@ Set.mem static_indices s) s
-               | Affine { symbols; offset } -> (
+               | Affine { symbols; offset = _ } -> (
                    (* For affine indices, collect all symbols that are not static *)
                    List.filter_map symbols ~f:(fun (_, s) ->
                        Option.some_if (not @@ Set.mem static_indices s) s)
@@ -991,18 +991,21 @@ let to_doc_cstyle ?name ?static_indices () llc =
         string (Ops.ptr_to_string_hum ptr prec)
     | Access (External_unsafe { ptr; prec; dims = _ }, Some idcs) ->
         string (Ops.ptr_to_string_hum ptr prec) ^^ brackets (pp_indices idcs)
-    | Access (Merge_buffer { source }, None) ->
-        doc_ident source ^^ string ".merge"
+    | Access (Merge_buffer { source }, None) -> doc_ident source ^^ string ".merge"
     | Access (Merge_buffer { source }, Some idcs) ->
         group (doc_ident source ^^ string ".merge" ^^ brackets (pp_indices idcs))
     | Access (File_mapped (file, prec), None) ->
-        string ("file_mapped(\"" ^ file ^ "\", " ^ Ops.precision_to_string prec ^ ")")
+        string ("file_mapped(\"" ^ file ^ "\", " ^ Ops.prec_string prec ^ ")")
     | Access (File_mapped (file, prec), Some idcs) ->
-        string ("file_mapped(\"" ^ file ^ "\", " ^ Ops.precision_to_string prec ^ ")") ^^ brackets (pp_indices idcs)
+        string ("file_mapped(\"" ^ file ^ "\", " ^ Ops.prec_string prec ^ ")")
+        ^^ brackets (pp_indices idcs)
     | Access (Uint4x32_to_prec_uniform { source; prec }, None) ->
-        string ("uint4x32_to_" ^ Ops.precision_to_string prec ^ "_uniform(") ^^ doc_ident source ^^ string ")"
+        string ("uint4x32_to_" ^ Ops.prec_string prec ^ "_uniform(")
+        ^^ doc_ident source ^^ string ")"
     | Access (Uint4x32_to_prec_uniform { source; prec }, Some idcs) ->
-        string ("uint4x32_to_" ^ Ops.precision_to_string prec ^ "_uniform(") ^^ doc_ident source ^^ string ")" ^^ brackets (pp_indices idcs)
+        string ("uint4x32_to_" ^ Ops.prec_string prec ^ "_uniform(")
+        ^^ doc_ident source ^^ string ")"
+        ^^ brackets (pp_indices idcs)
     | Get (tn, idcs) -> group (doc_ident tn ^^ brackets (pp_indices idcs))
     | Constant c -> string (Printf.sprintf "%.16g" c)
     | Embed_index idx -> pp_axis_index idx
@@ -1075,18 +1078,21 @@ let to_doc ?name ?static_indices () llc =
         string (Ops.ptr_to_string_hum ptr prec)
     | Access (External_unsafe { ptr; prec; dims = _ }, Some idcs) ->
         string (Ops.ptr_to_string_hum ptr prec) ^^ brackets (pp_indices idcs)
-    | Access (Merge_buffer { source }, None) ->
-        doc_ident source ^^ string ".merge"
+    | Access (Merge_buffer { source }, None) -> doc_ident source ^^ string ".merge"
     | Access (Merge_buffer { source }, Some idcs) ->
         group (doc_ident source ^^ string ".merge" ^^ brackets (pp_indices idcs))
     | Access (File_mapped (file, prec), None) ->
-        string ("file_mapped(\"" ^ file ^ "\", " ^ Ops.precision_to_string prec ^ ")")
+        string ("file_mapped(\"" ^ file ^ "\", " ^ Ops.prec_string prec ^ ")")
     | Access (File_mapped (file, prec), Some idcs) ->
-        string ("file_mapped(\"" ^ file ^ "\", " ^ Ops.precision_to_string prec ^ ")") ^^ brackets (pp_indices idcs)
+        string ("file_mapped(\"" ^ file ^ "\", " ^ Ops.prec_string prec ^ ")")
+        ^^ brackets (pp_indices idcs)
     | Access (Uint4x32_to_prec_uniform { source; prec }, None) ->
-        string ("uint4x32_to_" ^ Ops.precision_to_string prec ^ "_uniform(") ^^ doc_ident source ^^ string ")"
+        string ("uint4x32_to_" ^ Ops.prec_string prec ^ "_uniform(")
+        ^^ doc_ident source ^^ string ")"
     | Access (Uint4x32_to_prec_uniform { source; prec }, Some idcs) ->
-        string ("uint4x32_to_" ^ Ops.precision_to_string prec ^ "_uniform(") ^^ doc_ident source ^^ string ")" ^^ brackets (pp_indices idcs)
+        string ("uint4x32_to_" ^ Ops.prec_string prec ^ "_uniform(")
+        ^^ doc_ident source ^^ string ")"
+        ^^ brackets (pp_indices idcs)
     | Get (tn, idcs) -> group (doc_ident tn ^^ brackets (pp_indices idcs))
     | Constant c -> string (Printf.sprintf "%.16g" c)
     | Embed_index idx -> pp_axis_index idx
@@ -1138,3 +1144,33 @@ let loop_over_dims dims ~body =
           }
   in
   for_loop [] (Array.to_list dims)
+
+(** [unroll_dims dims ~body] expands [body] once for every cell of an array with dimensions
+    [dims], instead of emitting loops. Each call receives the cell's indices as [Fixed_idx]
+    values together with [~offset], the cell's position when the rightmost axis varies fastest.
+    The results for each axis are grouped with [unflat_lines], in increasing index order. *)
+let unroll_dims dims ~body =
+  let rank = Array.length dims in
+  (* [strides.(k)] is the product of the dimensions to the right of axis [k]. *)
+  let strides = Array.create ~len:rank 1 in
+  for axis = rank - 2 downto 0 do
+    strides.(axis) <- dims.(axis + 1) * strides.(axis + 1)
+  done;
+  (* [rev_prefix] holds the indices already fixed for axes [0 .. axis - 1], innermost first;
+     [base] is the offset contributed by those axes. With no axes at all, [body] is called
+     exactly once with an empty index array and offset 0. *)
+  let rec expand rev_prefix ~base ~axis =
+    if axis >= rank then body (Array.of_list_rev rev_prefix) ~offset:base
+    else
+      List.range 0 dims.(axis)
+      |> List.map ~f:(fun i ->
+             expand
+               (Indexing.Fixed_idx i :: rev_prefix)
+               ~base:(base + (i * strides.(axis)))
+               ~axis:(axis + 1))
+      |> unflat_lines
+  in
+  expand [] ~base:0 ~axis:0
diff --git a/arrayjit/lib/low_level.mli b/arrayjit/lib/low_level.mli
@@ -66,6 +66,7 @@ val apply_op : Ops.op -> float_t array -> float_t
 val flat_lines : t list -> t list
 val unflat_lines : t list -> t
 val loop_over_dims : int array -> body:(Indexing.axis_index array -> t) -> t
+val unroll_dims : int array -> body:(Indexing.axis_index array -> offset:int -> t) -> t
 
 (** {2 Optimization} *)
 
diff --git a/arrayjit/lib/lowering_and_inlining.md b/arrayjit/lib/lowering_and_inlining.md
@@ -76,7 +76,7 @@ TODO: flesh out explanation.
 
 ## Translation
 
-The translation `Assignments.to_low_level` is straightforward. Commented code blocks are delineated by `Low_level.Comment "end"` statements. Indices into tensor nodes are derived from the `projections` fields by the `Indexing.derive_index` function. We translate `projections.product_space` elements into for loops. `to_low_level` returns all the data that `Low_level` optimizations generated, so that backends can make more informed decisions when jitting, i.e. emitting the backend-specific code.
+The translation `Assignments.to_low_level` is straightforward. Commented code blocks are delineated by `Low_level.Comment "end"` statements. Indices into tensor nodes are derived from the `projections` fields. We translate `projections.product_space` elements into for loops. `to_low_level` returns all the data that `Low_level` optimizations generated, so that backends can make more informed decisions when jitting, i.e. emitting the backend-specific code.
 
 ## Inlining
 
diff --git a/arrayjit/test/test_numerical_types.ml b/arrayjit/test/test_numerical_types.ml
@@ -16,7 +16,7 @@ let test_bfloat16_conversions () =
   (* Test round-trip through ndarray *)
   let arr =
     Ndarray.create_array ~debug:"test" Ops.bfloat16 ~dims:[| 3; 2 |] ~padding:None
-      (Ops.Constant_fill { values = [| 1.0; 2.0; 3.14; -1.5; 0.125; 1000.0 |]; strict = true })
+      (Assignments.Constant_fill [| 1.0; 2.0; 3.14; -1.5; 0.125; 1000.0 |])
   in
 
   Stdio.printf "\nBFloat16 array values:\n";
diff --git a/lib/operation.ml b/lib/operation.ml
@@ -386,9 +386,10 @@ let embed_symbol ?(label = []) static_sym : Tensor.t =
 
 let random_seed =
+  (* A mutable reference to the tensor holding the random seed: a single-value [Constant_fill]
+     terminal with gradients prohibited, taken from [Utils.settings.fixed_state_for_init]
+     (42 when unset), marked [Effectively_constant] and given [uint4x32] precision. *)
   let seed = Option.value ~default:42 @@ Utils.settings.fixed_state_for_init in
-  let res = Tensor.term ~label:[ "random_seed" ] ~grad_spec:Prohibit_grad
-    ~fetch_op:(Asgns.Constant_fill { values = [| seed |]; strict = true })
-    () in
+  let res =
+    (* [seed] is an [int] (see [~default:42]) while [Constant_fill] carries a [float array], so
+       the value is converted explicitly. *)
+    Tensor.term ~label:[ "random_seed" ] ~grad_spec:Prohibit_grad
+      ~fetch_op:(Asgns.Constant_fill [| Float.of_int seed |])
+      ()
+  in
   Tn.update_memory_mode res.value Tn.Effectively_constant 24;
   Tn.update_prec res.value Ir.Ops.uint4x32;
   ref res
@@ -462,21 +463,15 @@ module TDSL = struct
   let stop_gradient = stop_gradient
 
   (** The input [i] dimensions default to empty. The batch dimensions will be inferred if omitted.
-      [strict] controls whether [Constant_fill] will try to fit the given values in the tensor and
-      contribute to shape inference. If it is not provided explicitly, it will be [true] if [b] is
-      omitted, and [false] otherwise. *)
-  let init_const ~l ?strict ?b ?(i = []) ~o values =
-    let strict =
-      match (strict, b) with Some s, _ -> s | None, Some _ -> false | None, None -> true
-    in
+  *)
+  let init_const ~l ?b ?(i = []) ~o values =
     Tensor.term ~label:[ l ] ~grad_spec:Prohibit_grad ?batch_dims:b ~input_dims:i ~output_dims:o
-      ~fetch_op:(Constant_fill { values; strict })
-      ()
+      ~fetch_op:(Asgns.Constant_fill values) ()
 
   (** It's like `Tensor.param` but without shape inference. *)
   let init_param ~l ?(b = []) ?(i = []) ?(o = []) values =
     Tensor.term ~label:[ l ] ~grad_spec:Require_grad ~batch_dims:b ~input_dims:i ~output_dims:o
-      ~fetch_op:(Constant_fill { values; strict = false })
+      ~fetch_op:(Asgns.Constant_fill values)
       ()
 end
 
diff --git a/lib/shape.ml b/lib/shape.ml
@@ -394,16 +394,16 @@ let%debug4_sexp get_inequalities ({ shape = cur_sh; logic; id = _ } as _upd : up
     [ Terminal_row cur_sh.batch; Terminal_row cur_sh.input; Terminal_row cur_sh.output ]
   in
   match logic with
-  | Terminal (Range_over_offsets | Standard_uniform | Constant_fill { strict = false; _ }) ->
+  | Terminal (Range_over_offsets | Standard_uniform) ->
       (Row.dim_map_empty, mark_terminal ())
-  | Terminal (Constant_fill { values; strict = true }) ->
+  | Terminal (Constant_fill values) ->
       let len = Array.length values in
       let io_dims =
         try List.map ~f:dim_to_int_exn @@ cur_sh.output.dims @ cur_sh.input.dims
         with Invalid_argument _ ->
           raise
           @@ Shape_error
-               ( "unify_shapes Constant_fill strict: non-batch dimensions must be known",
+               ( "unify_shapes Constant_fill: non-batch dimensions must be known",
                  [ Shape_mismatch [ cur_sh ] ] )
       in
       let batch_elems = len / abs (List.fold ~init:1 ~f:( * ) io_dims) in
@@ -423,7 +423,7 @@ let%debug4_sexp get_inequalities ({ shape = cur_sh; logic; id = _ } as _upd : up
         with Invalid_argument _ ->
           raise
           @@ Shape_error
-               ( "unify_shapes Constant_fill strict: non-batch dimensions must be known",
+               ( "unify_shapes Constant_fill: non-batch dimensions must be known",
                  [ Shape_mismatch [ cur_sh ] ] )
       in
       let batch_elems = len / abs (List.fold ~init:1 ~f:( * ) io_dims) in
diff --git a/lib/shape_inference.md b/lib/shape_inference.md
@@ -210,7 +210,7 @@ There is an important and intentional difference between `dims` in the `arrayjit
 Other important functions in the `Shape` module.
 
 * `einsum_slot_spec_to_dims_bio ~generative` parses an einsum spec for a single shape, returns the three rows and a mapping from axis (`dim`) variables to indices where the einsum specifies fixed indexing. When `generative` is true for the kind of a row, when an axis has a fixed projection to dimension 0, the axis is not a variable added to the fixed indexing mapping, but is instead dimension-1 (solved). The "generative" rows are the ones with no initial user-provided shape information. This is just a heuristic to avoid surprises where a tensor axis with only dimension 0 populated gets inferred a bigger dimension size -- it might be revisited in the future.
-* `get_inequalities` builds row inequalities by pairing the rows of the current shape (as `cur`) with the rows of sub-shapes (as `subr`). It also derives a batch row constraint for terminals initialized with `Constant_fill { values; strict = true }` and `File_mapped (filename, prec)` (where the file is scanned to get its length). For `Batch_slice` (the `@|` operation) it waits till the batch row variables (if any) are solved, and derives row equations (not inequalities) between the current shape and the sub-shape, with `cur_sh.batch.dims` expanded to account for the slicing / indexing. For einsum specs, it derives inequalities, roughly: _current shape ≥ lhs spec shape_, and _rhs spec shape ≥ sub-shape_.
+* `get_inequalities` builds row inequalities by pairing the rows of the current shape (as `cur`) with the rows of sub-shapes (as `subr`). It also derives a batch row constraint for terminals initialized with `Constant_fill values` and `File_mapped (filename, prec)` (where the file is scanned to get its length). For `Batch_slice` (the `@|` operation) it waits till the batch row variables (if any) are solved, and derives row equations (not inequalities) between the current shape and the sub-shape, with `cur_sh.batch.dims` expanded to account for the slicing / indexing. For einsum specs, it derives inequalities, roughly: _current shape ≥ lhs spec shape_, and _rhs spec shape ≥ sub-shape_.
 * `propagate_shapes` gets and then solves the inequalities, using a global state for the environment. It updates the shapes in-place with the partial solution. It is invoked twice for each `update_step`: first during the bottom-up process of building tensors, and then in reverse order from `finish_inference`.
 * `finish_inference` is called right before some projections or array dimensions are required (typically, because of jitting). It performs a second round of `propagate_shapes`, and then once again attempts to solve any remaining constraints that `propagate_shapes` didn't solve. Then it "closes the shapes": substitutes out remaining shape variables by their LUBs if any, or dimension-1 / `Broadcastable` (no-more-axes). Then it resets the environment state, since the shapes are now guaranteed to not have variables.
 * `derive_projections` starts by freshening the `proj_id`s in the `update_step`. Then it generates and solves shape inequalities, and then generates and solves projection equations, and constructs the `projections` record.
diff --git a/lib/tensor.ml b/lib/tensor.ml
diff --git a/test/primitive_ops.ml b/test/primitive_ops.ml
diff --git a/test/zero2hero_1of7.ml b/test/zero2hero_1of7.ml