Support padding via pre-padded ndarrays (my idea, Claude's code)

lukstafi · lukstafi · commit 24ee10feb649 · 2025-06-05T12:49:56.000+02:00
diff --git a/arrayjit/lib/ndarray.ml b/arrayjit/lib/ndarray.ml
@@ -129,74 +129,126 @@ let init_bigarray_of_prec (type ocaml elt_t) (prec : (ocaml, elt_t) Ops.precisio
 let indices_to_offset ~dims ~idcs =
   Array.fold2_exn dims idcs ~init:0 ~f:(fun accu dim idx -> (accu * dim) + idx)
 
-let create_bigarray (type ocaml elt_t) (prec : (ocaml, elt_t) Ops.precision) ~dims
+let create_bigarray (type ocaml elt_t) (prec : (ocaml, elt_t) Ops.precision) ~dims ~padding
     (init_op : Ops.init_op) : (ocaml, elt_t) bigarray =
   Option.iter Utils.settings.fixed_state_for_init ~f:(fun seed -> Rand.Lib.init seed);
+  
+  (* Handle padding: dims already includes padding, compute unpadded dimensions *)
+  let unpadded_dims, padding_info = match padding with
+    | None -> dims, None
+    | Some (pad_config, pad_value) ->
+        let unpadded_dims = Array.map2_exn dims pad_config ~f:(fun dim (left, right) -> 
+          dim - left - right) in
+        (unpadded_dims, Some (pad_config, pad_value))
+  in
+  
+  let arr = create_bigarray_of_prec prec dims in
+  
+  (* Fill with padding value if padding is specified *)
+  (match padding_info with
+   | None -> ()
+   | Some (_, pad_value) ->
+       (* Fill the entire array with padding value using precision-specific fill *)
+       (match prec with
+        | Ops.Byte -> A.fill arr (Char.of_int_exn @@ Int.of_float pad_value)
+        | Ops.Uint16 -> A.fill arr (Int.of_float pad_value)
+        | Ops.Int32 -> A.fill arr (Int32.of_float pad_value)
+        | Ops.Half -> A.fill arr pad_value
+        | Ops.Bfloat16 -> A.fill arr (float_to_bfloat16 pad_value)
+        | Ops.Fp8 -> A.fill arr (Char.of_int_exn @@ float_to_fp8 pad_value)
+        | Ops.Single -> A.fill arr pad_value
+        | Ops.Double -> A.fill arr pad_value));
+  
+  (* Helper function to convert unpadded indices to padded indices *)
+  let unpadded_to_padded_indices idcs =
+    match padding_info with
+    | None -> idcs
+    | Some (pad_config, _) ->
+        Array.map2_exn idcs pad_config ~f:(fun idx (left, _) -> idx + left)
+  in
+  
+  (* For non-constant fill operations, we need to iterate over unpadded dimensions *)
+  let init_unpadded_region init_func =
+    let rec loop_dims idcs dim_idx =
+      if dim_idx = Array.length unpadded_dims then
+        let padded_idcs = unpadded_to_padded_indices idcs in
+        A.set arr padded_idcs (init_func idcs)
+      else
+        for i = 0 to unpadded_dims.(dim_idx) - 1 do
+          idcs.(dim_idx) <- i;
+          loop_dims idcs (dim_idx + 1)
+        done
+    in
+    loop_dims (Array.create ~len:(Array.length unpadded_dims) 0) 0
+  in
+  
   let constant_fill_f f values strict =
     let len = Array.length values in
     if strict then (
-      let size = Array.fold ~init:1 ~f:( * ) dims in
+      let size = Array.fold ~init:1 ~f:( * ) unpadded_dims in
       if size <> len then
         raise
         @@ Utils.User_error
              [%string
                "Ndarray.create_bigarray: Constant_fill: invalid data size %{len#Int}, expected \
                 %{size#Int}"];
-      init_bigarray_of_prec prec dims ~f:(fun idcs -> f values.(indices_to_offset ~dims ~idcs)))
+      init_unpadded_region (fun idcs -> f values.(indices_to_offset ~dims:unpadded_dims ~idcs)))
     else
-      init_bigarray_of_prec prec dims ~f:(fun idcs ->
-          f values.(indices_to_offset ~dims ~idcs % len))
+      init_unpadded_region (fun idcs ->
+        f values.(indices_to_offset ~dims:unpadded_dims ~idcs % len))
   in
   let constant_fill_float values strict = constant_fill_f Fn.id values strict in
-  match (prec, init_op) with
+  
+  (match (prec, init_op) with
   | Ops.Byte, Constant_fill { values; strict } ->
-      constant_fill_f (Fn.compose Char.of_int_exn Int.of_float) values strict
+      ignore (constant_fill_f (Fn.compose Char.of_int_exn Int.of_float) values strict)
   | Ops.Byte, Range_over_offsets ->
-      init_bigarray_of_prec prec dims ~f:(fun idcs ->
-          Char.of_int_exn @@ indices_to_offset ~dims ~idcs)
-  | Ops.Byte, Standard_uniform -> init_bigarray_of_prec prec dims ~f:(fun _ -> Rand.Lib.char ())
-  | Ops.Uint16, Constant_fill { values; strict } -> constant_fill_f Int.of_float values strict
+      init_unpadded_region (fun idcs -> Char.of_int_exn @@ indices_to_offset ~dims:unpadded_dims ~idcs)
+  | Ops.Byte, Standard_uniform -> init_unpadded_region (fun _ -> Rand.Lib.char ())
+  | Ops.Uint16, Constant_fill { values; strict } -> 
+      ignore (constant_fill_f Int.of_float values strict)
   | Ops.Uint16, Range_over_offsets ->
-      init_bigarray_of_prec prec dims ~f:(fun idcs -> indices_to_offset ~dims ~idcs)
-  | Ops.Uint16, Standard_uniform -> init_bigarray_of_prec prec dims ~f:(fun _ -> Random.int 65536)
-  | Ops.Int32, Constant_fill { values; strict } -> constant_fill_f Int32.of_float values strict
+      init_unpadded_region (fun idcs -> indices_to_offset ~dims:unpadded_dims ~idcs)
+  | Ops.Uint16, Standard_uniform -> init_unpadded_region (fun _ -> Random.int 65536)
+  | Ops.Int32, Constant_fill { values; strict } -> 
+      ignore (constant_fill_f Int32.of_float values strict)
   | Ops.Int32, Range_over_offsets ->
-      init_bigarray_of_prec prec dims ~f:(fun idcs ->
-          Int32.of_int_exn @@ indices_to_offset ~dims ~idcs)
+      init_unpadded_region (fun idcs -> Int32.of_int_exn @@ indices_to_offset ~dims:unpadded_dims ~idcs)
   | Ops.Int32, Standard_uniform ->
-      init_bigarray_of_prec prec dims ~f:(fun _ -> Random.int32 Int32.max_value)
-  | Ops.Half, Constant_fill { values; strict } -> constant_fill_float values strict
+      init_unpadded_region (fun _ -> Random.int32 Int32.max_value)
+  | Ops.Half, Constant_fill { values; strict } -> ignore (constant_fill_float values strict)
   | Ops.Half, Range_over_offsets ->
-      init_bigarray_of_prec prec dims ~f:(fun idcs -> Float.of_int @@ indices_to_offset ~dims ~idcs)
+      init_unpadded_region (fun idcs -> Float.of_int @@ indices_to_offset ~dims:unpadded_dims ~idcs)
   | Ops.Half, Standard_uniform ->
-      init_bigarray_of_prec prec dims ~f:(fun _ -> Rand.Lib.float_range 0.0 1.0)
+      init_unpadded_region (fun _ -> Rand.Lib.float_range 0.0 1.0)
   | Ops.Bfloat16, Constant_fill { values; strict } ->
-      constant_fill_f float_to_bfloat16 values strict
+      ignore (constant_fill_f float_to_bfloat16 values strict)
   | Ops.Bfloat16, Range_over_offsets ->
-      init_bigarray_of_prec prec dims ~f:(fun idcs ->
-          float_to_bfloat16 @@ Float.of_int @@ indices_to_offset ~dims ~idcs)
+      init_unpadded_region (fun idcs ->
+          float_to_bfloat16 @@ Float.of_int @@ indices_to_offset ~dims:unpadded_dims ~idcs)
   | Ops.Bfloat16, Standard_uniform ->
-      init_bigarray_of_prec prec dims ~f:(fun _ ->
-          float_to_bfloat16 @@ Rand.Lib.float_range 0.0 1.0)
+      init_unpadded_region (fun _ -> float_to_bfloat16 @@ Rand.Lib.float_range 0.0 1.0)
   | Ops.Fp8, Constant_fill { values; strict } ->
-      constant_fill_f (Fn.compose Char.of_int_exn float_to_fp8) values strict
+      ignore (constant_fill_f (Fn.compose Char.of_int_exn float_to_fp8) values strict)
   | Ops.Fp8, Range_over_offsets ->
-      init_bigarray_of_prec prec dims ~f:(fun idcs ->
-          Char.of_int_exn @@ float_to_fp8 @@ Float.of_int @@ indices_to_offset ~dims ~idcs)
+      init_unpadded_region (fun idcs ->
+          Char.of_int_exn @@ float_to_fp8 @@ Float.of_int @@ indices_to_offset ~dims:unpadded_dims ~idcs)
   | Ops.Fp8, Standard_uniform ->
-      init_bigarray_of_prec prec dims ~f:(fun _ ->
-          Char.of_int_exn @@ float_to_fp8 @@ Rand.Lib.float_range 0.0 1.0)
-  | Ops.Single, Constant_fill { values; strict } -> constant_fill_float values strict
+      init_unpadded_region (fun _ -> Char.of_int_exn @@ float_to_fp8 @@ Rand.Lib.float_range 0.0 1.0)
+  | Ops.Single, Constant_fill { values; strict } -> ignore (constant_fill_float values strict)
   | Ops.Single, Range_over_offsets ->
-      init_bigarray_of_prec prec dims ~f:(fun idcs -> Float.of_int @@ indices_to_offset ~dims ~idcs)
+      init_unpadded_region (fun idcs -> Float.of_int @@ indices_to_offset ~dims:unpadded_dims ~idcs)
   | Ops.Single, Standard_uniform ->
-      init_bigarray_of_prec prec dims ~f:(fun _ -> Rand.Lib.float_range 0.0 1.0)
-  | Ops.Double, Constant_fill { values; strict } -> constant_fill_float values strict
+      init_unpadded_region (fun _ -> Rand.Lib.float_range 0.0 1.0)
+  | Ops.Double, Constant_fill { values; strict } -> ignore (constant_fill_float values strict)
   | Ops.Double, Range_over_offsets ->
-      init_bigarray_of_prec prec dims ~f:(fun idcs -> Float.of_int @@ indices_to_offset ~dims ~idcs)
+      init_unpadded_region (fun idcs -> Float.of_int @@ indices_to_offset ~dims:unpadded_dims ~idcs)
   | Ops.Double, Standard_uniform ->
-      init_bigarray_of_prec prec dims ~f:(fun _ -> Rand.Lib.float_range 0.0 1.0)
+      init_unpadded_region (fun _ -> Rand.Lib.float_range 0.0 1.0)
   | _, File_mapped (filename, stored_prec) ->
+      (* For file mapping, we don't support padding yet - require no padding *)
+      if Option.is_some padding then
+        raise @@ Utils.User_error "Ndarray.create_bigarray: File_mapped initialization does not support padding";
       (* See: https://github.com/janestreet/torch/blob/master/src/torch/dataset_helper.ml#L3 *)
       if not @@ Ops.equal_prec stored_prec (Ops.pack_prec prec) then
         raise
@@ -215,12 +267,13 @@ let create_bigarray (type ocaml elt_t) (prec : (ocaml, elt_t) Ops.precision) ~di
              [%string
                "Ndarray.create_bigarray: File_mapped: invalid file bytes %{len#Int}, expected \
                 %{size * Ops.prec_in_bytes stored_prec#Int}"]);
-      let ba =
+      let file_arr =
         Unix.map_file fd (precision_to_bigarray_kind prec) Bigarray.c_layout false dims
           ~pos:(Int64.of_int 0)
       in
       Unix.close fd;
-      ba
+      A.blit file_arr arr);
+  arr
 
 (** {2 *** Accessing ***} *)
 
@@ -331,7 +384,8 @@ let reset_bigarray (init_op : Ops.init_op) (type o b) (prec : (o, b) Ops.precisi
   | Ops.Int32, Constant_fill { values; strict } -> constant_set_f Int32.of_float values strict
   | Ops.Int32, Range_over_offsets ->
       set_bigarray arr ~f:(fun idcs -> Int32.of_int_exn @@ indices_to_offset ~dims ~idcs)
-  | Ops.Int32, Standard_uniform -> set_bigarray arr ~f:(fun _ -> Random.int32 Int32.max_value)
+  | Ops.Int32, Standard_uniform ->
+      set_bigarray arr ~f:(fun _ -> Random.int32 Int32.max_value)
   | Ops.Half, Constant_fill { values; strict } -> constant_set_float values strict
   | Ops.Half, Range_over_offsets ->
       set_bigarray arr ~f:(fun idcs -> Float.of_int @@ indices_to_offset ~dims ~idcs)
@@ -357,7 +411,7 @@ let reset_bigarray (init_op : Ops.init_op) (type o b) (prec : (o, b) Ops.precisi
   | Ops.Double, Range_over_offsets ->
       set_bigarray arr ~f:(fun idcs -> Float.of_int @@ indices_to_offset ~dims ~idcs)
   | Ops.Double, Standard_uniform -> set_bigarray arr ~f:(fun _ -> Rand.Lib.float_range 0.0 1.0)
-  | _, File_mapped _ -> A.blit (create_bigarray prec ~dims init_op) arr
+  | _, File_mapped _ -> A.blit (create_bigarray prec ~dims ~padding:None init_op) arr
 
 let reset init_op arr =
   let f arr = reset_bigarray init_op arr in
@@ -504,16 +558,17 @@ let hash_t nd = Nativeint.hash @@ to_native nd
 
 let used_memory = Atomic.make 0
 
-let%track7_sexp create_array ~debug:(_debug : string) (prec : Ops.prec) ~(dims : int array) init_op
-    =
+let%track7_sexp create_array ~debug:(_debug : string) (prec : Ops.prec) ~(dims : int array)
+    ~padding init_op =
+  (* dims already includes padding if padding is specified *)
   let size_in_bytes : int =
     (if Array.length dims = 0 then 0 else Array.reduce_exn dims ~f:( * )) * Ops.prec_in_bytes prec
   in
   let%track7_sexp finalizer (_result : t) =
     let _ : int = Atomic.fetch_and_add used_memory size_in_bytes in
     [%log3 "Deleting", _debug, ptr_to_string_hum _result]
   in
-  let f prec = as_array prec @@ create_bigarray prec ~dims init_op in
+  let f prec = as_array prec @@ create_bigarray prec ~dims ~padding init_op in
   let result = Ops.map_prec { f } prec in
   Stdlib.Gc.finalise finalizer result;
   let _ : int = Atomic.fetch_and_add used_memory size_in_bytes in
@@ -524,7 +579,7 @@ let%track7_sexp create_array ~debug:(_debug : string) (prec : Ops.prec) ~(dims :
   result
 
 let empty_array prec =
-  create_array prec ~dims:[||] (Constant_fill { values = [| 0.0 |]; strict = false })
+  create_array prec ~dims:[||] ~padding:None (Constant_fill { values = [| 0.0 |]; strict = false })
 
 let get_used_memory () = Atomic.get used_memory
 
diff --git a/arrayjit/lib/tnode.ml b/arrayjit/lib/tnode.ml
@@ -71,6 +71,9 @@ type t = {
   array : Nd.t option Lazy.t;
   prec : Ops.prec Lazy.t;
   dims : int array Lazy.t;
+  padding : ((int * int) array * float) option Lazy.t;
+      (** If the tensor node is pre-padded, this is the pair (left padding, right padding) and the
+          padding value. *)
   size_in_bytes : int Lazy.t;
   id : int;
   label : string list;
@@ -94,6 +97,15 @@ let num_elems tn =
   let dims = Lazy.force tn.dims in
   if Array.is_empty dims then 0 else Array.reduce_exn dims ~f:( * )
 
+let dims_without_padding tn =
+  match Lazy.force tn.padding with
+  | None -> Lazy.force tn.dims
+  | Some (padding, _) ->
+      let dims = Lazy.force tn.dims in
+      Array.map2_exn dims padding ~f:(fun dim (left, right) -> dim - left - right)
+
+let get_padding tn = Lazy.force tn.padding
+
 let id { id; _ } = "n" ^ Int.to_string id
 let label a = String.concat ~sep:"_" a.label
 let is_alphanum_ = String.for_all ~f:(fun c -> Char.equal c '_' || Char.is_alphanum c)
@@ -522,12 +534,14 @@ end)
 
 let registry = Registry.create 16
 
-let create ?default_prec ~id ~label ~dims init_op =
+let create ?default_prec ~id ~label ~dims ~padding init_op =
   let debug = "Host array for " ^ get_debug_name ~id ~label () in
   let rec array =
     lazy
       (if is_hosted_force tn 30 then
-         Some (Nd.create_array ~debug (Lazy.force prec) ~dims:(Lazy.force dims) init_op)
+         Some
+           (Nd.create_array ~debug (Lazy.force prec) ~dims:(Lazy.force dims) ~padding:(Lazy.force padding)
+              init_op)
        else None)
   and prec =
     lazy
@@ -545,6 +559,7 @@ let create ?default_prec ~id ~label ~dims init_op =
       delayed_prec_unsafe;
       prec;
       dims;
+      padding;
       size_in_bytes;
       id;
       label;
@@ -572,6 +587,7 @@ let find =
       prec = lazy initial_default_prec;
       delayed_prec_unsafe = Specified initial_default_prec;
       dims = lazy [||];
+      padding = lazy None;
       size_in_bytes = lazy 0;
       id = -1;
       label = [];
diff --git a/arrayjit/test/test_numerical_types.ml b/arrayjit/test/test_numerical_types.ml
@@ -15,7 +15,7 @@ let test_bfloat16_conversions () =
 
   (* Test round-trip through ndarray *)
   let arr =
-    Ndarray.create_array ~debug:"test" Ops.bfloat16 ~dims:[| 3; 2 |]
+    Ndarray.create_array ~debug:"test" Ops.bfloat16 ~dims:[| 3; 2 |] ~padding:None
       (Ops.Constant_fill { values = [| 1.0; 2.0; 3.14; -1.5; 0.125; 1000.0 |]; strict = true })
   in
 
@@ -36,14 +36,43 @@ let test_fp8_conversions () =
 
   (* Test round-trip through ndarray *)
   let arr =
-    Ndarray.create_array ~debug:"test" Ops.fp8 ~dims:[| 2; 2 |]
+    Ndarray.create_array ~debug:"test" Ops.fp8 ~dims:[| 2; 2 |] ~padding:None
       (Ops.Constant_fill { values = [| 1.0; 0.5; 2.0; -1.0 |]; strict = true })
   in
 
   Stdio.printf "\nFP8 array values:\n";
   let flat_values = Ndarray.retrieve_flat_values arr in
   Array.iteri flat_values ~f:(fun i v -> Stdio.printf "  [%d] = %.6f\n" i v)
 
+let test_padding () =
+  Stdio.printf "\n\nTesting padding functionality:\n";
+
+  (* Test padding with float32 array *)
+  let padding_config = [| (1, 1); (2, 1) |] in (* left=1,right=1 for first dim; left=2,right=1 for second dim *)
+  let padding_value = -999.0 in
+
+  let padded_dims = [| 4; 6 |] in (* (2+1+1) x (3+2+1) *)
+  
+  let arr =
+    Ndarray.create_array ~debug:"padded_test" Ops.single ~dims:padded_dims 
+      ~padding:(Some (padding_config, padding_value))
+      (Ops.Constant_fill { values = [| 1.0; 2.0; 3.0; 4.0; 5.0; 6.0 |]; strict = true })
+  in
+
+  Stdio.printf "Padded array (dims 4x6, unpadded region 2x3):\n";
+  let dims = Ndarray.dims arr in
+  for i = 0 to dims.(0) - 1 do
+    for j = 0 to dims.(1) - 1 do
+      let idx = [| i; j |] in
+      let value = Ndarray.get_as_float arr idx in
+      Stdio.printf "%8.1f " value;
+    done;
+    Stdio.printf "\n"
+  done;
+  
+  Stdio.printf "\nExpected: padding value (-999.0) in margins, data values (1.0-6.0) in center region\n"
+
 let () =
   test_bfloat16_conversions ();
-  test_fp8_conversions ()
+  test_fp8_conversions ();
+  test_padding ()
diff --git a/lib/row.ml b/lib/row.ml
@@ -511,16 +511,12 @@ let rec subst_dim env = function
       | Some (Solved_dim (Var v2)) when equal_dim_var v v2 -> default
       | Some (Solved_dim d) -> subst_dim env d
       | _ -> default)
-  | Affine { solved; unsolved } ->
-      (* Substitute variables in affine expression *)
-      let new_unsolved = List.filter_map unsolved ~f:(fun (coeff, v) ->
+  | Affine { solved = _; unsolved } as init ->
+      List.fold unsolved ~init ~f:(fun acc (_coeff, v) ->
         match Map.find env.dim_env v with
-        | Some (Solved_dim _) -> 
-            (* Variable is solved, need to incorporate into affine expression *)
-            None  (* Will be handled below *)
-        | _ -> Some (coeff, v)) in
-      (* FIXME: properly handle substitution of solved variables into affine expressions *)
-      Affine { solved; unsolved = new_unsolved }
+        | Some (Solved_dim d) -> 
+            s_dim_one v ~value:d ~in_:acc
+        | _ -> acc)
 
 let s_row_one v ~value:{ dims = more_dims; bcast; id = _ } ~in_ =
   match in_ with
@@ -614,8 +610,7 @@ let%debug5_sexp rec unify_dim ~stage (eq : dim * dim) (env : environment) :
       (* FIXME: For now, we can only unify identical affine expressions *)
       if equal_dim dim1 dim2 then ([], env)
       else raise @@ Shape_error ("Cannot unify different affine dimensions", [ Dim_mismatch [ dim1; dim2 ] ])
-  | Affine _, _ | _, Affine _ ->
-      (* Cannot unify affine with non-affine dimensions *)
+  | Affine _, Dim _ | Dim _, Affine _ ->
       raise @@ Shape_error ("Cannot unify affine dimension with non-affine", [ Dim_mismatch [ dim1; dim2 ] ])
   | Var v, dim2 | dim2, Var v ->
       let ineqs = ref [] in
diff --git a/lib/tensor.ml b/lib/tensor.ml