Skip to content

Commit f2e0756

Browse files
committed
Yay, a better design for convolution shape and projection inference
TODO: don't pass all pre-existing paddings as resolved_padding; some can still be updated (depending on whether the tensor node's fields have been forced).
1 parent 50d15cf commit f2e0756

File tree

6 files changed

+268
-415
lines changed

6 files changed

+268
-415
lines changed

lib/row.ml

Lines changed: 204 additions & 367 deletions
Large diffs are not rendered by default.

lib/row.mli

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
open Base
44

5+
type axis_padding = Ir.Ndarray.axis_padding [@@deriving equal, sexp]
56
type kind = [ `Batch | `Input | `Output ] [@@deriving equal, compare, sexp, hash, variants]
67
type dim_var [@@deriving equal, hash, compare, sexp]
78
type proj_id [@@deriving equal, hash, compare, sexp]
@@ -26,15 +27,9 @@ type solved_dim = { d : int; label : string option; proj_id : proj_id option }
2627
type dim =
2728
| Var of dim_var
2829
| Dim of solved_dim
29-
| Conv_input of {
30-
stride : int;
31-
output : dim;
32-
solved_kernel : solved_dim option;
33-
unsolved_kernel : (int * dim_var) list;
34-
}
35-
(** The offset is implicit, automatically derived. Most frequent use case: convolutions. If
36-
[!use_padding] is [true], the offset is the dimensionality-preserving left padding,
37-
otherwise it is 0. *)
30+
| Conv_input of { stride : int; output : dim; dilation : int; kernel : dim }
31+
(** The offset is implicit, automatically derived. If [!use_padding] is [true], the offset is
32+
the left part of the dimensionality-preserving symmetric padding; otherwise it is 0. *)
3833
[@@deriving equal, hash, compare, sexp, variants]
3934

4035
val get_dim : d:int -> ?label:string -> unit -> dim
@@ -151,7 +146,7 @@ type proj_equation =
151146
val get_proj_equations :
152147
constraint_ list -> Ir.Indexing.axis_index dim_map -> environment -> proj_equation list
153148

154-
val solve_proj_equations : proj_equation list -> proj_env
149+
val solve_proj_equations : proj_equation list -> resolved_padding:(proj_id, axis_padding) List.Assoc.t -> proj_env
155150
val get_proj_index : proj_env -> proj -> Ir.Indexing.axis_index
156151
val get_dim_index : proj_env -> dim -> Ir.Indexing.axis_index
157152
val get_product_proj : proj_env -> dim -> (proj_id * int) option

lib/shape.ml

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,15 @@ type parsed_axis_labels = {
6060

6161
let axis_labels parsed = parsed.labels
6262

63+
type padding = Row.axis_padding array option [@@deriving sexp, equal]
64+
6365
type t = {
6466
mutable batch : Row.t;
6567
mutable input : Row.t;
6668
mutable output : Row.t;
69+
mutable batch_padding : padding;
70+
mutable input_padding : padding;
71+
mutable output_padding : padding;
6772
id : int; (** A node that has the same shape as this shape. *)
6873
debug_name : string;
6974
}
@@ -676,13 +681,24 @@ let () =
676681
(** *** Projection inference *** *)
677682

678683
let fresh_proj_ids update =
684+
let resolved_padding = ref [] in
685+
let fetch_padding row row_padding =
686+
Option.iter row_padding ~f:(fun padding ->
687+
Array.iter2_exn (Array.of_list row.Row.dims) padding ~f:(fun d p ->
688+
match d with
689+
| Row.Dim { proj_id = Some proj_id; _ } -> resolved_padding := (proj_id, p) :: !resolved_padding
690+
| _ -> ()))
691+
in
679692
let fresh_shape (sh : t) =
680693
sh.batch <- Row.fresh_row_proj sh.batch;
681694
sh.input <- Row.fresh_row_proj sh.input;
682-
sh.output <- Row.fresh_row_proj sh.output
695+
sh.output <- Row.fresh_row_proj sh.output;
696+
fetch_padding sh.batch sh.batch_padding;
697+
fetch_padding sh.input sh.input_padding;
698+
fetch_padding sh.output sh.output_padding
683699
in
684700
fresh_shape update.shape;
685-
match update.logic with
701+
(match update.logic with
686702
| Terminal _ -> ()
687703
| Transpose (_, sh) -> fresh_shape sh
688704
| Broadcast (_, sh1, sh2) ->
@@ -691,13 +707,14 @@ let fresh_proj_ids update =
691707
| Broadcast_tern (_, sh1, sh2, sh3) ->
692708
fresh_shape sh1;
693709
fresh_shape sh2;
694-
fresh_shape sh3
710+
fresh_shape sh3);
711+
!resolved_padding
695712

696713
(** Computes the indexing into subtensors given the shape information of a tensor.
697714
[derive_projections] should only be invoked when the shapes are fully inferred already! *)
698715
let derive_projections (update_step : update_step) : Idx.projections =
699716
finish_inference ();
700-
fresh_proj_ids update_step;
717+
let resolved_padding = fresh_proj_ids update_step in
701718
let _debug_update_step : update_step = update_step in
702719
let (proj_axis_env, ineqs) : proj_axis_env * Row.constraint_ list =
703720
get_inequalities update_step
@@ -717,7 +734,7 @@ let derive_projections (update_step : update_step) : Idx.projections =
717734
(* Important: ineqs must not be substituted / solved before getting proj_equations, because
718735
get_inequalities provides indexing information that is lost after substitution. *)
719736
let proj_eqs : Row.proj_equation list = Row.get_proj_equations ineqs proj_axis_env local_env in
720-
let proj_env : Row.proj_env = Row.solve_proj_equations proj_eqs in
737+
let proj_env : Row.proj_env = Row.solve_proj_equations ~resolved_padding proj_eqs in
721738
let dims_of (sh : t) = sh.batch.dims @ sh.output.dims @ sh.input.dims in
722739
let lhs = update_step.shape in
723740
let rhs =
@@ -809,7 +826,18 @@ let make ?batch_dims ?input_dims ?output_dims ?batch_axes ?input_axes ?output_ax
809826
| None, None -> make_unknown `Output
810827
| Some _, Some _ -> invalid_arg "Shape.make: do not provide both output_dims, output_axes"
811828
in
812-
let result = { input; output; batch; id; debug_name } in
829+
let result =
830+
{
831+
input;
832+
output;
833+
batch;
834+
id;
835+
debug_name;
836+
batch_padding = None;
837+
input_padding = None;
838+
output_padding = None;
839+
}
840+
in
813841
(match deduced with
814842
| Not_constrained -> ()
815843
| Input_equals_output -> (
@@ -841,7 +869,18 @@ let shape_spec_to_dims_bio labels =
841869

842870
let of_spec ?(deduced = Not_constrained) ~debug_name ~id spec =
843871
let batch, input, output = shape_spec_to_dims_bio ~sh_id:id @@ axis_labels_of_spec spec in
844-
let result = { input; output; batch; id; debug_name } in
872+
let result =
873+
{
874+
input;
875+
output;
876+
batch;
877+
id;
878+
debug_name;
879+
batch_padding = None;
880+
input_padding = None;
881+
output_padding = None;
882+
}
883+
in
845884
(match deduced with
846885
| Not_constrained -> ()
847886
| Input_equals_output -> (

lib/shape.mli

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747

4848
open Base
4949

50-
type padding = Ir.Ndarray.axis_padding array option
50+
type padding = Row.axis_padding array option [@@deriving sexp, equal]
5151

5252
type t = {
5353
mutable batch : Row.t;
@@ -113,10 +113,7 @@ val make :
113113
that these are dimensions labels and not axis labels: they need not be unique for a row, are
114114
inferred when provided, and must match whenever the axis sizes must match. *)
115115

116-
val to_string_hum :
117-
?style:Row.print_style ->
118-
t ->
119-
string
116+
val to_string_hum : ?style:Row.print_style -> t -> string
120117

121118
val unsafe_reinitialize : unit -> unit
122119
(** Bring global state to its initialization values. This invalidates any unfinished inference. *)

lib/shape_inference.md

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,15 @@ A tensor shape in OCANNL is composed of three rows of axes: batch, input and out
1313
A row is a sequence of axes of a single kind: batch, input, or output. The shape type incorporates information relevant to inference, in particular shape variables: both for individual axes (`dim` variables), and for extending a row with more axes (`row` variables). Currently, all rows are (independently) broadcastable: each can be broadcasted to a larger number of axes. Moreover, in OCANNL broadcasting can happen "in the middle" of a row: both the given trailing axes and the given leading axes stay fixed, and new axes are inserted between them.
1414

1515
```ocaml
16-
type solved_dim = {
17-
d : int;
18-
label : string option;
19-
proj_id : proj_id option;
20-
}
16+
type solved_dim = { d : int; label : string option; proj_id : proj_id option }
2117
(** A single axis in a shape. *)
2218
2319
type dim =
2420
| Var of dim_var
2521
| Dim of solved_dim
26-
| Conv_input of {
27-
stride : int;
28-
output : dim;
29-
solved_kernel : solved_dim option;
30-
unsolved_kernel : (int * dim_var) list;
31-
}
32-
(** Represents convolution-style input dimensions where the output dimension
33-
relates to the input dimension through: input = stride * output + kernel_terms.
34-
This is a generalization of convolutions that supports affine indexing patterns.
35-
The offset is implicit and depends on the global setting use_padding. *)
22+
| Conv_input of { stride : int; output : dim; dilation : int; kernel : dim }
23+
(** The offset is implicit, automatically derived. If [!use_padding] is [true], the offset is
24+
the left part of the dimensionality-preserving symmetric padding; otherwise it is 0. *)
3625
3726
type bcast =
3827
| Row_var of row_var (** The row can be inferred to have more axes. *)
@@ -214,7 +203,7 @@ The projection inference functions.
214203

215204
### Convolutions
216205

217-
There is an important and intentional disconnect between `dims` in the `arrayjit` part of the project: tensor nodes, `Ndarray` buffers, code generation: they include padding in the dimension sizes -- and on the other hand shape types, shape inference and tensors exclude padding from the dimension sizes. There is a tension: once the delayed computations of padding, projections and dims (dimension sizes) are forced for a particular node, the padding can no longer be updated (the underlying `Ndarray` buffer might already be created). Since during inference we update the padding incrementally without variables standing in for insufficient information, this unfortunately causes observability of the during-inference and post-inference distinction for the padding of a tensor node.
206+
There is an important and intentional difference in how `dims` treat padding. In the `arrayjit` part of the project — tensor nodes, `Ndarray` buffers, code generation — dimension sizes include padding; whereas shape types, shape inference and tensors exclude padding from dimension sizes. There is a tension: once the delayed computations of padding, projections and dims (dimension sizes) are forced for a particular node, the padding can no longer be updated (the underlying `Ndarray` buffer might already be created). Since during inference we update the padding incrementally, without variables standing in for insufficient information, this unfortunately makes the during-inference vs. post-inference distinction observable for the padding of a tensor node.
218207

219208
## Deriving the constraints
220209

test/test_print_style.expected

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,28 +7,24 @@ Testing print_style functionality:
77
=== Testing solved_dim_to_string ===
88
Full attributes (d=28, padding=2, label=height, proj_id):
99
Only_labels: height
10-
Axis_size: height=28+2
11-
Axis_number_and_size: height=28+2
12-
Projection_and_size: height=28+2
10+
Axis_size: height=28
11+
Axis_number_and_size: height=28
12+
Projection_and_size: height=28
1313

1414
Minimal attributes (d=64, no padding, no label, no proj_id):
1515
Only_labels: _
1616
Axis_size: 64
1717
Projection_and_size: 64
1818

19-
With padding only (d=32, padding=3, label=width, no proj_id):
20-
Axis_size: width=32+3
21-
Projection_and_size: width=32+3
22-
2319
With projection (d=32, label=width, proj_id):
2420
Axis_size: width=32
25-
Projection_and_size: width=32[p1]
21+
Projection_and_size: width=32p1
2622

2723
=== Testing dim_to_string ===
2824
Solved dimensions:
2925
Only_labels (full): height
30-
Axis_size (full): height=28+2
31-
Projection_and_size (full): height=28+2
26+
Axis_size (full): height=28
27+
Projection_and_size (full): height=28
3228
Only_labels (minimal): _
3329
Axis_size (minimal): 64
3430

0 commit comments

Comments
 (0)