Commit 075bc6a

The previous change is_bijective -> is_surjective was a mistake, but we can do better altogether; a collaboration with Claude.

Claude: The problem is that when the surjectivity check was fixed, it now correctly identifies that this mapping is non-surjective (because we're writing to the same ho position multiple times as i12 varies from 0 to 3). But instead of switching to accumulation mode, it zero-initializes and then just does assignments, which means only the last value (when i12 = 3) survives.

Me: The problem is with lowering. The high-level representation shows accumulating assignment, with the syntax I invented: `ho =:+ r2x4x3`, but the unoptimized lowered representation doesn't expand it as expected; it should use two binary operators, the `accum` operator and the `op` operator.

Claude: Perfect! Now I found the bug. `is_total` at line 99 returns true when both:
1. `initialize_neutral` is true, and
2. `is_surjective` returns true.

The bug is that the code assumes surjective mappings don't need accumulation, but that's wrong. We need to check for bijectivity (a one-to-one mapping) to use plain assignment: non-bijective mappings need accumulation even if they are surjective. The fix should be to use `is_bijective` instead of `is_surjective` in the `is_total` function, or better yet, to check specifically whether multiple source positions map to the same target position (which requires checking injectivity).

Signed-off-by: Lukasz Stafiniak <lukstafi@gmail.com>
1 parent c76d1c5 commit 075bc6a
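To make the failure mode concrete, here is a minimal plain-OCaml sketch of the scenario described above (toy arrays, not OCANNL code; the names mirror the `ho =:+ r2x4x3` example restricted to a single batch). Each target position ho.(i) is written once per value of the reduced iterator o, so plain assignment keeps only the last write while accumulation sums all of them:

(* Toy model: r.(o).(i) projects onto ho.(i), so 4 source positions
   (o = 0..3) hit each target position. *)
let r = [| [| 0.; 1.; 2. |]; [| 3.; 4.; 5. |]; [| 6.; 7.; 8. |]; [| 9.; 10.; 11. |] |]

let () =
  let ho_assign = Array.make 3 0. and ho_accum = Array.make 3 0. in
  for o = 0 to 3 do
    for i = 0 to 2 do
      ho_assign.(i) <- r.(o).(i);                 (* plain  = : only the o = 3 write survives *)
      ho_accum.(i) <- ho_accum.(i) +. r.(o).(i)   (* accum += : sums over o *)
    done
  done;
  assert (ho_assign = [| 9.; 10.; 11. |]);
  assert (ho_accum = [| 18.; 22.; 26. |])

This is why the lowering must emit the accumulating form (the `accum` binary operator applied to the old LHS value and the `op` result) whenever the projection is non-injective.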

File tree: 7 files changed, +299 -47 lines changed


arrayjit/lib/assignments.ml

Lines changed: 15 additions & 2 deletions

@@ -98,6 +98,10 @@ let get_name_exn asgns =
 let is_total ~initialize_neutral ~projections =
   initialize_neutral && Indexing.is_surjective projections
 
+let can_skip_accumulation ~projections =
+  (* We can skip accumulation (use = instead of +=) only if the projection is injective *)
+  Indexing.is_injective projections
+
 (** Returns materialized nodes in the sense of {!Tnode.is_in_context_force}. NOTE: it must be called
     after compilation; otherwise, it will disrupt memory mode inference. *)
 let%debug3_sexp context_nodes ~(use_host_memory : 'a option) (asgns : t) : Tn.t_set =
@@ -226,7 +230,7 @@ let%track4_sexp to_low_level code =
         let lhs_ll = get (Node lhs) lhs_idcs in
         let rhses_ll = Array.mapi rhses_idcs ~f:(fun i rhs_idcs -> get rhses.(i) rhs_idcs) in
         let rhs2 = apply_op op rhses_ll in
-        if is_total ~initialize_neutral ~projections then set lhs lhs_idcs rhs2
+        if can_skip_accumulation ~projections then set lhs lhs_idcs rhs2
         else set lhs lhs_idcs @@ apply_op (Ops.Binop accum) [| lhs_ll; rhs2 |]
       in
 
@@ -243,7 +247,16 @@ let%track4_sexp to_low_level code =
       }
     in
     let for_loops = for_loop [] (Array.to_list projections.product_space) in
-    if initialize_neutral && not (is_total ~initialize_neutral ~projections) then
+    (* Need initialization if:
+       - initialize_neutral is true AND
+       - (not surjective OR not injective)
+       Not surjective: some positions never written (need init to avoid garbage)
+       Not injective: accumulation needed (need init for first += operation) *)
+    let needs_init =
+      initialize_neutral &&
+      not (Indexing.is_surjective projections && Indexing.is_injective projections)
+    in
+    if needs_init then
       let dims = lazy projections.lhs_dims in
       let fetch_op = Constant (Ops.neutral_elem accum) in
       Low_level.Seq (loop (Fetch { array = lhs; fetch_op; dims }), for_loops)
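Read together, `can_skip_accumulation` and `needs_init` induce a simple decision table for the lowering. A paraphrase of the patched logic (a sketch assuming `initialize_neutral` is true, with bare booleans standing in for the actual projection analysis):

(* Returns (needs_init, plain_assign) for the two projection properties. *)
let lowering_mode ~surjective ~injective =
  (not (surjective && injective), injective)

let () =
  (* Bijective, e.g. a transpose: no init, plain =. *)
  assert (lowering_mode ~surjective:true ~injective:true = (false, true));
  (* Surjective but not injective, e.g. a reduction: init to neutral, then +=. *)
  assert (lowering_mode ~surjective:true ~injective:false = (true, false));
  (* Injective but not surjective, e.g. a diagonal or fixed-index write: init, then plain =. *)
  assert (lowering_mode ~surjective:false ~injective:true = (true, true))

These three cases correspond to the transpose, reduction, and diagonal/sparse scenarios exercised by the new test below.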

arrayjit/lib/indexing.ml

Lines changed: 50 additions & 41 deletions

@@ -150,66 +150,61 @@ let opt_symbol d = if iterated d then Some (get_symbol ()) else None
 let opt_iterator = function None -> Fixed_idx 0 | Some sym -> Iterator sym
 
 let is_surjective proj =
-  (* For surjectivity, we check if all target (LHS) positions will be written to.
-     This is used to determine if we need to zero-initialize before assignment. *)
-
+  (* For surjectivity, we check if all target (LHS) positions will be written to. This is used to
+     determine if we need to zero-initialize before assignment. *)
+
   (* Check if there are any fixed indices (except Fixed_idx 0 when dim is 1) *)
   let has_non_trivial_fixed =
     Array.exists2_exn proj.project_lhs proj.lhs_dims ~f:(fun idx dim ->
         match idx with
-        | Fixed_idx i -> not (i = 0 && dim <= 1)  (* Fixed_idx 0 is OK only when dim is 0 or 1 *)
+        | Fixed_idx i -> not (i = 0 && dim <= 1) (* Fixed_idx 0 is OK only when dim is 0 or 1 *)
        | _ -> false)
   in
   if has_non_trivial_fixed then false
   else
     (* Collect symbols used in LHS *)
-    let lhs_symbols, has_affine, has_sub_axis =
-      Array.fold proj.project_lhs ~init:([], false, false)
-        ~f:(fun (syms, has_aff, has_sub) idx ->
+    let lhs_symbols, has_affine, has_sub_axis =
+      Array.fold proj.project_lhs ~init:([], false, false) ~f:(fun (syms, has_aff, has_sub) idx ->
          match idx with
          | Iterator s -> (s :: syms, has_aff, has_sub)
          | Fixed_idx _ -> (syms, has_aff, has_sub)
          | Affine { symbols; _ } ->
-              let coeff1_syms =
-                List.filter_map symbols ~f:(fun (coeff, s) ->
-                    if coeff = 1 then Some s else None)
+              let coeff1_syms =
+                List.filter_map symbols ~f:(fun (coeff, s) -> if coeff = 1 then Some s else None)
              in
              (coeff1_syms @ syms, true, has_sub)
          | Sub_axis -> (syms, has_aff, true))
    in
    let lhs_symbol_set = Set.of_list (module Symbol) lhs_symbols in
    let product_symbol_set = Set.of_array (module Symbol) proj.product_iterators in
-
+
    (* All lhs symbols must be from product iterators (no bound symbols) *)
    if not (Set.is_subset lhs_symbol_set ~of_:product_symbol_set) then false
-    else if has_sub_axis then
-      (* Conservative: Sub_axis case is complex, so assume non-surjective.
-         This is pessimistic but safe - Sub_axis would require comparing
-         lhs_dims and product_space dimensions carefully. *)
+    else if has_sub_axis then
+      (* Conservative: Sub_axis case is complex, so assume non-surjective. This is pessimistic but
+         safe - Sub_axis would require comparing lhs_dims and product_space dimensions carefully. *)
      false
    else if has_affine then
-      (* For Affine indices with strides: check coefficient compatibility.
-         A strided access pattern may skip elements. *)
-      let symbol_dims =
+      (* For Affine indices with strides: check coefficient compatibility. A strided access pattern
+         may skip elements. *)
+      let symbol_dims =
        Array.filter_mapi proj.product_iterators ~f:(fun i sym ->
-            if Set.mem lhs_symbol_set sym then Some (sym, proj.product_space.(i))
-            else None)
+            if Set.mem lhs_symbol_set sym then Some (sym, proj.product_space.(i)) else None)
        |> Array.to_list
        |> Map.of_alist_exn (module Symbol)
      in
      let check_affine_surjective =
        Array.for_all proj.project_lhs ~f:(function
          | Affine { symbols; _ } ->
              (* Find max dimension of coeff=1 symbols *)
-              let max_coeff1_dim =
+              let max_coeff1_dim =
                List.filter_map symbols ~f:(fun (coeff, s) ->
-                    if coeff = 1 then Map.find symbol_dims s else None)
+                    if coeff = 1 then Map.find symbol_dims s else None)
                |> List.max_elt ~compare:Int.compare
                |> Option.value ~default:Int.max_value
              in
              (* Check that coeff=1 dimension is not smaller than any stride *)
-              List.for_all symbols ~f:(fun (coeff, _) ->
-                  coeff = 1 || max_coeff1_dim >= coeff)
+              List.for_all symbols ~f:(fun (coeff, _) -> coeff = 1 || max_coeff1_dim >= coeff)
          | _ -> true)
      in
      if not check_affine_surjective then false
@@ -220,24 +215,38 @@ let is_surjective proj =
       (* Simple case: only Iterator and Fixed_idx *)
       (* Need enough unique symbols to cover all dimensions *)
       Set.length lhs_symbol_set >= Array.length proj.project_lhs
-
-(* For backwards compatibility, keep is_bijective as an alias that checks
-   both surjectivity and injectivity (stricter than just surjectivity) *)
-let is_bijective proj =
-  is_surjective proj &&
-  let lhs_symbols =
-    Array.concat_map proj.project_lhs ~f:(function
-      | Iterator s -> [| s |]
-      | Fixed_idx _ -> [||]
-      | Affine { symbols; _ } ->
-          List.filter_map symbols ~f:(fun (coeff, s) ->
-              if coeff = 1 then Some s else None)
-          |> Array.of_list
-      | Sub_axis -> [||])
-    |> Set.of_array (module Symbol)
+
+let is_injective proj =
+  let product_iterator_set = Set.of_array (module Symbol) proj.product_iterators in
+
+  (* Check each LHS index for injectivity *)
+  let lhs_symbols, is_injective_mapping =
+    Array.fold proj.project_lhs ~init:([], true) ~f:(fun (syms, still_injective) idx ->
+        if not still_injective then (syms, false)
+        else
+          match idx with
+          | Iterator s -> (s :: syms, true)
+          | Fixed_idx _ -> (syms, true)
+          | Affine { symbols; _ } ->
+              (* Filter for symbols that are product iterators *)
+              let product_symbols =
+                List.filter symbols ~f:(fun (_coeff, s) ->
+                    Set.mem product_iterator_set s)
+              in
+              (* If more than one product iterator in this Affine index, not injective *)
+              if List.length product_symbols > 1 then
+                (syms, false)
+              else
+                (* (coefficients don't matter for injectivity) *)
+                (List.map product_symbols ~f:snd @ syms, true)
+          | Sub_axis -> (syms, true))
   in
-  (* For bijectivity, also need exact match of symbols *)
-  Set.equal lhs_symbols (Set.of_array (module Symbol) proj.product_iterators)
+
+  if not is_injective_mapping then false
+  else
+    let lhs_symbol_set = Set.of_list (module Symbol) lhs_symbols in
+    (* For injectivity, each product iterator must map to at most one position *)
+    Set.is_subset (Set.of_array (module Symbol) proj.product_iterators) ~of_:lhs_symbol_set
 
 (** Projections for a pointwise unary operator. Provide only one of [debug_info] or [derived_for].
 *)
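For intuition, here is a self-contained toy model of the criterion implemented above (hypothetical mini-types and a string-keyed set standing in for the library's actual projections record and Symbol module, so this is a sketch, not the real API):

(* Toy index type mirroring the constructors matched above. *)
type idx = Iterator of string | Fixed_idx of int | Affine of (int * string) list | Sub_axis

module SS = Set.Make (String)

let toy_is_injective ~product_iterators ~project_lhs =
  let iters = SS.of_list product_iterators in
  let exception Not_inj in
  try
    let lhs_syms =
      List.fold_left
        (fun acc -> function
          | Iterator s -> SS.add s acc
          | Fixed_idx _ | Sub_axis -> acc
          | Affine symbols -> (
              match List.filter (fun (_coeff, s) -> SS.mem s iters) symbols with
              | [] -> acc
              | [ (_coeff, s) ] -> SS.add s acc
              | _ -> raise Not_inj (* two product iterators fused into one index *)))
        SS.empty project_lhs
    in
    (* Every product iterator must show up somewhere on the LHS. *)
    SS.subset iters lhs_syms
  with Not_inj -> false

let () =
  (* ij=>ji transpose: injective. *)
  assert (toy_is_injective ~product_iterators:[ "i"; "j" ] ~project_lhs:[ Iterator "j"; Iterator "i" ]);
  (* Strided write with a single iterator per index: still injective. *)
  assert (toy_is_injective ~product_iterators:[ "i" ] ~project_lhs:[ Affine [ (2, "i") ]; Fixed_idx 0 ]);
  (* b|i->o => b|i reduction: 'o' never reaches the LHS, so not injective. *)
  assert (not (toy_is_injective ~product_iterators:[ "b"; "i"; "o" ] ~project_lhs:[ Iterator "b"; Iterator "i" ]))

The key property mirrored here: a reduced iterator that never reaches the LHS (like `o` in `b|i->o => b|i`) makes the mapping non-injective, which is what forces accumulation in assignments.ml.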

test/einsum/dune

Lines changed: 8 additions & 0 deletions

@@ -41,6 +41,14 @@
  (preprocess
   (pps ppx_here ppx_ocannl)))
 
+(test
+ (name test_accumulation_semantics)
+ (deps ocannl_config)
+ (modules test_accumulation_semantics)
+ (libraries ocannl)
+ (preprocess
+  (pps ppx_here ppx_ocannl)))
+
 (library
  (name einsum_tutorials)
  (package neural_nets_lib)
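Assuming the repository's standard dune workflow, the new expect test should run with `dune runtest test/einsum` (or plain `dune test`), and `dune promote` should update the corresponding `.expected` file after an intentional change to the output.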

test/einsum/moons_demo_variant.expected

Lines changed: 2 additions & 2 deletions

@@ -24,7 +24,7 @@ n19 w2 as w2: Host&stream/412410; single prec 1x16; mem in bytes: <not-in-yet>
 n20 grad_w2 as w2.grad: Local/26046; single prec 1x16; mem in bytes: <not-in-yet>
 n21 @|_moons_input as moons_input: Virt/15; single prec 10x2; mem in bytes: <not-in-yet>
 n24 @|_moons_class as moons_class: Virt/15; single prec 10x1; mem in bytes: <not-in-yet>
-n27 * as n27: Virt/151; single prec 10x16; mem in bytes: <not-in-yet>
+n27 * as n27: Virt/15; single prec 10x16; mem in bytes: <not-in-yet>
 n28 grad_* as n27.grad: Local/1046; single prec 10x16; mem in bytes: <not-in-yet>
 n29 + as n29: Virt/15; single prec 10x16; mem in bytes: <not-in-yet>
 n30 grad_+ as n29.grad: Virt/151; single prec 10x16; mem in bytes: <not-in-yet>
@@ -65,7 +65,7 @@ n66 sgd_momentum_w2 as sgd_momentum_w2: unknown; single prec <not-in-yet>; mem i
 n67 0.0001 as n67: Virt/40; single prec 1; mem in bytes: <not-in-yet>
 n68 *. as n68: Virt/15; single prec 1x16; mem in bytes: <not-in-yet>
 n69 point_mlp_result as point_mlp_result: Host&shared/37039; single prec 2; mem in bytes: <not-in-yet>
-n70 * as n70: Virt/151; single prec 16; mem in bytes: <not-in-yet>
+n70 * as n70: Virt/15; single prec 16; mem in bytes: <not-in-yet>
 n71 grad_* as n70.grad: unknown; single prec 16; mem in bytes: <not-in-yet>
 n72 + as n72: Virt/15; single prec 16; mem in bytes: <not-in-yet>
 n73 grad_+ as n72.grad: unknown; single prec 16; mem in bytes: <not-in-yet>

test/einsum/surjectivity.ml

Lines changed: 2 additions & 2 deletions

@@ -66,7 +66,7 @@ let%expect_test "sparse_assignment_with_fixed_indices" =
 
   Train.printf ~here:[%here] ~with_code:false ~with_grad:false sparse;
   [%expect {|
-    HERE: test/einsum/surjectivity.ml:64:21
+    HERE: test/einsum/surjectivity.ml:67:21
     ┌─────────────────────────────────┐
     │[1]: =>_sparse shape 0:5,1:1,2:1
     │┌──────┬──────┐ │
@@ -111,7 +111,7 @@ let%expect_test "multiple_sparse_axes" =
 
   Train.printf ~here:[%here] ~with_code:false ~with_grad:false sparse_multi;
   [%expect {|
-    HERE: test/einsum/surjectivity.ml:113:21
+    HERE: test/einsum/surjectivity.ml:112:21
     ┌───────────────────────────────────────────┐
     │[1]: =>_sparse_multi shape 0:3,1:2,2:4,3:3
     │┌──────┬──────────────────┐ │
Lines changed: 101 additions & 0 deletions

@@ -0,0 +1,101 @@
+Retrieving commandline, environment, or config file variable ocannl_log_level
+Found 0, in the config file
+
+=== Testing einsum reduction (surjective but not injective) ===
+Input tensor (shape: batch=2, input=3, output=4):
+HERE: test/einsum/test_accumulation_semantics.ml:32:21
+┌─────────────────────────────────────────────────────────────┐
+│[0]: r2x4x3 shape 0:2|2:3->1:4 │
+│┌──────┬────────────────────────┬───────────────────────────┐│
+││ │0 @ 0 │1 @ 0 ││
+││ │axis 2 │axis 2 ││
+│├──────┼────────────────────────┼───────────────────────────┤│
+││axis 1│ 0.00 1.00 2.00 │ 1.20e+1 1.30e+1 1.40e+1 ││
+││ │ 3.00 4.00 5.00 │ 1.50e+1 1.60e+1 1.70e+1 ││
+││ │ 6.00 7.00 8.00 │ 1.80e+1 1.90e+1 2.00e+1 ││
+││ │ 9.00 1.00e+1 1.10e+1 │ 2.10e+1 2.20e+1 2.30e+1 ││
+│└──────┴────────────────────────┴───────────────────────────┘│
+└─────────────────────────────────────────────────────────────┘
+
+Result after reduction 'b|i->o => b|i' (should sum over output dimension):
+HERE: test/einsum/test_accumulation_semantics.ml:35:21
+┌────────────────────────────────────┐
+│[1]: =>_result shape 0:2|1:3 │
+│┌──────┬───────────────────────────┐│
+││ │axis 1 ││
+│├──────┼───────────────────────────┤│
+││axis 0│ 1.80e+1 2.20e+1 2.60e+1 ││
+││ │ 6.60e+1 7.00e+1 7.40e+1 ││
+│└──────┴───────────────────────────┘│
+└────────────────────────────────────┘
+
+Expected values (summing over output dimension):
+Batch 0: [0+3+6+9=18, 1+4+7+10=22, 2+5+8+11=26]
+Batch 1: [12+15+18+21=66, 13+16+19+22=70, 14+17+20+23=74]
+
+=== Testing diagonal tensor (not surjective) ===
+Input (1D tensor of size 5):
+HERE: test/einsum/test_accumulation_semantics.ml:59:21
+[0]: 0...5 shape 0:6 <not-hosted>
+
+Diagonal tensor 'i=>ii' (5x5 with zeros off-diagonal):
+HERE: test/einsum/test_accumulation_semantics.ml:62:21
+┌──────────────────────────────────────┐
+│[1]: =>_diagonal shape 0:6,1:6 │
+│┌──────┬─────────────────────────────┐│
+││ │axis 1 ││
+│├──────┼─────────────────────────────┤│
+││axis 0│ 0.00 0.00 ... 0.00 0.00 ││
+││ │ 0.00 1.00 ... 0.00 0.00 ││
+││ │ ... ... ... ... ... ││
+││ │ 0.00 0.00 ... 4.00 0.00 ││
+││ │ 0.00 0.00 ... 0.00 5.00 ││
+│└──────┴─────────────────────────────┘│
+└──────────────────────────────────────┘
+
+Note: Off-diagonal elements should be zero (initialized by Zero_out)
+
+=== Testing fixed index projection (not surjective) ===
+Input (1D tensor of size 4):
+HERE: test/einsum/test_accumulation_semantics.ml:84:21
+[0]: 0...4 shape 0:5 <not-hosted>
+
+Sparse tensor 'i=>i0' (only first column populated):
+HERE: test/einsum/test_accumulation_semantics.ml:87:21
+┌─────────────────────────────┐
+│[1]: =>_sparse shape 0:5,1:3 │
+│┌──────┬──────────────────┐ │
+││ │axis 1 │ │
+│├──────┼──────────────────┤ │
+││axis 0│ 0.00 0.00 0.00 │ │
+││ │ 1.00 0.00 0.00 │ │
+││ │ 2.00 0.00 0.00 │ │
+││ │ 3.00 0.00 0.00 │ │
+││ │ 4.00 0.00 0.00 │ │
+│└──────┴──────────────────┘ │
+└─────────────────────────────┘
+
+Note: Only column 0 should have values, others should be zero
+
+=== Testing bijective transpose (optimization case) ===
+Input (3x4 matrix):
+HERE: test/einsum/test_accumulation_semantics.ml:109:21
+[0]: r3x4 shape 0:3,1:4 <not-hosted>
+
+Transposed 'ij=>ji' (4x3 matrix):
+HERE: test/einsum/test_accumulation_semantics.ml:112:21
+┌─────────────────────────────────┐
+│[1]: =>_transposed shape 0:4,1:3 │
+│┌──────┬─────────────────────┐ │
+││ │axis 1 │ │
+│├──────┼─────────────────────┤ │
+││axis 0│ 0.00 4.00 8.00 │ │
+││ │ 1.00 5.00 9.00 │ │
+││ │ 2.00 6.00 1.00e+1 │ │
+││ │ 3.00 7.00 1.10e+1 │ │
+│└──────┴─────────────────────┘ │
+└─────────────────────────────────┘
+
+Note: Simple bijective mapping - no initialization or accumulation needed
+
+=== All accumulation semantics tests completed ===
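As a quick cross-check of the reduction figures above (plain OCaml, independent of OCANNL; the expression b*12 + o*3 + i reproduces the printed r2x4x3 layout):

let () =
  for b = 0 to 1 do
    Printf.printf "Batch %d: [" b;
    for i = 0 to 2 do
      (* Sum over the reduced axis o, as in 'b|i->o => b|i'. *)
      let sum = ref 0 in
      for o = 0 to 3 do
        sum := !sum + ((b * 12) + (o * 3) + i)
      done;
      Printf.printf " %d" !sum
    done;
    print_endline " ]"
  done
(* Prints: Batch 0: [ 18 22 26 ] and Batch 1: [ 66 70 74 ],
   matching the =>_result tensor in the expected output. *)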
