ahrefs
diff --git a/‎CLAUDE.md‎
Lines changed: 4 additions & 2 deletions b/‎CLAUDE.md‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎arrayjit/lib/assignments.ml‎
Lines changed: 0 additions & 1 deletion b/‎arrayjit/lib/assignments.ml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎arrayjit/lib/c_syntax.ml‎
Lines changed: 20 additions & 4 deletions b/‎arrayjit/lib/c_syntax.ml‎
Lines changed: 20 additions & 4 deletions
diff --git a/‎arrayjit/lib/metal_backend.ml‎
Lines changed: 2 additions & 0 deletions b/‎arrayjit/lib/metal_backend.ml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎arrayjit/lib/ops.ml‎
Lines changed: 11 additions & 0 deletions b/‎arrayjit/lib/ops.ml‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎lib/operation.ml‎
Lines changed: 48 additions & 3 deletions b/‎lib/operation.ml‎
Lines changed: 48 additions & 3 deletions
diff --git a/‎lib/ppx_cd.ml‎
Lines changed: 1 addition & 1 deletion b/‎lib/ppx_cd.ml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/ppx_shared.ml‎
Lines changed: 2 additions & 0 deletions b/‎lib/ppx_shared.ml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎lib/tensor.mli‎
Lines changed: 3 additions & 1 deletion b/‎lib/tensor.mli‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎test/einsum/einsum_trivia.ml‎
Lines changed: 3 additions & 3 deletions b/‎test/einsum/einsum_trivia.ml‎
Lines changed: 3 additions & 3 deletions
@@ -77,8 +77,10 @@ opam install cudajit  # for CUDA backend
 
 ### Testing
 
-- Tests are implemented either as inline expectations using `ppx_expect`; or as cram-style tests where an `.ml` file is compiled, executed, and its output compared against an `.expected` file
-- Tutorial files in `test/` serve as both documentation and integration tests
+- Tests are implemented either as inline expectations using `ppx_expect`; or as cram-style tests using Dune's `test` stanza where an `.ml` file is compiled, executed, and its output compared against an `.expected` file
+- The two approaches are exclusive: a test using an `.expected` file target cannot also use `%expect` inline expectations
+- `.expected` tests are easier to debug; `%expect` tests should only be used when the outputs are illustrative
+- Tutorial files, i.e. `%expect` tests, in `test/` serve as both documentation and integration tests
 - Use `dune promote` to accept test output changes
 - **Test Placement Guidelines**:
   * Always add tests under one of the test subdirectories
 
@@ -284,7 +284,6 @@ let%track4_sexp to_low_level code =
           let rhs_idcs = Array.map projections.project_rhs.(0) ~f:subst_index in
           let open Low_level in
           let rhs_ll = get rhs rhs_idcs in
-          (* For now, we know the only vec_unop is Uint4x32_to_prec_uniform *)
           let length =
             match op with
             | Ops.Uint4x32_to_prec_uniform -> (
 
@@ -403,6 +403,7 @@ module C_syntax (B : C_syntax_config) = struct
         let ident_doc = string (get_ident tn) in
         let dims = Lazy.force tn.dims in
         let prec = Lazy.force tn.prec in
+        (* FIXME: this precision is hardcoded, bad, bad practice. *)
         let arg_prec = Ops.uint4x32 in
         let local_defs, arg_doc = pp_scalar arg_prec arg in
         let local_defs = pp_local_defs local_defs in
@@ -427,9 +428,15 @@ module C_syntax (B : C_syntax_config) = struct
                       (* For non-Fixed_idx (Iterator, etc), add i to the computed offset *)
                       pp_array_offset (idcs, dims) ^^ string (" + " ^ Int.to_string i)
                 in
-                ident_doc ^^ brackets offset_doc ^^ string " = " ^^ vec_var
-                ^^ string (".v[" ^ Int.to_string i ^ "]")
-                ^^ semi)
+                let value_doc =
+                  if length = 1 then
+                    (* When length=1, vec_typ_of_prec returns a scalar type, so no .v[] access *)
+                    vec_var
+                  else
+                    (* When length>1, access the vector element *)
+                    vec_var ^^ string (".v[" ^ Int.to_string i ^ "]")
+                in
+                ident_doc ^^ brackets offset_doc ^^ string " = " ^^ value_doc ^^ semi)
           in
           separate hardline elem_assigns
         in
@@ -574,7 +581,16 @@ module C_syntax (B : C_syntax_config) = struct
         let expr = group (B.binop_syntax prec op e1 e2) in
         (defs, expr)
     | Unop (op, v) ->
-        let defs, expr_v = pp_scalar prec v in
+        let arg_prec = 
+          match op with
+          | Ops.Uint4x32_to_prec_uniform1 -> 
+            (* The argument to Uint4x32_to_prec_uniform1 must be evaluated with uint4x32 precision,
+               regardless of the target precision. This handles the case where the operation is
+               inlined as part of a scalar expression. *)
+            Ops.uint4x32
+          | _ -> prec
+        in
+        let defs, expr_v = pp_scalar arg_prec v in
         let expr = group (B.unop_syntax prec op expr_v) in
         (defs, expr)
 
 
@@ -580,6 +580,8 @@ end) : Ir.Backend_impl.Lowered_backend = struct
       | Recip_sqrt, _ -> func_doc "rsqrt"
       | Tanh_approx, _ -> func_doc "tanh"
       | Not, _ -> fun v -> string "!" ^^ v
+      | Uint4x32_to_prec_uniform1, _ ->
+          fun v -> func_doc "uint4x32_to_prec_uniform1" v
     (* Logical not *)
 
     (* Keep vec_unop_syntax same as in pure C syntax. *)
 
@@ -350,6 +350,11 @@ type unop =
   | Neg
   | Tanh_approx
   | Not  (** 0. -> 1. | _ -> 0. *)
+  | Uint4x32_to_prec_uniform1
+      (** Non-vectorized variant of [Uint4x32_to_prec_uniform] that converts the given Uint4x32 to a
+          single value of the output precision. Less bit-efficient but operates pointwise. For random
+          bits, the result is uniform over the range of the precision for integer precisions, and
+          over the range \[0.0, 1.0) for floating point precisions. *)
 [@@deriving sexp, compare, equal]
 
 type vec_unop =
@@ -431,6 +436,8 @@ let interpret_unop op v =
   | Neg -> ~-.v
   | Tanh_approx -> tanh v
   | Not -> if v = 0. then 1. else 0.
+  | Uint4x32_to_prec_uniform1 ->
+      invalid_arg "Ops.interpret_unop: Uint4x32_to_prec_uniform1 argument outside the domain of float"
 
 let interpret_ternop op v1 v2 v3 =
   let open Float in
@@ -580,6 +587,7 @@ let unop_cd_syntax = function
   | Neg -> "neg"
   | Tanh_approx -> "tanh"
   | Not -> "not"
+  | Uint4x32_to_prec_uniform1 -> "uint4x32_to_prec_uniform1"
 
 let vec_unop_cd_syntax = function Uint4x32_to_prec_uniform -> "uint4x32_to_prec_uniform"
 
@@ -627,6 +635,9 @@ let unop_c_syntax prec op =
       invalid_arg "Ops.unop_c_syntax: Tanh_approx not supported for integer precisions"
   | Tanh_approx, _ -> ("tanhf(", ")")
   | Not, _ -> ("(", " == 0.0 ? 1.0 : 0.0)")
+  | Uint4x32_to_prec_uniform1, Uint4x32_prec _ ->
+      invalid_arg "Ops.vec_unop_c_syntax: Uint4x32_to_prec_uniform1 not supported for Uint4x32"
+  | Uint4x32_to_prec_uniform1, _ -> ("uint4x32_to_" ^ prec_string prec ^ "_uniform(", ")")
 
 let vec_unop_c_syntax prec op =
   match (op, prec) with
 
@@ -289,6 +289,19 @@ let uint4x32_to_prec_uniform ?grad_spec =
       ~op_asn ~grad_asn ?grad_spec (* Modifying the label would cause identifier pollution. *)
       ?label ~top_down_prec:true t1
 
+let uint4x32_to_prec_uniform1 ?grad_spec =
+  let module NTDSL = Initial_NTDSL in
+  let%cd op_asn ~v ~t1 ~projections = v =: uint4x32_to_prec_uniform1 v1 in
+  let%cd grad_asn ~t:_ ~g:_ ~t1:_ ~projections:_ = Asgns.empty_comp in
+  fun t1 ?label ?top_down_prec ->
+    (* Ignore what the caller says, since we must learn the precision from the outside. *)
+    ignore (top_down_prec : bool option);
+    Tn.update_prec t1.Tensor.value Ir.Ops.uint4x32;
+    Tensor.unop (* A placeholder that will be replaced by the actual precision by Tensor.op. *)
+      ~transpose_op:Pointwise_un ~op_asn ~grad_asn
+      ?grad_spec (* Modifying the label would cause identifier pollution. *)
+      ?label ~top_down_prec:true t1
+
 let lt ?(label = []) =
   let module NTDSL = Initial_NTDSL in
   let%cd op_asn ~v ~t1 ~t2 ~projections = v =: (v1 < v2) in
@@ -355,15 +368,16 @@ let where ?(label = []) ~grad_spec t1 t2 t3 =
   Tensor.ternop ~label:("where" :: label) ~ternary_op:Pointwise_tern ~op_asn ~grad_asn ~grad_spec t1
     t2 t3
 
+(** [range upto] is a 1D tensor of shape [upto], spanning the half-open interval \[0, upto). *)
 let range ?(label = []) ?(grad_spec = Tensor.Prohibit_grad) ?axis_label upto =
   let result =
     Tensor.term ~fetch_op:Range_over_offsets ~grad_spec ~batch_dims:[]
-      ~label:(("0" ^ "..." ^ Int.to_string upto) :: label)
+      ~label:(("0" ^ "..." ^ Int.to_string (upto - 1)) :: label)
       ~input_dims:[]
   in
   match axis_label with
-  | None -> result ~output_dims:[ upto + 1 ] ()
-  | Some l -> result ~output_axes:[ (l, upto + 1) ] ()
+  | None -> result ~output_dims:[ upto ] ()
+  | Some l -> result ~output_axes:[ (l, upto) ] ()
 
 let range_of_shape ?(label = []) ?(grad_spec = Tensor.Prohibit_grad) ?batch_dims ?input_dims
     ?output_dims ?batch_axes ?input_axes ?output_axes () =
@@ -433,6 +447,24 @@ let uniform_at ?grad_spec counter =
           ~label:[ "range_over_offsets" ] ())
        ())
 
+(** A wasteful variant of {!uniform} that produces a single value from each 4x32 random bits. *)
+let uniform1 ?grad_spec () =
+  uint4x32_to_prec_uniform1 ?grad_spec
+    (threefry4x32
+       (threefry4x32 (embed_self_id ()) (Tensor.get_random_seed ()) ())
+       (Tensor.term ~fetch_op:Range_over_offsets ~grad_spec:Prohibit_grad
+          ~label:[ "range_over_offsets" ] ())
+       ())
+
+(** A wasteful variant of {!uniform_at} that produces a single value from each 4x32 random bits. *)
+let uniform_at1 ?grad_spec counter =
+  uint4x32_to_prec_uniform1 ?grad_spec
+    (threefry4x32
+       (threefry4x32 (threefry4x32 (embed_self_id ()) (Tensor.get_random_seed ()) ()) counter ())
+       (Tensor.term ~fetch_op:Range_over_offsets ~grad_spec:Prohibit_grad
+          ~label:[ "range_over_offsets" ] ())
+       ())
+
 module DO = struct
   let ( * ) ?label t1 t2 = matmul ~grad_spec:If_needed ?label t1 t2 ()
   let ( *. ) ?label t1 t2 = pointmul ~grad_spec:If_needed ?label t1 t2 ()
@@ -442,6 +474,9 @@ module DO = struct
   let uint4x32_to_prec_uniform ?label t1 =
     uint4x32_to_prec_uniform ~grad_spec:If_needed t1 ?label ()
 
+  let uint4x32_to_prec_uniform1 ?label t1 =
+    uint4x32_to_prec_uniform1 ~grad_spec:If_needed t1 ?label ()
+
   let ( **. ) ?label base exp = pointpow ?label exp base ~grad_spec:If_needed ()
   let relu ?label t = relu ~grad_spec:If_needed ?label t ()
   let sat01 ?label t = sat01 ~grad_spec:If_needed ?label t ()
@@ -478,6 +513,8 @@ module DO = struct
   let ndarray = Tensor.ndarray ~grad_spec:If_needed
   let uniform ?label () = uniform ~grad_spec:Require_grad () ?label ()
   let uniform_at ?label counter = uniform_at ~grad_spec:Require_grad ?label counter ()
+  let uniform1 ?label () = uniform1 ~grad_spec:Require_grad () ?label ()
+  let uniform_at1 ?label counter = uniform_at1 ~grad_spec:Require_grad ?label counter ()
 end
 
 module NDO = struct
@@ -502,6 +539,9 @@ module NDO = struct
   let uint4x32_to_prec_uniform ?label t1 =
     uint4x32_to_prec_uniform ~grad_spec:Prohibit_grad ?label t1 ()
 
+  let uint4x32_to_prec_uniform1 ?label t1 =
+    uint4x32_to_prec_uniform1 ~grad_spec:Prohibit_grad ?label t1 ()
+
   let recip ?label t = recip ~grad_spec:Prohibit_grad ?label t ()
   let recip_sqrt ?label t = recip_sqrt ~grad_spec:Prohibit_grad ?label t ()
   let tanh ?label t = tanh ~grad_spec:Prohibit_grad ?label t ()
@@ -515,6 +555,8 @@ module NDO = struct
   let ndarray = Tensor.ndarray ~grad_spec:Prohibit_grad
   let uniform ?label () = uniform ~grad_spec:Prohibit_grad () ?label ()
   let uniform_at ?label counter = uniform_at ~grad_spec:Prohibit_grad ?label counter ()
+  let uniform1 ?label () = uniform1 ~grad_spec:Prohibit_grad () ?label ()
+  let uniform_at1 ?label counter = uniform_at1 ~grad_spec:Prohibit_grad ?label counter ()
 end
 
 (** The input [i] dimensions default to empty. The batch and output dimensions will be inferred if
@@ -555,6 +597,7 @@ module TDSL = struct
   let ndarray = Tensor.ndarray ~grad_spec:If_needed
   let threefry4x32 = threefry4x32 ~grad_spec:If_needed
   let uint4x32_to_prec_uniform = uint4x32_to_prec_uniform ~grad_spec:If_needed
+  let uint4x32_to_prec_uniform1 = uint4x32_to_prec_uniform1 ~grad_spec:If_needed
   let embed_self_id = embed_self_id
 
   (** The default initialization operation for {!param} calls. *)
@@ -615,6 +658,8 @@ module NTDSL = struct
   let embed_self_id = embed_self_id
   let uniform = uniform ~grad_spec:Prohibit_grad
   let uniform_at = uniform_at ~grad_spec:Prohibit_grad
+  let uniform1 = uniform1 ~grad_spec:Prohibit_grad
+  let uniform_at1 = uniform_at1 ~grad_spec:Prohibit_grad
 
   let counter ?(label = []) =
     let module NTDSL = Initial_NTDSL in
 
@@ -454,7 +454,7 @@ let translate ?ident_label (expr : expression) : result =
@@ Location.error_extensionf ~loc
                     "ppx_ocannl %%cd: expected a unary operator, one of: %s"
                     "id, relu, sat01, exp, log, exp2, log2, sin, cos, sqrt, recip, recip_sqrt, \
-                     neg, tanh" ))
+                     neg, tanh, uint4x32_to_prec_uniform1" ))
     in
     let vec_unary_op vec_un_op =
       loc
 
@@ -190,6 +190,8 @@ let unary_ops =
       ("neg", fun loc -> ([%expr Shape.Pointwise_un], [%expr Ir.Ops.Neg]));
       ("tanh", fun loc -> ([%expr Shape.Pointwise_un], [%expr Ir.Ops.Tanh_approx]));
       ("not", fun loc -> ([%expr Shape.Pointwise_un], [%expr Ir.Ops.Not]));
+      ( "uint4x32_to_prec_uniform1",
+        fun loc -> ([%expr Shape.Pointwise_un], [%expr Ir.Ops.Uint4x32_to_prec_uniform1]) );
     ]
 
 (** Vector unary primitive ops. *)
 
@@ -245,7 +245,9 @@ val set_random_seed : ?seed:int -> unit -> unit
 
 val get_random_seed : unit -> t
 (** Returns a tensor with the current random seed. Lazily initialized using {!set_random_seed} and
-    reset when {!unsafe_reinitialize} is called. *)
+    reset when {!unsafe_reinitialize} is called. IMPORTANT: all sites using the same global random
+    seed, e.g. using [get_random_seed ()] not separated by a call to {!unsafe_reinitialize}, must
+    descend from the first caller's optimization context. *)
 
 (** {2 Printing.} *)
 
 
@@ -961,13 +961,13 @@ let%expect_test "outer_sum simulating axis concatenation" =
        and type optimize_ctx = Backend.optimize_ctx)
   in
 
-  let ri = TDSL.range 3 in
+  let ri = TDSL.range 4 in
   let%op ti = ri ++ "i=>i0" in
   (* Write position 2 of ti, otherwise shape inference concludes it's dim-1 and broadcasted. *)
   let%cd _ = ti =: 0 ++ "i=>i2" in
-  let rj = TDSL.range 4 in
+  let rj = TDSL.range 5 in
   let%op tj = rj ++ "j=>j1" in
-  let rk = TDSL.range 5 in
+  let rk = TDSL.range 6 in
   let%op tk = rk ++ "k=>k2" in
   let positions = TDSL.outer_sum "ijl;kl=>ijkl" (TDSL.outer_sum "il;jl=>ijl" ti tj ()) tk () in
   Train.set_hosted tk.value;
Original file line number	Diff line number	Diff line change
`@@ -190,6 +190,8 @@ let unary_ops =`
`190`	`190`	`("neg", fun loc -> ([%expr Shape.Pointwise_un], [%expr Ir.Ops.Neg]));`
`191`	`191`	`("tanh", fun loc -> ([%expr Shape.Pointwise_un], [%expr Ir.Ops.Tanh_approx]));`
`192`	`192`	`("not", fun loc -> ([%expr Shape.Pointwise_un], [%expr Ir.Ops.Not]));`
	`193`	`+ ( "uint4x32_to_prec_uniform1",`
	`194`	`+ fun loc -> ([%expr Shape.Pointwise_un], [%expr Ir.Ops.Uint4x32_to_prec_uniform1]) );`
`193`	`195`	`]`
`194`	`196`
`195`	`197`	`(** Vector unary primitive ops. *)`