
Commit cc95235

Merge pull request #408 from ahrefs/claude/issue-396-20250924-1048
Implement shape errors for parameters with unspecified dimensions; note: known failing tests that I'll address soon
2 parents: 3dffac6 + 3eb6f61

23 files changed: +929 −224 lines

CLAUDE.md

Lines changed: 3 additions & 2 deletions
@@ -79,6 +79,7 @@ opam install cudajit # for CUDA backend
 - Row variables (`..d..`) enable flexible axis handling and broadcasting
 - Einsum notation supports convolutions, reductions, and arbitrary permutations
 - "Principle of least commitment": use row variables where axis count doesn't matter
+- Shape inference completion is forced by lowering: via `Context.compile`, or wrappers such as `Train.to_routine`, `Train.run_once` or `Train.forward_once`
 
 3. **Backend Architecture**: Unified interface supporting CPU (multicore), CUDA, and Metal backends
 
@@ -90,8 +91,8 @@ opam install cudajit # for CUDA backend
 
 - Tests are implemented either as inline expectations using `ppx_expect`, or as cram-style tests using Dune's `test` stanza, where an `.ml` file is compiled, executed, and its output compared against an `.expected` file
 - The two approaches are exclusive: a test using an `.expected` file target cannot also use `%expect` inline expectations
-- `.expected` tests are easier to debug, `%expect` tests should only be used when the outputs are illustrative
-- Tutorial files, i.e. `%expect` tests, in `test/` serve as both documentation and integration tests
+- `.expected` tests, i.e. those using the `test` stanza, are easier to debug; use them for testing new features
+- Tutorial files, i.e. `%expect` tests, in `test/` serve as both documentation and integration tests, and should only be used when the outputs are illustrative
 
 **Running Tests**:
 - `dune runtest` - runs all tests including inline tests and cram-style tests
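
For reference, a minimal sketch of the inline-expectation style mentioned in the hunk above (test name and output are illustrative; assumes `ppx_expect` and `stdio` as dependencies):

```ocaml
(* A minimal %expect test: captured stdout is compared against the
   [%expect] block when running `dune runtest`. *)
let%expect_test "illustrative output" =
  Stdio.printf "2 + 2 = %d\n" (2 + 2);
  [%expect {| 2 + 2 = 4 |}]
```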

arrayjit/lib/context.ml

Lines changed: 2 additions & 1 deletion
@@ -24,11 +24,12 @@ type backend_wrapper =
   -> backend_wrapper
 
 type t = {
-  backend_wrapper : backend_wrapper;
+  backend_wrapper : (backend_wrapper [@sexp.opaque]);
   device_id : int;
   backend_name : string;
   initialized_nodes : Set.M(Tn).t; (* Track which nodes have been initialized *)
 }
+[@@deriving sexp_of]
 
 type routine = {
   (* TODO: Remove commented out fields if they prove to be unnecessary *)

arrayjit/lib/context.mli

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 module Backends_deprecated = Backends
 
-type t
+type t [@@deriving sexp_of]
 (** Execution context managing device, compilation, and buffers *)
 
 type routine
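
Both hunks above use a standard `ppx_sexp_conv` idiom: `[@sexp.opaque]` prints a field lacking its own `sexp_of` as `<opaque>`, so the enclosing record can still derive `sexp_of`. A standalone sketch with hypothetical types (not OCANNL's actual definitions):

```ocaml
open Core

(* The handle type has no sexp_of; [@sexp.opaque] renders it as <opaque>. *)
type backend_handle = { fd : int }

type ctx = {
  handle : (backend_handle [@sexp.opaque]);
  device_id : int;
  backend_name : string;
}
[@@deriving sexp_of]

let () =
  let c = { handle = { fd = 3 }; device_id = 0; backend_name = "metal" } in
  (* Prints: ((handle <opaque>) (device_id 0) (backend_name metal)) *)
  print_s (sexp_of_ctx c)
```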

arrayjit/lib/ndarray.ml

Lines changed: 4 additions & 5 deletions
@@ -672,11 +672,10 @@ let render_array ?(brief = false) ?(prefix = "") ?(entries_per_axis = 4) ?(label
         else
           concise_float ~prec:Utils.settings.print_decimals_precision (get_as_float arr indices)
       with Invalid_argument _ ->
-        raise
-        @@ Utils.User_error
-             [%string
-               "Invalid indices: %{int_dims_to_string indices} into array: \
-                %{(int_dims_to_string dims)}"])
+        failwith
+          [%string
+            "Invalid indices: %{int_dims_to_string indices} into array: %{(int_dims_to_string \
+             dims)}"])
   in
   let tag ?pos label ind =
     if ind = -1 then ""
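
The new code path reports errors via `failwith` with a `ppx_string` interpolation. The same pattern in isolation, with `dims_to_string` as a hypothetical stand-in for `int_dims_to_string`:

```ocaml
open Core

(* Hypothetical stand-in for the int_dims_to_string helper used above. *)
let dims_to_string dims = String.concat_array ~sep:"x" (Array.map dims ~f:Int.to_string)

(* Raises Failure with an interpolated message, mirroring the hunk above. *)
let check_in_bounds ~dims ~indices =
  Array.iter2_exn dims indices ~f:(fun d i ->
      if i < 0 || i >= d then
        failwith
          [%string "Invalid indices: %{dims_to_string indices} into array: %{dims_to_string dims}"])
```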

docs/shape_inference.md

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # Shape inference and projection inference
 
-To separate concerns, OCANNL is split into the `arrayjit` library, responsible for compilation of high-level n-D array operation sequences (`Assignments.comp`) via the gccjit and cuda backends, and the main `ocannl` library, responsible for deriving the operations computing the forward propagation and backpropagation from tensor expressions. In particular, `arrayjit` contains `Indexing`, which represents complex indexing into arrays, and the main library `ocannl` has `Row` and `Shape` modules, which do the most "heavy-lifting" in the translation from concise tensor expressions to sequences of assignments.
+To separate concerns, OCANNL is split into the `arrayjit` library, responsible for compilation of high-level n-D array operation sequences (`Assignments.comp`) via backends such as sync_cc, metal and cuda, and the main `ocannl` library, responsible for deriving the operations computing the forward propagation and backpropagation from tensor expressions. In particular, `arrayjit` contains `Indexing`, which represents complex indexing into arrays, and the main library `ocannl` has `Row` and `Shape` modules, which do the most "heavy-lifting" in the translation from concise tensor expressions to sequences of assignments.
 
 Broadly speaking, shape inference in OCANNL consists of inferring the `Shape.t` record -- shape inference proper -- and inferring the `Indexing.projections` record -- projections inference. `Shape.t` records are mutable, so that the partially inferred shapes can be observed by the user. Shape and projections inference is intended to be declarative -- independent of the order in which constraints are added. There is one aspect that is not declarative: when tensor expressions are compiled to assignments, i.e. jitted, still-unsolved shape variables in terminal nodes are substituted by their least upper bounds if any, or by dimension-1 / no-more-axes.

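A toy illustration of the single non-declarative step described above (not OCANNL's actual representation): at lowering time a still-unsolved dimension variable is replaced by its least upper bound when one is known, otherwise by dimension 1:

```ocaml
(* Toy model of forcing an unsolved dimension when jitting. *)
type dim = Solved of int | Unsolved of { lub : int option }

let force_at_lowering = function
  | Solved d -> d
  | Unsolved { lub = Some d } -> d (* substitute the least upper bound *)
  | Unsolved { lub = None } -> 1 (* default to a broadcastable dim-1 axis *)
```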
lib/nn_blocks.ml

Lines changed: 20 additions & 0 deletions
@@ -180,6 +180,26 @@ let%op transformer ~label ~num_encoder_layers ~num_decoder_layers ~num_heads ~d_
   let tgt_embedded = ({ tgt_embed; o = [ d_dec ] } * tgt) + pos_encoding_tgt in
   { w_out } * decoder ~train_step tgt_embedded ~enc_output ~mask
 
+(** Transformer with teacher forcing for autoregressive training.
+
+    TODO: Simplify once tensor shifting/slicing is better supported in shape inference. Currently
+    requires pre-shifted tgt_input (all but last token) and tgt_target (all but first token).
+    During training, the model learns to predict tgt_target given tgt_input. *)
+let%op transformer_with_loss ~label:_ ~model () ~train_step ~src ~tgt_input ~tgt_target ~mask =
+  (* Get model predictions for the input sequence. *)
+  let logits = model ~train_step ~src ~tgt:tgt_input ~mask in
+
+  (* Compute cross-entropy loss between predictions and target: softmax over the vocabulary
+     dimension. *)
+  let log_probs = log (softmax ~spec:"... | v" () logits) in
+
+  (* Negative log likelihood loss: -sum(target * log_probs); tgt_target should be one-hot
+     encoded or use label smoothing. *)
+  let loss = -(tgt_target *. log_probs) ++ "...|... => 0" in
+
+  (* Return both loss and logits for potential additional metrics. *)
+  (loss, logits)
+
 (** {2 Convolutional Neural Network Building Blocks} *)
 
 (** 2D convolution layer with flexible padding and stride options. *)
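
In math form, the loss computed by `transformer_with_loss` is standard cross-entropy with one-hot (or label-smoothed) targets; the `++ "...|... => 0"` einsum reduces over all remaining axes, so with positions `t` and vocabulary entries `v` (batch axes elided):

```latex
\mathcal{L} = -\sum_{t}\sum_{v} \mathrm{tgt\_target}_{t,v}\,\log\,\mathrm{softmax}(\mathrm{logits})_{t,v}
```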

lib/train.ml

Lines changed: 4 additions & 4 deletions
@@ -183,7 +183,7 @@ let every_non_literal_on_host =
 
 module Lazy = Utils.Lazy
 
-let to_routine (ctx : Context.t) ?(hosted = true) bindings comp =
+let%track7_sexp to_routine (ctx : Context.t) ?(hosted = true) bindings comp =
   if hosted then Set.iter (snd @@ Asgns.collect_nodes_guess_output comp.Asgns.asgns) ~f:set_hosted;
   let _ctx, routine = Context.compile ctx comp bindings in
   (* Return just the routine for backward compatibility - ctx is discarded here *)
@@ -234,7 +234,7 @@ type example_train_result = {
     true, and the update code is output to a file before shape inference potentially crashes at
     [init_params]. *)
 let%track3_sexp run_once ?(output_cd_file = false) ?(hosted = true) ?(skip_init = false) ?reinit_all
-    ?(bindings = IDX.empty) ~f ctx t =
+    ?(bindings = IDX.empty) ~f ctx (t : Tensor.t) : Context.t =
   if hosted then set_hosted t.Tensor.value;
   (* Compute the update early, to ensure the shape inference is done. *)
   let update = f t in
@@ -275,8 +275,8 @@ let update_once ?output_cd_file ?(hosted = true) ?(skip_init = false) ?reinit_al
 
 (** [printf] is a wrapper around {!Tensor.print} that assumes [~force:true], and by default sets
     [~with_code:false], [~with_grad:true], and [~style:`Default]. *)
-let printf ?here ?(with_grad = true) ?(with_code = false) ?(with_low_level = false)
-    ?(style = `Default) t =
+let%debug7_sexp printf ?here ?(with_grad = true) ?(with_code = false) ?(with_low_level = false)
+    ?(style = `Default) (t : Tensor.t) : unit =
   Tensor.print ?here ~force:true ~with_grad ~with_code ~with_low_level style t
 
 (** [printf_tree] is a wrapper around {!Tensor.print_tree} that assumes [~force:true], and by
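
The `%track7_sexp` / `%debug7_sexp` annotations introduced here come from `ppx_minidebug`, where the numeric suffix selects the log level at which the trace is emitted. A hedged standalone sketch, with an illustrative runtime setup rather than OCANNL's actual configuration:

```ocaml
(* With ppx_minidebug, an annotated binding logs its sexp-rendered
   arguments and result, provided the runtime's log level permits. *)
module Debug_runtime = (val Minidebug_runtime.debug ())

let%track_sexp scale (factor : int) (x : int) : int = factor * x

let () = ignore (scale 3 14 : int)
```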
