Commit 3c0a07c
Fixes #324: Make Tensor.print non-forcing by default; refactor forward_and_forget to forward_and_force
This change ensures that tensor values are forced and transferred to the host when needed, but are never forced by accident.
1 parent 5caec3b commit 3c0a07c
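Below is a minimal before/after sketch of the renamed API (an editorial illustration, not part of the commit; `Backend`, `ctx`, and the `%op` syntax are assumed to be set up as in the repository's `bin/` examples):

    (* Before this commit, forward_and_forget ran the forward pass and discarded
       the context, and Tensor.print forced un-computed values as a side effect. *)
    let%op y = 2 *. "hey" in
    (* Runs the forward pass, forces y.value, and reads it back to the host: *)
    Train.forward_and_force (module Backend) ctx y;
    (* Printing is now non-forcing by default: had the value not been computed,
       this would render a "<not-in-yet>" placeholder instead of forcing it. *)
    Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default y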

File tree: 12 files changed, +192 −139 lines

bin/einsum_trivia.ml

Lines changed: 6 additions & 6 deletions
@@ -15,7 +15,7 @@ let _suspended () =
   let a = TDSL.range_of_shape ~label:[ "a" ] ~input_dims:[ 2 ] ~output_dims:[ 2 ] () in
   let b = TDSL.range_of_shape ~label:[ "b" ] ~input_dims:[ 2; 3; 4 ] ~output_dims:[ 2 ] () in
   let%op c = a *+ "i->1; ij...->0 => ...->ji" b in
-  Train.forward_and_forget (module Backend) ctx c;
+  Train.forward_and_force (module Backend) ctx c;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ c;
   Stdio.printf "\n%!"

@@ -40,7 +40,7 @@ let _suspended () =
   in
   let%op ho2 = hey2 ++ "ab|cd->ef => cf|ae->db" in
   Utils.capture_stdout_logs @@ fun () ->
-  Train.forward_and_forget backend ctx ho2;
+  Train.forward_and_force backend ctx ho2;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ ho2

 let () =
@@ -61,11 +61,11 @@ let () =
   let%op a2 = a *+ "b|i->o; b|i->o => b|i->o" a in
   let ctx = Utils.capture_stdout_logs (fun () -> Train.forward_and_ctx backend ctx a2) in
   let%op c = b *+ "b|h->o; b|i->h => b|i->o" a in
-  Utils.capture_stdout_logs (fun () -> Train.forward_and_forget backend ctx c);
+  Utils.capture_stdout_logs (fun () -> Train.forward_and_force backend ctx c);
   (* let%op d = a *+ "a|i->h; b|h->o => ab|i->o" b in Utils.capture_stdout_logs (fun () ->
-     Train.forward_and_forget backend ctx d); let%op e = a *+ "b|i->h; b|h->o => i->o" b in
-     Utils.capture_stdout_logs (fun () -> Train.forward_and_forget backend ctx e); let%op f = a *+
-     "a|i->h; b|h->o => i->o" b in Utils.capture_stdout_logs (fun () -> Train.forward_and_forget
+     Train.forward_and_force backend ctx d); let%op e = a *+ "b|i->h; b|h->o => i->o" b in
+     Utils.capture_stdout_logs (fun () -> Train.forward_and_force backend ctx e); let%op f = a *+
+     "a|i->h; b|h->o => i->o" b in Utils.capture_stdout_logs (fun () -> Train.forward_and_force
      backend ctx f); *)
   (* Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ a2; *)
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ c

bin/hello_world.ml

Lines changed: 5 additions & 5 deletions
@@ -19,7 +19,7 @@ let hello1 () =
   let hey = range_of_shape ~batch_dims:[ 7 ] ~input_dims:[ 9; 10; 11 ] ~output_dims:[ 13; 14 ] () in
   let%op hoo = ((1 + 1) * hey) - 10 in
   (* For convenience, Train.forward will set hoo.value as fully on host. *)
-  Train.forward_and_forget (module Backend) ctx hoo;
+  Train.forward_and_force (module Backend) ctx hoo;
   (* Disable line wrapping for viewing the output. In VSCode: `View: Toggle Word Wrap`. *)
   Tensor.print_tree ~with_grad:false ~depth:99 hoo;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default hoo
@@ -33,7 +33,7 @@ let hello2 () =
   let%op y = ("hey" * 'q' 2.0) + 'p' 1.0 in
   (* Punning for ["hey"] above introduced the [hey] identifier. *)
   Train.every_non_literal_on_host y;
-  Train.forward_and_forget (module Backend) ctx y;
+  Train.forward_and_force (module Backend) ctx y;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ hey;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ y

@@ -79,7 +79,7 @@ let hello4 () =
   let positions = TDSL.outer_sum "ijl;kl=>ijkl" (TDSL.outer_sum "il;jl=>ijl" ti tj) tk in
   Train.set_hosted ti.value;
   Train.set_hosted tk.value;
-  Train.forward_and_forget backend ctx positions;
+  Train.forward_and_force backend ctx positions;
   Stdio.print_endline "positions:";
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ positions;
   Stdio.print_endline "tk:";
@@ -103,7 +103,7 @@ let hello5 () =
   Rand.init 0;
   let hey = TDSL.range_of_shape ~batch_dims:[ 2 ] ~input_dims:[ 3 ] ~output_dims:[ 4 ] () in
   let%op ho = hey ++ "...|1->... => ...|..." in
-  Train.forward_and_forget backend ctx ho;
+  Train.forward_and_force backend ctx ho;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ ho

 let hello6 () =
@@ -121,7 +121,7 @@ let hello6 () =
   Rand.init 0;
   (* "Hey" is inferred to be a scalar. *)
   let%op y = 2 *. "hey" in
-  Train.forward_and_forget backend ctx y;
+  Train.forward_and_force backend ctx y;
   (* Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ hey; *)
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ y

bin/hello_world_op.ml

Lines changed: 28 additions & 17 deletions
@@ -29,7 +29,7 @@ let%track2_sexp _Pointwise_multiplication_dims_1 (() : unit) : unit =
   Rand.init 0;
   (* "Hey" is inferred to be a scalar. *)
   let%op ya = 2 *. "hey" 7.0 in
-  Train.forward_and_forget backend ctx ya;
+  Train.forward_and_force backend ctx ya;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ ya

 let%track2_sexp _Matrix_multiplication_dims_1x1 (() : unit) : unit =
@@ -48,11 +48,25 @@ let%track2_sexp _Matrix_multiplication_dims_1x1 (() : unit) : unit =
   Rand.init 0;
   (* Hey is inferred to be a matrix because of matrix multiplication [*]. *)
   let%op yb = ("hey" 7.0 * 'q' 2.0) + 'p' 1.0 in
-  Train.forward_and_forget backend ctx yb;
+  Train.forward_and_force backend ctx yb;
   (* Punning for ["hey"] above introduced the [hey] identifier. *)
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ hey;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ yb

+let%track2_sexp _Print_constant_tensor_too_early (() : unit) : unit =
+  Tensor.unsafe_reinitialize ();
+  let module Backend = (val Backends.fresh_backend ()) in
+  let print_tensor = Tensor.print ~with_code:false ~with_grad:false in
+
+  let%op a = [| 1.; 2.; 3.; 4. |] in
+  let%op b = [| 2.; 3.; 4.; 5. |] in
+  print_tensor ~here:[%here] `Default a;
+  print_tensor ~here:[%here] `Default b;
+  let%op c = a *. b in
+  let ctx = Train.init_params (module Backend) IDX.empty c in
+  Train.forward_and_force (module Backend) ctx c;
+  print_tensor ~here:[%here] `Default c
+
 let%track2_sexp _Print_constant_tensor (() : unit) : unit =
   Tensor.unsafe_reinitialize ();
   let module Backend = (val Backends.fresh_backend ()) in
@@ -68,11 +82,11 @@ let%track2_sexp _Print_constant_tensor (() : unit) : unit =
   let ctx = Backend.make_context stream in
   Rand.init 0;
   let%op hey = [ (1, 2, 3); (4, 5, 6) ] in
-  Train.forward_and_forget backend ctx hey;
+  Train.forward_and_force backend ctx hey;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Inline @@ hey;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ hey;
   let%op hoo = [| [ 1; 2; 3 ]; [ 4; 5; 6 ] |] in
-  Train.forward_and_forget backend ctx hoo;
+  Train.forward_and_force backend ctx hoo;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Inline @@ hoo;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ hoo;
   let%op hey2 =
@@ -83,7 +97,7 @@ let%track2_sexp _Print_constant_tensor (() : unit) : unit =
       ((19, 20, 21), (22, 23, 24));
     ]
   in
-  Train.forward_and_forget backend ctx hey2;
+  Train.forward_and_force backend ctx hey2;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Inline @@ hey2;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ hey2;
   let%op hoo2 =
@@ -94,7 +108,7 @@ let%track2_sexp _Print_constant_tensor (() : unit) : unit =
       [ [ 19; 20; 21 ]; [ 22; 23; 24 ] ];
     |]
   in
-  Train.forward_and_forget backend ctx hoo2;
+  Train.forward_and_force backend ctx hoo2;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Inline @@ hoo2;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ hoo2;
   let%op heyhoo =
@@ -105,7 +119,7 @@ let%track2_sexp _Print_constant_tensor (() : unit) : unit =
       [| [ 19; 20; 21 ]; [ 22; 23; 24 ] |];
     |]
   in
-  Train.forward_and_forget backend ctx heyhoo;
+  Train.forward_and_force backend ctx heyhoo;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Inline @@ heyhoo;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ heyhoo;
   let%op heyhoo2 =
@@ -116,7 +130,7 @@ let%track2_sexp _Print_constant_tensor (() : unit) : unit =
       [| [ [ 19; 49 ]; [ 20; 50 ]; [ 21; 51 ] ]; [ [ 22; 52 ]; [ 23; 53 ]; [ 24; 54 ] ] |];
     |]
   in
-  Train.forward_and_forget backend ctx heyhoo2;
+  Train.forward_and_force backend ctx heyhoo2;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Inline @@ heyhoo2;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ heyhoo2;
   let%op heyhoo3 =
@@ -131,7 +145,7 @@ let%track2_sexp _Print_constant_tensor (() : unit) : unit =
       |];
     |]
   in
-  Train.forward_and_forget backend ctx heyhoo3;
+  Train.forward_and_force backend ctx heyhoo3;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Inline @@ heyhoo3;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ heyhoo3;
   let%op heyhoo4 =
@@ -146,7 +160,7 @@ let%track2_sexp _Print_constant_tensor (() : unit) : unit =
       ];
     |]
   in
-  Train.forward_and_forget backend ctx heyhoo4;
+  Train.forward_and_force backend ctx heyhoo4;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Inline @@ heyhoo4;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ heyhoo4

@@ -166,7 +180,7 @@ let%track2_sexp _Matrix_multiplication_dims_2x3 (() : unit) : unit =
   Rand.init 0;
   (* Hey is inferred to be a matrix. *)
   let%op yc = ("hey" 7.0 * [ 2; 3 ]) + [ 4; 5; 6 ] in
-  Train.forward_and_forget backend ctx yc;
+  Train.forward_and_force backend ctx yc;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ hey;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default @@ yc

@@ -188,7 +202,7 @@ let%track2_sexp _Big_matrix (() : unit) : unit =
   let hey = TDSL.param ~value:0.5 "hey" in
   let zero_to_twenty = TDSL.range 20 in
   let%op yd = (hey * zero_to_twenty) + zero_to_twenty in
-  Train.forward_and_forget backend ctx yd;
+  Train.forward_and_force backend ctx yd;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default hey;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default yd

@@ -208,7 +222,7 @@ let%track2_sexp _Very_big_tensor (() : unit) : unit =
   Rand.init 0;
   let hey = TDSL.range_of_shape ~batch_dims:[ 6 ] ~input_dims:[ 7; 8 ] ~output_dims:[ 9 ] () in
   let%op ye = (hey * (1 + 1)) - 10 in
-  Train.forward_and_forget backend ctx ye;
+  Train.forward_and_force backend ctx ye;
   Tensor.print ~here:[%here] ~with_code:false ~with_grad:false `Default ye

 let _suspended (() : unit) : unit =
@@ -223,7 +237,4 @@ let _suspended (() : unit) : unit =
   _Big_matrix ();
   _Very_big_tensor ()

-let (() : unit) : unit =
-  _Matrix_multiplication_dims_2x3 ();
-  _Big_matrix ();
-  _Very_big_tensor ()
+let (() : unit) : unit = _Print_constant_tensor_too_early ()
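The new `_Print_constant_tensor_too_early` test (now the sole entry point) exercises the non-forcing default: it prints `a` and `b` before any routine has been compiled or run. A sketch of what this is expected to demonstrate, based on the placeholder introduced in lib/tensor.ml below (the exact output is an assumption):

    (* Printing before any forward pass: the lazy arrays are not yet forced,
       so the non-forcing print should render a "<not-in-yet>" placeholder. *)
    print_tensor ~here:[%here] `Default a;
    (* After forward_and_force, the values are materialized on the host: *)
    Train.forward_and_force (module Backend) ctx c;
    print_tensor ~here:[%here] `Default c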

bin/zero2hero_1of7.ml

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ let _suspended () =
   let%op f5 = f 5 in
   let module Backend = (val Backends.fresh_backend ()) in
   Train.every_non_literal_on_host f5;
-  Train.forward_and_forget
+  Train.forward_and_force
     (module Backend)
     Backend.(make_context @@ new_stream @@ get_device ~ordinal:0)
     f5;
@@ -100,7 +100,7 @@ let _suspended () =
   let%op f x = (3 *. (x **. 2)) - (4 *. x) + 5 in
   let%op f5 = f 5 in
   Train.every_non_literal_on_host f5;
-  Train.forward_and_forget (module Backend) ctx f5;
+  Train.forward_and_force (module Backend) ctx f5;
   Tensor.print_tree ~with_grad:false ~depth:9 f5;
   let size = 100 in
   let xs = Array.init size ~f:Float.(fun i -> (of_int i / 10.) - 5.) in

lib/operation.ml

Lines changed: 2 additions & 1 deletion
@@ -375,6 +375,7 @@ let embed_symbol ?(label = []) static_sym : Tensor.t =
     (Shape.make ~batch_dims:[] ~input_dims:[] ~output_dims:[ 1 ] ())
     []

+(*
 let random_seed =
   let seed = Option.value ~default:42 @@ Utils.settings.fixed_state_for_init in
   let res =
@@ -384,7 +385,7 @@ let random_seed =
   in
   Tn.update_memory_mode res.value Tn.Effectively_constant 24;
   Tn.update_prec res.value Ir.Ops.uint4x32;
-  ref res
+  ref res *)

 module DO = struct
   let ( * ) = matmul ~grad_spec:If_needed

lib/tensor.ml

Lines changed: 7 additions & 5 deletions
@@ -610,11 +610,13 @@ let to_dag ?(single_node = false) ?(embedded_only = false) ?entries_per_axis ~sp
           grad_txt diff ^ if (not should_elide) && not embedded then " non-emb" else ""
         in
         let node =
+          if Lazy.is_val diff.grad.array then
           match Lazy.force diff.grad.array with
           | Some g_array ->
              Tn.do_read diff.grad;
              `Box (Nd.render_array ~brief:true ~prefix ?entries_per_axis ~labels ~indices g_array)
          | None -> `Text (prefix ^ " " ^ where_located diff.grad)
+          else `Text (prefix ^ " <not-in-yet> " ^ where_located diff.grad)
        in
        `Subtree_with_ID (id, `Tree (add_shape [ node ], children))
    | _, true, true, Some diff ->
@@ -666,7 +668,7 @@ let log_debug_info ~from_log_level t =
             Tn.log_debug_info ~from_log_level diff.grad]);
       List.iter ~f:log_child t.children]]

-let to_doc ?(spy = false) ~with_grad ~with_code ?(with_low_level = false)
+let to_doc ?(force_read = false) ~with_grad ~with_code ?(with_low_level = false)
     (style : array_print_style) t =
   let sh = t.shape in
   let label = Tn.label t.value in
@@ -724,7 +726,7 @@ let to_doc ?(spy = false) ~with_grad ~with_code ?(with_low_level = false)
   let open PPrint in
   (* Create document for tensor value *)
   let value_doc =
-    if spy && not (Lazy.is_val t.value.array) then
+    if not force_read && not (Lazy.is_val t.value.array) then
       string prefix_str ^^ string " <not-in-yet>" ^^ space
     else
       match (style, Lazy.force t.value.array) with
@@ -743,7 +745,7 @@ let to_doc ?(spy = false) ~with_grad ~with_code ?(with_low_level = false)
     if with_grad then
       match t.diff with
       | Some diff -> (
-          if spy && not (Lazy.is_val diff.grad.array) then
+          if not force_read && not (Lazy.is_val diff.grad.array) then
            string (grad_txt diff) ^^ string " <not-in-yet>" ^^ space
          else
            match Lazy.force diff.grad.array with
@@ -816,12 +818,12 @@ let to_doc ?(spy = false) ~with_grad ~with_code ?(with_low_level = false)
   (* Combine all documents and print *)
   group (value_doc ^^ break 1 ^^ grad_doc ^^ break 1 ^^ code_doc ^^ break 1 ^^ low_level_doc)

-let print ?here ?(spy = false) ~with_grad ~with_code ?(with_low_level = false)
+let print ?here ?(force_read = false) ~with_grad ~with_code ?(with_low_level = false)
     (style : array_print_style) t =
   Option.iter here ~f:(fun here ->
       Stdio.printf "HERE: %s\n%!" (Source_code_position.to_string here));
   PPrint.ToChannel.pretty 0.7 100 Stdio.stdout
-    (to_doc ~spy ~with_grad ~with_code ~with_low_level style t)
+    (to_doc ~force_read ~with_grad ~with_code ~with_low_level style t)

 let print_forward_roots ~with_grad ~with_code (style : array_print_style) =
   List.iter (Map.to_alist ~key_order:`Increasing session_state.forward_roots) ~f:(fun (id, root) ->
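Note the polarity flip: the old `?spy` defaulted to `false`, so printing forced un-computed arrays unless callers opted out with `~spy:true`; the new `?force_read` also defaults to `false`, so printing is now non-forcing unless callers opt in. A hedged call-site sketch against the new signature, for some tensor `t`:

    (* Default: renders only what is already forced, else "<not-in-yet>": *)
    Tensor.print ~here:[%here] ~with_grad:false ~with_code:false `Default t;
    (* Opt in to forcing when the value is genuinely needed: *)
    Tensor.print ~here:[%here] ~force_read:true ~with_grad:false ~with_code:false `Default t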

lib/tensor.mli

Lines changed: 2 additions & 2 deletions
@@ -345,7 +345,7 @@ val to_printbox :
   PrintBox.t

 val to_doc :
-  ?spy:bool ->
+  ?force_read:bool ->
   with_grad:bool ->
   with_code:bool ->
   ?with_low_level:bool ->
@@ -355,7 +355,7 @@ val to_doc :

 val print :
   ?here:Ppx_here_lib.position ->
-  ?spy:bool ->
+  ?force_read:bool ->
   with_grad:bool ->
   with_code:bool ->
   ?with_low_level:bool ->

lib/train.ml

Lines changed: 11 additions & 2 deletions
@@ -550,6 +550,15 @@ let%track3_sexp forward_and_ctx ?(hosted = true) ?(skip_init = false)
   Task.run routine.schedule;
   routine.context

-let forward_and_forget ?hosted ?skip_init ?disable_rootness_check backend ctx ?bindings t =
+(** [forward_and_force] is a wrapper around {!forward_and_ctx} that additionally forces the
+    tensor's value and ensures it is transferred back to host as needed, see the setting
+    {!Utils.settings.automatic_host_transfers}. The resulting context is ignored.
+
+    Note: [Tensor.print ~force_read:true] also has this effect, so: using [forward_and_force] you
+    don't need to pass [~force_read:true], and if you need the context and also to print the result,
+    you can combine {!forward_and_ctx} and [Tensor.print ~force_read:true]. *)
+let forward_and_force ?hosted ?skip_init ?disable_rootness_check backend ctx ?bindings t =
   (* FIXME: to properly forget we need to free the incrementally-allocated memory! *)
-  ignore @@ forward_and_ctx ?hosted ?skip_init ?disable_rootness_check backend ctx ?bindings t
+  ignore @@ forward_and_ctx ?hosted ?skip_init ?disable_rootness_check backend ctx ?bindings t;
+  ignore (Lazy.force t.value.array);
+  Tn.do_read t.value
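As the new doc comment spells out, callers that still need the resulting context can keep `forward_and_ctx` and move the forcing to print time; a sketch under the same assumptions as the `bin/` examples:

    (* Keep the context from the forward pass ... *)
    let ctx = Train.forward_and_ctx (module Backend) ctx t in
    (* ... and force/read the value only when printing it: *)
    Tensor.print ~here:[%here] ~force_read:true ~with_grad:false ~with_code:false `Default t;
    (* ctx remains available for further routines. *)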
