Commit c465f79

Automatically init in Train.forward_and_ctx / forward_and_forget; refactoring for bin/ examples
The examples are still often broken and will be audited after another round of refactoring.
1 parent fa884ef commit c465f79
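
Concretely, the refactored bin/ examples now run parameter initialization as an explicit routine before the training routine. A minimal sketch of that pattern follows, assuming a Backend module, a context ctx, and a loss tensor l already set up as in the files below:

  (* Sketch only: Backend, ctx, IDX and the loss tensor l are assumed from the surrounding example. *)
  let init_params = Tensor.init_params l in
  let init = Backend.link ctx @@ Backend.compile ctx.optimize_ctx IDX.empty init_params in
  let update = Train.grad_update l in
  (* Compile the training routine against the context returned by linking the init routine. *)
  let routine = Train.to_routine (module Backend) init.context IDX.empty update in
  (* Run the initialization once, then the training step. *)
  Train.run init;
  Train.run routine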

File tree: 7 files changed, +41 −14 lines changed

bin/micrograd_basic.ml

Lines changed: 4 additions & 1 deletion
@@ -51,9 +51,12 @@ let%diagn_sexp () : unit =
   let%op g = g + (10. /. f) in
   List.iter ~f:(function Some diff -> Train.set_hosted diff.grad | None -> ()) [ a.diff; b.diff ];
   (* Train.every_non_literal_on_host g; *)
+  let init_params = Tensor.init_params g in
   let update = Train.grad_update g in
-  let routine = Train.to_routine (module Backend) ctx IDX.empty update in
+  let init = Backend.link ctx @@ Backend.compile ctx.optimize_ctx IDX.empty init_params in
+  let routine = Train.to_routine (module Backend) init.context IDX.empty update in
   Utils.capture_stdout_logs @@ fun () ->
+  Train.run init;
   Train.run routine;
   (* Tensor.print_tree ~with_grad:true ~depth:9 g; *)
   Tensor.print ~with_code:false ~with_grad:false `Default @@ g;

bin/micrograd_demo.ml

Lines changed: 5 additions & 1 deletion
@@ -74,6 +74,7 @@ let experiment seed ~no_batch_shape_inference ~use_builtin_weight_decay () =
     (scalar_loss, 0.0)
   in
   (* So that we can inspect them. *)
+  let init_params = Tensor.init_params scalar_loss in
   let update = Train.grad_update scalar_loss in
   let%op learning_rate = 0.1 *. (!..steps - !@step_n) /. !..steps in
   Train.set_hosted learning_rate.value;
@@ -82,6 +83,9 @@ let experiment seed ~no_batch_shape_inference ~use_builtin_weight_decay () =
   let module Backend = (val Backends.fresh_backend ()) in
   let stream = Backend.(new_stream @@ get_device ~ordinal:0) in
   let ctx = Backend.make_context stream in
+  let init = Backend.link ctx @@ Backend.compile ctx.optimize_ctx IDX.empty init_params in
+  let ctx = init.context in
+  Train.run init;
   let routine = Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update; sgd ]) in
   (* Stdio.print_endline "\n******** scalar_loss **********"; Tensor.print_tree ~with_id:true
      ~with_grad:false ~depth:9 scalar_loss; Stdio.print_endline "\n******** learning_rate
@@ -114,7 +118,7 @@ let experiment seed ~no_batch_shape_inference ~use_builtin_weight_decay () =
   let points = Tn.points_2d ~xdim:0 ~ydim:1 moons_flat.value in
   let classes = Tn.points_1d ~xdim:0 moons_classes.value in
   let points1, points2 = Array.partitioni_tf points ~f:Float.(fun i _ -> classes.(i) > 0.) in
-  let%op mlp_result = mlp "point" in
+  let%cd mlp_result = mlp "point" in
   Train.set_on_host mlp_result.value;
   (* By using jitted.context here, we don't need to copy the parameters back to the host. *)
   let result_routine =

bin/micrograd_demo_logging.ml

Lines changed: 4 additions & 0 deletions
@@ -30,9 +30,13 @@ let () =
   let%op g = f /. 2 in
   let%op g = g + (10. /. f) in
   List.iter ~f:(Option.iter ~f:(fun diff -> Train.set_hosted diff.Tensor.grad)) [ a.diff; b.diff ];
+  let init_params = Tensor.init_params g in
+  let init = Backend.link ctx @@ Backend.compile ctx.optimize_ctx IDX.empty init_params in
+  let ctx = init.context in
   let update = Train.grad_update g in
   let step = Train.to_routine (module Backend) ctx IDX.empty update in
   Utils.capture_stdout_logs @@ fun () ->
+  Train.run init;
   Train.run step;
   Tensor.print ~with_code:false ~with_grad:false `Default g;
   Tensor.print ~with_code:false ~with_grad:true `Default a;

bin/moons_benchmark.ml

Lines changed: 4 additions & 4 deletions
@@ -89,14 +89,14 @@ let classify_moons ~seed ~on_device ~inlining_cutoff ~num_streams ~batch_size ~b
   in
   Stdlib.Format.printf "Initial backend global debug info: %a\n%!" Sexp.pp_hum
   @@ Backend.get_global_debug_info ();
-  let per_batch_callback ~at_batch:_ ~at_step:_ ~learning_rate:_ ~batch_loss:_ ~epoch_loss:_ =
-    (* Stdio.printf "Batch=%d, step=%d, lr=%f, batch loss=%f, epoch loss=%f\n%!" at_batch at_step
-       learning_rate batch_loss epoch_loss; *)
+  let per_batch_callback ~at_batch ~at_step ~learning_rate ~batch_loss ~epoch_loss =
+    Stdio.printf "Batch=%d, step=%d, lr=%f, batch loss=%f, epoch loss=%f\n%!" at_batch at_step
+      learning_rate batch_loss epoch_loss;
     if Option.is_none !start_time then start_time := Some (Time_now.nanoseconds_since_unix_epoch ())
   in
   (* Tn.print_accessible_headers (); *)
   let per_epoch_callback ~at_step ~at_epoch ~learning_rate ~epoch_loss =
-    if at_epoch % 10 = 9 then
+    (* if at_epoch % 10 = 9 then *)
       Stdio.printf "Epoch=%d, step=%d, lr=%f, epoch loss=%f\n%!" at_epoch at_step learning_rate
         epoch_loss
   in

bin/moons_demo.ml

Lines changed: 6 additions & 2 deletions
@@ -57,9 +57,12 @@ let demo () =
   Train.set_hosted learning_rate.value;
   let sgd = Train.sgd_update ~learning_rate ~weight_decay scalar_loss in
 
-  let module Backend = (val Backends.fresh_backend ~backend_name:"cuda" ()) in
+  let module Backend = (val Backends.fresh_backend ~backend_name:"metal" ()) in
   let stream = Backend.(new_stream @@ get_device ~ordinal:0) in
   let ctx = Backend.make_context stream in
+  let init_params = Tensor.init_params scalar_loss in
+  let init = Backend.link ctx @@ Backend.compile ctx.optimize_ctx IDX.empty init_params in
+  let ctx = init.context in
   let routine = Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update; sgd ]) in
 
   let points = Tn.points_2d ~xdim:0 ~ydim:1 moons_flat.value in
@@ -81,6 +84,7 @@ let demo () =
   let batch_ref = IDX.find_exn routine.bindings batch_n in
   let epoch_loss = ref 0. in
   step_ref := 0;
+  Train.run init;
   let%track_sexp _train_loop : unit =
     for epoch = 0 to epochs - 1 do
       for batch = 0 to n_batches - 1 do
@@ -95,7 +99,7 @@ let demo () =
     done
   in
 
-  let%op mlp_result = mlp "point" in
+  let%cd mlp_result = mlp "point" in
   Train.set_on_host mlp_result.value;
   let result_routine =
     Train.to_routine

bin/zero2hero_1of7.ml

Lines changed: 6 additions & 2 deletions
@@ -154,8 +154,13 @@ let () =
   Train.every_non_literal_on_host l;
   let module Backend = (val Backends.fresh_backend ()) in
   let stream = Backend.(new_stream @@ get_device ~ordinal:0) in
+  let init_params = Tensor.init_params l in
   let update = Train.grad_update l in
-  let routine = Train.to_routine (module Backend) (Backend.make_context stream) IDX.empty update in
+  let ctx = Backend.make_context stream in
+  let init = Backend.link ctx @@ Backend.compile ctx.optimize_ctx IDX.empty init_params in
+  let ctx = init.context in
+  let routine = Train.to_routine (module Backend) ctx IDX.empty update in
+  Train.run init;
   Train.run routine;
   (* Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.to_host routine.context a : bool));
      Backend.await stream; *)
@@ -181,7 +186,6 @@ let () =
     only params values will change, compared to the above.|};
   Tensor.print_tree ~with_grad:true ~depth:9 l;
   (* We could reuse the jitted code if we did not use `jit_and_run`. *)
-  let update = Train.grad_update l in
   let routine = Train.to_routine (module Backend) routine.context IDX.empty update in
   Train.run routine;
   (* Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.to_host routine.context a : bool));

lib/train.ml

Lines changed: 12 additions & 4 deletions
@@ -504,22 +504,30 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init
   }
 
 (* Note: this will get nicer with modular explicits. *)
-let%track3_sexp forward_and_ctx ?(hosted = true) ?(disable_rootness_check = false)
-    (type buffer_ptr dev runner event optimize_ctx)
+let%track3_sexp forward_and_ctx ?(hosted = true) ?(skip_init = false)
+    ?(disable_rootness_check = false) (type buffer_ptr dev runner event optimize_ctx)
     (module Backend : Backend
       with type buffer_ptr = buffer_ptr
       and type dev = dev
      and type runner = runner
      and type optimize_ctx = optimize_ctx
     and type event = event) ctx ?(bindings = IDX.empty) t =
   if hosted then set_hosted t.Tensor.value;
+  let ctx =
+    if skip_init || Set.is_empty t.params then ctx
+    else
+      let init_params = Tensor.init_params t in
+      let init = Backend.link ctx @@ Backend.compile ctx.optimize_ctx bindings init_params in
+      run init;
+      init.context
+  in
   let routine =
     Backend.(link ctx @@ compile ctx.optimize_ctx bindings @@ forward ~disable_rootness_check t)
   in
   if not disable_rootness_check then Tensor.remove_bprop_root t;
   Task.run routine.schedule;
   routine.context
 
-let forward_and_forget ?hosted ?disable_rootness_check backend ctx ?bindings t =
+let forward_and_forget ?hosted ?skip_init ?disable_rootness_check backend ctx ?bindings t =
   (* FIXME: to properly forget we need to free the incrementally-allocated memory! *)
-  ignore @@ forward_and_ctx ?hosted ?disable_rootness_check backend ctx ?bindings t
+  ignore @@ forward_and_ctx ?hosted ?skip_init ?disable_rootness_check backend ctx ?bindings t