Commit 8a26b97

Changed the `%cd` block comment syntax `~~` to allow detailed structuring.
Rewrote `Train.grad_update` to use the `%cd` syntax.
1 parent 72d12ba commit 8a26b97

4 files changed: +74 -62 lines

CHANGES.md

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@
 ### Changed
 
 - Removed the `pipes_cc, pipes_gccjit` backends (`Pipes_multicore_backend`) -- I had fixed `Pipes_multicore_backend` by using the `poll` library instead of `Unix.select`, but it turns out to be very very slow.
+- Changed the `%cd` block comment syntax `~~` to allow detailed structuring. Rewrote `Train.grad_update` to use the `%cd` syntax.
 
 ### Fixed
 
lib/ppx_cd.ml

Lines changed: 20 additions & 20 deletions
@@ -738,6 +738,26 @@ let translate (expr : expression) : result =
           Ast_builder.Default.pexp_extension ~loc
           @@ Location.error_extensionf ~loc "ppx_ocannl %%cd: repeated .merge not allowed";
       })
+  | [%expr
+      ~~([%e? { pexp_desc = Pexp_apply (expr, exprs); pexp_loc; _ }];
+         [%e? expr2])] ->
+      let elements =
+        expr :: List.map ~f:snd exprs
+        |> List.map ~f:(function
+             | { pexp_desc = Pexp_constant (Pconst_string _); _ } as s -> s
+             | [%expr [%e? t].value] -> [%expr Arrayjit.Tnode.debug_name [%e t].value]
+             | [%expr [%e? t].grad] -> [%expr Arrayjit.Tnode.debug_name [%e t].value ^ ".grad"]
+             | t -> [%expr Arrayjit.Tnode.debug_name [%e t].value])
+      in
+      let res2 = loop ~proj_in_scope expr2 in
+      {
+        res2 with
+        expr =
+          [%expr
+            Arrayjit.Assignments.Block_comment
+              ( String.concat_array ~sep:" " [%e Ast_helper.Exp.array ~loc:pexp_loc elements],
+                [%e res2.expr] )];
+      }
   | [%expr
       [%e? accu_op]
        [%e? lhs]
@@ -916,26 +936,6 @@ let translate (expr : expression) : result =
           @@ Location.error_extensionf ~loc
               "ppx_ocannl %%cd: for-downto: low-level code embeddings not supported yet";
       }
-  | [%expr
-      ~~[%e? { pexp_desc = Pexp_apply (expr, exprs); pexp_loc; _ }];
-      [%e? expr2]] ->
-      let elements =
-        expr :: List.map ~f:snd exprs
-        |> List.map ~f:(function
-             | { pexp_desc = Pexp_constant (Pconst_string _); _ } as s -> s
-             | [%expr [%e? t].value] -> [%expr Arrayjit.Tnode.debug_name [%e t].value]
-             | [%expr [%e? t].grad] -> [%expr Arrayjit.Tnode.debug_name [%e t].value ^ ".grad"]
-             | t -> [%expr Arrayjit.Tnode.debug_name [%e t].value])
-      in
-      let res2 = loop ~proj_in_scope expr2 in
-      {
-        res2 with
-        expr =
-          [%expr
-            Arrayjit.Assignments.Block_comment
-              ( String.concat_array ~sep:" " [%e Ast_helper.Exp.array ~loc:pexp_loc elements],
-                [%e res2.expr] )];
-      }
   | [%expr
       [%e? expr1];
       [%e? expr2]]
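
For orientation, here is a rough, hand-written approximation of what the new `~~(...)` case produces, not the literal ppx output; the tensor `p` and the assignments value `body` are hypothetical, and the sketch assumes the same module context as code that uses `%cd` (Base opened, OCANNL and Arrayjit available, as in lib/train.ml):

```ocaml
(* Approximate expansion of [%cd ~~("merging gradient of" p; <body>)]:
   string items pass through unchanged, tensor items become their debug names,
   and the scope's assignments are wrapped in a Block_comment. *)
let commented (p : Tensor.t) (body : Arrayjit.Assignments.t) : Arrayjit.Assignments.t =
  Arrayjit.Assignments.Block_comment
    ( String.concat_array ~sep:" "
        [| "merging gradient of"; Arrayjit.Tnode.debug_name p.value |],
      body )
```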

lib/syntax_extensions.md

Lines changed: 1 addition & 1 deletion
@@ -204,7 +204,7 @@ type Assignments.t =
 ...
 ```
 
-Schematic example: `~~("space" "separated" "comment" "tensor p debug_name:" p); <scope of the comment>`. The content of the comment uses application syntax, must be composed of strings, `<tensor>`, `<tensor>.value` (equivalent to `<tensor>`), `<tensor>.grad` components, where `<tensor>` is any tensor expression or tensor identifier.
+Schematic example: `~~("space" "separated" "comment" "tensor p debug_name:" p; <scope of the comment>)`. The content of the comment uses application syntax, must be composed of strings, `<tensor>`, `<tensor>.value` (equivalent to `<tensor>`), `<tensor>.grad` components, where `<tensor>` is any tensor expression or tensor identifier.
 
 ## Further features of the syntax extension `%op` {#features-of-syntax-op}
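
To illustrate the detailed structuring the new placement of the scope enables, here is a sketch of nested `~~` comments inside `%cd`, modeled directly on the `Train.grad_update` rewrite in this commit; the identifiers `loss`, `fwd`, `zero_grads`, `init_grad`, and `bprop` are assumed to be in scope:

```ocaml
(* Each ~~(<comment items>; <scope>) wraps its scope in an
   Assignments.Block_comment, and comments nest, so the generated code is
   labeled both at the whole-update level and per phase. *)
[%cd
  ~~(loss "gradient update";
     ~~(loss "fwd";
        fwd);
     ~~(loss "zero grads";
        zero_grads);
     init_grad;
     ~~(loss "bprop";
        bprop))]
```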

lib/train.ml

Lines changed: 52 additions & 41 deletions
@@ -111,27 +111,41 @@ let set_hosted (a : Tn.t) =
   if Tn.known_constant a then Tn.update_memory_mode a (Hosted Constant) 41
   else Tn.update_memory_mode a (Hosted Changed_on_devices) 41
 
-let label_suffix label =
-  (* FIXME: this should be label prefix, as most valuable label components come first. *)
-  Option.value ~default:"unknown"
-  @@ List.find ~f:(String.for_all ~f:(fun c -> Char.is_alphanum c || equal_char '_' c))
-  @@ List.rev label
-
 (** Sets the tensor's value as "fully on host", returns the tensor's forward code with a
     label-derived comment. *)
 let forward ?(disable_rootness_check = false) t =
   let fwd = if disable_rootness_check then t.Tensor.forward else Tensor.consume_forward_code t in
   set_hosted t.Tensor.value;
-  let label = label_suffix t.Tensor.value.label in
+  let label = Tn.debug_name t.value in
   Asgns.Block_comment (label ^ " fwd", fwd)
 
 type updaten = {
   loss : Tensor.t;
-  label : string;
   params : (Tensor.t, Tensor.comparator_witness) Base.Set.t;
   fwd_bprop : Asgns.t;
 }
 
+let diff_or_error t provenance =
+  Option.value_or_thunk t.Tensor.diff ~default:(fun () ->
+      raise @@ Tensor.Session_error (provenance ^ ": tensor is not differentiable", Some t))
+
+let grad_update_nochecks loss =
+  let params = get_params loss in
+  let diff = diff_or_error loss "Train.grad_update_nochecks" in
+  let fwd_bprop =
+    let%cd init_grad = loss.grad =: 1 in
+    [%cd
+      ~~(loss "gradient update";
+         ~~(loss "fwd";
+            loss.forward);
+         ~~(loss "zero grads";
+            diff.zero_grads);
+         init_grad;
+         ~~(loss "bprop";
+            diff.backprop))]
+  in
+  { loss; params; fwd_bprop }
+
 (** Returns the tensor's forward, zeroing gradients, and backprop code wrapped with label-derived
     comments. Sets the tensor's value as "fully on host". If [setup_for_parallel] is true (false by
     default), sets the parameters and their gradients as "non-local" (on-device). *)
@@ -140,52 +154,49 @@ let grad_update ?(disable_rootness_check = false) ?(setup_for_parallel = false)
   let params = get_params loss in
   if setup_for_parallel then
     Set.iter params ~f:(fun p -> set_materialized (Option.value_exn ~here:[%here] p.diff).grad);
-  let label = label_suffix loss.value.label in
   let fwd =
     if disable_rootness_check then loss.Tensor.forward else Tensor.consume_forward_code loss
   in
+  let diff = diff_or_error loss "Train.grad_update" in
   let fwd_bprop =
-    match loss.Tensor.diff with
-    | Some diff ->
-        let zero_grads, bprop =
-          if disable_rootness_check then (diff.zero_grads, diff.backprop)
-          else Tensor.consume_backprop_code loss
-        in
-        (* Note: the %cd syntax for [loss.grad] does not modify roots. *)
-        let%cd init_grad = loss.grad =: 1 in
-        Asgns.(
-          Block_comment
-            ( label ^ " gradient update",
-              sequential
-                [
-                  Block_comment (label ^ " fwd", fwd);
-                  Block_comment (label ^ " zero grads", zero_grads);
-                  init_grad;
-                  Block_comment (label ^ " bprop", bprop);
-                ] ))
-    | None ->
-        raise @@ Tensor.Session_error ("Train.grad_update: tensor is not differentiable", Some loss)
+    let zero_grads, bprop =
+      if disable_rootness_check then (diff.zero_grads, diff.backprop)
+      else Tensor.consume_backprop_code loss
+    in
+    (* Note: the %cd syntax for [loss.grad] does not modify roots. *)
+    let%cd init_grad = loss.grad =: 1 in
+    [%cd
+      ~~(loss "gradient update";
+         ~~(loss "fwd";
+            fwd);
+         ~~(loss "zero grads";
+            zero_grads);
+         init_grad;
+         ~~(loss "bprop";
+            bprop))]
   in
-  { loss; label; params; fwd_bprop }
+  { loss; params; fwd_bprop }
 
 (** See: https://github.com/tinygrad/tinygrad/blob/master/tinygrad/nn/optim.py *)
 let sgd_one ~learning_rate ?(momentum = 0.0) ?(weight_decay = 0.0) ?(nesterov = false) p =
   if not @@ is_param p then raise @@ Tensor.Session_error ("Train.sgd_one: not a parameter", Some p);
   [%cd
-    ~~(p "param sgd step");
-    "sgd_delta" =: p.grad + (!.weight_decay *. p);
-    if Float.(momentum > 0.0) then (
-      "sgd_momentum" =: (!.momentum *. sgd_momentum) + sgd_delta;
-      if nesterov then sgd_delta =+ !.momentum *. sgd_momentum else sgd_delta =: sgd_momentum);
-    p =- learning_rate *. sgd_delta]
+    ~~(p "param sgd step";
+       "sgd_delta" =: p.grad + (!.weight_decay *. p);
+       if Float.(momentum > 0.0) then (
+         "sgd_momentum" =: (!.momentum *. sgd_momentum) + sgd_delta;
+         if nesterov then sgd_delta =+ !.momentum *. sgd_momentum else sgd_delta =: sgd_momentum);
+       p =- learning_rate *. sgd_delta)]
 
 let sgd_update ~learning_rate ?momentum ?weight_decay ?nesterov l =
   let code =
     l.params |> Set.to_list
     |> List.map ~f:(sgd_one ~learning_rate ?momentum ?weight_decay ?nesterov)
     |> Asgns.sequential
   in
-  Asgns.Block_comment (l.label ^ " sgd update", code)
+  [%cd
+    ~~(l.loss "sgd update";
+       code)]
 
 (** All and only bindings with associated ranges are iterated, with the binding's initial value
     lost. Bindings without ranges remain at their initial values. *)
@@ -328,8 +339,8 @@ let%track3_sexp parallel_update (type context)
   let grad_merges : Asgns.t array =
     Array.map all_params ~f:(fun p ->
         [%cd
-          ~~("merging gradient of" p);
-          p.grad =+ p.grad.merge])
+          ~~("merging gradient of" p;
+             p.grad =+ p.grad.merge)])
   in
   let grad_merges_to : Backend.routine option array array =
     (* For now, we need all params on all devices. *)
@@ -346,8 +357,8 @@ let%track3_sexp parallel_update (type context)
         link ~from_prior_context:(needs_prior_context updaten.loss) sgd_update.context
         @@ compile Idx.Empty
              [%cd
-               ~~("merging" updaten.loss);
-               updaten.loss.value =+ updaten.loss.value.merge])
+               ~~("merging" updaten.loss;
+                  updaten.loss.value =+ updaten.loss.value.merge)])
   in
   let into_merge_buffer = if copy_to_merge then BT.Copy else BT.Streaming in
   (* Since each device has its own queue, we can iterate over devices in the outer loop. *)
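
A minimal caller-side sketch of how the rewritten pieces compose, assuming the usual OCANNL setup where `loss` is a differentiable tensor and `learning_rate` is a scalar tensor, and using the `Asgns` alias for `Arrayjit.Assignments` as in lib/train.ml; compiling and running the resulting assignments on a backend is out of scope here:

```ocaml
(* Build forward + zero-grads + backprop for [loss] (now wrapped in the new
   nested block comments), then the SGD step over its parameters, and
   sequence them into one assignments program. *)
let training_step ~loss ~learning_rate : Asgns.t =
  let update = Train.grad_update loss in
  let sgd_step = Train.sgd_update ~learning_rate ~momentum:0.9 update in
  Asgns.sequential [ update.fwd_bprop; sgd_step ]
```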
