
Commit 56f3e7b

Restore the functionality of debug logging from the cuda backend
1 parent 5bc72f6 commit 56f3e7b

File tree

5 files changed: +83, -25 lines


CHANGES.md

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@
 - Reduced busy-waiting inside `c_compile_and_load`, propagating compilation errors now instead of infinite loop on error.
 - Fixed loss of significant digits for small numbers when outputting files.
 - Added missing mixed-precision conversions in the `C_syntax` backend builder.
+- Restored the functionality of debug logging from the cuda backend.
 
 ## [0.4.0] -- 2024-09-04
 

README.md

Lines changed: 6 additions & 0 deletions
@@ -52,6 +52,12 @@ A possible route to learning OCANNL:
 2. Backend-independent optimizations [arrayjit/lib/lowering_and_inlining.md](arrayjit/lib/lowering_and_inlining.md) -- _lowering_ means translating (compiling) from the high-level representation (as assignments) to the low-level representation.
 3. More documentation to come.
 
+### Using the tracing debugger with CUDA computations
+
+To use debugging as provided by configuring `Utils.settings.debug_log_from_routines <- true` with the `cuda` backend, you need to wrap the code scheduling tasks and synchronizing `cuda` devices with `Utils.capture_stdout_logs`. The reason is that CUDA kernels are allowed to use `printf`, but not `fprintf` -- the driver dumps the printing buffer of a device to `stdout` at certain times (e.g. when synchronizing the device). For an example, see the implementation of `Train.example_train_loop`. Specifically, it wraps two sections: the call to `Train.parallel_update`, and the body of the returned `infer_callback`.
+
+IMPORTANT: due to potential bugs, debug logging from CUDA in complex settings currently only works as intended for _very_ small computation sizes.
+
 ## Upcoming milestones
 
 This is very tentative.
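
In user code, the wrapping described in the README addition might look like the sketch below. This is a minimal, hedged illustration: `Backend`, `device`, `routine`, and `Train.run` stand in for an already-configured cuda backend, device, and compiled routine, and are not the exact API surface.

(* Minimal sketch of the wrapping described in the README section above.
   [Backend], [device], [routine] and [Train.run] are hypothetical
   stand-ins, not the confirmed API. *)
let () =
  Utils.settings.debug_log_from_routines <- true;
  (* Schedule work and synchronize the device inside the capture, so the
     driver dumps its printf buffer to stdout while stdout is redirected. *)
  Utils.capture_stdout_logs (fun () ->
      Train.run routine;
      Backend.await device)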

arrayjit/lib/cuda_backend.cudajit.ml

Lines changed: 11 additions & 2 deletions
@@ -63,6 +63,9 @@ let is_initialized, initialize =
 let num_physical_devices = Cudajit.device_get_count
 let devices = ref @@ Core.Weak.create 0
 
+(* Unlike [devices] above, [initialized_devices] never forgets its entries. *)
+let initialized_devices = Hash_set.create (module Int)
+
 let set_ctx ctx =
   let cur_ctx = Cudajit.ctx_get_current () in
   if not @@ phys_equal ctx cur_ctx then Cudajit.ctx_set_current ctx
@@ -98,6 +101,9 @@ let get_device ~(ordinal : int) : physical_device =
   let primary_context = Cudajit.device_primary_ctx_retain dev in
   let copy_merge_buffer_capacity = 8 in
   set_ctx primary_context;
+  if Utils.debug_log_from_routines () && not (Hash_set.mem initialized_devices ordinal) then
+    Option.iter Utils.settings.cuda_printf_fifo_size ~f:Cudajit.(ctx_set_limit PRINTF_FIFO_SIZE);
+  Hash_set.add initialized_devices ordinal;
   let copy_merge_buffer = Cudajit.mem_alloc ~size_in_bytes:copy_merge_buffer_capacity in
   let result =
     {
@@ -147,7 +153,8 @@ let get_name device =
 
 let await device : unit =
   set_ctx device.physical.primary_context;
-  Cudajit.stream_synchronize device.stream
+  Cudajit.stream_synchronize device.stream;
+  Option.iter !Utils.advance_captured_logs ~f:(fun callback -> callback ())
 
 let is_idle device = Cudajit.stream_is_ready device.stream
 
@@ -188,6 +195,7 @@ let unsafe_cleanup () =
       if Atomic.compare_and_set device.released false true then (
         Cudajit.ctx_set_current device.primary_context;
         Cudajit.ctx_synchronize ();
+        Option.iter !Utils.advance_captured_logs ~f:(fun callback -> callback ());
         Cudajit.device_primary_ctx_release device.dev))
   done;
   Core.Weak.fill !devices 0 len None
@@ -477,12 +485,13 @@ let link_proc ~prior_context ~name ~(params : (string * param_source) list) ~glo
   (* Map.iteri global_arrays ~f:(fun ~key ~data:ptr -> if key.Low_level.zero_initialized then
      Cu.memset_d8_async ptr Unsigned.UChar.zero ~length:(Tn.size_in_bytes key.Low_level.tn)); *)
   [%log "launching the kernel"];
+  (* TODO: This doesn't help. *)
+  (* Option.iter !Utils.advance_captured_logs ~f:(fun callback -> callback ()); *)
   (if Utils.debug_log_from_routines () then
      Utils.add_log_processor ~prefix:log_id_prefix @@ fun _output ->
      [%log_block
        context.label;
        Utils.log_trace_tree _output]);
-  (* if Utils.debug_log_from_routines () then Cu.ctx_set_limit CU_LIMIT_PRINTF_FIFO_SIZE 4096; *)
   Cu.launch_kernel func ~grid_dim_x:1 ~block_dim_x:1 ~shared_mem_bytes:0 context.device.stream
     args;
   [%log "kernel launched"]
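
A hedged usage sketch of the new knob: enlarging the driver's printf FIFO before the first device initialization. The module path `Cuda_backend` and the 4 MiB figure are illustrative assumptions; the `initialized_devices` guard means the limit is applied at most once per ordinal. Equivalently, `restore_settings` now reads a `cuda_printf_fifo_size` global argument, as shown in the `utils.ml` diff below.

(* Illustrative only: enlarge the CUDA printf FIFO used for captured logs.
   [Cuda_backend] and the 4 MiB size are assumptions, not the confirmed API. *)
let () =
  Utils.settings.debug_log_from_routines <- true;
  Utils.settings.cuda_printf_fifo_size <- Some (4 * 1024 * 1024);
  (* The limit is set inside [get_device] the first time this ordinal is seen. *)
  ignore (Cuda_backend.get_device ~ordinal:0)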

arrayjit/lib/utils.ml

Lines changed: 63 additions & 22 deletions
@@ -43,6 +43,10 @@ type settings = {
   mutable check_half_prec_constants_cutoff : float option;
       (** If given, generic code optimization should fail if a half precision FP16 constant exceeds
           the cutoff. *)
+  mutable cuda_printf_fifo_size : int option;
+      (** If not [None], the setting will be used for the size of the CUDA devices buffer for
+          storing logs, see [debug_log_from_routines] above. If [None], the default buffer size on
+          the devices is not altered. *)
 }
 [@@deriving sexp]
 
@@ -55,6 +59,7 @@ let settings =
     fixed_state_for_init = None;
     print_decimals_precision = 2;
     check_half_prec_constants_cutoff = Some (2. **. 14.);
+    cuda_printf_fifo_size = None;
   }
 
 let accessed_global_args = Hash_set.create (module String)
@@ -321,7 +326,9 @@ let restore_settings () =
     Int.of_string @@ get_global_arg ~arg_name:"print_decimals_precision" ~default:"2";
   settings.check_half_prec_constants_cutoff <-
     Float.of_string_opt
-    @@ get_global_arg ~arg_name:"check_half_prec_constants_cutoff" ~default:"16384.0"
+    @@ get_global_arg ~arg_name:"check_half_prec_constants_cutoff" ~default:"16384.0";
+  settings.cuda_printf_fifo_size <-
+    Int.of_string_opt @@ get_global_arg ~arg_name:"cuda_printf_fifo_size" ~default:""
 
 let () = restore_settings ()
 let with_runtime_debug () = settings.output_debug_files_in_build_directory && settings.log_level > 1
@@ -507,6 +514,10 @@ let pp_file ~base_name ~extension =
 
 let captured_log_prefix = ref "!@#"
 
+(** To avoid the complication of a concurrent thread, we expose a callback for collaborative log
+    processing. *)
+let advance_captured_logs = ref None
+
 type captured_log_processor = { log_processor_prefix : string; process_logs : string list -> unit }
 
 let captured_log_processors : captured_log_processor list ref = ref []
@@ -515,39 +526,69 @@ let add_log_processor ~prefix process_logs =
   captured_log_processors :=
     { log_processor_prefix = prefix; process_logs } :: !captured_log_processors
 
+external input_scan_line : Stdlib.in_channel -> int = "caml_ml_input_scan_line"
+
+let input_line chan =
+  let n = input_scan_line chan in
+  if n = 0 then raise End_of_file;
+  let line = Stdlib.really_input_string chan (abs n) in
+  ( n > 0,
+    String.chop_suffix_if_exists ~suffix:"\n" @@ String.chop_suffix_if_exists line ~suffix:"\r\n" )
+
 let capture_stdout_logs ?(never_skip = false) arg =
   if (not never_skip) && not (debug_log_from_routines ()) then arg ()
   else (
     Stdlib.flush Stdlib.stdout;
-    let exitp, entrancep = Unix.pipe () and backup = Unix.dup Unix.stdout in
-    Unix.dup2 entrancep Unix.stdout;
-    Unix.set_nonblock entrancep;
-    (* FIXME: process logs in a parallel thread, and double check they are not getting cut off. *)
+    let ls = ref [] in
+    let lastl = ref "" in
+    let backup = ref (Unix.dup Unix.stdout) in
+    let exit_entrance = ref (Unix.pipe ()) in
+    let pre_advance () =
+      Unix.dup2 (snd !exit_entrance) Unix.stdout;
+      Unix.set_nonblock (snd !exit_entrance)
+    in
+    let advance is_last () =
+      Stdlib.flush Stdlib.stdout;
+      Unix.close (snd !exit_entrance);
+      Unix.dup2 !backup Unix.stdout;
+      let channel = Unix.in_channel_of_descr (fst !exit_entrance) in
+      (try
+         while true do
+           let is_endlined, line = input_line channel in
+           let line = !lastl ^ line in
+           if is_endlined then (
+             (match String.chop_prefix ~prefix:!captured_log_prefix line with
+             | None -> Stdlib.print_endline line
+             (* ls := line :: !ls *)
+             | Some logline -> ls := logline :: !ls);
+             lastl := "")
+           else lastl := line
+         done
+       with End_of_file -> ());
+      if not is_last then (
+        backup := Unix.dup Unix.stdout;
+        exit_entrance := Unix.pipe ();
+        pre_advance ())
+    in
+    advance_captured_logs := Some (advance false);
+    pre_advance ();
     let result =
       try arg ()
      with Sys_blocked_io ->
+        advance_captured_logs := None;
        invalid_arg
          "capture_stdout_logs: unfortunately, flushing stdout inside captured code is prohibited"
    in
-    Stdlib.flush Stdlib.stdout;
-    Unix.close entrancep;
-    Unix.dup2 backup Unix.stdout;
-    let ls = ref [] and channel = Unix.in_channel_of_descr exitp in
-    let output =
-      try
-        while true do
-          let line = Stdlib.input_line channel in
-          match String.chop_prefix ~prefix:!captured_log_prefix line with
-          | None -> Stdlib.print_endline line
-          | Some logline -> ls := logline :: !ls
-        done;
-        []
-      with End_of_file -> List.rev !ls
-    in
+    advance true ();
+    let output = List.rev !ls in
     Exn.protect
      ~f:(fun () ->
-        List.iter !captured_log_processors ~f:(fun { log_processor_prefix; process_logs } ->
+        (* Preserve the order in which kernels were launched. *)
+        List.iter (List.rev !captured_log_processors)
+          ~f:(fun { log_processor_prefix; process_logs } ->
            process_logs
            @@ List.filter_map output ~f:(String.chop_prefix ~prefix:log_processor_prefix)))
-      ~finally:(fun () -> captured_log_processors := []);
+      ~finally:(fun () ->
+        advance_captured_logs := None;
+        captured_log_processors := []);
    result)
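
The rewritten `capture_stdout_logs` pivots on `dup2`-redirecting `stdout` into a pipe that gets drained on each `advance`. Below is a self-contained sketch of just that single redirect-and-drain round, omitting the non-blocking write end, the log-prefix filtering, the log processors, and the re-arming that the real `pre_advance`/`advance` pair performs (requires the `unix` library; `capture_once` is a made-up name for illustration).

(* Standalone sketch of one capture round. Unlike the real implementation it
   reads blocking and never re-arms the pipe, so [f] must print less than the
   OS pipe buffer (typically 64 KiB); the real code sets the write end
   non-blocking so an overflow raises Sys_blocked_io instead of deadlocking.
   Error handling if [f] raises is also omitted. *)
let capture_once (f : unit -> 'a) : 'a * string list =
  Stdlib.flush Stdlib.stdout;
  let readable, writable = Unix.pipe () in
  let backup = Unix.dup Unix.stdout in
  Unix.dup2 writable Unix.stdout;
  let result = f () in
  Stdlib.flush Stdlib.stdout;
  Unix.dup2 backup Unix.stdout;
  Unix.close writable;
  (* Both write ends are now closed, so reading terminates with End_of_file. *)
  let channel = Unix.in_channel_of_descr readable in
  let lines = ref [] in
  (try
     while true do
       lines := Stdlib.input_line channel :: !lines
     done
   with End_of_file -> ());
  Stdlib.close_in channel;
  Unix.close backup;
  (result, List.rev !lines)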

lib/train.ml

Lines changed: 2 additions & 1 deletion
@@ -484,7 +484,7 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init
       Tn.log_accessible_headers ());
   for epoch = 0 to epochs - 1 do
     epoch_loss := 0.;
-    update ();
+    Utils.capture_stdout_logs update;
     learning_rates := learning_rate.@[0] :: !learning_rates;
     epoch_losses := !epoch_loss :: !epoch_losses;
     Option.iter per_epoch_callback ~f:(fun f ->
@@ -509,6 +509,7 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init
     Tensor.set_values infer values;
     (* For the gccjit backend, infer is only on host, not on device. For cuda, this will be
        needed. *)
+    Utils.capture_stdout_logs @@ fun () ->
     assert (Backend.from_host routine.context infer.value);
     run routine;
     assert (Backend.to_host routine.context model_result.value);
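
A note on the `Utils.capture_stdout_logs @@ fun () ->` form added above: the function body extends to the end of the callback, so the host transfers and the routine run are all captured together. A tiny self-contained demonstration of this scoping (the `with_banner` helper is made up for illustration):

(* [g @@ fun () -> e1; e2] passes the whole [e1; e2] sequence as the thunk:
   here both prints happen between "begin" and "end". *)
let with_banner thunk =
  print_endline "begin";
  thunk ();
  print_endline "end"

let () =
  with_banner @@ fun () ->
  print_endline "first";
  print_endline "second"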
