Remove cuda_printf_fifo_size from Utils.settings; add never_capture_stdout config

lukstafi · lukstafi · commit e97dccc92411 · 2024-12-18T15:37:09.000+01:00
diff --git a/arrayjit/lib/cuda_backend.cudajit.ml b/arrayjit/lib/cuda_backend.cudajit.ml
@@ -132,7 +132,8 @@ module Fresh () = struct
       let dev = { dev; primary_context } in
       set_ctx primary_context;
       if Utils.debug_log_from_routines () && not (Hash_set.mem initialized_devices ordinal) then
-        Option.iter Utils.settings.cuda_printf_fifo_size ~f:Cu.Context.(set_limit PRINTF_FIFO_SIZE);
+        Int.of_string_opt @@ Utils.get_global_arg ~arg_name:"cuda_printf_fifo_size" ~default:""
+        |> Option.iter ~f:Cu.Context.(set_limit PRINTF_FIFO_SIZE);
       Hash_set.add initialized_devices ordinal;
       let result = make_device dev ~ordinal in
       Stdlib.Gc.finalise finalize_device result;
diff --git a/arrayjit/lib/utils.ml b/arrayjit/lib/utils.ml
@@ -42,10 +42,6 @@ type settings = {
   mutable check_half_prec_constants_cutoff : float option;
       (** If given, generic code optimization should fail if a half precision FP16 constant exceeds
           the cutoff. *)
-  mutable cuda_printf_fifo_size : int option;
-      (** If not [None], the setting will be used for the size of the CUDA devices buffer for
-          storing logs, see [debug_log_from_routines] above. If [None], the default buffer size on
-          the devices is not altered. *)
 }
 [@@deriving sexp]
 
@@ -57,7 +53,6 @@ let settings =
     fixed_state_for_init = None;
     print_decimals_precision = 2;
     check_half_prec_constants_cutoff = Some (2. **. 14.);
-    cuda_printf_fifo_size = None;
   }
 
 let accessed_global_args = Hash_set.create (module String)
@@ -367,14 +362,15 @@ let restore_settings () =
     Int.of_string @@ get_global_arg ~arg_name:"print_decimals_precision" ~default:"2";
   settings.check_half_prec_constants_cutoff <-
     Float.of_string_opt
-    @@ get_global_arg ~arg_name:"check_half_prec_constants_cutoff" ~default:"16384.0";
-  settings.cuda_printf_fifo_size <-
-    Int.of_string_opt @@ get_global_arg ~arg_name:"cuda_printf_fifo_size" ~default:""
+    @@ get_global_arg ~arg_name:"check_half_prec_constants_cutoff" ~default:"16384.0"
 
 let () = restore_settings ()
 let with_runtime_debug () = settings.output_debug_files_in_build_directory && settings.log_level > 1
 let debug_log_from_routines () = settings.debug_log_from_routines && settings.log_level > 1
 
+let never_capture_stdout () =
+  Bool.of_string @@ get_global_arg ~arg_name:"never_capture_stdout" ~default:"false"
+
 let enable_runtime_debug () =
   settings.output_debug_files_in_build_directory <- true;
   set_log_level @@ max 2 settings.log_level
@@ -581,8 +577,8 @@ let input_line chan =
   ( n > 0,
     String.chop_suffix_if_exists ~suffix:"\n" @@ String.chop_suffix_if_exists line ~suffix:"\r\n" )
 
-let capture_stdout_logs ?(never_skip = false) arg =
-  if (not never_skip) && not (debug_log_from_routines ()) then arg ()
+let capture_stdout_logs arg =
+  if never_capture_stdout () || not (debug_log_from_routines ()) then arg ()
   else (
     Stdlib.flush Stdlib.stdout;
     let ls = ref [] in
diff --git a/lib/train.ml b/lib/train.ml
@@ -362,8 +362,7 @@ let%track3_sexp parallel_update (type buffer_ptr dev runner event)
     let occupancy ~name:_ ~src_n:_ = true in
     Array.mapi ctxs ~f:(fun dst_n ctx ->
         if occupancy_dst ~dst_n then
-          snd
-          @@ Backend.(link_batch ctx @@ compile_batch ~occupancy Idx.Empty grad_merges)
+          snd @@ Backend.(link_batch ctx @@ compile_batch ~occupancy Idx.Empty grad_merges)
         else [||])
   in
   (* We can cache scheduling, because merging and copying does not depend on static indexing. *)
@@ -534,9 +533,7 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init
     Tn.log_accessible_headers ());
   for epoch = 0 to epochs - 1 do
     epoch_loss := 0.;
-    (* DEBUG: *)
-    (* Utils.capture_stdout_logs *)
-     update ();
+    Utils.capture_stdout_logs update;
     learning_rates := learning_rate.@[0] :: !learning_rates;
     rev_epoch_losses := !epoch_loss :: !rev_epoch_losses;
     Option.iter per_epoch_callback ~f:(fun f ->
@@ -573,8 +570,7 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init
     Tensor.set_values infer values;
     (* For the gccjit backend, infer is only on host, not on device. For cuda, this will be
        needed. *)
-       (* DEBUG: *)
-    (* Utils.capture_stdout_logs @@ fun () -> *)
+    Utils.capture_stdout_logs @@ fun () ->
     assert (Backend.from_host routine.context infer.value);
     run routine;
     assert (Backend.to_host routine.context model_result.value);
diff --git a/ocannl_config.example b/ocannl_config.example
@@ -12,6 +12,42 @@
 # the sources of other configs are printed when the log level > 0.
 # The configuration values below are (one of) the defaults.
 
+# Configurations that are stored as `Utils.settings`:
+
+# The log level, for ppx_minidebug and with a few other uses in OCANNL.
+log_level=1
+
+# If `log_level` is at least 2 and this is true, the generated code will contain
+# printf statements, whose output is then (typically) integrated into ppx_minidebug logs.
+debug_log_from_routines=false
+
+# If true, various intermediate representation files for the compiled code are generated
+# (or not removed). Moreover, if log level is at least 2, the generated binaries will
+# contain debug symbols for debugging with `gdb`, `cuda-gdb` etc.
+output_debug_files_in_build_directory=false
+
+# If given, the integer seed to initialize the randomness library with.
+fixed_state_for_init=
+
+# For printing tensors, etc.
+print_decimals_precision=2
+
+# Complains if a half-precision tensor node is a constant with absolute value exceeding this.
+check_half_prec_constants_cutoff=16384.0
+
+# Other configurations:
+
+# If true, stdout capturing is disabled, so some logs meant for the ppx_minidebug log files
+# (in particular CUDA logs) remain on stdout.
+#
+# NOTE: the current implementation of stdout capture loses information when the channel
+# buffer overflows, so it is important to verify with this setting whether one gets
+# sufficient information in the log files.
+never_capture_stdout=false
+
+# If set and relevant, it's the `CU_LIMIT_PRINTF_FIFO_SIZE` CUDA configuration.
+cuda_printf_fifo_size=
+
 # The `-O` argument to the compiler executable for the `cc` backend.
 cc_backend_optimization_level=3
 
@@ -86,29 +122,3 @@ debug_highlights=
 
 # For ppx_minidebug: a pcre syntax regular expression to highlight in the logs.
 debug_highlight_pcre=
-
-# Configurations that are stored as `Utils.settings`:
-
-# The log level, for ppx_minidebug and with a few other uses in OCANNL.
-log_level=1
-
-# If `log_level` is at least 2 and this is true, the generated code will contain
-# printf statements, whose output is then (typically) integrated into ppx_minidebug logs.
-debug_log_from_routines=false
-
-# If true, various intermediate representation files for the compiled code are generated
-# (or not removed). Moreover, if log level is at least 2, the generated binaries will
-# contain debug symbols for debugging with `gdb`, `cuda-gdb` etc.
-output_debug_files_in_build_directory=false
-
-# If given, the integer seed to initialize the randomness library with.
-fixed_state_for_init=
-
-# For printing tensors, etc.
-print_decimals_precision=2
-
-# Complains if a half-precision tensor node is a constant with absolute value exceeding this.
-check_half_prec_constants_cutoff=16384.0
-
-# If set and relevant, it's the `CU_LIMIT_PRINTF_FIFO_SIZE` CUDA configuration.
-cuda_printf_fifo_size=
diff --git a/todo.md b/todo.md
@@ -3,4 +3,4 @@
 
 Update `anatomy_of_a_backend.md`
 Update introductory slides {cm:2024-12-17}
-Config to skip capturing logs from stdout
+Config to skip capturing logs from stdout {cm:2024-12-18}