Skip to content

Commit e97dccc

Browse files
committed
Remove cuda_printf_fifo_size from Utils.settings; add never_capture_stdout config
1 parent 263bc43 commit e97dccc

File tree

5 files changed

+48
-45
lines changed

5 files changed

+48
-45
lines changed

arrayjit/lib/cuda_backend.cudajit.ml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,8 @@ module Fresh () = struct
132132
let dev = { dev; primary_context } in
133133
set_ctx primary_context;
134134
if Utils.debug_log_from_routines () && not (Hash_set.mem initialized_devices ordinal) then
135-
Option.iter Utils.settings.cuda_printf_fifo_size ~f:Cu.Context.(set_limit PRINTF_FIFO_SIZE);
135+
Int.of_string_opt @@ Utils.get_global_arg ~arg_name:"cuda_printf_fifo_size" ~default:""
136+
|> Option.iter ~f:Cu.Context.(set_limit PRINTF_FIFO_SIZE);
136137
Hash_set.add initialized_devices ordinal;
137138
let result = make_device dev ~ordinal in
138139
Stdlib.Gc.finalise finalize_device result;

arrayjit/lib/utils.ml

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,6 @@ type settings = {
4242
mutable check_half_prec_constants_cutoff : float option;
4343
(** If given, generic code optimization should fail if a half precision FP16 constant exceeds
4444
the cutoff. *)
45-
mutable cuda_printf_fifo_size : int option;
46-
(** If not [None], the setting will be used for the size of the CUDA devices buffer for
47-
storing logs, see [debug_log_from_routines] above. If [None], the default buffer size on
48-
the devices is not altered. *)
4945
}
5046
[@@deriving sexp]
5147

@@ -57,7 +53,6 @@ let settings =
5753
fixed_state_for_init = None;
5854
print_decimals_precision = 2;
5955
check_half_prec_constants_cutoff = Some (2. **. 14.);
60-
cuda_printf_fifo_size = None;
6156
}
6257

6358
let accessed_global_args = Hash_set.create (module String)
@@ -367,14 +362,15 @@ let restore_settings () =
367362
Int.of_string @@ get_global_arg ~arg_name:"print_decimals_precision" ~default:"2";
368363
settings.check_half_prec_constants_cutoff <-
369364
Float.of_string_opt
370-
@@ get_global_arg ~arg_name:"check_half_prec_constants_cutoff" ~default:"16384.0";
371-
settings.cuda_printf_fifo_size <-
372-
Int.of_string_opt @@ get_global_arg ~arg_name:"cuda_printf_fifo_size" ~default:""
365+
@@ get_global_arg ~arg_name:"check_half_prec_constants_cutoff" ~default:"16384.0"
373366

374367
(* Populate [settings] from the global arguments as soon as this module is initialized. *)
let () = restore_settings ()
375368
(** [with_runtime_debug ()] is [true] when
    [settings.output_debug_files_in_build_directory] is set and
    [settings.log_level] is greater than 1. *)
let with_runtime_debug () =
  let verbose_enough = settings.log_level > 1 in
  settings.output_debug_files_in_build_directory && verbose_enough
376369
(** [debug_log_from_routines ()] is [true] when the
    [settings.debug_log_from_routines] flag is set and [settings.log_level]
    is greater than 1. *)
let debug_log_from_routines () =
  let verbose_enough = settings.log_level > 1 in
  settings.debug_log_from_routines && verbose_enough
377370

371+
(** [never_capture_stdout ()] reads the ["never_capture_stdout"] global
    argument (default ["false"]) and converts it to a boolean. The argument is
    looked up on every call rather than cached in [settings].

    NOTE(review): presumably raises when the configured value is not a valid
    boolean string -- confirm against the [Bool.of_string] in scope. *)
let never_capture_stdout () =
  get_global_arg ~arg_name:"never_capture_stdout" ~default:"false" |> Bool.of_string
373+
378374
let enable_runtime_debug () =
379375
settings.output_debug_files_in_build_directory <- true;
380376
set_log_level @@ max 2 settings.log_level
@@ -581,8 +577,8 @@ let input_line chan =
581577
( n > 0,
582578
String.chop_suffix_if_exists ~suffix:"\n" @@ String.chop_suffix_if_exists line ~suffix:"\r\n" )
583579

584-
let capture_stdout_logs ?(never_skip = false) arg =
585-
if (not never_skip) && not (debug_log_from_routines ()) then arg ()
580+
let capture_stdout_logs arg =
581+
if never_capture_stdout () || not (debug_log_from_routines ()) then arg ()
586582
else (
587583
Stdlib.flush Stdlib.stdout;
588584
let ls = ref [] in

lib/train.ml

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -362,8 +362,7 @@ let%track3_sexp parallel_update (type buffer_ptr dev runner event)
362362
let occupancy ~name:_ ~src_n:_ = true in
363363
Array.mapi ctxs ~f:(fun dst_n ctx ->
364364
if occupancy_dst ~dst_n then
365-
snd
366-
@@ Backend.(link_batch ctx @@ compile_batch ~occupancy Idx.Empty grad_merges)
365+
snd @@ Backend.(link_batch ctx @@ compile_batch ~occupancy Idx.Empty grad_merges)
367366
else [||])
368367
in
369368
(* We can cache scheduling, because merging and copying does not depend on static indexing. *)
@@ -534,9 +533,7 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init
534533
Tn.log_accessible_headers ());
535534
for epoch = 0 to epochs - 1 do
536535
epoch_loss := 0.;
537-
(* DEBUG: *)
538-
(* Utils.capture_stdout_logs *)
539-
update ();
536+
Utils.capture_stdout_logs update;
540537
learning_rates := learning_rate.@[0] :: !learning_rates;
541538
rev_epoch_losses := !epoch_loss :: !rev_epoch_losses;
542539
Option.iter per_epoch_callback ~f:(fun f ->
@@ -573,8 +570,7 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init
573570
Tensor.set_values infer values;
574571
(* For the gccjit backend, infer is only on host, not on device. For cuda, this will be
575572
needed. *)
576-
(* DEBUG: *)
577-
(* Utils.capture_stdout_logs @@ fun () -> *)
573+
Utils.capture_stdout_logs @@ fun () ->
578574
assert (Backend.from_host routine.context infer.value);
579575
run routine;
580576
assert (Backend.to_host routine.context model_result.value);

ocannl_config.example

Lines changed: 36 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,42 @@
1212
# the sources of other configs are printed when the log level > 0.
1313
# The configuration values below are (one of) the defaults.
1414

15+
# Configurations that are stored as `Utils.settings`:
16+
17+
# The log level, for ppx_minidebug and with a few other uses in OCANNL.
18+
log_level=1
19+
20+
# If `log_level` is at least 2 and this is true, the generated code will contain
21+
# printf statements, whose output is then (typically) integrated into ppx_minidebug logs.
22+
debug_log_from_routines=false
23+
24+
# If true, various intermediate representation files for the compiled code are generated
25+
# (or not removed). Moreover, if log level is at least 2, the generated binaries will
26+
# contain debug symbols for debugging with `gdb`, `cuda-gdb` etc.
27+
output_debug_files_in_build_directory=false
28+
29+
# If given, the integer seed to initialize the randomness library with.
30+
fixed_state_for_init=
31+
32+
# For printing tensors, etc.
33+
print_decimals_precision=2
34+
35+
# Complains if a half-precision tensor node is a constant with absolute value exceeding this.
36+
check_half_prec_constants_cutoff=16384.0
37+
38+
# Other configurations:
39+
40+
# If true, stdout capturing is disabled, so some logs meant for the ppx_minidebug log files
41+
# (in particular CUDA logs) remain on stdout.
42+
#
43+
# NOTE: the current implementation of stdout capture loses information on channel
44+
# buffering overflows, so it is important to verify with this setting if one gets
45+
# sufficient information in the log files.
46+
never_capture_stdout=false
47+
48+
# If set and relevant, it's the `CU_LIMIT_PRINTF_FIFO_SIZE` CUDA configuration.
49+
cuda_printf_fifo_size=
50+
1551
# The `-O` argument to the compiler executable for the `cc` backend.
1652
cc_backend_optimization_level=3
1753

@@ -86,29 +122,3 @@ debug_highlights=
86122

87123
# For ppx_minidebug: a pcre syntax regular expression to highlight in the logs.
88124
debug_highlight_pcre=
89-
90-
# Configurations that are stored as `Utils.settings`:
91-
92-
# The log level, for ppx_minidebug and with a few other uses in OCANNL.
93-
log_level=1
94-
95-
# If `log_level` is at least 2 and this is true, the generated code will contain
96-
# printf statements, whose output is then (typically) integrated into ppx_minidebug logs.
97-
debug_log_from_routines=false
98-
99-
# If true, various intermediate representation files for the compiled code are generated
100-
# (or not removed). Moreover, if log level is at least 2, the generated binaries will
101-
# contain debug symbols for debugging with `gdb`, `cuda-gdb` etc.
102-
output_debug_files_in_build_directory=false
103-
104-
# If given, the integer seed to initialize the randomness library with.
105-
fixed_state_for_init=
106-
107-
# For printing tensors, etc.
108-
print_decimals_precision=2
109-
110-
# Complains if a half-precision tensor node is a constant with absolute value exceeding this.
111-
check_half_prec_constants_cutoff=16384.0
112-
113-
# If set and relevant, it's the `CU_LIMIT_PRINTF_FIFO_SIZE` CUDA configuration.
114-
cuda_printf_fifo_size=

todo.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33

44
Update `anatomy_of_a_backend.md`
55
Update introductory slides {cm:2024-12-17}
6-
Config to skip capturing logs from stdout
6+
Config to skip capturing logs from stdout {cm:2024-12-18}

0 commit comments

Comments
 (0)