Merge branch 'master' of https://github.com/ahrefs/ocannl

lukstafi · lukstafi · commit b6f271d6237f · 2025-05-21T13:49:16.000+02:00
diff --git a/README.md b/README.md
@@ -60,7 +60,7 @@ A possible route to learning OCANNL:
 
 To use debugging as provided by configuring `Utils.settings.debug_log_from_routines <- true` with the `cuda` backend, you need to wrap the code scheduling tasks and synchronizing `cuda` devices with `Utils.capture_stdout_logs`. The reason is that CUDA kernels are allowed to use `printf`, but not `fprintf` -- the driver dumps the printing buffer of a device to `stdout` at certain times (e.g. when synchronizing the device). For an example, see the implementation of `Train.example_train_loop`. Specifically, it wraps two sections: the call to `Train.parallel_update`, and the body of the returned `infer_callback`.
 
-IMPORTANT: debug logging from CUDA in complex settings currently only works as intended for _very_ small computation sizes. If facing issues, try the setting `never_capture_stdout=true` (see [ocannl_config.example](ocannl_config.example)).
+NOTE: debug logging from CUDA in complex settings is a bit tricky: it involves another thread (domain) intercepting and filtering `stdout`. If facing issues, try the setting `never_capture_stdout=true` (see [ocannl_config.example](ocannl_config.example)).
 
 ## Upcoming milestones
 
diff --git a/arrayjit/lib/anatomy_of_a_backend.md b/arrayjit/lib/anatomy_of_a_backend.md
@@ -143,7 +143,7 @@ We output a log line only for comments and array assignments (corresponding to n
 
 #### Tracing via `stdout`
 
-Since the CUDA backend can only log to the standard output, it passes `let logs_to_stdout = true` to `C_syntax`. This uses `printf`, and prefixes each log line with a kernel run ID. When postprocessing the logs, each run extracts its own log lines. Simultaneous logging from multiple CUDA devices should still be clean -- without interleaving lines -- because the driver is supposed to dump the logs to standard output at device synchronization points.
+Since the CUDA backend can only log to the standard output, it uses `printf`, and prefixes each log line with a kernel run ID. When postprocessing the logs, each run extracts its own log lines. Simultaneous logging from multiple CUDA devices should still be clean -- without interleaving lines -- because the driver is supposed to dump the logs to standard output at device synchronization points.
 
 When using the default stream, CUDA would predictably write to the standard output at context synchronization only. Unfortunately, it does not appear to be the case with asynchronous streams. [Despite the assurance from the documentation, output happens in between CUDA calls...](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#formatted-output) To remedy this, we implement a `stdout` filtering scheme (function `Utils.capture_stdout_logs`), where all output is captured, tracing lines extracted, and other output printed on the original `stdout`.
 
diff --git a/arrayjit/lib/utils.ml b/arrayjit/lib/utils.ml
@@ -655,27 +655,33 @@ let capture_stdout_logs arg =
     (* The reader domain will close pipe_read_fd. *)
 
     let collected_logs_ref = ref [] in
-    let passthrough_lines_ref = ref [] in (* Buffer for non-log lines *)
     let reader_domain_failed = Atomic.make false in
 
     let reader_domain_logic () =
       let in_channel = Unix.in_channel_of_descr pipe_read_fd in
+      (* Wrap a duplicate of the original stdout descriptor in its own channel; non-log lines are written to it as soon as they are read. *)
+      let orig_out = Unix.out_channel_of_descr (Unix.dup original_stdout_fd) in
       try
         while true do
           let _is_endlined, line = input_line in_channel in
           match String.chop_prefix ~prefix:!captured_log_prefix line with
           | Some logline -> collected_logs_ref := logline :: !collected_logs_ref
-          | None -> passthrough_lines_ref := line :: !passthrough_lines_ref (* Buffer the line *)
+          | None -> 
+              (* Not a log line: write it, newline-terminated, to the original stdout and flush right away. *)
+              Stdlib.output_string orig_out (line ^ "\n");
+              Stdlib.flush orig_out
         done;
+        Stdlib.close_out_noerr orig_out; (* NOTE(review): [while true] only exits by raising, so this line and the next look unreachable -- confirm. *)
         Stdlib.close_in_noerr in_channel (* This closes pipe_read_fd *)
       with
-      | End_of_file -> () (* Normal termination of the reader *)
-      | exn ->
-          Atomic.set reader_domain_failed true;
-          Stdio.eprintf "Exception in stdout reader domain: %s\\nBacktrace:\\n%s\\n%!"
-            (Exn.to_string exn) (Stdlib.Printexc.get_backtrace ());
+        | End_of_file -> () (* Normal termination of the reader. NOTE(review): this branch closes neither orig_out nor in_channel -- confirm both are released elsewhere, otherwise each call leaks two descriptors. *)
+        | exn ->
+            Stdlib.close_out_noerr orig_out;
+            Atomic.set reader_domain_failed true;
+            Stdio.eprintf "Exception in stdout reader domain: %s\\nBacktrace:\\n%s\\n%!"
+              (Exn.to_string exn) (Stdlib.Printexc.get_backtrace ());
             Stdlib.close_in_noerr in_channel (* This closes pipe_read_fd *);
-          Stdlib.Printexc.raise_with_backtrace exn (Stdlib.Printexc.get_raw_backtrace ())
+            Stdlib.Printexc.raise_with_backtrace exn (Stdlib.Printexc.get_raw_backtrace ())
     in
 
     let reader_domain = Domain.spawn reader_domain_logic in
@@ -707,8 +713,6 @@ let capture_stdout_logs arg =
             ~f:(fun { log_processor_prefix; process_logs } ->
               process_logs
               @@ List.filter_map captured_output ~f:(String.chop_prefix ~prefix:log_processor_prefix));
-          (* Print passthrough lines even if arg() failed, if reader was ok *)
-          List.iter (List.rev !passthrough_lines_ref) ~f:Stdlib.print_endline;
         );
         captured_log_processors := []; (* Clear processors *)
         Stdlib.Printexc.raise_with_backtrace exn (Stdlib.Printexc.get_raw_backtrace ())
@@ -740,10 +744,7 @@ let capture_stdout_logs arg =
             ~f:(fun { log_processor_prefix; process_logs } ->
               process_logs
               @@ List.filter_map captured_output ~f:(String.chop_prefix ~prefix:log_processor_prefix)))
-        ~finally:(fun () -> captured_log_processors := []);
-      
-      (* Then print passthrough lines to the now-restored original stdout *)
-      List.iter (List.rev !passthrough_lines_ref) ~f:Stdlib.print_endline;
+        ~finally:(fun () -> captured_log_processors := [])
     ) else (
         captured_log_processors := []; (* Clear processors if reader failed *)
     );