Commit b9987fa

Add backend debugging and missing synchronization between epochs

1 parent 0159bfd

7 files changed: +44 −7 lines

arrayjit/lib/backend_intf.ml

Lines changed: 6 additions & 1 deletion

@@ -79,7 +79,6 @@ module type Device_config = sig
   val name : string
 end

-
 type ('buffer_ptr, 'dev, 'runner, 'event) device_ref = {
   dev : 'dev;
   ordinal : int;
@@ -270,6 +269,12 @@ module type Backend_device_common = sig
   val get_used_memory : device -> int
   (** Returns (an upper bound of) the memory used for arrays, in bytes. *)

+  val get_global_debug_info : unit -> Sexp.t
+  (** Global debug information; backend-specific and might evolve independently on the backends. *)
+
+  val get_debug_info : stream -> Sexp.t
+  (** Per-stream debug information; backend-specific and might evolve independently on the backends *)
+
   val await : stream -> unit
   (** Blocks till the stream becomes idle, i.e. synchronizes the stream. *)

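Both new functions return a Sexp.t that callers pretty-print; a minimal sketch of the call pattern (hypothetical caller, mirroring the prints this commit adds in bin/moons_benchmark.ml and lib/train.ml below):

  (* Sketch: pretty-print the new debug reports for some module [Backend]
     satisfying [Backend_device_common] and one of its [stream]s. *)
  Stdlib.Format.printf "Global debug info: %a\n%!" Sexp.pp_hum
  @@ Backend.get_global_debug_info ();
  Stdlib.Format.printf "Stream debug info: %a\n%!" Sexp.pp_hum
  @@ Backend.get_debug_info stream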
arrayjit/lib/cuda_backend.cudajit.ml

Lines changed: 10 additions & 0 deletions

@@ -476,3 +476,13 @@ let%track3_sexp link_batch prior_context (code_batch : code_batch) ctx_arrays =
           Some task))
   in
   (lowered_bindings, procs)
+
+let get_global_debug_info () =
+  Sexp.message "cuda_global_debug"
+    [ ("live_streams", [%sexp_of: int] @@ Cudajit.Stream.get_total_live_streams ()) ]
+
+let get_debug_info (stream : stream) =
+  let tot, unr, unf = Cudajit.Stream.total_unreleased_unfinished_delimited_events stream.runner in
+  let i2s = [%sexp_of: int] in
+  Sexp.message "cuda_stream_debug"
+    [ ("total_events", i2s tot); ("unreleased_events", i2s unr); ("unfinished_events", i2s unf) ]

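Sexp.message builds a labeled list, (tag (field value) ...), so with illustrative counts the per-stream report renders along the lines of:

  (cuda_stream_debug (total_events 12) (unreleased_events 3) (unfinished_events 1))

Unreleased or unfinished event counts that keep rising across epochs would point at an event leak, which is presumably what this instrumentation is meant to expose.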
arrayjit/lib/cuda_backend.missing.ml

Lines changed: 2 additions & 0 deletions

@@ -71,4 +71,6 @@ let num_devices () = 0
 let suggested_num_streams Unimplemented_device = 0
 let get_ctx_stream Unimplemented_ctx = Unimplemented_stream
 let to_ordinal _stream = 0
+let get_global_debug_info () = Sexp.message "global_debug" []
+let get_debug_info Unimplemented_stream = Sexp.message "stream_debug" []
 let name = "cuda"

arrayjit/lib/schedulers.ml

Lines changed: 5 additions & 0 deletions

@@ -206,6 +206,9 @@ module Multicore (Backend : For_add_scheduler) :
     let stream = spinup_stream ~stream_id:!latest_stream_id in
     Stdlib.Gc.finalise cleanup_stream stream;
     stream
+
+  let get_global_debug_info () = Sexp.message "global_debug" []
+  let get_debug_info (stream : stream) = sexp_of_runner stream.runner
 end

 (** For debugging, allow [Sync_scheduler(...).suggested_num_streams] calls to return >1 numbers. *)
@@ -263,4 +266,6 @@ module Sync (Backend : For_add_scheduler) = struct
   let initialize = Backend.initialize
   let is_initialized = Backend.is_initialized
   let schedule_task _stream task = Task.run task
+  let get_global_debug_info () = Sexp.message "global_debug" []
+  let get_debug_info (stream : stream) = sexp_of_runner stream.runner
 end

bin/moons_benchmark.ml

Lines changed: 8 additions & 3 deletions

@@ -80,13 +80,16 @@ let classify_moons ~seed ~on_device ~inlining_cutoff ~num_streams ~batch_size ~b
   let weight_decay = 0.0002 in
   Arrayjit.Schedulers.sync_suggested_num_streams := num_streams;
   let module Backend = (val Arrayjit.Backends.fresh_backend ~backend_name ()) in
+  Stdlib.Format.printf "Initial backend global debug info: %a\n%!" Sexp.pp_hum
+  @@ Backend.get_global_debug_info ();
   let per_batch_callback ~at_batch:_ ~at_step:_ ~learning_rate:_ ~batch_loss:_ ~epoch_loss:_ =
     if Option.is_none !start_time then start_time := Some (Time_now.nanoseconds_since_unix_epoch ())
   in
   (* Tn.print_accessible_headers (); *)
   let per_epoch_callback ~at_step ~at_epoch ~learning_rate ~epoch_loss =
     Stdio.printf "Epoch=%d, step=%d, lr=%f, epoch loss=%f\n%!" at_epoch at_step learning_rate
-      epoch_loss
+      epoch_loss;
+
   in
   Backend.initialize Train.BT.Most_parallel_streams;
   let {
@@ -101,7 +104,7 @@ let classify_moons ~seed ~on_device ~inlining_cutoff ~num_streams ~batch_size ~b
   } =
     Train.example_train_loop ~seed ~batch_size ~init_lr ~max_num_streams:num_streams ~data_len
       ~epochs ~inputs:moons_flat ~outputs:moons_classes ~model:mlp ~loss_fn ~weight_decay
-      ~per_batch_callback ~per_epoch_callback
+      ~per_batch_callback ~per_epoch_callback ~per_epoch_debug_streams:true
      (module Backend)
      ()
   in
@@ -177,6 +180,8 @@ let classify_moons ~seed ~on_device ~inlining_cutoff ~num_streams ~batch_size ~b
   }
   in
   Stdio.printf "\n\n%!";
+  Stdlib.Format.printf "Final backend global debug info: %a\n%!" Sexp.pp_hum
+  @@ Backend.get_global_debug_info ();
   result

 let _suspend () =
@@ -248,4 +253,4 @@ let benchmark benchmarks =
   List.map benchmarks ~f:(fun bench -> bench ())
   |> PrintBox_utils.table |> PrintBox_text.output Stdio.stdout

-let () = benchmark _mem_benchmarks
+let () = benchmark _cuda_benchmarks

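Bracketing the run with the initial and final global reports makes stream lifetime visible; with the CUDA backend the output would look roughly like this (counts invented for illustration):

  Initial backend global debug info: (cuda_global_debug (live_streams 0))
  ...
  Final backend global debug info: (cuda_global_debug (live_streams 8))

A final live_streams count well above the requested number of streams would hint at a stream leak, one candidate explanation for the todo item about parallelism slowing down in later epochs.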
lib/train.ml

Lines changed: 11 additions & 2 deletions

@@ -456,7 +456,8 @@ type example_train_result = {

 let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init_lr ?lr_schedule
     ?(copy_to_merge = false) ?max_num_streams ~data_len ~epochs ~inputs ~outputs ~model ~loss_fn
-    ~weight_decay ?per_batch_callback ?per_epoch_callback (module Backend : Backend) () =
+    ~weight_decay ?per_batch_callback ?per_epoch_callback ?(per_epoch_debug_streams = false)
+    (module Backend : Backend) () =
   let module TDSL = Operation.TDSL in
   let module NTDSL = Operation.NTDSL in
   Rand.init seed;
@@ -528,7 +529,15 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init
     epoch_losses := !epoch_loss :: !epoch_losses;
     Option.iter per_epoch_callback ~f:(fun f ->
         f ~at_step:!step_ref ~at_epoch:epoch ~learning_rate:learning_rate.@[0]
-          ~epoch_loss:!epoch_loss)
+          ~epoch_loss:!epoch_loss);
+    let debug_at pos =
+      Array.iter streams ~f:(fun s ->
+          Stdlib.Format.printf "Stream %d debug %s:@ %a\n%!" s.stream_id pos Sexp.pp_hum
+          @@ Backend.get_debug_info s)
+    in
+    if per_epoch_debug_streams then debug_at "before sync";
+    Array.iter streams ~f:Backend.await;
+    if per_epoch_debug_streams then debug_at "after sync"
   done;
   let%op model_result = model "infer" in
   let infer_fwd =

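The Array.iter streams ~f:Backend.await line is the "missing synchronization between epochs" from the commit title: without it, the next epoch's work could be scheduled while streams are still busy with the previous one. A self-contained sketch of the resulting before/after-sync debug pattern, with a toy stand-in for the backend (all names here are stand-ins, not the real API):

  open Base

  type stream = { stream_id : int; mutable pending : int }

  (* Toy backend: a real [await] blocks until the stream is idle. *)
  let await s = s.pending <- 0
  let get_debug_info s = Sexp.message "stream_debug" [ ("pending", sexp_of_int s.pending) ]

  let () =
    let streams = [| { stream_id = 0; pending = 2 }; { stream_id = 1; pending = 0 } |] in
    let debug_at pos =
      Array.iter streams ~f:(fun s ->
          Stdlib.Format.printf "Stream %d debug %s:@ %a\n%!" s.stream_id pos Sexp.pp_hum
          @@ get_debug_info s)
    in
    debug_at "before sync";
    Array.iter streams ~f:await;
    debug_at "after sync"

Comparing the two dumps shows exactly which streams still had work queued at the epoch boundary.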
todo.md

Lines changed: 2 additions & 1 deletion

@@ -2,4 +2,5 @@
 (B) bin/moons_benchmark with the cc backend crashes with half-prec overflow {cm:2024-11-24}
 (B) remove syncing from the data parallel algo: stream-to-stream syncing is now automatic {cm:2024-11-23}
 (A) cuda backend crashes in bin/moons_benchmark {cm:2024-11-22}
-(B) figure out why cuda backend parallelism slows down in later epochs
+(B) figure out why cuda backend parallelism slows down in later epochs {cm:2024-11-25}
+(A) Ensure that reading from host on CPU performs required synchronization
