From 109994295e8b91c199ed080e188f42978b073773 Mon Sep 17 00:00:00 2001 From: Pete Date: Tue, 31 Oct 2023 14:42:10 -0700 Subject: [PATCH] Upload profiler data to remote save folder (#338) --- olmo/train.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/olmo/train.py b/olmo/train.py index 188ae4b32..695ed309a 100644 --- a/olmo/train.py +++ b/olmo/train.py @@ -43,6 +43,7 @@ move_to_device, peak_gpu_memory, syncronize_flag, + upload, ) __all__ = ["SpeedMonitor", "LRMonitor", "Trainer"] @@ -737,9 +738,13 @@ def on_trace_ready(p): output = p.key_averages().table(sort_by="self_cpu_time_total", row_limit=32) log.info(f"Profile by total CPU time at step {p.step_num}:\n{output}") - p.export_chrome_trace(str(profiler_output_dir / f"{p.step_num}.chrome_trace.json.gz")) - p.export_stacks(str(profiler_output_dir / f"{p.step_num}.gpu.stacks"), "self_cuda_time_total") - p.export_stacks(str(profiler_output_dir / f"{p.step_num}.cpu.stacks"), "self_cpu_time_total") + p.export_chrome_trace( + str(trace_path := (profiler_output_dir / f"{p.step_num}.chrome_trace.json.gz")) + ) + if self.cfg.remote_save_folder is not None: + upload_folder = f"{self.cfg.remote_save_folder.rstrip('/')}/profiler" + log.info(f"Tracing complete, uploading results to '{upload_folder}'...") + upload(trace_path, f"{upload_folder}/{trace_path.name}") from torch.profiler import ProfilerActivity