Skip to content

Commit

Permalink
Upload profiler data to remote save folder (#338)
Browse files (browse the repository at this point in the history)
  • Loading branch information
epwalsh committed Oct 31, 2023
1 parent db0756f commit 1099942
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions olmo/train.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -43,6 +43,7 @@
move_to_device,
peak_gpu_memory,
syncronize_flag,
upload,
)

__all__ = ["SpeedMonitor", "LRMonitor", "Trainer"]
Expand Down Expand Up @@ -737,9 +738,13 @@ def on_trace_ready(p):
output = p.key_averages().table(sort_by="self_cpu_time_total", row_limit=32)
log.info(f"Profile by total CPU time at step {p.step_num}:\n{output}")

p.export_chrome_trace(str(profiler_output_dir / f"{p.step_num}.chrome_trace.json.gz"))
p.export_stacks(str(profiler_output_dir / f"{p.step_num}.gpu.stacks"), "self_cuda_time_total")
p.export_stacks(str(profiler_output_dir / f"{p.step_num}.cpu.stacks"), "self_cpu_time_total")
p.export_chrome_trace(
str(trace_path := (profiler_output_dir / f"{p.step_num}.chrome_trace.json.gz"))
)
if self.cfg.remote_save_folder is not None:
upload_folder = f"{self.cfg.remote_save_folder.rstrip('/')}/profiler"
log.info(f"Tracing complete, uploading results to '{upload_folder}'...")
upload(trace_path, f"{upload_folder}/{trace_path.name}")

from torch.profiler import ProfilerActivity

Expand Down

0 comments on commit 1099942

Please sign in to comment.