Skip to content

Commit

Permalink
[Memory Snapshot] Add recordAnnotations to capture record_function an…
Browse files Browse the repository at this point in the history
…notations (pytorch#124179)

Summary: Add new trace events to the Memory Snapshot for record_function annotations. These capture both the profiler's step annotations and user annotations.

Test Plan:
CI

New Snapshot Generated:
devvm2184.cco0.facebook.com.Apr_19_13_27_14.3072800.snapshot.pickle

Snippet of the Snapshot device_traces showing the `ProfilerStep#0` and `## forward ##` annotations:
```
[[{'action': 'user_defined',
   'addr': 0,
   'size': 0,
   'stream': 0,
   'time_us': 1713558427168556,
   'frames': [{'name': 'START', 'filename': 'ProfilerStep#0', 'line': 0}]},
  {'action': 'user_defined',
   'addr': 0,
   'size': 0,
   'stream': 0,
   'time_us': 1713558427168738,
   'frames': [{'name': 'END', 'filename': 'ProfilerStep#0', 'line': 0}]},
  {'action': 'user_defined',
   'addr': 0,
   'size': 0,
   'stream': 0,
   'time_us': 1713558427168865,
   'frames': [{'name': 'START', 'filename': 'ProfilerStep#1', 'line': 0}]},
  {'action': 'user_defined',
   'addr': 0,
   'size': 0,
   'stream': 0,
   'time_us': 1713558427168920,
   'frames': [{'name': 'START', 'filename': '## forward ##', 'line': 0}]},
  {'action': 'alloc',
   'addr': 140166073581568,
   'size': 3211264,
   'stream': 0,
   'time_us': 1713558427172978,
   'frames': [{'name': '_conv_forward',
     'filename': '/mnt/xarfuse/uid-416185/235d4caf-seed-nspid4026531836_cgpid32884718-ns-4026531840/torch/nn/modules/conv
```

Differential Revision: D55941362

Pulled By: aaronenyeshi

Pull Request resolved: pytorch#124179
Approved by: https://github.com/zdevito
  • Loading branch information
aaronenyeshi authored and pytorchmergebot committed May 15, 2024
1 parent ee8c155 commit 187aeae
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 6 deletions.
1 change: 1 addition & 0 deletions c10/core/Allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <utility>

#include <c10/core/Device.h>
Expand Down
10 changes: 10 additions & 0 deletions c10/cuda/CUDACachingAllocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -953,6 +953,10 @@ class DeviceCachingAllocator {
}
}

// Records a user annotation as a TraceEntry::USER_DEFINED event by forwarding
// to record_trace. `name` is passed through as the last argument (the gathered
// context that carries the annotation). The remaining arguments are fixed at
// 0 / nullptr.
// NOTE(review): these presumably correspond to address, size, stream and
// device, which do not apply to an annotation — confirm against record_trace.
void recordAnnotation(const std::shared_ptr<GatheredContext>& name) {
record_trace(TraceEntry::USER_DEFINED, 0, 0, nullptr, 0, name);
}

// Returns the current value of the `record_history` member.
bool isHistoryEnabled() {
return record_history;
}
Expand Down Expand Up @@ -2990,6 +2994,12 @@ class NativeCachingAllocator : public CUDAAllocator {
}
}

// Broadcasts the annotation: every entry in `device_allocator` receives the
// same `name` context through its own recordAnnotation().
void recordAnnotation(const std::shared_ptr<GatheredContext>& name) override {
  for (const auto& per_device : device_allocator) {
    per_device->recordAnnotation(name);
  }
}

bool isHistoryEnabled() override {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
Expand Down
10 changes: 8 additions & 2 deletions c10/cuda/CUDACachingAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,9 @@ struct TraceEntry {
SEGMENT_UNMAP, // unmap part of a segment (used with expandable segments)
SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace
// events
OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free
// bytes reported by cuda)
OOM, // the allocator threw an OutOfMemoryError (addr_ is the amount of free
// bytes reported by cuda)
USER_DEFINED // a call made from user defined API such as record_function
};
TraceEntry(
Action action,
Expand Down Expand Up @@ -289,6 +290,7 @@ class CUDAAllocator : public Allocator {
CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
RecordContext when) = 0;
// Hook for recording a user annotation in the allocator's trace. The default
// implementation is an intentional no-op; allocator implementations that keep
// a trace override it. The parameter name is commented out because the default
// body does not use it, and no trailing `;` follows the inline body.
virtual void recordAnnotation(
    const std::shared_ptr<GatheredContext>& /*name*/) {}
virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;

// Attached AllocatorTraceTracker callbacks will be called while the
Expand Down Expand Up @@ -428,6 +430,10 @@ inline void recordHistory(
enabled, context_recorder, alloc_trace_max_entries, when);
}

// Convenience wrapper: forwards the annotation context to the allocator
// returned by get().
inline void recordAnnotation(const std::shared_ptr<GatheredContext>& name) {
  get()->recordAnnotation(name);
}

// Convenience wrapper: returns isHistoryEnabled() of the allocator returned by
// get().
inline bool isHistoryEnabled() {
return get()->isHistoryEnabled();
}
Expand Down
27 changes: 27 additions & 0 deletions torch/csrc/cuda/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <ATen/core/TensorBody.h>
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/record_function.h>
#include <c10/core/Device.h>
#include <c10/core/TensorImpl.h>
#include <c10/util/UniqueVoidPtr.h>
Expand Down Expand Up @@ -37,6 +38,7 @@
#include <torch/csrc/cuda/THCP.h>
#include <torch/csrc/cuda/memory_snapshot.h>
#include <torch/csrc/cuda/python_comm.h>
#include <torch/csrc/profiler/combined_traceback.h>
#include <torch/csrc/profiler/python/combined_traceback.h>
#include <torch/csrc/python_headers.h>
#include <torch/csrc/utils/device_lazy_init.h>
Expand Down Expand Up @@ -737,6 +739,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
py::str snapshot_s = "snapshot";
py::str oom_s = "oom";
py::str device_free_s = "device_free";
py::str user_defined_s = "user_defined";

using namespace c10::cuda::CUDACachingAllocator;

Expand All @@ -760,6 +763,8 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
return segment_unmap_s;
case TraceEntry::SEGMENT_MAP:
return segment_map_s;
case TraceEntry::USER_DEFINED:
return user_defined_s;
}
throw std::runtime_error("unreachable");
};
Expand Down Expand Up @@ -961,6 +966,28 @@ static void registerCudaDeviceProperties(PyObject* module) {
const std::string&,
size_t)>(torch::cuda::_record_memory_history));

// Save user annotations to CCA memory snapshot tool.
//
// Registers a thread-local RecordFunction callback pair. For each
// USER_SCOPE record_function the start callback records a frame tagged
// "START" and the end callback records a frame tagged "END"; both carry
// fn.name() and are handed to CUDACachingAllocator::recordAnnotation.
at::addThreadLocalCallback(
    at::RecordFunctionCallback(
        [](const at::RecordFunction& fn)
            -> std::unique_ptr<at::ObserverContext> {
          if (fn.scope() != at::RecordScope::USER_SCOPE) {
            return nullptr; // only record user-defined scopes.
          }
          unwind::Frame frame{fn.name(), "START", 0};
          auto r = std::make_shared<CapturedTraceback>();
          r->recordUserDefinedFrame(frame);
          c10::cuda::CUDACachingAllocator::recordAnnotation(r);
          // No per-invocation state needs to travel to the end callback.
          return nullptr;
        },
        // The observer context is unused (the start callback never creates
        // one), so the parameter is left unnamed.
        [](const at::RecordFunction& fn, at::ObserverContext* /*ctx_ptr*/) {
          if (fn.scope() != at::RecordScope::USER_SCOPE) {
            return; // only record user-defined scopes.
          }
          unwind::Frame frame{fn.name(), "END", 0};
          auto r = std::make_shared<CapturedTraceback>();
          r->recordUserDefinedFrame(frame);
          c10::cuda::CUDACachingAllocator::recordAnnotation(r);
        })
        // Ask RecordFunction to dispatch only USER_SCOPE events to this
        // callback, so other scopes do not pay for a call that would return
        // immediately. The in-callback checks above are kept as a safeguard.
        .scopes({at::RecordScope::USER_SCOPE}));

m.def("_cuda_isHistoryEnabled", []() {
return c10::cuda::CUDACachingAllocator::isHistoryEnabled();
});
Expand Down
3 changes: 3 additions & 0 deletions torch/csrc/cuda/memory_snapshot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ std::string _memory_snapshot_pickled() {
IValue snapshot_s = "snapshot";
IValue oom_s = "oom";
IValue device_free_s = "device_free";
IValue user_defined_s = "user_defined";

using namespace c10::cuda::CUDACachingAllocator;

Expand All @@ -298,6 +299,8 @@ std::string _memory_snapshot_pickled() {
return segment_unmap_s;
case TraceEntry::SEGMENT_MAP:
return segment_map_s;
case TraceEntry::USER_DEFINED:
return user_defined_s;
}
throw std::runtime_error("unreachable");
};
Expand Down
18 changes: 14 additions & 4 deletions torch/csrc/profiler/combined_traceback.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ SymbolizedTracebacks symbolize(
for (const auto& e : to_symbolize) {
if (e->python_) {
if (cur_python != e->python_ && !cur_py_frames.empty()) {
// NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage)
cur_python->appendSymbolized(cur_py_frames, r);
if (cur_python) {
// NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage)
cur_python->appendSymbolized(cur_py_frames, r);
}
cur_py_frames.clear();
}
cur_python = e->python_;
Expand All @@ -105,8 +107,10 @@ SymbolizedTracebacks symbolize(
}
}
if (!cur_py_frames.empty()) {
// NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage)
cur_python->appendSymbolized(cur_py_frames, r);
if (cur_python) {
// NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage)
cur_python->appendSymbolized(cur_py_frames, r);
}
cur_py_frames.clear();
}
std::vector<std::vector<uint64_t>> python_frame_fragments =
Expand Down Expand Up @@ -171,6 +175,12 @@ SymbolizedTracebacks symbolize(
for (; py_it != py_end; ++py_it) {
append_python(*py_it);
}

// Gather all user defined frames
for (const auto& f : sc->user_defined_frames_) {
r.tracebacks.back().push_back(r.all_frames.size());
r.all_frames.emplace_back(f);
}
}
return r;
}
Expand Down
5 changes: 5 additions & 0 deletions torch/csrc/profiler/combined_traceback.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,15 @@ struct TORCH_API CapturedTraceback : public c10::GatheredContext {
int traversePython(visitproc visit, void* arg);
int clearPython();

// Appends a copy of `frame` to user_defined_frames_.
void recordUserDefinedFrame(const unwind::Frame& frame) {
  user_defined_frames_.emplace_back(frame);
}

private:
std::vector<PyFrame> frames_;
std::vector<void*> cpp_frames_;
std::vector<jit::StackEntry> script_frames_;
std::vector<unwind::Frame> user_defined_frames_;
friend TORCH_API SymbolizedTracebacks
symbolize(const std::vector<CapturedTraceback*>& to_symbolize);

Expand Down

0 comments on commit 187aeae

Please sign in to comment.