Skip to content

Commit

Permalink
add flags to set caching memory manager split+recycling size limits
Browse files Browse the repository at this point in the history
Add flags to set configurable limits of the CachingMemoryManager. These flags
allow customization for different workloads. This diff changes the default recycling
size limit for training to 256MB. Following investigation of OOM on dynamic batching
and based on past allocation analyses, we found that this limit works well.
Big thanks to WilliamTambellini@ for his work on improving the memory manager's
capability to handle flexible workloads:
flashlight#188
  • Loading branch information
Your Name committed Jan 14, 2021
1 parent 25102d9 commit a95731a
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 3 deletions.
10 changes: 8 additions & 2 deletions flashlight/app/asr/Train.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ int main(int argc, char** argv) {
};

std::ofstream memLog;
if (FLAGS_fl_log_mem_ops_interval > 0 && isMaster) {
if (FLAGS_fl_mem_log_ops_interval > 0 && isMaster) {
auto* curMemMgr =
fl::MemoryManagerInstaller::currentlyInstalledMemoryManager();
if (curMemMgr) {
Expand All @@ -589,7 +589,7 @@ int main(int argc, char** argv) {
}
curMemMgr->setLogStream(&memLog);
curMemMgr->setLoggingEnabled(true);
curMemMgr->setLogFlushInterval(FLAGS_fl_log_mem_ops_interval);
curMemMgr->setLogFlushInterval(FLAGS_fl_mem_log_ops_interval);
}
}

Expand Down Expand Up @@ -665,6 +665,12 @@ int main(int argc, char** argv) {
auto* curMemMgr =
fl::MemoryManagerInstaller::currentlyInstalledMemoryManager();
if (curMemMgr) {
auto cachMemMgr =
std::dynamic_pointer_cast<CachingMemoryManager>(curMemMgr);
if (cachMemMgr) {
cachMemMgr->setRecyclingSizeLimit(FLAGS_fl_mem_recycling_size);
cachMemMgr->setSplitSizeLimit(FLAGS_fl_mem_split_size);
}
curMemMgr->printInfo("Memory Manager Stats", 0 /* device id */);
}
}
Expand Down
19 changes: 18 additions & 1 deletion flashlight/app/asr/common/Flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,12 +320,29 @@ DEFINE_string(
DEFINE_int64(fl_vlog_level, 0, "Sets the verbose logging level");

DEFINE_int64(
fl_log_mem_ops_interval,
fl_mem_log_ops_interval,
0,
"Flushes memory manager logs after a specified "
"number of log entries. 1000000 is a reasonable "
"value which will reduce overhead.");

// Size limit above which the caching memory manager will not recycle a
// buffer. The default, 1L << 28, equals 268435456 (the "256MB" quoted in the
// help text below).
// NOTE: adjacent string literals are concatenated verbatim, so every piece
// that ends a sentence must carry its own trailing space.
DEFINE_int64(
    fl_mem_recycling_size,
    (1L << 28),
    "prevents the caching memory manager from recycling buffers larger "
    "than this value. Recycled buffers can be split by the caching "
    "manager so it helps reduce fragmentation of buffers over this value. "
    "Default value of 256MB works well for typical workload where "
    "number of allocation is exponentially decreasing with allocation "
    "size and largest allocations are ~ 500MB.");

// Size limit above which the caching memory manager will not split a buffer.
// Defaults to the largest int64_t value.
DEFINE_int64(
    fl_mem_split_size,
    std::numeric_limits<int64_t>::max(),
    "prevents the caching memory manager from splitting buffers "
    "larger than this value. Helps reduce external fragmentation "
    "by allowing higher internal fragmentation.");

// MIXED PRECISION OPTIONS
DEFINE_bool(
fl_amp_use_mixed_precision,
Expand Down
2 changes: 2 additions & 0 deletions flashlight/app/asr/common/Flags.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ DECLARE_string(fl_optim_mode);
DECLARE_string(fl_log_level);
DECLARE_int64(fl_vlog_level);
// Memory manager flags. Each name here must match its DEFINE_int64
// definition in Flags.cpp; the log-interval flag is defined there as
// fl_mem_log_ops_interval (renamed from fl_log_mem_ops_interval).
DECLARE_int64(fl_mem_log_ops_interval);
DECLARE_int64(fl_mem_recycling_size);
DECLARE_int64(fl_mem_split_size);

/* ========== MIXED PRECISION OPTIONS ========== */

Expand Down

0 comments on commit a95731a

Please sign in to comment.