[Runtime] Add TensorPoolAllocatorGPU for GPU, which reduces GPU memory usage. (#53)

Co-authored-by: yitongh <hyt@mail.ustc.edu.cn>
liutongxuan and yitongh committed Jan 19, 2022
1 parent c1b5aea commit c68ca71
Showing 16 changed files with 2,156 additions and 14 deletions.
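
The new allocator is opt-in: GPUProcessState picks GPUTensorPoolAllocator only when the TF_GPU_ALLOCATOR environment variable is set to "tensorpool" (see the gpu_process_state.cc hunks below); otherwise the existing GPUBFCAllocator path is unchanged. A minimal sketch of opting in from C++; the session bootstrapping is elided and illustrative only:

#include <cstdlib>

// Sketch: TF_GPU_ALLOCATOR must be set before the first GPU allocator is
// created, i.e. before any session or device initialization touches the
// GPU, because GPUProcessState consults it when building the allocator.
int main(int argc, char** argv) {
  setenv("TF_GPU_ALLOCATOR", "tensorpool", /*overwrite=*/1);
  // ... build and run a tensorflow::Session as usual ...
  return 0;
}
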
24 changes: 24 additions & 0 deletions tensorflow/core/BUILD
@@ -3304,6 +3304,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
"common_runtime/lower_functional_ops.h",
"common_runtime/lower_while_op.h",
"common_runtime/memory_planner.h",
"common_runtime/gpu_memory_planner.h",
"common_runtime/memory_types.h",
"common_runtime/metrics.h",
"common_runtime/mkl_cpu_allocator.h",
@@ -3325,9 +3326,11 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
"common_runtime/session_factory.h",
"common_runtime/simple_propagator_state.h",
"common_runtime/single_threaded_cpu_device.h",
"common_runtime/size_class.h",
"common_runtime/stats_publisher_interface.h",
"common_runtime/step_stats_collector.h",
"common_runtime/tensorpool_allocator.h",
"common_runtime/gpu_tensorpool_allocator.h",
"common_runtime/threadpool_device.h",
"common_runtime/process_state.h",
"common_runtime/pool_allocator.h",
@@ -3374,6 +3377,7 @@ tf_cuda_library(
"common_runtime/lower_if_op.cc",
"common_runtime/lower_while_op.cc",
"common_runtime/memory_planner.cc",
"common_runtime/gpu_memory_planner.cc",
"common_runtime/memory_types.cc",
"common_runtime/metrics.cc",
"common_runtime/mkl_cpu_allocator.cc",
@@ -3404,6 +3408,7 @@ tf_cuda_library(
"common_runtime/stats_publisher_interface.cc",
"common_runtime/step_stats_collector.cc",
"common_runtime/tensorpool_allocator.cc",
"common_runtime/gpu_tensorpool_allocator.cc",
"common_runtime/threadpool_device.cc",
"common_runtime/threadpool_device_factory.cc",
"graph/gradients.cc",
@@ -4670,6 +4675,25 @@ tf_cc_test_gpu(
],
)

tf_cc_test_gpu(
name = "gpu_tensorpool_allocator_test",
size = "medium",
srcs = ["common_runtime/gpu_tensorpool_allocator_test.cc"],
linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags(),
deps = [
":core",
":core_cpu",
":framework",
":framework_internal",
":lib",
":lib_internal",
":test",
":test_main",
":testlib",
],
)
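
Assuming a CUDA-enabled build, the new test target can be run with something like:

bazel test --config=cuda //tensorflow/core:gpu_tensorpool_allocator_test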

tf_cuda_cc_test(
name = "gpu_device_unified_memory_test",
size = "small",
22 changes: 22 additions & 0 deletions tensorflow/core/common_runtime/direct_session.cc
@@ -18,6 +18,8 @@ limitations under the License.
#include <atomic>
#include <string>
#include <vector>
#include <memory>
#include <unordered_map>

#include "absl/container/flat_hash_set.h"
#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
@@ -32,6 +34,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/graph_optimizer.h"
#include "tensorflow/core/common_runtime/memory_types.h"
#include "tensorflow/core/common_runtime/memory_planner.h"
#include "tensorflow/core/common_runtime/gpu_memory_planner.h"
#include "tensorflow/core/common_runtime/metrics.h"
#include "tensorflow/core/common_runtime/optimization_registry.h"
#include "tensorflow/core/common_runtime/process_util.h"
@@ -770,6 +773,21 @@ Status DirectSession::RunInternal(
return Status::OK();
}

bool DirectSession::EnableTensorPoolTracking(ExecutorsAndKeys* executors_and_keys) {
static std::unordered_map<ExecutorsAndKeys*, bool> has_training_graph;
if (has_training_graph.find(executors_and_keys) == has_training_graph.end()) {
for (const PerPartitionExecutorsAndLib& partition :
executors_and_keys->items) {
if (partition.graph->IsTrainingGraph()) {
has_training_graph[executors_and_keys] = true;
return true;
}
}
has_training_graph[executors_and_keys] = false;
}
return has_training_graph[executors_and_keys];
}
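
Note that the function-local static map above is shared by concurrent Session::Run calls without synchronization. A mutex-guarded variant, shown purely as a sketch (it is not part of this commit), would close that race:

// Sketch only, not the committed code: serialize access to the cache so
// concurrent Session::Run calls cannot race on the unordered_map.
bool DirectSession::EnableTensorPoolTracking(
    ExecutorsAndKeys* executors_and_keys) {
  static std::mutex mu;  // requires <mutex>
  static auto* has_training_graph =
      new std::unordered_map<ExecutorsAndKeys*, bool>;
  std::lock_guard<std::mutex> l(mu);
  auto it = has_training_graph->find(executors_and_keys);
  if (it == has_training_graph->end()) {
    bool is_training = false;
    for (const PerPartitionExecutorsAndLib& partition :
         executors_and_keys->items) {
      if (partition.graph->IsTrainingGraph()) {
        is_training = true;
        break;
      }
    }
    it = has_training_graph->emplace(executors_and_keys, is_training).first;
  }
  return it->second;
}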

Status DirectSession::Run(const RunOptions& run_options,
const NamedTensorList& inputs,
const std::vector<string>& output_names,
@@ -781,6 +799,7 @@ Status DirectSession::Run(const RunOptions& run_options,
direct_session_runs->GetCell()->IncrementBy(1);

ScopedMemoryCollector scoped_memory_collector;
std::unique_ptr<ScopedMemoryCollectorGPU> scoped_memory_collector_gpu_ptr;

// Extract the inputs names for this run of the session.
std::vector<string> input_tensor_names;
@@ -804,6 +823,9 @@ Status DirectSession::Run(const RunOptions& run_options,
{
mutex_lock l(collective_graph_key_lock_);
collective_graph_key_ = executors_and_keys->collective_graph_key;
if (EnableTensorPoolTracking(executors_and_keys)) {
scoped_memory_collector_gpu_ptr.reset(new ScopedMemoryCollectorGPU);
}
}

// Configure a call frame for the step, which we use to feed and
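
Unlike the CPU-side ScopedMemoryCollector above, which is stack-allocated unconditionally, the GPU collector is created only when EnableTensorPoolTracking reports a training graph. Its RAII shape is implied by this usage; the following reconstruction uses assumed names (GPUMemoryPlanner, StartCollect, StopCollect), not the actual interface from gpu_memory_planner.h:

// Sketch with assumed names: construction begins a stats-collection pass
// for the GPU memory planner and destruction ends it, so the unique_ptr in
// DirectSession::Run scopes collection to exactly one executed step.
class ScopedMemoryCollectorGPU {
 public:
  ScopedMemoryCollectorGPU() { GPUMemoryPlanner::Instance()->StartCollect(); }
  ~ScopedMemoryCollectorGPU() { GPUMemoryPlanner::Instance()->StopCollect(); }
};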
3 changes: 3 additions & 0 deletions tensorflow/core/common_runtime/direct_session.h
@@ -253,6 +253,9 @@ class DirectSession : public Session {
RunMetadata* run_metadata,
const thread::ThreadPoolOptions& threadpool_options);

// Returns whether to enable TensorPool allocator tracking for these
// executors, i.e. whether any partition contains a training graph.
bool EnableTensorPoolTracking(ExecutorsAndKeys* executors_and_keys);

// Returns whether inter-op execution uses a global pool or the input
// `run_options` requests being run on inter_op_thread_pool = 0 in case
// multiple pools are configured.
2 changes: 1 addition & 1 deletion tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -209,7 +209,7 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
LogMemory::RecordRawDeallocation(data->operation_, data->step_id_,
data->address_, data->allocator_, false);
}
-    data->allocator_->DeallocateRaw(data->address_);
+    data->allocator_->DeallocateRawAsync(data->address_);
delete data;
}
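
Switching the Eigen stream device's free callback from DeallocateRaw to DeallocateRawAsync lets a pool-aware allocator defer the release (for example, recycling the block once the stream no longer uses it) instead of returning memory eagerly. Allocators that never override the async variant presumably keep working through a forwarding default; the following shim is an assumption about the base Allocator, not code from this commit:

// Simplified sketch (assumption): a default DeallocateRawAsync that keeps
// existing allocators such as GPUBFCAllocator working unchanged, while
// pooled allocators override it to route frees into their free lists.
class Allocator {
 public:
  virtual ~Allocator() {}
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;
  virtual void DeallocateRaw(void* ptr) = 0;
  virtual void DeallocateRawAsync(void* ptr) { DeallocateRaw(ptr); }
};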

50 changes: 37 additions & 13 deletions tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -30,6 +30,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/gpu/gpu_vmem_allocator.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/common_runtime/gpu_tensorpool_allocator.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
@@ -62,6 +63,12 @@ bool useCudaMallocAsyncAllocator() {
std::strcmp(debug_allocator_str, "cuda_malloc_async") == 0;
}

bool useTensorPoolAllocator() {
const char* debug_allocator_str = std::getenv("TF_GPU_ALLOCATOR");
return debug_allocator_str != nullptr &&
std::strcmp(debug_allocator_str, "tensorpool") == 0;
}

} // namespace

/*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
@@ -122,21 +129,35 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
(options.per_process_gpu_memory_fraction() > 1.0 ||
options.experimental().use_unified_memory()),
gpu_visitors_[bus_id], {});
-    GPUBFCAllocator* gpu_bfc_allocator =
-        new GPUBFCAllocator(sub_allocator, total_bytes, options,
+    Allocator* gpu_allocator = nullptr;
+    GPUBFCAllocator* gpu_bfc_allocator = nullptr;
+    if (useTensorPoolAllocator()) {
+      gpu_allocator =
+          new GPUTensorPoolAllocator(sub_allocator,
+              strings::StrCat("GPU_", tf_gpu_id.value(), "_tensorpool"),
+              total_bytes);
+    } else {
+      gpu_bfc_allocator =
+          new GPUBFCAllocator(sub_allocator, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
-    Allocator* gpu_allocator = gpu_bfc_allocator;
-    // GPUVMemAllocator will allocate host memory as backup after running out of
-    // gpu device memory to avoid OOM failures
-    gpu_allocator = maybe_create_gpu_vmem_allocator(gpu_allocator,
-                                                    bus_id,
-                                                    platform_gpu_id,
-                                                    tf_gpu_id.value(),
-                                                    stream_exec);
+      gpu_allocator = gpu_bfc_allocator;
+      // GPUVMemAllocator will allocate host memory as backup after running out of
+      // gpu device memory to avoid OOM failures
+      gpu_allocator = maybe_create_gpu_vmem_allocator(gpu_allocator,
+                                                      bus_id,
+                                                      platform_gpu_id,
+                                                      tf_gpu_id.value(),
+                                                      stream_exec);
+    }

SharedCounter* timing_counter = nullptr;
if (options.experimental().timestamped_allocator()) {
-    timing_counter = new SharedCounter;
-    gpu_bfc_allocator->SetTimingCounter(timing_counter);
+    if (useTensorPoolAllocator()) {
+      LOG(WARNING) << "TensorPoolAllocator doesn't support timestamped_allocator";
+    } else {
+      timing_counter = new SharedCounter;
+      gpu_bfc_allocator->SetTimingCounter(timing_counter);
+    }
}

// If true, checks for memory overwrites by writing
@@ -197,7 +218,10 @@ SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
<< " but only have " << gpu_allocators_.size();
return nullptr;
}

  if (useTensorPoolAllocator()) {
    LOG(WARNING) << "TensorPoolAllocator doesn't support timestamped_allocator";
    return nullptr;
  }
AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
if (allocator_parts.counter.get() == nullptr) {
SharedCounter* timing_counter = new SharedCounter;
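
Pulling the call sites together, the surface that GPUProcessState and the gpu_device.cc change rely on can be inferred. The following is a reconstruction from those hunks, not the real gpu_tensorpool_allocator.h, and it deliberately omits everything else the class may declare:

// Reconstructed sketch of the members used in this commit.
class GPUTensorPoolAllocator : public Allocator {
 public:
  // Matches the construction in GetGPUAllocator: a device sub-allocator,
  // a display name such as "GPU_0_tensorpool", and the memory budget.
  GPUTensorPoolAllocator(SubAllocator* sub_allocator, const string& name,
                         size_t total_bytes);
  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
  void DeallocateRaw(void* ptr) override;
  // Deferred free used by EigenGpuStreamDevice's deallocate callback.
  void DeallocateRawAsync(void* ptr) override;
};

Note that, per the warnings above, the tensorpool path opts out of both the GPUVMemAllocator host-memory backup and the timestamped_allocator timing counter.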
