[Runtime] Add TensorPoolAllocatorGPU for GPU, which reduces GPU memory usage. (#53)

Co-authored-by: yitongh <hyt@mail.ustc.edu.cn>
liutongxuan and yitongh committed Jan 19, 2022
1 parent c1b5aea commit c68ca71
Showing 16 changed files with 2,156 additions and 14 deletions.
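
The new allocator is opt-in: GPUProcessState picks GPUTensorPoolAllocator only when the TF_GPU_ALLOCATOR environment variable is set to "tensorpool" (see the gpu_process_state.cc hunks below); otherwise the existing GPUBFCAllocator path is unchanged. A minimal sketch of opting in from C++; the session bootstrapping is elided and illustrative only:

#include <cstdlib>

// Sketch: TF_GPU_ALLOCATOR must be set before the first GPU allocator is
// created, i.e. before any session or device initialization touches the
// GPU, because GPUProcessState consults it when building the allocator.
int main(int argc, char** argv) {
  setenv("TF_GPU_ALLOCATOR", "tensorpool", /*overwrite=*/1);
  // ... build and run a tensorflow::Session as usual ...
  return 0;
}
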
24 changes: 24 additions & 0 deletions tensorflow/core/BUILD
@@ -3304,6 +3304,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
"common_runtime/lower_functional_ops.h",
"common_runtime/lower_while_op.h",
"common_runtime/memory_planner.h",
"common_runtime/gpu_memory_planner.h",
"common_runtime/memory_types.h",
"common_runtime/metrics.h",
"common_runtime/mkl_cpu_allocator.h",
@@ -3325,9 +3326,11 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
"common_runtime/session_factory.h",
"common_runtime/simple_propagator_state.h",
"common_runtime/single_threaded_cpu_device.h",
"common_runtime/size_class.h",
"common_runtime/stats_publisher_interface.h",
"common_runtime/step_stats_collector.h",
"common_runtime/tensorpool_allocator.h",
"common_runtime/gpu_tensorpool_allocator.h",
"common_runtime/threadpool_device.h",
"common_runtime/process_state.h",
"common_runtime/pool_allocator.h",
@@ -3374,6 +3377,7 @@ tf_cuda_library(
"common_runtime/lower_if_op.cc",
"common_runtime/lower_while_op.cc",
"common_runtime/memory_planner.cc",
"common_runtime/gpu_memory_planner.cc",
"common_runtime/memory_types.cc",
"common_runtime/metrics.cc",
"common_runtime/mkl_cpu_allocator.cc",
@@ -3404,6 +3408,7 @@ tf_cuda_library(
"common_runtime/stats_publisher_interface.cc",
"common_runtime/step_stats_collector.cc",
"common_runtime/tensorpool_allocator.cc",
"common_runtime/gpu_tensorpool_allocator.cc",
"common_runtime/threadpool_device.cc",
"common_runtime/threadpool_device_factory.cc",
"graph/gradients.cc",
@@ -4670,6 +4675,25 @@ tf_cc_test_gpu(
],
)

tf_cc_test_gpu(
name = "gpu_tensorpool_allocator_test",
size = "medium",
srcs = ["common_runtime/gpu_tensorpool_allocator_test.cc"],
linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags(),
deps = [
":core",
":core_cpu",
":framework",
":framework_internal",
":lib",
":lib_internal",
":test",
":test_main",
":testlib",
],
)
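
Assuming a CUDA-enabled build, the new test target can be run with something like:

bazel test --config=cuda //tensorflow/core:gpu_tensorpool_allocator_test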

tf_cuda_cc_test(
name = "gpu_device_unified_memory_test",
size = "small",
22 changes: 22 additions & 0 deletions tensorflow/core/common_runtime/direct_session.cc
@@ -18,6 +18,8 @@ limitations under the License.
#include <atomic>
#include <string>
#include <vector>
#include <memory>
#include <unordered_map>

#include "absl/container/flat_hash_set.h"
#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
@@ -32,6 +34,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/graph_optimizer.h"
#include "tensorflow/core/common_runtime/memory_types.h"
#include "tensorflow/core/common_runtime/memory_planner.h"
#include "tensorflow/core/common_runtime/gpu_memory_planner.h"
#include "tensorflow/core/common_runtime/metrics.h"
#include "tensorflow/core/common_runtime/optimization_registry.h"
#include "tensorflow/core/common_runtime/process_util.h"
@@ -770,6 +773,21 @@ Status DirectSession::RunInternal(
return Status::OK();
}

bool DirectSession::EnableTensorPoolTracking(ExecutorsAndKeys* executors_and_keys) {
static std::unordered_map<ExecutorsAndKeys*, bool> has_training_graph;
if (has_training_graph.find(executors_and_keys) == has_training_graph.end()) {
for (const PerPartitionExecutorsAndLib& partition :
executors_and_keys->items) {
if (partition.graph->IsTrainingGraph()) {
has_training_graph[executors_and_keys] = true;
return true;
}
}
has_training_graph[executors_and_keys] = false;
}
return has_training_graph[executors_and_keys];
}
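
Note that the function-local static map above is shared by concurrent Session::Run calls without synchronization. A mutex-guarded variant, shown purely as a sketch (it is not part of this commit), would close that race:

// Sketch only, not the committed code: serialize access to the cache so
// concurrent Session::Run calls cannot race on the unordered_map.
bool DirectSession::EnableTensorPoolTracking(
    ExecutorsAndKeys* executors_and_keys) {
  static std::mutex mu;  // requires <mutex>
  static auto* has_training_graph =
      new std::unordered_map<ExecutorsAndKeys*, bool>;
  std::lock_guard<std::mutex> l(mu);
  auto it = has_training_graph->find(executors_and_keys);
  if (it == has_training_graph->end()) {
    bool is_training = false;
    for (const PerPartitionExecutorsAndLib& partition :
         executors_and_keys->items) {
      if (partition.graph->IsTrainingGraph()) {
        is_training = true;
        break;
      }
    }
    it = has_training_graph->emplace(executors_and_keys, is_training).first;
  }
  return it->second;
}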

Status DirectSession::Run(const RunOptions& run_options,
const NamedTensorList& inputs,
const std::vector<string>& output_names,
@@ -781,6 +799,7 @@ Status DirectSession::Run(const RunOptions& run_options,
direct_session_runs->GetCell()->IncrementBy(1);

ScopedMemoryCollector scoped_memory_collector;
std::unique_ptr<ScopedMemoryCollectorGPU> scoped_memory_collector_gpu_ptr;

// Extract the inputs names for this run of the session.
std::vector<string> input_tensor_names;
@@ -804,6 +823,9 @@ Status DirectSession::Run(const RunOptions& run_options,
{
mutex_lock l(collective_graph_key_lock_);
collective_graph_key_ = executors_and_keys->collective_graph_key;
if (EnableTensorPoolTracking(executors_and_keys)) {
scoped_memory_collector_gpu_ptr.reset(new ScopedMemoryCollectorGPU);
}
}

// Configure a call frame for the step, which we use to feed and
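
Unlike the CPU-side ScopedMemoryCollector above, which is stack-allocated unconditionally, the GPU collector is created only when EnableTensorPoolTracking reports a training graph. Its RAII shape is implied by this usage; the following reconstruction uses assumed names (GPUMemoryPlanner, StartCollect, StopCollect), not the actual interface from gpu_memory_planner.h:

// Sketch with assumed names: construction begins a stats-collection pass
// for the GPU memory planner and destruction ends it, so the unique_ptr in
// DirectSession::Run scopes collection to exactly one executed step.
class ScopedMemoryCollectorGPU {
 public:
  ScopedMemoryCollectorGPU() { GPUMemoryPlanner::Instance()->StartCollect(); }
  ~ScopedMemoryCollectorGPU() { GPUMemoryPlanner::Instance()->StopCollect(); }
};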
3 changes: 3 additions & 0 deletions tensorflow/core/common_runtime/direct_session.h
@@ -253,6 +253,9 @@ class DirectSession : public Session {
RunMetadata* run_metadata,
const thread::ThreadPoolOptions& threadpool_options);

// Returns whether to enable TensorPool allocator tracking for these
// executors, i.e. whether any partition contains a training graph.
bool EnableTensorPoolTracking(ExecutorsAndKeys* executors_and_keys);

// Returns whether inter-op execution uses a global pool or the input
// `run_options` requests being run on inter_op_thread_pool = 0 in case
// multiple pools are configured.
2 changes: 1 addition & 1 deletion tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -209,7 +209,7 @@ class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
LogMemory::RecordRawDeallocation(data->operation_, data->step_id_,
data->address_, data->allocator_, false);
}
-    data->allocator_->DeallocateRaw(data->address_);
+    data->allocator_->DeallocateRawAsync(data->address_);
delete data;
}
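
Switching the Eigen stream device's free callback from DeallocateRaw to DeallocateRawAsync lets a pool-aware allocator defer the release (for example, recycling the block once the stream no longer uses it) instead of returning memory eagerly. Allocators that never override the async variant presumably keep working through a forwarding default; the following shim is an assumption about the base Allocator, not code from this commit:

// Simplified sketch (assumption): a default DeallocateRawAsync that keeps
// existing allocators such as GPUBFCAllocator working unchanged, while
// pooled allocators override it to route frees into their free lists.
class Allocator {
 public:
  virtual ~Allocator() {}
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;
  virtual void DeallocateRaw(void* ptr) = 0;
  virtual void DeallocateRawAsync(void* ptr) { DeallocateRaw(ptr); }
};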

50 changes: 37 additions & 13 deletions tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -30,6 +30,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/gpu/gpu_vmem_allocator.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/common_runtime/gpu_tensorpool_allocator.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
@@ -62,6 +63,12 @@ bool useCudaMallocAsyncAllocator() {
std::strcmp(debug_allocator_str, "cuda_malloc_async") == 0;
}

bool useTensorPoolAllocator() {
const char* debug_allocator_str = std::getenv("TF_GPU_ALLOCATOR");
return debug_allocator_str != nullptr &&
std::strcmp(debug_allocator_str, "tensorpool") == 0;
}

} // namespace

/*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
@@ -122,21 +129,35 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
(options.per_process_gpu_memory_fraction() > 1.0 ||
options.experimental().use_unified_memory()),
gpu_visitors_[bus_id], {});
-    GPUBFCAllocator* gpu_bfc_allocator =
-        new GPUBFCAllocator(sub_allocator, total_bytes, options,
+    Allocator* gpu_allocator = nullptr;
+    GPUBFCAllocator* gpu_bfc_allocator = nullptr;
+    if (useTensorPoolAllocator()) {
+      gpu_allocator =
+          new GPUTensorPoolAllocator(sub_allocator,
+              strings::StrCat("GPU_", tf_gpu_id.value(), "_tensorpool"),
+              total_bytes);
+    } else {
+      gpu_bfc_allocator =
+          new GPUBFCAllocator(sub_allocator, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
-    Allocator* gpu_allocator = gpu_bfc_allocator;
-    // GPUVMemAllocator will allocate host memory as backup after running out of
-    // gpu device memory to avoid OOM failures
-    gpu_allocator = maybe_create_gpu_vmem_allocator(gpu_allocator,
-                                                    bus_id,
-                                                    platform_gpu_id,
-                                                    tf_gpu_id.value(),
-                                                    stream_exec);
+      gpu_allocator = gpu_bfc_allocator;
+      // GPUVMemAllocator will allocate host memory as backup after running out of
+      // gpu device memory to avoid OOM failures
+      gpu_allocator = maybe_create_gpu_vmem_allocator(gpu_allocator,
+                                                      bus_id,
+                                                      platform_gpu_id,
+                                                      tf_gpu_id.value(),
+                                                      stream_exec);
+    }

SharedCounter* timing_counter = nullptr;
if (options.experimental().timestamped_allocator()) {
-    timing_counter = new SharedCounter;
-    gpu_bfc_allocator->SetTimingCounter(timing_counter);
+    if (useTensorPoolAllocator()) {
+      LOG(WARNING) << "TensorPoolAllocator doesn't support timestamped_allocator";
+    } else {
+      timing_counter = new SharedCounter;
+      gpu_bfc_allocator->SetTimingCounter(timing_counter);
+    }
}

// If true, checks for memory overwrites by writing
@@ -197,7 +218,10 @@ SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
<< " but only have " << gpu_allocators_.size();
return nullptr;
}

  if (useTensorPoolAllocator()) {
    LOG(WARNING) << "TensorPoolAllocator doesn't support timestamped_allocator";
    return nullptr;
  }
AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
if (allocator_parts.counter.get() == nullptr) {
SharedCounter* timing_counter = new SharedCounter;
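
Pulling the call sites together, the surface that GPUProcessState and the gpu_device.cc change rely on can be inferred. The following is a reconstruction from those hunks, not the real gpu_tensorpool_allocator.h, and it deliberately omits everything else the class may declare:

// Reconstructed sketch of the members used in this commit.
class GPUTensorPoolAllocator : public Allocator {
 public:
  // Matches the construction in GetGPUAllocator: a device sub-allocator,
  // a display name such as "GPU_0_tensorpool", and the memory budget.
  GPUTensorPoolAllocator(SubAllocator* sub_allocator, const string& name,
                         size_t total_bytes);
  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
  void DeallocateRaw(void* ptr) override;
  // Deferred free used by EigenGpuStreamDevice's deallocate callback.
  void DeallocateRawAsync(void* ptr) override;
};

Note that, per the warnings above, the tensorpool path opts out of both the GPUVMemAllocator host-memory backup and the timestamped_allocator timing counter.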
