Skip to content

Commit

Permalink
[CUDA Host Allocator] Add support of CudaHostRegister (pytorch#108488)
Browse files Browse the repository at this point in the history
Summary:

This diff adds another option for creating CUDA pinned memory, using cudaHostRegister, to avoid the large lock wait time incurred by cudaHostAlloc.

Differential Revision: D45843715
  • Loading branch information
banitag1 authored and facebook-github-bot committed Oct 5, 2023
1 parent a8653f3 commit 8e49959
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 6 deletions.
98 changes: 94 additions & 4 deletions aten/src/ATen/cuda/CachingHostAllocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/detail/CUDAHooks.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <c10/core/thread_pool.h>
#include <c10/cuda/CUDAAllocatorConfig.h>

#include <cuda_runtime_api.h>
#include <stdint.h>
Expand Down Expand Up @@ -174,11 +176,18 @@ class CUDAHostAllocator {
}

// Round up the allocation to the nearest power of two to improve reuse.
size_t roundSize = c10::llvm::PowerOf2Ceil(size);
void* ptr = nullptr;
C10_CUDA_CHECK(cudaHostAlloc(
&ptr, c10::llvm::PowerOf2Ceil(size), cudaHostAllocDefault));
if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
pinned_use_cuda_host_register()) {
allocWithCudaHostRegister(&ptr, roundSize);
} else {
// Use cudaHostAlloc for allocating pinned memory (global lock in driver)
C10_CUDA_CHECK(cudaHostAlloc(&ptr, roundSize, cudaHostAllocDefault));
}

auto block = new Block();
block->size_ = c10::llvm::PowerOf2Ceil(size);
block->size_ = roundSize;
block->ptr_ = ptr;
block->allocated_ = true;

Expand Down Expand Up @@ -279,7 +288,14 @@ class CUDAHostAllocator {
for (auto* block : blocks_to_remove) {
blocks_.erase(block);
ptr_to_block_.erase(block->ptr_);
AT_CUDA_CHECK(cudaFreeHost(block->ptr_));
if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
pinned_use_cuda_host_register()) {
void* ptr = block->ptr_;
AT_CUDA_CHECK(cudaHostUnregister(ptr));
free(ptr);
} else {
AT_CUDA_CHECK(cudaFreeHost(block->ptr_));
}
delete block;
}
}
Expand Down Expand Up @@ -343,6 +359,80 @@ class CUDAHostAllocator {
}
}

TaskThreadPool* getThreadPool() {
  // Process-wide pool, created lazily on first use and intentionally never
  // destroyed. It is sized by the hard upper bound on page-registration
  // threads rather than the configured count, so one pool serves any
  // pinned_num_register_threads setting.
  static TaskThreadPool* instance = new TaskThreadPool(
      c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
          pinned_max_register_threads());
  return instance;
}

// Pre-fault one thread's slice of [ptr, ptr + size) by writing a byte into
// each page, so that the later cudaHostRegister call finds the pages already
// resident and spends less time under the CUDA global lock.
//
// The buffer is split into numThreads equal chunks; task `i` touches chunk
// `i`, and the last task also absorbs the division remainder. pageSize must
// be a power of two (it is used as an alignment mask below).
void mapPagesForRegister(
const void* ptr,
size_t size,
size_t i,
size_t numThreads,
size_t pageSize) {
uintptr_t start = (uintptr_t)ptr + (size * i / numThreads);
uintptr_t end = (uintptr_t)start + (size / numThreads);
// Last chunk extends to the true end of the buffer to cover any remainder.
if (i == (numThreads - 1)) {
end = (uintptr_t)ptr + size;
}

// pre-fault/map the pages by setting the first byte of the page
// NOTE(review): the loop starts at the page boundary rounded UP from
// `start`, so a page straddling two chunks is touched by the earlier chunk,
// and the very first page is skipped if `ptr` is not page-aligned — confirm
// that is acceptable for the register path.
uintptr_t alignedStart =
(((uintptr_t)start + pageSize - 1) & ~(pageSize - 1));
for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) {
memset((void*)p, 0, 1);
}
}

// Pin an already-mapped host range with cudaHostRegister and verify that,
// under UVA, the device pointer aliases the host pointer. The allocator
// hands the host pointer straight to CUDA kernels, so a mismatched mapping
// would be silently wrong — fail loudly instead and tell the user how to
// turn this path off.
void registerPages(const void* ptr, size_t size) {
  AT_CUDA_CHECK(
      cudaHostRegister((void*)ptr, (size_t)size, cudaHostRegisterDefault));

  // If host and device pointer don't match, give a warning and exit
  void* devptr = nullptr;
  AT_CUDA_CHECK(cudaHostGetDevicePointer(&devptr, (void*)ptr, 0));
  TORCH_CHECK(
      (void*)devptr == (void*)ptr,
      "Host and device pointer don't match with cudaHostRegister. "
      // Fixed message: the option is parsed from PYTORCH_CUDA_ALLOC_CONF as
      // `pinned_use_cuda_host_register` (see CUDAAllocatorConfig::parseArgs),
      // not `PYTORCH_PINNED_ALLOC_CONF=use_cuda_host_register`.
      "Please don't use this feature by setting "
      "PYTORCH_CUDA_ALLOC_CONF=pinned_use_cuda_host_register:False (default)",
      "");
}

inline void allocWithCudaHostRegister(void** ptr, size_t roundSize) {
// Here we do regular allocation, pre-fault/map the pages, and then do
// cudaHostRegister with GPU mapping flags to lock the pages, so we
// can minimize the cost for the cuda global lock.
*ptr = malloc(roundSize);

// Parallelize the mapping/registering of pages to reduce wall time
size_t pageSize = (1 << 12); // 4kB pages
size_t numMapThreads = c10::cuda::CUDACachingAllocator::
CUDAAllocatorConfig::pinned_num_register_threads();
if ((numMapThreads > 1) && (roundSize >= (pageSize * numMapThreads))) {
auto* pool = getThreadPool();
for (size_t i = 0; i < numMapThreads; i++) {
pool->run(std::bind(
&CUDAHostAllocator::mapPagesForRegister,
this,
*ptr,
roundSize,
i, // thread task-id
numMapThreads,
pageSize));
}
pool->waitWorkComplete();
} else {
// Map pages in the same thread
mapPagesForRegister(*ptr, roundSize, 0, 1, pageSize);
}

// Register the mapped pages using cudaHostRegister
registerPages(*ptr, roundSize);
}

EventPool event_pool_;

alignas(64) std::mutex blocks_mutex_;
Expand Down
50 changes: 49 additions & 1 deletion c10/cuda/CUDAAllocatorConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
CUDAAllocatorConfig::CUDAAllocatorConfig()
: m_max_split_size(std::numeric_limits<size_t>::max()),
m_garbage_collection_threshold(0),
m_pinned_num_register_threads(1),
m_expandable_segments(false),
m_release_lock_on_cudamalloc(false) {
m_release_lock_on_cudamalloc(false),
m_pinned_use_cuda_host_register(false) {
m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
}

Expand Down Expand Up @@ -270,6 +272,12 @@ void CUDAAllocatorConfig::parseArgs(const char* env) {
i < config.size() && (config[i] == "True" || config[i] == "False"),
"Expected a single True/False argument for release_lock_on_cudamalloc");
m_release_lock_on_cudamalloc = (config[i] == "True");
} else if (config[i].compare("pinned_use_cuda_host_register") == 0) {
i = parsePinnedUseCudaHostRegister(config, i);
used_native_specific_option = true;
} else if (config[i].compare("pinned_num_register_threads") == 0) {
i = parsePinnedNumRegisterThreads(config, i);
used_native_specific_option = true;
} else {
TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", config[i]);
}
Expand All @@ -286,6 +294,46 @@ void CUDAAllocatorConfig::parseArgs(const char* env) {
}
}

// Parse `pinned_use_cuda_host_register:<True|False>` starting at token i
// (the option name). Stores the parsed flag and returns the index of the
// last token consumed.
size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister(
    const std::vector<std::string>& config,
    size_t i) {
  consumeToken(config, ++i, ':');
  ++i;
  if (i >= config.size()) {
    TORCH_CHECK(
        false, "Error, expecting pinned_use_cuda_host_register value", "");
  }
  const std::string& value = config[i];
  TORCH_CHECK(
      (value == "True" || value == "False"),
      "Expected a single True/False argument for pinned_use_cuda_host_register");
  m_pinned_use_cuda_host_register = (value == "True");
  return i;
}

// Parse `pinned_num_register_threads:<count>` starting at token i (the
// option name). The count must be a power of two no greater than
// pinned_max_register_threads(). Returns the index of the last token
// consumed.
size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
    const std::vector<std::string>& config,
    size_t i) {
  consumeToken(config, ++i, ':');
  ++i;
  if (i >= config.size()) {
    TORCH_CHECK(
        false, "Error, expecting pinned_num_register_threads value", "");
  }
  size_t numThreads = stoi(config[i]);
  TORCH_CHECK(
      llvm::isPowerOf2_64(numThreads),
      "Number of register threads has to be power of 2 ",
      "");
  auto maxThreads = CUDAAllocatorConfig::pinned_max_register_threads();
  TORCH_CHECK(
      numThreads <= maxThreads,
      "Number of register threads should be less than or equal to " +
          std::to_string(maxThreads),
      "");
  m_pinned_num_register_threads = numThreads;
  return i;
}

// General caching allocator utilities
void setAllocatorSettings(const std::string& env) {
CUDACachingAllocator::CUDAAllocatorConfig::instance().parseArgs(env.c_str());
Expand Down
26 changes: 25 additions & 1 deletion c10/cuda/CUDAAllocatorConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace cuda {
namespace CUDACachingAllocator {

// Environment config parser
class CUDAAllocatorConfig {
class C10_CUDA_API CUDAAllocatorConfig {
public:
static size_t max_split_size() {
return instance().m_max_split_size;
Expand All @@ -39,6 +39,22 @@ class CUDAAllocatorConfig {
return instance().m_release_lock_on_cudamalloc;
}

/** Pinned memory allocator settings */
// True when pinned host memory should be obtained via malloc +
// cudaHostRegister instead of cudaHostAlloc. Defaults to false; toggled by
// the `pinned_use_cuda_host_register` allocator-config option.
static bool pinned_use_cuda_host_register() {
return instance().m_pinned_use_cuda_host_register;
}

// Number of threads used to pre-fault pages before cudaHostRegister.
// Only meaningful when pinned_use_cuda_host_register() is true; defaults
// to 1 and is capped at pinned_max_register_threads().
static size_t pinned_num_register_threads() {
return instance().m_pinned_num_register_threads;
}

// Hard upper bound for pinned_num_register_threads.
static size_t pinned_max_register_threads() {
// Based on the benchmark results, we see better allocation performance
// with 8 threads. However, future systems may need more threads, so we
// allow configuration up to a hard cap of 128 threads.
return 128;
}

// This is used to round-up allocation size to nearest power of 2 divisions.
// More description below in function roundup_power2_next_division
// As ane example, if we want 4 divisions between 2's power, this can be done
Expand Down Expand Up @@ -76,12 +92,20 @@ class CUDAAllocatorConfig {
const std::vector<std::string>& config,
size_t i,
bool& used_cudaMallocAsync);
size_t parsePinnedUseCudaHostRegister(
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
size_t i);

std::atomic<size_t> m_max_split_size;
std::vector<size_t> m_roundup_power2_divisions;
std::atomic<double> m_garbage_collection_threshold;
std::atomic<size_t> m_pinned_num_register_threads;
std::atomic<bool> m_expandable_segments;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
};

// General caching allocator utilities
Expand Down
12 changes: 12 additions & 0 deletions docs/source/notes/cuda.rst
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,18 @@ Available options:
appended to the end of the segment. This process does not create as many slivers
of unusable memory, so it is more likely to succeed at finding this memory.

`pinned_use_cuda_host_register` option is a boolean flag that determines whether to
use the CUDA API's cudaHostRegister function for allocating pinned memory instead
of the default cudaHostAlloc. When set to True, the memory is allocated using regular
malloc and then pages are mapped to the memory before calling cudaHostRegister.
This pre-mapping of pages helps reduce the lock time during the execution
of cudaHostRegister.

`pinned_num_register_threads` option is only valid when pinned_use_cuda_host_register
is set to True. By default, one thread is used to map the pages. This option allows
using more threads to parallelize the page mapping operations to reduce the overall
allocation time of pinned memory. A good value for this option is 8 based on
benchmarking results.

.. note::

Expand Down
7 changes: 7 additions & 0 deletions test/test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,13 @@ def tearDown(self):
del self.autocast_lists
super().tearDown()

def test_pinned_memory_with_cudaregister(self):
    # Route pin_memory() through the malloc + cudaHostRegister path with 8
    # page-mapping threads instead of the default cudaHostAlloc path.
    # Save and restore the env var so the config does not leak into every
    # subsequent test in this process.
    prev = os.environ.get("PYTORCH_CUDA_ALLOC_CONF")
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
        "pinned_use_cuda_host_register:True,pinned_num_register_threads:8"
    )
    try:
        t = torch.ones(20)
        self.assertFalse(t.is_pinned())
        pinned_t = torch.ones(1 << 16).pin_memory()
        self.assertTrue(pinned_t.is_pinned())
    finally:
        if prev is None:
            os.environ.pop("PYTORCH_CUDA_ALLOC_CONF", None)
        else:
            os.environ["PYTORCH_CUDA_ALLOC_CONF"] = prev

def test_cudart_register(self):
t = torch.ones(20)
self.assertFalse(t.is_pinned())
Expand Down
1 change: 1 addition & 0 deletions torch/utils/hipify/cuda_to_hip_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8610,6 +8610,7 @@
("CUDACachingAllocator", ("HIPCachingAllocator", API_C10)),
("cuda::CUDAAllocatorConfig", ("hip::HIPAllocatorConfig", API_C10)),
("CUDAAllocatorConfig", ("HIPAllocatorConfig", API_C10)),
("pinned_use_cuda_host_register", ("pinned_use_hip_host_register", API_C10)),
("c10::cuda::CUDAAllocator", ("c10::hip::HIPAllocator", API_C10)),
("cuda::CUDAAllocator", ("hip::HIPAllocator", API_C10)),
("CUDAAllocator", ("HIPAllocator", API_C10)),
Expand Down

0 comments on commit 8e49959

Please sign in to comment.