I wrote a test program roughly modeled on the C++ example: it transfers GPU-resident memory between two GPUs in the same node, testing various message sizes and repeating each one several times to take the average. The measured bandwidth is only about 0.3 GB/s.
When I set the network interfaces with
export UCX_NET_DEVICES="bond1,bond2,bond3,bond4,bond5,bond6,bond7,bond8,mlx5_bond_1:1,mlx5_bond_4:1,mlx5_bond_3:1,mlx5_bond_2:1,mlx5_bond_7:1,mlx5_bond_6:1,mlx5_bond_8:1,mlx5_bond_5:1"
the bandwidth only reaches around 40 GB/s, which seems to indicate that the traffic is being routed through the NICs rather than going directly to the target GPU.
However, my setup involves intra-node GPU communication with NVLink support. I’d like to know how to make it use NVLink directly instead of the network interfaces.
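My understanding is that UCX selects transports via UCX_TLS, so I would expect something like the following to force the NVLink (cuda_ipc) path for intra-node transfers; the transport list here is my guess, and I have not confirmed it is the intended way to do this with NIXL:

export UCX_TLS=cuda_copy,cuda_ipc,tcp

Is that the right approach, or is there a NIXL-level option I should be using instead?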
I also tried interfaces like createGpuXferReq, but they don’t seem to be supported. The full test program is below.
#include <iostream>
#include <cassert>
#include <cstring>
#include <cstdio>
#include <string>
#include <vector>
#include <algorithm>
#include <chrono>
#include <cuda_runtime.h>
#include "nixl.h"
// Helper function to check CUDA errors
void checkCuda(cudaError_t result, const char* msg) {
if (result != cudaSuccess) {
std::cerr << "CUDA Error: " << msg << " : " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
// Check if the buffer is filled with a specific value
void check_device_buf(uint8_t* d_buf, size_t len, uint8_t expected) {
uint8_t* h_buf = (uint8_t*)malloc(len);
checkCuda(cudaMemcpy(h_buf, d_buf, len, cudaMemcpyDeviceToHost), "Memcpy device to host");
for (size_t i = 0; i < len; ++i) {
assert(h_buf[i] == expected);
}
free(h_buf);
}
// Function to measure transfer speed for a given size, repeat n times and return average GB/s
double measure_transfer_speed(
nixlAgent& A1, nixlAgent& A2,
nixlBackendH* ucx1, nixlBackendH* ucx2,
uint8_t* d_addr1, uint8_t* d_addr2,
size_t len, int repeat, int dev_src, int dev_dst,
const std::string& agent1, const std::string& agent2)
{
// Register memory for this size
nixl_opt_args_t extra_params1, extra_params2;
extra_params1.backends.push_back(ucx1);
extra_params2.backends.push_back(ucx2);
nixlBlobDesc buff1, buff2;
nixl_reg_dlist_t dlist1(VRAM_SEG), dlist2(VRAM_SEG);
buff1.addr = (uintptr_t)d_addr1;
buff1.len = len;
buff1.devId = dev_src;
dlist1.addDesc(buff1);
buff2.addr = (uintptr_t)d_addr2;
buff2.len = len;
buff2.devId = dev_dst;
dlist2.addDesc(buff2);
// Verify pointers are valid
if (d_addr1 == nullptr || d_addr2 == nullptr) {
std::cerr << "Error: Device pointers are null! d_addr1=" << d_addr1
<< ", d_addr2=" << d_addr2 << std::endl;
return -1.0;
}
std::cout << "Registering memory: len=" << len
<< ", d_addr1=" << (void*)d_addr1
<< ", d_addr2=" << (void*)d_addr2 << std::endl;
// Set CUDA device context before registering memory
checkCuda(cudaSetDevice(dev_src), "Set device for registration");
nixl_status_t ret1, ret2;
ret1 = A1.registerMem(dlist1, &extra_params1);
ret2 = A2.registerMem(dlist2, &extra_params2);
assert(ret1 == NIXL_SUCCESS);
assert(ret2 == NIXL_SUCCESS);
// Exchange metadata
std::string meta1, meta2, ret_s1;
ret1 = A1.getLocalMD(meta1);
ret2 = A2.getLocalMD(meta2);
assert(ret1 == NIXL_SUCCESS);
assert(ret2 == NIXL_SUCCESS);
ret1 = A1.loadRemoteMD(meta2, ret_s1);
assert(ret1 == NIXL_SUCCESS);
// Prepare transfer: copy whole buffer
size_t req_size = len;
size_t src_offset = 0;
size_t dst_offset = 0;
nixl_xfer_dlist_t req_src_descs(VRAM_SEG);
nixlBasicDesc req_src;
req_src.addr = (uintptr_t)(((uint8_t*)d_addr1) + src_offset);
req_src.len = req_size;
req_src.devId = dev_src;
req_src_descs.addDesc(req_src);
nixl_xfer_dlist_t req_dst_descs(VRAM_SEG);
nixlBasicDesc req_dst;
req_dst.addr = (uintptr_t)(((uint8_t*)d_addr2) + dst_offset);
req_dst.len = req_size;
req_dst.devId = dev_dst;
req_dst_descs.addDesc(req_dst);
double total_ms = 0.0;
for (int i = 0; i < repeat; ++i) {
// Reset src and dst buffer
checkCuda(cudaSetDevice(dev_src), "Set device for memset src");
checkCuda(cudaMemset(d_addr1, 0xbb, len), "Memset src");
checkCuda(cudaSetDevice(dev_dst), "Set device for memset dst");
checkCuda(cudaMemset(d_addr2, 0, len), "Memset dst");
nixlXferReqH* req_handle;
// nixlGpuXferReqH gpu_req_handle;
nixl_opt_args_t xfer_params = extra_params1;
// xfer_params.notifMsg = "gpu_notification";
// xfer_params.hasNotif = true;
ret1 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_dst_descs, agent2, req_handle, &xfer_params);
// ret1 = A1.createGpuXferReq(*req_handle, gpu_req_handle);
// std::cout << "Xfer request created, status: " << nixlEnumStrings::statusStr(ret1) << std::endl;
assert(ret1 == NIXL_SUCCESS);
// Timing is measured on CPU since the transfer is not on a CUDA stream.
auto t_start = std::chrono::high_resolution_clock::now();
nixl_status_t status = A1.postXferReq(req_handle);
// nixl_notifs_t notif_map;
// int n_notifs = 0;
while (status != NIXL_SUCCESS) {
status = A1.getXferStatus(req_handle);
assert(status >= 0);
}
auto t_end = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t_end - t_start).count();
total_ms += ms;
// Check that the destination buffer has the expected value in the transferred region
checkCuda(cudaSetDevice(dev_dst), "Set device for check");
uint8_t* check_ptr = d_addr2 + dst_offset;
check_device_buf(check_ptr, req_size, 0xbb);
// Cleanup transfer request
ret1 = A1.releaseXferReq(req_handle);
assert(ret1 == NIXL_SUCCESS);
}
// Cleanup
ret1 = A1.deregisterMem(dlist1, &extra_params1);
ret2 = A2.deregisterMem(dlist2, &extra_params2);
assert(ret1 == NIXL_SUCCESS);
assert(ret2 == NIXL_SUCCESS);
ret1 = A1.invalidateRemoteMD(agent2);
assert(ret1 == NIXL_SUCCESS);
// Return average GB/s
double avg_ms = total_ms / repeat;
double gb = double(len) / (1024.0 * 1024.0 * 1024.0);
double gbps = gb / (avg_ms / 1000.0);
return gbps;
}
int main() {
// Agent names
std::string agent1("GPUAgent1");
std::string agent2("GPUAgent2");
// Set CUDA device 0 for agent1, device 1 for agent2
int dev_count = 0;
checkCuda(cudaGetDeviceCount(&dev_count), "Get device count");
assert(dev_count >= 2); // Need at least 2 GPUs
// Allocate device memory on two GPUs, large enough for the largest test size
size_t max_len = 1ULL << 32; // 4 GiB
uint8_t* d_addr1 = nullptr;
uint8_t* d_addr2 = nullptr;
checkCuda(cudaSetDevice(0), "Set device 0");
checkCuda(cudaMalloc(&d_addr1, max_len), "Malloc device 0");
checkCuda(cudaSetDevice(1), "Set device 1");
checkCuda(cudaMalloc(&d_addr2, max_len), "Malloc device 1");
// NIXL setup
nixlAgentConfig cfg(true);
nixl_b_params_t init1, init2;
nixl_mem_list_t mems1, mems2;
nixlAgent A1(agent1, cfg);
nixlAgent A2(agent2, cfg);
std::vector<nixl_backend_t> plugins;
nixl_status_t ret1, ret2;
ret1 = A1.getAvailPlugins(plugins);
assert(ret1 == NIXL_SUCCESS);
// Use UCX backend for GPU memory
ret1 = A1.getPluginParams("UCX", mems1, init1);
ret2 = A2.getPluginParams("UCX", mems2, init2);
assert(ret1 == NIXL_SUCCESS);
assert(ret2 == NIXL_SUCCESS);
nixlBackendH* ucx1, *ucx2;
ret1 = A1.createBackend("UCX", init1, ucx1);
ret2 = A2.createBackend("UCX", init2, ucx2);
assert(ret1 == NIXL_SUCCESS);
assert(ret2 == NIXL_SUCCESS);
// Test sizes: 8 B, 16 B, ..., doubling up to 4 GiB
std::vector<size_t> sizes;
for (size_t sz = 8; sz <= max_len; sz *= 2) {
sizes.push_back(sz);
}
std::cout << "Size(Bytes), Avg GB/s" << std::endl;
// Print size in human-readable units (B, KB, MB, GB)
auto human_readable_size = [](size_t sz) -> std::string {
char buf[32];
if (sz >= (1ULL << 30)) {
snprintf(buf, sizeof(buf), "%.2f GB", sz / double(1ULL << 30));
} else if (sz >= (1ULL << 20)) {
snprintf(buf, sizeof(buf), "%.2f MB", sz / double(1ULL << 20));
} else if (sz >= (1ULL << 10)) {
snprintf(buf, sizeof(buf), "%.2f KB", sz / double(1ULL << 10));
} else {
snprintf(buf, sizeof(buf), "%zu B", sz);
}
return std::string(buf);
};
for (size_t sz : sizes) {
int repeat = std::min(100,std::max(1, (int)((1024LL * 1024 * 1024) / sz)));
double gbps = measure_transfer_speed(
A1, A2, ucx1, ucx2, d_addr1, d_addr2, sz, repeat, 0, 1, agent1, agent2
);
std::cout << human_readable_size(sz) << ", " << gbps << " GB/s" << std::endl;
// break;
}
checkCuda(cudaSetDevice(0), "Set device 0 for free");
cudaFree(d_addr1);
checkCuda(cudaSetDevice(1), "Set device 1 for free");
cudaFree(d_addr2);
std::cout << "GPU-to-GPU bandwidth test done." << std::endl;
return 0;
}
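For completeness, I build the test roughly like this (the file name, include/library paths, and library names are assumptions for my environment, so adjust as needed):

g++ -std=c++17 -O2 nixl_gpu_bw_test.cpp -I${NIXL_PREFIX}/include -L${NIXL_PREFIX}/lib -lnixl -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudart -o nixl_gpu_bw_test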
Output:
16.00 KB, 0.138126 GB/s
Registering memory: len=32768, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
32.00 KB, 0.194278 GB/s
Registering memory: len=65536, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
64.00 KB, 0.243451 GB/s
Registering memory: len=131072, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
128.00 KB, 0.270097 GB/s
Registering memory: len=262144, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
256.00 KB, 0.291732 GB/s
Registering memory: len=524288, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
512.00 KB, 0.30523 GB/s
Registering memory: len=1048576, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
1.00 MB, 0.310145 GB/s
Registering memory: len=2097152, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
2.00 MB, 0.320992 GB/s
Registering memory: len=4194304, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
4.00 MB, 0.322928 GB/s
Registering memory: len=8388608, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
8.00 MB, 0.282562 GB/s
Registering memory: len=16777216, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
16.00 MB, 0.266396 GB/s
Registering memory: len=33554432, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
32.00 MB, 0.270343 GB/s
Registering memory: len=67108864, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
64.00 MB, 0.26952 GB/s
nvidia-smi nvlink --status
GPU 0: NVIDIA H20 (UUID: GPU-31641892-7df4-6434-99c4-d81fe5a57930)
Link 0: 26.562 GB/s
Link 1: 26.562 GB/s
Link 2: 26.562 GB/s
Link 3: 26.562 GB/s
Link 4: 26.562 GB/s
Link 5: 26.562 GB/s
Link 6: 26.562 GB/s
Link 7: 26.562 GB/s
Link 8: 26.562 GB/s
Link 9: 26.562 GB/s
Link 10: 26.562 GB/s
Link 11: 26.562 GB/s
Link 12: 26.562 GB/s
Link 13: 26.562 GB/s
Link 14: 26.562 GB/s
Link 15: 26.562 GB/s
Link 16: 26.562 GB/s
Link 17: 26.562 GB/s
GPU 1: NVIDIA H20 (UUID: GPU-65fe466d-80f5-84c9-714c-114170c0bb1e)
Link 0: 26.562 GB/s
Link 1: 26.562 GB/s
Link 2: 26.562 GB/s
Link 3: 26.562 GB/s
Link 4: 26.562 GB/s
Link 5: 26.562 GB/s
Link 6: 26.562 GB/s
Link 7: 26.562 GB/s
Link 8: 26.562 GB/s
Link 9: 26.562 GB/s
Link 10: 26.562 GB/s
Link 11: 26.562 GB/s
Link 12: 26.562 GB/s
Link 13: 26.562 GB/s
Link 14: 26.562 GB/s
Link 15: 26.562 GB/s
Link 16: 26.562 GB/s
Link 17: 26.562 GB/s
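For reference, 18 links at 26.562 GB/s each works out to roughly 478 GB/s of aggregate NVLink bandwidth per GPU, so both the ~0.3 GB/s default result and the ~40 GB/s result with UCX_NET_DEVICES set are far below what the NVLink fabric should be able to deliver.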