
Inter-GPU communication is very slow; how can I correctly implement inter-GPU communication in C++? #864

@njw1123

Description


I wrote a test program roughly modeled after the C++ example — it transfers memory located on the GPU and tests various message sizes multiple times to take the average — but the performance is only about 0.3 GB/s.

When I set the network interfaces with

export UCX_NET_DEVICES="bond1,bond2,bond3,bond4,bond5,bond6,bond7,bond8,mlx5_bond_1:1,mlx5_bond_4:1,mlx5_bond_3:1,mlx5_bond_2:1,mlx5_bond_7:1,mlx5_bond_6:1,mlx5_bond_8:1,mlx5_bond_5:1"

the bandwidth reaches around 40 GB/s, which seems to indicate that the traffic is being routed through the NICs rather than going directly to the target GPU.
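
To check whether the CUDA IPC transport is even available to UCX on this node, I believe ucx_info (which ships with UCX itself, not NIXL) can list the transports and devices it sees, for example:

ucx_info -d | grep -i cuda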

However, my setup involves intra-node GPU communication with NVLink support. I’d like to know how to make it use NVLink directly instead of the network interfaces.
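
From the UCX documentation, my understanding is that the transport selection can be restricted so that the intra-node cuda_ipc path (which should use NVLink peer mappings when available) is preferred, e.g.

export UCX_TLS=cuda_ipc,cuda_copy,rc,tcp

but I'm not sure whether setting environment variables is the intended way to configure this through NIXL, or whether the agent config / backend params should be used instead.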

I also tried interfaces like createGpuXferReq, but they don’t seem to be supported.
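
As a separate sanity check (not part of the test program below), here is a minimal standalone snippet using only the CUDA runtime to confirm that the two GPUs report peer access to each other, which is what a direct NVLink path would rely on:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int can01 = 0, can10 = 0;
    // Query whether each GPU can map the other GPU's memory (P2P/NVLink capability).
    cudaDeviceCanAccessPeer(&can01, 0, 1);
    cudaDeviceCanAccessPeer(&can10, 1, 0);
    std::printf("GPU0 -> GPU1 peer access: %d\n", can01);
    std::printf("GPU1 -> GPU0 peer access: %d\n", can10);
    return 0;
}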



#include <iostream>
#include <cassert>
#include <cstring>
#include <cstdint>
#include <string>
#include <vector>
#include <chrono>
#include <algorithm>
#include <cuda_runtime.h>
#include "nixl.h"

// Helper function to check CUDA errors
void checkCuda(cudaError_t result, const char* msg) {
    if (result != cudaSuccess) {
        std::cerr << "CUDA Error: " << msg << " : " << cudaGetErrorString(result) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Check if the buffer is filled with a specific value
void check_device_buf(uint8_t* d_buf, size_t len, uint8_t expected) {
    uint8_t* h_buf = (uint8_t*)malloc(len);
    checkCuda(cudaMemcpy(h_buf, d_buf, len, cudaMemcpyDeviceToHost), "Memcpy device to host");
    for (size_t i = 0; i < len; ++i) {
        assert(h_buf[i] == expected);
    }
    free(h_buf);
}

// Function to measure transfer speed for a given size, repeat n times and return average GB/s
double measure_transfer_speed(
    nixlAgent& A1, nixlAgent& A2,
    nixlBackendH* ucx1, nixlBackendH* ucx2,
    uint8_t* d_addr1, uint8_t* d_addr2,
    size_t len, int repeat, int dev_src, int dev_dst,
    const std::string& agent1, const std::string& agent2)
{
    // Register memory for this size
    nixl_opt_args_t extra_params1, extra_params2;
    extra_params1.backends.push_back(ucx1);
    extra_params2.backends.push_back(ucx2);

    nixlBlobDesc buff1, buff2;
    nixl_reg_dlist_t dlist1(VRAM_SEG), dlist2(VRAM_SEG);

    buff1.addr = (uintptr_t)d_addr1;
    buff1.len = len;
    buff1.devId = dev_src;
    dlist1.addDesc(buff1);

    buff2.addr = (uintptr_t)d_addr2;
    buff2.len = len;
    buff2.devId = dev_dst;
    dlist2.addDesc(buff2);

    // Verify pointers are valid
    if (d_addr1 == nullptr || d_addr2 == nullptr) {
        std::cerr << "Error: Device pointers are null! d_addr1=" << (void*)d_addr1
                  << ", d_addr2=" << (void*)d_addr2 << std::endl;
        return -1.0;
    }
    
    std::cout << "Registering memory: len=" << len 
              << ", d_addr1=" << (void*)d_addr1 
              << ", d_addr2=" << (void*)d_addr2 << std::endl;

    // Set CUDA device context before registering memory
    checkCuda(cudaSetDevice(dev_src), "Set device for registration");

    nixl_status_t ret1, ret2;
    ret1 = A1.registerMem(dlist1, &extra_params1);
    ret2 = A2.registerMem(dlist2, &extra_params2);
    assert(ret1 == NIXL_SUCCESS);
    assert(ret2 == NIXL_SUCCESS);

    // Exchange metadata
    std::string meta1, meta2, ret_s1;
    ret1 = A1.getLocalMD(meta1);
    ret2 = A2.getLocalMD(meta2);
    assert(ret1 == NIXL_SUCCESS);
    assert(ret2 == NIXL_SUCCESS);

    ret1 = A1.loadRemoteMD(meta2, ret_s1);
    assert(ret1 == NIXL_SUCCESS);

    // Prepare transfer: copy whole buffer
    size_t req_size = len;
    size_t src_offset = 0;
    size_t dst_offset = 0;

    nixl_xfer_dlist_t req_src_descs(VRAM_SEG);
    nixlBasicDesc req_src;
    req_src.addr = (uintptr_t)(((uint8_t*)d_addr1) + src_offset);
    req_src.len = req_size;
    req_src.devId = dev_src;
    req_src_descs.addDesc(req_src);

    nixl_xfer_dlist_t req_dst_descs(VRAM_SEG);
    nixlBasicDesc req_dst;
    req_dst.addr = (uintptr_t)(((uint8_t*)d_addr2) + dst_offset);
    req_dst.len = req_size;
    req_dst.devId = dev_dst;
    req_dst_descs.addDesc(req_dst);


    double total_ms = 0.0;

    for (int i = 0; i < repeat; ++i) {
        // Reset src and dst buffer
        checkCuda(cudaSetDevice(dev_src), "Set device for memset src");
        checkCuda(cudaMemset(d_addr1, 0xbb, len), "Memset src");
        checkCuda(cudaSetDevice(dev_dst), "Set device for memset dst");
        checkCuda(cudaMemset(d_addr2, 0, len), "Memset dst");

        nixlXferReqH* req_handle;
        // nixlGpuXferReqH gpu_req_handle;
        nixl_opt_args_t xfer_params = extra_params1;
        // xfer_params.notifMsg = "gpu_notification";
        // xfer_params.hasNotif = true;
        ret1 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_dst_descs, agent2, req_handle, &xfer_params);
        // ret1 = A1.createGpuXferReq(*req_handle, gpu_req_handle);
        // std::cout << "Xfer request created, status: " << nixlEnumStrings::statusStr(ret1) << std::endl;
        assert(ret1 == NIXL_SUCCESS);

        // Timing is measured on CPU since the transfer is not on a CUDA stream.
        auto t_start = std::chrono::high_resolution_clock::now();

        nixl_status_t status = A1.postXferReq(req_handle);
        // nixl_notifs_t notif_map;
        // int n_notifs = 0;
        
        while (status != NIXL_SUCCESS) {
            status = A1.getXferStatus(req_handle);
            assert(status >= 0);
        }

        auto t_end = std::chrono::high_resolution_clock::now();
        double ms = std::chrono::duration<double, std::milli>(t_end - t_start).count();
        total_ms += ms;


        // Check that the destination buffer has the expected value in the transferred region
        checkCuda(cudaSetDevice(dev_dst), "Set device for check");
        uint8_t* check_ptr = d_addr2 + dst_offset;
        check_device_buf(check_ptr, req_size, 0xbb);

        // Cleanup transfer request
        ret1 = A1.releaseXferReq(req_handle);
        assert(ret1 == NIXL_SUCCESS);
    }

    // Cleanup
    ret1 = A1.deregisterMem(dlist1, &extra_params1);
    ret2 = A2.deregisterMem(dlist2, &extra_params2);
    assert(ret1 == NIXL_SUCCESS);
    assert(ret2 == NIXL_SUCCESS);

    ret1 = A1.invalidateRemoteMD(agent2);
    assert(ret1 == NIXL_SUCCESS);

    // Return average GB/s
    double avg_ms = total_ms / repeat;
    double gb = double(len) / (1024.0 * 1024.0 * 1024.0);
    double gbps = gb / (avg_ms / 1000.0);
    return gbps;
}

int main() {
    // Agent names
    std::string agent1("GPUAgent1");
    std::string agent2("GPUAgent2");

    // Set CUDA device 0 for agent1, device 1 for agent2
    int dev_count = 0;
    checkCuda(cudaGetDeviceCount(&dev_count), "Get device count");
    assert(dev_count >= 2); // Need at least 2 GPUs

    // Allocate device memory on two GPUs, max size 4 GiB
    size_t max_len = 1ULL << 32; // 4 GiB
    uint8_t* d_addr1 = nullptr;
    uint8_t* d_addr2 = nullptr;

    checkCuda(cudaSetDevice(0), "Set device 0");
    checkCuda(cudaMalloc(&d_addr1, max_len), "Malloc device 0");
    checkCuda(cudaSetDevice(1), "Set device 1");
    checkCuda(cudaMalloc(&d_addr2, max_len), "Malloc device 1");

    // NIXL setup
    nixlAgentConfig cfg(true);
    nixl_b_params_t init1, init2;
    nixl_mem_list_t mems1, mems2;

    nixlAgent A1(agent1, cfg);
    nixlAgent A2(agent2, cfg);

    std::vector<nixl_backend_t> plugins;
    nixl_status_t ret1, ret2;
    ret1 = A1.getAvailPlugins(plugins);
    assert(ret1 == NIXL_SUCCESS);

    // Use UCX backend for GPU memory
    ret1 = A1.getPluginParams("UCX", mems1, init1);
    ret2 = A2.getPluginParams("UCX", mems2, init2);
    assert(ret1 == NIXL_SUCCESS);
    assert(ret2 == NIXL_SUCCESS);

    nixlBackendH* ucx1, *ucx2;
    ret1 = A1.createBackend("UCX", init1, ucx1);
    ret2 = A2.createBackend("UCX", init2, ucx2);
    assert(ret1 == NIXL_SUCCESS);
    assert(ret2 == NIXL_SUCCESS);

    // Test sizes: 8 B, 16 B, 32 B, ..., doubling up to 4 GiB
    std::vector<size_t> sizes;
    for (size_t sz = 8; sz <= (1ULL << 32); sz *= 2) {
        sizes.push_back(sz);
    }

    std::cout << "Size(Bytes), Avg GB/s" << std::endl;
    // Print size in human-readable units (B, KB, MB, GB)
    auto human_readable_size = [](size_t sz) -> std::string {
        char buf[32];
        if (sz >= (1ULL << 30)) {
            snprintf(buf, sizeof(buf), "%.2f GB", sz / double(1ULL << 30));
        } else if (sz >= (1ULL << 20)) {
            snprintf(buf, sizeof(buf), "%.2f MB", sz / double(1ULL << 20));
        } else if (sz >= (1ULL << 10)) {
            snprintf(buf, sizeof(buf), "%.2f KB", sz / double(1ULL << 10));
        } else {
            snprintf(buf, sizeof(buf), "%zu B", sz);
        }
        return std::string(buf);
    };

    for (size_t sz : sizes) {
        int repeat = std::min(100,std::max(1, (int)((1024LL * 1024 * 1024) / sz)));
        double gbps = measure_transfer_speed(
            A1, A2, ucx1, ucx2, d_addr1, d_addr2, sz, repeat, 0, 1, agent1, agent2
        );
        std::cout << human_readable_size(sz) << ", " << gbps << " GB/s" << std::endl;
        // break;
    }

    checkCuda(cudaSetDevice(0), "Set device 0 for free");
    cudaFree(d_addr1);
    checkCuda(cudaSetDevice(1), "Set device 1 for free");
    cudaFree(d_addr2);

    std::cout << "GPU-to-GPU bandwidth test done." << std::endl;
    return 0;
}


Output:


16.00 KB, 0.138126 GB/s
Registering memory: len=32768, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
32.00 KB, 0.194278 GB/s
Registering memory: len=65536, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
64.00 KB, 0.243451 GB/s
Registering memory: len=131072, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
128.00 KB, 0.270097 GB/s
Registering memory: len=262144, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
256.00 KB, 0.291732 GB/s
Registering memory: len=524288, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
512.00 KB, 0.30523 GB/s
Registering memory: len=1048576, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
1.00 MB, 0.310145 GB/s
Registering memory: len=2097152, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
2.00 MB, 0.320992 GB/s
Registering memory: len=4194304, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
4.00 MB, 0.322928 GB/s
Registering memory: len=8388608, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
8.00 MB, 0.282562 GB/s
Registering memory: len=16777216, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
16.00 MB, 0.266396 GB/s
Registering memory: len=33554432, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
32.00 MB, 0.270343 GB/s
Registering memory: len=67108864, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
64.00 MB, 0.26952 GB/s


nvidia-smi nvlink --status
GPU 0: NVIDIA H20 (UUID: GPU-31641892-7df4-6434-99c4-d81fe5a57930)

         Link 0: 26.562 GB/s
         Link 1: 26.562 GB/s
         Link 2: 26.562 GB/s
         Link 3: 26.562 GB/s
         Link 4: 26.562 GB/s
         Link 5: 26.562 GB/s
         Link 6: 26.562 GB/s
         Link 7: 26.562 GB/s
         Link 8: 26.562 GB/s
         Link 9: 26.562 GB/s
         Link 10: 26.562 GB/s
         Link 11: 26.562 GB/s
         Link 12: 26.562 GB/s
         Link 13: 26.562 GB/s
         Link 14: 26.562 GB/s
         Link 15: 26.562 GB/s
         Link 16: 26.562 GB/s
         Link 17: 26.562 GB/s
GPU 1: NVIDIA H20 (UUID: GPU-65fe466d-80f5-84c9-714c-114170c0bb1e)
         Link 0: 26.562 GB/s
         Link 1: 26.562 GB/s
         Link 2: 26.562 GB/s
         Link 3: 26.562 GB/s
         Link 4: 26.562 GB/s
         Link 5: 26.562 GB/s
         Link 6: 26.562 GB/s
         Link 7: 26.562 GB/s
         Link 8: 26.562 GB/s
         Link 9: 26.562 GB/s
         Link 10: 26.562 GB/s
         Link 11: 26.562 GB/s
         Link 12: 26.562 GB/s
         Link 13: 26.562 GB/s
         Link 14: 26.562 GB/s
         Link 15: 26.562 GB/s
         Link 16: 26.562 GB/s
         Link 17: 26.562 GB/s
