
Inter-GPU communication is very slow; how can I correctly implement inter-GPU communication in C++? #864

@njw1123

Description


I wrote a test program roughly modeled after the C++ example — it transfers memory located on the GPU and tests various message sizes multiple times to take the average — but the performance is only about 0.3 GB/s.

When I set the network interfaces with

export UCX_NET_DEVICES="bond1,bond2,bond3,bond4,bond5,bond6,bond7,bond8,mlx5_bond_1:1,mlx5_bond_4:1,mlx5_bond_3:1,mlx5_bond_2:1,mlx5_bond_7:1,mlx5_bond_6:1,mlx5_bond_8:1,mlx5_bond_5:1"

the bandwidth reaches around 40 GB/s, which seems to indicate that the traffic is being routed through the NICs rather than going directly to the target GPU.
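
To check whether the CUDA IPC transport is even available to UCX on this node, I believe ucx_info (which ships with UCX itself, not NIXL) can list the transports and devices it sees, for example:

ucx_info -d | grep -i cuda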

However, my setup involves intra-node GPU communication with NVLink support. I’d like to know how to make it use NVLink directly instead of the network interfaces.
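
From the UCX documentation, my understanding is that the transport selection can be restricted so that the intra-node cuda_ipc path (which should use NVLink peer mappings when available) is preferred, e.g.

export UCX_TLS=cuda_ipc,cuda_copy,rc,tcp

but I'm not sure whether setting environment variables is the intended way to configure this through NIXL, or whether the agent config / backend params should be used instead.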

I also tried interfaces like createGpuXferReq, but they don’t seem to be supported.
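
As a separate sanity check (not part of the test program below), here is a minimal standalone snippet using only the CUDA runtime to confirm that the two GPUs report peer access to each other, which is what a direct NVLink path would rely on:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int can01 = 0, can10 = 0;
    // Query whether each GPU can map the other GPU's memory (P2P/NVLink capability).
    cudaDeviceCanAccessPeer(&can01, 0, 1);
    cudaDeviceCanAccessPeer(&can10, 1, 0);
    std::printf("GPU0 -> GPU1 peer access: %d\n", can01);
    std::printf("GPU1 -> GPU0 peer access: %d\n", can10);
    return 0;
}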



#include <iostream>
#include <cassert>
#include <cstring>
#include <cstdint>
#include <string>
#include <vector>
#include <chrono>
#include <algorithm>
#include <cuda_runtime.h>
#include "nixl.h"

// Helper function to check CUDA errors
void checkCuda(cudaError_t result, const char* msg) {
    if (result != cudaSuccess) {
        std::cerr << "CUDA Error: " << msg << " : " << cudaGetErrorString(result) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Check if the buffer is filled with a specific value
void check_device_buf(uint8_t* d_buf, size_t len, uint8_t expected) {
    uint8_t* h_buf = (uint8_t*)malloc(len);
    checkCuda(cudaMemcpy(h_buf, d_buf, len, cudaMemcpyDeviceToHost), "Memcpy device to host");
    for (size_t i = 0; i < len; ++i) {
        assert(h_buf[i] == expected);
    }
    free(h_buf);
}

// Function to measure transfer speed for a given size, repeat n times and return average GB/s
double measure_transfer_speed(
    nixlAgent& A1, nixlAgent& A2,
    nixlBackendH* ucx1, nixlBackendH* ucx2,
    uint8_t* d_addr1, uint8_t* d_addr2,
    size_t len, int repeat, int dev_src, int dev_dst,
    const std::string& agent1, const std::string& agent2)
{
    // Register memory for this size
    nixl_opt_args_t extra_params1, extra_params2;
    extra_params1.backends.push_back(ucx1);
    extra_params2.backends.push_back(ucx2);

    nixlBlobDesc buff1, buff2;
    nixl_reg_dlist_t dlist1(VRAM_SEG), dlist2(VRAM_SEG);

    buff1.addr = (uintptr_t)d_addr1;
    buff1.len = len;
    buff1.devId = dev_src;
    dlist1.addDesc(buff1);

    buff2.addr = (uintptr_t)d_addr2;
    buff2.len = len;
    buff2.devId = dev_dst;
    dlist2.addDesc(buff2);

    // Verify pointers are valid
    if (d_addr1 == nullptr || d_addr2 == nullptr) {
        std::cerr << "Error: Device pointers are null! d_addr1=" << (void*)d_addr1
                  << ", d_addr2=" << (void*)d_addr2 << std::endl;
        return -1.0;
    }
    
    std::cout << "Registering memory: len=" << len 
              << ", d_addr1=" << (void*)d_addr1 
              << ", d_addr2=" << (void*)d_addr2 << std::endl;

    // Set CUDA device context before registering memory
    checkCuda(cudaSetDevice(dev_src), "Set device for registration");

    nixl_status_t ret1, ret2;
    ret1 = A1.registerMem(dlist1, &extra_params1);
    ret2 = A2.registerMem(dlist2, &extra_params2);
    assert(ret1 == NIXL_SUCCESS);
    assert(ret2 == NIXL_SUCCESS);

    // Exchange metadata
    std::string meta1, meta2, ret_s1;
    ret1 = A1.getLocalMD(meta1);
    ret2 = A2.getLocalMD(meta2);
    assert(ret1 == NIXL_SUCCESS);
    assert(ret2 == NIXL_SUCCESS);

    ret1 = A1.loadRemoteMD(meta2, ret_s1);
    assert(ret1 == NIXL_SUCCESS);

    // Prepare transfer: copy whole buffer
    size_t req_size = len;
    size_t src_offset = 0;
    size_t dst_offset = 0;

    nixl_xfer_dlist_t req_src_descs(VRAM_SEG);
    nixlBasicDesc req_src;
    req_src.addr = (uintptr_t)(((uint8_t*)d_addr1) + src_offset);
    req_src.len = req_size;
    req_src.devId = dev_src;
    req_src_descs.addDesc(req_src);

    nixl_xfer_dlist_t req_dst_descs(VRAM_SEG);
    nixlBasicDesc req_dst;
    req_dst.addr = (uintptr_t)(((uint8_t*)d_addr2) + dst_offset);
    req_dst.len = req_size;
    req_dst.devId = dev_dst;
    req_dst_descs.addDesc(req_dst);


    double total_ms = 0.0;

    for (int i = 0; i < repeat; ++i) {
        // Reset src and dst buffer
        checkCuda(cudaSetDevice(dev_src), "Set device for memset src");
        checkCuda(cudaMemset(d_addr1, 0xbb, len), "Memset src");
        checkCuda(cudaSetDevice(dev_dst), "Set device for memset dst");
        checkCuda(cudaMemset(d_addr2, 0, len), "Memset dst");

        nixlXferReqH* req_handle;
        // nixlGpuXferReqH gpu_req_handle;
        nixl_opt_args_t xfer_params = extra_params1;
        // xfer_params.notifMsg = "gpu_notification";
        // xfer_params.hasNotif = true;
        ret1 = A1.createXferReq(NIXL_WRITE, req_src_descs, req_dst_descs, agent2, req_handle, &xfer_params);
        // ret1 = A1.createGpuXferReq(*req_handle, gpu_req_handle);
        // std::cout << "Xfer request created, status: " << nixlEnumStrings::statusStr(ret1) << std::endl;
        assert(ret1 == NIXL_SUCCESS);

        // Timing is measured on CPU since the transfer is not on a CUDA stream.
        auto t_start = std::chrono::high_resolution_clock::now();

        nixl_status_t status = A1.postXferReq(req_handle);
        // nixl_notifs_t notif_map;
        // int n_notifs = 0;
        
        while (status != NIXL_SUCCESS) {
            status = A1.getXferStatus(req_handle);
            assert(status >= 0);
        }

        auto t_end = std::chrono::high_resolution_clock::now();
        double ms = std::chrono::duration<double, std::milli>(t_end - t_start).count();
        total_ms += ms;


        // Check that the destination buffer has the expected value in the transferred region
        checkCuda(cudaSetDevice(dev_dst), "Set device for check");
        uint8_t* check_ptr = d_addr2 + dst_offset;
        check_device_buf(check_ptr, req_size, 0xbb);

        // Cleanup transfer request
        ret1 = A1.releaseXferReq(req_handle);
        assert(ret1 == NIXL_SUCCESS);
    }

    // Cleanup
    ret1 = A1.deregisterMem(dlist1, &extra_params1);
    ret2 = A2.deregisterMem(dlist2, &extra_params2);
    assert(ret1 == NIXL_SUCCESS);
    assert(ret2 == NIXL_SUCCESS);

    ret1 = A1.invalidateRemoteMD(agent2);
    assert(ret1 == NIXL_SUCCESS);

    // Return average GB/s
    double avg_ms = total_ms / repeat;
    double gb = double(len) / (1024.0 * 1024.0 * 1024.0);
    double gbps = gb / (avg_ms / 1000.0);
    return gbps;
}

int main() {
    // Agent names
    std::string agent1("GPUAgent1");
    std::string agent2("GPUAgent2");

    // Set CUDA device 0 for agent1, device 1 for agent2
    int dev_count = 0;
    checkCuda(cudaGetDeviceCount(&dev_count), "Get device count");
    assert(dev_count >= 2); // Need at least 2 GPUs

    // Allocate device memory on two GPUs, max size 4 GiB
    size_t max_len = 1ULL << 32; // 4 GiB
    uint8_t* d_addr1 = nullptr;
    uint8_t* d_addr2 = nullptr;

    checkCuda(cudaSetDevice(0), "Set device 0");
    checkCuda(cudaMalloc(&d_addr1, max_len), "Malloc device 0");
    checkCuda(cudaSetDevice(1), "Set device 1");
    checkCuda(cudaMalloc(&d_addr2, max_len), "Malloc device 1");

    // NIXL setup
    nixlAgentConfig cfg(true);
    nixl_b_params_t init1, init2;
    nixl_mem_list_t mems1, mems2;

    nixlAgent A1(agent1, cfg);
    nixlAgent A2(agent2, cfg);

    std::vector<nixl_backend_t> plugins;
    nixl_status_t ret1, ret2;
    ret1 = A1.getAvailPlugins(plugins);
    assert(ret1 == NIXL_SUCCESS);

    // Use UCX backend for GPU memory
    ret1 = A1.getPluginParams("UCX", mems1, init1);
    ret2 = A2.getPluginParams("UCX", mems2, init2);
    assert(ret1 == NIXL_SUCCESS);
    assert(ret2 == NIXL_SUCCESS);

    nixlBackendH* ucx1, *ucx2;
    ret1 = A1.createBackend("UCX", init1, ucx1);
    ret2 = A2.createBackend("UCX", init2, ucx2);
    assert(ret1 == NIXL_SUCCESS);
    assert(ret2 == NIXL_SUCCESS);

    // Test sizes: 8 B, 16 B, 32 B, ..., doubling up to 4 GiB
    std::vector<size_t> sizes;
    for (size_t sz = 8; sz <= (1ULL << 32); sz *= 2) {
        sizes.push_back(sz);
    }

    std::cout << "Size(Bytes), Avg GB/s" << std::endl;
    // Print size in human-readable units (B, KB, MB, GB)
    auto human_readable_size = [](size_t sz) -> std::string {
        char buf[32];
        if (sz >= (1ULL << 30)) {
            snprintf(buf, sizeof(buf), "%.2f GB", sz / double(1ULL << 30));
        } else if (sz >= (1ULL << 20)) {
            snprintf(buf, sizeof(buf), "%.2f MB", sz / double(1ULL << 20));
        } else if (sz >= (1ULL << 10)) {
            snprintf(buf, sizeof(buf), "%.2f KB", sz / double(1ULL << 10));
        } else {
            snprintf(buf, sizeof(buf), "%zu B", sz);
        }
        return std::string(buf);
    };

    for (size_t sz : sizes) {
        int repeat = std::min(100,std::max(1, (int)((1024LL * 1024 * 1024) / sz)));
        double gbps = measure_transfer_speed(
            A1, A2, ucx1, ucx2, d_addr1, d_addr2, sz, repeat, 0, 1, agent1, agent2
        );
        std::cout << human_readable_size(sz) << ", " << gbps << " GB/s" << std::endl;
        // break;
    }

    checkCuda(cudaSetDevice(0), "Set device 0 for free");
    cudaFree(d_addr1);
    checkCuda(cudaSetDevice(1), "Set device 1 for free");
    cudaFree(d_addr2);

    std::cout << "GPU-to-GPU bandwidth test done." << std::endl;
    return 0;
}


Output:


16.00 KB, 0.138126 GB/s
Registering memory: len=32768, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
32.00 KB, 0.194278 GB/s
Registering memory: len=65536, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
64.00 KB, 0.243451 GB/s
Registering memory: len=131072, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
128.00 KB, 0.270097 GB/s
Registering memory: len=262144, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
256.00 KB, 0.291732 GB/s
Registering memory: len=524288, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
512.00 KB, 0.30523 GB/s
Registering memory: len=1048576, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
1.00 MB, 0.310145 GB/s
Registering memory: len=2097152, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
2.00 MB, 0.320992 GB/s
Registering memory: len=4194304, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
4.00 MB, 0.322928 GB/s
Registering memory: len=8388608, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
8.00 MB, 0.282562 GB/s
Registering memory: len=16777216, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
16.00 MB, 0.266396 GB/s
Registering memory: len=33554432, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
32.00 MB, 0.270343 GB/s
Registering memory: len=67108864, d_addr1=0x7f03c0000000, d_addr2=0x7f02a0000000
64.00 MB, 0.26952 GB/s


nvidia-smi nvlink --status
GPU 0: NVIDIA H20 (UUID: GPU-31641892-7df4-6434-99c4-d81fe5a57930)

         Link 0: 26.562 GB/s
         Link 1: 26.562 GB/s
         Link 2: 26.562 GB/s
         Link 3: 26.562 GB/s
         Link 4: 26.562 GB/s
         Link 5: 26.562 GB/s
         Link 6: 26.562 GB/s
         Link 7: 26.562 GB/s
         Link 8: 26.562 GB/s
         Link 9: 26.562 GB/s
         Link 10: 26.562 GB/s
         Link 11: 26.562 GB/s
         Link 12: 26.562 GB/s
         Link 13: 26.562 GB/s
         Link 14: 26.562 GB/s
         Link 15: 26.562 GB/s
         Link 16: 26.562 GB/s
         Link 17: 26.562 GB/s
GPU 1: NVIDIA H20 (UUID: GPU-65fe466d-80f5-84c9-714c-114170c0bb1e)
         Link 0: 26.562 GB/s
         Link 1: 26.562 GB/s
         Link 2: 26.562 GB/s
         Link 3: 26.562 GB/s
         Link 4: 26.562 GB/s
         Link 5: 26.562 GB/s
         Link 6: 26.562 GB/s
         Link 7: 26.562 GB/s
         Link 8: 26.562 GB/s
         Link 9: 26.562 GB/s
         Link 10: 26.562 GB/s
         Link 11: 26.562 GB/s
         Link 12: 26.562 GB/s
         Link 13: 26.562 GB/s
         Link 14: 26.562 GB/s
         Link 15: 26.562 GB/s
         Link 16: 26.562 GB/s
         Link 17: 26.562 GB/s
