From 89b2765ca51659494b2b2f5028fdd87aa3df5992 Mon Sep 17 00:00:00 2001 From: Yang Liming Date: Mon, 1 Jun 2026 14:24:33 +0800 Subject: [PATCH] feat: add io_uring transport layer support Add a new io_uring-based transport layer (IouringTransport) as an alternative to the existing TCP and RDMA transports, following the same architectural patterns as the RDMA implementation. Core implementation: - src/brpc/iouring/iouring_endpoint.h/cpp: IouringEndpoint (SocketUser subclass) that submits async read/write SQEs and reaps CQEs, with optional SQPOLL polling mode. - src/brpc/iouring/iouring_helper.h/cpp: global io_uring ring lifecycle management, per-bthread-tag poller threads, and availability checks. - src/brpc/iouring_transport.h/cpp: IouringTransport (Transport interface) wiring Init/Release/Reset/Connect/CutFromIOBuf(List)/ WaitEpollOut/ProcessEvent/QueueMessage/Debug/ContextInitOrDie. Build system integration: - CMakeLists.txt: WITH_IOURING option (passes -DBRPC_WITH_IOURING to the compiler); find_path/find_library for liburing; conditionally compile iouring sources and link -luring. - BUILD.bazel / bazel/config/BUILD.bazel: brpc_with_iouring config_setting; conditional srcs/defines/linkopts/deps. - WORKSPACE: new_local_repository for @com_github_axboe_liburing. - bazel/third_party/liburing/liburing.BUILD: cc_library target for liburing. Framework hooks: - src/brpc/socket_mode.h: add SOCKET_MODE_IOURING enum value. - src/brpc/transport_factory.cpp: register IouringTransport in TransportFactory::Create(). - src/brpc/socket.h: friend declarations for IouringEndpoint / IouringTransport. - src/brpc/input_messenger.h: friend declarations for IouringEndpoint / IouringTransport. Bug fixes: - src/butil/single_threaded_pool.h: rename static member BLOCK_SIZE to POOL_BLOCK_SIZE to avoid conflict with the BLOCK_SIZE macro defined by <linux/fs.h> (pulled in via liburing.h). 
- src/brpc/iouring_transport.cpp: move DECLARE_bool(usercode_in_*) inside namespace brpc{} to match the DEFINE_bool site in event_dispatcher.cpp, fixing linker undefined-reference errors. Example and documentation: - example/iouring_performance/: server, client, proto, CMakeLists.txt mirroring the rdma_performance example; supports WITH_IOURING=1 make flag. - example/BUILD.bazel: Bazel targets for the new example. - docs/cn/iouring.md: Chinese-language guide covering build, flags, architecture and comparison with RDMA. --- BUILD.bazel | 20 + CMakeLists.txt | 48 +- WORKSPACE | 9 + bazel/config/BUILD.bazel | 6 + bazel/third_party/liburing/liburing.BUILD | 43 + docs/cn/iouring.md | 265 ++++++ example/BUILD.bazel | 56 ++ example/iouring_echo_c++/CMakeLists.txt | 139 +++ example/iouring_echo_c++/client.cpp | 73 ++ example/iouring_echo_c++/echo.proto | 33 + example/iouring_echo_c++/server.cpp | 182 ++++ src/brpc/input_messenger.h | 6 + src/brpc/iouring/iouring_block_pool.cpp | 587 +++++++++++++ src/brpc/iouring/iouring_block_pool.h | 332 ++++++++ src/brpc/iouring/iouring_endpoint.cpp | 993 ++++++++++++++++++++++ src/brpc/iouring/iouring_endpoint.h | 244 ++++++ src/brpc/iouring/iouring_helper.cpp | 247 ++++++ src/brpc/iouring/iouring_helper.h | 224 +++++ src/brpc/iouring_transport.cpp | 279 ++++++ src/brpc/iouring_transport.h | 83 ++ src/brpc/socket.h | 5 + src/brpc/socket_mode.h | 5 +- src/brpc/transport_factory.cpp | 11 + src/bthread/types.h | 7 + src/butil/single_threaded_pool.h | 2 +- 25 files changed, 3895 insertions(+), 4 deletions(-) create mode 100644 bazel/third_party/liburing/liburing.BUILD create mode 100644 docs/cn/iouring.md create mode 100644 example/iouring_echo_c++/CMakeLists.txt create mode 100644 example/iouring_echo_c++/client.cpp create mode 100644 example/iouring_echo_c++/echo.proto create mode 100644 example/iouring_echo_c++/server.cpp create mode 100644 src/brpc/iouring/iouring_block_pool.cpp create mode 100644 src/brpc/iouring/iouring_block_pool.h 
create mode 100644 src/brpc/iouring/iouring_endpoint.cpp create mode 100644 src/brpc/iouring/iouring_endpoint.h create mode 100644 src/brpc/iouring/iouring_helper.cpp create mode 100644 src/brpc/iouring/iouring_helper.h create mode 100644 src/brpc/iouring_transport.cpp create mode 100644 src/brpc/iouring_transport.h diff --git a/BUILD.bazel b/BUILD.bazel index 22cb508548..2c3114eb96 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -51,6 +51,9 @@ DEFINES = [ }) + select({ "//bazel/config:brpc_with_rdma": ["BRPC_WITH_RDMA=1"], "//conditions:default": [], + }) + select({ + "//bazel/config:brpc_with_iouring": ["BRPC_WITH_IOURING=1"], + "//conditions:default": ["BRPC_WITH_IOURING=0"], }) + select({ "//bazel/config:brpc_with_debug_bthread_sche_safety": ["BRPC_DEBUG_BTHREAD_SCHE_SAFETY=1"], "//conditions:default": ["BRPC_DEBUG_BTHREAD_SCHE_SAFETY=0"], @@ -94,6 +97,11 @@ LINKOPTS = [ "-libverbs", ], "//conditions:default": [], +}) + select({ + "//bazel/config:brpc_with_iouring": [ + "-luring", + ], + "//conditions:default": [], }) + select({ "//bazel/config:brpc_with_asan": ["-fsanitize=address"], "//conditions:default": [], @@ -530,12 +538,19 @@ cc_library( "src/brpc/policy/thrift_protocol.cpp", "src/brpc/event_dispatcher_epoll.cpp", "src/brpc/event_dispatcher_kqueue.cpp", + "src/brpc/iouring/*.cpp", + "src/brpc/iouring/**/*.cpp", ]) + select({ "//bazel/config:brpc_with_thrift": glob([ "src/brpc/thrift*.cpp", "src/brpc/**/thrift*.cpp", ]), "//conditions:default": [], + }) + select({ + "//bazel/config:brpc_with_iouring": glob([ + "src/brpc/iouring/*.cpp", + ]), + "//conditions:default": [], }), hdrs = glob([ "src/brpc/*.h", @@ -562,6 +577,11 @@ cc_library( "@org_apache_thrift//:thrift", ], "//conditions:default": [], + }) + select({ + "//bazel/config:brpc_with_iouring": [ + "@com_github_axboe_liburing//:liburing", + ], + "//conditions:default": [], }), ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e74007b66..29f945612a 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ option(WITH_THRIFT "With thrift framed protocol supported" OFF) option(WITH_BTHREAD_TRACER "With bthread tracer supported" OFF) option(WITH_SNAPPY "With snappy" OFF) option(WITH_RDMA "With RDMA" OFF) +option(WITH_IOURING "With io_uring" OFF) option(WITH_DEBUG_BTHREAD_SCHE_SAFETY "With debugging bthread sche safety" OFF) option(WITH_DEBUG_LOCK "With debugging lock" OFF) option(WITH_ASAN "With AddressSanitizer" OFF) @@ -104,6 +105,11 @@ if(WITH_RDMA) set(WITH_RDMA_VAL "1") endif() +set(WITH_IOURING_VAL "0") +if(WITH_IOURING) + set(WITH_IOURING_VAL "1") +endif() + set(WITH_DEBUG_BTHREAD_SCHE_SAFETY_VAL "0") if(WITH_DEBUG_BTHREAD_SCHE_SAFETY) set(WITH_DEBUG_BTHREAD_SCHE_SAFETY_VAL "1") @@ -136,7 +142,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -Wno-deprecated-declarations -Wno-inconsistent-missing-override") endif() -set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} ${DEFINE_CLOCK_GETTIME} -DBRPC_WITH_GLOG=${WITH_GLOG_VAL} -DBRPC_WITH_RDMA=${WITH_RDMA_VAL} -DBRPC_DEBUG_BTHREAD_SCHE_SAFETY=${WITH_DEBUG_BTHREAD_SCHE_SAFETY_VAL} -DBRPC_DEBUG_LOCK=${WITH_DEBUG_LOCK_VAL}") +set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} ${DEFINE_CLOCK_GETTIME} -DBRPC_WITH_GLOG=${WITH_GLOG_VAL} -DBRPC_WITH_RDMA=${WITH_RDMA_VAL} -DBRPC_WITH_IOURING=${WITH_IOURING_VAL} -DBRPC_DEBUG_BTHREAD_SCHE_SAFETY=${WITH_DEBUG_BTHREAD_SCHE_SAFETY_VAL} -DBRPC_DEBUG_LOCK=${WITH_DEBUG_LOCK_VAL}") if (WITH_ASAN) set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -fsanitize=address") set(CMAKE_C_FLAGS "${CMAKE_CPP_FLAGS} -fsanitize=address") @@ -277,6 +283,41 @@ if(WITH_RDMA) endif() endif() +if(WITH_IOURING) + message("brpc compile with io_uring") + # Search for the header in standard paths and common source-tree locations. 
+ find_path(IOURING_INCLUDE_PATH NAMES liburing.h + PATHS + /usr/include + /usr/local/include + /docker/root/projects/liburing/src/include + NO_DEFAULT_PATH) + if(NOT IOURING_INCLUDE_PATH) + find_path(IOURING_INCLUDE_PATH NAMES liburing.h) + endif() + + find_library(IOURING_LIB NAMES uring liburing.a + PATHS + /usr/lib + /usr/lib64 + /usr/local/lib + /usr/local/lib64 + /docker/root/projects/liburing/src + NO_DEFAULT_PATH) + if(NOT IOURING_LIB) + find_library(IOURING_LIB NAMES uring) + endif() + + if((NOT IOURING_INCLUDE_PATH) OR (NOT IOURING_LIB)) + message(FATAL_ERROR "Fail to find liburing. " + "Please install liburing-dev / liburing-devel, or " + "build from source at https://github.com/axboe/liburing " + "and set CMAKE_PREFIX_PATH accordingly.") + endif() + include_directories(${IOURING_INCLUDE_PATH}) + message(STATUS "Found liburing: include=${IOURING_INCLUDE_PATH} lib=${IOURING_LIB}") +endif() + find_library(PROTOC_LIB NAMES protoc) if(NOT PROTOC_LIB) message(FATAL_ERROR "Fail to find protoc lib") @@ -329,6 +370,14 @@ if(WITH_RDMA) list(APPEND DYNAMIC_LIB ${RDMA_LIB}) endif() +if(WITH_IOURING) + list(APPEND DYNAMIC_LIB ${IOURING_LIB}) +endif() + set(BRPC_PRIVATE_LIBS "-lgflags -lprotobuf -lleveldb -lprotoc -lssl -lcrypto -ldl -lz") +# Append -luring only after BRPC_PRIVATE_LIBS is initialised above; an earlier append is overwritten by that set(). +if(WITH_IOURING) + set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -luring") +endif() if(WITH_GLOG) diff --git a/WORKSPACE b/WORKSPACE index a107f0a52c..d642227c49 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -279,6 +279,15 @@ http_archive( urls = ["https://archive.apache.org/dist/thrift/0.15.0/thrift-0.15.0.tar.gz"], ) +# io_uring support via local liburing source tree. 
+# Enable with: --define=BRPC_WITH_IOURING=true +# Requires liburing checked out at /docker/root/projects/liburing +new_local_repository( + name = "com_github_axboe_liburing", + path = "/docker/root/projects/liburing", + build_file = "//bazel/third_party/liburing:liburing.BUILD", +) + # # Perl Dependencies # diff --git a/bazel/config/BUILD.bazel b/bazel/config/BUILD.bazel index eec551da8b..374f00a9dc 100644 --- a/bazel/config/BUILD.bazel +++ b/bazel/config/BUILD.bazel @@ -104,6 +104,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "brpc_with_iouring", + define_values = {"BRPC_WITH_IOURING": "true"}, + visibility = ["//visibility:public"], +) + config_setting( name = "brpc_with_boringssl", define_values = {"BRPC_WITH_BORINGSSL": "true"}, diff --git a/bazel/third_party/liburing/liburing.BUILD b/bazel/third_party/liburing/liburing.BUILD new file mode 100644 index 0000000000..13feffd48c --- /dev/null +++ b/bazel/third_party/liburing/liburing.BUILD @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# BUILD file for liburing (io_uring userspace library). +# Used via new_local_repository pointing to a local liburing source tree. 
+ +licenses(["notice"]) # MIT / GPL-2.0-only (dual-licensed) + +cc_library( + name = "liburing", + srcs = [ + "src/queue.c", + "src/register.c", + "src/setup.c", + "src/syscall.c", + ], + hdrs = glob([ + "src/include/**/*.h", + "src/include/*.h", + ]), + includes = [ + "src/include", + ], + copts = [ + "-D_GNU_SOURCE", + # Suppress warnings from third-party code + "-Wno-unused-variable", + "-Wno-implicit-function-declaration", + ], + visibility = ["//visibility:public"], +) diff --git a/docs/cn/iouring.md b/docs/cn/iouring.md new file mode 100644 index 0000000000..2b191bdcd7 --- /dev/null +++ b/docs/cn/iouring.md @@ -0,0 +1,265 @@ +# io_uring 支持 + +## 编译 + +io_uring 依赖内核 5.1+(推荐 5.10+),仅支持 Linux 系统。需要先安装或编译 [liburing](https://github.com/axboe/liburing)。 + +### 准备 liburing + +```bash +# 方式一:发行版包管理(推荐) +sudo apt install liburing-dev # Debian / Ubuntu +sudo yum install liburing-devel # CentOS / RHEL + +# 方式二:源码编译 +git clone https://github.com/axboe/liburing.git /path/to/liburing +cd /path/to/liburing && make +``` + +### 使用 CMake + +```bash +mkdir bld && cd bld +cmake -DWITH_IOURING=ON \ + -DIOURING_INCLUDE_PATH=/path/to/liburing/src/include \ + -DIOURING_LIB=/path/to/liburing/src/liburing.a .. +make +``` + +> **注意**:CMake 变量名为 `IOURING_INCLUDE_PATH` 和 `IOURING_LIB`,不是 `LIBURING_INCLUDE_PATH` / `LIBURING_LIBRARY`,拼写错误会触发 CMake Warning 且实际不生效。 + +如果 liburing 已通过包管理安装,省略后两个 `-D` 参数即可: + +```bash +cmake -DWITH_IOURING=ON .. && make +``` + +如果 liburing 安装在非标准路径,也可以用 `CMAKE_PREFIX_PATH` 让 CMake 自动发现: + +```bash +cmake -DWITH_IOURING=ON -DCMAKE_PREFIX_PATH=/path/to/liburing/install .. && make +``` + +### 编译示例程序 + +```bash +cd example/iouring_echo_c++ +mkdir bld && cd bld +cmake -DBRPC_WITH_IOURING=ON .. 
+make +``` + +--- + +## 基本原理 + +io_uring 通过内核与用户态共享的提交队列(SQ)和完成队列(CQ)实现异步 I/O,避免了每次 I/O 的系统调用开销。brpc 的实现复用了原有的 `Socket` 类,每个 Socket 持有一个 `IouringEndpoint`(`src/brpc/iouring/iouring_endpoint.cpp`)。 + +- **写路径**:`CutFromIOBufList` 将 `IOBuf` 中的数据段提交为 `IORING_OP_WRITEV`(或注册模式下的 `IORING_OP_WRITE_FIXED`)SQE。 +- **读路径**:`SubmitRead` 提交 `IORING_OP_READ`(或 `IORING_OP_READ_FIXED`)SQE,Poller 线程通过 `PollCq` 收割 CQE,复用 `InputMessenger` 完成消息解析。 + +### one-thread-per-ring 架构 + +每个 `bthread_tag` 拥有一个 `PollerGroup`,包含**恰好一个** Poller bthread,持有一个独立的 `io_uring` 实例(ring)。 + +**所有 SQ 操作(`SubmitRead`、`CutFromIOBufList`)都在 Poller bthread 上执行**,不需要任何锁。新连接通过 MPSC 队列(`op_queue`)发送 ADD/REMOVE 消息给 Poller,由 Poller 在主循环中处理。 + +> **为什么每个 tag 只能有一个 Poller?** +> io_uring 的 SQ 是单生产者设计,不支持并发写入。bthread 采用 work-stealing 调度,同一 `bthread_tag` 内的 bthread 会在多个 pthread 上运行。若同一 tag 存在多个 Poller,两个 Poller bthread 可能被 steal 到不同 pthread 并发操作各自 ring 的 SQ,同时业务 bthread 也可能在另一个 pthread 上提交 SQE,产生竞争。因此**一个 bthread_tag 只能有一个 Poller**,水平扩展应通过增加 bthread_tag 数量(`--task_group_ntags`)实现,每个 tag 独占一个 Poller 和一个 ring。 + +### CQE 收割策略 + +通过 `--iouring_polling_mode` 选择 Poller 线程的收割策略: + +- **none**(默认):中断驱动模式,Poller 调用 `io_uring_wait_cqe_timeout`(超时 1 ms)阻塞等待内核通知。1 ms 超时的目的是保持 Poller 循环对 `op_queue` 中新连接的响应性,同时在无 I/O 时避免 CPU 空转。适合通用场景。 +- **sqpoll**:启用 `IORING_SETUP_SQPOLL`,内核线程持续轮询 SQ,无需 `io_uring_submit` 系统调用,延迟最低,需要 `CAP_SYS_NICE`。 +- **iopoll**:`IORING_SETUP_IOPOLL`,仅对 O_DIRECT 块设备有效。 +- **hybrid**:先忙转 N 次(`--iouring_hybrid_spin_count`),无 CQE 后再阻塞,兼顾延迟和 CPU 利用率。 + +--- + +## 启动与初始化 + +在调用 `brpc::Server::Start()` 之前完成 io_uring 初始化: + +```cpp +#include <brpc/iouring/iouring_helper.h> + +// 1. 全局初始化:探测内核能力,初始化内存池(若启用了 --iouring_register_buffers) +brpc::iouring::GlobalIouringInitializeOrDie(); + +// 2. 
为指定 bthread tag 创建 ring 并启动 Poller bthreads +// tag=0 为默认 tag,覆盖所有普通 bthread +if (!brpc::iouring::InitPollingModeWithTag(/*tag=*/0)) { + LOG(FATAL) << "Failed to init io_uring"; +} +``` + +`InitPollingModeWithTag` 支持三个可选回调,均在 Poller 线程上调用: + +```cpp +brpc::iouring::InitPollingModeWithTag( + /*tag=*/0, + /*callback=*/[](brpc::iouring::IouringPollerHandle h) { + // 每次 PollCq 之后调用,用于提交用户自定义 SQE 或收割用户 CQE。 + // 注意:bRPC 的 PollCq 会跳过 user_data bit63=0 的 CQE 且不调 + // io_uring_cqe_seen(),用户必须在此处手动 drain,否则 CQ 会满。 + h.Submit([](::io_uring* r) -> int { + // drain 用户 CQE + struct io_uring_cqe* cqe = nullptr; + while (io_uring_peek_cqe(r, &cqe) == 0) { + if (cqe->user_data & brpc::iouring::kBrpcCqeTag) break; + // 处理 cqe->res … + io_uring_cqe_seen(r, cqe); + } + // 提交用户 SQE + ::io_uring_sqe* sqe = io_uring_get_sqe(r); + if (!sqe) return 0; + io_uring_prep_nop(sqe); + sqe->user_data = my_token; // bit63 必须为 0 + return 1; + }); + }, + /*init_fn=*/nullptr, // ring 创建后调用一次 + /*release_fn=*/nullptr // ring 销毁前调用一次 +); +``` + +完整示例见 `example/iouring_echo_c++/server.cpp`。 + +--- + +## 参数说明 + +所有参数均通过 gflags 在命令行传入。 + +### Ring 大小 + +| 参数 | 默认值 | 说明 | +|------|--------|------| +| `iouring_sq_size` | `256` | 每个 ring 的 SQ 深度(并发 in-flight SQE 上限) | +| `iouring_cq_size` | `0` | 每个 ring 的 CQ 深度(0 表示 2 × sq_size) | + +### Poller 线程 + +| 参数 | 默认值 | 说明 | +|------|--------|------| +| `iouring_poller_yield` | `false` | 每轮 poll 后 yield,降低 CPU 占用(会增加尾延迟) | +| `iouring_max_cqe_poll_once` | `32` | 每次 `io_uring_peek_batch_cqe` 最多收割的 CQE 数 | + +### 轮询模式 + +| 参数 | 默认值 | 说明 | +|------|--------|------| +| `iouring_polling_mode` | `none` | CQE 收割策略,见下表 | +| `iouring_sqpoll_idle_ms` | `2000` | SQPOLL 内核线程空闲超时(ms) | +| `iouring_sqpoll_cpu` | `-1` | SQPOLL 内核线程绑定的 CPU(-1 = 不绑定) | +| `iouring_hybrid_spin_count` | `1000` | hybrid 模式下忙转的迭代次数后再阻塞 | + +`iouring_polling_mode` 可选值: + +| 值 | 说明 | +|----|------| +| `none` | 中断驱动(默认):Poller 调用 `io_uring_wait_cqe_timeout`(超时 1 ms)阻塞等待内核通知,1 ms 超时保持对新连接的响应性 | +| 
`sqpoll` | 内核 SQ 轮询线程(`IORING_SETUP_SQPOLL`),Poller 用 peek 收割 CQE,最低延迟,需要 `CAP_SYS_NICE` | +| `iopoll` | 块设备完成轮询(`IORING_SETUP_IOPOLL`),内核不产生中断,Poller 每次 submit 后主动 peek,仅对 O_DIRECT 块设备有效 | +| `hybrid` | 先忙转 N 次再阻塞,兼顾延迟和 CPU 利用率 | + +### 注册内存(零拷贝) + +| 参数 | 默认值 | 说明 | +|------|--------|------| +| `iouring_register_buffers` | `false` | 启用 `IORING_OP_READ_FIXED` / `IORING_OP_WRITE_FIXED` 零拷贝模式 | +| `iouring_mem_pool_initial_mb` | `256` | 初始注册内存大小(MiB) | +| `iouring_mem_pool_increase_mb` | `256` | 内存池扩容步长(MiB) | +| `iouring_mem_pool_max_regions` | `8` | 最大扩容次数 | +| `iouring_iobuf_block_size` | `8192` | IOBuf block 和 read slot 的大小(字节),须与内存池对齐 | +| `iouring_read_slot_num` | `256` | 每个 ring 初始 read slot 数量 | +| `iouring_read_slot_max` | `4096` | 每个 ring 最大 read slot 数量 | + +> **注意**:启用 `--iouring_register_buffers` 时,系统要么全部使用注册内存(`READ_FIXED` / `WRITE_FIXED`),要么完全不用,**不支持混用**。若初始化失败(内存不足、内核不支持等),整个 io_uring transport 会被禁用。 + +--- + +## 用户提交自定义 SQE + +用户可以通过 `InitPollingModeWithTag` 的 `callback` 在每次 Poller 迭代中提交自定义 SQE。 + +### CQE 标记约定 + +bRPC 提交的每条 SQE 都会在 `user_data` 的 **bit 63** 置 1: + +```cpp +sqe->user_data = reinterpret_cast<uint64_t>(ctx) | brpc::iouring::kBrpcCqeTag; +``` + +- `PollCq` 只处理 bit 63 = 1 的 CQE,跳过 bit 63 = 0 的用户 CQE,且**不调用 `io_uring_cqe_seen()`**。 +- 用户 **必须** 在 callback 中主动 drain 自己的 CQE,否则 CQ 会被撑满,导致新 SQE 无法提交。 +- 用户 SQE 的 `user_data` **bit 63 必须为 0**(普通指针或整数均满足此条件)。 + +### drain 用户 CQE 的正确方式 + +在 `Submit` 的 `prepare_fn` 中: + +```cpp +h.Submit([](::io_uring* r) -> int { + struct io_uring_cqe* cqe = nullptr; + while (io_uring_peek_cqe(r, &cqe) == 0) { + // 遇到 bRPC 的 CQE 立即停止——不能调 cqe_seen,也不能跳过继续。 + // CQ 是 FIFO,不 seen 头部就无法推进,继续 peek 仍返回同一条。 + if (cqe->user_data & brpc::iouring::kBrpcCqeTag) break; + process(cqe->user_data, cqe->res); + io_uring_cqe_seen(r, cqe); // 必须调用,否则头指针不前进 + } + // 提交新 SQE … + return n; +}); +``` + +--- + +## 示例程序 + +示例位于 `example/iouring_echo_c++/`,演示了所有模式的启动方式。 + +```bash +cd example/iouring_echo_c++ +mkdir bld && cd bld +cmake 
-DBRPC_WITH_IOURING=ON .. && make +cd .. + +# 默认:epoll(不使用 io_uring) +./echo_server + +# io_uring,none 模式(Poller 最多等待 1 ms,适合通用场景) +./echo_server --use_iouring + +# io_uring + SQPOLL 模式(最低延迟,需要 CAP_SYS_NICE / root) +./echo_server --use_iouring --iouring_polling_mode=sqpoll + +# io_uring + hybrid 模式(忙转 N 次后阻塞,兼顾延迟和 CPU) +./echo_server --use_iouring --iouring_polling_mode=hybrid + +# io_uring + 零拷贝注册内存(最高吞吐量) +./echo_server --use_iouring --iouring_register_buffers + +# io_uring + SQPOLL + 零拷贝(延迟和吞吐量双优化) +./echo_server --use_iouring --iouring_polling_mode=sqpoll --iouring_register_buffers + +# 客户端(发送至 8000 端口,每秒一次) +./echo_client --server=0.0.0.0:8000 +``` + +--- + +## 与 RDMA 的对比 + +| 特性 | io_uring | RDMA | +|------|----------|------| +| 硬件要求 | 无(标准 Linux,内核 ≥ 5.1) | 需要 RDMA 网卡(InfiniBand/RoCE) | +| 零拷贝发送 | ✅(`WRITE_FIXED`,内核直接 DMA) | ✅(硬件 DMA,绕过内核) | +| 零拷贝接收 | ✅(`READ_FIXED`,预注册 slot) | ✅(内存注册 + DMA) | +| 内存注册 | 可选(`--iouring_register_buffers`) | 必须预注册内存池 | +| 延迟 | 低(减少系统调用) | 极低(硬件实现,μs 级) | +| 吞吐量 | 高(受限于 NIC 带宽) | 极高(可达数十 Gbps) | +| 配置复杂度 | 低 | 高(GID/QPN/内存池等) | diff --git a/example/BUILD.bazel b/example/BUILD.bazel index 4ee7cb140f..e572ff2c0e 100644 --- a/example/BUILD.bazel +++ b/example/BUILD.bazel @@ -34,6 +34,9 @@ COPTS = [ }) + select({ "//bazel/config:brpc_with_rdma": ["-DBRPC_WITH_RDMA=1"], "//conditions:default": [""], +}) + select({ + "//bazel/config:brpc_with_iouring": ["-DBRPC_WITH_IOURING=1"], + "//conditions:default": [""], }) brpc_proto_library( @@ -43,6 +46,13 @@ brpc_proto_library( proto_deps = [], ) +brpc_proto_library( + name = "cc_iouring_performance_proto", + srcs = ["iouring_performance/test.proto"], + include = "iouring_performance", + proto_deps = [], +) + brpc_proto_library( name = "cc_rdma_performance_proto", srcs = ["rdma_performance/test.proto"], @@ -110,6 +120,52 @@ cc_binary( ], ) +cc_binary( + name = "iouring_performance_server", + srcs = [ + "iouring_performance/server.cpp", + ], + includes = [ + "iouring_performance", + ], + 
copts = COPTS + select({ + "//bazel/config:brpc_with_iouring": ["-DBRPC_WITH_IOURING=1"], + "//conditions:default": [""], + }), + deps = [ + ":cc_iouring_performance_proto", + "//:brpc", + ] + select({ + "//bazel/config:brpc_with_iouring": [ + "@com_github_axboe_liburing//:liburing", + ], + "//conditions:default": [], + }), +) + +cc_binary( + name = "iouring_performance_client", + srcs = [ + "iouring_performance/client.cpp", + ], + includes = [ + "iouring_performance", + ], + copts = COPTS + select({ + "//bazel/config:brpc_with_iouring": ["-DBRPC_WITH_IOURING=1"], + "//conditions:default": [""], + }), + deps = [ + ":cc_iouring_performance_proto", + "//:brpc", + ] + select({ + "//bazel/config:brpc_with_iouring": [ + "@com_github_axboe_liburing//:liburing", + ], + "//conditions:default": [], + }), +) + cc_binary( name = "redis_c++_server", srcs = [ diff --git a/example/iouring_echo_c++/CMakeLists.txt b/example/iouring_echo_c++/CMakeLists.txt new file mode 100644 index 0000000000..0aee1833f5 --- /dev/null +++ b/example/iouring_echo_c++/CMakeLists.txt @@ -0,0 +1,139 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +cmake_minimum_required(VERSION 2.8.10) +project(iouring_echo_c++ C CXX) + +option(LINK_SO "Whether examples are linked dynamically" OFF) + +execute_process( + COMMAND bash -c "find ${PROJECT_SOURCE_DIR}/../.. -type d -regex \".*output/include$\" | head -n1 | xargs dirname | tr -d '\n'" + OUTPUT_VARIABLE OUTPUT_PATH +) + +set(CMAKE_PREFIX_PATH ${OUTPUT_PATH}) + +include(FindThreads) +include(FindProtobuf) +protobuf_generate_cpp(PROTO_SRC PROTO_HEADER echo.proto) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +find_library(THRIFT_LIB NAMES thrift) +if (NOT THRIFT_LIB) + set(THRIFT_LIB "") +endif() + +find_path(BRPC_INCLUDE_PATH NAMES brpc/server.h) +if(LINK_SO) + find_library(BRPC_LIB NAMES brpc) +else() + find_library(BRPC_LIB NAMES libbrpc.a brpc) +endif() +if((NOT BRPC_INCLUDE_PATH) OR (NOT BRPC_LIB)) + message(FATAL_ERROR "Fail to find brpc") +endif() +include_directories(${BRPC_INCLUDE_PATH}) + +find_path(GFLAGS_INCLUDE_PATH gflags/gflags.h) +find_library(GFLAGS_LIBRARY NAMES gflags libgflags) +if((NOT GFLAGS_INCLUDE_PATH) OR (NOT GFLAGS_LIBRARY)) + message(FATAL_ERROR "Fail to find gflags") +endif() +include_directories(${GFLAGS_INCLUDE_PATH}) + +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + include(CheckFunctionExists) + CHECK_FUNCTION_EXISTS(clock_gettime HAVE_CLOCK_GETTIME) + if(NOT HAVE_CLOCK_GETTIME) + set(DEFINE_CLOCK_GETTIME "-DNO_CLOCK_GETTIME_IN_MAC") + endif() +endif() + +set(CMAKE_CXX_FLAGS "${DEFINE_CLOCK_GETTIME} -DNDEBUG -O2 -D__const__=__unused__ -pipe -W -Wall -Wno-unused-parameter -fPIC -fno-omit-frame-pointer") + +if(CMAKE_VERSION VERSION_LESS "3.1.3") + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + endif() +else() + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD_REQUIRED ON) +endif() + +find_path(LEVELDB_INCLUDE_PATH NAMES leveldb/db.h) +find_library(LEVELDB_LIB NAMES 
leveldb) +if ((NOT LEVELDB_INCLUDE_PATH) OR (NOT LEVELDB_LIB)) + message(FATAL_ERROR "Fail to find leveldb") +endif() +include_directories(${LEVELDB_INCLUDE_PATH}) + +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(OPENSSL_ROOT_DIR + "/usr/local/opt/openssl" + ) +endif() + +find_package(OpenSSL) +include_directories(${OPENSSL_INCLUDE_DIR}) + +# liburing is required for io_uring support. +find_library(URING_LIB NAMES uring liburing) +if(NOT URING_LIB) + message(WARNING "liburing not found; io_uring support will be disabled at runtime " + "(--use_iouring will print an error). " + "Install liburing-dev to enable it.") + set(URING_LIB "") +endif() + +set(DYNAMIC_LIB + ${CMAKE_THREAD_LIBS_INIT} + ${GFLAGS_LIBRARY} + ${PROTOBUF_LIBRARIES} + ${LEVELDB_LIB} + ${OPENSSL_CRYPTO_LIBRARY} + ${OPENSSL_SSL_LIBRARY} + ${THRIFT_LIB} + ${URING_LIB} + dl + ) + +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(DYNAMIC_LIB ${DYNAMIC_LIB} + pthread + "-framework CoreFoundation" + "-framework CoreGraphics" + "-framework CoreData" + "-framework CoreText" + "-framework Security" + "-framework Foundation" + "-Wl,-U,_MallocExtension_ReleaseFreeMemory" + "-Wl,-U,_ProfilerStart" + "-Wl,-U,_ProfilerStop" + "-Wl,-U,__Z13GetStackTracePPvii" + "-Wl,-U,_mallctl" + "-Wl,-U,_malloc_stats_print" + ) +endif() + +add_executable(echo_client client.cpp ${PROTO_SRC} ${PROTO_HEADER}) +add_executable(echo_server server.cpp ${PROTO_SRC} ${PROTO_HEADER}) + +target_link_libraries(echo_client ${BRPC_LIB} ${DYNAMIC_LIB}) +target_link_libraries(echo_server ${BRPC_LIB} ${DYNAMIC_LIB}) diff --git a/example/iouring_echo_c++/client.cpp b/example/iouring_echo_c++/client.cpp new file mode 100644 index 0000000000..b3437e6951 --- /dev/null +++ b/example/iouring_echo_c++/client.cpp @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// A client sending requests to the iouring echo server every interval_ms. + +#include <gflags/gflags.h> +#include <butil/logging.h> +#include <butil/time.h> +#include <brpc/channel.h> +#include "echo.pb.h" + +DEFINE_string(protocol, "baidu_std", "Protocol type. Defined in src/brpc/options.proto"); +DEFINE_string(connection_type, "", "Connection type. Available values: single, pooled, short"); +DEFINE_string(server, "0.0.0.0:8000", "IP Address of server"); +DEFINE_string(load_balancer, "", "The algorithm for load balancing"); +DEFINE_int32(timeout_ms, 100, "RPC timeout in milliseconds"); +DEFINE_int32(max_retry, 3, "Max retries (not including the first RPC)"); +DEFINE_int32(interval_ms, 1000, "Milliseconds between consecutive requests"); + +int main(int argc, char* argv[]) { + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + + brpc::Channel channel; + + brpc::ChannelOptions options; + options.protocol = FLAGS_protocol; + options.connection_type = FLAGS_connection_type; + options.timeout_ms = FLAGS_timeout_ms; + options.max_retry = FLAGS_max_retry; + if (channel.Init(FLAGS_server.c_str(), FLAGS_load_balancer.c_str(), &options) != 0) { + LOG(ERROR) << "Fail to initialize channel"; + return -1; + } + + example::EchoService_Stub stub(&channel); + + int log_id = 0; + while (!brpc::IsAskedToQuit()) { + example::EchoRequest request; + example::EchoResponse response; + brpc::Controller cntl; + + request.set_message("hello world"); + 
cntl.set_log_id(log_id++); + + stub.Echo(&cntl, &request, &response, NULL); + if (!cntl.Failed()) { + LOG(INFO) << "Received response from " << cntl.remote_side() + << ": " << response.message() + << " latency=" << cntl.latency_us() << "us"; + } else { + LOG(WARNING) << cntl.ErrorText(); + } + usleep(FLAGS_interval_ms * 1000L); + } + + LOG(INFO) << "EchoClient is going to quit"; + return 0; +} diff --git a/example/iouring_echo_c++/echo.proto b/example/iouring_echo_c++/echo.proto new file mode 100644 index 0000000000..2b39627fe8 --- /dev/null +++ b/example/iouring_echo_c++/echo.proto @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +syntax="proto2"; +package example; + +option cc_generic_services = true; + +message EchoRequest { + required string message = 1; +}; + +message EchoResponse { + required string message = 1; +}; + +service EchoService { + rpc Echo(EchoRequest) returns (EchoResponse); +}; diff --git a/example/iouring_echo_c++/server.cpp b/example/iouring_echo_c++/server.cpp new file mode 100644 index 0000000000..5032f84f56 --- /dev/null +++ b/example/iouring_echo_c++/server.cpp @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// An echo server demonstrating all io_uring transport modes supported by bRPC. +// +// --------------------------------------------------------------------------- +// Design: one-thread-per-ring polling mode +// --------------------------------------------------------------------------- +// +// In this mode a dedicated Poller bthread drives the io_uring ring for each +// bthread_tag. All SQ operations (SubmitRead, CutFromIOBufList) execute on +// that Poller thread, so no locking is needed on the hot I/O path. 
+// +// --------------------------------------------------------------------------- +// Quick-start examples +// --------------------------------------------------------------------------- +// +// # Default: epoll (no io_uring) +// ./echo_server +// +// # io_uring, interrupt-driven Poller (none mode, 1 ms timeout between peeks) +// ./echo_server --use_iouring +// +// # io_uring + SQPOLL mode (lowest latency; needs CAP_SYS_NICE / root) +// ./echo_server --use_iouring --iouring_polling_mode=sqpoll +// +// # io_uring + hybrid mode (busy-spin N times then block) +// ./echo_server --use_iouring --iouring_polling_mode=hybrid +// +// # io_uring + zero-copy registered buffers (highest throughput) +// ./echo_server --use_iouring --iouring_register_buffers +// +// # io_uring + SQPOLL + zero-copy (latency + throughput optimised) +// ./echo_server --use_iouring --iouring_polling_mode=sqpoll \ +// --iouring_register_buffers +// +// --------------------------------------------------------------------------- +// Tunables (all --iouring_* flags are forwarded to the transport layer) +// --------------------------------------------------------------------------- +// +// Ring sizing +// --iouring_sq_size=N SQ depth per ring (default 256). +// --iouring_cq_size=N CQ depth per ring (0 = 2 * sq_size). +// +// Poller thread +// --iouring_poller_yield Yield after each poll iteration (reduces CPU, +// increases tail latency). +// --iouring_max_cqe_poll_once=N Max CQEs reaped per peek call (default 32). +// +// Polling mode (selects ring setup flags and CQE-reap strategy) +// --iouring_polling_mode= none No kernel-side polling; Poller waits up +// to 1 ms between peeks (default). +// sqpoll Kernel SQ polling thread (IORING_SETUP_SQPOLL). +// iopoll Block-device completion polling (O_DIRECT only). +// hybrid Busy-spin N times then block. +// --iouring_sqpoll_idle_ms SQPOLL kernel thread idle timeout ms (default 2000). +// --iouring_sqpoll_cpu=N CPU to pin the SQPOLL thread (-1 = no pin). 
+// --iouring_hybrid_spin_count=N Spin iterations for hybrid mode (default 1000). +// +// Registered buffers (zero-copy) +// --iouring_register_buffers Enable READ_FIXED / WRITE_FIXED. +// --iouring_mem_pool_initial_mb=N Initial registered memory (default 256 MiB). +// --iouring_mem_pool_increase_mb=N Growth step (default 256 MiB). +// --iouring_mem_pool_max_regions=N Max regions in total, initial one included (default 8). +// --iouring_iobuf_block_size=N IOBuf block / slot size in bytes (default 8192). +// --iouring_read_slot_num=N Initial read slots per ring (default 256). +// --iouring_read_slot_max=N Max read slots per ring (default 4096). +// --------------------------------------------------------------------------- + +#include +#include +#include +#include +#include "echo.pb.h" + +DEFINE_bool(use_iouring, false, + "Enable io_uring transport. Requires kernel >= 5.1. " + "When false the default epoll transport is used."); + +DEFINE_int32(port, 8000, "TCP Port of this server"); +DEFINE_string(listen_addr, "", "Server listen address, may be IPV4/IPV6/UDS. 
" + "If set, --port is ignored."); +DEFINE_int32(idle_timeout_s, -1, "Connection will be closed if there is no " + "read/write operations during the last `idle_timeout_s'"); + +namespace example { +class EchoServiceImpl : public EchoService { +public: + EchoServiceImpl() {} + virtual ~EchoServiceImpl() {} + virtual void Echo(google::protobuf::RpcController* cntl_base, + const EchoRequest* request, + EchoResponse* response, + google::protobuf::Closure* done) { + brpc::ClosureGuard done_guard(done); + response->set_message(request->message()); + } +}; +} // namespace example + +int main(int argc, char* argv[]) { + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + +#if BRPC_WITH_IOURING + if (FLAGS_use_iouring) { + // GlobalIouringInitializeOrDie() probes kernel opcodes and, when + // --iouring_register_buffers is set, initialises the global memory + // pool so that all subsequent IOBuf allocations come from pre- + // registered pages. Must be called before Server::Start(). + brpc::iouring::GlobalIouringInitializeOrDie(); + + // InitPollingModeWithTag() creates the io_uring ring(s) for bthread + // tag 0 (the default tag used by all worker bthreads) and starts the + // Poller bthread(s). + // + // The Poller thread drives the ring: it dequeues ADD/REMOVE ops, + // issues the first SubmitRead for each new connection, and reaps CQEs + // in a tight loop. Because every SQ operation runs on the Poller + // thread, the ring needs no locking. + // + // Optional callbacks (all nullptr here): + // callback – called after every PollCq pass; use h.Submit() to + // drain your own CQEs and queue new SQEs. + // init_fn – called once after the ring is created. + // release_fn – called just before the ring is destroyed on shutdown. 
+ if (!brpc::iouring::InitPollingModeWithTag(/*tag=*/0)) { + LOG(ERROR) << "Fail to init io_uring polling mode"; + return -1; + } + } +#else + if (FLAGS_use_iouring) { + LOG(ERROR) << "This binary was not compiled with io_uring support " + "(BRPC_WITH_IOURING is not set). " + "Rebuild with -DWITH_IOURING=ON."; + return -1; + } +#endif + + brpc::Server server; + + example::EchoServiceImpl echo_service_impl; + if (server.AddService(&echo_service_impl, + brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(ERROR) << "Fail to add service"; + return -1; + } + + butil::EndPoint point; + if (!FLAGS_listen_addr.empty()) { + if (butil::str2endpoint(FLAGS_listen_addr.c_str(), &point) < 0) { + LOG(ERROR) << "Invalid listen address: " << FLAGS_listen_addr; + return -1; + } + } else { + point = butil::EndPoint(butil::IP_ANY, FLAGS_port); + } + + brpc::ServerOptions options; + options.idle_timeout_sec = FLAGS_idle_timeout_s; + if (server.Start(point, &options) != 0) { + LOG(ERROR) << "Fail to start EchoServer"; + return -1; + } + + server.RunUntilAskedToQuit(); + return 0; +} diff --git a/src/brpc/input_messenger.h b/src/brpc/input_messenger.h index 8482c3f3fc..ab13547c58 100644 --- a/src/brpc/input_messenger.h +++ b/src/brpc/input_messenger.h @@ -29,7 +29,11 @@ namespace brpc { namespace rdma { class RdmaEndpoint; } +namespace iouring { +class IouringEndpoint; +} class TcpTransport; +class IouringTransport; struct InputMessageHandler { // The callback to cut a message from `source'. 
// Returned message will be passed to process_request or process_response @@ -92,7 +96,9 @@ class InputMessageClosure { class InputMessenger : public SocketUser { friend class Socket; friend class TcpTransport; +friend class IouringTransport; friend class rdma::RdmaEndpoint; +friend class iouring::IouringEndpoint; public: explicit InputMessenger(size_t capacity = 128); ~InputMessenger(); diff --git a/src/brpc/iouring/iouring_block_pool.cpp b/src/brpc/iouring/iouring_block_pool.cpp new file mode 100644 index 0000000000..99ac1fb7e9 --- /dev/null +++ b/src/brpc/iouring/iouring_block_pool.cpp @@ -0,0 +1,587 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#if BRPC_WITH_IOURING
+
+// NOTE(review): the targets of the six system includes below were lost when
+// this patch was extracted. The names are inferred from what this file uses
+// (posix_memalign/free, memset, uintptr_t, std::min, gflags, io_uring) and
+// must be confirmed against the original commit.
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <algorithm>
+
+#include <gflags/gflags.h>
+#include <liburing.h>
+
+#include "butil/errno.h"                       // berror()
+#include "butil/logging.h"
+#include "butil/scoped_lock.h"                 // BAIDU_SCOPED_LOCK
+#include "butil/iobuf.h"                       // butil::iobuf::blockmem_allocate
+#include "brpc/iouring/iouring_block_pool.h"
+
+// iobuf internal hooks – declared in iobuf.cpp / iobuf_inl.h
+namespace butil {
+namespace iobuf {
+extern void* (*blockmem_allocate)(size_t);
+extern void (*blockmem_deallocate)(void*);
+}
+}
+
+namespace brpc {
+namespace iouring {
+
+// ---------------------------------------------------------------------------
+// gflags
+// ---------------------------------------------------------------------------
+
+DEFINE_bool(iouring_register_buffers, false,
+            "Enable io_uring pre-registered buffer I/O (READ_FIXED / "
+            "WRITE_FIXED). When true, all IOBuf blocks are allocated from a "
+            "registered slab so that writes can use IORING_OP_WRITE_FIXED "
+            "with no per-op page pinning, and reads use IORING_OP_READ_FIXED. "
+            "Requires kernel >= 5.1.");
+
+DEFINE_int32(iouring_mem_pool_initial_mb, 256,
+             "Initial size of the io_uring fixed-buffer memory pool (MB). "
+             "Effective only with --iouring_register_buffers=true.");
+
+DEFINE_int32(iouring_mem_pool_increase_mb, 256,
+             "Growth increment when the pool is exhausted (MB). "
+             "Effective only with --iouring_register_buffers=true.");
+
+// The value is latched by IouringMemPool::Init(): raising it afterwards does
+// not allow more regions than were reserved at Init time.
+DEFINE_int32(iouring_mem_pool_max_regions, 8,
+             "Maximum number of memory regions. "
+             "Each region causes one io_uring_register_buffers_update() per "
+             "ring on growth.");
+
+DEFINE_int32(iouring_iobuf_block_size, 8192,
+             "Size of each IOBuf block when --iouring_register_buffers=true. "
+             "butil::SetDefaultBlockSize() is called with this value at "
+             "startup so IOBuf and the registered slab are always in sync.");
+
+DEFINE_int32(iouring_read_slot_num, 256,
+             "Initial read slots per Poller ring.");
+
+DEFINE_int32(iouring_read_slot_max, 4096,
+             "Maximum read slots per Poller ring.");
+
+// ---------------------------------------------------------------------------
+// Returns the current value of --iouring_register_buffers.
+bool IsFixedBuffersEnabled() { return FLAGS_iouring_register_buffers; }
+
+// ---------------------------------------------------------------------------
+// IouringMemPool – implementation
+// ---------------------------------------------------------------------------
+
+__thread IouringMemPool::FreeNode* IouringMemPool::tls_free_ = nullptr;
+__thread size_t IouringMemPool::tls_free_cnt_ = 0;
+
+static const size_t kTlsCacheMax = 128;   // blocks cached per thread
+static const size_t kBytesPerMB = 1UL << 20;
+
+IouringMemPool& IouringMemPool::Instance() {
+    static IouringMemPool inst;
+    return inst;
+}
+
+// ---------------------------------------------------------------------------
+// Init
+//
+// Validates |block_size|, allocates the initial region and only then swaps
+// IOBuf's block allocator hooks for the pool's. Returns true on success or
+// if the pool is already initialised; returns false (hooks untouched) when
+// |block_size| is invalid or the initial region cannot be allocated.
+// ---------------------------------------------------------------------------
+bool IouringMemPool::Init(size_t block_size) {
+    if (initialized_) {
+        LOG(WARNING) << "IouringMemPool already initialized";
+        return true;
+    }
+    if (block_size == 0 || block_size % 4096 != 0) {
+        LOG(ERROR) << "IouringMemPool::Init: block_size must be a nonzero "
+                      "multiple of 4096, got " << block_size;
+        return false;
+    }
+
+    block_size_ = block_size;
+
+    bool region_added = false;
+    {
+        // AddRegion() must run under extend_lock_.
+        BAIDU_SCOPED_LOCK(extend_lock_);
+        // Reserve the whole region table up front. GetBufIndex() walks
+        // regions_ without a lock, so push_back must never reallocate the
+        // vector; AddRegion() refuses to grow past this capacity.
+        const int max_regions = FLAGS_iouring_mem_pool_max_regions;
+        if (max_regions > 0) {
+            regions_.reserve(static_cast<size_t>(max_regions));
+        }
+        region_added = AddRegion(
+            static_cast<size_t>(FLAGS_iouring_mem_pool_initial_mb));
+    }
+    if (!region_added) {
+        // The hooks have not been installed yet, so there is nothing to undo.
+        return false;
+    }
+
+    // Hook IOBuf's block allocator only once the pool can serve requests.
+    prev_allocate_ = butil::iobuf::blockmem_allocate;
+    prev_deallocate_ = butil::iobuf::blockmem_deallocate;
+    butil::iobuf::blockmem_allocate = MemPoolAllocate;
+    butil::iobuf::blockmem_deallocate = MemPoolDeallocate;
+
+    initialized_ = true;
+    LOG(INFO) << "IouringMemPool ready: block_size=" << block_size_
+              << " initial_mb=" << FLAGS_iouring_mem_pool_initial_mb;
+    return true;
+}
+
+// ---------------------------------------------------------------------------
+// Destroy
+//
+// Restores the previous IOBuf hooks and frees every region. No-op when the
+// pool is not initialised.
+//
+// NOTE(review): only the calling thread's TLS cache is reset here. Blocks
+// cached in other threads' TLS free-lists, and blocks still owned by live
+// IOBufs, keep pointing into the freed regions – callers must make sure the
+// pool is quiescent before calling Destroy().
+// ---------------------------------------------------------------------------
+void IouringMemPool::Destroy() {
+    if (!initialized_) { return; }
+
+    butil::iobuf::blockmem_allocate = prev_allocate_;
+    butil::iobuf::blockmem_deallocate = prev_deallocate_;
+
+    {
+        BAIDU_SCOPED_LOCK(extend_lock_);
+        // Hide the regions from lock-free GetBufIndex() readers before the
+        // table is emptied; otherwise a stale count indexes a cleared vector.
+        region_count_.store(0, butil::memory_order_release);
+        for (auto& r : regions_) {
+            free(reinterpret_cast<void*>(r.base));
+        }
+        regions_.clear();
+    }
+    {
+        BAIDU_SCOPED_LOCK(free_lock_);
+        global_free_ = nullptr;
+    }
+    // Drop this thread's cached blocks: they lived in the regions just freed.
+    tls_free_ = nullptr;
+    tls_free_cnt_ = 0;
+    initialized_ = false;
+}
+
+// ---------------------------------------------------------------------------
+// AddRingRegistrar / RemoveRingRegistrar
+// ---------------------------------------------------------------------------
+
+// Replays every existing region to |cb| and then records (|ring|, |cb|) so
+// that AddRegion() notifies this ring about future regions.
+//
+// Locks are taken in the pool-wide order extend_lock_ -> registrar_lock_,
+// the same order AddRegion() uses when called from Allocate(). Taking them
+// the other way round can deadlock against a concurrent pool growth. Both
+// locks are held across replay and registration so that no region can be
+// added in between and be missed by the new ring.
+void IouringMemPool::AddRingRegistrar(struct io_uring* ring, RegionRegisterCb cb) {
+    BAIDU_SCOPED_LOCK(extend_lock_);
+    {
+        BAIDU_SCOPED_LOCK(registrar_lock_);
+        // buf_index_base was computed when the region was added and is
+        // identical for every ring.
+        for (const auto& r : regions_) {
+            cb(reinterpret_cast<void*>(r.base), r.size, r.block_size,
+               r.buf_index_base);
+        }
+        reg_rings_.push_back(ring);
+        reg_cbs_.push_back(std::move(cb));
+    }
+}
+
+// Forgets the first registration whose ring pointer equals |ring|, if any.
+void IouringMemPool::RemoveRingRegistrar(struct io_uring* ring) {
+    BAIDU_SCOPED_LOCK(registrar_lock_);
+    for (size_t i = 0; i < reg_rings_.size(); ++i) {
+        if (reg_rings_[i] == ring) {
+            reg_rings_.erase(reg_rings_.begin() + i);
+            reg_cbs_.erase(reg_cbs_.begin() + i);
+            break;
+        }
+    }
+    // No per-region ring state to clean up: buf_index_base is ring-agnostic.
+}
+
+// ---------------------------------------------------------------------------
+// AddRegion
+//
+// Allocates a zeroed, 4096-aligned region of |region_size_mb| MB (rounded
+// down to a multiple of block_size_), reports it to every registered ring,
+// publishes it to GetBufIndex() and pushes its blocks on the global
+// free-list. Returns false when the region limit is reached, the size rounds
+// down to zero, or the allocation fails.
+//
+// Must be called under extend_lock_ (but NOT under free_lock_ or
+// registrar_lock_, which are acquired internally in the correct order).
+//
+// Lock ordering enforced throughout IouringMemPool:
+//   extend_lock_     (coarsest – serialises region growth)
+//   registrar_lock_  (ring registration callbacks)
+//   free_lock_       (free-list access – fine-grained, brief)
+// ---------------------------------------------------------------------------
+bool IouringMemPool::AddRegion(size_t region_size_mb) {
+    // The capacity check keeps push_back below from reallocating regions_
+    // while GetBufIndex() readers walk it without a lock (see Init()).
+    if (regions_.size() >= regions_.capacity() ||
+        static_cast<int>(regions_.size()) >= FLAGS_iouring_mem_pool_max_regions) {
+        LOG_EVERY_SECOND(ERROR)
+            << "IouringMemPool: max regions (" << FLAGS_iouring_mem_pool_max_regions
+            << ") reached. Increase --iouring_mem_pool_max_regions.";
+        return false;
+    }
+
+    const size_t region_size = region_size_mb * kBytesPerMB;
+    // Round down to a multiple of block_size_.
+    const size_t aligned_size = (region_size / block_size_) * block_size_;
+    if (aligned_size == 0) {
+        LOG(ERROR) << "IouringMemPool: region_size_mb too small";
+        return false;
+    }
+
+    void* mem = nullptr;
+    // posix_memalign() reports failure through its return value and does not
+    // set errno, so the code is formatted explicitly.
+    const int rc = posix_memalign(&mem, 4096, aligned_size);
+    if (rc != 0) {
+        LOG(ERROR) << "IouringMemPool: posix_memalign failed: " << berror(rc);
+        return false;
+    }
+    memset(mem, 0, aligned_size);
+
+    // Compute buf_index_base: sum of blocks in all existing regions.
+    // Safe to read regions_ here because we hold extend_lock_.
+    // The same value is used for every ring (all rings share the same
+    // iovec layout), so we only need to store it once per Region.
+    int buf_index_base = 0;
+    for (const auto& r : regions_) {
+        buf_index_base += r.block_count;
+    }
+
+    const int blocks = static_cast<int>(aligned_size / block_size_);
+
+    Region region;
+    region.base = reinterpret_cast<uintptr_t>(mem);
+    region.size = aligned_size;
+    region.block_size = block_size_;
+    region.buf_index_base = buf_index_base;
+    region.block_count = blocks;
+
+    // Notify all registered rings. extend_lock_ is already held here, so
+    // registrar_lock_ nests inside it; no other code path may take
+    // extend_lock_ while holding registrar_lock_.
+    {
+        BAIDU_SCOPED_LOCK(registrar_lock_);
+        for (size_t i = 0; i < reg_rings_.size(); ++i) {
+            reg_cbs_[i](mem, aligned_size, block_size_, buf_index_base);
+        }
+    }
+
+    regions_.push_back(std::move(region));
+    // Publish the new region count atomically so GetBufIndex lock-free readers
+    // can discover the new region without holding extend_lock_.
+    region_count_.store(static_cast<int>(regions_.size()),
+                        butil::memory_order_release);
+
+    // Populate the global free-list with all blocks in the new region.
+    // free_lock_ is the innermost lock; safe to acquire under extend_lock_.
+    {
+        BAIDU_SCOPED_LOCK(free_lock_);
+        for (int i = blocks - 1; i >= 0; --i) {
+            auto* node = reinterpret_cast<FreeNode*>(
+                reinterpret_cast<char*>(mem) + static_cast<size_t>(i) * block_size_);
+            node->next = global_free_;
+            global_free_ = node;
+        }
+    }
+
+    LOG(INFO) << "IouringMemPool: added region base=" << mem
+              << " size_mb=" << region_size_mb
+              << " blocks=" << blocks
+              << " buf_index_base=" << buf_index_base;
+    return true;
+}
+
+// ---------------------------------------------------------------------------
+// Allocate / Deallocate
+// ---------------------------------------------------------------------------
+
+// Returns one block of block_size_ bytes, or nullptr when |size| does not
+// fit in a block or the pool cannot grow any further.
+//
+// Fast path: pop from the calling thread's TLS free-list (no lock).
+// Slow path: refill the TLS list from the global list, growing the pool by
+// one region first if the global list is empty.
+//
+// Lock ordering: extend_lock_ (coarsest) -> free_lock_ (finest).
+// free_lock_ is never held while acquiring extend_lock_.
+void* IouringMemPool::Allocate(size_t size) {
+    // Every block is exactly block_size_ bytes; handing one out for a larger
+    // request would let the caller write past its end.
+    if (size > block_size_) {
+        LOG_EVERY_SECOND(ERROR)
+            << "IouringMemPool: requested size " << size
+            << " exceeds block_size " << block_size_;
+        return nullptr;
+    }
+
+    if (!tls_free_) {
+        // Moves up to kTlsCacheMax / 2 blocks from the global list to this
+        // thread's cache. The caller must hold free_lock_.
+        auto refill_tls_locked = [this]() {
+            const size_t target = kTlsCacheMax / 2;
+            size_t moved = 0;
+            while (global_free_ && moved < target) {
+                FreeNode* node = global_free_;
+                global_free_ = node->next;
+                node->next = tls_free_;
+                tls_free_ = node;
+                ++tls_free_cnt_;
+                ++moved;
+            }
+        };
+
+        // Step 1: steal from the global list under free_lock_ alone.
+        bool refilled = false;
+        {
+            BAIDU_SCOPED_LOCK(free_lock_);
+            if (global_free_) {
+                refill_tls_locked();
+                refilled = true;
+            }
+        }  // free_lock_ released here
+
+        if (!refilled) {
+            // Step 2: the global list was empty – try to grow the pool.
+            {
+                BAIDU_SCOPED_LOCK(extend_lock_);  // serialize growth
+                // Re-check under extend_lock_: another thread may have
+                // grown the pool already.
+                bool need_grow = false;
+                {
+                    BAIDU_SCOPED_LOCK(free_lock_);
+                    need_grow = (global_free_ == nullptr);
+                }
+                if (need_grow &&
+                    !AddRegion(static_cast<size_t>(
+                        FLAGS_iouring_mem_pool_increase_mb))) {
+                    LOG_EVERY_SECOND(ERROR)
+                        << "IouringMemPool: out of memory, cannot grow.";
+                    return nullptr;
+                }
+            }  // extend_lock_ released here
+
+            // Step 3: steal from the (normally) non-empty global list. Other
+            // threads may have drained it again; that is handled below.
+            BAIDU_SCOPED_LOCK(free_lock_);
+            refill_tls_locked();
+        }
+
+        if (!tls_free_) { return nullptr; }
+    }
+
+    FreeNode* node = tls_free_;
+    tls_free_ = node->next;
+    --tls_free_cnt_;
+    return node;
+}
+
+// Returns |ptr| to the pool; nullptr is ignored. The block goes to the
+// calling thread's TLS cache. When that cache already holds kTlsCacheMax
+// blocks, half of them are first flushed to the global list.
+//
+// NOTE(review): |ptr| is not checked against the region table, so a block
+// that was obtained from the previous IOBuf allocator (before Init()) would
+// be absorbed into the pool here – confirm no IOBuf is created before Init().
+void IouringMemPool::Deallocate(void* ptr) {
+    if (!ptr) { return; }
+
+    // TLS fast-path: cache locally.
+    if (tls_free_cnt_ < kTlsCacheMax) {
+        auto* node = reinterpret_cast<FreeNode*>(ptr);
+        node->next = tls_free_;
+        tls_free_ = node;
+        ++tls_free_cnt_;
+        return;
+    }
+
+    // TLS is full: flush half to the global list.
+    BAIDU_SCOPED_LOCK(free_lock_);
+    const size_t flush = kTlsCacheMax / 2;
+    for (size_t i = 0; i < flush && tls_free_; ++i) {
+        FreeNode* node = tls_free_;
+        tls_free_ = node->next;
+        --tls_free_cnt_;
+        node->next = global_free_;
+        global_free_ = node;
+    }
+    // Then put the current block into TLS.
+    auto* node = reinterpret_cast<FreeNode*>(ptr);
+    node->next = tls_free_;
+    tls_free_ = node;
+    ++tls_free_cnt_;
+}
+
+// Static hooks for butil::iobuf.
+void* IouringMemPool::MemPoolAllocate(size_t size) {
+    return Instance().Allocate(size);
+}
+void IouringMemPool::MemPoolDeallocate(void* ptr) {
+    Instance().Deallocate(ptr);
+}
+
+// ---------------------------------------------------------------------------
+// GetBufIndex
+//
+// Returns the registered-buffer index of the block containing |ptr|, or -1
+// if |ptr| is null or lies outside every published region. |ring| is unused:
+// buf_index_base is the same for every ring.
+//
+// Lock-free: read region_count_ (acquire) to get a stable count, then walk
+// regions_[0..count-1] without a lock. This relies on:
+//   - AddRegion storing to region_count_ with memory_order_release AFTER
+//     pushing the new entry, so observing count == N makes
+//     regions_[0..N-1] fully visible.
+//   - regions_ never reallocating: its capacity is reserved in Init() and
+//     AddRegion refuses to exceed it, so published entries never move.
+//   - Destroy() resetting region_count_ to 0 before it empties regions_.
+// If AddRegion is running concurrently and we read a stale count we simply
+// miss the new region and return -1; the header documents that callers then
+// fall back to a non-fixed write for that block.
+// ---------------------------------------------------------------------------
+int IouringMemPool::GetBufIndex(struct io_uring* ring, const void* ptr) const {
+    if (!ptr) { return -1; }
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
+
+    // Load the count with acquire semantics so we see all stores from
+    // AddRegion that preceded the region_count_ store.
+    const int count = region_count_.load(butil::memory_order_acquire);
+
+    for (int ri = 0; ri < count; ++ri) {
+        const Region& r = regions_[ri];
+        if (addr < r.base || addr >= r.base + r.size) { continue; }
+
+        // Found the containing region. buf_index_base is identical for every
+        // ring, so no per-ring lookup is needed — O(1) direct computation.
+        const int block_offset =
+            static_cast<int>((addr - r.base) / r.block_size);
+        (void)ring;  // ring param kept for API compatibility / future use
+        return r.buf_index_base + block_offset;
+    }
+    return -1;
+}
+
+// ---------------------------------------------------------------------------
+// IouringReadSlotPool – implementation
+// ---------------------------------------------------------------------------
+
+// Binds the pool to |ring| and pre-allocates |initial_slots| receive buffers.
+// Fails when an argument is invalid, when IouringMemPool is not initialised,
+// when |slot_buf_size| differs from the MemPool block size, or when the
+// initial buffers cannot be allocated.
+bool IouringReadSlotPool::Init(struct io_uring* ring,
+                               int initial_slots,
+                               int max_slot_count,
+                               size_t slot_buf_size) {
+    if (!ring || initial_slots <= 0 || max_slot_count < initial_slots
+            || slot_buf_size == 0) {
+        LOG(ERROR) << "IouringReadSlotPool::Init: invalid arguments";
+        return false;
+    }
+
+    // Receive slots must come from IouringMemPool so that they reside in
+    // pre-registered memory and the kernel can DMA directly into them
+    // (IORING_OP_READ_FIXED) without per-op page pinning.
+    IouringMemPool& mp = IouringMemPool::Instance();
+    if (!mp.initialized()) {
+        LOG(ERROR) << "IouringReadSlotPool::Init: IouringMemPool not initialised; "
+                      "--iouring_register_buffers must be true";
+        return false;
+    }
+    if (slot_buf_size != mp.block_size()) {
+        LOG(ERROR) << "IouringReadSlotPool::Init: slot_buf_size " << slot_buf_size
+                   << " != MemPool block_size " << mp.block_size();
+        return false;
+    }
+
+    ring_ = ring;
+    slot_buf_size_ = slot_buf_size;
+    max_slot_count_ = max_slot_count;
+
+    if (!GrowBy(initial_slots)) {
+        ring_ = nullptr;
+        return false;
+    }
+    return true;
+}
+
+// ---------------------------------------------------------------------------
+// IouringReadSlotPool::GrowBy
+//
+// Adds up to |n| slots, clamped so the total never exceeds max_slot_count_.
+// Either all (clamped) slots are added or none: on allocation failure the
+// buffers obtained so far are returned to IouringMemPool.
+// ---------------------------------------------------------------------------
+bool IouringReadSlotPool::GrowBy(int n) {
+    if (total_slot_count_ + n > max_slot_count_) {
+        n = max_slot_count_ - total_slot_count_;
+        if (n <= 0) {
+            LOG_EVERY_SECOND(WARNING)
+                << "IouringReadSlotPool: max_slot_count reached ("
+                << max_slot_count_ << ")";
+            return false;
+        }
+    }
+
+    const int base_idx = total_slot_count_;
+
+    // All receive buffers come from IouringMemPool so they are automatically
+    // covered by the ring's registered buffer table.
+    IouringMemPool& mp = IouringMemPool::Instance();
+
+    // Accumulate allocations before committing; allows clean rollback on OOM.
+    std::vector<void*> allocated;
+    allocated.reserve(static_cast<size_t>(n));
+    for (int i = 0; i < n; ++i) {
+        void* blk = mp.Allocate(slot_buf_size_);
+        if (!blk) {
+            LOG(ERROR) << "IouringReadSlotPool: IouringMemPool OOM after "
+                       << i << " of " << n << " receive buffers";
+            for (void* p : allocated) { mp.Deallocate(p); }
+            return false;
+        }
+        allocated.push_back(blk);
+    }
+
+    // Commit: add entries and mark all new slots as free.
+    // Invariant preserved: entries_[k] is always the entry of slot index k.
+    for (int i = 0; i < n; ++i) {
+        entries_.push_back({allocated[static_cast<size_t>(i)]});
+        free_slot_indices_.push_back(base_idx + i);
+        ++total_slot_count_;
+    }
+    return true;
+}
+
+// ---------------------------------------------------------------------------
+// Acquire
+//
+// Fills |out| with a free slot. When no slot is free the pool first tries to
+// double in size (bounded by max_slot_count_); returns false if it cannot.
+// ---------------------------------------------------------------------------
+bool IouringReadSlotPool::Acquire(IouringReadSlot* out) {
+    if (free_slot_indices_.empty()) {
+        const int grow = std::min(total_slot_count_,
+                                  max_slot_count_ - total_slot_count_);
+        if (grow <= 0 || !GrowBy(grow)) { return false; }
+    }
+
+    const int idx = free_slot_indices_.back();
+    free_slot_indices_.pop_back();
+
+    // entries_[k] belongs to slot index k (GrowBy appends in order), so
+    // direct index access is O(1). An out-of-range idx can only happen
+    // due to memory corruption – DCHECK catches it in debug builds.
+    DCHECK_GE(idx, 0);
+    DCHECK_LT(idx, static_cast<int>(entries_.size()));
+    const ReadSlotEntry& e = entries_[static_cast<size_t>(idx)];
+    out->buf = e.buf;
+    out->buf_index = IouringMemPool::Instance().GetBufIndex(ring_, e.buf);
+    out->size = slot_buf_size_;
+    out->slot_idx = idx;
+    return true;
+}
+
+// ---------------------------------------------------------------------------
+// Release
+//
+// Marks the slot identified by |slot|.slot_idx as free again.
+// ---------------------------------------------------------------------------
+void IouringReadSlotPool::Release(const IouringReadSlot& slot) {
+    // slot.slot_idx was stored by Acquire; use it directly for O(1) release.
+    const int idx = slot.slot_idx;
+    DCHECK_GE(idx, 0);
+    DCHECK_LT(idx, static_cast<int>(entries_.size()));
+    // Sanity check that the token still names the entry it was acquired
+    // from. This catches a corrupted or foreign token; it does NOT detect a
+    // double release, because the second release carries the same buf.
+    DCHECK_EQ(entries_[static_cast<size_t>(idx)].buf, slot.buf)
+        << "IouringReadSlotPool::Release: buf mismatch at slot_idx=" << idx;
+    free_slot_indices_.push_back(idx);
+}
+
+// ---------------------------------------------------------------------------
+// Destroy
+//
+// Returns every receive buffer to IouringMemPool and resets the pool.
+// No-op when the pool was never initialised (ring_ is null).
+// ---------------------------------------------------------------------------
+void IouringReadSlotPool::Destroy() {
+    if (!ring_) { return; }
+
+    IouringMemPool& mp = IouringMemPool::Instance();
+    for (auto& e : entries_) {
+        mp.Deallocate(e.buf);
+        e.buf = nullptr;
+    }
+    entries_.clear();
+    free_slot_indices_.clear();
+    total_slot_count_ = 0;
+    ring_ = nullptr;
+}
+
+} // namespace iouring
+} // namespace brpc
+
+#endif // BRPC_WITH_IOURING
diff --git a/src/brpc/iouring/iouring_block_pool.h b/src/brpc/iouring/iouring_block_pool.h
new file mode 100644
index 0000000000..85716ae4ec
--- /dev/null
+++ b/src/brpc/iouring/iouring_block_pool.h
@@ -0,0 +1,332 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// iouring_block_pool.h
+//
+// Two cooperating subsystems for io_uring pre-registered I/O buffers.
+//
+// Background
+// ----------
+// io_uring supports pre-registering memory regions with the kernel via
+// io_uring_register_buffers().  Once registered, any page in those regions is
+// permanently pinned, so individual I/O operations skip the per-op
+// get_user_pages / put_page overhead.  Registered buffers work for BOTH
+// directions:
+//   IORING_OP_READ_FIXED  – kernel DMA-writes received data into the buf
+//   IORING_OP_WRITE_FIXED – kernel DMA-reads send data from the buf
+// The |buf_index| argument selects which registered iovec entry to use.
+//
+// Subsystem 1 – IouringMemPool (global, one per process)
+// --------------------------------------------------------
+// Replaces butil::iobuf::blockmem_allocate so that EVERY IOBuf block comes
+// from a pre-registered slab.  This means that IOBuf blocks on the write
+// path can be handed directly to IORING_OP_WRITE_FIXED without any copy.
+//
+// Layout of one region (slab):
+//
+//   [ iovec_table_entry_0 ][ iovec_table_entry_1 ] ... [ iovec_table_entry_N ]
+//   ^                                                                         ^
+//   region.base                                    region.base + region.size
+//
+// Each iovec entry covers exactly one IOBuf block (iouring_iobuf_block_size
+// bytes).  The kernel buf_index for block at offset k in region r is:
+//
+//   buf_index = region.buf_index_base + k
+//
+// GetBufIndex(ring, ptr) does this lookup in O(max_regions) time, same as
+// RDMA's GetRegionId().
+//
+// Dynamic growth
+// --------------
+// When the pool runs out of blocks a new region is allocated and all rings
+// are updated via io_uring_register_buffers_update() (kernel >= 5.13) or a
+// full re-registration on older kernels.  Growth is serialised by a mutex;
+// the hot allocation path is lock-free (TLS free-list).
+//
+// Subsystem 2 – IouringReadSlotPool (per-Poller-ring)
+// ------------------------------------------------------
+// Manages a set of fixed-size receive buffers ("read slots") for
+// IORING_OP_READ_FIXED.  Each slot is one pre-registered buffer that is
+// handed directly to a READ_FIXED SQE; the kernel DMA-writes received data
+// into it without any per-op page pinning.
+//
+// Slots are always allocated from IouringMemPool (same pre-registered slab),
+// so no additional io_uring_register_buffers call is needed per ring.
+// Requires --iouring_register_buffers=true.
+//
+// Thread safety
+// -------------
+// IouringMemPool: thread-safe (TLS fast-path + mutex for growth).
+// IouringReadSlotPool: NOT thread-safe.  All Acquire/Release calls must
+// happen on the Poller thread (enforced by design).
+ +#ifndef BRPC_IOURING_BLOCK_POOL_H +#define BRPC_IOURING_BLOCK_POOL_H + +#if BRPC_WITH_IOURING + +#include +#include +#include +#include +#include +#include "butil/atomicops.h" +#include "butil/macros.h" +#include "butil/synchronization/lock.h" +#include + +namespace brpc { +namespace iouring { + +// --------------------------------------------------------------------------- +// gflags (defined in iouring_block_pool.cpp) +// --------------------------------------------------------------------------- + +DECLARE_bool(iouring_register_buffers); // master switch: enables fixed-buf I/O + +// IOBuf block pool (for WRITE_FIXED) +DECLARE_int32(iouring_mem_pool_initial_mb); // initial slab size per region (MB) +DECLARE_int32(iouring_mem_pool_increase_mb); // increment on growth (MB) +DECLARE_int32(iouring_mem_pool_max_regions); // max regions (default 8) +DECLARE_int32(iouring_iobuf_block_size); // IOBuf block size; synced to butil via SetDefaultBlockSize() + +// Per-ring read slots (for READ_FIXED) +DECLARE_int32(iouring_read_slot_num); // initial slots per ring (default 256) +DECLARE_int32(iouring_read_slot_max); // max slots per ring (default 4096) + +// --------------------------------------------------------------------------- +// IouringReadSlot – token held by an endpoint while a READ_FIXED is in flight +// --------------------------------------------------------------------------- +struct IouringReadSlot { + void* buf; // pointer to the slot's memory + int buf_index; // index in the ring's registered iovec table + size_t size; // slot size in bytes + int slot_idx; // index into IouringReadSlotPool::entries_ (for O(1) Release) +}; + +// --------------------------------------------------------------------------- +// IouringMemPool +// +// Global memory pool for IOBuf blocks. Must be initialised once (via Init) +// before any IOBuf is allocated. 
+// +// Usage: +// // At startup (before any IOBuf allocation): +// IouringMemPool::Instance().Init(block_size); +// +// // In CutFromIOBufList, for each IOBuf block: +// int buf_index = IouringMemPool::Instance().GetBufIndex(ring, block_ptr); +// if (buf_index >= 0) +// // submit IORING_OP_WRITE_FIXED(block_ptr, buf_index) +// else +// // fallback IORING_OP_WRITEV +// --------------------------------------------------------------------------- + +// Callback invoked whenever a new region is allocated, once per registered +// ring. The implementation should call io_uring_register_buffers_update() +// (or equivalent) to pin the new pages in the ring's registered-buffer table. +// |buf_index_base| is the starting index for this region and is identical +// across all rings (all rings share the same iovec layout). +using RegionRegisterCb = std::function; + +class IouringMemPool { +public: + static IouringMemPool& Instance(); + + // Must be called once before any IOBuf allocation. + // |block_size|: size of each IOBuf block (== iouring_iobuf_block_size). + // After Init(), butil::iobuf::blockmem_allocate is replaced with + // MemPoolAllocate and blockmem_deallocate with MemPoolDeallocate. + bool Init(size_t block_size); + + void Destroy(); + + // Register a callback invoked for each new region on each ring. + // Multiple rings can register; all are called in AddRegion(). + void AddRingRegistrar(struct io_uring* ring, RegionRegisterCb cb); + void RemoveRingRegistrar(struct io_uring* ring); + + // Allocate one block of block_size_ bytes from registered memory. + // Called via blockmem_allocate hook. + void* Allocate(size_t size); + + // Return a block to the pool. + // Called via blockmem_deallocate hook. + void Deallocate(void* ptr); + + // Look up the buf_index for a pointer inside a registered region, + // for a specific ring. + // Returns -1 if |ptr| is not in any registered region. 
+ // + // Hot path: uses a lock-free snapshot of region_count_ so that the + // common case (ptr found in an existing region) avoids any mutex. + // A concurrent AddRegion publishes the new count only AFTER the Region + // entry is fully initialised, so reading a stale count simply means we + // miss the newest region and return -1 (caller falls back to WRITEV). + int GetBufIndex(struct io_uring* ring, const void* ptr) const; + + bool initialized() const { return initialized_; } + size_t block_size() const { return block_size_; } + +private: + IouringMemPool() = default; + ~IouringMemPool() { Destroy(); } + + // Grow by allocating and registering a new region. + // Called under extend_lock_. + bool AddRegion(size_t region_size_mb); + + struct Region { + uintptr_t base; + size_t size; // total bytes + size_t block_size; // bytes per block in this region + // buf_index_base is the same for every ring (io_uring assigns indices + // in a global namespace per ring, but all rings use the same iovec + // layout: region 0 gets [0, N0), region 1 gets [N0, N0+N1), …). + // Storing it once eliminates the per-ring inner loop in GetBufIndex. + int buf_index_base; // first buf_index of this region (all rings) + int block_count; // aligned_size / block_size (cached) + }; + + struct FreeNode { + FreeNode* next; + }; + + // TLS free-list for the hot (lock-free) allocation path. + static __thread FreeNode* tls_free_; + static __thread size_t tls_free_cnt_; + + static void* MemPoolAllocate(size_t size); + static void MemPoolDeallocate(void* ptr); + + // Previous allocator hooks (saved at Init time for fallback). + void* (*prev_allocate_)(size_t) = nullptr; + void (*prev_deallocate_)(void*) = nullptr; + + bool initialized_ = false; + size_t block_size_ = 0; + + // Global free-list (protected by free_lock_). + butil::Mutex free_lock_; + FreeNode* global_free_ = nullptr; + + // Region table (protected by extend_lock_). 
+ // region_count_ is a lock-free snapshot: written by AddRegion (under + // extend_lock_) AFTER the new Region is pushed, read by GetBufIndex + // without any lock for the fast path. + mutable butil::Mutex extend_lock_; + std::vector regions_; + butil::atomic region_count_{0}; + + // Per-ring callbacks (protected by registrar_lock_). + butil::Mutex registrar_lock_; + std::vector reg_rings_; + std::vector reg_cbs_; + + DISALLOW_COPY_AND_ASSIGN(IouringMemPool); +}; + +// --------------------------------------------------------------------------- +// IouringReadSlotPool (per-Poller-ring, receive buffers for READ_FIXED) +// +// Owns a pool of pre-registered receive buffers. Call Acquire() before +// submitting a READ_FIXED SQE to get a (buf, buf_index, size) triple; +// call Release() after the CQE is consumed and the data has been copied. +// --------------------------------------------------------------------------- +class IouringReadSlotPool { +public: + IouringReadSlotPool() = default; + ~IouringReadSlotPool() { Destroy(); } + + // Initialise the read-slot pool for |ring|. + // + // Requires IouringMemPool to be initialised (--iouring_register_buffers=true) + // and |slot_buf_size| == MemPool block_size so that receive buffers are + // allocated from the same pre-registered slab and the kernel can DMA-write + // received data directly into them (IORING_OP_READ_FIXED) without per-op + // page pinning. + // + // |initial_slots| : number of receive buffers to pre-allocate. + // |max_slot_count| : upper bound (controls the sparse table size). + // |slot_buf_size| : bytes per receive buffer (must equal + // iouring_iobuf_block_size). + bool Init(struct io_uring* ring, + int initial_slots, + int max_slot_count, + size_t slot_buf_size); + + void Destroy(); + + // Acquire a slot (may grow the pool). + bool Acquire(IouringReadSlot* out); + + // Return a slot to the pool. 
+ void Release(const IouringReadSlot& slot); + + bool initialized() const { return ring_ != nullptr; } + size_t slot_buf_size() const { return slot_buf_size_; } + int total_slot_count() const { return total_slot_count_; } + int free_count() const { return static_cast(free_slot_indices_.size()); } + +private: + bool GrowBy(int n); + + // One entry per allocated receive buffer. + // Invariant: entries_[k].buf is the buffer for logical slot index k. + // (GrowBy appends in order, so the vector index IS the slot index.) + struct ReadSlotEntry { + void* buf; // pointer to the buffer (owned by IouringMemPool) + }; + + struct io_uring* ring_ = nullptr; + size_t slot_buf_size_ = 0; // bytes per receive buffer + int max_slot_count_ = 0; // hard upper bound on slots + int total_slot_count_ = 0; // currently allocated slots + std::vector entries_; // all allocated entries + std::vector free_slot_indices_; // indices of free slots + + DISALLOW_COPY_AND_ASSIGN(IouringReadSlotPool); +}; + +// --------------------------------------------------------------------------- +// Convenience helpers +// --------------------------------------------------------------------------- + +// True when --iouring_register_buffers=true. +bool IsFixedBuffersEnabled(); + +// Look up buf_index for a ptr in the given ring's registration table. +// Returns -1 if ptr is not in a registered region (caller must fall back). 
+inline int GetWriteBufIndex(struct io_uring* ring, const void* ptr) { + return IouringMemPool::Instance().GetBufIndex(ring, ptr); +} + +} // namespace iouring +} // namespace brpc + +#else // !BRPC_WITH_IOURING + +namespace brpc { +namespace iouring { +inline bool IsFixedBuffersEnabled() { return false; } +} +} + +#endif // BRPC_WITH_IOURING +#endif // BRPC_IOURING_BLOCK_POOL_H diff --git a/src/brpc/iouring/iouring_endpoint.cpp b/src/brpc/iouring/iouring_endpoint.cpp new file mode 100644 index 0000000000..24d7810dee --- /dev/null +++ b/src/brpc/iouring/iouring_endpoint.cpp @@ -0,0 +1,993 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#if BRPC_WITH_IOURING
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include "butil/atomicops.h"
+#include "butil/fd_utility.h"
+#include "butil/logging.h"
+#include "butil/macros.h"
+#include "butil/third_party/murmurhash3/murmurhash3.h"
+#include "bthread/bthread.h"
+#include "brpc/event_dispatcher.h"
+#include "brpc/input_messenger.h"
+#include "brpc/socket.h"
+#include "brpc/reloadable_flags.h"
+#include "brpc/iouring/iouring_block_pool.h"
+#include "brpc/iouring/iouring_helper.h"
+#include "brpc/iouring/iouring_endpoint.h"
+
+// Declared at global scope; sizes the per-tag poller group table in
+// GlobalInitialize().
+DECLARE_int32(task_group_ntags);
+
+namespace brpc {
+namespace iouring {
+
+// ---------------------------------------------------------------------------
+// gflags (endpoint-level tunables)
+// ---------------------------------------------------------------------------
+
+// Each bthread_tag has exactly one Poller bthread (and one io_uring ring).
+// io_uring's SQ is single-producer; bthread work-stealing means multiple
+// Pollers per tag could run on different pthreads and race on the SQ.
+// Horizontal scaling is achieved by increasing --task_group_ntags instead.
+
+DEFINE_bool(iouring_poller_yield, false,
+            "Yield (bthread_yield / sched_yield) after each poll iteration. "
+            "Reduces CPU usage at the cost of higher tail latency.");
+
+// NOTE(review): no validator is attached in this file, yet the value is used
+// as an array bound and as a batch count further down; confirm that
+// non-positive values are rejected somewhere.
+DEFINE_int32(iouring_max_cqe_poll_once, 32,
+             "Maximum CQEs reaped per io_uring_peek_batch_cqe() call.");
+
+// Per-endpoint cap on queued-but-uncompleted writes; IsWritable() reports
+// false once _inflight_writes reaches it.
+static const int32_t MAX_INFLIGHT_WRITES = 64;
+
+// ---------------------------------------------------------------------------
+// Constructor / Destructor
+// ---------------------------------------------------------------------------
+
+// Binds the endpoint to |s| with no writes in flight and no read slot.
+IouringEndpoint::IouringEndpoint(Socket* s)
+    : _socket(s)
+    , _inflight_writes(0)
+{
+    _read_slot = {};
+}
+
+// Destruction goes through Reset(), so it also enqueues a REMOVE for this
+// socket via DeallocateResources().
+IouringEndpoint::~IouringEndpoint() {
+    Reset();
+}
+
+// Hands the read slot back through DeallocateResources() and zeroes the
+// in-flight write counter.
+void IouringEndpoint::Reset() {
+    DeallocateResources();
+    _inflight_writes.store(0, butil::memory_order_relaxed);
+}
+
+// ---------------------------------------------------------------------------
+// Resource management
+// ---------------------------------------------------------------------------
+
+// Always returns 0; the actual work is deferred to the Poller thread.
+int IouringEndpoint::AllocateResources() {
+    // Register this socket with the Poller via the MPSC op_queue.
+    // The Poller thread will dequeue the ADD message on its next iteration
+    // and issue the first SubmitRead there – on the Poller thread, without
+    // any locking.
+    //
+    // Registered-buffer mode: acquire a fixed read slot on the Poller thread
+    // when processing the ADD message (see main loop). Nothing to do here.
+    PollerAddSid();
+    return 0;
+}
+
+void IouringEndpoint::DeallocateResources() {
+    // Embed the read slot (if any) in the REMOVE message so the Poller thread
+    // can Release it without any locking – all slot_pool operations happen on
+    // the Poller thread.
+    PollerRemoveSid(_read_slot);
+    _read_slot = {};
+}
+
+// ---------------------------------------------------------------------------
+// Ring / Poller access
+// ---------------------------------------------------------------------------
+
+// Returns the Poller that serves this endpoint's socket for the calling
+// bthread's tag, or nullptr when no usable ring exists yet.
+//
+// The tag falls back to 0 when it is outside the group table. Within the
+// group the poller is chosen by hashing the socket id.
+IouringEndpoint::Poller* IouringEndpoint::GetPoller() const {
+    // Indexing an empty group table, or taking `% 0` on an empty poller
+    // list, is undefined behaviour; report "no poller" instead.
+    if (_poller_groups.empty()) { return nullptr; }
+
+    bthread_tag_t tag = bthread_self_tag();
+    if (tag < 0 || tag >= static_cast<bthread_tag_t>(_poller_groups.size())) {
+        tag = 0;
+    }
+
+    auto& pollers = _poller_groups[tag].pollers;
+    if (pollers.empty()) { return nullptr; }
+
+    const size_t index = butil::fmix32(_socket->id()) % pollers.size();
+    if (!pollers[index].ring_initialized) { return nullptr; }
+    return &pollers[index];
+}
+
+// ---------------------------------------------------------------------------
+// IouringPollerHandle – implementation
+//
+// ring() and Submit() are implemented here (not in the header) because they
+// need access to IouringEndpoint::Poller and IouringEndpoint::_poller_groups,
+// which are private and not yet fully defined at the point where
+// IouringPollerHandle is declared in iouring_helper.h.
+// ---------------------------------------------------------------------------
+
+// Lets |prepare_fn| queue SQEs on the ring identified by (tag_, index_) and
+// flushes them.
+//
+// |prepare_fn| returns the number of SQEs it prepared, or a negative value
+// on failure.
+// Returns the io_uring_submit() result (>= 0), 0 when nothing was prepared,
+// or -1 with errno set:
+//   ENODEV → tag/index out of range or ring not yet initialised
+//   EBUSY  → |prepare_fn| reported a failure
+//   other  → negated io_uring_submit() error
+int IouringPollerHandle::Submit(
+        std::function<int(struct io_uring*)> prepare_fn) const {
+    if (tag_ < 0 || tag_ >= static_cast<int>(
+            IouringEndpoint::_poller_groups.size())) {
+        errno = ENODEV;
+        return -1;
+    }
+    auto& pollers = IouringEndpoint::_poller_groups[tag_].pollers;
+    if (index_ < 0 || index_ >= static_cast<int>(pollers.size())) {
+        errno = ENODEV;
+        return -1;
+    }
+    IouringEndpoint::Poller& poller = pollers[index_];
+    if (!poller.ring_initialized) {
+        errno = ENODEV;
+        return -1;
+    }
+    // Called on the Poller thread (passive path only); no lock needed.
+    int n = prepare_fn(&poller.ring);
+    if (n < 0) { errno = EBUSY; return -1; }
+    if (n == 0) { return 0; }
+    int ret = io_uring_submit(&poller.ring);
+    if (ret < 0) { errno = -ret; return -1; }
+    return ret;
+}
+
+// ---------------------------------------------------------------------------
+// SubmitOneSqe
+//
+// Must be called on the Poller thread; no locking.
+// Gets one SQE, calls |prepare_fn(sqe)|, issues io_uring_submit().
+// Returns io_uring_submit() result (>= 0) or -1 (errno set).
+// errno=ENOBUFS → SQ full
+// errno=ENODEV → ring not yet initialised
+//
+// NOTE(review): when io_uring_submit() fails, the prepared SQE is not
+// retracted here. The callers in this file free the context that the SQE's
+// user_data points to on a -1 return; confirm that a later submit on the
+// same ring cannot push that stale SQE to the kernel.
+// ---------------------------------------------------------------------------
+
+int IouringEndpoint::SubmitOneSqe(
+        std::function<void(struct io_uring_sqe*)> prepare_fn) {
+    Poller* poller = GetPoller();
+    if (!poller) { errno = ENODEV; return -1; }
+
+    struct io_uring_sqe* sqe = io_uring_get_sqe(&poller->ring);
+    if (!sqe) { errno = ENOBUFS; return -1; }
+    prepare_fn(sqe);
+    int ret = io_uring_submit(&poller->ring);
+    if (ret < 0) { errno = -ret; return -1; }
+    return ret;
+}
+
+// ---------------------------------------------------------------------------
+// Ring parameters factory
+//
+// Translates the configured polling mode and the --iouring_* flags into the
+// io_uring_params passed to io_uring_queue_init_params().
+//
+// NOTE(review): io_uring_setup(2) documents IORING_SETUP_IOPOLL for files
+// opened with O_DIRECT; confirm the IOPOLL mode is meaningful for the socket
+// reads/writes this endpoint submits.
+// ---------------------------------------------------------------------------
+
+struct io_uring_params IouringEndpoint::BuildRingParams() {
+    struct io_uring_params p;
+    memset(&p, 0, sizeof(p));
+
+    const IouringPollingMode mode = GetPollingMode();
+
+    switch (mode) {
+    case IouringPollingMode::SQPOLL:
+    case IouringPollingMode::HYBRID:
+        // Both modes rely on the kernel-side SQ polling thread.
+        p.flags |= IORING_SETUP_SQPOLL;
+        if (FLAGS_iouring_sqpoll_idle_ms > 0) {
+            p.sq_thread_idle =
+                static_cast<unsigned>(FLAGS_iouring_sqpoll_idle_ms);
+        }
+        if (FLAGS_iouring_sqpoll_cpu >= 0) {
+            p.flags |= IORING_SETUP_SQ_AFF;
+            p.sq_thread_cpu =
+                static_cast<unsigned>(FLAGS_iouring_sqpoll_cpu);
+        }
+        break;
+    case IouringPollingMode::IOPOLL:
+        p.flags |= IORING_SETUP_IOPOLL;
+        break;
+    case IouringPollingMode::NONE:
+    default:
+        break;
+    }
+
+    if (FLAGS_iouring_cq_size > 0) {
+        p.flags |= IORING_SETUP_CQSIZE;
+        p.cq_entries = static_cast<unsigned>(FLAGS_iouring_cq_size);
+    }
+
+    return p;
+}
+
+// ---------------------------------------------------------------------------
+// SubmitRead
+//
+// Queues one read on |fd| for this endpoint's socket.
+// Returns 0 on success, -1 with errno set otherwise (ENOMEM when no buffer
+// is available, or whatever SubmitOneSqe reported).
+//
+// Registered mode (--iouring_register_buffers=true):
+//   IORING_OP_READ_FIXED into _read_slot.buf / _read_slot.buf_index.
+//   The slot is acquired on the Poller thread (PollerDrainOpQueue for the
+//   first read, PollCq afterwards). Those acquisitions can fail, so an
+//   empty slot is rejected here instead of being submitted.
+//
+// Unregistered mode (--iouring_register_buffers=false):
+//   IORING_OP_READ into a per-call malloc bounce buffer (64 KiB).
+//   The buffer is owned by IOBuf after PollCq and free()'d when consumed.
+// ---------------------------------------------------------------------------
+
+int IouringEndpoint::SubmitRead(int fd) {
+    const bool fixed = IsFixedBuffersEnabled();
+
+    // Submitting with an empty slot would issue a zero-length READ_FIXED on
+    // a null buffer. Its res == 0 completion is indistinguishable from EOF
+    // in PollCq, which would silently close a healthy connection.
+    if (fixed && (_read_slot.buf == nullptr || _read_slot.size == 0)) {
+        errno = ENOMEM;
+        return -1;
+    }
+
+    IouringReqContext* ctx = new IouringReqContext{};
+    ctx->fd = fd;
+    ctx->socket_id = _socket->id();
+
+    if (fixed) {
+        ctx->op = IOURING_OP_READ_FIXED;
+        const IouringReadSlot slot = _read_slot;
+        int ret = SubmitOneSqe([&](struct io_uring_sqe* sqe) {
+            io_uring_prep_read_fixed(sqe, fd,
+                                     slot.buf,
+                                     static_cast<unsigned>(slot.size),
+                                     /*offset=*/0,
+                                     slot.buf_index);
+            sqe->user_data = reinterpret_cast<uint64_t>(ctx) | kBrpcCqeTag;
+        });
+        if (ret < 0) {
+            const int saved_errno = errno;  // keep the submit error visible
+            delete ctx;
+            errno = saved_errno;
+            return -1;
+        }
+        return 0;
+    }
+
+    // Unregistered path – allocate a temporary bounce buffer.
+    constexpr size_t kBounceSize = 65536;
+    void* bounce = malloc(kBounceSize);
+    if (!bounce) { delete ctx; errno = ENOMEM; return -1; }
+    ctx->op = IOURING_OP_READ;
+    // Publish the buffer in the context BEFORE the SQE can complete, so the
+    // completion handler never observes a context without its buffer.
+    ctx->bounce = bounce; // PollCq takes ownership and wraps it in IOBuf
+
+    int ret = SubmitOneSqe([&](struct io_uring_sqe* sqe) {
+        io_uring_prep_read(sqe, fd, bounce, static_cast<unsigned>(kBounceSize),
+                           /*offset=*/0);
+        sqe->user_data = reinterpret_cast<uint64_t>(ctx) | kBrpcCqeTag;
+    });
+    if (ret < 0) {
+        const int saved_errno = errno;  // keep the submit error visible
+        free(bounce);
+        delete ctx;
+        errno = saved_errno;
+        return -1;
+    }
+    return 0;
+}
+
+
+ssize_t IouringEndpoint::CutFromIOBufList(butil::IOBuf** from, size_t ndata) {
+    CHECK(from != nullptr);
+    CHECK(ndata > 0);
+
+    if (!IsWritable()) { errno = EAGAIN; return -1; }
+
+    const int fd = _socket->fd();
+
+    // ------------------------------------------------------------------
+    // WRITE_FIXED path (--iouring_register_buffers=true)
+    // ------------------------------------------------------------------
+    // Every IOBuf block comes from IouringMemPool (a pre-registered slab).
+    // GetBufIndex() must always succeed; if it returns -1 a block somehow
+    // escaped the pool – LOG(ERROR) and fail rather than silently degrading.
+    //
+    // One IORING_OP_WRITE_FIXED SQE is submitted per block so the kernel can
+    // DMA directly from pinned pages with no per-op get_user_pages overhead.
+    // Called on the Poller thread; no locking needed.
+    // ------------------------------------------------------------------
+    if (IsFixedBuffersEnabled()) {
+        Poller* poller = GetPoller();
+        if (!poller) { errno = ENODEV; return -1; }
+        IouringMemPool& mp = IouringMemPool::Instance();
+        struct io_uring* ring = &poller->ring;
+
+        ssize_t total_bytes = 0;
+        int sqes_queued = 0;
+        // Set as soon as queuing has to stop. The byte stream must stay in
+        // order, so once one block cannot be queued nothing after it may be
+        // queued either – including blocks of later IOBufs.
+        bool stop = false;
+        // True only for the "block escaped the pool" programming error.
+        bool fatal = false;
+
+        for (size_t i = 0; i < ndata && !stop; ++i) {
+            if (!from[i] || from[i]->empty()) { continue; }
+            butil::IOBuf& buf = *from[i];
+            while (!buf.empty()) {
+                const void* seg_ptr = buf.fetch1();
+                size_t seg_len = buf.backing_block(0).size();
+                if (seg_len == 0) { break; }
+
+                int buf_idx = mp.GetBufIndex(ring, seg_ptr);
+                if (buf_idx < 0) {
+                    // Every block must be registered when
+                    // --iouring_register_buffers=true. A -1 here means a
+                    // block escaped the pool – this is a programming error.
+                    LOG(ERROR) << "io_uring: unregistered IOBuf block ptr="
+                               << seg_ptr << " fd=" << fd
+                               << "; submission aborted.";
+                    errno = EINVAL;
+                    stop = true;
+                    fatal = true;
+                    break;
+                }
+
+                struct io_uring_sqe* sqe = io_uring_get_sqe(ring);
+                if (!sqe) { errno = ENOBUFS; stop = true; break; }
+
+                io_uring_prep_write_fixed(sqe, fd,
+                                          const_cast<void*>(seg_ptr),
+                                          static_cast<unsigned>(seg_len),
+                                          0, buf_idx);
+                IouringReqContext* ctx = new IouringReqContext{};
+                ctx->op = IOURING_OP_WRITE_FIXED;
+                ctx->fd = fd;
+                ctx->socket_id = _socket->id();
+                sqe->user_data = reinterpret_cast<uint64_t>(ctx) | kBrpcCqeTag;
+                total_bytes += static_cast<ssize_t>(seg_len);
+                ++sqes_queued;
+                // NOTE(review): this drops the IOBuf's reference to the
+                // block before the kernel has completed the write. Nothing
+                // visible here keeps the block alive until the CQE arrives –
+                // confirm the pool cannot recycle it in the meantime.
+                // NOTE(review): the SQEs are not linked (IOSQE_IO_LINK);
+                // confirm the kernel cannot complete them out of order on a
+                // stream socket.
+                buf.pop_front(seg_len);
+            }
+        }
+
+        if (sqes_queued > 0) {
+            // Account for the writes as soon as they sit in the SQ. The
+            // SQEs stay there even when the submit call below fails, and
+            // their bytes have already been removed from the IOBufs and are
+            // reported to the caller as consumed. Skipping the increment in
+            // that case would let the eventual completions drive the
+            // counter below zero and disable the IsWritable() back-pressure.
+            _inflight_writes.fetch_add(sqes_queued, butil::memory_order_relaxed);
+            int ret = io_uring_submit(ring);
+            if (ret < 0) { errno = -ret; }
+            return total_bytes;
+        }
+        // Nothing was queued. An escaped block is a hard failure, as the
+        // contract above demands. A full SQ (errno=ENOBUFS) keeps the
+        // historical "0 bytes consumed" result.
+        return fatal ? -1 : 0;
+    }
+
+    // ------------------------------------------------------------------
+    // Plain WRITEV path (--iouring_register_buffers=false)
+    // ------------------------------------------------------------------
+    // Collect all iovec entries from the IOBuf list and submit a single
+    // IORING_OP_WRITEV. Cap at IOURING_IOV_MAX to stay within SQ limits.
+    //
+    // NOTE(review): the segments are popped from the IOBufs before the
+    // write completes, and |iov| is a local that dies when this function
+    // returns. Confirm the blocks stay alive until the CQE, and that the
+    // iovec array is no longer read after io_uring_submit() returns (SQPOLL
+    // mode in particular).
+    // ------------------------------------------------------------------
+    static const size_t IOURING_IOV_MAX = 256;
+    std::vector<struct iovec> iov;
+    ssize_t total_bytes = 0;
+
+    for (size_t i = 0; i < ndata; ++i) {
+        if (!from[i] || from[i]->empty()) { continue; }
+        butil::IOBuf& buf = *from[i];
+        while (!buf.empty() && iov.size() < IOURING_IOV_MAX) {
+            const void* seg_ptr = buf.fetch1();
+            size_t seg_len = buf.backing_block(0).size();
+            if (seg_len == 0) { break; }
+            iov.push_back({const_cast<void*>(seg_ptr), seg_len});
+            total_bytes += static_cast<ssize_t>(seg_len);
+            buf.pop_front(seg_len);
+        }
+        if (iov.size() >= IOURING_IOV_MAX) { break; }
+    }
+
+    if (iov.empty()) { return 0; }
+
+    IouringReqContext* ctx = new IouringReqContext{};
+    ctx->op = IOURING_OP_WRITE;
+    ctx->fd = fd;
+    ctx->socket_id = _socket->id();
+
+    int ret = SubmitOneSqe([&](struct io_uring_sqe* sqe) {
+        io_uring_prep_writev(sqe, fd, iov.data(),
+                             static_cast<unsigned>(iov.size()), 0);
+        sqe->user_data = reinterpret_cast<uint64_t>(ctx) | kBrpcCqeTag;
+    });
+    if (ret < 0) { delete ctx; return -1; }
+    _inflight_writes.fetch_add(1, butil::memory_order_relaxed);
+    return total_bytes;
+}
+
+// True while fewer than MAX_INFLIGHT_WRITES writes are outstanding.
+bool IouringEndpoint::IsWritable() const {
+    return _inflight_writes.load(butil::memory_order_relaxed) < MAX_INFLIGHT_WRITES;
+}
+
+// ---------------------------------------------------------------------------
+// PollCq – CQE completion handler
+//
+// READ_FIXED path (--iouring_register_buffers=true, zero-copy)
+// -------------------------------------------------------------
+// The kernel has written |res| bytes directly into ep->_read_slot.buf (a
+// pre-registered, pinned page).
+// IOBuf::append_user_data() wraps those bytes
+// without copying; the destructor returns the block to IouringMemPool
+// (thread-safe) when the last IOBuf reference is dropped.
+// A fresh slot is acquired immediately and the next READ_FIXED queued.
+//
+// READ path (--iouring_register_buffers=false)
+// --------------------------------------------
+// ctx->bounce points to a per-call malloc'd bounce buffer (64 KiB).
+// IOBuf takes ownership (free() destructor) when the data is wrapped.
+//
+// CQ ordering
+// -----------
+// The completion queue is a FIFO and io_uring_cqe_seen() retires exactly
+// the entry at the head, whichever cqe pointer it is handed. Reaping
+// therefore stops at the first CQE that was not submitted by bRPC, and
+// PollCq returns so that the user callback can drain it. Entries behind it
+// are picked up on a later call.
+//
+// NOTE(review): read completions are appended to |m| and re-armed through
+// |ep| without looking at ctx->socket_id, whereas write completions resolve
+// their endpoint from ctx->socket_id. Confirm that a read CQE belonging to a
+// different socket on the same ring can never be reaped here.
+// ---------------------------------------------------------------------------
+
+void IouringEndpoint::PollCq(Socket* m) {
+    IouringEndpoint* ep = static_cast<IouringEndpoint*>(m->user());
+    if (!ep) { return; }
+
+    SocketUniquePtr s;
+    if (Socket::Address(ep->_socket->id(), &s) < 0) { return; }
+
+    // CQ-side operations need only the ring pointer; no lock required here
+    // because io_uring_peek_batch_cqe / io_uring_cqe_seen operate on the CQ
+    // which is only touched by the Poller thread.
+    Poller* poller = ep->GetPoller();
+    if (!poller) { return; }
+    struct io_uring* ring = &poller->ring;
+
+    // Fixed-size batch buffer. A stack array sized by the flag is a
+    // non-standard VLA, is undefined for a non-positive value, and could be
+    // overrun if the flag grew between sizing the array and peeking. The
+    // flag is read once and clamped to [1, kMaxCqeBatch]; larger backlogs
+    // are drained by the surrounding loop.
+    constexpr int kMaxCqeBatch = 256;
+    struct io_uring_cqe* cqes[kMaxCqeBatch];
+    const int wanted = FLAGS_iouring_max_cqe_poll_once;
+    const unsigned batch = static_cast<unsigned>(
+        wanted < 1 ? 1 : (wanted > kMaxCqeBatch ? kMaxCqeBatch : wanted));
+
+    InputMessageClosure last_msg;
+    int progress = Socket::PROGRESS_INIT;
+
+    // Books the end of one write (successful or not) against the endpoint
+    // that queued it and wakes writers blocked on back-pressure.
+    const auto finish_write = [ep](const IouringReqContext* wctx) {
+        if (wctx->socket_id == ep->_socket->id()) {
+            ep->_inflight_writes.fetch_sub(1, butil::memory_order_relaxed);
+            ep->_socket->WakeAsEpollOut();
+            return;
+        }
+        // The write belongs to another socket on this ring. Hold a
+        // reference while touching its endpoint. If the socket is already
+        // gone there is nothing to account for – and |ep| must not be
+        // charged for a write it never issued.
+        SocketUniquePtr owner;
+        if (Socket::Address(wctx->socket_id, &owner) != 0) { return; }
+        IouringEndpoint* dep = static_cast<IouringEndpoint*>(owner->user());
+        if (dep) {
+            dep->_inflight_writes.fetch_sub(1, butil::memory_order_relaxed);
+            dep->_socket->WakeAsEpollOut();
+        }
+    };
+
+    bool user_cqe_at_head = false;
+    while (!user_cqe_at_head) {
+        const int cnt = io_uring_peek_batch_cqe(ring, cqes, batch);
+
+        if (cnt <= 0) {
+            if (s->Failed()) { return; }
+            if (!m->MoreReadEvents(&progress)) { break; }
+            continue;
+        }
+
+        ssize_t bytes_read = 0;
+
+        for (int i = 0; i < cnt; ++i) {
+            struct io_uring_cqe* cqe = cqes[i];
+            const uint64_t udata = cqe->user_data;
+
+            // Only process CQEs that bRPC submitted (bit 63 == 1).
+            // All other CQEs are user-submitted; leave them in the ring so the
+            // user callback (called right after PollCq returns) can
+            // drain and handle them. Users need no special tagging – bit 63
+            // is never set in a canonical user-space pointer or small integer.
+            if (!(udata & kBrpcCqeTag)) {
+                // Stop here rather than skipping. Marking a LATER entry as
+                // seen would advance the head past this one, discarding the
+                // user's completion and leaving ours to be reaped – and its
+                // context deleted – a second time.
+                user_cqe_at_head = true;
+                break;
+            }
+
+            // Strip the tag bit to recover the original IouringReqContext*.
+            // udata == kBrpcCqeTag (NOP wake-up SQE) → ctx will be nullptr.
+            IouringReqContext* ctx =
+                reinterpret_cast<IouringReqContext*>(
+                    static_cast<uintptr_t>(udata & ~kBrpcCqeTag));
+
+            if (!ctx) {
+                io_uring_cqe_seen(ring, cqe);
+                continue;
+            }
+
+            const int res = cqe->res;
+            io_uring_cqe_seen(ring, cqe);
+
+            const bool is_read = (ctx->op == IOURING_OP_READ ||
+                                  ctx->op == IOURING_OP_READ_FIXED);
+
+            // ---------------------------------------------------------------
+            // Error handling
+            // ---------------------------------------------------------------
+            if (res < 0) {
+                // -ECANCELED: the op was cancelled; nothing to report.
+                // READ_FIXED: the slot is still owned by the endpoint
+                // (_read_slot); DeallocateResources will release it.
+                if (res != -ECANCELED) {
+                    const int saved_errno = -res;
+                    LOG(WARNING) << "io_uring CQE error fd=" << ctx->fd
+                                 << " socket_id=" << ctx->socket_id
+                                 << " op=" << (int)ctx->op
+                                 << ": " << berror(saved_errno);
+                    SocketUniquePtr cs;
+                    if (Socket::Address(ctx->socket_id, &cs) == 0) {
+                        cs->SetFailed(saved_errno, "io_uring op error: %s",
+                                      berror(saved_errno));
+                    }
+                }
+                if (ctx->op == IOURING_OP_READ) {
+                    free(ctx->bounce);
+                } else if (!is_read) {
+                    // A failed or cancelled write is no longer in flight.
+                    finish_write(ctx);
+                }
+                delete ctx;
+                continue;
+            }
+
+            // ---------------------------------------------------------------
+            // Dispatch by operation type
+            // ---------------------------------------------------------------
+            if (is_read) {
+                if (res == 0) {
+                    // EOF
+                    SocketUniquePtr cs;
+                    if (Socket::Address(ctx->socket_id, &cs) == 0) {
+                        cs->SetEOF();
+                    }
+                    if (ctx->op == IOURING_OP_READ) { free(ctx->bounce); }
+                    delete ctx;
+                    continue;
+                }
+
+                // Set when no further read could be queued; without one the
+                // connection would stall silently, so it is failed instead.
+                bool rearmed = true;
+                int rearm_errno = 0;
+
+                if (ctx->op == IOURING_OP_READ_FIXED) {
+                    // ---------------------------------------------------------
+                    // Registered mode – zero-copy.
+                    //
+                    // The kernel has written |res| bytes into the slot's pinned
+                    // buffer. We hand it to IOBuf zero-copy; the destructor
+                    // returns it to IouringMemPool (thread-safe) when the last
+                    // IOBuf reference is dropped.
+                    //
+                    // Slot rotation:
+                    // 1. Steal the slot from the endpoint (ep->_read_slot={}).
+                    // 2. Acquire a fresh slot for the next READ_FIXED.
+                    // 3. Submit the next read.
+                    // ---------------------------------------------------------
+                    IouringReadSlot consumed_slot = ep->_read_slot;
+                    ep->_read_slot = {};
+
+                    // Already on the Poller thread – no lock needed.
+                    // Acquire the next slot before handing off consumed_slot so
+                    // that back-to-back arrivals never stall waiting for a slot.
+                    bool have_slot = false;
+                    if (poller->slot_pool.initialized()) {
+                        have_slot = poller->slot_pool.Acquire(&ep->_read_slot);
+                    }
+                    if (!have_slot) {
+                        // Do not trust whatever a failed Acquire left behind.
+                        ep->_read_slot = {};
+                    }
+
+                    // Zero-copy: wrap slot memory – IouringMemPool is
+                    // thread-safe so the destructor can run on any thread.
+                    struct SlotDeleter {
+                        static void destroy(void* ptr) {
+                            IouringMemPool::Instance().Deallocate(ptr);
+                        }
+                    };
+                    butil::IOBuf tmp;
+                    tmp.append_user_data(consumed_slot.buf,
+                                         static_cast<size_t>(res),
+                                         SlotDeleter::destroy);
+                    m->_read_buf.append(std::move(tmp));
+
+                    if (!have_slot) {
+                        rearmed = false;
+                        rearm_errno = ENOMEM;
+                    } else if (ep->SubmitRead(ctx->fd) < 0) {
+                        rearmed = false;
+                        rearm_errno = errno;
+                    }
+                } else {
+                    // ---------------------------------------------------------
+                    // Unregistered mode – bounce buffer.
+                    //
+                    // The bounce buffer was malloc'd in SubmitRead and stored in
+                    // ctx->bounce. Transfer ownership to IOBuf (free() is
+                    // called when IOBuf discards the block).
+                    // ---------------------------------------------------------
+                    butil::IOBuf tmp;
+                    tmp.append_user_data(ctx->bounce,
+                                         static_cast<size_t>(res),
+                                         free);
+                    ctx->bounce = nullptr; // ownership transferred
+                    m->_read_buf.append(std::move(tmp));
+
+                    if (ep->SubmitRead(ctx->fd) < 0) {
+                        rearmed = false;
+                        rearm_errno = errno;
+                    }
+                }
+
+                if (!rearmed) {
+                    LOG(WARNING) << "io_uring: cannot queue next read fd="
+                                 << ctx->fd << " socket_id=" << ctx->socket_id
+                                 << ": " << berror(rearm_errno);
+                    s->SetFailed(rearm_errno, "io_uring read re-arm failed: %s",
+                                 berror(rearm_errno));
+                }
+
+                bytes_read += res;
+
+            } else {
+                // WRITE / WRITE_FIXED completion
+                // NOTE(review): |res| is not compared with the requested
+                // length, so a short write goes unnoticed here.
+                finish_write(ctx);
+            }
+
+            delete ctx;
+        } // for each cqe
+
+        if (bytes_read > 0) {
+            const int64_t received_us = butil::cpuwide_time_us();
+            const int64_t base_realtime = butil::gettimeofday_us() - received_us;
+            InputMessenger* messenger = static_cast<InputMessenger*>(s->user());
+            if (messenger && messenger->ProcessNewMessage(
+                    s.get(), bytes_read, false,
+                    received_us, base_realtime, last_msg) < 0) {
+                return;
+            }
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// DebugInfo
+//
+// Writes the endpoint's io_uring state to |os| as key=value pairs separated
+// by |connector|.
+// ---------------------------------------------------------------------------
+
+void IouringEndpoint::DebugInfo(std::ostream& os,
+                                butil::StringPiece connector) const {
+    os << "iouring_polling_mode=" << FLAGS_iouring_polling_mode
+       << connector
+       << "iouring_inflight_writes="
+       << _inflight_writes.load(butil::memory_order_relaxed)
+       << connector << "iouring_writable=" << IsWritable()
+       << connector << "iouring_register_buffers=" << IsFixedBuffersEnabled();
+    if (IsFixedBuffersEnabled() && _read_slot.buf != nullptr) {
+        os << " buf_index=" << _read_slot.buf_index
+           << " slot_size=" << _read_slot.size;
+    }
+}
+
+// ---------------------------------------------------------------------------
+// GlobalInitialize / GlobalRelease
+// ---------------------------------------------------------------------------
+
+int
IouringEndpoint::GlobalInitialize() {
+    // Replaces the group table with FLAGS_task_group_ntags fresh entries –
+    // one poller group per bthread tag. Always returns 0.
+    _poller_groups = std::vector(FLAGS_task_group_ntags);
+    return 0;
+}
+
+// Calls PollingModeRelease() for every tag index below
+// FLAGS_task_group_ntags.
+// NOTE(review): the loop bound is the flag rather than
+// _poller_groups.size(); confirm the flag cannot change after
+// GlobalInitialize().
+void IouringEndpoint::GlobalRelease() {
+    for (int i = 0; i < FLAGS_task_group_ntags; ++i) {
+        PollingModeRelease(i);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// PollerDrainOpQueue
+//
+// Dequeues all pending SidOps from poller->op_queue and applies them:
+// ADD – track the socket, acquire a read slot (fixed mode), issue first read.
+// REMOVE – release the read slot (fixed mode), stop tracking the socket.
+//
+// Must run exclusively on the Poller thread so that slot_pool is accessed
+// without any locking.
+// ---------------------------------------------------------------------------
+
+void IouringEndpoint::PollerDrainOpQueue(
+        Poller* poller,
+        std::unordered_set& tracked_sids) {
+    SidOp op;
+    while (poller->op_queue.Dequeue(op)) {
+        if (op.type == SidOp::ADD) {
+            // The id is tracked before the socket is resolved; it stays
+            // tracked even if Socket::Address() fails below.
+            tracked_sids.emplace(op.sid);
+            SocketUniquePtr s_add;
+            if (Socket::Address(op.sid, &s_add) == 0) {
+                IouringEndpoint* ep =
+                    static_cast(s_add->user());
+                if (ep) {
+                    // Acquire a fixed read slot for this connection.
+                    if (IsFixedBuffersEnabled()) {
+                        if (!poller->slot_pool.initialized() ||
+                            !poller->slot_pool.Acquire(&ep->_read_slot)) {
+                            LOG(ERROR)
+                                << "IouringEndpoint: slot pool "
+                                   "exhausted for socket "
+                                << op.sid << "; dropping connection.";
+                            // Without a slot no READ_FIXED can be issued:
+                            // untrack and fail the socket, then move on to
+                            // the next queued op.
+                            tracked_sids.erase(op.sid);
+                            s_add->SetFailed(ENOMEM,
+                                             "io_uring slot pool exhausted");
+                            continue;
+                        }
+                    }
+                    // Issue the first SubmitRead on the Poller thread.
+                    // NOTE(review): a failure is only logged; the socket
+                    // stays tracked with no read in flight. Confirm
+                    // something re-arms the read or fails the socket.
+                    if (ep->SubmitRead(s_add->fd()) < 0) {
+                        LOG(WARNING)
+                            << "IouringEndpoint: first SubmitRead "
+                               "failed for socket "
+                            << op.sid << ": " << berror();
+                    }
+                }
+            }
+        } else {
+            // REMOVE: release the read slot (if any) back to the pool.
+            if (IsFixedBuffersEnabled() && op.read_slot.buf != nullptr) {
+                if (poller->slot_pool.initialized()) {
+                    poller->slot_pool.Release(op.read_slot);
+                }
+            }
+            tracked_sids.erase(op.sid);
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// PollingModeInitialize
+// ---------------------------------------------------------------------------
+
+std::vector IouringEndpoint::_poller_groups;
+
+int IouringEndpoint::PollingModeInitialize(
+        bthread_tag_t tag,
+        std::function callback,
+        std::function init_fn,
+        std::function release_fn) {
+
+    if (tag < 0 || tag >= static_cast(_poller_groups.size())) {
+        LOG(ERROR) << "io_uring: invalid bthread tag " << tag;
+        return -1;
+    }
+
+    auto& group = _poller_groups[tag];
+    auto& pollers = group.pollers;
+    auto& running = group.running;
+
+    bool expected = false;
+    if (!running.compare_exchange_strong(expected, true)) { return 0; }
+
+    // -----------------------------------------------------------------------
+    // Poller thread arguments
+    // -----------------------------------------------------------------------
+    struct FnArgs {
+        Poller* poller;
+        std::atomic* running;
+        bthread_tag_t tag;
+        int index; // poller index within the group
+    };
+
+    // -----------------------------------------------------------------------
+    // Poller thread body
+    // -----------------------------------------------------------------------
+    auto fn = [](void* p) -> void* {
+        std::unique_ptr args(static_cast(p));
+        Poller* poller = args->poller;
+        std::atomic* running = args->running;
+
+        // 1. Create the ring.
+        struct io_uring_params params = BuildRingParams();
+        const unsigned sq_size = static_cast(GetIouringSqSize());
+
+        int ret = io_uring_queue_init_params(sq_size, &poller->ring, ¶ms);
+        if (ret < 0) {
+            LOG(ERROR) << "io_uring_queue_init_params failed: " << berror(-ret);
+            running->store(false, std::memory_order_relaxed);
+            return nullptr;
+        }
+        poller->ring_initialized = true;
+
+        // 2.
Initialise the fixed-buffer infrastructure. + if (IsFixedBuffersEnabled()) { + // 2a. Register this ring with IouringMemPool. + // The callback is called for each existing region immediately + // (to bring the ring up to date) and for each future region + // (when the pool grows). It issues register_buffers_update / + // full re-registration to pin the new pages in this ring. + IouringMemPool::Instance().AddRingRegistrar( + &poller->ring, + [poller](void* base, size_t size, size_t block_size, + int buf_index_base) { + // Build one iovec per block in this new region. + const int n = static_cast(size / block_size); + std::vector iovs(n); + for (int i = 0; i < n; ++i) { + iovs[i].iov_base = + static_cast(base) + i * block_size; + iovs[i].iov_len = block_size; + } + // Register the new region's buffers. + // io_uring_register_buffers_update (kernel >= 5.13) allows + // incremental updates; fall back to a full re-registration + // on older kernels or if the function is unavailable. + int ret = -ENOSYS; +#ifdef IORING_REGISTER_BUFFERS_UPDATE + ret = io_uring_register_buffers_update( + &poller->ring, + static_cast(buf_index_base), + iovs.data(), + static_cast(n)); +#endif + if (ret < 0) { + // Full re-registration path. + // For the first region just call register_buffers. + // For subsequent regions we must rebuild the complete + // table; here we only have the new slice so we + // attempt a best-effort register and log on failure. + if (buf_index_base > 0) { + // Unregister previous table before re-registering. + io_uring_unregister_buffers(&poller->ring); + } + int r2 = io_uring_register_buffers(&poller->ring, + iovs.data(), + static_cast(n)); + if (r2 < 0) { + LOG(WARNING) + << "io_uring_register_buffers failed for new " + "region (buf_index_base=" << buf_index_base + << "): " << berror(-r2) + << " – WRITE_FIXED will fall back to WRITEV."; + } + } + }); + + // 2b. Initialise the per-ring read-slot pool. 
+ // Slot size matches the IOBuf block size so slots come from the + // same registered slab and share the write-path registration. + const int initial = FLAGS_iouring_read_slot_num; + const int max = FLAGS_iouring_read_slot_max; + const size_t sz = + static_cast(FLAGS_iouring_iobuf_block_size); + + if (!poller->slot_pool.Init(&poller->ring, initial, max, sz)) { + LOG(ERROR) << "io_uring slot pool init failed; " + "io_uring disabled (--iouring_register_buffers=true " + "requires a working slot pool)."; + running->store(false, std::memory_order_relaxed); + // Exit the poller lambda so the ring is torn down. + return nullptr; + } + LOG(INFO) << "io_uring read slot pool ready: initial=" << initial + << " max=" << max << " slot_buf_size=" << sz; + } + + if (poller->init_fn) { + poller->init_fn(IouringPollerHandle(args->tag, args->index)); + } + + // 3. CQE reap strategy. + const IouringPollingMode mode = GetPollingMode(); + const int hybrid_spins = FLAGS_iouring_hybrid_spin_count; + struct io_uring_cqe* cqes[FLAGS_iouring_max_cqe_poll_once]; // used by SQPOLL/HYBRID peek only + std::unordered_set tracked_sids; + SidOp op; + + // 4. Main loop. + while (running->load(std::memory_order_relaxed)) { + // a) Drain op_queue. + // All slot_pool operations happen here, on the Poller thread, + // so no locking is ever needed for slot_pool. + PollerDrainOpQueue(poller, tracked_sids); + + // b) Reap CQEs. + bool got_cqe = false; // used by SQPOLL/IOPOLL/HYBRID branches + + if (mode == IouringPollingMode::NONE) { + // Interrupt-driven mode: block up to 1 ms waiting for a CQE. + // The 1 ms timeout keeps the Poller loop responsive to new + // connections arriving in op_queue without burning CPU when + // there is no I/O traffic. 
+ struct io_uring_cqe* cqe = nullptr; + struct __kernel_timespec ts{0, 1000000}; // 1 ms + int r = io_uring_wait_cqe_timeout(&poller->ring, &cqe, &ts); + if (r == 0 && cqe) { + io_uring_cqe_seen(&poller->ring, cqe); + } + + } else if (mode == IouringPollingMode::IOPOLL) { + // IOPOLL (IORING_SETUP_IOPOLL): the kernel never generates + // interrupts; CQEs are posted only after an explicit + // io_uring_submit() triggers the poll. Use peek to drain + // whatever is already available, then yield to avoid + // spinning at 100 % when there is no block I/O in flight. + int cnt = io_uring_peek_batch_cqe( + &poller->ring, cqes, + static_cast(FLAGS_iouring_max_cqe_poll_once)); + got_cqe = (cnt > 0); + if (!got_cqe && FLAGS_iouring_poller_yield) { + bthread_yield(); + } + + } else if (mode == IouringPollingMode::SQPOLL) { + // SQPOLL (IORING_SETUP_SQPOLL): the kernel SQ thread submits + // I/O automatically; just peek for completed CQEs. + int cnt = io_uring_peek_batch_cqe( + &poller->ring, cqes, + static_cast(FLAGS_iouring_max_cqe_poll_once)); + got_cqe = (cnt > 0); + if (!got_cqe && FLAGS_iouring_poller_yield) { + bthread_yield(); + } + + } else { + // HYBRID: busy-spin N times, then fall back to a zero-timeout + // wait so the Poller thread blocks rather than burning CPU + // when no CQE arrives. + for (int spin = 0; spin < hybrid_spins && !got_cqe; ++spin) { + int cnt = io_uring_peek_batch_cqe( + &poller->ring, cqes, + static_cast(FLAGS_iouring_max_cqe_poll_once)); + got_cqe = (cnt > 0); + } + if (!got_cqe) { + struct io_uring_cqe* cqe = nullptr; + struct __kernel_timespec ts_zero{0, 0}; + io_uring_wait_cqe_timeout(&poller->ring, &cqe, &ts_zero); + } + } + + // c) Dispatch to each socket. + for (SocketId sid : tracked_sids) { + SocketUniquePtr s; + if (Socket::Address(sid, &s) < 0) { continue; } + IouringEndpoint::PollCq(s.get()); + } + + // d) Optional user callback. Runs on the Poller thread after + // every PollCq pass. 
The handle gives safe access to ring() + // for CQ draining and Submit() for SQ submission. + if (poller->callback) { + poller->callback(IouringPollerHandle(args->tag, args->index)); + } + + if (FLAGS_iouring_poller_yield && + mode != IouringPollingMode::SQPOLL) { + bthread_yield(); + } + } // while running + + // 5. Tear-down. + if (poller->release_fn) { + poller->release_fn(IouringPollerHandle(args->tag, args->index)); + } + + if (poller->ring_initialized) { + // Unregister this ring from the global mem-pool before destroying + // the ring so no future region growth tries to update a dead ring. + if (IsFixedBuffersEnabled()) { + IouringMemPool::Instance().RemoveRingRegistrar(&poller->ring); + } + // slot_pool.Destroy() calls io_uring_unregister_buffers before + // we exit the ring – order matters. + poller->slot_pool.Destroy(); + io_uring_queue_exit(&poller->ring); + poller->ring_initialized = false; + } + + return nullptr; + }; // lambda + + // Start the single Poller bthread for this tag. 
+ for (int i = 0; i < 1; ++i) { + auto* fargs = new FnArgs{&pollers[i], &running, tag, i}; + bthread_attr_t attr = BTHREAD_ATTR_NORMAL; + attr.tag = tag; + bthread_attr_set_name(&attr, "IouringPoller"); + pollers[i].callback = callback; + pollers[i].init_fn = init_fn; + pollers[i].release_fn = release_fn; + + if (bthread_start_background(&pollers[i].tid, &attr, fn, fargs) != 0) { + LOG(ERROR) << "Fail to start io_uring poller bthread tag=" << tag + << " index=" << i; + delete fargs; + running.store(false, std::memory_order_relaxed); + return -1; + } + } + return 0; +} + +void IouringEndpoint::PollingModeRelease(bthread_tag_t tag) { + if (tag < 0 || tag >= static_cast(_poller_groups.size())) { return; } + + auto& group = _poller_groups[tag]; + auto& pollers = group.pollers; + auto& running = group.running; + + running.store(false, std::memory_order_relaxed); + + if (pollers[0].tid != INVALID_BTHREAD) { + bthread_join(pollers[0].tid, nullptr); + pollers[0].tid = INVALID_BTHREAD; + } +} + +// --------------------------------------------------------------------------- +// PollerAddSid / PollerRemoveSid +// --------------------------------------------------------------------------- + +void IouringEndpoint::PollerAddSid() { + bthread_tag_t tag = bthread_self_tag(); + if (tag < 0 || tag >= static_cast(_poller_groups.size())) { tag = 0; } + auto& pollers = _poller_groups[tag].pollers; + const size_t index = butil::fmix32(_socket->id()) % pollers.size(); + pollers[index].op_queue.Enqueue(SidOp{_socket->id(), SidOp::ADD}); +} + +void IouringEndpoint::PollerRemoveSid(const IouringReadSlot& slot) { + bthread_tag_t tag = bthread_self_tag(); + if (tag < 0 || tag >= static_cast(_poller_groups.size())) { tag = 0; } + auto& pollers = _poller_groups[tag].pollers; + const size_t index = butil::fmix32(_socket->id()) % pollers.size(); + pollers[index].op_queue.Enqueue(SidOp{_socket->id(), SidOp::REMOVE, slot}); +} + +} // namespace iouring +} // namespace brpc + +#endif // 
BRPC_WITH_IOURING
diff --git a/src/brpc/iouring/iouring_endpoint.h b/src/brpc/iouring/iouring_endpoint.h
new file mode 100644
index 0000000000..362f02170a
--- /dev/null
+++ b/src/brpc/iouring/iouring_endpoint.h
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BRPC_IOURING_ENDPOINT_H
+#define BRPC_IOURING_ENDPOINT_H
+
+#if BRPC_WITH_IOURING
+
+#include
+#include
+#include
+#include
+#include
+#include "brpc/iouring/iouring_helper.h" // IouringPollerHandle, kBrpcCqeTag, IouringPollingMode
+#include "butil/atomicops.h"
+#include "butil/iobuf.h"
+#include "butil/macros.h"
+#include "butil/containers/mpsc_queue.h"
+#include "brpc/socket.h"
+#include "brpc/iouring/iouring_block_pool.h"
+
+namespace brpc {
+class Socket;
+class IouringTransport;
+namespace iouring {
+
+// Tag to identify the operation type in user_data of SQE / CQE.
+//
+// Design note: there are exactly two modes, controlled by
+// --iouring_register_buffers. The two modes are mutually exclusive and
+// never mixed within a single endpoint:
+//
+//   registered   → IOURING_OP_READ_FIXED + IOURING_OP_WRITE_FIXED
+//   unregistered → IOURING_OP_READ + IOURING_OP_WRITE (submitted as WRITEV)
+//
+// No partial / per-block fallback exists. If fixed-buffer initialisation
+// fails the entire io_uring transport is disabled.
+enum IouringOpType : uint8_t {
+    IOURING_OP_READ = 0,        // IORING_OP_READ (--iouring_register_buffers=false)
+    IOURING_OP_WRITE = 1,       // IORING_OP_WRITEV (--iouring_register_buffers=false)
+    IOURING_OP_READ_FIXED = 2,  // IORING_OP_READ_FIXED (--iouring_register_buffers=true)
+    IOURING_OP_WRITE_FIXED = 3, // IORING_OP_WRITE_FIXED (--iouring_register_buffers=true)
+};
+
+// Per-request context stored as user_data in SQE / CQE.
+//
+// bounce is non-null only for IOURING_OP_READ (unregistered mode);
+// it points to the malloc'd bounce buffer allocated in SubmitRead.
+// PollCq wraps it in IOBuf (which takes ownership and calls free()) and
+// re-submits the next plain READ.
+struct IouringReqContext {
+    IouringOpType op;       // operation type
+    int fd;                 // file descriptor
+    SocketId socket_id;     // owning socket id
+    void* bounce{nullptr};  // unregistered-mode bounce buf (may be null)
+};
+
+// ---------------------------------------------------------------------------
+// IouringEndpoint – per-Socket async I/O endpoint backed by an io_uring ring.
+//
+// Two I/O modes, selected once at startup by --iouring_register_buffers:
+//
+// Registered-buffer mode (--iouring_register_buffers=true)
+// ----------------------------------------------------------
+// Every IOBuf block comes from IouringMemPool (a pre-registered slab).
+// Each Poller ring owns one IouringReadSlotPool of receive buffers.
+//
+// AllocateResources()
+//   Posts an ADD SidOp to the Poller's op_queue. The Poller thread
+//   acquires a read slot from slot_pool and issues the first SubmitRead
+//   there – entirely on the Poller thread, no locking needed.
+//
+// SubmitRead()
+//   Issues IORING_OP_READ_FIXED into _read_slot.buf / _read_slot.buf_index.
+//
+// PollCq() (READ_FIXED branch)
+//   res bytes are already in the slot's pinned memory.
+//   IOBuf::append_user_data() wraps them zero-copy; the destructor returns
+//   the block to IouringMemPool (thread-safe) when the last ref drops.
+//   A fresh slot is acquired immediately and the next READ_FIXED is queued.
+//
+// CutFromIOBufList()
+//   Every IOBuf block comes from the registered slab; each block gets its
+//   own IORING_OP_WRITE_FIXED SQE (no WRITEV fallback, no mixed batches).
+//
+// DeallocateResources()
+//   Posts a REMOVE SidOp carrying _read_slot. The Poller thread releases
+//   the slot back to slot_pool.
+//
+// Unregistered mode (--iouring_register_buffers=false)
+// ------------------------------------------------------
+// SubmitRead()       → IORING_OP_READ into a per-call malloc bounce buffer.
+// CutFromIOBufList() → one IORING_OP_WRITEV per call.
+//
+// Thread safety
+// -------------
+// All SQ operations (SubmitRead, CutFromIOBufList, SubmitOneSqe) run on the
+// Poller thread. IouringReadSlotPool is only accessed from the Poller thread
+// so it needs no locking. AllocateResources / DeallocateResources enqueue
+// a SidOp message; slot_pool operations are performed when the Poller dequeues
+// the message.
+// ---------------------------------------------------------------------------
+
+class BAIDU_CACHELINE_ALIGNMENT IouringEndpoint : public SocketUser {
+friend class ::brpc::Socket;
+friend class ::brpc::IouringTransport;
+friend class IouringPollerHandle; // needs _poller_groups and Poller
+public:
+    explicit IouringEndpoint(Socket* s);
+    ~IouringEndpoint() override;
+
+    static int GlobalInitialize();
+    static void GlobalRelease();
+
+    void Reset();
+
+    // Submit async read.
+    //   registered mode   → IORING_OP_READ_FIXED into _read_slot
+    //   unregistered mode → IORING_OP_READ into a malloc bounce buffer
+    int SubmitRead(int fd);
+
+    // Cut data from IOBuf list and submit write SQE(s).
+    ssize_t CutFromIOBufList(butil::IOBuf** data, size_t ndata);
+
+    bool IsWritable() const;
+
+    static void PollCq(Socket* m);  // called by the Poller loop once per tracked socket
+
+    static int PollingModeInitialize(  // 0 = started or already running, -1 = bad tag / start failure
+            bthread_tag_t tag,
+            std::function callback,    // run on the Poller thread after every PollCq pass
+            std::function init_fn,     // run once on the Poller thread after ring setup
+            std::function release_fn); // run once on the Poller thread before tear-down
+    static void PollingModeRelease(bthread_tag_t tag);  // clears `running` and joins the Poller bthread
+
+
+    void DebugInfo(std::ostream& os,
+                   butil::StringPiece connector = "\n") const;
+
+private:
+    int AllocateResources();
+    void DeallocateResources();
+
+    void PollerAddSid();  // enqueue an ADD SidOp for this endpoint's socket
+    void PollerRemoveSid(const IouringReadSlot& slot = IouringReadSlot{});  // enqueue REMOVE carrying |slot|
+
+    // -----------------------------------------------------------------------
+    // Per-endpoint state
+    // -----------------------------------------------------------------------
+    Socket* _socket;
+    butil::atomic _inflight_writes;
+
+    // Fixed read slot (registered mode only; always valid after AllocateResources).
+    IouringReadSlot _read_slot;
+
+    DISALLOW_COPY_AND_ASSIGN(IouringEndpoint);
+
+    // -----------------------------------------------------------------------
+    // Per-poller state
+    // -----------------------------------------------------------------------
+    struct SidOp {
+        enum OpType { ADD, REMOVE };
+        SocketId sid;
+        OpType type;
+        // Only meaningful for REMOVE + fixed-buffer mode: the read slot that
+        // was held by the endpoint. Returned to slot_pool on the Poller
+        // thread so that slot_pool never needs a lock.
+        IouringReadSlot read_slot;
+
+        SidOp() : sid(0), type(ADD), read_slot() {}
+        SidOp(SocketId s, OpType t, IouringReadSlot rs = IouringReadSlot{})
+            : sid(s), type(t), read_slot(rs) {}
+    };
+
+    struct BAIDU_CACHELINE_ALIGNMENT Poller {
+        bthread_t tid{INVALID_BTHREAD};
+        butil::MPSCQueue> op_queue;
+
+        // Called on the Poller thread with the handle bound to this Poller.
+        std::function callback;
+        std::function init_fn;
+        std::function release_fn;
+
+        struct io_uring ring{};
+        bool ring_initialized{false};
+
+        // Per-ring receive-buffer pool (READ_FIXED slots).
+        // Initialised once in the poller thread after ring creation.
+        // Active only when --iouring_register_buffers=true.
+        // All Acquire/Release calls happen on the Poller thread; no lock needed.
+        IouringReadSlotPool slot_pool;
+    };
+
+    // Drain all pending SidOps from poller->op_queue.
+    // Must be called exclusively on the Poller thread.
+    static void PollerDrainOpQueue(Poller* poller,
+                                   std::unordered_set& tracked_sids);
+
+    struct BAIDU_CACHELINE_ALIGNMENT PollerGroup {
+        // Exactly one Poller per bthread_tag (SQ single-producer constraint).
+        PollerGroup() : pollers(1), running(false) {}
+        std::vector pollers;
+        std::atomic running;  // true while the group's Poller bthread should keep looping
+    };
+
+    static std::vector _poller_groups;  // indexed by bthread tag
+    static struct io_uring_params BuildRingParams();
+
+    // Return the Poller that owns this endpoint's ring.
+    // Returns nullptr if the ring is not yet initialised.
+    // (Declared after Poller so the return type is complete.)
+    Poller* GetPoller() const;
+
+    // Single-SQE helper: get one SQE, fill via |prepare_fn|, submit.
+    // Must be called on the Poller thread (no locking).
+    // Returns io_uring_submit() result (>= 0) or -1 (errno set).
+    //   errno=ENOBUFS → SQ full
+    //   errno=ENODEV  → ring not initialised
+    int SubmitOneSqe(std::function prepare_fn);
+};
+
+} // namespace iouring
+} // namespace brpc
+
+#else // !BRPC_WITH_IOURING
+
+class IouringEndpoint {};  // NOTE(review): stub is in the global namespace, unlike brpc::iouring::IouringEndpoint above
+
+#endif // BRPC_WITH_IOURING
+#endif // BRPC_IOURING_ENDPOINT_H
diff --git a/src/brpc/iouring/iouring_helper.cpp b/src/brpc/iouring/iouring_helper.cpp
new file mode 100644
index 0000000000..509e78318c
--- /dev/null
+++ b/src/brpc/iouring/iouring_helper.cpp
@@ -0,0 +1,247 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#if BRPC_WITH_IOURING + +#include +#include + +#include +#include + +#include "butil/atomicops.h" +#include "butil/iobuf.h" // butil::SetDefaultBlockSize +#include "butil/logging.h" +#include "brpc/iouring/iouring_block_pool.h" +#include "brpc/iouring/iouring_helper.h" +#include "brpc/iouring/iouring_endpoint.h" + +DECLARE_int32(task_group_ntags); + +namespace brpc { +namespace iouring { + +// --------------------------------------------------------------------------- +// gflags +// --------------------------------------------------------------------------- +DEFINE_int32(iouring_sq_size, 256, + "Submission-queue entries per io_uring ring."); +DEFINE_int32(iouring_cq_size, 0, + "Completion-queue entries per ring (0 = 2 * sq_size)."); +DEFINE_string(iouring_polling_mode, "none", + "io_uring polling mode: none | sqpoll | iopoll | hybrid."); +DEFINE_int32(iouring_sqpoll_idle_ms, 2000, + "SQPOLL kernel thread idle timeout (ms)."); +DEFINE_int32(iouring_sqpoll_cpu, -1, + "CPU for the SQPOLL kernel thread (-1 = no pin)."); +DEFINE_int32(iouring_hybrid_spin_count, 1000, + "Hybrid mode: busy-spin iterations before blocking."); + +// --------------------------------------------------------------------------- +// Internal state +// --------------------------------------------------------------------------- +static butil::atomic g_available(false); +static IouringPollingMode g_polling_mode = 
IouringPollingMode::NONE; + +// --------------------------------------------------------------------------- +// Polling mode helpers +// --------------------------------------------------------------------------- +static IouringPollingMode ParsePollingMode(const std::string& s) { + if (s == "sqpoll") return IouringPollingMode::SQPOLL; + if (s == "iopoll") return IouringPollingMode::IOPOLL; + if (s == "hybrid") return IouringPollingMode::HYBRID; + if (s != "none") { + LOG(WARNING) << "Unknown --iouring_polling_mode=\"" << s + << "\", using \"none\"."; + } + return IouringPollingMode::NONE; +} + +IouringPollingMode GetPollingMode() { return g_polling_mode; } + +// --------------------------------------------------------------------------- +// Kernel capability probe +// --------------------------------------------------------------------------- +static bool ProbeOpcodes() { + struct io_uring_probe* probe = io_uring_get_probe(); + if (!probe) { + LOG(ERROR) << "io_uring: kernel probe failed (>= 5.1 required)."; + return false; + } + + bool ok = true; + auto need = [&](uint8_t op, const char* name) { + if (!io_uring_opcode_supported(probe, op)) { + LOG(ERROR) << "io_uring: " << name << " not supported."; + ok = false; + } + }; + + need(IORING_OP_READV, "IORING_OP_READV"); + need(IORING_OP_WRITEV, "IORING_OP_WRITEV"); + need(IORING_OP_READ, "IORING_OP_READ"); + + if (IsFixedBuffersEnabled()) { + need(IORING_OP_READ_FIXED, "IORING_OP_READ_FIXED (disable --iouring_register_buffers)"); + need(IORING_OP_WRITE_FIXED, "IORING_OP_WRITE_FIXED (disable --iouring_register_buffers)"); + } + + if (g_polling_mode == IouringPollingMode::SQPOLL || + g_polling_mode == IouringPollingMode::HYBRID) { + // Warn if kernel < 5.11 (SQPOLL_NONFIXED unavailable). 
+ struct io_uring ring_tmp{}; + struct io_uring_params p{}; + p.flags = IORING_SETUP_SQPOLL; + if (io_uring_queue_init_params(1, &ring_tmp, &p) == 0) { + if (!(p.features & IORING_FEAT_SQPOLL_NONFIXED)) { + LOG(WARNING) << "SQPOLL: kernel < 5.11 – all I/O buffers must " + "be pre-registered. Consider enabling " + "--iouring_register_buffers."; + } + io_uring_queue_exit(&ring_tmp); + } + } + + if (g_polling_mode == IouringPollingMode::IOPOLL) { + LOG(WARNING) << "IOPOLL is only effective for O_DIRECT block I/O."; + } + + io_uring_free_probe(probe); + return ok; +} + +// --------------------------------------------------------------------------- +// One-time global initialization +// --------------------------------------------------------------------------- +static void InitImpl() { + g_polling_mode = ParsePollingMode(FLAGS_iouring_polling_mode); + + if (!ProbeOpcodes()) { exit(1); } + + // Initialise the global IOBuf memory pool BEFORE any IOBuf is allocated. + // This replaces butil::iobuf::blockmem_allocate with our registered-slab + // allocator so that ALL subsequent IOBuf blocks reside in memory that is + // (or will be) registered with every io_uring ring. + if (IsFixedBuffersEnabled()) { + const size_t block_sz = + static_cast(FLAGS_iouring_iobuf_block_size); + // Align IOBuf's default block size with the pool's block size so that + // every IOBuf block fits exactly one registered slot. Must be done + // before IouringMemPool::Init() hooks blockmem_allocate (same pattern + // as rdma: butil::SetDefaultBlockSize(GetRdmaBlockSize())). 
+ butil::SetDefaultBlockSize(block_sz); + if (!IouringMemPool::Instance().Init(block_sz)) { + LOG(FATAL) << "IouringMemPool::Init failed – " + "try reducing --iouring_mem_pool_initial_mb or " + "disabling --iouring_register_buffers."; + exit(1); + } + } + + if (IouringEndpoint::GlobalInitialize() < 0) { + LOG(FATAL) << "IouringEndpoint::GlobalInitialize failed"; + exit(1); + } + + atexit([]() { + g_available.store(false, butil::memory_order_release); + IouringEndpoint::GlobalRelease(); + if (IsFixedBuffersEnabled()) { + IouringMemPool::Instance().Destroy(); + } + }); + + g_available.store(true, butil::memory_order_relaxed); + + LOG(INFO) << "io_uring ready." + << " sq_size=" << FLAGS_iouring_sq_size + << " polling_mode=" << FLAGS_iouring_polling_mode + << " fixed_buffers=" << (IsFixedBuffersEnabled() ? "on" : "off"); +} + +static pthread_once_t g_once = PTHREAD_ONCE_INIT; + +void GlobalIouringInitializeOrDie() { + if (pthread_once(&g_once, InitImpl) != 0) { + LOG(FATAL) << "pthread_once failed for GlobalIouringInitializeOrDie"; + exit(1); + } +} + +void GlobalIouringRelease() { + if (g_available.exchange(false, butil::memory_order_acq_rel)) { + IouringEndpoint::GlobalRelease(); + } +} + +// --------------------------------------------------------------------------- +// Runtime queries +// --------------------------------------------------------------------------- +bool IsIouringAvailable() { + return g_available.load(butil::memory_order_acquire); +} + +void GlobalDisableIouring() { + if (g_available.exchange(false, butil::memory_order_acq_rel)) { + LOG(ERROR) << "io_uring disabled due to an unrecoverable error."; + } +} + +int GetIouringSqSize() { return FLAGS_iouring_sq_size; } + +int GetIouringCqSize() { + return FLAGS_iouring_cq_size > 0 ? 
FLAGS_iouring_cq_size + : FLAGS_iouring_sq_size * 2; +} + +// --------------------------------------------------------------------------- +// Poller group lifecycle (delegates to IouringEndpoint) +// --------------------------------------------------------------------------- +bool InitPollingModeWithTag( + bthread_tag_t tag, + std::function callback, + std::function init_fn, + std::function release_fn) { + return IouringEndpoint::PollingModeInitialize( + tag, + std::move(callback), + std::move(init_fn), + std::move(release_fn)) == 0; +} + +void ReleasePollingModeWithTag(bthread_tag_t tag) { + IouringEndpoint::PollingModeRelease(tag); +} + +} // namespace iouring +} // namespace brpc + +#else // !BRPC_WITH_IOURING + +#include +#include "butil/logging.h" + +namespace brpc { +namespace iouring { +void GlobalIouringInitializeOrDie() { + LOG(ERROR) << "Build with -DWITH_IOURING=ON to use io_uring."; + exit(1); +} +} +} + +#endif // BRPC_WITH_IOURING diff --git a/src/brpc/iouring/iouring_helper.h b/src/brpc/iouring/iouring_helper.h new file mode 100644 index 0000000000..c31d963295 --- /dev/null +++ b/src/brpc/iouring/iouring_helper.h @@ -0,0 +1,224 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef BRPC_IOURING_HELPER_H +#define BRPC_IOURING_HELPER_H + +#if BRPC_WITH_IOURING + +#include +#include +#include "bthread/types.h" +#include +// Fixed-buffer / slot-pool interface: +#include "brpc/iouring/iouring_block_pool.h" + +namespace brpc { +namespace iouring { + +// Forward declaration so IouringPollerHandle can declare IouringEndpoint as +// a friend without including iouring_endpoint.h (which itself includes this +// header, so a mutual include would create a cycle). +class IouringEndpoint; + +// --------------------------------------------------------------------------- +// gflags (defined in iouring_helper.cpp) +// --------------------------------------------------------------------------- +DECLARE_int32(iouring_sq_size); +DECLARE_int32(iouring_cq_size); +DECLARE_string(iouring_polling_mode); +DECLARE_int32(iouring_sqpoll_idle_ms); +DECLARE_int32(iouring_sqpoll_cpu); +DECLARE_int32(iouring_hybrid_spin_count); + +// --------------------------------------------------------------------------- +// Polling mode enum +// +// Selects the io_uring ring setup flags and the CQE-reap strategy used by +// the Poller thread. All modes run on the Poller thread; they differ only +// in how aggressively the kernel / Poller busy-polls for completions. +// +// Defined here (not endpoint.h) to avoid a circular include. +// Explicit underlying type allows forward-declaration in endpoint.h. +// --------------------------------------------------------------------------- +enum class IouringPollingMode : int { + NONE = 0, // Interrupt-driven (default). Poller calls + // io_uring_wait_cqe_timeout with a 1 ms timeout; the kernel + // notifies via interrupt when I/O completes. The timeout + // keeps the Poller loop responsive to new connections without + // spinning at 100 % when idle. Suitable for most workloads. + SQPOLL = 1, // IORING_SETUP_SQPOLL – kernel SQ-polling thread submits I/O + // automatically; Poller peeks for CQEs. 
Lowest latency; + // requires CAP_SYS_NICE (or root). + IOPOLL = 2, // IORING_SETUP_IOPOLL – polled block-device completion. + // The kernel never generates interrupts; Poller must peek + // after each submit. Only valid for O_DIRECT block devices. + HYBRID = 3, // SQPOLL + bounded busy-spin: Poller busy-spins N times + // (--iouring_hybrid_spin_count), then falls back to a + // zero-timeout wait. Balances latency and CPU utilisation. +}; + +// --------------------------------------------------------------------------- +// bRPC CQE tag +// +// bRPC marks every SQE it submits by OR-ing bit 63 into user_data: +// +// sqe->user_data = reinterpret_cast(ctx) | kBrpcCqeTag; +// +// Bit 63 is safe to use because Linux x86_64 user-space virtual addresses +// only use bits 0–46 (canonical form); bits 47–63 are always 0 for any +// pointer returned by new/malloc. PollCq only processes CQEs where +// (user_data & kBrpcCqeTag) is set; all other CQEs are left in the ring +// for the user callback to consume. +// +// Users can set user_data to any value that has bit 63 clear – including +// raw pointers, integers, or any application-defined encoding – and bRPC +// will never touch those CQEs. +// --------------------------------------------------------------------------- +static constexpr uint64_t kBrpcCqeTag = (uint64_t(1) << 63); + +// --------------------------------------------------------------------------- +// IouringPollerHandle – opaque handle to a single Poller ring +// +// An IouringPollerHandle is injected by the framework into every callback, +// init_fn, and release_fn registered with InitPollingModeWithTag. It is +// valid only for the duration of that call and must not be stored. +// +// Design rationale +// ---------------- +// The framework already knows which Poller is running when it invokes a +// callback. 
Passing a handle avoids the "context-injection-then-re-query" +// anti-pattern where the user captures (tag, poller_index) in a closure +// only to pass them back to GetPollerRing / SubmitSqesWithLock. +// +// The handle exposes a single operation: +// +// Submit() – calls prepare_fn(ring) and a single io_uring_submit(). +// Always invoked on the Poller thread so no locking is needed. +// The prepare_fn receives the raw io_uring* and may both +// drain pending user CQEs and queue new SQEs. +// +// bRPC's PollCq skips any CQE whose user_data has bit 63 clear (user CQEs) +// and does NOT call io_uring_cqe_seen() on them. The user MUST drain all +// pending user CQEs inside Submit()'s prepare_fn on every callback invocation; +// failing to do so will eventually fill the CQ and block new submissions. +// +// Typical usage: +// +// brpc::iouring::InitPollingModeWithTag( +// tag, +// /*callback=*/[](brpc::iouring::IouringPollerHandle h) { +// h.Submit([](::io_uring* r) -> int { +// // ① Drain all pending user CQEs (bit 63 clear). +// // bRPC skips but does NOT mark them seen – must be done here. +// struct io_uring_cqe* cqe = nullptr; +// while (io_uring_peek_cqe(r, &cqe) == 0) { +// if (cqe->user_data & brpc::iouring::kBrpcCqeTag) break; +// process_my_completion(cqe->user_data, cqe->res); +// io_uring_cqe_seen(r, cqe); // MUST NOT be omitted +// } +// // ② Queue next user SQEs. +// ::io_uring_sqe* sqe = io_uring_get_sqe(r); +// if (!sqe) return 0; // SQ full – retry next round +// io_uring_prep_nop(sqe); +// sqe->user_data = my_token; // bit 63 MUST be 0 +// return 1; +// }); +// }); +// --------------------------------------------------------------------------- +class IouringPollerHandle { +public: + // SQE submission (and optional CQ drain). + // + // Must be called on the Poller thread (no locking needed). + // Calls prepare_fn(ring) so the caller can queue one or more SQEs via + // io_uring_get_sqe(), then issues a single io_uring_submit(). 
+ // + // prepare_fn must return: + // > 0 number of SQEs queued → submit is issued + // == 0 nothing queued → submit is skipped + // < 0 abort → submit is skipped, errno should be set + // + // Returns io_uring_submit() result (>= 0) or -1 (errno set). + // ENODEV – ring not initialised + // EBUSY – prepare_fn returned < 0 + // other – io_uring_submit() error + int Submit(std::function prepare_fn) const; + + // Accessors (rarely needed). + bthread_tag_t tag() const { return tag_; } + int poller_index() const { return index_; } + +private: + // Only IouringEndpoint constructs handles. + friend class IouringEndpoint; + IouringPollerHandle(bthread_tag_t t, int idx) : tag_(t), index_(idx) {} + + bthread_tag_t tag_; + int index_; +}; + +// --------------------------------------------------------------------------- +// Lifecycle +// --------------------------------------------------------------------------- +void GlobalIouringInitializeOrDie(); +void GlobalIouringRelease(); + +// Register per-Poller hooks for the given bthread tag. +// +// Each callback / init_fn / release_fn receives an IouringPollerHandle that +// is bound to the specific Poller ring that invoked it. The handle is valid +// only for the duration of that invocation. +// +// init_fn – called once after the ring is created and registered. +// Use h.Submit() to queue initial SQEs. +// callback – called after every PollCq pass (each Poller loop iteration). +// Drain user CQEs via h.ring() then submit new ones via +// h.Submit(). +// release_fn – called just before the ring is destroyed on shutdown. +// +// All three run on the Poller thread (safe for CQ access without extra locks). 
+bool InitPollingModeWithTag(
+    bthread_tag_t tag,
+    // Each hook is invoked with the handle of the Poller that runs it; an
+    // empty (nullptr) hook is the default for all three.
+    std::function<void(IouringPollerHandle)> callback = nullptr,
+    std::function<void(IouringPollerHandle)> init_fn = nullptr,
+    std::function<void(IouringPollerHandle)> release_fn = nullptr);
+
+// Counterpart of InitPollingModeWithTag for the same bthread tag.
+void ReleasePollingModeWithTag(bthread_tag_t tag);
+
+// ---------------------------------------------------------------------------
+// Runtime queries
+// ---------------------------------------------------------------------------
+bool IsIouringAvailable();
+void GlobalDisableIouring();
+IouringPollingMode GetPollingMode();
+int GetIouringSqSize();
+int GetIouringCqSize();
+
+} // namespace iouring
+} // namespace brpc
+
+#else // !BRPC_WITH_IOURING
+
+// Built without io_uring: only the initializer remains declared.
+namespace brpc {
+namespace iouring {
+void GlobalIouringInitializeOrDie();
+}
+}
+
+#endif // BRPC_WITH_IOURING
+#endif // BRPC_IOURING_HELPER_H
diff --git a/src/brpc/iouring_transport.cpp b/src/brpc/iouring_transport.cpp
new file mode 100644
index 0000000000..4b4d3c6cc0
--- /dev/null
+++ b/src/brpc/iouring_transport.cpp
@@ -0,0 +1,279 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#if BRPC_WITH_IOURING
+
+#include <gflags/gflags.h>
+#include "butil/logging.h"
+#include "brpc/event_dispatcher.h"
+#include "brpc/input_messenger.h"
+#include "brpc/socket.h"
+#include "brpc/reloadable_flags.h"
+#include "brpc/iouring/iouring_helper.h"
+#include "brpc/iouring/iouring_endpoint.h"
+#include "brpc/iouring_transport.h"
+
+namespace brpc {
+
+DECLARE_bool(usercode_in_coroutine);
+DECLARE_bool(usercode_in_pthread);
+
+extern SocketVarsCollector* g_vars;
+
+// -----------------------------------------------------------------------
+// IouringTransport::Init
+// -----------------------------------------------------------------------
+// Binds the transport to `socket`, creates the io_uring endpoint and the
+// TCP fallback transport, and decides who owns the read path.
+
+void IouringTransport::Init(Socket* socket, const SocketOptions& options) {
+    _socket = socket;
+    _default_connect = options.app_connect;
+
+    // Tentatively adopt the caller's edge-trigger callback (or the default
+    // OnNewMessages).  This may be cleared below if io_uring polling takes
+    // over the read path.
+    _on_edge_trigger = options.on_edge_triggered_events;
+    if (options.need_on_edge_trigger && _on_edge_trigger == nullptr) {
+        _on_edge_trigger = InputMessenger::OnNewMessages;
+    }
+
+    // Create the endpoint
+    _iouring_ep = new iouring::IouringEndpoint(socket);
+
+    // Register this socket with the Poller.  AllocateResources enqueues an
+    // ADD SidOp; the Poller thread picks it up, acquires a read slot (when
+    // --iouring_register_buffers=true) and issues the first SubmitRead.
+    if (iouring::IsIouringAvailable()) {
+        if (_iouring_ep->AllocateResources() < 0) {
+            LOG(WARNING) << "Fail to allocate io_uring resources for "
+                         << socket->description() << ", falling back to TCP";
+            delete _iouring_ep;
+            _iouring_ep = nullptr;
+        } else {
+            // io_uring owns the read path regardless of polling mode.
+            // The Poller thread reaps READ / READ_FIXED CQEs and appends data
+            // to socket->_read_buf, then calls ProcessNewMessage directly
+            // (see PollCq).
+            //
+            // We must NOT register an epoll edge-trigger callback here,
+            // because OnNewMessages (the default callback) would call
+            // DoRead() = read(fd), racing with io_uring's reads and stealing
+            // data from the ring – causing partial reads, stalled connections
+            // and protocol parse failures.
+            //
+            // Clearing _on_edge_trigger makes HasOnEdgeTrigger() return
+            // false, so socket.cpp skips AddConsumer(fd) and this fd is
+            // never added to epoll for read events.
+            _on_edge_trigger = nullptr;
+        }
+    }
+
+    // Create the TCP fallback transport (always available).
+    // When _iouring_ep is null (AllocateResources failed) _on_edge_trigger
+    // is still set, so the TCP path works normally via epoll + OnNewMessages.
+    _tcp_transport = std::make_shared<TcpTransport>();
+    _tcp_transport->Init(socket, options);
+}
+
+// -----------------------------------------------------------------------
+// IouringTransport::Release
+// -----------------------------------------------------------------------
+// Destroys the endpoint (if any) and forgets the pointer.
+
+void IouringTransport::Release() {
+    if (_iouring_ep) {
+        delete _iouring_ep;
+        _iouring_ep = nullptr;
+    }
+}
+
+// -----------------------------------------------------------------------
+// IouringTransport::Reset
+// -----------------------------------------------------------------------
+// Resets the endpoint when one exists; always reports success (0).
+
+int IouringTransport::Reset(int32_t /*expected_nref*/) {
+    if (_iouring_ep) {
+        _iouring_ep->Reset();
+    }
+    return 0;
+}
+
+// -----------------------------------------------------------------------
+// IouringTransport::Connect
+// -----------------------------------------------------------------------
+// Returns the app-level connect object captured from SocketOptions in Init.
+
+std::shared_ptr<AppConnect> IouringTransport::Connect() {
+    return _default_connect;
+}
+
+// -----------------------------------------------------------------------
+// IouringTransport::CutFromIOBuf  (single-buffer write)
+// -----------------------------------------------------------------------
+// Returns the number of bytes taken from `buf` (>= 0) or -1 with errno set.
+
+int IouringTransport::CutFromIOBuf(butil::IOBuf* buf) {
+    // If io_uring is available and has capacity, use it.
+    if (_iouring_ep && iouring::IsIouringAvailable() &&
+        _iouring_ep->IsWritable()) {
+        butil::IOBuf* bufs[1] = {buf};
+        const ssize_t nw = _iouring_ep->CutFromIOBufList(bufs, 1);
+        if (nw >= 0) {
+            // Report the byte count, exactly like the fd fallback below and
+            // like CutFromIOBufList(); a hard-coded 0 would make a successful
+            // io_uring write indistinguishable from "nothing written".
+            return static_cast<int>(nw);
+        }
+        if (errno != EAGAIN) {
+            return -1;
+        }
+        // Fall through to TCP fallback on EAGAIN
+        // NOTE(review): if earlier write SQEs for this fd are still in
+        // flight, a synchronous write here could presumably overtake them
+        // on the stream – confirm the endpoint rules this out.
+    }
+    // Fallback: synchronous write via the fd
+    return buf->cut_into_file_descriptor(_socket->fd());
+}
+
+// -----------------------------------------------------------------------
+// IouringTransport::CutFromIOBufList  (multi-buffer write)
+// -----------------------------------------------------------------------
+// Same contract as CutFromIOBuf, for `ndata` buffers.
+
+ssize_t IouringTransport::CutFromIOBufList(butil::IOBuf** buf, size_t ndata) {
+    if (_iouring_ep && iouring::IsIouringAvailable() &&
+        _iouring_ep->IsWritable()) {
+        ssize_t nw = _iouring_ep->CutFromIOBufList(buf, ndata);
+        if (nw >= 0 || errno != EAGAIN) {
+            return nw;
+        }
+        // EAGAIN: fall through to synchronous path
+    }
+    return butil::IOBuf::cut_multiple_into_file_descriptor(
+        _socket->fd(), buf, ndata);
+}
+
+// -----------------------------------------------------------------------
+// IouringTransport::WaitEpollOut
+// -----------------------------------------------------------------------
+//
+// When io_uring polling is active the write path is fully async: SQEs are
+// submitted to the ring and completed by the Poller thread.  There is no
+// need to block on epoll(EPOLLOUT) – instead we wait on _epollout_butex,
+// which is incremented by Socket::WakeAsEpollOut() each time a write CQE
+// is reaped in IouringEndpoint::PollCq.  This keeps the wait/wake path
+// entirely within io_uring / bthread primitives and avoids the extra epoll
+// file descriptor overhead.
+//
+// If io_uring is unavailable (e.g. AllocateResources failed) we fall back
+// to the traditional epoll(EPOLLOUT) path via Socket::WaitEpollOut().
+//
+// Returns 1 after marking the socket failed, 0 otherwise (including
+// timeout).
+// -----------------------------------------------------------------------
+
+int IouringTransport::WaitEpollOut(butil::atomic<int>* epollout_butex,
+                                   bool pollin, timespec duetime) {
+    g_vars->nwaitepollout << 1;
+
+    if (_iouring_ep && iouring::IsIouringAvailable()) {
+        // io_uring path: wait for a write completion to wake us.
+        // PollCq calls dep->_socket->WakeAsEpollOut() on every WRITE/
+        // WRITE_FIXED CQE, which does:
+        //   _epollout_butex->fetch_add(1) + butex_wake_except(...)
+        // So we just need to butex_wait here.
+        //
+        // NOTE(review): `expected` is sampled without re-checking
+        // _iouring_ep->IsWritable() afterwards.  If the waking CQE was
+        // reaped before this load, the wait presumably lasts until
+        // `duetime` – confirm whether a writability re-check is needed.
+        const int expected =
+            epollout_butex->load(butil::memory_order_acquire);
+        if (bthread::butex_wait(epollout_butex, expected, &duetime) < 0) {
+            // EAGAIN (value already changed) and ETIMEDOUT are benign.
+            if (errno != EAGAIN && errno != ETIMEDOUT) {
+                const int saved_errno = errno;
+                PLOG(WARNING) << "butex_wait failed for " << _socket;
+                _socket->SetFailed(saved_errno,
+                                   "butex_wait failed for %s: %s",
+                                   _socket->description().c_str(),
+                                   berror(saved_errno));
+                return 1;
+            }
+        }
+        return 0;
+    }
+
+    // Fallback: io_uring is unavailable for this socket (AllocateResources
+    // failed at Init time, so _iouring_ep is null and writes go through the
+    // synchronous fd path which can return EAGAIN).  Fall back to epoll so
+    // that KeepWrite does not busy-spin when the send buffer is full.
+    const int rc = _socket->WaitEpollOut(_socket->fd(), pollin, &duetime);
+    if (rc < 0 && errno != ETIMEDOUT) {
+        const int saved_errno = errno;
+        PLOG(WARNING) << "Fail to wait epollout of " << _socket;
+        _socket->SetFailed(saved_errno, "Fail to wait epollout of %s: %s",
+                           _socket->description().c_str(), berror(saved_errno));
+        return 1;
+    }
+    return 0;
+}
+
+// -----------------------------------------------------------------------
+// IouringTransport::ProcessEvent
+// -----------------------------------------------------------------------
+// Runs OnEdge inline in coroutine mode, otherwise in a background bthread;
+// if the bthread cannot be started OnEdge still runs inline.
+
+void IouringTransport::ProcessEvent(bthread_attr_t attr) {
+    bthread_t tid;
+    if (FLAGS_usercode_in_coroutine) {
+        OnEdge(_socket);
+    } else if (bthread_start_background(&tid, &attr, OnEdge, _socket) != 0) {
+        LOG(FATAL) << "Fail to start ProcessEvent bthread";
+        OnEdge(_socket);
+    }
+}
+
+// -----------------------------------------------------------------------
+// IouringTransport::QueueMessage
+// -----------------------------------------------------------------------
+// Takes ownership of the pending message and processes it in a new
+// NOSIGNAL bthread, or inline when coroutine mode is on or the bthread
+// cannot be created.  *num_bthread_created counts only started bthreads.
+
+void IouringTransport::QueueMessage(InputMessageClosure& input_msg,
+                                    int* num_bthread_created,
+                                    bool /*last_msg*/) {
+    InputMessageBase* to_run_msg = input_msg.release();
+    if (!to_run_msg) {
+        return;
+    }
+    bthread_t th;
+    bthread_attr_t tmp =
+        (FLAGS_usercode_in_pthread ? BTHREAD_ATTR_PTHREAD : BTHREAD_ATTR_NORMAL) |
+        BTHREAD_NOSIGNAL;
+    tmp.keytable_pool = _socket->keytable_pool();
+    tmp.tag = bthread_self_tag();
+    bthread_attr_set_name(&tmp, "ProcessInputMessage");
+    if (!FLAGS_usercode_in_coroutine &&
+        bthread_start_background(&th, &tmp, ProcessInputMessage, to_run_msg) == 0) {
+        ++*num_bthread_created;
+    } else {
+        ProcessInputMessage(to_run_msg);
+    }
+}
+
+// -----------------------------------------------------------------------
+// IouringTransport::Debug
+// -----------------------------------------------------------------------
+// Streams the endpoint's debug info; writes nothing without an endpoint.
+
+void IouringTransport::Debug(std::ostream& os) {
+    if (_iouring_ep) {
+        _iouring_ep->DebugInfo(os);
+    }
+}
+
+// -----------------------------------------------------------------------
+// IouringTransport::ContextInitOrDie  (called once at server/channel start)
+// -----------------------------------------------------------------------
+
+int IouringTransport::ContextInitOrDie(bool /*serverOrNot*/,
+                                       const void* /*_options*/) {
+    iouring::GlobalIouringInitializeOrDie();
+    return 0;
+}
+
+} // namespace brpc
+
+#endif // BRPC_WITH_IOURING
diff --git a/src/brpc/iouring_transport.h b/src/brpc/iouring_transport.h
new file mode 100644
index 0000000000..364853fbaf
--- /dev/null
+++ b/src/brpc/iouring_transport.h
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef BRPC_IOURING_TRANSPORT_H
+#define BRPC_IOURING_TRANSPORT_H
+
+#if BRPC_WITH_IOURING
+
+#include "brpc/socket.h"
+#include "brpc/channel.h"
+#include "brpc/transport.h"
+#include "brpc/tcp_transport.h"
+
+// Forward-declare IouringEndpoint to break the circular include cycle between
+// iouring_endpoint.h (which would include iouring_transport.h) and this file.
+namespace brpc {
+namespace iouring {
+class IouringEndpoint;
+} // namespace iouring
+} // namespace brpc
+
+namespace brpc {
+
+// IouringTransport wraps an IouringEndpoint and implements the Transport
+// interface so that bRPC's Socket can use io_uring for async I/O instead of
+// the default epoll-based mechanism.
+//
+// Design mirrors RdmaTransport:
+//  - On the write path, CutFromIOBufList delegates to IouringEndpoint which
+//    submits writev SQEs to the io_uring ring.
+//  - On the read path, the poller drives IouringEndpoint::PollCq, which reaps
+//    CQEs and appends received bytes to socket->_read_buf.
+//  - ProcessEvent / QueueMessage are identical to TcpTransport.
+class IouringTransport : public Transport {
+    friend class TransportFactory;
+    friend class iouring::IouringEndpoint;
+public:
+    // Transport interface.
+    void Init(Socket* socket, const SocketOptions& options) override;
+    void Release() override;
+    int Reset(int32_t expected_nref) override;
+    std::shared_ptr<AppConnect> Connect() override;
+    int CutFromIOBuf(butil::IOBuf* buf) override;
+    ssize_t CutFromIOBufList(butil::IOBuf** buf, size_t ndata) override;
+    int WaitEpollOut(butil::atomic<int>* _epollout_butex,
+                     bool pollin, timespec duetime) override;
+    void ProcessEvent(bthread_attr_t attr) override;
+    void QueueMessage(InputMessageClosure& input_msg,
+                      int* num_bthread_created, bool last_msg) override;
+    void Debug(std::ostream& os) override;
+
+    // Non-owning access to the endpoint; nullptr when there is none.
+    iouring::IouringEndpoint* GetIouringEp() {
+        return _iouring_ep;
+    }
+
+    // Global context initialization (called once per server/channel start)
+    static int ContextInitOrDie(bool serverOrNot, const void* _options);
+
+private:
+    // The io_uring endpoint
+    iouring::IouringEndpoint* _iouring_ep = nullptr;
+
+    // Fallback TCP transport (used when io_uring is unavailable at runtime)
+    std::shared_ptr<TcpTransport> _tcp_transport;
+};
+
+} // namespace brpc
+
+#endif // BRPC_WITH_IOURING
+#endif // BRPC_IOURING_TRANSPORT_H
diff --git a/src/brpc/socket.h b/src/brpc/socket.h
index 816fccdf27..43f6b4c2ce 100644
--- a/src/brpc/socket.h
+++ b/src/brpc/socket.h
@@ -57,6 +57,9 @@ namespace rdma {
 class RdmaEndpoint;
 class RdmaConnect;
 }
+namespace iouring {
+class IouringEndpoint;
+}
 
 class Socket;
 class AuthContext;
@@ -317,6 +320,7 @@ friend class policy::RtmpContext;
 friend class schan::ChannelBalancer;
 friend class rdma::RdmaEndpoint;
 friend class rdma::RdmaConnect;
+friend class iouring::IouringEndpoint;
 friend class HealthCheckTask;
 friend class OnAppHealthCheckDone;
 friend class HealthCheckManager;
@@ -327,6 +331,7 @@ friend void DereferenceSocket(Socket*);
 friend class Transport;
 friend class TcpTransport;
 friend class RdmaTransport;
+friend class IouringTransport;
 friend
class TransportFactory;
     class SharedPart;
     struct WriteRequest;
diff --git a/src/brpc/socket_mode.h b/src/brpc/socket_mode.h
index b5d42be4aa..18a760a153 100644
--- a/src/brpc/socket_mode.h
+++ b/src/brpc/socket_mode.h
@@ -19,8 +19,9 @@
 #define BRPC_SOCKET_MODE_H
 namespace brpc {
 enum SocketMode {
-    SOCKET_MODE_TCP = 0,
-    SOCKET_MODE_RDMA = 1
+    SOCKET_MODE_TCP     = 0,
+    SOCKET_MODE_RDMA    = 1,
+    SOCKET_MODE_IOURING = 2
 };
 } // namespace brpc
 #endif //BRPC_SOCKET_MODE_H
\ No newline at end of file
diff --git a/src/brpc/transport_factory.cpp b/src/brpc/transport_factory.cpp
index b689e2edd2..ed9870f497 100644
--- a/src/brpc/transport_factory.cpp
+++ b/src/brpc/transport_factory.cpp
@@ -18,6 +18,7 @@
 #include "brpc/transport_factory.h"
 #include "brpc/tcp_transport.h"
 #include "brpc/rdma_transport.h"
+#include "brpc/iouring_transport.h"
 
 namespace brpc {
 int TransportFactory::ContextInitOrDie(SocketMode mode, bool serverOrNot, const void* _options) {
@@ -28,6 +29,11 @@ int TransportFactory::ContextInitOrDie(SocketMode mode, bool serverOrNot, const
     else if (mode == SOCKET_MODE_RDMA) {
         return RdmaTransport::ContextInitOrDie(serverOrNot, _options);
     }
+#endif
+#if BRPC_WITH_IOURING
+    else if (mode == SOCKET_MODE_IOURING) {
+        return IouringTransport::ContextInitOrDie(serverOrNot, _options);
+    }
 #endif
     else {
         LOG(ERROR) << "unknown transport type " << mode;
@@ -43,6 +49,11 @@ std::unique_ptr<Transport> TransportFactory::CreateTransport(SocketMode mode) {
     else if (mode == SOCKET_MODE_RDMA) {
         return std::unique_ptr<RdmaTransport>(new RdmaTransport());
     }
+#endif
+#if BRPC_WITH_IOURING
+    else if (mode == SOCKET_MODE_IOURING) {
+        return std::unique_ptr<IouringTransport>(new IouringTransport());
+    }
 #endif
     else {
         LOG(ERROR) << "socket_mode set error";
diff --git a/src/bthread/types.h b/src/bthread/types.h
index d46de1e835..f3c51a7e16 100644
--- a/src/bthread/types.h
+++ b/src/bthread/types.h
@@ -152,7 +152,14 @@ static const size_t BTHREAD_EPOLL_THREAD_NUM = 1;
 static const bthread_t BTHREAD_ATOMIC_INIT = 0;
 
 // Min/Max number of work pthreads.
+// When RDMA or io_uring is enabled (macro value non-zero, not merely
+// defined), the floor drops to 1 + BTHREAD_EPOLL_THREAD_NUM because dedicated
+// Poller threads handle I/O; else it stays 3 + BTHREAD_EPOLL_THREAD_NUM.
+#if BRPC_WITH_RDMA || BRPC_WITH_IOURING
+static const int BTHREAD_MIN_CONCURRENCY = 1 + BTHREAD_EPOLL_THREAD_NUM;
+#else
 static const int BTHREAD_MIN_CONCURRENCY = 3 + BTHREAD_EPOLL_THREAD_NUM;
+#endif
 static const int BTHREAD_MAX_CONCURRENCY = 1024;
 // Min/max number of ParkingLot.
 static const int BTHREAD_MIN_PARKINGLOT = 4;
diff --git a/src/butil/single_threaded_pool.h b/src/butil/single_threaded_pool.h
index 7f34b93ccb..7f882fc50a 100644
--- a/src/butil/single_threaded_pool.h
+++ b/src/butil/single_threaded_pool.h
@@ -57,7 +57,7 @@ class SingleThreadedPool {
         Block* next;
         Node nodes[NITEM];
     };
-    static const size_t BLOCK_SIZE = sizeof(Block);
+    static const size_t POOL_BLOCK_SIZE = sizeof(Block);
     static const size_t NITEM = Block::NITEM;
     static const size_t ITEM_SIZE = ITEM_SIZE_IN;