From 59495bf916a1641ba4de6f86c7d40c8c77174903 Mon Sep 17 00:00:00 2001 From: Nic Date: Wed, 29 Apr 2026 22:55:13 +0800 Subject: [PATCH 1/3] fix(ai-proxy-multi): resolve _dns_value in construct_upstream when nil When CP pushes a config update, config_etcd replaces the entire config table, causing the runtime _dns_value field to be lost. This made construct_upstream() return an error during periodic health check validation, logging a noisy warning. Fix by calling resolve_endpoint() as fallback when _dns_value is nil. This is safe because the DNS resolver has TTL-based caching, and this path only triggers after config table replacement. --- apisix/plugins/ai-proxy-multi.lua | 6 +++- t/plugin/ai-proxy-multi3.t | 47 +++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/apisix/plugins/ai-proxy-multi.lua b/apisix/plugins/ai-proxy-multi.lua index 419af6aaaa2a..a21c477eb4f4 100644 --- a/apisix/plugins/ai-proxy-multi.lua +++ b/apisix/plugins/ai-proxy-multi.lua @@ -439,7 +439,11 @@ function _M.construct_upstream(instance) local upstream = {} local node = instance._dns_value if not node then - return nil, "failed to resolve endpoint for instance: " .. instance.name + resolve_endpoint(instance) + node = instance._dns_value + if not node then + return nil, "failed to resolve endpoint for instance: " .. 
instance.name + end end if not node.host or not node.port then diff --git a/t/plugin/ai-proxy-multi3.t b/t/plugin/ai-proxy-multi3.t index faeb33c915c2..abb093426bba 100644 --- a/t/plugin/ai-proxy-multi3.t +++ b/t/plugin/ai-proxy-multi3.t @@ -1060,3 +1060,50 @@ failed to get health check target status --- error_log releasing existing checker --- timeout: 5 + + + +=== TEST 16: construct_upstream resolves _dns_value when nil (config table replacement scenario) +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-proxy-multi") + + -- Simulate an instance after config table replacement: _dns_value is lost + local instance = { + name = "test-instance", + provider = "openai", + weight = 1, + priority = 0, + override = { + endpoint = "https://api.openai.com:443", + }, + auth = { + header = { + Authorization = "Bearer test-key", + }, + }, + } + + -- Confirm _dns_value is nil (simulating config table replacement) + assert(instance._dns_value == nil, "_dns_value should be nil initially") + + -- construct_upstream should resolve _dns_value as fallback + local upstream, err = plugin.construct_upstream(instance) + if not upstream then + ngx.say("FAIL: " .. err) + return + end + + -- Verify _dns_value was populated + assert(instance._dns_value ~= nil, "_dns_value should be set after construct_upstream") + + ngx.say("host: ", upstream.nodes[1].host) + ngx.say("port: ", upstream.nodes[1].port) + ngx.say("passed") + } + } +--- response_body +host: api.openai.com +port: 443 +passed From 6389779bc49faeb5e5239eec7299d35f4609564b Mon Sep 17 00:00:00 2001 From: Nic Date: Wed, 29 Apr 2026 23:00:07 +0800 Subject: [PATCH 2/3] fix: also handle nil _dns_value in fetch_health_instances Add defensive resolve_endpoint() fallback in fetch_health_instances() where _dns_value is accessed for health check target status. If resolution fails, treat the instance as healthy to avoid dropping traffic. 
--- apisix/plugins/ai-proxy-multi.lua | 20 +++++++++++++++----- t/plugin/ai-proxy-multi3.t | 6 +++--- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/apisix/plugins/ai-proxy-multi.lua b/apisix/plugins/ai-proxy-multi.lua index a21c477eb4f4..54bc2fcf9339 100644 --- a/apisix/plugins/ai-proxy-multi.lua +++ b/apisix/plugins/ai-proxy-multi.lua @@ -253,12 +253,22 @@ local function fetch_health_instances(conf, checkers) local port = ins.checks and ins.checks.active and ins.checks.active.port local node = ins._dns_value - local ok, err = checker:get_target_status(node.host, port or node.port, host) - if ok then + if not node then + resolve_endpoint(ins) + node = ins._dns_value + end + if node then + local ok, err = checker:get_target_status(node.host, port or node.port, host) + if ok then + transform_instances(new_instances, ins) + elseif err then + core.log.warn("failed to get health check target status, addr: ", + node.host, ":", port or node.port, ", host: ", host, ", err: ", err) + end + else + core.log.warn("failed to resolve endpoint for instance: ", ins.name, + ", treating as healthy") transform_instances(new_instances, ins) - elseif err then - core.log.warn("failed to get health check target status, addr: ", - node.host, ":", port or node.port, ", host: ", host, ", err: ", err) end else transform_instances(new_instances, ins) diff --git a/t/plugin/ai-proxy-multi3.t b/t/plugin/ai-proxy-multi3.t index abb093426bba..21930b0d13c6 100644 --- a/t/plugin/ai-proxy-multi3.t +++ b/t/plugin/ai-proxy-multi3.t @@ -1063,7 +1063,7 @@ releasing existing checker -=== TEST 16: construct_upstream resolves _dns_value when nil (config table replacement scenario) +=== TEST 14: construct_upstream resolves _dns_value when nil (config table replacement scenario) --- config location /t { content_by_lua_block { @@ -1076,7 +1076,7 @@ releasing existing checker weight = 1, priority = 0, override = { - endpoint = "https://api.openai.com:443", + endpoint = "https://127.0.0.1:443", 
}, auth = { header = { @@ -1104,6 +1104,6 @@ releasing existing checker } } --- response_body -host: api.openai.com +host: 127.0.0.1 port: 443 passed From 03e6bc1b74d32439cca2eff051cfecbd6b7da337 Mon Sep 17 00:00:00 2001 From: Nic Date: Thu, 30 Apr 2026 10:20:29 +0800 Subject: [PATCH 3/3] fix(ai-proxy-multi): revert nil _dns_value fallback in fetch_health_instances Signed-off-by: Nic --- apisix/plugins/ai-proxy-multi.lua | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/apisix/plugins/ai-proxy-multi.lua b/apisix/plugins/ai-proxy-multi.lua index 54bc2fcf9339..a21c477eb4f4 100644 --- a/apisix/plugins/ai-proxy-multi.lua +++ b/apisix/plugins/ai-proxy-multi.lua @@ -253,22 +253,12 @@ local function fetch_health_instances(conf, checkers) local port = ins.checks and ins.checks.active and ins.checks.active.port local node = ins._dns_value - if not node then - resolve_endpoint(ins) - node = ins._dns_value - end - if node then - local ok, err = checker:get_target_status(node.host, port or node.port, host) - if ok then - transform_instances(new_instances, ins) - elseif err then - core.log.warn("failed to get health check target status, addr: ", - node.host, ":", port or node.port, ", host: ", host, ", err: ", err) - end - else - core.log.warn("failed to resolve endpoint for instance: ", ins.name, - ", treating as healthy") + local ok, err = checker:get_target_status(node.host, port or node.port, host) + if ok then transform_instances(new_instances, ins) + elseif err then + core.log.warn("failed to get health check target status, addr: ", + node.host, ":", port or node.port, ", host: ", host, ", err: ", err) end else transform_instances(new_instances, ins)