diff --git a/apisix/plugins/api-breaker.lua b/apisix/plugins/api-breaker.lua index 440f0ee60940..277b6622fc63 100644 --- a/apisix/plugins/api-breaker.lua +++ b/apisix/plugins/api-breaker.lua @@ -23,245 +23,643 @@ local error = error local ipairs = ipairs -local shared_buffer = ngx.shared["plugin-".. plugin_name] +local shared_buffer = ngx.shared["plugin-" .. plugin_name] if not shared_buffer then - error("failed to get ngx.shared dict when load plugin " .. plugin_name) + error("failed to get ngx.shared dict when load plugin " .. plugin_name) end +-- Circuit breaker states (only for ratio policy) +local CLOSED = 0 +local OPEN = 1 +local HALF_OPEN = 2 local schema = { - type = "object", + type = "object", + properties = { + break_response_code = { + type = "integer", + minimum = 200, + maximum = 599, + }, + break_response_body = { + type = "string" + }, + break_response_headers = { + type = "array", + items = { + type = "object", + properties = { + key = { + type = "string", + minLength = 1 + }, + value = { + type = "string", + minLength = 1 + } + }, + required = { "key", "value" }, + } + }, + max_breaker_sec = { + type = "integer", + minimum = 3, + default = 300, + description = "Circuit breaker duration in seconds (applies to both count and ratio policies)" + }, + policy = { + type = "string", + enum = { "unhealthy-count", "unhealthy-ratio" }, + default = "unhealthy-count", + } + }, + required = { "break_response_code" }, + ["if"] = { + properties = { + policy = { + enum = { "unhealthy-count" }, + }, + }, + }, + ["then"] = { properties = { - break_response_code = { + unhealthy = { + type = "object", + properties = { + http_statuses = { + type = "array", + minItems = 1, + items = { + type = "integer", + minimum = 500, + maximum = 599, + }, + uniqueItems = true, + default = { 500 } + }, + failures = { type = "integer", - minimum = 200, - maximum = 599, + minimum = 1, + default = 3, + } }, - break_response_body = { - type = "string" - }, - break_response_headers = { + 
default = { http_statuses = { 500 }, failures = 3 } + }, + healthy = { + type = "object", + properties = { + http_statuses = { type = "array", + minItems = 1, items = { - type = "object", - properties = { - key = { - type = "string", - minLength = 1 - }, - value = { - type = "string", - minLength = 1 - } - }, - required = {"key", "value"}, - } - }, - max_breaker_sec = { + type = "integer", + minimum = 200, + maximum = 499, + }, + uniqueItems = true, + default = { 200 } + }, + successes = { type = "integer", - minimum = 3, - default = 300, + minimum = 1, + default = 3, + } + }, + default = { http_statuses = { 200 }, successes = 3 } + } + } + }, + ["else"] = { + ["if"] = { + properties = { + policy = { + enum = { "unhealthy-ratio" }, }, + }, + }, + ["then"] = { + properties = { unhealthy = { - type = "object", - properties = { - http_statuses = { - type = "array", - minItems = 1, - items = { - type = "integer", - minimum = 500, - maximum = 599, - }, - uniqueItems = true, - default = {500} - }, - failures = { - type = "integer", - minimum = 1, - default = 3, - } + type = "object", + properties = { + http_statuses = { + type = "array", + minItems = 1, + items = { + type = "integer", + minimum = 500, + maximum = 599, + }, + uniqueItems = true, + default = { 500 } + }, + error_ratio = { + type = "number", + minimum = 0, + maximum = 1, + default = 0.5, + description = "Failure rate threshold to trigger circuit breaker" + }, + min_request_threshold = { + type = "integer", + minimum = 1, + default = 10, + description = "Minimum number of calls before circuit breaker can be triggered" + }, + sliding_window_size = { + type = "integer", + minimum = 10, + maximum = 3600, + default = 300, + description = "Size of the sliding window in seconds" }, - default = {http_statuses = {500}, failures = 3} + permitted_number_of_calls_in_half_open_state = { + type = "integer", + minimum = 1, + maximum = 20, + default = 3, + description = "Number of permitted calls when circuit breaker is 
half-open" + } + }, + default = { + http_statuses = { 500 }, + error_ratio = 0.5, + min_request_threshold = 10, + sliding_window_size = 300, + permitted_number_of_calls_in_half_open_state = 3 + } }, healthy = { - type = "object", - properties = { - http_statuses = { - type = "array", - minItems = 1, - items = { - type = "integer", - minimum = 200, - maximum = 499, - }, - uniqueItems = true, - default = {200} - }, - successes = { - type = "integer", - minimum = 1, - default = 3, - } + type = "object", + properties = { + http_statuses = { + type = "array", + minItems = 1, + items = { + type = "integer", + minimum = 200, + maximum = 499, + }, + uniqueItems = true, + default = { 200 } }, - default = {http_statuses = {200}, successes = 3} + success_ratio = { + type = "number", + minimum = 0, + maximum = 1, + default = 0.6, + description = "Success rate threshold to close circuit breaker from half-open state" + } + }, + default = { http_statuses = { 200 }, success_ratio = 0.6 } } - }, - required = {"break_response_code"}, + } + } + } } - +-- Key generation functions (based on latest APISIX version) local function gen_healthy_key(ctx) - return "healthy-" .. core.request.get_host(ctx) .. ctx.var.uri + return "healthy-" .. core.request.get_host(ctx) .. ctx.var.uri end - local function gen_unhealthy_key(ctx) - return "unhealthy-" .. core.request.get_host(ctx) .. ctx.var.uri + return "unhealthy-" .. core.request.get_host(ctx) .. ctx.var.uri end - local function gen_lasttime_key(ctx) - return "unhealthy-lasttime" .. core.request.get_host(ctx) .. ctx.var.uri + return "unhealthy-lasttime" .. core.request.get_host(ctx) .. ctx.var.uri +end + +-- New key generation functions for ratio policy +local function gen_state_key(ctx) + return "cb-state-" .. core.request.get_host(ctx) .. ctx.var.uri +end + +local function gen_total_requests_key(ctx) + return "cb-total-" .. core.request.get_host(ctx) .. ctx.var.uri end +local function gen_window_start_time_key(ctx) + return "cb-window-" .. 
core.request.get_host(ctx) .. ctx.var.uri +end + +local function gen_last_state_change_key(ctx) + return "cb-last-change-" .. core.request.get_host(ctx) .. ctx.var.uri +end + +local function gen_half_open_calls_key(ctx) + return "cb-half-open-calls-" .. core.request.get_host(ctx) .. ctx.var.uri +end + +local function gen_half_open_success_key(ctx) + return "cb-half-open-success-" .. core.request.get_host(ctx) .. ctx.var.uri +end local _M = { - version = 0.1, - name = plugin_name, - priority = 1005, - schema = schema, + version = 0.1, + name = plugin_name, + priority = 1005, + schema = schema, } - function _M.check_schema(conf) - return core.schema.check(schema, conf) + return core.schema.check(schema, conf) end +-- Circuit breaker state management functions +local function get_circuit_breaker_state(ctx) + local state_key = gen_state_key(ctx) + local state, err = shared_buffer:get(state_key) + if err then + core.log.warn("failed to get circuit breaker state: ", err) + return CLOSED + end + return state or CLOSED +end -function _M.access(conf, ctx) - local unhealthy_key = gen_unhealthy_key(ctx) - -- unhealthy counts - local unhealthy_count, err = shared_buffer:get(unhealthy_key) - if err then - core.log.warn("failed to get unhealthy_key: ", - unhealthy_key, " err: ", err) - return - end +local function set_circuit_breaker_state(ctx, state) + local state_key = gen_state_key(ctx) + local last_change_key = gen_last_state_change_key(ctx) + local current_time = ngx.time() - if not unhealthy_count then - return + shared_buffer:set(state_key, state) + shared_buffer:set(last_change_key, current_time) + + core.log.info("Circuit breaker state changed to: ", state, " at: ", current_time) +end + +-- Sliding window management +local function reset_sliding_window(ctx, current_time, window_size) + local window_start_key = gen_window_start_time_key(ctx) + local total_requests_key = gen_total_requests_key(ctx) + local unhealthy_key = gen_unhealthy_key(ctx) + + 
shared_buffer:set(window_start_key, current_time) + shared_buffer:set(total_requests_key, 0) + shared_buffer:set(unhealthy_key, 0) + + -- Reset circuit breaker state to CLOSED when sliding window resets + shared_buffer:delete(gen_state_key(ctx)) + shared_buffer:delete(gen_last_state_change_key(ctx)) + shared_buffer:delete(gen_half_open_calls_key(ctx)) + shared_buffer:delete(gen_half_open_success_key(ctx)) + + core.log.info("Sliding window reset at: ", current_time, " window size: ", window_size, "s") +end + +local function check_and_reset_window(ctx, conf) + local current_time = ngx.time() + local window_start_key = gen_window_start_time_key(ctx) + local window_start_time, err = shared_buffer:get(window_start_key) + + if err then + core.log.warn("failed to get window start time: ", err) + return + end + + local window_size = conf.unhealthy.sliding_window_size or 300 + + if not window_start_time or (current_time - window_start_time) >= window_size then + reset_sliding_window(ctx, current_time, window_size) + end +end + +-- Count-based circuit breaker (based on latest APISIX version) +local function count_based_access(conf, ctx) + local unhealthy_key = gen_unhealthy_key(ctx) + -- unhealthy counts + local unhealthy_count, err = shared_buffer:get(unhealthy_key) + if err then + core.log.warn("failed to get unhealthy_key: ", + unhealthy_key, " err: ", err) + return + end + + if not unhealthy_count then + return + end + + -- timestamp of the last time a unhealthy state was triggered + local lasttime_key = gen_lasttime_key(ctx) + local lasttime, err = shared_buffer:get(lasttime_key) + if err then + core.log.warn("failed to get lasttime_key: ", + lasttime_key, " err: ", err) + return + end + + if not lasttime then + return + end + + local failure_times = math.floor(unhealthy_count / conf.unhealthy.failures) + if failure_times < 1 then + failure_times = 1 + end + + -- cannot exceed the maximum value of the user configuration + local breaker_time = 2 ^ failure_times + if 
breaker_time > conf.max_breaker_sec then + breaker_time = conf.max_breaker_sec + end + core.log.info("breaker_time: ", breaker_time) + + -- breaker + if lasttime + breaker_time >= ngx.time() then + if conf.break_response_body then + if conf.break_response_headers then + for _, value in ipairs(conf.break_response_headers) do + local val = core.utils.resolve_var(value.value, ctx.var) + core.response.add_header(value.key, val) + end + end + return conf.break_response_code, conf.break_response_body end + return conf.break_response_code + end - -- timestamp of the last time a unhealthy state was triggered - local lasttime_key = gen_lasttime_key(ctx) - local lasttime, err = shared_buffer:get(lasttime_key) + return +end + +-- Ratio-based circuit breaker +local function ratio_based_access(conf, ctx) + -- Check and reset sliding window first to ensure consistent state + check_and_reset_window(ctx, conf) + + local current_state = get_circuit_breaker_state(ctx) + local current_time = ngx.time() + + -- Handle OPEN state + if current_state == OPEN then + local last_change_key = gen_last_state_change_key(ctx) + local last_change_time, err = shared_buffer:get(last_change_key) if err then - core.log.warn("failed to get lasttime_key: ", - lasttime_key, " err: ", err) - return + core.log.warn("failed to get last change time: ", err) + return conf.break_response_code, conf.break_response_body or "Service temporarily unavailable" end - if not lasttime then - return + local wait_duration = conf.max_breaker_sec or 60 + if last_change_time and (current_time - last_change_time) >= wait_duration then + -- Transition to HALF_OPEN + set_circuit_breaker_state(ctx, HALF_OPEN) + -- Reset half-open counters + shared_buffer:set(gen_half_open_calls_key(ctx), 0) + shared_buffer:set(gen_half_open_success_key(ctx), 0) + core.log.info("Circuit breaker transitioned from OPEN to HALF_OPEN") + return -- Allow this request to pass + else + -- Still in OPEN state, reject request + if 
conf.break_response_headers then + for _, value in ipairs(conf.break_response_headers) do + local val = core.utils.resolve_var(value.value, ctx.var) + core.response.add_header(value.key, val) + end + end + return conf.break_response_code, conf.break_response_body or "Service temporarily unavailable" end + end - local failure_times = math.floor(unhealthy_count / conf.unhealthy.failures) - if failure_times < 1 then - failure_times = 1 + -- Handle HALF_OPEN state + if current_state == HALF_OPEN then + local half_open_calls_key = gen_half_open_calls_key(ctx) + local half_open_calls, err = shared_buffer:incr(half_open_calls_key, 1, 0) + if err then + core.log.warn("failed to increment half-open calls: ", err) end - -- cannot exceed the maximum value of the user configuration - local breaker_time = 2 ^ failure_times - if breaker_time > conf.max_breaker_sec then - breaker_time = conf.max_breaker_sec - end - core.log.info("breaker_time: ", breaker_time) - - -- breaker - if lasttime + breaker_time >= ngx.time() then - if conf.break_response_body then - if conf.break_response_headers then - for _, value in ipairs(conf.break_response_headers) do - local val = core.utils.resolve_var(value.value, ctx.var) - core.response.add_header(value.key, val) - end - end - return conf.break_response_code, conf.break_response_body - end - return conf.break_response_code + local permitted_calls = conf.unhealthy.permitted_number_of_calls_in_half_open_state or 3 + if half_open_calls > permitted_calls then + -- Too many calls in half-open state, reject + return conf.break_response_code, conf.break_response_body or "Service temporarily unavailable" end + -- Allow request to pass for evaluation return -end + end + -- CLOSED state - check if we should transition to OPEN + local total_requests_key = gen_total_requests_key(ctx) + local unhealthy_key = gen_unhealthy_key(ctx) -function _M.log(conf, ctx) - local unhealthy_key = gen_unhealthy_key(ctx) - local healthy_key = gen_healthy_key(ctx) - local 
upstream_status = core.response.get_upstream_status(ctx) + local total_requests, err = shared_buffer:get(total_requests_key) + if err then + core.log.warn("failed to get total requests: ", err) + return + end - if not upstream_status then - return + local unhealthy_count, err = shared_buffer:get(unhealthy_key) + if err then + core.log.warn("failed to get unhealthy count: ", err) + return + end + + if total_requests and unhealthy_count and total_requests > 0 then + local minimum_calls = conf.unhealthy.min_request_threshold or 10 + local failure_threshold = conf.unhealthy.error_ratio or 0.5 + + if total_requests >= minimum_calls then + local failure_rate = unhealthy_count / total_requests + -- Use precise comparison to avoid floating point issues + local rounded_failure_rate = math.floor(failure_rate * 10000 + 0.5) / 10000 + local rounded_threshold = math.floor(failure_threshold * 10000 + 0.5) / 10000 + + core.log.info("Circuit breaker check - total: ", total_requests, + " failures: ", unhealthy_count, + " rate: ", rounded_failure_rate, + " threshold: ", rounded_threshold) + + if rounded_failure_rate >= rounded_threshold then + -- Transition to OPEN state + set_circuit_breaker_state(ctx, OPEN) + core.log.warn("Circuit breaker OPENED - failure rate: ", rounded_failure_rate, + " >= threshold: ", rounded_threshold) + return conf.break_response_code, conf.break_response_body or "Service temporarily unavailable" + end end + end - -- unhealthy process - if core.table.array_find(conf.unhealthy.http_statuses, - upstream_status) - then - local unhealthy_count, err = shared_buffer:incr(unhealthy_key, 1, 0) - if err then - core.log.warn("failed to incr unhealthy_key: ", unhealthy_key, - " err: ", err) - end - core.log.info("unhealthy_key: ", unhealthy_key, " count: ", - unhealthy_count) - - shared_buffer:delete(healthy_key) - - -- whether the user-configured number of failures has been reached, - -- and if so, the timestamp for entering the unhealthy state. 
- if unhealthy_count % conf.unhealthy.failures == 0 then - shared_buffer:set(gen_lasttime_key(ctx), ngx.time(), - conf.max_breaker_sec) - core.log.info("update unhealthy_key: ", unhealthy_key, " to ", - unhealthy_count) - end + return +end - return - end +function _M.access(conf, ctx) + if conf.policy == "unhealthy-ratio" then + return ratio_based_access(conf, ctx) + else + -- Default to count-based (unhealthy-count) + return count_based_access(conf, ctx) + end +end - -- health process - if not core.table.array_find(conf.healthy.http_statuses, upstream_status) then - return - end +-- Count-based logging (based on latest APISIX version) +local function count_based_log(conf, ctx) + local unhealthy_key = gen_unhealthy_key(ctx) + local healthy_key = gen_healthy_key(ctx) + local upstream_status = core.response.get_upstream_status(ctx) + + if not upstream_status then + return + end - local unhealthy_count, err = shared_buffer:get(unhealthy_key) + -- unhealthy process + if core.table.array_find(conf.unhealthy.http_statuses, + upstream_status) + then + local unhealthy_count, err = shared_buffer:incr(unhealthy_key, 1, 0) if err then - core.log.warn("failed to `get` unhealthy_key: ", unhealthy_key, - " err: ", err) + core.log.warn("failed to incr unhealthy_key: ", unhealthy_key, + " err: ", err) end - - if not unhealthy_count then - return + core.log.info("unhealthy_key: ", unhealthy_key, " count: ", + unhealthy_count) + + shared_buffer:delete(healthy_key) + + -- whether the user-configured number of failures has been reached, + -- and if so, the timestamp for entering the unhealthy state. 
+ if unhealthy_count % conf.unhealthy.failures == 0 then + shared_buffer:set(gen_lasttime_key(ctx), ngx.time(), + conf.max_breaker_sec) + core.log.info("update unhealthy_key: ", unhealthy_key, " to ", + unhealthy_count) end - local healthy_count, err = shared_buffer:incr(healthy_key, 1, 0) + return + end + + -- health process + if not core.table.array_find(conf.healthy.http_statuses, upstream_status) then + return + end + + local unhealthy_count, err = shared_buffer:get(unhealthy_key) + if err then + core.log.warn("failed to `get` unhealthy_key: ", unhealthy_key, + " err: ", err) + end + + if not unhealthy_count then + return + end + + local healthy_count, err = shared_buffer:incr(healthy_key, 1, 0) + if err then + core.log.warn("failed to `incr` healthy_key: ", healthy_key, + " err: ", err) + end + + -- clear related status + if healthy_count >= conf.healthy.successes then + -- stat change to normal + core.log.info("change to normal, ", healthy_key, " ", healthy_count) + shared_buffer:delete(gen_lasttime_key(ctx)) + shared_buffer:delete(unhealthy_key) + shared_buffer:delete(healthy_key) + end + + return +end + +-- Ratio-based logging +local function ratio_based_log(conf, ctx) + local upstream_status = core.response.get_upstream_status(ctx) + if not upstream_status then + return + end + + local current_state = get_circuit_breaker_state(ctx) + + -- Increment total request counter + local total_requests_key = gen_total_requests_key(ctx) + local total_requests, err = shared_buffer:incr(total_requests_key, 1, 0) + if err then + core.log.warn("failed to increment total requests: ", err) + end + + -- Handle response based on status + local is_failure = core.table.array_find(conf.unhealthy.http_statuses, upstream_status) + local is_success = not is_failure and core.table.array_find(conf.healthy.http_statuses, upstream_status) + + if is_failure then + -- Increment failure counter + local unhealthy_key = gen_unhealthy_key(ctx) + local unhealthy_count, err = 
shared_buffer:incr(unhealthy_key, 1, 0) if err then - core.log.warn("failed to `incr` healthy_key: ", healthy_key, - " err: ", err) + core.log.warn("failed to increment unhealthy count: ", err) + end + + core.log.info("Request failed - status: ", upstream_status, + " total: ", total_requests, + " failures: ", unhealthy_count) + + -- If in HALF_OPEN state and got a failure, immediately go back to OPEN + if current_state == HALF_OPEN then + set_circuit_breaker_state(ctx, OPEN) + core.log.warn("Circuit breaker returned to OPEN state due to failure in HALF_OPEN") + -- Clean up half-open counters + shared_buffer:delete(gen_half_open_calls_key(ctx)) + shared_buffer:delete(gen_half_open_success_key(ctx)) end + elseif is_success then + core.log.info("Request succeeded - status: ", upstream_status, " total: ", total_requests) + + -- Handle HALF_OPEN state success + if current_state == HALF_OPEN then + local half_open_success_key = gen_half_open_success_key(ctx) + local success_count, err = shared_buffer:incr(half_open_success_key, 1, 0) + if err then + core.log.warn("failed to increment half-open success count: ", err) + end + + local half_open_calls_key = gen_half_open_calls_key(ctx) + local total_calls, err = shared_buffer:get(half_open_calls_key) + if err then + core.log.warn("failed to get half-open calls count: ", err) + return + end + + local permitted_calls = conf.unhealthy.permitted_number_of_calls_in_half_open_state or 3 + if total_calls and total_calls >= permitted_calls then + -- Check success rate threshold + local success_ratio = 0.6 -- Default value + if conf.healthy and conf.healthy.success_ratio then + success_ratio = conf.healthy.success_ratio + end - -- clear related status - if healthy_count >= conf.healthy.successes then - -- stat change to normal - core.log.info("change to normal, ", healthy_key, " ", healthy_count) - shared_buffer:delete(gen_lasttime_key(ctx)) - shared_buffer:delete(unhealthy_key) - shared_buffer:delete(healthy_key) + local 
success_rate = success_count / total_calls + if success_rate >= success_ratio then + -- Transition back to CLOSED state + set_circuit_breaker_state(ctx, CLOSED) + core.log.info("Circuit breaker transitioned from HALF_OPEN to CLOSED - success rate: ", + success_rate, " >= threshold: ", success_ratio) + + -- Clean up all counters for fresh start + shared_buffer:delete(gen_half_open_calls_key(ctx)) + shared_buffer:delete(gen_half_open_success_key(ctx)) + shared_buffer:delete(gen_unhealthy_key(ctx)) + shared_buffer:delete(gen_total_requests_key(ctx)) + shared_buffer:delete(gen_window_start_time_key(ctx)) + else + -- Success rate too low, return to OPEN state + set_circuit_breaker_state(ctx, OPEN) + core.log.warn("Circuit breaker returned to OPEN state - success rate: ", + success_rate, " < threshold: ", success_ratio) + -- Clean up half-open counters + shared_buffer:delete(gen_half_open_calls_key(ctx)) + shared_buffer:delete(gen_half_open_success_key(ctx)) + end + end end + end +end - return +function _M.log(conf, ctx) + if conf.policy == "unhealthy-ratio" then + ratio_based_log(conf, ctx) + else + -- Default to count-based (unhealthy-count) + count_based_log(conf, ctx) + end end return _M diff --git a/docs/en/latest/plugins/api-breaker.md b/docs/en/latest/plugins/api-breaker.md index c60070d42ea2..9b401257ef46 100644 --- a/docs/en/latest/plugins/api-breaker.md +++ b/docs/en/latest/plugins/api-breaker.md @@ -30,14 +30,30 @@ description: This document describes the information about the Apache APISIX api The `api-breaker` Plugin implements circuit breaker functionality to protect Upstream services. 
+This plugin supports two circuit breaker policies: + +- **Failure count-based circuit breaking (`unhealthy-count`)**: Triggers the circuit breaker when the failure count reaches the threshold +- **Error ratio-based circuit breaking (`unhealthy-ratio`)**: Triggers the circuit breaker when the error rate within a sliding time window reaches the threshold + :::note +**Failure count-based circuit breaking (`unhealthy-count`)**: + Whenever the Upstream service responds with a status code from the configured `unhealthy.http_statuses` list for the configured `unhealthy.failures` number of times, the Upstream service will be considered unhealthy. The request is then retried in 2, 4, 8, 16 ... seconds until the `max_breaker_sec`. In an unhealthy state, if the Upstream service responds with a status code from the configured list `healthy.http_statuses` for `healthy.successes` times, the service is considered healthy again. +**Error ratio-based circuit breaking (`unhealthy-ratio`)**: + +This policy is based on error rate statistics collected over a sliding time window. When the total number of requests reaches `min_request_threshold` and the error rate reaches or exceeds `error_ratio` within the `sliding_window_size` time window, the circuit breaker enters the open state for `max_breaker_sec` seconds. + +The circuit breaker has three states: +- **CLOSED**: Normal request forwarding +- **OPEN**: Directly returns the circuit breaker response without forwarding requests +- **HALF_OPEN**: Entered after `max_breaker_sec` seconds in the OPEN state. Allows a limited number of requests (`unhealthy.permitted_number_of_calls_in_half_open_state`) to test if the service has recovered: a failed request reopens the circuit breaker, and it closes again once the success rate of these requests reaches `healthy.success_ratio` + ::: ## Attributes @@ -47,15 +63,35 @@ In an unhealthy state, if the Upstream service responds with a status code from | break_response_code | integer | True | | [200, ..., 599] | HTTP error code to return when Upstream is unhealthy. | | break_response_body | string | False | | | Body of the response message to return when Upstream is unhealthy. 
| | break_response_headers | array[object] | False | | [{"key":"header_name","value":"can contain Nginx $var"}] | Headers of the response message to return when Upstream is unhealthy. Can only be configured when the `break_response_body` attribute is configured. The values can contain APISIX variables. For example, we can use `{"key":"X-Client-Addr","value":"$remote_addr:$remote_port"}`. | -| max_breaker_sec | integer | False | 300 | >=3 | Maximum time in seconds for circuit breaking. | +| max_breaker_sec | integer | False | 300 | >=3 | Circuit breaking time in seconds. With the `unhealthy-count` policy, this is the upper limit of the break time, which grows as 2, 4, 8, ... seconds. With the `unhealthy-ratio` policy, this is how long the circuit breaker stays open before it moves to the half-open state. | +| policy | string | False | "unhealthy-count" | ["unhealthy-count", "unhealthy-ratio"] | Circuit breaker policy. `unhealthy-count` for failure count-based circuit breaking, `unhealthy-ratio` for error ratio-based circuit breaking. | + +### Failure count-based circuit breaking (policy = "unhealthy-count") + +| Name | Type | Required | Default | Valid values | Description | +|-------------------------|----------------|----------|---------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | unhealthy.http_statuses | array[integer] | False | [500] | [500, ..., 599] | Status codes of Upstream to be considered unhealthy. | | unhealthy.failures | integer | False | 3 | >=1 | Number of failures within a certain period of time for the Upstream service to be considered unhealthy. | | healthy.http_statuses | array[integer] | False | [200] | [200, ..., 499] | Status codes of Upstream to be considered healthy. | | healthy.successes | integer | False | 3 | >=1 | Number of consecutive healthy requests for the Upstream service to be considered healthy. 
| +### Error ratio-based circuit breaking (policy = "unhealthy-ratio") + +| Name | Type | Required | Default | Valid values | Description | +|-------------------------|----------------|----------|---------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| unhealthy.http_statuses | array[integer] | False | [500] | [500, ..., 599] | Status codes of Upstream to be considered unhealthy. | +| unhealthy.error_ratio | number | False | 0.5 | [0, 1] | Error rate threshold to trigger circuit breaker. For example, 0.5 means circuit breaker triggers when error rate reaches 50%. | +| unhealthy.min_request_threshold | integer | False | 10 | >=1 | Minimum number of requests required within the sliding window to trigger circuit breaker. Circuit breaker will only evaluate error rate when request count reaches this threshold. | +| unhealthy.sliding_window_size | integer | False | 300 | [10, 3600] | Size of the sliding window in seconds. The time range used to calculate error rate. | +| unhealthy.permitted_number_of_calls_in_half_open_state | integer | False | 3 | [1, 20] | Number of permitted calls when circuit breaker is in half-open state. Used to test if the service has recovered. | +| healthy.http_statuses | array[integer] | False | [200] | [200, ..., 499] | Status codes of Upstream to be considered healthy. | +| healthy.success_ratio | number | False | 0.6 | [0, 1] | Success rate threshold to close circuit breaker from half-open state. For example, 0.6 means circuit breaker closes when success rate reaches 60%. 
| + ## Enable Plugin -The example below shows how you can configure the Plugin on a specific Route: +### Failure count-based circuit breaking example + +The example below shows how you can configure the Plugin with failure count-based circuit breaking policy on a specific Route: :::note You can fetch the `admin_key` from `config.yaml` and save to an environment variable with the following command: @@ -73,6 +109,7 @@ curl "http://127.0.0.1:9180/apisix/admin/routes/1" \ "plugins": { "api-breaker": { "break_response_code": 502, + "policy": "unhealthy-count", "unhealthy": { "http_statuses": [500, 503], "failures": 3 @@ -95,6 +132,47 @@ curl "http://127.0.0.1:9180/apisix/admin/routes/1" \ In this configuration, a response code of `500` or `503` three times within a certain period of time triggers the unhealthy status of the Upstream service. A response code of `200` restores its healthy status. +### Error ratio-based circuit breaking example + +The example below shows how to enable error ratio-based circuit breaking policy. 
This configuration triggers the circuit breaker when the request count reaches 10 and the error rate reaches 50% within a 5-minute sliding window: + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/2" \ +-H "X-API-KEY: $admin_key" -X PUT -d ' +{ + "plugins": { + "api-breaker": { + "break_response_code": 503, + "break_response_body": "Service temporarily unavailable due to high error rate", + "break_response_headers": [ + {"key": "X-Circuit-Breaker", "value": "open"}, + {"key": "Retry-After", "value": "60"} + ], + "policy": "unhealthy-ratio", + "max_breaker_sec": 60, + "unhealthy": { + "http_statuses": [500, 502, 503, 504], + "error_ratio": 0.5, + "min_request_threshold": 10, + "sliding_window_size": 300, + "permitted_number_of_calls_in_half_open_state": 3 + }, + "healthy": { + "http_statuses": [200, 201, 202], + "success_ratio": 0.6 + } + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "uri": "/api" +}' +``` + ## Example usage Once you have configured the Plugin as shown above, you can test it out by sending a request. diff --git a/docs/zh/latest/plugins/api-breaker.md b/docs/zh/latest/plugins/api-breaker.md index 5183f84763e7..6251350268fc 100644 --- a/docs/zh/latest/plugins/api-breaker.md +++ b/docs/zh/latest/plugins/api-breaker.md @@ -6,7 +6,6 @@ keywords: - API Breaker description: 本文介绍了 Apache APISIX api-breaker 插件的相关操作,你可以使用此插件的 API 熔断机制来保护上游业务服务。 --- -