From 6bdd4cecd9643a579df17221dcac2902b1c47664 Mon Sep 17 00:00:00 2001 From: Brian Olsen Date: Thu, 14 May 2026 12:12:56 -0600 Subject: [PATCH] nexthop health status don't overflow fail count --- src/proxy/http/remap/NextHopHealthStatus.cc | 24 +++++-- .../unit-tests/test_NextHopRoundRobin.cc | 69 +++++++++++++++++++ 2 files changed, 86 insertions(+), 7 deletions(-) diff --git a/src/proxy/http/remap/NextHopHealthStatus.cc b/src/proxy/http/remap/NextHopHealthStatus.cc index 6c86c26bf30..35e42aa6115 100644 --- a/src/proxy/http/remap/NextHopHealthStatus.cc +++ b/src/proxy/http/remap/NextHopHealthStatus.cc @@ -120,8 +120,12 @@ NextHopHealthStatus::markNextHop(TSHttpTxn txn, const char *hostname, const int new_fail_count = h->failCount = 1; } } else if (result.retry == true) { - h->failedAt = _now; - new_fail_count = h->failCount += 1; + h->failedAt = _now; + if (h->failCount < UINT32_MAX) { + new_fail_count = ++h->failCount; + } else { + new_fail_count = UINT32_MAX; + } } } // end lock guard @@ -132,19 +136,25 @@ NextHopHealthStatus::markNextHop(TSHttpTxn txn, const char *hostname, const int { // lock guard std::lock_guard lock(h->_mutex); if ((h->failedAt.load() + retry_time) < _now) { + h->failedAt = _now; new_fail_count = h->failCount = 1; - h->failedAt = _now; } else { - new_fail_count = h->failCount += 1; + if (h->failCount < UINT32_MAX) { + new_fail_count = ++h->failCount; + } else { + new_fail_count = UINT32_MAX; + } } } // end of lock_guard - NH_Dbg(NH_DBG_CTL, "[%" PRId64 "] Parent fail count increased to %d for %s", sm_id, new_fail_count, h->hostname.c_str()); + NH_Dbg(NH_DBG_CTL, "[%" PRId64 "] Parent fail count increased to %" PRIu32 " for %s", sm_id, new_fail_count, + h->hostname.c_str()); } if (new_fail_count >= fail_threshold) { h->set_unavailable(); - NH_Note("[%" PRId64 "] Failure threshold met failcount:%d >= threshold:%" PRId64 ", http parent proxy %s marked down", sm_id, - new_fail_count, fail_threshold, h->hostname.c_str()); + NH_Note("[%" PRId64 "] Failure threshold met failcount:%" PRIu32 " >= threshold:%" PRId64 + ", http parent proxy %s marked down", + sm_id, new_fail_count, fail_threshold, h->hostname.c_str()); NH_Dbg(NH_DBG_CTL, "[%" PRId64 "] NextHop %s marked unavailable, h->available=%s", sm_id, h->hostname.c_str(), (h->available.load()) ? "true" : "false"); } diff --git a/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc b/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc index 908dbc35e5b..52ae3b511f0 100644 --- a/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc +++ b/src/proxy/http/remap/unit-tests/test_NextHopRoundRobin.cc @@ -381,3 +381,72 @@ SCENARIO("Testing NextHopRoundRobin class, using policy 'latched'", "[NextHopRou } } } + +SCENARIO("Testing NextHopHealthStatus failCount overflow saturation", "[NextHopHealthStatus]") +{ + // No thread setup, forbid use of thread local allocators. + cmd_disable_pfreelist = true; + http_init(); + + GIVEN("Loading the round-robin-tests.yaml config for overflow test.") + { + NextHopStrategyFactory nhf(TS_SRC_DIR "/round-robin-tests.yaml"); + NextHopSelectionStrategy *const strategy = nhf.strategyInstance("rr-strict-exhaust-ring"); + + REQUIRE(nhf.strategies_loaded == true); + REQUIRE(strategy != nullptr); + + WHEN("failCount is near UINT32_MAX and markNextHop is called") + { + HttpSM sm; + ParentResult *result = &sm.t_state.parent_result; + TSHttpTxn txnp = reinterpret_cast(&sm); + + // Select a host so result is populated. + build_request(20001, &sm, nullptr, "rabbit.net", nullptr); + strategy->findNextHop(txnp); + REQUIRE(result->result == ParentResultType::SPECIFIED); + REQUIRE(result->hostname != nullptr); + + // Get the HostRecord for the selected host. + std::shared_ptr host_rec; + for (auto &group : strategy->host_groups) { + for (auto &h : group) { + if (h->hostname == result->hostname) { + host_rec = h; + break; + } + } + if (host_rec) { + break; + } + } + REQUIRE(host_rec != nullptr); + + // Set failCount to UINT32_MAX - 1 to test saturation. + host_rec->failCount = UINT32_MAX - 1; + host_rec->failedAt = time(nullptr); + + // Set fail_threshold very high so set_unavailable doesn't interfere. + extern char _my_txn_conf[]; + auto *oride = reinterpret_cast(_my_txn_conf); + oride->parent_fail_threshold = static_cast(UINT32_MAX) + 1; + oride->parent_retry_time = 30; // large retry window so we stay in the increment path + + THEN("failCount saturates at UINT32_MAX and does not wrap to 0") + { + // First markNextHop: UINT32_MAX-1 -> UINT32_MAX + strategy->markNextHop(txnp, result->hostname, result->port, NHCmd::MARK_DOWN); + CHECK(host_rec->failCount.load() == UINT32_MAX); + + // Second markNextHop: should stay at UINT32_MAX (saturated) + strategy->markNextHop(txnp, result->hostname, result->port, NHCmd::MARK_DOWN); + CHECK(host_rec->failCount.load() == UINT32_MAX); + + // Verify host is still available (threshold not met since threshold > UINT32_MAX) + CHECK(host_rec->available.load() == true); + } + br_destroy(sm); + } + } +}