From 0242f958543361f63b87d59e70669151767901db Mon Sep 17 00:00:00 2001 From: Nelson Boss Date: Fri, 22 May 2026 16:39:13 +0800 Subject: [PATCH 1/2] [improvement](executor) use real elapsed time to compute workload group metrics refresh interval Replace the fixed config-based interval with the actual monotonic time delta between two refreshes when calculating per-second CPU and scan IO rates in WorkloadGroupMetrics, so the rates stay accurate even when the refresh thread is delayed or the configured interval is changed at runtime. Also add a guard against division by zero when two refreshes happen within less than one second, and add unit tests covering: - Real elapsed time rate computation - Sub-second interval safety (no division by zero) - Proportional rate vs interval relationship - Memory metrics correctness - First-refresh boundary behavior --- .../workload_group/workload_group_metrics.cpp | 9 +- .../workload_group/workload_group_metrics.h | 1 + .../workload_group_metrics_test.cpp | 159 ++++++++++++++++++ 3 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 be/test/runtime/workload_group/workload_group_metrics_test.cpp diff --git a/be/src/runtime/workload_group/workload_group_metrics.cpp b/be/src/runtime/workload_group/workload_group_metrics.cpp index 41da5af1e788e8..c951cb7c44b1e7 100644 --- a/be/src/runtime/workload_group/workload_group_metrics.cpp +++ b/be/src/runtime/workload_group/workload_group_metrics.cpp @@ -23,6 +23,7 @@ #include "runtime/workload_group/workload_group.h" #include "runtime/workload_management/io_throttle.h" #include "storage/olap_common.h" +#include "util/time.h" namespace doris { @@ -83,7 +84,13 @@ void WorkloadGroupMetrics::update_remote_scan_io_bytes(uint64_t delta_io_bytes) } void WorkloadGroupMetrics::refresh_metrics() { - int interval_second = config::workload_group_metrics_interval_ms / 1000; + uint64_t current_time_ms = MonotonicMillis(); + uint64_t interval_second = (current_time_ms - _last_refresh_time_ms) / 1000; + _last_refresh_time_ms = current_time_ms; + + if (interval_second == 0) { + return; + } // cpu uint64_t _current_cpu_time_nanos = _cpu_time_nanos.load(); diff --git a/be/src/runtime/workload_group/workload_group_metrics.h b/be/src/runtime/workload_group/workload_group_metrics.h index 67085d8374fa1d..ca04e0929ae2f8 100644 --- a/be/src/runtime/workload_group/workload_group_metrics.h +++ b/be/src/runtime/workload_group/workload_group_metrics.h @@ -73,6 +73,7 @@ class WorkloadGroupMetrics { std::atomic _cpu_time_nanos {0}; std::atomic _last_cpu_time_nanos {0}; + std::atomic _last_refresh_time_ms {0}; std::atomic _per_sec_cpu_time_nanos {0}; // used for system table std::atomic _per_sec_local_scan_bytes {0}; diff --git a/be/test/runtime/workload_group/workload_group_metrics_test.cpp b/be/test/runtime/workload_group/workload_group_metrics_test.cpp new file mode 100644 index 00000000000000..ec10e0a0e31c0b --- /dev/null +++ b/be/test/runtime/workload_group/workload_group_metrics_test.cpp @@ -0,0 +1,159 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "runtime/workload_group/workload_group_metrics.h" + +#include + +#include +#include + +#include "runtime/workload_group/workload_group.h" +#include "util/time.h" + +namespace doris { + +class WorkloadGroupMetricsTest : public testing::Test { +protected: + void SetUp() override { + WorkloadGroupInfo wg_info {.id = 1, .name = "test_wg"}; + _wg = std::make_shared(wg_info); + _metrics = std::make_unique(_wg.get()); + } + + void TearDown() override { + _metrics.reset(); + _wg.reset(); + } + + std::shared_ptr _wg; + std::unique_ptr _metrics; +}; + +// Test that refresh_metrics uses real elapsed time to compute per-second rates. +// After sleeping for a known interval, the per-second CPU rate should reflect +// the actual elapsed time rather than the config-based fixed interval. +TEST_F(WorkloadGroupMetricsTest, refresh_uses_real_elapsed_time) { + // First call to refresh_metrics to initialize _last_refresh_time_ms + _metrics->refresh_metrics(); + + // Add known CPU time: 2,000,000,000 nanos = 2 seconds of CPU + const uint64_t cpu_delta_nanos = 2000000000ULL; + _metrics->update_cpu_time_nanos(cpu_delta_nanos); + + // Sleep for ~2 seconds so the real interval is ~2 seconds + std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + + // Refresh metrics — should compute rate based on real ~2 second interval + _metrics->refresh_metrics(); + + // Expected: 2,000,000,000 nanos / 2 seconds = 1,000,000,000 nanos per second + // Allow tolerance for timing imprecision (the sleep may not be exactly 2s) + uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second(); + // With a 2 second interval and 2B nanos of CPU time added, + // the rate should be approximately 1B nanos/sec. + // Allow 50% tolerance due to scheduling variance. + EXPECT_GT(cpu_per_sec, 500000000ULL) + << "CPU per-second rate too low: " << cpu_per_sec + << " (expected ~1,000,000,000 with 2s interval)"; + EXPECT_LT(cpu_per_sec, 2500000000ULL) + << "CPU per-second rate too high: " << cpu_per_sec + << " (expected ~1,000,000,000 with 2s interval)"; +} + +// Test that when interval is less than 1 second, refresh_metrics does not +// cause division by zero and preserves previous rates. +TEST_F(WorkloadGroupMetricsTest, refresh_skips_when_interval_less_than_one_second) { + // First call to initialize _last_refresh_time_ms + _metrics->refresh_metrics(); + + // Add some CPU time + _metrics->update_cpu_time_nanos(1000000000ULL); // 1B nanos + + // Call refresh immediately (< 1 second elapsed) — should not crash + // and should not update per-second rates (interval_second == 0 → early return) + _metrics->refresh_metrics(); + + // Per-second rate should still be 0 (from the initial state) + // because the sub-second refresh skips the rate calculation + uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second(); + EXPECT_EQ(cpu_per_sec, 0) + << "CPU per-second rate should remain unchanged when interval < 1s"; +} + +// Test that different real intervals produce proportionally different rates. +// A shorter interval with the same delta should yield a higher per-second rate. +TEST_F(WorkloadGroupMetricsTest, shorter_interval_yields_higher_rate) { + // --- First measurement: 1 second interval --- + _metrics->refresh_metrics(); + _metrics->update_cpu_time_nanos(1000000000ULL); // 1B nanos + + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + _metrics->refresh_metrics(); + + uint64_t rate_1s = _metrics->get_cpu_time_nanos_per_second(); + + // --- Second measurement: add same delta, wait 2 seconds --- + _metrics->update_cpu_time_nanos(1000000000ULL); // another 1B nanos + + std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + _metrics->refresh_metrics(); + + uint64_t rate_2s = _metrics->get_cpu_time_nanos_per_second(); + + // With the same absolute delta (1B nanos) but double the interval, + // the per-second rate should be roughly half. + // rate_1s ~ 1B/1s = 1B + // rate_2s ~ 1B/2s = 500M + // Allow generous tolerance for timing jitter + EXPECT_GT(rate_1s, rate_2s) + << "1s interval rate (" << rate_1s << ") should be higher than 2s interval rate (" + << rate_2s << ")"; +} + +// Test that memory metrics are correctly reported +TEST_F(WorkloadGroupMetricsTest, memory_used_reported_correctly) { + const int64_t mem_used = 1024L * 1024 * 512; // 512 MB + _metrics->update_memory_used_bytes(mem_used); + _metrics->refresh_metrics(); + + // Need to wait > 1 second for refresh to take effect + std::this_thread::sleep_for(std::chrono::milliseconds(1100)); + _metrics->refresh_metrics(); + + EXPECT_EQ(_metrics->get_memory_used(), mem_used); +} + +// Test that the first refresh (from _last_refresh_time_ms == 0) does not produce +// unreasonable rates since the interval is very large (time since boot). +TEST_F(WorkloadGroupMetricsTest, first_refresh_produces_near_zero_rate) { + // Add some CPU time before the first refresh + _metrics->update_cpu_time_nanos(5000000000ULL); // 5B nanos + + // First refresh: interval = current_time_ms / 1000 (time since boot in seconds) + // For a system with uptime > 5 seconds, rate = 5B / uptime_seconds + // This should be small relative to the delta + _metrics->refresh_metrics(); + + uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second(); + // With system uptime of at least 60 seconds (reasonable assumption), + // rate = 5B / 60+ < 84M nanos/sec + EXPECT_LT(cpu_per_sec, 1000000000ULL) + << "First refresh rate should be modest since interval is system uptime"; +} + +} // namespace doris From c0a92b48f61913d9d23140fe22b4a50a27f3a531 Mon Sep 17 00:00:00 2001 From: Nelson Boss Date: Mon, 25 May 2026 18:10:46 +0800 Subject: [PATCH 2/2] [fix](test) fix workload_group_metrics_test ASAN failure Use WorkloadGroup's built-in metrics instead of creating a duplicate WorkloadGroupMetrics instance, which caused metric entity lifecycle issues under ASAN. Also use unique IDs per test case to avoid entity conflicts, and reduce sleep times to prevent CI timeout in ASAN mode. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../workload_group_metrics_test.cpp | 35 +++++++++---------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/be/test/runtime/workload_group/workload_group_metrics_test.cpp b/be/test/runtime/workload_group/workload_group_metrics_test.cpp index 90996c0e68ead0..9f163dc3a29de7 100644 --- a/be/test/runtime/workload_group/workload_group_metrics_test.cpp +++ b/be/test/runtime/workload_group/workload_group_metrics_test.cpp @@ -19,6 +19,7 @@ #include +#include #include #include @@ -30,9 +31,12 @@ namespace doris { class WorkloadGroupMetricsTest : public testing::Test { protected: void SetUp() override { - WorkloadGroupInfo wg_info {.id = 1, .name = "test_wg"}; + // Use a unique id for each test instance to avoid metric entity conflicts + static std::atomic next_id {1}; + uint64_t id = next_id.fetch_add(1); + WorkloadGroupInfo wg_info {.id = id, .name = "test_wg_" + std::to_string(id)}; _wg = std::make_shared(wg_info); - _metrics = std::make_unique(_wg.get()); + _metrics = _wg->get_metrics(); } void TearDown() override { @@ -41,7 +45,7 @@ class WorkloadGroupMetricsTest : public testing::Test { } std::shared_ptr _wg; - std::unique_ptr _metrics; + std::shared_ptr _metrics; }; // Test that refresh_metrics uses real elapsed time to compute per-second rates. @@ -55,22 +59,17 @@ TEST_F(WorkloadGroupMetricsTest, refresh_uses_real_elapsed_time) { const uint64_t cpu_delta_nanos = 2000000000ULL; _metrics->update_cpu_time_nanos(cpu_delta_nanos); - // Sleep for ~2 seconds so the real interval is ~2 seconds - std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + // Sleep for ~1.1 seconds so the real interval is ~1 second + std::this_thread::sleep_for(std::chrono::milliseconds(1100)); - // Refresh metrics — should compute rate based on real ~2 second interval + // Refresh metrics — should compute rate based on real ~1 second interval _metrics->refresh_metrics(); - // Expected: 2,000,000,000 nanos / 2 seconds = 1,000,000,000 nanos per second - // Allow tolerance for timing imprecision (the sleep may not be exactly 2s) + // Expected: 2,000,000,000 nanos / 1 second = ~2,000,000,000 nanos per second + // Allow generous tolerance for timing imprecision in CI uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second(); - // With a 2 second interval and 2B nanos of CPU time added, - // the rate should be approximately 1B nanos/sec. - // Allow 50% tolerance due to scheduling variance. - EXPECT_GT(cpu_per_sec, 500000000ULL) << "CPU per-second rate too low: " << cpu_per_sec - << " (expected ~1,000,000,000 with 2s interval)"; - EXPECT_LT(cpu_per_sec, 2500000000ULL) << "CPU per-second rate too high: " << cpu_per_sec - << " (expected ~1,000,000,000 with 2s interval)"; + EXPECT_GT(cpu_per_sec, 500000000ULL) << "CPU per-second rate too low: " << cpu_per_sec; + EXPECT_LT(cpu_per_sec, 4000000000ULL) << "CPU per-second rate too high: " << cpu_per_sec; } // Test that when interval is less than 1 second, refresh_metrics does not @@ -99,7 +98,7 @@ TEST_F(WorkloadGroupMetricsTest, shorter_interval_yields_higher_rate) { _metrics->refresh_metrics(); _metrics->update_cpu_time_nanos(1000000000ULL); // 1B nanos - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + std::this_thread::sleep_for(std::chrono::milliseconds(1100)); _metrics->refresh_metrics(); uint64_t rate_1s = _metrics->get_cpu_time_nanos_per_second(); @@ -107,15 +106,13 @@ TEST_F(WorkloadGroupMetricsTest, shorter_interval_yields_higher_rate) { // --- Second measurement: add same delta, wait 2 seconds --- _metrics->update_cpu_time_nanos(1000000000ULL); // another 1B nanos - std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + std::this_thread::sleep_for(std::chrono::milliseconds(2100)); _metrics->refresh_metrics(); uint64_t rate_2s = _metrics->get_cpu_time_nanos_per_second(); // With the same absolute delta (1B nanos) but double the interval, // the per-second rate should be roughly half. - // rate_1s ~ 1B/1s = 1B - // rate_2s ~ 1B/2s = 500M // Allow generous tolerance for timing jitter EXPECT_GT(rate_1s, rate_2s) << "1s interval rate (" << rate_1s << ") should be higher than 2s interval rate (" << rate_2s << ")";