diff --git a/.changesets/feat_bnjjj_fix_3767.md b/.changesets/feat_bnjjj_fix_3767.md new file mode 100644 index 0000000000..a79b1e059d --- /dev/null +++ b/.changesets/feat_bnjjj_fix_3767.md @@ -0,0 +1,42 @@ +### feat(telemetry): add metrics for query plan warmup and schema load ([Issue #3767](https://github.com/apollographql/router/issues/3767)) + +It adds histogram metrics for `apollo_router_query_planning_warmup_duration` and `apollo_router_schema_load_duration`. + +Example in Prometheus: + +``` +# HELP apollo_router_query_planning_warmup_duration apollo_router_query_planning_warmup_duration +# TYPE apollo_router_query_planning_warmup_duration histogram +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.05"} 1 +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.1"} 1 +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.25"} 1 +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.5"} 1 +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="1"} 1 +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="2.5"} 1 +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="5"} 1 +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="10"} 1 +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="20"} 1 +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="100"} 1 +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="1000"} 1 +apollo_router_query_planning_warmup_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="+Inf"} 1 +apollo_router_query_planning_warmup_duration_sum{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version=""} 0.022390619 +apollo_router_query_planning_warmup_duration_count{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version=""} 1 +# HELP apollo_router_schema_load_duration apollo_router_schema_load_duration +# TYPE apollo_router_schema_load_duration histogram +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.05"} 8 +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.1"} 8 +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.25"} 8 +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="0.5"} 8 +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="1"} 8 +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="2.5"} 8 +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="5"} 8 +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="10"} 8 +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="20"} 8 +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="100"} 8 +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="1000"} 8 +apollo_router_schema_load_duration_bucket{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version="",le="+Inf"} 8 +apollo_router_schema_load_duration_sum{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version=""} 0.023486205999999996 +apollo_router_schema_load_duration_count{service_name="apollo-router",otel_scope_name="apollo/router",otel_scope_version=""} 8 +``` + +By [@bnjjj](https://github.com/bnjjj) in https://github.com/apollographql/router/pull/3807 diff --git a/apollo-router/src/plugins/telemetry/metrics/filter.rs b/apollo-router/src/plugins/telemetry/metrics/filter.rs index 919a073cd7..929a8a9d17 100644 --- a/apollo-router/src/plugins/telemetry/metrics/filter.rs +++ b/apollo-router/src/plugins/telemetry/metrics/filter.rs @@ -37,8 +37,10 @@ impl FilterMeterProvider { FilterMeterProvider::builder() .delegate(delegate) .allow( - Regex::new(r"apollo\.(graphos\.cloud|router\.(operations?|config))(\..*|$)") - .expect("regex should have been valid"), + Regex::new( + r"apollo\.(graphos\.cloud|router\.(operations?|config|schema|query))(\..*|$)", + ) + .expect("regex should have been valid"), ) .build() } diff --git a/apollo-router/src/plugins/telemetry/utils.rs b/apollo-router/src/plugins/telemetry/utils.rs index f845192c2d..42b57f122c 100644 --- a/apollo-router/src/plugins/telemetry/utils.rs +++ b/apollo-router/src/plugins/telemetry/utils.rs @@ -1,3 +1,6 @@ +use std::time::Duration; +use std::time::Instant; + use tracing_core::field::Value; pub(crate) trait TracingUtils { @@ -13,3 +16,38 @@ impl TracingUtils for bool { } } } + +/// Timer implementing Drop to automatically compute the duration between the moment it has been created until it's dropped +///```ignore +/// Timer::new(|duration| { +/// tracing::info!(histogram.apollo_router_test = duration.as_secs_f64()); +/// }) +/// ``` +pub(crate) struct Timer +where + F: FnOnce(Duration), +{ + start: Instant, + f: Option, +} + +impl Timer +where + F: FnOnce(Duration), +{ + pub(crate) fn new(f: F) -> Self { + Self { + start: Instant::now(), + f: f.into(), + } + } +} + +impl Drop for Timer +where + F: FnOnce(Duration), +{ + fn drop(&mut self) { + self.f.take().expect("f must exist")(self.start.elapsed()) + } +} diff --git a/apollo-router/src/query_planner/bridge_query_planner.rs b/apollo-router/src/query_planner/bridge_query_planner.rs index 4a677aaad1..a1008498c0 100644 --- a/apollo-router/src/query_planner/bridge_query_planner.rs +++ b/apollo-router/src/query_planner/bridge_query_planner.rs @@ -476,7 +476,7 @@ impl Service for BridgeQueryPlanner { ) .await; let duration = start.elapsed().as_secs_f64(); - tracing::info!(histogram.apollo_router_query_planning_time = duration,); + tracing::info!(histogram.apollo_router_query_planning_time = duration); match res { Ok(query_planner_content) => Ok(QueryPlannerResponse::builder() diff --git a/apollo-router/src/query_planner/caching_query_planner.rs b/apollo-router/src/query_planner/caching_query_planner.rs index 034f428f32..52876b6e0e 100644 --- a/apollo-router/src/query_planner/caching_query_planner.rs +++ b/apollo-router/src/query_planner/caching_query_planner.rs @@ -22,6 +22,7 @@ use crate::error::CacheResolverError; use crate::error::QueryPlannerError; use crate::plugins::authorization::AuthorizationPlugin; use crate::plugins::authorization::CacheKeyMetadata; +use crate::plugins::telemetry::utils::Timer; use crate::query_planner::labeler::add_defer_labels; use crate::query_planner::BridgeQueryPlanner; use crate::query_planner::QueryPlanResult; @@ -107,6 +108,11 @@ where query_analysis: &QueryAnalysisLayer, cache_keys: Vec, ) { + let _timer = Timer::new(|duration| { + ::tracing::info!( + histogram.apollo.router.query.planning.warmup.duration = duration.as_secs_f64() + ); + }); let schema_id = self.schema.schema_id.clone(); let mut service = ServiceBuilder::new().service( diff --git a/apollo-router/src/spec/schema.rs b/apollo-router/src/spec/schema.rs index 744e3d2cee..1703484b9d 100644 --- a/apollo-router/src/spec/schema.rs +++ b/apollo-router/src/spec/schema.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; +use std::time::Instant; use apollo_compiler::diagnostics::ApolloDiagnostic; use apollo_compiler::ApolloCompiler; @@ -61,6 +62,7 @@ impl Schema { } pub(crate) fn parse(sdl: &str, configuration: &Configuration) -> Result { + let start = Instant::now(); let mut compiler = ApolloCompiler::new(); let id = compiler.add_type_system(sdl, "schema.graphql"); @@ -129,6 +131,9 @@ impl Schema { let mut hasher = Sha256::new(); hasher.update(sdl.as_bytes()); let schema_id = Some(format!("{:x}", hasher.finalize())); + tracing::info!( + histogram.apollo.router.schema.load.duration = start.elapsed().as_secs_f64() + ); Ok(Schema { raw_sdl: Arc::new(sdl.to_string()), diff --git a/docs/source/configuration/metrics.mdx b/docs/source/configuration/metrics.mdx index a250824e5b..f85ff5d428 100644 --- a/docs/source/configuration/metrics.mdx +++ b/docs/source/configuration/metrics.mdx @@ -103,6 +103,8 @@ The coprocessor operations metric has the following attributes: - `apollo_router_processing_time` - Time spent processing a request (outside of waiting for external or subgraph requests) in seconds. - `apollo_router_query_planning_time` - Time spent planning queries in seconds. +- `apollo_router_query_planning_warmup_duration` - Time spent planning queries in seconds. +- `apollo_router_schema_load_duration` - Time spent loading the schema in seconds. #### Uplink @@ -121,6 +123,7 @@ The coprocessor operations metric has the following attributes: Note that the initial call to uplink during router startup will not be reflected in metrics. #### Subscription + - `apollo_router_opened_subscriptions` - Number of different opened subscriptions (not the number of clients with an opened subscriptions in case it's deduplicated) - `apollo_router_deduplicated_subscriptions_total` - Number of subscriptions that has been deduplicated - `apollo_router_skipped_event_count` - Number of subscription events that has been skipped because too many events have been received from the subgraph but not yet sent to the client.