diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 6201365ac4..d8dbafcc07 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -43,6 +43,17 @@ Queries that passed 64 bit integers for Float values would (incorrectly) fail to By [@o0Ignition0o](https://github.com/o0Ignition0o) in https://github.com/apollographql/router/pull/1951 +### prometheus: make sure http_requests_error_total and http_requests_total are incremented. ([PR #1953](https://github.com/apollographql/router/pull/1953)) + +`http_requests_error_total` did only increment for requests that would be an `INTERNAL_SERVER_ERROR` in the router (the service stack returning a BoxError). +What this means is that validation errors would not increment this counter. + +`http_requests_total` would only increment for successful requests, while the prometheus documentation mentions this key should be incremented regardless of whether the request succeeded or not. + +This PR makes sure we always increment `http_requests_total`, and we increment `http_requests_error_total` when the StatusCode is not 2XX. + +By [@o0Ignition0o](https://github.com/o0Ignition0o) in https://github.com/apollographql/router/pull/1953 + ### Set no_delay and keepalive on subgraph requests [Issue #1905](https://github.com/apollographql/router/issues/1905)) It was incorrectly removed in a previous pull request. diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 9c4678d6c6..1612f6b9fd 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -765,11 +765,12 @@ impl Telemetry { ); metric_attrs.extend(attributes.into_iter().map(|(k, v)| KeyValue::new(k, v))); - metrics.http_requests_total.add(1, &metric_attrs); - } else { - metrics.http_requests_total.add(1, &metric_attrs); } + if !parts.status.is_success() { + metric_attrs.push(KeyValue::new("error", parts.status.to_string())); + metrics.http_requests_error_total.add(1, &metric_attrs); + } let response = http::Response::from_parts( parts, once(ready(first_response.unwrap_or_default())) @@ -779,12 +780,12 @@ impl Telemetry { Ok(SupergraphResponse { context, response }) } - Err(err) => { - metrics.http_requests_error_total.add(1, &metric_attrs); - - Err(err) - } + Err(err) => Err(err), }; + + // http_requests_total - the total number of HTTP requests received + metrics.http_requests_total.add(1, &metric_attrs); + metrics .http_requests_duration .record(request_duration.as_secs_f64(), &metric_attrs); @@ -907,14 +908,14 @@ impl Telemetry { let errors = merge_config!(errors); AttributesForwardConf { - insert: (!insert.is_empty()).then(|| insert), + insert: (!insert.is_empty()).then_some(insert), request: (request.header.is_some() || request.body.is_some()) - .then(|| request), + .then_some(request), response: (response.header.is_some() || response.body.is_some()) - .then(|| response), + .then_some(response), errors: (errors.extensions.is_some() || errors.include_messages) - .then(|| errors), - context: (!context.is_empty()).then(|| context), + .then_some(errors), + context: (!context.is_empty()).then_some(context), } }), ) @@ -1439,6 +1440,19 @@ mod tests { .unwrap()) }); + let mut mock_bad_request_service = MockSupergraphService::new(); + mock_bad_request_service + .expect_call() + .times(1) + .returning(move |req: SupergraphRequest| { + Ok(SupergraphResponse::fake_builder() + .context(req.context) + .status_code(StatusCode::BAD_REQUEST) + .data(json!({"errors": [{"message": "nope"}]})) + .build() + .unwrap()) + }); + let mut mock_subgraph_service = MockSubgraphService::new(); mock_subgraph_service .expect_call() @@ -1589,6 +1603,21 @@ mod tests { .await .unwrap(); + let mut bad_request_supergraph_service = + dyn_plugin.supergraph_service(BoxService::new(mock_bad_request_service)); + let router_req = SupergraphRequest::fake_builder().header("test", "my_value_set"); + + let _router_response = bad_request_supergraph_service + .ready() + .await + .unwrap() + .call(router_req.build().unwrap()) + .await + .unwrap() + .next_response() + .await + .unwrap(); + let mut subgraph_service = dyn_plugin.subgraph_service("my_subgraph_name", BoxService::new(mock_subgraph_service)); let subgraph_req = SubgraphRequest::fake_builder() @@ -1683,5 +1712,8 @@ mod tests { assert!(prom_metrics.contains(r#"http_request_duration_seconds_count{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header"}"#)); assert!(prom_metrics.contains(r#"http_request_duration_seconds_sum{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header"}"#)); assert!(prom_metrics.contains(r#"http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",le="1"}"#)); + assert!(prom_metrics.contains(r#"http_requests_total{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value"} 1"#)); + assert!(prom_metrics.contains(r#"http_requests_total{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400"} 1"#)); + assert!(prom_metrics.contains(r#"http_requests_error_total{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400"} 1"#)) } }