feat(server): Add Prometheus metrics. Closes #4751 #4952

Merged · 2 commits · Jan 27, 2021

1 change: 1 addition & 0 deletions go.mod
@@ -30,6 +30,7 @@ require (
	github.com/golang/protobuf v1.4.3
	github.com/gorilla/websocket v1.4.2
	github.com/grpc-ecosystem/go-grpc-middleware v1.1.0
+	github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0
	github.com/grpc-ecosystem/grpc-gateway v1.16.0
	github.com/imkira/go-interpol v1.1.0 // indirect
	github.com/mattn/goreman v0.3.7
1 change: 1 addition & 0 deletions go.sum
@@ -532,6 +532,7 @@ github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmg
github.com/grpc-ecosystem/go-grpc-middleware v1.0.1-0.20190118093823-f849b5445de4/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs=
github.com/grpc-ecosystem/go-grpc-middleware v1.1.0 h1:THDBEeQ9xZ8JEaCLyLQqXMMdRqNr0QAUJTIkQAUtFjg=
github.com/grpc-ecosystem/go-grpc-middleware v1.1.0/go.mod h1:f5nM7jw/oeRSadq3xCzHAvxcr8HZnzsqU6ILg/0NiiE=
+github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 h1:Ovs26xHkKqVztRpIrF/92BcuyuQ/YW4NSIpoGtfXNho=
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=
github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY=
github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY=
@@ -9,4 +9,4 @@ data:
  scrape_configs:
  - job_name: 'argo'
    static_configs:
-   - targets: ['workflow-controller-metrics:9090']
+   - targets: ['workflow-controller-metrics:9090', 'argo-server:2746']
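With this change, the quick-start Prometheus instance scrapes the Argo Server's new /metrics endpoint on port 2746 alongside the workflow controller. Purely as an illustration (not part of this PR): once the handling-time histogram enabled below is being scraped, server-side p95 RPC latency can be read with a PromQL query such as histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket[5m])) by (le)). A standalone sketch of the server-side wiring follows the argoserver.go diff below.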
10 changes: 10 additions & 0 deletions server/apiserver/argoserver.go
@@ -9,7 +9,9 @@ import (

	grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware"
	grpc_logrus "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus"
+	grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
	"github.com/grpc-ecosystem/grpc-gateway/runtime"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
	log "github.com/sirupsen/logrus"
	"github.com/soheilhy/cmux"
	"golang.org/x/net/context"
@@ -212,6 +214,10 @@ func (as *argoServer) Run(ctx context.Context, port int, browserOpenFunc func(st

func (as *argoServer) newGRPCServer(instanceIDService instanceid.Service, offloadNodeStatusRepo sqldb.OffloadNodeStatusRepo, wfArchive sqldb.WorkflowArchive, eventServer *event.Controller, links []*v1alpha1.Link) *grpc.Server {
	serverLog := log.NewEntry(log.StandardLogger())
+
+	// "Prometheus histograms are a great way to measure latency distributions of your RPCs. However, since it is bad practice to have metrics of high cardinality the latency monitoring metrics are disabled by default. To enable them please call the following in your server initialization code:"
+	grpc_prometheus.EnableHandlingTimeHistogram()
+
	sOpts := []grpc.ServerOption{
		// Set both the send and receive the bytes limit to be 100MB
		// The proper way to achieve high performance is to have pagination
@@ -220,12 +226,14 @@
		grpc.MaxSendMsgSize(MaxGRPCMessageSize),
		grpc.ConnectionTimeout(300 * time.Second),
		grpc.UnaryInterceptor(grpc_middleware.ChainUnaryServer(
+			grpc_prometheus.UnaryServerInterceptor,
			grpc_logrus.UnaryServerInterceptor(serverLog),
			grpcutil.PanicLoggerUnaryServerInterceptor(serverLog),
			grpcutil.ErrorTranslationUnaryServerInterceptor,
			as.gatekeeper.UnaryServerInterceptor(),
		)),
		grpc.StreamInterceptor(grpc_middleware.ChainStreamServer(
+			grpc_prometheus.StreamServerInterceptor,
			grpc_logrus.StreamServerInterceptor(serverLog),
			grpcutil.PanicLoggerStreamServerInterceptor(serverLog),
			grpcutil.ErrorTranslationStreamServerInterceptor,
@@ -244,6 +252,7 @@ func (as *argoServer) newGRPCServer(instanceIDService instanceid.Service, offloa
	cronworkflowpkg.RegisterCronWorkflowServiceServer(grpcServer, cronworkflow.NewCronWorkflowServer(instanceIDService))
	workflowarchivepkg.RegisterArchivedWorkflowServiceServer(grpcServer, workflowarchive.NewWorkflowArchiveServer(wfArchive))
	clusterwftemplatepkg.RegisterClusterWorkflowTemplateServiceServer(grpcServer, clusterworkflowtemplate.NewClusterWorkflowTemplateServer(instanceIDService))
+	grpc_prometheus.Register(grpcServer)
	return grpcServer
}

@@ -296,6 +305,7 @@ func (as *argoServer) newHTTPServer(ctx context.Context, port int, artifactServe
mux.HandleFunc("/artifacts-by-uid/", artifactServer.GetArtifactByUID)
mux.HandleFunc("/oauth2/redirect", as.oAuth2Service.HandleRedirect)
mux.HandleFunc("/oauth2/callback", as.oAuth2Service.HandleCallback)
mux.Handle("/metrics", promhttp.Handler())
// we only enable HTST if we are secure mode, otherwise you would never be able access the UI
mux.HandleFunc("/", static.NewFilesServer(as.baseHRef, as.tlsConfig != nil && as.hsts, as.xframeOptions).ServerFiles)
return &httpServer
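For reference, the wiring this diff adds can be reproduced in isolation. The following is a minimal, hypothetical sketch, not Argo code: the package name, ports, bare (unchained) interceptor options, and the absence of TLS and auth middleware are all assumptions for illustration; the PR itself chains the interceptors through grpc_middleware and mounts the handler on the server's existing mux.

package main

import (
	"log"
	"net"
	"net/http"

	grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"google.golang.org/grpc"
)

func main() {
	// Latency histograms are disabled by default because of their
	// cardinality; they must be opted into explicitly (as in this PR).
	grpc_prometheus.EnableHandlingTimeHistogram()

	grpcServer := grpc.NewServer(
		// Count and time every unary and streaming RPC.
		grpc.UnaryInterceptor(grpc_prometheus.UnaryServerInterceptor),
		grpc.StreamInterceptor(grpc_prometheus.StreamServerInterceptor),
	)
	// ... Register*ServiceServer(grpcServer, ...) calls go here ...

	// Pre-initialize per-method metrics for all registered services.
	grpc_prometheus.Register(grpcServer)

	lis, err := net.Listen("tcp", ":9090") // illustrative port
	if err != nil {
		log.Fatal(err)
	}
	go func() { log.Fatal(grpcServer.Serve(lis)) }()

	// Serve the default Prometheus registry, which the interceptors
	// above write to, on /metrics (2746 mirrors the argo-server port).
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":2746", nil))
}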
17 changes: 17 additions & 0 deletions test/e2e/argo_server_test.go
@@ -92,6 +92,23 @@ func (s *ArgoServerSuite) TestVersion() {
	})
}

+func (s *ArgoServerSuite) TestMetrics() {
+	s.e().GET("/metrics").
+		Expect().
+		Status(200).
+		Body().
+		// https://blog.netsil.com/the-4-golden-signals-of-api-health-and-performance-in-cloud-native-applications-a6e87526e74
+		// Latency: The time it takes to service a request, with a focus on distinguishing between the latency of successful requests and the latency of failed requests
+		Contains(`grpc_server_handling_seconds_bucket`).
+		// Traffic: A measure of how much demand is being placed on the service. This is measured using a high-level service-specific metric, like HTTP requests per second in the case of an HTTP REST API.
+		Contains(`promhttp_metric_handler_requests_in_flight`).
+		// Errors: The rate of requests that fail. The failures can be explicit (e.g., HTTP 500 errors) or implicit (e.g., an HTTP 200 OK response with a response body having too few items).
+		Contains(`promhttp_metric_handler_requests_total{code="500"}`).
+		// Saturation: How “full” is the service. This is a measure of the system utilization, emphasizing the resources that are most constrained (e.g., memory, I/O or CPU). Services degrade in performance as they approach high saturation.
+		Contains(`process_cpu_seconds_total`).
+		Contains(`process_resident_memory_bytes`)
+}
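(As a manual check, assuming a local quick-start setup, the same endpoint this test hits can be fetched with curl http://localhost:2746/metrics, switching to https and --insecure if the server runs with TLS; each assertion above pairs one exported metric with one of the four golden signals quoted in the comments.)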

func (s *ArgoServerSuite) TestSubmitWorkflowTemplateFromGithubWebhook() {
s.bearerToken = ""
