diff --git a/go.mod b/go.mod index 901317de..0833b49b 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( cloud.google.com/go/compute/metadata v0.9.0 cloud.google.com/go/container v1.49.0 cloud.google.com/go/iam v1.9.0 + cloud.google.com/go/monitoring v1.24.3 cloud.google.com/go/resourcemanager v1.13.0 cloud.google.com/go/serviceusage v1.14.0 cloud.google.com/go/storage v1.62.1 @@ -56,7 +57,6 @@ require ( cloud.google.com/go/auth v0.19.0 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect cloud.google.com/go/longrunning v0.9.0 // indirect - cloud.google.com/go/monitoring v1.24.3 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.31.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.55.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.55.0 // indirect diff --git a/monitoring/dashboards/README.md b/monitoring/dashboards/README.md new file mode 100644 index 00000000..b4340b1e --- /dev/null +++ b/monitoring/dashboards/README.md @@ -0,0 +1,23 @@ +# Cloud Monitoring dashboards + +Google Cloud Monitoring dashboard definitions for ATE. They turn the raw +`prometheus.googleapis.com/...` metrics that ATE emits into readable +per-method latency / throughput / error views. 
+ +| File | Shows | +|------|-------| +| `ate-grpc-dashboard.json` | ateapi & atelet gRPC p99 latency, request rate, and error rate, by method | + +## Applying + +Dashboards are created/updated (idempotently) by setup: + +```sh +go run ./tools/setup-gcp --create-monitoring-dashboards # also part of: --all +``` + +Or apply any single file by hand: + +```sh +gcloud monitoring dashboards create --config-from-file=monitoring/dashboards/<file>.json +``` diff --git a/monitoring/dashboards/ate-grpc-dashboard.json b/monitoring/dashboards/ate-grpc-dashboard.json new file mode 100644 index 00000000..066ac8d5 --- /dev/null +++ b/monitoring/dashboards/ate-grpc-dashboard.json @@ -0,0 +1,158 @@ +{ + "displayName": "ATE gRPC Server — latency / QPS / errors", + "mosaicLayout": { + "columns": 12, + "tiles": [ + { + "xPos": 0, + "yPos": 0, + "width": 6, + "height": 4, + "widget": { + "title": "ateapi — gRPC p99 latency by method", + "xyChart": { + "dataSets": [ + { + "timeSeriesQuery": { + "prometheusQuery": "histogram_quantile(0.99, sum by (le, \"rpc.method\") (rate({\"rpc.server.call.duration_bucket\", top_level_controller_name=\"ate-api-server-deployment\"}[5m])))", + "unitOverride": "s" + }, + "plotType": "LINE", + "targetAxis": "Y1" + } + ], + "yAxis": { + "label": "seconds", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 0, + "yPos": 4, + "width": 6, + "height": 4, + "widget": { + "title": "ateapi — gRPC request rate by method (req/s)", + "xyChart": { + "dataSets": [ + { + "timeSeriesQuery": { + "prometheusQuery": "sum by (\"rpc.method\") (rate({\"rpc.server.call.duration_count\", top_level_controller_name=\"ate-api-server-deployment\"}[5m]))", + "unitOverride": "1/s" + }, + "plotType": "LINE", + "targetAxis": "Y1" + } + ], + "yAxis": { + "label": "requests/s", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 0, + "yPos": 8, + "width": 6, + "height": 4, + "widget": { + "title": "ateapi — gRPC errors by status code (req/s, non-OK)", + "xyChart": { + "dataSets": [ + { 
+ "timeSeriesQuery": { + "prometheusQuery": "sum by (\"rpc.response.status_code\") (rate({\"rpc.server.call.duration_count\", top_level_controller_name=\"ate-api-server-deployment\", \"rpc.response.status_code\"!=\"OK\"}[5m]))", + "unitOverride": "1/s" + }, + "plotType": "LINE", + "targetAxis": "Y1" + } + ], + "yAxis": { + "label": "errors/s", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 6, + "yPos": 0, + "width": 6, + "height": 4, + "widget": { + "title": "atelet — gRPC p99 latency by method", + "xyChart": { + "dataSets": [ + { + "timeSeriesQuery": { + "prometheusQuery": "histogram_quantile(0.99, sum by (le, \"rpc.method\") (rate({\"rpc.server.call.duration_bucket\", top_level_controller_name=\"atelet\"}[5m])))", + "unitOverride": "s" + }, + "plotType": "LINE", + "targetAxis": "Y1" + } + ], + "yAxis": { + "label": "seconds", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 6, + "yPos": 4, + "width": 6, + "height": 4, + "widget": { + "title": "atelet — gRPC request rate by method (req/s)", + "xyChart": { + "dataSets": [ + { + "timeSeriesQuery": { + "prometheusQuery": "sum by (\"rpc.method\") (rate({\"rpc.server.call.duration_count\", top_level_controller_name=\"atelet\"}[5m]))", + "unitOverride": "1/s" + }, + "plotType": "LINE", + "targetAxis": "Y1" + } + ], + "yAxis": { + "label": "requests/s", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 6, + "yPos": 8, + "width": 6, + "height": 4, + "widget": { + "title": "atelet — gRPC errors by status code (req/s, non-OK)", + "xyChart": { + "dataSets": [ + { + "timeSeriesQuery": { + "prometheusQuery": "sum by (\"rpc.response.status_code\") (rate({\"rpc.server.call.duration_count\", top_level_controller_name=\"atelet\", \"rpc.response.status_code\"!=\"OK\"}[5m]))", + "unitOverride": "1/s" + }, + "plotType": "LINE", + "targetAxis": "Y1" + } + ], + "yAxis": { + "label": "errors/s", + "scale": "LINEAR" + } + } + } + } + ] + } +} diff --git a/tools/setup-gcp/cmd/dashboards.go b/tools/setup-gcp/cmd/dashboards.go new 
file mode 100644 index 00000000..720bc438 --- /dev/null +++ b/tools/setup-gcp/cmd/dashboards.go @@ -0,0 +1,94 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "fmt" + "log/slog" + "os" + "path/filepath" + + dashboard "cloud.google.com/go/monitoring/dashboard/apiv1" + "cloud.google.com/go/monitoring/dashboard/apiv1/dashboardpb" + "google.golang.org/api/iterator" + "google.golang.org/protobuf/encoding/protojson" +) + +// dashboardsToApply lists the Cloud Monitoring dashboard JSON files (relative to +// the repo root, so run setup from the repo root) that setup creates or updates. +var dashboardsToApply = []string{ + "monitoring/dashboards/ate-grpc-dashboard.json", +} + +// createMonitoringDashboards creates or updates each dashboard in +// dashboardsToApply. It is idempotent: dashboards are matched by displayName and +// updated in place, because CreateDashboard always creates a new dashboard (so +// calling it repeatedly would produce duplicates). +func createMonitoringDashboards(ctx context.Context, env *Environment) error { + client, err := dashboard.NewDashboardsClient(ctx) + if err != nil { + return fmt.Errorf("create dashboards client: %w", err) + } + defer client.Close() + + parent := "projects/" + env.ProjectID + + // Index existing dashboards by displayName to decide create vs update. 
+ existing := map[string]*dashboardpb.Dashboard{} + it := client.ListDashboards(ctx, &dashboardpb.ListDashboardsRequest{Parent: parent}) + for { + d, err := it.Next() + if err == iterator.Done { + break + } + if err != nil { + return fmt.Errorf("list dashboards: %w", err) + } + existing[d.GetDisplayName()] = d + } + + for _, path := range dashboardsToApply { + data, err := os.ReadFile(path) + if err != nil { + return fmt.Errorf("read %s: %w", path, err) + } + d := &dashboardpb.Dashboard{} + if err := protojson.Unmarshal(data, d); err != nil { + return fmt.Errorf("parse %s: %w", path, err) + } + + if cur, ok := existing[d.GetDisplayName()]; ok { + // Update in place: reuse the existing resource name and etag. + d.Name = cur.GetName() + d.Etag = cur.GetEtag() + slog.Info("Updating dashboard", + slog.String("displayName", d.GetDisplayName()), + slog.String("name", d.GetName()), + slog.String("file", filepath.Base(path))) + if _, err := client.UpdateDashboard(ctx, &dashboardpb.UpdateDashboardRequest{Dashboard: d}); err != nil { + return fmt.Errorf("update dashboard %q: %w", d.GetDisplayName(), err) + } + } else { + slog.Info("Creating dashboard", + slog.String("displayName", d.GetDisplayName()), + slog.String("file", filepath.Base(path))) + if _, err := client.CreateDashboard(ctx, &dashboardpb.CreateDashboardRequest{Parent: parent, Dashboard: d}); err != nil { + return fmt.Errorf("create dashboard %q: %w", d.GetDisplayName(), err) + } + } + } + return nil +} diff --git a/tools/setup-gcp/cmd/root.go b/tools/setup-gcp/cmd/root.go index 54dc2fff..5f817dc3 100644 --- a/tools/setup-gcp/cmd/root.go +++ b/tools/setup-gcp/cmd/root.go @@ -30,6 +30,7 @@ var ( grantGkeNodePermissionsFlag bool grantAteletPermissionsFlag bool enableApisFlag bool + createDashboardsFlag bool allFlag bool ) @@ -54,6 +55,7 @@ var rootCmd = &cobra.Command{ {"create iam policy bindings", &createIamPolicyBindingsFlag, createIamPolicyBindings}, {"grant gke node permissions", &grantGkeNodePermissionsFlag, 
grantGkeNodePermissions}, {"grant atelet permissions", &grantAteletPermissionsFlag, grantAteletPermissions}, + {"create monitoring dashboards", &createDashboardsFlag, createMonitoringDashboards}, } if cmd.Flags().NFlag() == 0 { @@ -97,5 +99,6 @@ func init() { rootCmd.Flags().BoolVar(&grantGkeNodePermissionsFlag, "grant-gke-node-permissions", false, "Grant GKE nodes permission to pull images") rootCmd.Flags().BoolVar(&grantAteletPermissionsFlag, "grant-atelet-permissions", false, "Grant atelet permission to read/write snapshots and pull images") rootCmd.Flags().BoolVar(&enableApisFlag, "enable-apis", false, "Enable required Google Cloud APIs") + rootCmd.Flags().BoolVar(&createDashboardsFlag, "create-monitoring-dashboards", false, "Create/update Cloud Monitoring dashboards from monitoring/dashboards/") rootCmd.Flags().BoolVar(&allFlag, "all", false, "Run all setup steps") }