Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,20 @@ docker compose up --build
| `temporal` | Workflow orchestration | `7233` (gRPC), `8233` (Web UI) |
| `minio` | S3-compatible snapshot storage | `9000` (API), `9001` (Console) |
| `endoflife` | Local EOL data override (nginx) | `8082` |
| `version-guard` | The server | `8081` (HTTP admin), `9090` (Temporal SDK metrics) |
| `version-guard` | The server | `8081` (HTTP admin), `9090` (OpenMetrics) |

The `endoflife` service serves patched EOL data for products with pending upstream PRs on [endoflife.date](https://endoflife.date), and proxies everything else to the live API. See [`deploy/endoflife-override/README.md`](./deploy/endoflife-override/README.md) for details on adding or updating overrides.

Once running, open the Temporal Web UI at http://localhost:8233 to trigger and monitor workflows.

Temporal SDK metrics are enabled by default and exposed at
http://localhost:9090/metrics. Set `TEMPORAL_METRICS_ENABLED=false` to disable
them, or set `TEMPORAL_METRICS_LISTEN_ADDRESS` to use a different address.
Temporal SDK metrics and Version Guard application metrics are enabled by
default and exposed at http://localhost:9090/metrics. Set
`TEMPORAL_METRICS_ENABLED=false` to disable them, or set
`TEMPORAL_METRICS_LISTEN_ADDRESS` to use a different address.

The same OpenMetrics endpoint exports `temporal_*`, `version_guard_*`,
`go_*`, and `process_*` series. Datadog/BPCI scrape configuration must allow
all four families for the RCA dashboard panels to populate.

#### End-to-end with `make compose-*`

Expand Down Expand Up @@ -322,7 +327,7 @@ Version Guard is configured via environment variables or CLI flags:
| `TEMPORAL_NAMESPACE` | Temporal namespace | `version-guard-dev` |
| `TEMPORAL_TASK_QUEUE` | Temporal task queue used by the worker | `version-guard-detection` |
| `TEMPORAL_METRICS_ENABLED` | Enable the Temporal Go SDK Prometheus/OpenMetrics endpoint | `true` |
| `TEMPORAL_METRICS_LISTEN_ADDRESS` | Prometheus listen address for Temporal SDK metrics | `0.0.0.0:9090` |
| `TEMPORAL_METRICS_LISTEN_ADDRESS` | Prometheus/OpenMetrics listen address for Temporal SDK and application metrics | `0.0.0.0:9090` |
| `HTTP_PORT` | HTTP admin port (`POST /scan`) | `8081` |
| `S3_BUCKET` | S3 bucket for snapshots | `version-guard-snapshots` |
| `AWS_REGION` | AWS region (for S3 snapshots) | `us-west-2` |
Expand Down
2 changes: 2 additions & 0 deletions cmd/cli/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"go.temporal.io/sdk/client"

"github.com/block/Version-Guard/pkg/scan"
"github.com/block/Version-Guard/pkg/telemetry"
"github.com/block/Version-Guard/pkg/types"
)

Expand Down Expand Up @@ -218,6 +219,7 @@ func (c *ScanStartCmd) Run(ctx *Context) error {
res, err := trigger.Run(context.Background(), scan.Input{
ScanID: c.ScanID,
ResourceTypes: resourceTypes,
Source: telemetry.ScanSourceCLI,
})
if err != nil {
return fmt.Errorf("trigger scan: %w", err)
Expand Down
3 changes: 2 additions & 1 deletion cmd/server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -450,8 +450,9 @@ func (s *ServerCLI) Run(_ *kong.Context) error {
fmt.Println("✓ Detection activities registered")

// Orchestrator workflow activities
orchestratorActivities := orchestrator.NewActivities(st, snapshotStore)
w.RegisterActivityWithOptions(orchestratorActivities.RecordResourceScanResult, activity.RegisterOptions{Name: orchestrator.RecordResourceScanResultActivityName})
if snapshotStore != nil {
orchestratorActivities := orchestrator.NewActivities(st, snapshotStore)
w.RegisterActivityWithOptions(orchestratorActivities.CreateSnapshot, activity.RegisterOptions{Name: orchestrator.CreateSnapshotActivityName})
w.RegisterActivityWithOptions(orchestratorActivities.NotifyEmitter, activity.RegisterOptions{Name: orchestrator.NotifyEmitterActivityName})
fmt.Println("✓ Orchestrator activities registered (with S3)")
Expand Down
6 changes: 6 additions & 0 deletions cmd/server/temporal_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ import (
tallyprom "github.com/uber-go/tally/v4/prometheus"
"go.temporal.io/sdk/client"
sdktally "go.temporal.io/sdk/contrib/tally"

"github.com/block/Version-Guard/pkg/telemetry"
)

type temporalMetricsCloser struct {
Expand Down Expand Up @@ -45,6 +47,10 @@ func newTemporalMetricsHandler(listenAddress string) (client.MetricsHandler, io.
}

registry := prom.NewRegistry()
if err := telemetry.Register(registry); err != nil {
return nil, nil, fmt.Errorf("register application metrics: %w", err)
}

reporter := tallyprom.NewReporter(tallyprom.Options{
Registerer: registry,
Gatherer: registry,
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ require (
github.com/golang/mock v1.6.0 // indirect
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.2 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/nexus-rpc/sdk-go v0.6.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions pkg/scan/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"net/http"

"github.com/block/Version-Guard/pkg/telemetry"
"github.com/block/Version-Guard/pkg/types"
)

Expand Down Expand Up @@ -61,6 +62,7 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
res, err := h.trigger.Run(r.Context(), Input{
ScanID: body.ScanID,
ResourceTypes: resourceTypes,
Source: telemetry.ScanSourceHTTP,
})
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
Expand Down
6 changes: 6 additions & 0 deletions pkg/scan/handler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ func TestHandler_POST_EmptyBody_TriggersFullScan(t *testing.T) {
assert.Equal(t, "wf", body.WorkflowID)
assert.Equal(t, "run", body.RunID)
assert.NotEmpty(t, body.ScanID)

require.Len(t, mock.calledArgs, 1)
in, ok := mock.calledArgs[0].(orchestrator.WorkflowInput)
require.True(t, ok, "workflow args[0] should be orchestrator.WorkflowInput")
assert.Equal(t, orchestrator.ScanScopeFull, in.ScanScope)
}

func TestHandler_POST_TargetedScan(t *testing.T) {
Expand All @@ -62,6 +67,7 @@ func TestHandler_POST_TargetedScan(t *testing.T) {
require.True(t, ok, "workflow args[0] should be orchestrator.WorkflowInput")
assert.Equal(t, []types.ResourceType{"aurora-mysql", "eks"}, in.ResourceTypes)
assert.Equal(t, "my-scan", in.ScanID)
assert.Equal(t, orchestrator.ScanScopeTargeted, in.ScanScope)
}

func TestHandler_RejectsNonPOST(t *testing.T) {
Expand Down
45 changes: 41 additions & 4 deletions pkg/scan/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ package scan
import (
"context"
"fmt"
"log/slog"
"time"

"github.com/google/uuid"
"go.temporal.io/sdk/client"

"github.com/block/Version-Guard/pkg/telemetry"
"github.com/block/Version-Guard/pkg/types"
"github.com/block/Version-Guard/pkg/workflow/orchestrator"
)
Expand Down Expand Up @@ -73,6 +75,10 @@ type Input struct {
// ScanID lets the caller pin a correlation ID. If empty, one is generated.
ScanID string

// Source identifies the trigger transport, e.g. "http" or "cli".
// Empty is recorded as "manual".
Source string

// ResourceTypes limits the scan to the given resource config IDs
// (e.g. "aurora-mysql", "eks"). Empty means scan all configured resources.
ResourceTypes []types.ResourceType
Expand All @@ -87,12 +93,37 @@ type Result struct {

// Run starts an OrchestratorWorkflow and returns identifiers describing the
// running execution. It does not wait for completion.
func (t *Trigger) Run(ctx context.Context, in Input) (Result, error) {
func (t *Trigger) Run(ctx context.Context, in Input) (res Result, err error) {
start := time.Now()
source := telemetry.NormalizeScanSource(in.Source)
result := telemetry.ResultFailure
scanID := in.ScanID
workflowID := ""
runID := ""
resourceTypeCount := len(in.ResourceTypes)
defer func() {
telemetry.RecordScanTrigger(source, result, t.taskQueue, time.Since(start))
attrs := []any{
"source", source,
"scanID", scanID,
"workflowID", workflowID,
"runID", runID,
"taskQueue", t.taskQueue,
"resourceTypeCount", resourceTypeCount,
}
if err != nil {
attrs = append(attrs, "event", "scan_trigger_failed", "error", err)
slog.Error("scan trigger failed", attrs...)
return
}
attrs = append(attrs, "event", "scan_triggered")
slog.Info("scan triggered", attrs...)
}()

if t.taskQueue == "" {
return Result{}, fmt.Errorf("scan: task queue is required")
}

scanID := in.ScanID
if scanID == "" {
scanID = uuid.NewString()
}
Expand All @@ -103,14 +134,17 @@ func (t *Trigger) Run(ctx context.Context, in Input) (Result, error) {
// the contract boundary that translates "no body / full scan"
// into the YAML-derived list.
resourceTypes := in.ResourceTypes
scanScope := orchestrator.ScanScopeTargeted
if len(resourceTypes) == 0 {
resourceTypes = t.defaultResourceTypes
scanScope = orchestrator.ScanScopeFull
}
resourceTypeCount = len(resourceTypes)
if len(resourceTypes) == 0 {
return Result{}, fmt.Errorf("scan: no resource types to scan and no default configured")
}

workflowID := buildWorkflowID(scanID)
workflowID = buildWorkflowID(scanID)

opts := client.StartWorkflowOptions{
ID: workflowID,
Expand All @@ -122,14 +156,17 @@ func (t *Trigger) Run(ctx context.Context, in Input) (Result, error) {
ScanID: scanID,
ResourceTypes: resourceTypes,
EmitterWebhookURL: t.emitterWebhookURL,
ScanScope: scanScope,
})
if err != nil {
return Result{}, fmt.Errorf("scan: execute workflow: %w", err)
}

runID = run.GetRunID()
result = telemetry.ResultSuccess
return Result{
WorkflowID: run.GetID(),
RunID: run.GetRunID(),
RunID: runID,
ScanID: scanID,
}, nil
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/scan/scan_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ func TestTrigger_Run_FullScan(t *testing.T) {
// Empty caller list expands to the configured default — the
// orchestrator no longer carries a hardcoded fallback.
assert.Equal(t, defaults, in.ResourceTypes)
assert.Equal(t, orchestrator.ScanScopeFull, in.ScanScope)
}

func TestTrigger_Run_EmptyInputAndNoDefault_ReturnsError(t *testing.T) {
Expand Down Expand Up @@ -101,6 +102,7 @@ func TestTrigger_Run_TargetedScan(t *testing.T) {
require.Len(t, mock.calledArgs, 1)
in := mock.calledArgs[0].(orchestrator.WorkflowInput)
assert.Equal(t, targets, in.ResourceTypes)
assert.Equal(t, orchestrator.ScanScopeTargeted, in.ScanScope)
}

func TestTrigger_Run_GeneratesScanIDWhenEmpty(t *testing.T) {
Expand Down
5 changes: 5 additions & 0 deletions pkg/schedule/schedule.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ func (m *Manager) EnsureSchedule(ctx context.Context, cfg Config) error {
Args: []interface{}{orchestrator.WorkflowInput{
ResourceTypes: cfg.ResourceTypes,
EmitterWebhookURL: cfg.EmitterWebhookURL,
ScanScope: orchestrator.ScanScopeFull,
}},
TaskQueue: cfg.TaskQueue,
WorkflowExecutionTimeout: 2 * time.Hour,
Expand Down Expand Up @@ -144,6 +145,7 @@ func (m *Manager) EnsureSchedule(ctx context.Context, cfg Config) error {
action.Args = []interface{}{orchestrator.WorkflowInput{
ResourceTypes: cfg.ResourceTypes,
EmitterWebhookURL: cfg.EmitterWebhookURL,
ScanScope: orchestrator.ScanScopeFull,
}}
}
return &client.ScheduleUpdate{
Expand Down Expand Up @@ -186,6 +188,9 @@ func scheduleActionMatches(action client.ScheduleAction, cfg *Config) bool {
if existing.EmitterWebhookURL != cfg.EmitterWebhookURL {
return false
}
if existing.ScanScope != orchestrator.ScanScopeFull {
return false
}
if !resourceTypesEqual(existing.ResourceTypes, cfg.ResourceTypes) {
return false
}
Expand Down
6 changes: 6 additions & 0 deletions pkg/schedule/schedule_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ func TestEnsureSchedule_CreatesNew(t *testing.T) {
action := mock.createOpts.Action.(*client.ScheduleWorkflowAction)
assert.Equal(t, "test-queue", action.TaskQueue)
assert.Equal(t, 2*time.Hour, action.WorkflowExecutionTimeout)
require.Len(t, action.Args, 1)
in, ok := action.Args[0].(orchestrator.WorkflowInput)
require.True(t, ok)
assert.Equal(t, orchestrator.ScanScopeFull, in.ScanScope)
}

func TestEnsureSchedule_AlreadyExists_SameCron(t *testing.T) {
Expand All @@ -147,6 +151,7 @@ func TestEnsureSchedule_AlreadyExists_SameCron(t *testing.T) {
TaskQueue: "test-queue",
Args: []interface{}{orchestrator.WorkflowInput{
ResourceTypes: testResourceTypes,
ScanScope: orchestrator.ScanScopeFull,
}},
},
},
Expand Down Expand Up @@ -230,6 +235,7 @@ func TestEnsureSchedule_AlreadyExists_NewWebhookURL(t *testing.T) {
assert.Equal(t, "http://emitter:8080", in.EmitterWebhookURL,
"updated WorkflowInput must carry the new EmitterWebhookURL")
assert.Equal(t, testResourceTypes, in.ResourceTypes)
assert.Equal(t, orchestrator.ScanScopeFull, in.ScanScope)
}

func TestEnsureSchedule_AlreadyExists_DifferentCron(t *testing.T) {
Expand Down
Loading
Loading