diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 4ee3086..79374d0 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -35,13 +35,10 @@ jobs: with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | - # PRs type=ref,event=pr - # Default branch builds type=raw,value=latest,enable={{is_default_branch}} type=ref,event=branch type=sha - # Semver tags type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} type=semver,pattern={{major}} @@ -85,4 +82,3 @@ jobs: if: always() with: sarif_file: 'trivy-results.sarif' - diff --git a/.gitignore b/.gitignore index 8deb67e..f99336f 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,5 @@ go.work *.swo *~ -.DS_Store \ No newline at end of file +.DS_Store +.gocache/ diff --git a/api/runs/v1alpha1/steprun_types.go b/api/runs/v1alpha1/steprun_types.go index 8013ca6..57907a3 100644 --- a/api/runs/v1alpha1/steprun_types.go +++ b/api/runs/v1alpha1/steprun_types.go @@ -112,6 +112,12 @@ type StepRunSpec struct { // Can be a list to support fanning out to multiple parallel steps. // +optional DownstreamTargets []DownstreamTarget `json:"downstreamTargets,omitempty"` + + // RequestedManifest lists the metadata fields the controller expects the SDK + // to materialize alongside the offloaded output. These are derived from CEL expressions + // that reference this step's outputs (e.g., len(steps.foo.output.bar)). + // +optional + RequestedManifest []ManifestRequest `json:"requestedManifest,omitempty"` } // DownstreamTarget defines the destination for an Engram's output in real-time execution mode. @@ -140,7 +146,50 @@ type TerminateTarget struct { StopMode enums.StopMode `json:"stopMode"` } +// ManifestOperation enumerates the metadata operations supported for step manifests. +type ManifestOperation string + +const ( + // ManifestOperationExists records whether the referenced field exists/non-nil. + ManifestOperationExists ManifestOperation = "exists" + // ManifestOperationLength records the length of the referenced field when it is an array, map, or string. + ManifestOperationLength ManifestOperation = "length" +) + +// ManifestRequest describes a single output field and the metadata operations required for it. +type ManifestRequest struct { + // Path is the dot/bracket notation path relative to the step output root. + // Examples: "result.items", "tools", "items[0].id". + // +kubebuilder:validation:MinLength=1 + Path string `json:"path"` + // Operations lists the metadata operations that should be computed for this path. + // Defaults to ["exists"] when omitted. + // +optional + Operations []ManifestOperation `json:"operations,omitempty"` +} + +// StepManifestData captures the metadata emitted by the SDK for a single manifest path. +type StepManifestData struct { + // Exists indicates whether the referenced field was present and non-nil. + // +optional + Exists *bool `json:"exists,omitempty"` + // Length contains the computed length when requested and applicable. + // +optional + Length *int64 `json:"length,omitempty"` + // Truncated signals that the SDK could not compute the full metadata due to limits. + // +optional + Truncated bool `json:"truncated,omitempty"` + // Error contains a warning message emitted by the SDK when it cannot honour the manifest request. + // +optional + Error string `json:"error,omitempty"` + // Sample holds an optional representative slice of the data (implementation-defined). 
+ // +optional + Sample *runtime.RawExtension `json:"sample,omitempty"` +} + // StepRunStatus tracks the detailed execution state of this individual step +// +kubebuilder:validation:XValidation:message="status.conditions reason field must be <= 64 characters",rule="!has(self.conditions) || self.conditions.all(c, !has(c.reason) || size(c.reason) <= 64)" +// +kubebuilder:validation:XValidation:message="status.conditions message field must be <= 2048 characters",rule="!has(self.conditions) || self.conditions.all(c, !has(c.message) || size(c.message) <= 2048)" type StepRunStatus struct { // observedGeneration is the most recent generation observed for this StepRun. It corresponds to the // StepRun's generation, which is updated on mutation by the API Server. @@ -198,6 +247,16 @@ type StepRunStatus struct { // Step coordination - which steps must complete before this one can start // Uses the same "needs" terminology as our Story API for consistency Needs []string `json:"needs,omitempty"` // StepRun names that must complete first + + // Manifest contains metadata captured for this step's output that enables CEL expressions + // to execute without hydrating large blobs from storage. + // The map key matches the ManifestRequest path. + // +optional + Manifest map[string]StepManifestData `json:"manifest,omitempty"` + + // ManifestWarnings contains any warnings produced while computing manifest data (e.g., unsupported operations). + // +optional + ManifestWarnings []string `json:"manifestWarnings,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/runs/v1alpha1/storyrun_types.go b/api/runs/v1alpha1/storyrun_types.go index ffd2105..235988c 100644 --- a/api/runs/v1alpha1/storyrun_types.go +++ b/api/runs/v1alpha1/storyrun_types.go @@ -80,6 +80,10 @@ type StoryRunSpec struct { } // StoryRunStatus tracks the current state and results of this story execution +// +kubebuilder:validation:XValidation:rule="!has(self.conditions) || self.conditions.exists(c, c.type == 'Ready')",message="status.conditions must include Ready when conditions are set" +// +kubebuilder:validation:XValidation:rule="!has(self.conditions) || self.conditions.all(c, has(c.lastTransitionTime))",message="status.conditions entries must set lastTransitionTime" +// +kubebuilder:validation:XValidation:message="status.conditions reason field must be <= 64 characters",rule="!has(self.conditions) || self.conditions.all(c, !has(c.reason) || size(c.reason) <= 64)" +// +kubebuilder:validation:XValidation:message="status.conditions message field must be <= 2048 characters",rule="!has(self.conditions) || self.conditions.all(c, !has(c.message) || size(c.message) <= 2048)" type StoryRunStatus struct { // observedGeneration is the most recent generation observed for this StoryRun. It corresponds to the // StoryRun's generation, which is updated on mutation by the API Server. diff --git a/api/runs/v1alpha1/zz_generated.deepcopy.go b/api/runs/v1alpha1/zz_generated.deepcopy.go index 07cf68f..cfa5abe 100644 --- a/api/runs/v1alpha1/zz_generated.deepcopy.go +++ b/api/runs/v1alpha1/zz_generated.deepcopy.go @@ -67,6 +67,26 @@ func (in *GRPCTarget) DeepCopy() *GRPCTarget { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ManifestRequest) DeepCopyInto(out *ManifestRequest) { + *out = *in + if in.Operations != nil { + in, out := &in.Operations, &out.Operations + *out = make([]ManifestOperation, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ManifestRequest. +func (in *ManifestRequest) DeepCopy() *ManifestRequest { + if in == nil { + return nil + } + out := new(ManifestRequest) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *StepExecutionOverrides) DeepCopyInto(out *StepExecutionOverrides) { *out = *in @@ -117,6 +137,36 @@ func (in *StepExecutionOverrides) DeepCopy() *StepExecutionOverrides { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *StepManifestData) DeepCopyInto(out *StepManifestData) { + *out = *in + if in.Exists != nil { + in, out := &in.Exists, &out.Exists + *out = new(bool) + **out = **in + } + if in.Length != nil { + in, out := &in.Length, &out.Length + *out = new(int64) + **out = **in + } + if in.Sample != nil { + in, out := &in.Sample, &out.Sample + *out = new(runtime.RawExtension) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StepManifestData. +func (in *StepManifestData) DeepCopy() *StepManifestData { + if in == nil { + return nil + } + out := new(StepManifestData) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *StepRun) DeepCopyInto(out *StepRun) { *out = *in @@ -207,6 +257,13 @@ func (in *StepRunSpec) DeepCopyInto(out *StepRunSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.RequestedManifest != nil { + in, out := &in.RequestedManifest, &out.RequestedManifest + *out = make([]ManifestRequest, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StepRunSpec. @@ -256,6 +313,18 @@ func (in *StepRunStatus) DeepCopyInto(out *StepRunStatus) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.Manifest != nil { + in, out := &in.Manifest, &out.Manifest + *out = make(map[string]StepManifestData, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } + if in.ManifestWarnings != nil { + in, out := &in.ManifestWarnings, &out.ManifestWarnings + *out = make([]string, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StepRunStatus. diff --git a/api/v1alpha1/shared_types.go b/api/v1alpha1/shared_types.go index fade703..5325922 100644 --- a/api/v1alpha1/shared_types.go +++ b/api/v1alpha1/shared_types.go @@ -114,7 +114,7 @@ type ExecutionOverrides struct { ServiceAccountName *string `json:"serviceAccountName,omitempty"` // AutomountServiceAccountToken controls whether a service account token should be automatically mounted. - // Defaults to false. 
+ // +kubebuilder:default=true // +optional AutomountServiceAccountToken *bool `json:"automountServiceAccountToken,omitempty"` diff --git a/api/v1alpha1/story_types.go b/api/v1alpha1/story_types.go index fdbd6be..9be37b3 100644 --- a/api/v1alpha1/story_types.go +++ b/api/v1alpha1/story_types.go @@ -53,6 +53,8 @@ type Story struct { } // StorySpec defines what the workflow does and how it should run +// +kubebuilder:validation:XValidation:rule="self.steps.all(step, has(step.ref) != has(step.type))",message="each step must set exactly one of ref or type" +// +kubebuilder:validation:XValidation:rule="self.steps.all(step, self.steps.exists_one(other, other.name == step.name))",message="step names must be unique" type StorySpec struct { // Pattern specifies the execution model for the Story. // "batch" stories are run to completion via a StoryRun. diff --git a/cmd/main.go b/cmd/main.go index a5c1b63..f96f03c 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -17,14 +17,17 @@ limitations under the License. package main import ( + "context" "crypto/tls" "flag" "os" + "time" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. _ "k8s.io/client-go/plugin/pkg/client/auth" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -40,6 +43,7 @@ import ( setup "github.com/bubustack/bobrapet/internal/setup" "github.com/bubustack/bobrapet/pkg/cel" "github.com/bubustack/bobrapet/pkg/logging" + "github.com/bubustack/bobrapet/pkg/observability" catalogv1alpha1 "github.com/bubustack/bobrapet/api/catalog/v1alpha1" runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" @@ -75,6 +79,11 @@ func main() { var secureMetrics bool var enableHTTP2 bool var tlsOpts []func(*tls.Config) + + // Operator configuration flags + var operatorConfigNamespace string + var operatorConfigName string + flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. 
"+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") @@ -92,6 +101,13 @@ func main() { flag.StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.") flag.BoolVar(&enableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers") + + // Operator configuration flags (similar to kube-controller-manager --kubeconfig pattern) + flag.StringVar(&operatorConfigNamespace, "config-namespace", "bobrapet-system", + "The namespace where the operator configuration ConfigMap resides.") + flag.StringVar(&operatorConfigName, "config-name", "bobrapet-operator-config", + "The name of the operator configuration ConfigMap.") + opts := zap.Options{ Development: true, } @@ -140,7 +156,12 @@ func main() { managerCtx := ctrl.SetupSignalHandler() setup.SetupIndexers(managerCtx, mgr) - operatorConfigManager, controllerConfig, configResolver, celEvaluator := mustInitOperatorServices(mgr) + operatorConfigManager, controllerConfig, configResolver, celEvaluator := mustInitOperatorServices( + mgr, + managerCtx, + operatorConfigNamespace, + operatorConfigName, + ) deps := config.ControllerDependencies{ Client: mgr.GetClient(), @@ -214,19 +235,51 @@ func buildMetricsServerOptions( func mustInitOperatorServices( mgr ctrl.Manager, + startupCtx context.Context, + configNamespace string, + configName string, ) (*config.OperatorConfigManager, *config.ControllerConfig, *config.Resolver, *cel.Evaluator) { operatorConfigManager := config.NewOperatorConfigManager( mgr.GetClient(), - "bobrapet-system", - "bobrapet-operator-config", + configNamespace, + configName, ) - setupLog.Info("Operator configuration manager initialized") - if err := mgr.Add(operatorConfigManager); err != nil { - setupLog.Error(err, "unable to add operator config manager to manager") + operatorConfigManager.SetAPIReader(mgr.GetAPIReader()) + + setupLog.Info("Operator configuration manager initialized", + "configNamespace", configNamespace, + "configName", configName) + + // Setup the config manager as a reconciler (event-driven) + if err := operatorConfigManager.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to setup operator config manager controller") os.Exit(1) } + setupLog.Info("Operator config manager controller registered") + + loadCtx, cancel := context.WithTimeout(startupCtx, 15*time.Second) + defer cancel() + if err := operatorConfigManager.LoadInitial(loadCtx); err != nil { + if apierrors.IsNotFound(err) { + setupLog.Info("Operator config map not found; continuing with defaults", + "configNamespace", configNamespace, + "configName", configName) + } else { + setupLog.Error(err, "failed to load operator configuration during startup", + "configNamespace", configNamespace, + "configName", configName) + os.Exit(1) + } + } else { + setupLog.Info("Operator configuration loaded from ConfigMap", + "configNamespace", configNamespace, + "configName", configName) + } + controllerConfig := operatorConfigManager.GetControllerConfig() setupLog.Info("Controller configuration loaded") + config.EnableTelemetry(controllerConfig.TelemetryEnabled) + observability.EnableTracing(controllerConfig.TelemetryEnabled) configResolver := config.NewResolver(mgr.GetClient(), operatorConfigManager) setupLog.Info("Configuration resolver initialized") celLogger := logging.NewCELLogger(ctrl.Log) @@ -261,6 +314,12 @@ func 
mustSetupControllers( setupLog.Error(err, "unable to create controller", "controller", "Impulse") os.Exit(1) } + if err := (&controller.RealtimeEngramReconciler{ + ControllerDependencies: deps, + }).SetupWithManager(mgr, controllerConfig.BuildEngramControllerOptions()); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "RealtimeEngram") + os.Exit(1) + } if err := (&runscontroller.StoryRunReconciler{ ControllerDependencies: deps, }).SetupWithManager(mgr, controllerConfig.BuildStoryRunControllerOptions()); err != nil { @@ -293,7 +352,8 @@ func setupWebhooksIfEnabled(mgr ctrl.Manager, operatorConfigManager *config.Oper } setupLog.Info("setting up webhooks") if err := (&webhookv1alpha1.StoryWebhook{ - Config: operatorConfigManager.GetControllerConfig(), + Config: operatorConfigManager.GetControllerConfig(), + ConfigManager: operatorConfigManager, }).SetupWebhookWithManager(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "Story") os.Exit(1) @@ -311,12 +371,16 @@ func setupWebhooksIfEnabled(mgr ctrl.Manager, operatorConfigManager *config.Oper os.Exit(1) } if err := (&webhookrunsv1alpha1.StoryRunWebhook{ - Config: operatorConfigManager.GetControllerConfig(), + Config: operatorConfigManager.GetControllerConfig(), + ConfigManager: operatorConfigManager, }).SetupWebhookWithManager(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "StoryRun") os.Exit(1) } - if err := (&webhookrunsv1alpha1.StepRunWebhook{}).SetupWebhookWithManager(mgr); err != nil { + if err := (&webhookrunsv1alpha1.StepRunWebhook{ + Config: operatorConfigManager.GetControllerConfig(), + ConfigManager: operatorConfigManager, + }).SetupWebhookWithManager(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "StepRun") os.Exit(1) } diff --git a/config/crd/bases/bubustack.io_engrams.yaml b/config/crd/bases/bubustack.io_engrams.yaml index dcfac1a..3e2dd69 100644 --- a/config/crd/bases/bubustack.io_engrams.yaml +++ b/config/crd/bases/bubustack.io_engrams.yaml @@ -165,9 +165,9 @@ spec: description: Overrides allows for fine-tuning of execution behavior. properties: automountServiceAccountToken: - description: |- - AutomountServiceAccountToken controls whether a service account token should be automatically mounted. - Defaults to false. + default: true + description: AutomountServiceAccountToken controls whether a service + account token should be automatically mounted. type: boolean imagePullPolicy: description: Image pull policy override diff --git a/config/crd/bases/bubustack.io_impulses.yaml b/config/crd/bases/bubustack.io_impulses.yaml index fdbc102..791a5f0 100644 --- a/config/crd/bases/bubustack.io_impulses.yaml +++ b/config/crd/bases/bubustack.io_impulses.yaml @@ -78,9 +78,9 @@ spec: description: Fine-tune execution behavior for special cases properties: automountServiceAccountToken: - description: |- - AutomountServiceAccountToken controls whether a service account token should be automatically mounted. - Defaults to false. + default: true + description: AutomountServiceAccountToken controls whether a service + account token should be automatically mounted. 
type: boolean imagePullPolicy: description: Image pull policy override diff --git a/config/crd/bases/bubustack.io_stories.yaml b/config/crd/bases/bubustack.io_stories.yaml index a14c6b7..b349f94 100644 --- a/config/crd/bases/bubustack.io_stories.yaml +++ b/config/crd/bases/bubustack.io_stories.yaml @@ -313,9 +313,9 @@ spec: Use sparingly - most configuration should be at the story or engram level properties: automountServiceAccountToken: - description: |- - AutomountServiceAccountToken controls whether a service account token should be automatically mounted. - Defaults to false. + default: true + description: AutomountServiceAccountToken controls whether + a service account token should be automatically mounted. type: boolean imagePullPolicy: description: Image pull policy override @@ -534,6 +534,12 @@ spec: required: - steps type: object + x-kubernetes-validations: + - message: each step must set exactly one of ref or type + rule: self.steps.all(step, has(step.ref) != has(step.type)) + - message: step names must be unique + rule: self.steps.all(step, self.steps.exists_one(other, other.name == + step.name)) status: description: StoryStatus defines the observed state of a Story. properties: diff --git a/config/crd/bases/runs.bubustack.io_stepruns.yaml b/config/crd/bases/runs.bubustack.io_stepruns.yaml index 689e73f..bcfe272 100644 --- a/config/crd/bases/runs.bubustack.io_stepruns.yaml +++ b/config/crd/bases/runs.bubustack.io_stepruns.yaml @@ -192,6 +192,34 @@ spec: - Loop: {"items": ["file1.txt", "file2.txt"], "concurrency": 3} type: object x-kubernetes-preserve-unknown-fields: true + requestedManifest: + description: |- + RequestedManifest lists the metadata fields the controller expects the SDK + to materialize alongside the offloaded output. These are derived from CEL expressions + that reference this step's outputs (e.g., len(steps.foo.output.bar)). + items: + description: ManifestRequest describes a single output field and + the metadata operations required for it. + properties: + operations: + description: |- + Operations lists the metadata operations that should be computed for this path. + Defaults to ["exists"] when omitted. + items: + description: ManifestOperation enumerates the metadata operations + supported for step manifests. + type: string + type: array + path: + description: |- + Path is the dot/bracket notation path relative to the step output root. + Examples: "result.items", "tools", "items[0].id". + minLength: 1 + type: string + required: + - path + type: object + type: array retry: description: |- What to do if this step fails? @@ -371,6 +399,45 @@ spec: type: string lastFailureMsg: type: string + manifest: + additionalProperties: + description: StepManifestData captures the metadata emitted by the + SDK for a single manifest path. + properties: + error: + description: Error contains a warning message emitted by the + SDK when it cannot honour the manifest request. + type: string + exists: + description: Exists indicates whether the referenced field was + present and non-nil. + type: boolean + length: + description: Length contains the computed length when requested + and applicable. + format: int64 + type: integer + sample: + description: Sample holds an optional representative slice of + the data (implementation-defined). + type: object + x-kubernetes-preserve-unknown-fields: true + truncated: + description: Truncated signals that the SDK could not compute + the full metadata due to limits. 
+ type: boolean + type: object + description: |- + Manifest contains metadata captured for this step's output that enables CEL expressions + to execute without hydrating large blobs from storage. + The map key matches the ManifestRequest path. + type: object + manifestWarnings: + description: ManifestWarnings contains any warnings produced while + computing manifest data (e.g., unsupported operations). + items: + type: string + type: array needs: description: |- Step coordination - which steps must complete before this one can start @@ -433,6 +500,13 @@ spec: format: date-time type: string type: object + x-kubernetes-validations: + - message: status.conditions reason field must be <= 64 characters + rule: '!has(self.conditions) || self.conditions.all(c, !has(c.reason) + || size(c.reason) <= 64)' + - message: status.conditions message field must be <= 2048 characters + rule: '!has(self.conditions) || self.conditions.all(c, !has(c.message) + || size(c.message) <= 2048)' type: object served: true storage: true diff --git a/config/crd/bases/runs.bubustack.io_storyruns.yaml b/config/crd/bases/runs.bubustack.io_storyruns.yaml index 7fa3fcd..7f7b9d4 100644 --- a/config/crd/bases/runs.bubustack.io_storyruns.yaml +++ b/config/crd/bases/runs.bubustack.io_storyruns.yaml @@ -354,6 +354,18 @@ spec: format: int32 type: integer type: object + x-kubernetes-validations: + - message: status.conditions must include Ready when conditions are set + rule: '!has(self.conditions) || self.conditions.exists(c, c.type == + ''Ready'')' + - message: status.conditions entries must set lastTransitionTime + rule: '!has(self.conditions) || self.conditions.all(c, has(c.lastTransitionTime))' + - message: status.conditions reason field must be <= 64 characters + rule: '!has(self.conditions) || self.conditions.all(c, !has(c.reason) + || size(c.reason) <= 64)' + - message: status.conditions message field must be <= 2048 characters + rule: '!has(self.conditions) || self.conditions.all(c, !has(c.message) + || size(c.message) <= 2048)' type: object served: true storage: true diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index 15e15b7..a318b9c 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -41,6 +41,16 @@ patches: target: kind: Deployment +# Patch to override the config-name arg with the actual kustomized name +# After metrics patch adds arg at index 0, our config-name is at index 4 +- patch: |- + - op: replace + path: /spec/template/spec/containers/0/args/4 + value: --config-name=bobrapet-bobrapet-operator-config + target: + kind: Deployment + name: controller-manager + # Uncomment the patches line if you enable Metrics and CertManager # [METRICS-WITH-CERTS] To enable metrics protected with certManager, uncomment the following line. # This patch will protect the metrics with certManager self-signed certs. 
diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 1f4c672..dee45ca 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -61,8 +61,10 @@ spec: - command: - /manager args: - - --leader-elect - - --health-probe-bind-address=:8081 + - --leader-elect + - --health-probe-bind-address=:8081 + - --config-namespace=bobrapet-system + - --config-name=bobrapet-operator-config # metrics-bind-address is configured via kustomize patch to :8443 (HTTPS) image: controller:latest imagePullPolicy: IfNotPresent diff --git a/config/manager/operator-config.yaml b/config/manager/operator-config.yaml index 0b9e38a..de0d48a 100644 --- a/config/manager/operator-config.yaml +++ b/config/manager/operator-config.yaml @@ -4,63 +4,153 @@ metadata: name: bobrapet-operator-config namespace: bobrapet-system data: - # Controller Configuration - controller.max-concurrent-reconciles: "15" + # ==================== + # Global Controller Configuration + # ==================== + controller.max-concurrent-reconciles: "10" controller.requeue-base-delay: "2s" controller.requeue-max-delay: "1m" controller.health-check-interval: "30s" - controller.cleanup-interval: "6h" + controller.cleanup-interval: "1h" + controller.reconcile-timeout: "1m" + controller.default-engram-grpc-port: "50051" + controller.max-story-with-block-size-bytes: "65536" + # ==================== # Image Configuration + # ==================== images.registry: "ghcr.io/bubustack" images.default-engram: "engram-default:latest" images.default-impulse: "impulse-default:latest" images.pull-policy: "IfNotPresent" - # Resource Limits + # ==================== + # Default Resource Limits + # ==================== + resources.default.cpu-request: "100m" + resources.default.cpu-limit: "500m" + resources.default.memory-request: "128Mi" + resources.default.memory-limit: "512Mi" + + # ==================== + # Engram-Specific Resource Limits + # ==================== resources.engram.cpu-request: "100m" resources.engram.cpu-limit: "1" resources.engram.memory-request: "128Mi" resources.engram.memory-limit: "512Mi" + # ==================== # Retry and Timeout Configuration + # ==================== retry.max-retries: "3" - retry.exponential-backoff-base: "2s" - retry.exponential-backoff-max: "1m" + retry.exponential-backoff-base: "1s" + retry.exponential-backoff-max: "60s" timeout.default-step: "5m" timeout.approval-default: "24h" timeout.external-data-default: "30m" timeout.conditional-default: "10m" + # ==================== # Loop Processing Configuration + # ==================== loop.max-iterations: "10000" loop.default-batch-size: "100" loop.max-batch-size: "1000" loop.max-concurrency: "10" loop.max-concurrency-limit: "50" + # ==================== # Security Configuration + # ==================== security.run-as-non-root: "true" security.read-only-root-filesystem: "true" security.allow-privilege-escalation: "false" security.drop-capabilities: "ALL" security.run-as-user: "1000" + security.automount-service-account-token: "false" + security.service-account-name: "default" + # ==================== # Job Configuration - job.backoff-limit: "0" - job.ttl-seconds-after-finished: "300" + # ==================== + job.backoff-limit: "3" + job.ttl-seconds-after-finished: "3600" job.restart-policy: "Never" + # ==================== # CEL Configuration + # ==================== cel.evaluation-timeout: "30s" cel.max-expression-length: "1000" cel.enable-macros: "true" + # ==================== # Telemetry Configuration + # ==================== 
telemetry.enabled: "true" telemetry.trace-propagation: "true" + # ==================== # Development/Debug Configuration + # ==================== debug.enable-verbose-logging: "false" debug.enable-step-output-logging: "false" debug.enable-metrics: "true" + + # ==================== + # StoryRun Controller Configuration + # ==================== + storyrun.max-concurrent-reconciles: "8" + storyrun.rate-limiter.base-delay: "50ms" + storyrun.rate-limiter.max-delay: "5m" + storyrun.max-inline-inputs-size: "1024" + + # ==================== + # StepRun Controller Configuration + # ==================== + steprun.max-concurrent-reconciles: "15" + steprun.rate-limiter.base-delay: "100ms" + steprun.rate-limiter.max-delay: "2m" + + # ==================== + # Story Controller Configuration + # ==================== + story.max-concurrent-reconciles: "5" + story.rate-limiter.base-delay: "200ms" + story.rate-limiter.max-delay: "1m" + + # ==================== + # Engram Controller Configuration + # ==================== + engram.max-concurrent-reconciles: "5" + engram.rate-limiter.base-delay: "200ms" + engram.rate-limiter.max-delay: "1m" + engram.default-max-inline-size: "1024" + engram.default-grpc-port: "50051" + engram.default-storage-timeout-seconds: "300" + engram.default-graceful-shutdown-timeout-seconds: "20" + engram.default-termination-grace-period-seconds: "30" + engram.default-max-recv-msg-bytes: "10485760" + engram.default-max-send-msg-bytes: "10485760" + engram.default-dial-timeout-seconds: "10" + engram.default-channel-buffer-size: "16" + engram.default-reconnect-max-retries: "10" + engram.default-reconnect-base-backoff-millis: "500" + engram.default-reconnect-max-backoff-seconds: "30" + engram.default-hang-timeout-seconds: "30" + engram.default-message-timeout-seconds: "30" + + # ==================== + # Impulse Controller Configuration + # ==================== + impulse.max-concurrent-reconciles: "5" + impulse.rate-limiter.base-delay: "200ms" + impulse.rate-limiter.max-delay: "1m" + + # ==================== + # Template Controller Configuration + # ==================== + template.max-concurrent-reconciles: "2" + template.rate-limiter.base-delay: "500ms" + template.rate-limiter.max-delay: "10m" diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index b4c980d..7e4af32 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -40,6 +40,8 @@ rules: - create - get - list + - patch + - update - watch - apiGroups: - apps @@ -136,6 +138,8 @@ rules: - create - get - list + - patch + - update - watch - apiGroups: - runs.bubustack.io diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml index 8a69862..03af105 100644 --- a/config/webhook/manifests.yaml +++ b/config/webhook/manifests.yaml @@ -76,19 +76,19 @@ webhooks: service: name: webhook-service namespace: system - path: /validate-runs-bubustack-io-v1alpha1-steprun + path: /validate-bubustack-io-v1alpha1-engram failurePolicy: Fail - name: vsteprun-v1alpha1.kb.io + name: vengram-v1alpha1.kb.io rules: - apiGroups: - - runs.bubustack.io + - bubustack.io apiVersions: - v1alpha1 operations: - CREATE - UPDATE resources: - - stepruns + - engrams sideEffects: None - admissionReviewVersions: - v1 @@ -96,19 +96,19 @@ webhooks: service: name: webhook-service namespace: system - path: /validate-runs-bubustack-io-v1alpha1-storyrun + path: /validate-bubustack-io-v1alpha1-impulse failurePolicy: Fail - name: vstoryrun-v1alpha1.kb.io + name: vimpulse-v1alpha1.kb.io rules: - apiGroups: - - runs.bubustack.io + - bubustack.io 
apiVersions: - v1alpha1 operations: - CREATE - UPDATE resources: - - storyruns + - impulses sideEffects: None - admissionReviewVersions: - v1 @@ -116,9 +116,9 @@ webhooks: service: name: webhook-service namespace: system - path: /validate-bubustack-io-v1alpha1-engram + path: /validate-bubustack-io-v1alpha1-story failurePolicy: Fail - name: vengram-v1alpha1.kb.io + name: vstory-v1alpha1.kb.io rules: - apiGroups: - bubustack.io @@ -128,7 +128,7 @@ webhooks: - CREATE - UPDATE resources: - - engrams + - stories sideEffects: None - admissionReviewVersions: - v1 @@ -136,19 +136,19 @@ webhooks: service: name: webhook-service namespace: system - path: /validate-bubustack-io-v1alpha1-impulse + path: /validate-runs-bubustack-io-v1alpha1-steprun failurePolicy: Fail - name: vimpulse-v1alpha1.kb.io + name: vsteprun-v1alpha1.kb.io rules: - apiGroups: - - bubustack.io + - runs.bubustack.io apiVersions: - v1alpha1 operations: - CREATE - UPDATE resources: - - impulses + - stepruns sideEffects: None - admissionReviewVersions: - v1 @@ -156,17 +156,17 @@ webhooks: service: name: webhook-service namespace: system - path: /validate-bubustack-io-v1alpha1-story + path: /validate-runs-bubustack-io-v1alpha1-storyrun failurePolicy: Fail - name: vstory-v1alpha1.kb.io + name: vstoryrun-v1alpha1.kb.io rules: - apiGroups: - - bubustack.io + - runs.bubustack.io apiVersions: - v1alpha1 operations: - CREATE - UPDATE resources: - - stories + - storyruns sideEffects: None diff --git a/go.mod b/go.mod index 33b309a..fea544e 100644 --- a/go.mod +++ b/go.mod @@ -5,14 +5,17 @@ go 1.24.5 require ( github.com/go-logr/logr v1.4.3 github.com/google/cel-go v0.26.1 - github.com/onsi/ginkgo/v2 v2.26.0 + github.com/onsi/ginkgo/v2 v2.27.2 github.com/onsi/gomega v1.38.2 github.com/prometheus/client_golang v1.23.2 github.com/xeipuuv/gojsonschema v1.2.0 + go.opentelemetry.io/otel v1.38.0 + go.opentelemetry.io/otel/trace v1.38.0 google.golang.org/protobuf v1.36.10 k8s.io/api v0.34.1 k8s.io/apimachinery v0.34.1 k8s.io/client-go v0.34.1 + k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 sigs.k8s.io/controller-runtime v0.22.3 ) @@ -63,32 +66,29 @@ require ( github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 // indirect - go.opentelemetry.io/otel/metric v1.35.0 // indirect - go.opentelemetry.io/otel/sdk v1.34.0 // indirect - go.opentelemetry.io/otel/trace v1.35.0 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/otel/sdk v1.37.0 // indirect go.opentelemetry.io/proto/otlp v1.5.0 // indirect - go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/mod v0.27.0 // indirect - golang.org/x/net v0.43.0 // indirect + golang.org/x/mod v0.28.0 // indirect + golang.org/x/net v0.44.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect - golang.org/x/sync v0.16.0 // indirect - golang.org/x/sys v0.35.0 // indirect - golang.org/x/term v0.34.0 // indirect - golang.org/x/text v0.28.0 // indirect + golang.org/x/sync v0.17.0 // indirect + golang.org/x/sys v0.36.0 // 
indirect + golang.org/x/term v0.35.0 // indirect + golang.org/x/text v0.29.0 // indirect golang.org/x/time v0.9.0 // indirect - golang.org/x/tools v0.36.0 // indirect + golang.org/x/tools v0.37.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/grpc v1.72.1 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250804133106-a7a43d27e69b // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect + google.golang.org/grpc v1.76.0 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect @@ -97,7 +97,6 @@ require ( k8s.io/component-base v0.34.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect sigs.k8s.io/randfill v1.0.0 // indirect diff --git a/go.sum b/go.sum index 67a5a25..715f498 100644 --- a/go.sum +++ b/go.sum @@ -33,8 +33,8 @@ github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BN github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= -github.com/gkampitakis/go-snaps v0.5.14 h1:3fAqdB6BCPKHDMHAKRwtPUwYexKtGrNuw8HX/T/4neo= -github.com/gkampitakis/go-snaps v0.5.14/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= +github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= +github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -110,16 +110,14 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.26.0 h1:1J4Wut1IlYZNEAWIV3ALrT9NfiaGW2cDCJQSFQMs/gE= -github.com/onsi/ginkgo/v2 v2.26.0/go.mod h1:qhEywmzWTBUY88kfO0BRvX4py7scov9yR+Az2oavUzw= +github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= +github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= -github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= @@ -170,24 +168,22 @@ go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJyS go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 h1:OeNbIYk/2C15ckl7glBlOBp5+WlYsOElzTNmiPW/x60= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0/go.mod h1:7Bept48yIeqxP2OZ9/AqIpYS94h2or0aB4FypJTc8ZM= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 h1:tgJ0uaNS4c98WRNUEx5U3aDlrDOI5Rs+1Vifcw4DJ8U= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0/go.mod h1:U7HYyW0zt/a9x5J1Kjs+r1f/d4ZHnYFclhYY2+YbeoE= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= -go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= -go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= -go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= -go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= +go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= +go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= +go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= go.opentelemetry.io/proto/otlp v1.5.0 h1:xJvq7gMzB31/d406fB8U5CBdyQGw4P399D1aQWU/3i4= go.opentelemetry.io/proto/otlp v1.5.0/go.mod h1:keN8WnHxOy8PG0rQZjJJ5A2ebUoafqWp0eVQ4yIXvJ4= -go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= 
-go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -205,52 +201,54 @@ golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0 golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.27.0 h1:kb+q2PyFnEADO2IEF935ehFUXlWiNjJWtRNgBLSfbxQ= -golang.org/x/mod v0.27.0/go.mod h1:rWI627Fq0DEoudcK+MBkNkCe0EetEaDSwJJkCcjpazc= +golang.org/x/mod v0.28.0 h1:gQBtGhjxykdjY9YhZpSlZIsbnaE2+PgjfLWUQTnoZ1U= +golang.org/x/mod v0.28.0/go.mod h1:yfB/L0NOf/kmEbXjzCPOx1iK1fRutOydrCMsqRhEBxI= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= -golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I= +golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= +golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= -golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= -golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= +golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= +golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.35.0 h1:bZBVKBudEyhRcajGcNc3jIfWPqV4y/Kt2XcoigOWtDQ= +golang.org/x/term v0.35.0/go.mod h1:TPGtkTLesOwf2DE8CgVYiZinHAOuy5AYUYT1lENIZnA= golang.org/x/text v0.3.0/go.mod 
h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= -golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= +golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= -golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= +golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE= +golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= -google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= -google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA= -google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20250804133106-a7a43d27e69b h1:ULiyYQ0FdsJhwwZUwbaXpZF5yUE3h+RA+gxvBu37ucc= +google.golang.org/genproto/googleapis/api v0.0.0-20250804133106-a7a43d27e69b/go.mod h1:oDOGiMSXHL4sDTJvFvIB9nRQCGdLP1o/iVaqQK8zB+M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc= +google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A= +google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c= google.golang.org/protobuf v1.36.10 
h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/hack/makefiles/kind.mk b/hack/makefiles/kind.mk index d2fb19f..eb15725 100644 --- a/hack/makefiles/kind.mk +++ b/hack/makefiles/kind.mk @@ -30,6 +30,7 @@ kind-delete: ## Delete the kind cluster .PHONY: kind-get-kubeconfig kind-get-kubeconfig: ## Get the kubeconfig for the kind cluster + @mkdir -p $(HOME)/.kube @kind get kubeconfig --name $(KIND_CLUSTER_NAME) > $(KUBECONFIG) @echo "Kubeconfig saved to $(KUBECONFIG)" diff --git a/internal/config/controller_config.go b/internal/config/controller_config.go index e0d738d..5595452 100644 --- a/internal/config/controller_config.go +++ b/internal/config/controller_config.go @@ -135,6 +135,14 @@ type ControllerConfig struct { EnableVerboseLogging bool `json:"enableVerboseLogging,omitempty"` EnableStepOutputLogging bool `json:"enableStepOutputLogging,omitempty"` EnableMetrics bool `json:"enableMetrics,omitempty"` + + // Operator-level default storage configuration (applied when Story policy is absent) + DefaultStorageProvider string `json:"defaultStorageProvider,omitempty"` + DefaultS3Bucket string `json:"defaultS3Bucket,omitempty"` + DefaultS3Region string `json:"defaultS3Region,omitempty"` + DefaultS3Endpoint string `json:"defaultS3Endpoint,omitempty"` + DefaultS3UsePathStyle bool `json:"defaultS3UsePathStyle,omitempty"` + DefaultS3AuthSecretName string `json:"defaultS3AuthSecretName,omitempty"` } // Telemetry feature gate @@ -366,6 +374,11 @@ func (c *ControllerConfig) BuildEngramControllerOptions() controller.Options { } } +// BuildRealtimeEngramControllerOptions mirrors the Engram controller options for realtime workloads +func (c *ControllerConfig) BuildRealtimeEngramControllerOptions() controller.Options { + return c.BuildEngramControllerOptions() +} + // BuildImpulseControllerOptions builds controller.Options for Impulse func (c *ControllerConfig) BuildImpulseControllerOptions() controller.Options { return controller.Options{ diff --git a/internal/config/operator.go b/internal/config/operator.go index 57b6397..0e62840 100644 --- a/internal/config/operator.go +++ b/internal/config/operator.go @@ -26,8 +26,12 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" ) const strTrue = "true" @@ -67,26 +71,32 @@ func (m *OperatorConfigManager) GetControllerConfig() *ControllerConfig { return &m.currentConfig.Controller } +// SetAPIReader configures a non-cached reader for situations where the cache is not yet running. 
+func (m *OperatorConfigManager) SetAPIReader(reader client.Reader) { + m.apiReader = reader +} + // OperatorConfigManager manages the operator's dynamic configuration +// It implements the controller-runtime reconcile.Reconciler interface type OperatorConfigManager struct { client client.Client + apiReader client.Reader namespace string configMapName string currentConfig *OperatorConfig defaultConfig *OperatorConfig mu sync.RWMutex lastSyncTime time.Time - SyncInterval time.Duration } // NewOperatorConfigManager creates a new configuration manager func NewOperatorConfigManager(k8sClient client.Client, namespace, configMapName string) *OperatorConfigManager { manager := &OperatorConfigManager{ client: k8sClient, + apiReader: nil, namespace: namespace, configMapName: configMapName, defaultConfig: DefaultOperatorConfig(), - SyncInterval: 1 * time.Minute, } manager.currentConfig = manager.defaultConfig return manager @@ -104,10 +114,29 @@ func (ocm *OperatorConfigManager) RefreshConfig() { ocm.currentConfig = ocm.defaultConfig } +// LoadInitial performs a one-time synchronous load of the operator configuration. +func (m *OperatorConfigManager) LoadInitial(ctx context.Context) error { + config, err := m.loadAndParseConfigMap(ctx) + if err != nil { + return err + } + + m.mu.Lock() + m.currentConfig = config + m.lastSyncTime = time.Now() + m.mu.Unlock() + return nil +} + // loadAndParseConfigMap loads the ConfigMap and parses it into OperatorConfig func (m *OperatorConfigManager) loadAndParseConfigMap(ctx context.Context) (*OperatorConfig, error) { + reader := m.apiReader + if reader == nil { + reader = m.client + } + configMap := &corev1.ConfigMap{} - err := m.client.Get(ctx, types.NamespacedName{ + err := reader.Get(ctx, types.NamespacedName{ Name: m.configMapName, Namespace: m.namespace, }, configMap) @@ -121,53 +150,75 @@ func (m *OperatorConfigManager) loadAndParseConfigMap(ctx context.Context) (*Ope return config, nil } -// Start runs the configuration synchronization loop. -// It implements the controller-runtime manager.Runnable interface. -func (m *OperatorConfigManager) Start(ctx context.Context) error { +// Reconcile handles ConfigMap changes and updates the operator configuration. +// This is the event-driven approach that reacts to ConfigMap updates immediately. +func (m *OperatorConfigManager) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { logger := log.FromContext(ctx).WithName("config-manager") - logger.Info("Starting operator config synchronizer") - - // Immediately try to sync on startup - if err := m.sync(ctx); err != nil { - logger.Error(err, "Initial configuration sync failed") - // Depending on strictness, we might want to return the error - // and prevent the manager from starting if the config is critical. - } - - ticker := time.NewTicker(m.SyncInterval) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - logger.Info("Stopping operator config synchronizer") - return nil - case <-ticker.C: - if err := m.sync(ctx); err != nil { - logger.Error(err, "Failed to sync operator configuration") - } - } + + // Only process our specific ConfigMap + if req.Name != m.configMapName || req.Namespace != m.namespace { + return reconcile.Result{}, nil } -} -// sync performs a single configuration synchronization. 
-func (m *OperatorConfigManager) sync(ctx context.Context) error { - logger := log.FromContext(ctx) + logger.Info("ConfigMap changed, reloading configuration", "configMap", req.NamespacedName) + newConfig, err := m.loadAndParseConfigMap(ctx) if err != nil { - return fmt.Errorf("failed to load operator configuration: %w", err) + logger.Error(err, "Failed to load operator configuration") + // Requeue with exponential backoff on error + return reconcile.Result{RequeueAfter: 30 * time.Second}, err } m.mu.Lock() defer m.mu.Unlock() m.currentConfig = newConfig m.lastSyncTime = time.Now() - logger.Info("Successfully synced operator configuration") - return nil + + logger.Info("Successfully reloaded operator configuration", + "configMap", req.NamespacedName, + "lastSync", m.lastSyncTime.Format(time.RFC3339)) + + return reconcile.Result{}, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (m *OperatorConfigManager) SetupWithManager(mgr ctrl.Manager) error { + // Create a predicate that only watches our specific ConfigMap + configMapPredicate := predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return e.Object.GetName() == m.configMapName && + e.Object.GetNamespace() == m.namespace + }, + UpdateFunc: func(e event.UpdateEvent) bool { + return e.ObjectNew.GetName() == m.configMapName && + e.ObjectNew.GetNamespace() == m.namespace + }, + DeleteFunc: func(e event.DeleteEvent) bool { + // If ConfigMap is deleted, we should know about it + return e.Object.GetName() == m.configMapName && + e.Object.GetNamespace() == m.namespace + }, + GenericFunc: func(e event.GenericEvent) bool { + return false + }, + } + + // Note: We don't load initial config here because the cache isn't started yet. + // The first reconcile event (triggered by the informer) will load the config. + // This happens almost immediately after the manager starts. + + return ctrl.NewControllerManagedBy(mgr). + Named("operator-config-manager"). + For(&corev1.ConfigMap{}). + WithEventFilter(configMapPredicate). 
+ Complete(m) } // parseConfigMap parses a ConfigMap into OperatorConfig func (ocm *OperatorConfigManager) parseConfigMap(cm *corev1.ConfigMap, config *OperatorConfig) { + // Initialize nested structs with defaults first + config.Controller = *DefaultControllerConfig() + parseControllerTimings(cm, config) parseImageConfig(cm, config) parseResourceLimits(cm, config) @@ -180,6 +231,12 @@ func (ocm *OperatorConfigManager) parseConfigMap(cm *corev1.ConfigMap, config *O parseDebugConfig(cm, config) parseEngramDefaults(cm, config) parseStoryRunConfig(cm, config) + parseStepRunConfig(cm, config) + parseStoryConfig(cm, config) + parseEngramConfig(cm, config) + parseImpulseConfig(cm, config) + parseTemplateConfig(cm, config) + parseStorageDefaults(cm, config) } func parseControllerTimings(cm *corev1.ConfigMap, config *OperatorConfig) { @@ -190,6 +247,7 @@ func parseControllerTimings(cm *corev1.ConfigMap, config *OperatorConfig) { setCleanupInterval(cm, config) setReconcileTimeout(cm, config) setDefaultEngramGRPCPort(cm, config) + setMaxStoryWithBlockSizeBytes(cm, config) } func setMaxConcurrentReconciles(cm *corev1.ConfigMap, config *OperatorConfig) { @@ -248,6 +306,14 @@ func setDefaultEngramGRPCPort(cm *corev1.ConfigMap, config *OperatorConfig) { } } +func setMaxStoryWithBlockSizeBytes(cm *corev1.ConfigMap, config *OperatorConfig) { + if val, exists := cm.Data["controller.max-story-with-block-size-bytes"]; exists { + if parsed, err := strconv.Atoi(val); err == nil && parsed > 0 { + config.Controller.MaxStoryWithBlockSizeBytes = parsed + } + } +} + func parseImageConfig(cm *corev1.ConfigMap, config *OperatorConfig) { if val, exists := cm.Data["images.default-engram"]; exists { config.Controller.DefaultEngramImage = val @@ -268,6 +334,21 @@ func parseImageConfig(cm *corev1.ConfigMap, config *OperatorConfig) { } func parseResourceLimits(cm *corev1.ConfigMap, config *OperatorConfig) { + // Default resource limits + if val, exists := cm.Data["resources.default.cpu-request"]; exists { + config.Controller.DefaultCPURequest = val + } + if val, exists := cm.Data["resources.default.cpu-limit"]; exists { + config.Controller.DefaultCPULimit = val + } + if val, exists := cm.Data["resources.default.memory-request"]; exists { + config.Controller.DefaultMemoryRequest = val + } + if val, exists := cm.Data["resources.default.memory-limit"]; exists { + config.Controller.DefaultMemoryLimit = val + } + + // Engram-specific resource limits if val, exists := cm.Data["resources.engram.cpu-request"]; exists { config.Controller.EngramCPURequest = val } @@ -397,7 +478,6 @@ func setMaxConcurrencyLimit(cm *corev1.ConfigMap, config *OperatorConfig) { } func parseSecurityConfig(cm *corev1.ConfigMap, config *OperatorConfig) { - if val, exists := cm.Data["security.run-as-non-root"]; exists { config.Controller.RunAsNonRoot = val == strTrue } @@ -415,6 +495,9 @@ func parseSecurityConfig(cm *corev1.ConfigMap, config *OperatorConfig) { if val, exists := cm.Data["security.automount-service-account-token"]; exists { config.Controller.AutomountServiceAccountToken = val == strTrue } + if val, exists := cm.Data["security.service-account-name"]; exists { + config.Controller.ServiceAccountName = val + } } func parseJobConfig(cm *corev1.ConfigMap, config *OperatorConfig) { @@ -428,6 +511,16 @@ func parseJobConfig(cm *corev1.ConfigMap, config *OperatorConfig) { config.Controller.TTLSecondsAfterFinished = int32(parsed) } } + if val, exists := cm.Data["job.restart-policy"]; exists { + switch val { + case string(corev1.RestartPolicyAlways): + 
config.Controller.JobRestartPolicy = corev1.RestartPolicyAlways + case string(corev1.RestartPolicyOnFailure): + config.Controller.JobRestartPolicy = corev1.RestartPolicyOnFailure + case string(corev1.RestartPolicyNever): + config.Controller.JobRestartPolicy = corev1.RestartPolicyNever + } + } } func parseCELConfig(cm *corev1.ConfigMap, config *OperatorConfig) { @@ -476,9 +569,172 @@ func parseEngramDefaults(cm *corev1.ConfigMap, config *OperatorConfig) { } func parseStoryRunConfig(cm *corev1.ConfigMap, config *OperatorConfig) { + if val, exists := cm.Data["storyrun.max-concurrent-reconciles"]; exists { + if parsed, err := strconv.Atoi(val); err == nil && parsed > 0 { + config.Controller.StoryRun.MaxConcurrentReconciles = parsed + } + } + if val, exists := cm.Data["storyrun.rate-limiter.base-delay"]; exists { + if parsed, err := time.ParseDuration(val); err == nil { + config.Controller.StoryRun.RateLimiter.BaseDelay = parsed + } + } + if val, exists := cm.Data["storyrun.rate-limiter.max-delay"]; exists { + if parsed, err := time.ParseDuration(val); err == nil { + config.Controller.StoryRun.RateLimiter.MaxDelay = parsed + } + } if val, exists := cm.Data["storyrun.max-inline-inputs-size"]; exists { if parsed, err := strconv.Atoi(val); err == nil && parsed > 0 { config.Controller.StoryRun.MaxInlineInputsSize = parsed } } } + +func parseStepRunConfig(cm *corev1.ConfigMap, config *OperatorConfig) { + if val, exists := cm.Data["steprun.max-concurrent-reconciles"]; exists { + if parsed, err := strconv.Atoi(val); err == nil && parsed > 0 { + config.Controller.StepRun.MaxConcurrentReconciles = parsed + } + } + if val, exists := cm.Data["steprun.rate-limiter.base-delay"]; exists { + if parsed, err := time.ParseDuration(val); err == nil { + config.Controller.StepRun.RateLimiter.BaseDelay = parsed + } + } + if val, exists := cm.Data["steprun.rate-limiter.max-delay"]; exists { + if parsed, err := time.ParseDuration(val); err == nil { + config.Controller.StepRun.RateLimiter.MaxDelay = parsed + } + } +} + +func parseStoryConfig(cm *corev1.ConfigMap, config *OperatorConfig) { + if val, exists := cm.Data["story.max-concurrent-reconciles"]; exists { + if parsed, err := strconv.Atoi(val); err == nil && parsed > 0 { + config.Controller.Story.MaxConcurrentReconciles = parsed + } + } + if val, exists := cm.Data["story.rate-limiter.base-delay"]; exists { + if parsed, err := time.ParseDuration(val); err == nil { + config.Controller.Story.RateLimiter.BaseDelay = parsed + } + } + if val, exists := cm.Data["story.rate-limiter.max-delay"]; exists { + if parsed, err := time.ParseDuration(val); err == nil { + config.Controller.Story.RateLimiter.MaxDelay = parsed + } + } +} + +func parseEngramConfig(cm *corev1.ConfigMap, config *OperatorConfig) { + // Controller-level Engram config + parseIntField(cm, "engram.max-concurrent-reconciles", &config.Controller.Engram.MaxConcurrentReconciles, true) + parseDurationField(cm, "engram.rate-limiter.base-delay", &config.Controller.Engram.RateLimiter.BaseDelay) + parseDurationField(cm, "engram.rate-limiter.max-delay", &config.Controller.Engram.RateLimiter.MaxDelay) + + // Engram-specific controller config + parseEngramControllerConfig(cm, &config.Controller.Engram.EngramControllerConfig) +} + +func parseEngramControllerConfig(cm *corev1.ConfigMap, cfg *EngramControllerConfig) { + parseIntField(cm, "engram.default-grpc-port", &cfg.DefaultGRPCPort, true) + parseIntField(cm, "engram.default-storage-timeout-seconds", &cfg.DefaultStorageTimeoutSeconds, true) + parseIntField(cm, 
"engram.default-graceful-shutdown-timeout-seconds", &cfg.DefaultGracefulShutdownTimeoutSeconds, true) + parseInt64Field(cm, "engram.default-termination-grace-period-seconds", &cfg.DefaultTerminationGracePeriodSeconds, true) + parseIntField(cm, "engram.default-max-recv-msg-bytes", &cfg.DefaultMaxRecvMsgBytes, true) + parseIntField(cm, "engram.default-max-send-msg-bytes", &cfg.DefaultMaxSendMsgBytes, true) + parseIntField(cm, "engram.default-dial-timeout-seconds", &cfg.DefaultDialTimeoutSeconds, true) + parseIntField(cm, "engram.default-channel-buffer-size", &cfg.DefaultChannelBufferSize, true) + parseIntField(cm, "engram.default-reconnect-max-retries", &cfg.DefaultReconnectMaxRetries, false) // Can be 0 + parseIntField(cm, "engram.default-reconnect-base-backoff-millis", &cfg.DefaultReconnectBaseBackoffMillis, true) + parseIntField(cm, "engram.default-reconnect-max-backoff-seconds", &cfg.DefaultReconnectMaxBackoffSeconds, true) + parseIntField(cm, "engram.default-hang-timeout-seconds", &cfg.DefaultHangTimeoutSeconds, true) + parseIntField(cm, "engram.default-message-timeout-seconds", &cfg.DefaultMessageTimeoutSeconds, true) +} + +// Helper functions to reduce cyclomatic complexity +func parseIntField(cm *corev1.ConfigMap, key string, target *int, requirePositive bool) { + if val, exists := cm.Data[key]; exists { + if parsed, err := strconv.Atoi(val); err == nil { + if !requirePositive || parsed > 0 { + *target = parsed + } + } + } +} + +func parseInt64Field(cm *corev1.ConfigMap, key string, target *int64, requirePositive bool) { + if val, exists := cm.Data[key]; exists { + if parsed, err := strconv.ParseInt(val, 10, 64); err == nil { + if !requirePositive || parsed > 0 { + *target = parsed + } + } + } +} + +func parseDurationField(cm *corev1.ConfigMap, key string, target *time.Duration) { + if val, exists := cm.Data[key]; exists { + if parsed, err := time.ParseDuration(val); err == nil { + *target = parsed + } + } +} + +func parseImpulseConfig(cm *corev1.ConfigMap, config *OperatorConfig) { + if val, exists := cm.Data["impulse.max-concurrent-reconciles"]; exists { + if parsed, err := strconv.Atoi(val); err == nil && parsed > 0 { + config.Controller.Impulse.MaxConcurrentReconciles = parsed + } + } + if val, exists := cm.Data["impulse.rate-limiter.base-delay"]; exists { + if parsed, err := time.ParseDuration(val); err == nil { + config.Controller.Impulse.RateLimiter.BaseDelay = parsed + } + } + if val, exists := cm.Data["impulse.rate-limiter.max-delay"]; exists { + if parsed, err := time.ParseDuration(val); err == nil { + config.Controller.Impulse.RateLimiter.MaxDelay = parsed + } + } +} + +func parseTemplateConfig(cm *corev1.ConfigMap, config *OperatorConfig) { + if val, exists := cm.Data["template.max-concurrent-reconciles"]; exists { + if parsed, err := strconv.Atoi(val); err == nil && parsed > 0 { + config.Controller.Template.MaxConcurrentReconciles = parsed + } + } + if val, exists := cm.Data["template.rate-limiter.base-delay"]; exists { + if parsed, err := time.ParseDuration(val); err == nil { + config.Controller.Template.RateLimiter.BaseDelay = parsed + } + } + if val, exists := cm.Data["template.rate-limiter.max-delay"]; exists { + if parsed, err := time.ParseDuration(val); err == nil { + config.Controller.Template.RateLimiter.MaxDelay = parsed + } + } +} + +func parseStorageDefaults(cm *corev1.ConfigMap, config *OperatorConfig) { + if val, exists := cm.Data["controller.storage.provider"]; exists { + config.Controller.DefaultStorageProvider = val + } + if val, exists := 
cm.Data["controller.storage.s3.bucket"]; exists { + config.Controller.DefaultS3Bucket = val + } + if val, exists := cm.Data["controller.storage.s3.region"]; exists { + config.Controller.DefaultS3Region = val + } + if val, exists := cm.Data["controller.storage.s3.endpoint"]; exists { + config.Controller.DefaultS3Endpoint = val + } + if val, exists := cm.Data["controller.storage.s3.use-path-style"]; exists { + config.Controller.DefaultS3UsePathStyle = val == strTrue + } + if val, exists := cm.Data["controller.storage.s3.auth-secret-name"]; exists { + config.Controller.DefaultS3AuthSecretName = val + } +} diff --git a/internal/config/resolver.go b/internal/config/resolver.go index c554616..2fa38c9 100644 --- a/internal/config/resolver.go +++ b/internal/config/resolver.go @@ -94,7 +94,7 @@ func (cr *Resolver) ResolveImagePullPolicy(ctx context.Context, step *runsv1alph // ResolveExecutionConfig resolves the final execution configuration for a step // by merging settings from all levels of the hierarchy. // The precedence order is: StepRun -> Story Step -> Engram -> EngramTemplate -> Operator Config -> Hardcoded Defaults. -func (cr *Resolver) ResolveExecutionConfig(ctx context.Context, step *runsv1alpha1.StepRun, story *v1alpha1.Story, engram *v1alpha1.Engram, template *catalogv1alpha1.EngramTemplate) (*ResolvedExecutionConfig, error) { +func (cr *Resolver) ResolveExecutionConfig(ctx context.Context, step *runsv1alpha1.StepRun, story *v1alpha1.Story, engram *v1alpha1.Engram, template *catalogv1alpha1.EngramTemplate, storyStep *v1alpha1.Step) (*ResolvedExecutionConfig, error) { operatorConfig := cr.configManager.GetConfig() // 1. Start with hardcoded defaults and operator-level configuration. @@ -107,7 +107,7 @@ func (cr *Resolver) ResolveExecutionConfig(ctx context.Context, step *runsv1alph cr.applyEngramConfig(engram, resolved) // 4. Apply Story-level policies and step-specific overrides. - cr.applyStoryConfig(story, step, resolved) + cr.applyStoryConfig(story, step, storyStep, resolved) // 5. Apply StepRun-specific overrides, which have the highest priority. cr.applyStepRunOverrides(step, resolved) @@ -123,7 +123,7 @@ func (cr *Resolver) ResolveExecutionConfig(ctx context.Context, step *runsv1alph // getOperatorDefaults initializes the configuration with values from the operator's config map. func (cr *Resolver) getOperatorDefaults(config *OperatorConfig) *ResolvedExecutionConfig { - return &ResolvedExecutionConfig{ + resolved := &ResolvedExecutionConfig{ ImagePullPolicy: config.Controller.ImagePullPolicy, Resources: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ @@ -149,6 +149,25 @@ func (cr *Resolver) getOperatorDefaults(config *OperatorConfig) *ResolvedExecuti BackoffBase: config.Controller.ExponentialBackoffBase, BackoffMax: config.Controller.ExponentialBackoffMax, } + + // Operator-level default storage + switch config.Controller.DefaultStorageProvider { + case "s3": + if config.Controller.DefaultS3Bucket != "" { + resolved.Storage = &v1alpha1.StoragePolicy{ + S3: &v1alpha1.S3StorageProvider{ + Bucket: config.Controller.DefaultS3Bucket, + Region: config.Controller.DefaultS3Region, + Endpoint: config.Controller.DefaultS3Endpoint, + Authentication: v1alpha1.S3Authentication{}, + }, + } + if name := config.Controller.DefaultS3AuthSecretName; name != "" { + resolved.Storage.S3.Authentication.SecretRef = &corev1.LocalObjectReference{Name: name} + } + } + } + return resolved } // applyEngramTemplateConfig applies settings from the EngramTemplate. 
@@ -410,17 +429,19 @@ func applyProbeOverrides(overrides *v1alpha1.ExecutionOverrides, config *Resolve } // applyStoryConfig applies settings from the Story -func (cr *Resolver) applyStoryConfig(story *v1alpha1.Story, stepRun *runsv1alpha1.StepRun, config *ResolvedExecutionConfig) { - if story == nil || stepRun == nil { +func (cr *Resolver) applyStoryConfig(story *v1alpha1.Story, stepRun *runsv1alpha1.StepRun, storyStep *v1alpha1.Step, config *ResolvedExecutionConfig) { + if story == nil { return } // Find the specific step in the story spec that this StepRun is executing - var currentStep *v1alpha1.Step - for i := range story.Spec.Steps { - if story.Spec.Steps[i].Name == stepRun.Spec.StepID { - currentStep = &story.Spec.Steps[i] - break + currentStep := storyStep + if currentStep == nil && stepRun != nil { + for i := range story.Spec.Steps { + if story.Spec.Steps[i].Name == stepRun.Spec.StepID { + currentStep = &story.Spec.Steps[i] + break + } } } @@ -434,6 +455,11 @@ func (cr *Resolver) applyStoryConfig(story *v1alpha1.Story, stepRun *runsv1alpha } } + // Apply step-level execution overrides (service account, security, retries, timeouts, probes). + if currentStep != nil { + cr.ApplyExecutionOverrides(currentStep.Execution, config) + } + // Wire storage policy from Story into resolved config so controllers can provision PVCs if story.Spec.Policy != nil && story.Spec.Policy.Storage != nil { config.Storage = story.Spec.Policy.Storage diff --git a/internal/controller/catalog/engramtemplate_controller.go b/internal/controller/catalog/engramtemplate_controller.go index 169f174..55d710e 100644 --- a/internal/controller/catalog/engramtemplate_controller.go +++ b/internal/controller/catalog/engramtemplate_controller.go @@ -47,19 +47,20 @@ type EngramTemplateReconciler struct { // +kubebuilder:rbac:groups=catalog.bubustack.io,resources=engramtemplates/finalizers,verbs=update // Reconcile validates and manages EngramTemplate lifecycle -func (r *EngramTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *EngramTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) { // Initialize structured logging and metrics rl := logging.NewReconcileLogger(ctx, "engramtemplate") startTime := time.Now() defer func() { duration := time.Since(startTime) - metrics.RecordControllerReconcile("engramtemplate", duration, nil) + metrics.RecordControllerReconcile("engramtemplate", duration, err) }() var template catalogv1alpha1.EngramTemplate - if err := r.Get(ctx, req.NamespacedName, &template); err != nil { - return ctrl.Result{}, client.IgnoreNotFound(err) + if err = r.Get(ctx, req.NamespacedName, &template); err != nil { + err = client.IgnoreNotFound(err) + return ctrl.Result{}, err } // Add template context to logger @@ -70,10 +71,10 @@ func (r *EngramTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Reque if missingField, handled := r.handleRequiredFields(ctx, &template); handled { r.updateErrorStatus(&template, fmt.Sprintf("%s is required", missingField)) rl.ReconcileError(fmt.Errorf("%s missing", missingField), fmt.Sprintf("%s is required for EngramTemplate", missingField)) - if err := r.updateStatus(ctx, &template); err != nil { + if err = r.updateStatus(ctx, &template); err != nil { return ctrl.Result{RequeueAfter: 5 * time.Second}, err } - return ctrl.Result{}, nil + return ctrl.Result{}, err } templateLogger.Info("Validating EngramTemplate", "image", template.Spec.Image, "version", template.Spec.Version) @@ 
-92,11 +93,11 @@ func (r *EngramTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Reque r.updateReadyStatus(&template) // List all Engrams that were created from this template (extracted helper) - if err := r.setUsageCount(ctx, &template, req.Name); err != nil { + if err = r.setUsageCount(ctx, &template, req.Name); err != nil { templateLogger.Error(err, "Failed to list engrams for template") } - if err := r.updateStatus(ctx, &template); err != nil { + if err = r.updateStatus(ctx, &template); err != nil { rl.ReconcileError(err, "Failed to update EngramTemplate status") return ctrl.Result{RequeueAfter: 5 * time.Second}, err } diff --git a/internal/controller/catalog/impulsetemplate_controller.go b/internal/controller/catalog/impulsetemplate_controller.go index 9012fde..bc08fdb 100644 --- a/internal/controller/catalog/impulsetemplate_controller.go +++ b/internal/controller/catalog/impulsetemplate_controller.go @@ -53,19 +53,20 @@ type ImpulseTemplateReconciler struct { // +kubebuilder:rbac:groups=catalog.bubustack.io,resources=impulsetemplates/finalizers,verbs=update // Reconcile validates and manages ImpulseTemplate lifecycle -func (r *ImpulseTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *ImpulseTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) { // Initialize structured logging and metrics rl := logging.NewReconcileLogger(ctx, "impulsetemplate") startTime := time.Now() defer func() { duration := time.Since(startTime) - metrics.RecordControllerReconcile("impulsetemplate", duration, nil) + metrics.RecordControllerReconcile("impulsetemplate", duration, err) }() var template catalogv1alpha1.ImpulseTemplate - if err := r.Get(ctx, req.NamespacedName, &template); err != nil { - return ctrl.Result{}, client.IgnoreNotFound(err) + if err = r.Get(ctx, req.NamespacedName, &template); err != nil { + err = client.IgnoreNotFound(err) + return ctrl.Result{}, err } // Add template context to logger @@ -76,10 +77,10 @@ func (r *ImpulseTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Requ if missing, handled := r.handleRequiredFields(&template); handled { r.updateErrorStatus(&template, fmt.Sprintf("%s is required", missing)) rl.ReconcileError(fmt.Errorf("%s missing", missing), fmt.Sprintf("%s is required for ImpulseTemplate", missing)) - if err := r.updateStatusWithRetry(ctx, &template); err != nil { + if err = r.updateStatusWithRetry(ctx, &template); err != nil { return ctrl.Result{RequeueAfter: 5 * time.Second}, err } - return ctrl.Result{}, nil + return ctrl.Result{}, err } templateLogger.Info("Validating ImpulseTemplate", "image", template.Spec.Image, "version", template.Spec.Version) @@ -88,10 +89,10 @@ func (r *ImpulseTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Requ if invalidMode, handled := r.validateSupportedModes(&template); handled { r.updateErrorStatus(&template, fmt.Sprintf("invalid supported mode '%s' for impulse template (must be deployment or statefulset)", invalidMode)) rl.ReconcileError(fmt.Errorf("invalid supported mode: %s", invalidMode), "Invalid supported mode for ImpulseTemplate") - if err := r.updateStatusWithRetry(ctx, &template); err != nil { + if err = r.updateStatusWithRetry(ctx, &template); err != nil { return ctrl.Result{RequeueAfter: 5 * time.Second}, err } - return ctrl.Result{}, nil + return ctrl.Result{}, err } templateLogger.V(1).Info("Supported modes validated", "modes", template.Spec.SupportedModes) @@ -117,7 +118,7 @@ func (r 
*ImpulseTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Requ template.Status.UsageCount = int32(len(impulses.Items)) } - if err := r.updateStatusWithRetry(ctx, &template); err != nil { + if err = r.updateStatusWithRetry(ctx, &template); err != nil { rl.ReconcileError(err, "Failed to update ImpulseTemplate status") return ctrl.Result{RequeueAfter: 5 * time.Second}, err } diff --git a/internal/controller/engram_controller.go b/internal/controller/engram_controller.go index dfe6a62..7ddb6fc 100644 --- a/internal/controller/engram_controller.go +++ b/internal/controller/engram_controller.go @@ -99,6 +99,10 @@ func (r *EngramReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res }) if patchErr != nil { + if errors.IsNotFound(patchErr) { + // Object was removed while we were patching status; nothing left to do. + return ctrl.Result{}, nil + } log.Error(patchErr, "Failed to update Engram status") return ctrl.Result{}, patchErr } diff --git a/internal/controller/impulse_controller.go b/internal/controller/impulse_controller.go index b15706f..0c1ef20 100644 --- a/internal/controller/impulse_controller.go +++ b/internal/controller/impulse_controller.go @@ -36,10 +36,12 @@ import ( catalogv1alpha1 "github.com/bubustack/bobrapet/api/catalog/v1alpha1" "github.com/bubustack/bobrapet/api/v1alpha1" config "github.com/bubustack/bobrapet/internal/config" + "github.com/bubustack/bobrapet/internal/controller/naming" "github.com/bubustack/bobrapet/pkg/conditions" "github.com/bubustack/bobrapet/pkg/enums" "github.com/bubustack/bobrapet/pkg/logging" "github.com/bubustack/bobrapet/pkg/patch" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/handler" ) @@ -64,91 +66,60 @@ type ImpulseReconciler struct { func (r *ImpulseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := logging.NewReconcileLogger(ctx, "impulse").WithValues("impulse", req.NamespacedName) - timeout := r.ConfigResolver.GetOperatorConfig().Controller.ReconcileTimeout - if timeout > 0 { - var cancel context.CancelFunc - ctx, cancel = context.WithTimeout(ctx, timeout) - defer cancel() - } + ctx, cancel := r.withReconcileTimeout(ctx) + defer cancel() - var impulse v1alpha1.Impulse - if err := r.Get(ctx, req.NamespacedName, &impulse); err != nil { + impulse, err := r.fetchImpulse(ctx, req.NamespacedName) + if err != nil { return ctrl.Result{}, client.IgnoreNotFound(err) } - if !impulse.DeletionTimestamp.IsZero() { - // Handle deletion if finalizers are added. 
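+ // Deletion (no finalizers are registered yet) and already-observed terminal phases are short-circuited via handleImpulseShortCircuit.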
+ if handled := r.handleImpulseShortCircuit(impulse); handled { return ctrl.Result{}, nil } - if impulse.Status.Phase.IsTerminal() && impulse.Status.ObservedGeneration == impulse.Generation { - return ctrl.Result{}, nil + template, stop, err := r.loadImpulseTemplate(ctx, impulse, log) + if err != nil { + return ctrl.Result{}, err } - - // Fetch the template - var template catalogv1alpha1.ImpulseTemplate - if err := r.Get(ctx, types.NamespacedName{Name: impulse.Spec.TemplateRef.Name}, &template); err != nil { - if errors.IsNotFound(err) { - log.Error(err, "ImpulseTemplate not found") - // Emit event for user visibility (guard recorder for tests) - if r.Recorder != nil { - r.Recorder.Event(&impulse, corev1.EventTypeWarning, conditions.ReasonTemplateNotFound, fmt.Sprintf("ImpulseTemplate '%s' not found", impulse.Spec.TemplateRef.Name)) - } - err := r.setImpulsePhase(ctx, &impulse, enums.PhaseBlocked, fmt.Sprintf("ImpulseTemplate '%s' not found", impulse.Spec.TemplateRef.Name)) - return ctrl.Result{}, err // Stop reconciliation, wait for watch to trigger. - } - return ctrl.Result{}, fmt.Errorf("failed to get ImpulseTemplate: %w", err) + if stop { + return ctrl.Result{}, nil } - // Resolve the full configuration - // The resolver expects an EngramTemplate, so we construct one from the ImpulseTemplate's spec. - engramTemplate := &catalogv1alpha1.EngramTemplate{ - Spec: catalogv1alpha1.EngramTemplateSpec{ - TemplateSpec: catalogv1alpha1.TemplateSpec{ - Version: template.Spec.Version, - Description: template.Spec.Description, - Image: template.Spec.Image, - SupportedModes: template.Spec.SupportedModes, - Execution: template.Spec.Execution, - SecretSchema: template.Spec.SecretSchema, - ConfigSchema: template.Spec.ConfigSchema, - }, - }, - } - resolvedConfig, err := r.ConfigResolver.ResolveExecutionConfig(ctx, nil, nil, nil, engramTemplate) + resolvedConfig, err := r.resolveImpulseConfig(ctx, impulse, template, log) if err != nil { - err := r.setImpulsePhase(ctx, &impulse, enums.PhaseFailed, fmt.Sprintf("Failed to resolve configuration: %s", err)) - if err != nil { - log.Error(err, "Failed to update impulse status") - } - return ctrl.Result{}, fmt.Errorf("failed to resolve execution config for impulse '%s': %w", impulse.Name, err) + return ctrl.Result{}, err } - // Reconcile the deployment - deployment, err := r.reconcileDeployment(ctx, &impulse, &template, resolvedConfig) - if err != nil { - err := r.setImpulsePhase(ctx, &impulse, enums.PhaseFailed, fmt.Sprintf("Failed to reconcile Deployment: %s", err)) - return ctrl.Result{}, err + deployment, reconcileErr := r.reconcileDeployment(ctx, impulse, template, resolvedConfig) + if reconcileErr != nil { + return ctrl.Result{}, r.handleDeploymentError(ctx, impulse, reconcileErr) } // Update status based on the Deployment's state - if err := r.updateImpulseStatus(ctx, &impulse, deployment); err != nil { + if err := r.updateImpulseStatus(ctx, impulse, deployment); err != nil { return ctrl.Result{}, err } + + // Reconcile the Service whenever the template exposes ports. 
+ if _, serviceErr := r.reconcileService(ctx, impulse, resolvedConfig); serviceErr != nil { + return ctrl.Result{}, r.handleServiceError(ctx, impulse, serviceErr) + } return ctrl.Result{}, nil } func (r *ImpulseReconciler) reconcileDeployment(ctx context.Context, impulse *v1alpha1.Impulse, template *catalogv1alpha1.ImpulseTemplate, execCfg *config.ResolvedExecutionConfig) (*appsv1.Deployment, error) { log := logging.NewReconcileLogger(ctx, "impulse-deployment").WithValues("impulse", impulse.Name) deployment := &appsv1.Deployment{} - deploymentName := fmt.Sprintf("%s-%s-impulse", impulse.Name, impulse.Spec.TemplateRef.Name) + deploymentName := naming.Compose(impulse.Name, impulse.Spec.TemplateRef.Name, "impulse") deploymentKey := types.NamespacedName{Name: deploymentName, Namespace: impulse.Namespace} + engramTemplate := convertImpulseTemplate(template) err := r.Get(ctx, deploymentKey, deployment) if err != nil { if errors.IsNotFound(err) { // Create it - newDeployment := r.buildDeploymentForImpulse(impulse, template, execCfg) + newDeployment := r.buildDeploymentForImpulse(impulse, engramTemplate, execCfg) if err := controllerutil.SetControllerReference(impulse, newDeployment, r.Scheme); err != nil { return nil, fmt.Errorf("failed to set owner reference on Deployment: %w", err) } @@ -162,7 +133,7 @@ func (r *ImpulseReconciler) reconcileDeployment(ctx context.Context, impulse *v1 } // It exists, so check for updates. - desiredDeployment := r.buildDeploymentForImpulse(impulse, template, execCfg) + desiredDeployment := r.buildDeploymentForImpulse(impulse, engramTemplate, execCfg) // A simple way to check for differences is to compare the pod templates. // For more complex scenarios, a 3-way merge or more specific field comparisons might be needed. if !reflect.DeepEqual(deployment.Spec.Template, desiredDeployment.Spec.Template) { @@ -177,6 +148,92 @@ func (r *ImpulseReconciler) reconcileDeployment(ctx context.Context, impulse *v1 return deployment, nil } +func (r *ImpulseReconciler) withReconcileTimeout(ctx context.Context) (context.Context, context.CancelFunc) { + timeout := r.ConfigResolver.GetOperatorConfig().Controller.ReconcileTimeout + if timeout <= 0 { + return ctx, func() {} + } + return context.WithTimeout(ctx, timeout) +} + +func (r *ImpulseReconciler) fetchImpulse(ctx context.Context, key types.NamespacedName) (*v1alpha1.Impulse, error) { + var impulse v1alpha1.Impulse + if err := r.Get(ctx, key, &impulse); err != nil { + return nil, err + } + return &impulse, nil +} + +func (r *ImpulseReconciler) handleImpulseShortCircuit(impulse *v1alpha1.Impulse) bool { + if !impulse.DeletionTimestamp.IsZero() { + return true + } + if impulse.Status.Phase.IsTerminal() && impulse.Status.ObservedGeneration == impulse.Generation { + return true + } + return false +} + +func (r *ImpulseReconciler) loadImpulseTemplate(ctx context.Context, impulse *v1alpha1.Impulse, log *logging.ControllerLogger) (*catalogv1alpha1.ImpulseTemplate, bool, error) { + var template catalogv1alpha1.ImpulseTemplate + err := r.Get(ctx, types.NamespacedName{Name: impulse.Spec.TemplateRef.Name}, &template) + if err == nil { + return &template, false, nil + } + if errors.IsNotFound(err) { + log.Error(err, "ImpulseTemplate not found") + if r.Recorder != nil { + r.Recorder.Event(impulse, corev1.EventTypeWarning, conditions.ReasonTemplateNotFound, fmt.Sprintf("ImpulseTemplate '%s' not found", impulse.Spec.TemplateRef.Name)) + } + if patchErr := r.setImpulsePhase(ctx, impulse, enums.PhaseBlocked, fmt.Sprintf("ImpulseTemplate '%s' not 
found", impulse.Spec.TemplateRef.Name)); patchErr != nil { + return nil, false, patchErr + } + return nil, true, nil + } + return nil, false, fmt.Errorf("failed to get ImpulseTemplate: %w", err) +} + +func (r *ImpulseReconciler) resolveImpulseConfig(ctx context.Context, impulse *v1alpha1.Impulse, template *catalogv1alpha1.ImpulseTemplate, log *logging.ControllerLogger) (*config.ResolvedExecutionConfig, error) { + engramTemplate := convertImpulseTemplate(template) + resolvedConfig, resolveErr := r.ConfigResolver.ResolveExecutionConfig(ctx, nil, nil, nil, engramTemplate, nil) + if resolveErr != nil { + if r.Recorder != nil { + r.Recorder.Event(impulse, corev1.EventTypeWarning, conditions.ReasonInvalidConfiguration, fmt.Sprintf("Failed to resolve execution config: %s", resolveErr)) + } + if patchErr := r.setImpulsePhase(ctx, impulse, enums.PhaseFailed, fmt.Sprintf("Failed to resolve configuration: %s", resolveErr)); patchErr != nil { + log.Error(patchErr, "Failed to update impulse status") + return nil, patchErr + } + return nil, fmt.Errorf("failed to resolve execution config for impulse '%s': %w", impulse.Name, resolveErr) + } + + if len(impulse.Spec.Secrets) > 0 { + if resolvedConfig.Secrets == nil { + resolvedConfig.Secrets = make(map[string]string, len(impulse.Spec.Secrets)) + } + for key, value := range impulse.Spec.Secrets { + resolvedConfig.Secrets[key] = value + } + } + + r.ConfigResolver.ApplyExecutionOverrides(impulse.Spec.Execution, resolvedConfig) + return resolvedConfig, nil +} + +func (r *ImpulseReconciler) handleDeploymentError(ctx context.Context, impulse *v1alpha1.Impulse, reconcileErr error) error { + if patchErr := r.setImpulsePhase(ctx, impulse, enums.PhaseFailed, fmt.Sprintf("Failed to reconcile Deployment: %s", reconcileErr)); patchErr != nil { + return patchErr + } + return fmt.Errorf("failed to reconcile Deployment for impulse '%s': %w", impulse.Name, reconcileErr) +} + +func (r *ImpulseReconciler) handleServiceError(ctx context.Context, impulse *v1alpha1.Impulse, serviceErr error) error { + if patchErr := r.setImpulsePhase(ctx, impulse, enums.PhaseFailed, fmt.Sprintf("Failed to reconcile Service: %s", serviceErr)); patchErr != nil { + return patchErr + } + return fmt.Errorf("failed to reconcile Service for impulse '%s': %w", impulse.Name, serviceErr) +} + func (r *ImpulseReconciler) updateImpulseStatus(ctx context.Context, impulse *v1alpha1.Impulse, deployment *appsv1.Deployment) error { var newPhase enums.Phase var message string @@ -249,8 +306,8 @@ func (r *ImpulseReconciler) setImpulsePhase(ctx context.Context, impulse *v1alph } // buildDeploymentForImpulse creates a new Deployment object for an Impulse -func (r *ImpulseReconciler) buildDeploymentForImpulse(impulse *v1alpha1.Impulse, _ *catalogv1alpha1.ImpulseTemplate, execCfg *config.ResolvedExecutionConfig) *appsv1.Deployment { - deploymentName := fmt.Sprintf("%s-%s-impulse", impulse.Name, impulse.Spec.TemplateRef.Name) +func (r *ImpulseReconciler) buildDeploymentForImpulse(impulse *v1alpha1.Impulse, engramTemplate *catalogv1alpha1.EngramTemplate, execCfg *config.ResolvedExecutionConfig) *appsv1.Deployment { + deploymentName := naming.Compose(impulse.Name, impulse.Spec.TemplateRef.Name, "impulse") labels := map[string]string{ "app.kubernetes.io/name": "bobrapet-impulse", "app.kubernetes.io/instance": impulse.Name, @@ -272,6 +329,30 @@ func (r *ImpulseReconciler) buildDeploymentForImpulse(impulse *v1alpha1.Impulse, envVars = append(envVars, corev1.EnvVar{Name: "BUBU_IMPULSE_WITH", Value: 
string(impulse.Spec.With.Raw)}) } + podSpec := corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: labels, + }, + Spec: corev1.PodSpec{ + ServiceAccountName: execCfg.ServiceAccountName, + AutomountServiceAccountToken: ptr.To(execCfg.AutomountServiceAccountToken), + SecurityContext: execCfg.ToPodSecurityContext(), + Containers: []corev1.Container{{ + Name: "impulse", + Image: execCfg.Image, + ImagePullPolicy: execCfg.ImagePullPolicy, + LivenessProbe: execCfg.LivenessProbe, + ReadinessProbe: execCfg.ReadinessProbe, + StartupProbe: execCfg.StartupProbe, + SecurityContext: execCfg.ToContainerSecurityContext(), + Resources: execCfg.Resources, + Env: envVars, + }}, + }, + } + applyStorageEnv(execCfg, &podSpec.Spec.Containers[0]) + applySecretArtifacts(engramTemplate, execCfg, &podSpec) + deployment := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ Name: deploymentName, @@ -283,25 +364,134 @@ func (r *ImpulseReconciler) buildDeploymentForImpulse(impulse *v1alpha1.Impulse, Selector: &metav1.LabelSelector{ MatchLabels: labels, }, - Template: corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Labels: labels, - }, - Spec: corev1.PodSpec{ - ServiceAccountName: execCfg.ServiceAccountName, - Containers: []corev1.Container{{ - Name: "impulse", - Image: execCfg.Image, - Env: envVars, - }}, - }, - }, + Template: podSpec, }, } return deployment } +func (r *ImpulseReconciler) reconcileService(ctx context.Context, impulse *v1alpha1.Impulse, execCfg *config.ResolvedExecutionConfig) (*corev1.Service, error) { + if execCfg == nil || len(execCfg.ServicePorts) == 0 { + return nil, nil + } + + log := logging.NewReconcileLogger(ctx, "impulse-service").WithValues("impulse", impulse.Name) + serviceName := naming.Compose(impulse.Name, impulse.Spec.TemplateRef.Name, "impulse") + serviceKey := types.NamespacedName{Name: serviceName, Namespace: impulse.Namespace} + service := &corev1.Service{} + + if err := r.Get(ctx, serviceKey, service); err != nil { + if !errors.IsNotFound(err) { + return nil, fmt.Errorf("failed to get Service: %w", err) + } + newSvc := r.buildServiceForImpulse(impulse, execCfg, serviceName) + if err := controllerutil.SetControllerReference(impulse, newSvc, r.Scheme); err != nil { + return nil, fmt.Errorf("failed to set owner reference on Service: %w", err) + } + if err := r.Create(ctx, newSvc); err != nil { + return nil, fmt.Errorf("failed to create Service: %w", err) + } + log.Info("Created new Service for Impulse") + return newSvc, nil + } + + desired := r.buildServiceForImpulse(impulse, execCfg, serviceName) + desired.Spec.ClusterIP = service.Spec.ClusterIP + desired.Spec.ClusterIPs = append([]string(nil), service.Spec.ClusterIPs...) + desired.Spec.IPFamilies = append([]corev1.IPFamily(nil), service.Spec.IPFamilies...) 
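+ // Server-assigned fields are copied from the live Service so the spec comparison below only reports real drift.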
+ desired.Spec.IPFamilyPolicy = service.Spec.IPFamilyPolicy + desired.Spec.HealthCheckNodePort = service.Spec.HealthCheckNodePort + desired.ResourceVersion = service.ResourceVersion + + if !reflect.DeepEqual(service.Spec, desired.Spec) || + !reflect.DeepEqual(service.Labels, desired.Labels) || + !reflect.DeepEqual(service.Annotations, desired.Annotations) { + original := service.DeepCopy() + service.Spec = desired.Spec + service.Labels = desired.Labels + service.Annotations = desired.Annotations + if err := r.Patch(ctx, service, client.MergeFrom(original)); err != nil { + return nil, fmt.Errorf("failed to update Service: %w", err) + } + log.Info("Updated Service for Impulse") + } + + return service, nil +} + +func (r *ImpulseReconciler) buildServiceForImpulse(impulse *v1alpha1.Impulse, execCfg *config.ResolvedExecutionConfig, serviceName string) *corev1.Service { + selectorLabels := map[string]string{ + "app.kubernetes.io/name": "bobrapet-impulse", + "app.kubernetes.io/instance": impulse.Name, + "app.kubernetes.io/managed-by": "bobrapet-operator", + } + + serviceLabels := make(map[string]string, len(selectorLabels)+len(execCfg.ServiceLabels)) + for k, v := range selectorLabels { + serviceLabels[k] = v + } + for k, v := range execCfg.ServiceLabels { + serviceLabels[k] = v + } + if impulse.Spec.Service != nil && len(impulse.Spec.Service.Labels) > 0 { + for k, v := range impulse.Spec.Service.Labels { + serviceLabels[k] = v + } + } + + annotations := make(map[string]string, len(execCfg.ServiceAnnotations)) + for k, v := range execCfg.ServiceAnnotations { + annotations[k] = v + } + if impulse.Spec.Service != nil && len(impulse.Spec.Service.Annotations) > 0 { + for k, v := range impulse.Spec.Service.Annotations { + annotations[k] = v + } + } + + ports := make([]corev1.ServicePort, len(execCfg.ServicePorts)) + copy(ports, execCfg.ServicePorts) + + serviceType := corev1.ServiceTypeClusterIP + if impulse.Spec.Service != nil && impulse.Spec.Service.Type != "" { + serviceType = corev1.ServiceType(impulse.Spec.Service.Type) + } + + return &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceName, + Namespace: impulse.Namespace, + Labels: serviceLabels, + Annotations: annotations, + }, + Spec: corev1.ServiceSpec{ + Type: serviceType, + Selector: selectorLabels, + Ports: ports, + }, + } +} + +func convertImpulseTemplate(template *catalogv1alpha1.ImpulseTemplate) *catalogv1alpha1.EngramTemplate { + if template == nil { + return nil + } + return &catalogv1alpha1.EngramTemplate{ + Spec: catalogv1alpha1.EngramTemplateSpec{ + TemplateSpec: catalogv1alpha1.TemplateSpec{ + Version: template.Spec.Version, + Description: template.Spec.Description, + Image: template.Spec.Image, + SupportedModes: template.Spec.SupportedModes, + Execution: template.Spec.Execution, + SecretSchema: template.Spec.SecretSchema, + ConfigSchema: template.Spec.ConfigSchema, + }, + }, + } +} + // SetupWithManager sets up the controller with the Manager. func (r *ImpulseReconciler) SetupWithManager(mgr ctrl.Manager, opts controller.Options) error { r.Recorder = mgr.GetEventRecorderFor("impulse-controller") @@ -314,6 +504,7 @@ func (r *ImpulseReconciler) SetupWithManager(mgr ctrl.Manager, opts controller.O WithOptions(opts). Owns(&appsv1.Deployment{}). Owns(&appsv1.StatefulSet{}). + Owns(&corev1.Service{}). 
Complete(r) } diff --git a/internal/controller/impulse_controller_test.go b/internal/controller/impulse_controller_test.go index 369b4b2..c0484a2 100644 --- a/internal/controller/impulse_controller_test.go +++ b/internal/controller/impulse_controller_test.go @@ -60,6 +60,7 @@ var _ = Describe("Impulse Controller", func() { TemplateSpec: catalogv1alpha1.TemplateSpec{ Version: "1.0.0", SupportedModes: []enums.WorkloadMode{enums.WorkloadModeDeployment}, + Image: "ghcr.io/bubustack/impulse-default:latest", }, }, } diff --git a/internal/controller/mergeutil/with.go b/internal/controller/mergeutil/with.go new file mode 100644 index 0000000..090830d --- /dev/null +++ b/internal/controller/mergeutil/with.go @@ -0,0 +1,105 @@ +package mergeutil + +import ( + "encoding/json" + "fmt" + + "k8s.io/apimachinery/pkg/runtime" +) + +// MergeWithBlocks merges two runtime.RawExtension values representing JSON "with" blocks. +// Values from the stepWith overlay the engramWith while preserving nested structures. +func MergeWithBlocks(engramWith, stepWith *runtime.RawExtension) (*runtime.RawExtension, error) { + if engramWith == nil { + return stepWith, nil + } + if stepWith == nil { + return engramWith, nil + } + + var engramMap, stepMap map[string]any + if err := json.Unmarshal(engramWith.Raw, &engramMap); err != nil { + return nil, fmt.Errorf("failed to unmarshal engram 'with' block: %w", err) + } + if err := json.Unmarshal(stepWith.Raw, &stepMap); err != nil { + return nil, fmt.Errorf("failed to unmarshal step 'with' block: %w", err) + } + + merged := deepMergeJSONMaps(engramMap, stepMap) + + mergedBytes, err := json.Marshal(merged) + if err != nil { + return nil, fmt.Errorf("failed to marshal merged 'with' block: %w", err) + } + + return &runtime.RawExtension{Raw: mergedBytes}, nil +} + +func deepMergeJSONMaps(base, overlay map[string]any) map[string]any { + if base == nil && overlay == nil { + return nil + } + out := make(map[string]any) + for k, v := range base { + out[k] = deepCloneJSONValue(v) + } + for k, v := range overlay { + if existing, ok := out[k]; ok { + out[k] = mergeJSONValue(existing, v) + continue + } + out[k] = deepCloneJSONValue(v) + } + return out +} + +func mergeJSONValue(base, overlay any) any { + baseMap, baseOK := toStringAnyMap(base) + overlayMap, overlayOK := toStringAnyMap(overlay) + if baseOK && overlayOK { + return deepMergeJSONMaps(baseMap, overlayMap) + } + return deepCloneJSONValue(overlay) +} + +func deepCloneJSONValue(value any) any { + switch v := value.(type) { + case map[string]any: + return deepMergeJSONMaps(v, nil) + case []any: + out := make([]any, len(v)) + for i := range v { + out[i] = deepCloneJSONValue(v[i]) + } + return out + case string, float64, bool, nil: + return v + default: + // For numbers unmarshalled as json.Number or other interfaces, fall back to JSON marshal/unmarshal clone. 
+ bytes, err := json.Marshal(v) + if err != nil { + return v + } + var clone any + if err := json.Unmarshal(bytes, &clone); err != nil { + return v + } + return clone + } +} + +func toStringAnyMap(value any) (map[string]any, bool) { + switch v := value.(type) { + case map[string]any: + return v, true + case map[any]any: + out := make(map[string]any, len(v)) + for key, val := range v { + strKey := fmt.Sprintf("%v", key) + out[strKey] = val + } + return out, true + default: + return nil, false + } +} diff --git a/internal/controller/naming/naming.go b/internal/controller/naming/naming.go new file mode 100644 index 0000000..566e49c --- /dev/null +++ b/internal/controller/naming/naming.go @@ -0,0 +1,64 @@ +package naming + +import ( + "fmt" + "hash/fnv" + "strings" + + "k8s.io/apimachinery/pkg/util/validation" +) + +const maxDNS1123Length = validation.DNS1123LabelMaxLength + +// Compose builds a DNS-1123 compliant resource name from the provided parts. +// When the concatenated name exceeds 63 characters, the prefix is truncated and +// a deterministic hash suffix is appended so the result remains stable across reconciliations. +func Compose(parts ...string) string { + base := strings.Join(parts, "-") + if len(base) <= maxDNS1123Length { + return base + } + + hash := fnv.New32a() + for _, part := range parts { + _, _ = hash.Write([]byte(part)) + _, _ = hash.Write([]byte{0}) // separator to avoid collisions on different segment splits + } + suffix := fmt.Sprintf("%08x", hash.Sum32()) + + // Reserve space for "-" plus the hash suffix. + prefixLen := maxDNS1123Length - len(suffix) - 1 + if prefixLen < 1 { + prefixLen = maxDNS1123Length - len(suffix) + } + if prefixLen < 1 { + // Fallback: suffix alone (truncated if needed) + if len(suffix) > maxDNS1123Length { + return suffix[:maxDNS1123Length] + } + return suffix + } + + prefix := base[:prefixLen] + prefix = strings.TrimSuffix(prefix, "-") + if len(prefix) == 0 { + prefix = base[:prefixLen] + prefix = strings.Trim(prefix, "-") + if len(prefix) == 0 { + prefix = "resource" + } + } + + result := fmt.Sprintf("%s-%s", prefix, suffix) + if len(result) > maxDNS1123Length { + result = result[:maxDNS1123Length] + result = strings.TrimSuffix(result, "-") + if len(result) == 0 { + if len(suffix) > maxDNS1123Length { + return suffix[:maxDNS1123Length] + } + return suffix + } + } + return result +} diff --git a/internal/controller/naming/naming_test.go b/internal/controller/naming/naming_test.go new file mode 100644 index 0000000..6bf4776 --- /dev/null +++ b/internal/controller/naming/naming_test.go @@ -0,0 +1,25 @@ +package naming + +import "testing" + +func TestComposeWithinLimit(t *testing.T) { + name := Compose("story", "step") + if name != "story-step" { + t.Fatalf("expected story-step, got %s", name) + } + if len(name) > maxDNS1123Length { + t.Fatalf("expected length <= %d, got %d", maxDNS1123Length, len(name)) + } +} + +func TestComposeTruncatesAndHashes(t *testing.T) { + longPart := "this-is-a-very-long-name-component-that-will-force-truncation" + name := Compose(longPart, longPart, longPart) + if len(name) > maxDNS1123Length { + t.Fatalf("expected length <= %d, got %d", maxDNS1123Length, len(name)) + } + // ensure hash suffix present (8 hex chars) + if len(name) < 9 { + t.Fatalf("expected hashed suffix, got %s", name) + } +} diff --git a/internal/controller/realtime_engram_controller.go b/internal/controller/realtime_engram_controller.go index 003460d..2d13a5e 100644 --- a/internal/controller/realtime_engram_controller.go +++ 
b/internal/controller/realtime_engram_controller.go @@ -24,6 +24,7 @@ import ( catalogv1alpha1 "github.com/bubustack/bobrapet/api/catalog/v1alpha1" "github.com/bubustack/bobrapet/api/v1alpha1" "github.com/bubustack/bobrapet/internal/config" + "github.com/bubustack/bobrapet/internal/controller/secretutil" "github.com/bubustack/bobrapet/pkg/conditions" "github.com/bubustack/bobrapet/pkg/enums" "github.com/bubustack/bobrapet/pkg/logging" @@ -246,7 +247,7 @@ func (r *RealtimeEngramReconciler) ensureRunnerServiceAccount(ctx context.Contex } // reconcileWorkload dispatches to the proper workload reconciler based on mode. -func (r *RealtimeEngramReconciler) reconcileWorkload(ctx context.Context, engram *v1alpha1.Engram, resolved *config.ResolvedExecutionConfig) error { +func (r *RealtimeEngramReconciler) reconcileWorkload(ctx context.Context, engram *v1alpha1.Engram, template *catalogv1alpha1.EngramTemplate, resolved *config.ResolvedExecutionConfig) error { mode := engram.Spec.Mode if mode == "" { mode = enums.WorkloadModeJob @@ -254,9 +255,10 @@ func (r *RealtimeEngramReconciler) reconcileWorkload(ctx context.Context, engram if mode == enums.WorkloadModeJob { return nil } + switch mode { case enums.WorkloadModeDeployment: - return r.reconcileDeployment(ctx, engram, resolved) + return r.reconcileDeployment(ctx, engram, template, resolved) case enums.WorkloadModeStatefulSet: return r.reconcileStatefulSet(ctx, engram, resolved) default: @@ -283,7 +285,7 @@ func (r *RealtimeEngramReconciler) reconcileRealtime(ctx context.Context, engram } // Resolve execution config - resolvedConfig, err := r.ConfigResolver.ResolveExecutionConfig(ctx, nil, nil, engram, template) + resolvedConfig, err := r.ConfigResolver.ResolveExecutionConfig(ctx, nil, nil, engram, template, nil) if err != nil { log.Error(err, "Failed to resolve execution config") return ctrl.Result{}, err @@ -295,7 +297,7 @@ func (r *RealtimeEngramReconciler) reconcileRealtime(ctx context.Context, engram } // Reconcile workload and service - if err := r.reconcileWorkload(ctx, engram, resolvedConfig); err != nil { + if err := r.reconcileWorkload(ctx, engram, template, resolvedConfig); err != nil { return ctrl.Result{}, err } if err := r.reconcileService(ctx, engram, resolvedConfig); err != nil { @@ -378,13 +380,13 @@ func (r *RealtimeEngramReconciler) reconcileDelete(ctx context.Context, engram * return nil } -func (r *RealtimeEngramReconciler) reconcileDeployment(ctx context.Context, engram *v1alpha1.Engram, execCfg *config.ResolvedExecutionConfig) error { +func (r *RealtimeEngramReconciler) reconcileDeployment(ctx context.Context, engram *v1alpha1.Engram, template *catalogv1alpha1.EngramTemplate, execCfg *config.ResolvedExecutionConfig) error { log := logging.NewReconcileLogger(ctx, "realtime_engram").WithValues("engram", engram.Name) deployment := &appsv1.Deployment{} err := r.Get(ctx, types.NamespacedName{Name: engram.Name, Namespace: engram.Namespace}, deployment) if err != nil && errors.IsNotFound(err) { log.Info("Creating a new Deployment", "Deployment.Namespace", engram.Namespace, "Deployment.Name", engram.Name) - newDep := r.deploymentForEngram(ctx, engram, execCfg) + newDep := r.deploymentForEngram(ctx, engram, template, execCfg) if err := r.Create(ctx, newDep); err != nil { log.Error(err, "Failed to create new Deployment") return err @@ -395,7 +397,7 @@ func (r *RealtimeEngramReconciler) reconcileDeployment(ctx context.Context, engr } // Update logic - desiredDep := r.deploymentForEngram(ctx, engram, execCfg) + desiredDep := 
r.deploymentForEngram(ctx, engram, template, execCfg) if !reflect.DeepEqual(deployment.Spec.Template, desiredDep.Spec.Template) || *deployment.Spec.Replicas != *desiredDep.Spec.Replicas { log.Info("Deployment spec differs, updating deployment") // Retry on conflict using a fresh GET + merge to avoid clobbering cluster-managed fields @@ -613,27 +615,17 @@ func (r *RealtimeEngramReconciler) reconcileService(ctx context.Context, engram return nil } -func (r *RealtimeEngramReconciler) deploymentForEngram(ctx context.Context, engram *v1alpha1.Engram, execCfg *config.ResolvedExecutionConfig) *appsv1.Deployment { +func (r *RealtimeEngramReconciler) deploymentForEngram(ctx context.Context, engram *v1alpha1.Engram, template *catalogv1alpha1.EngramTemplate, execCfg *config.ResolvedExecutionConfig) *appsv1.Deployment { labels := map[string]string{"app": engram.Name, "bubustack.io/engram": engram.Name} replicas := int32(1) - - // Serialize the engram's 'with' configuration to JSON for the SDK - var configEnvVars []corev1.EnvVar - if engram.Spec.With != nil && len(engram.Spec.With.Raw) > 0 { - configEnvVars = append(configEnvVars, corev1.EnvVar{ - Name: "BUBU_CONFIG", - Value: string(engram.Spec.With.Raw), - }) - } - - // Set terminationGracePeriodSeconds to coordinate with SDK graceful shutdown timeout terminationGracePeriod := r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig.DefaultTerminationGracePeriodSeconds engramConfig := r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig + envVars := buildRealtimeBaseEnv(engramConfig) + envVars = append(envVars, buildConfigEnvVars(engram)...) + podSpec := corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Labels: labels, - }, + ObjectMeta: metav1.ObjectMeta{Labels: labels}, Spec: corev1.PodSpec{ ServiceAccountName: execCfg.ServiceAccountName, TerminationGracePeriodSeconds: &terminationGracePeriod, @@ -644,37 +636,10 @@ func (r *RealtimeEngramReconciler) deploymentForEngram(ctx context.Context, engr ImagePullPolicy: execCfg.ImagePullPolicy, SecurityContext: execCfg.ToContainerSecurityContext(), Ports: []corev1.ContainerPort{{ - ContainerPort: int32(r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig.DefaultGRPCPort), + ContainerPort: int32(engramConfig.DefaultGRPCPort), Name: "grpc", }}, - Env: append([]corev1.EnvVar{ - {Name: "BUBU_MODE", Value: "deployment"}, - // Ensure SDK auto-detects streaming execution mode - {Name: "BUBU_EXECUTION_MODE", Value: "streaming"}, - // Expose port for SDK gRPC server; aligns with Service port - {Name: "BUBU_GRPC_PORT", Value: fmt.Sprintf("%d", r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig.DefaultGRPCPort)}, - {Name: "BUBU_POD_NAME", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{FieldPath: "metadata.name"}}}, - {Name: "BUBU_POD_NAMESPACE", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{FieldPath: "metadata.namespace"}}}, - // Set inline size limit for storage offloading (consistency with batch StepRun jobs) - {Name: "BUBU_MAX_INLINE_SIZE", Value: fmt.Sprintf("%d", engramConfig.DefaultMaxInlineSize)}, - // Set storage operation timeout for large file uploads/downloads - {Name: "BUBU_STORAGE_TIMEOUT", Value: fmt.Sprintf("%ds", engramConfig.DefaultStorageTimeoutSeconds)}, - // Set graceful shutdown timeout to coordinate with terminationGracePeriodSeconds - {Name: "BUBU_GRPC_GRACEFUL_SHUTDOWN_TIMEOUT", Value: fmt.Sprintf("%ds", 
engramConfig.DefaultGracefulShutdownTimeoutSeconds)}, - // Newly added SDK tuning parameters - {Name: "BUBU_GRPC_MAX_RECV_BYTES", Value: fmt.Sprintf("%d", engramConfig.DefaultMaxRecvMsgBytes)}, - {Name: "BUBU_GRPC_MAX_SEND_BYTES", Value: fmt.Sprintf("%d", engramConfig.DefaultMaxSendMsgBytes)}, - {Name: "BUBU_GRPC_CLIENT_MAX_RECV_BYTES", Value: fmt.Sprintf("%d", engramConfig.DefaultMaxRecvMsgBytes)}, - {Name: "BUBU_GRPC_CLIENT_MAX_SEND_BYTES", Value: fmt.Sprintf("%d", engramConfig.DefaultMaxSendMsgBytes)}, - {Name: "BUBU_GRPC_DIAL_TIMEOUT", Value: fmt.Sprintf("%ds", engramConfig.DefaultDialTimeoutSeconds)}, - {Name: "BUBU_GRPC_CHANNEL_BUFFER_SIZE", Value: fmt.Sprintf("%d", engramConfig.DefaultChannelBufferSize)}, - {Name: "BUBU_GRPC_RECONNECT_MAX_RETRIES", Value: fmt.Sprintf("%d", engramConfig.DefaultReconnectMaxRetries)}, - {Name: "BUBU_GRPC_RECONNECT_BASE_BACKOFF", Value: fmt.Sprintf("%dms", engramConfig.DefaultReconnectBaseBackoffMillis)}, - {Name: "BUBU_GRPC_RECONNECT_MAX_BACKOFF", Value: fmt.Sprintf("%ds", engramConfig.DefaultReconnectMaxBackoffSeconds)}, - {Name: "BUBU_GRPC_HANG_TIMEOUT", Value: fmt.Sprintf("%ds", engramConfig.DefaultHangTimeoutSeconds)}, - {Name: "BUBU_GRPC_MESSAGE_TIMEOUT", Value: fmt.Sprintf("%ds", engramConfig.DefaultMessageTimeoutSeconds)}, - {Name: "BUBU_GRPC_CHANNEL_SEND_TIMEOUT", Value: fmt.Sprintf("%ds", engramConfig.DefaultMessageTimeoutSeconds)}, // Re-use message timeout - }, configEnvVars...), + Env: envVars, LivenessProbe: execCfg.LivenessProbe, ReadinessProbe: execCfg.ReadinessProbe, StartupProbe: execCfg.StartupProbe, @@ -682,43 +647,117 @@ func (r *RealtimeEngramReconciler) deploymentForEngram(ctx context.Context, engr }, } - // Handle object storage configuration - if storagePolicy := execCfg.Storage; storagePolicy != nil && storagePolicy.S3 != nil { - s3Config := storagePolicy.S3 - // log with reconcile context - logging.NewControllerLogger(ctx, "realtime_engram").WithValues("engram", engram.Name).Info("Configuring pod for S3 object storage access", "bucket", s3Config.Bucket) + r.applySecretArtifactsToRealtimePod(template, execCfg, &podSpec) + r.configureRealtimeStorage(ctx, engram, execCfg, &podSpec) + r.configureRealtimeTLS(ctx, engram, &podSpec) - // Add environment variables for the SDK - podSpec.Spec.Containers[0].Env = append(podSpec.Spec.Containers[0].Env, - corev1.EnvVar{Name: "BUBU_STORAGE_PROVIDER", Value: "s3"}, - corev1.EnvVar{Name: "BUBU_STORAGE_S3_BUCKET", Value: s3Config.Bucket}, - ) - if s3Config.Region != "" { - podSpec.Spec.Containers[0].Env = append(podSpec.Spec.Containers[0].Env, corev1.EnvVar{Name: "BUBU_STORAGE_S3_REGION", Value: s3Config.Region}) - } - if s3Config.Endpoint != "" { - podSpec.Spec.Containers[0].Env = append(podSpec.Spec.Containers[0].Env, corev1.EnvVar{Name: "BUBU_STORAGE_S3_ENDPOINT", Value: s3Config.Endpoint}) - } + return r.newRealtimeDeployment(ctx, engram, labels, replicas, podSpec) +} - // Handle secret-based authentication - if auth := &s3Config.Authentication; auth.SecretRef != nil { - secretName := auth.SecretRef.Name - logging.NewControllerLogger(ctx, "realtime_engram").WithValues("engram", engram.Name).Info("Using S3 secret reference for authentication", "secretName", secretName) - podSpec.Spec.Containers[0].EnvFrom = append(podSpec.Spec.Containers[0].EnvFrom, corev1.EnvFromSource{ - SecretRef: &corev1.SecretEnvSource{ - LocalObjectReference: corev1.LocalObjectReference{Name: secretName}, - }, - }) - } +func buildConfigEnvVars(engram *v1alpha1.Engram) []corev1.EnvVar { + if engram.Spec.With == nil || 
len(engram.Spec.With.Raw) == 0 { + return nil } + return []corev1.EnvVar{ + {Name: "BUBU_CONFIG", Value: string(engram.Spec.With.Raw)}, + } +} - // Configure TLS via user-provided secret if annotated +func buildRealtimeBaseEnv(cfg config.EngramControllerConfig) []corev1.EnvVar { + return []corev1.EnvVar{ + {Name: "BUBU_MODE", Value: "deployment"}, + {Name: "BUBU_EXECUTION_MODE", Value: "streaming"}, + {Name: "BUBU_GRPC_PORT", Value: fmt.Sprintf("%d", cfg.DefaultGRPCPort)}, + {Name: "BUBU_MAX_RECURSION_DEPTH", Value: "64"}, + downwardEnvVar("BUBU_POD_NAME", "metadata.name"), + downwardEnvVar("BUBU_POD_NAMESPACE", "metadata.namespace"), + downwardEnvVar("POD_NAME", "metadata.name"), + downwardEnvVar("POD_NAMESPACE", "metadata.namespace"), + downwardEnvVar("SERVICE_ACCOUNT_NAME", "spec.serviceAccountName"), + {Name: "BUBU_MAX_INLINE_SIZE", Value: fmt.Sprintf("%d", cfg.DefaultMaxInlineSize)}, + {Name: "BUBU_STORAGE_TIMEOUT", Value: fmt.Sprintf("%ds", cfg.DefaultStorageTimeoutSeconds)}, + {Name: "BUBU_GRPC_GRACEFUL_SHUTDOWN_TIMEOUT", Value: fmt.Sprintf("%ds", cfg.DefaultGracefulShutdownTimeoutSeconds)}, + {Name: "BUBU_GRPC_MAX_RECV_BYTES", Value: fmt.Sprintf("%d", cfg.DefaultMaxRecvMsgBytes)}, + {Name: "BUBU_GRPC_MAX_SEND_BYTES", Value: fmt.Sprintf("%d", cfg.DefaultMaxSendMsgBytes)}, + {Name: "BUBU_GRPC_CLIENT_MAX_RECV_BYTES", Value: fmt.Sprintf("%d", cfg.DefaultMaxRecvMsgBytes)}, + {Name: "BUBU_GRPC_CLIENT_MAX_SEND_BYTES", Value: fmt.Sprintf("%d", cfg.DefaultMaxSendMsgBytes)}, + {Name: "BUBU_GRPC_DIAL_TIMEOUT", Value: fmt.Sprintf("%ds", cfg.DefaultDialTimeoutSeconds)}, + {Name: "BUBU_GRPC_CHANNEL_BUFFER_SIZE", Value: fmt.Sprintf("%d", cfg.DefaultChannelBufferSize)}, + {Name: "BUBU_GRPC_RECONNECT_MAX_RETRIES", Value: fmt.Sprintf("%d", cfg.DefaultReconnectMaxRetries)}, + {Name: "BUBU_GRPC_RECONNECT_BASE_BACKOFF", Value: fmt.Sprintf("%dms", cfg.DefaultReconnectBaseBackoffMillis)}, + {Name: "BUBU_GRPC_RECONNECT_MAX_BACKOFF", Value: fmt.Sprintf("%ds", cfg.DefaultReconnectMaxBackoffSeconds)}, + {Name: "BUBU_GRPC_HANG_TIMEOUT", Value: fmt.Sprintf("%ds", cfg.DefaultHangTimeoutSeconds)}, + {Name: "BUBU_GRPC_MESSAGE_TIMEOUT", Value: fmt.Sprintf("%ds", cfg.DefaultMessageTimeoutSeconds)}, + {Name: "BUBU_GRPC_CHANNEL_SEND_TIMEOUT", Value: fmt.Sprintf("%ds", cfg.DefaultMessageTimeoutSeconds)}, + } +} + +func downwardEnvVar(name, fieldPath string) corev1.EnvVar { + return corev1.EnvVar{ + Name: name, + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{FieldPath: fieldPath}, + }, + } +} + +func (r *RealtimeEngramReconciler) applySecretArtifactsToRealtimePod(template *catalogv1alpha1.EngramTemplate, execCfg *config.ResolvedExecutionConfig, podSpec *corev1.PodTemplateSpec) { + if podSpec == nil || len(podSpec.Spec.Containers) == 0 || execCfg.Secrets == nil || template == nil || template.Spec.SecretSchema == nil { + return + } + artifacts := secretutil.BuildArtifacts(template.Spec.SecretSchema, execCfg.Secrets) + podSpec.Spec.Volumes = append(podSpec.Spec.Volumes, artifacts.Volumes...) + + container := &podSpec.Spec.Containers[0] + container.Env = append(container.Env, artifacts.EnvVars...) + container.EnvFrom = append(container.EnvFrom, artifacts.EnvFrom...) + container.VolumeMounts = append(container.VolumeMounts, artifacts.VolumeMounts...) 
+} + +func (r *RealtimeEngramReconciler) configureRealtimeStorage(ctx context.Context, engram *v1alpha1.Engram, execCfg *config.ResolvedExecutionConfig, podSpec *corev1.PodTemplateSpec) { + if podSpec == nil || len(podSpec.Spec.Containers) == 0 || execCfg.Storage == nil || execCfg.Storage.S3 == nil { + return + } + + s3Config := execCfg.Storage.S3 + logger := logging.NewControllerLogger(ctx, "realtime_engram").WithValues("engram", engram.Name) + logger.Info("Configuring pod for S3 object storage access", "bucket", s3Config.Bucket) + + container := &podSpec.Spec.Containers[0] + container.Env = append(container.Env, + corev1.EnvVar{Name: "BUBU_STORAGE_PROVIDER", Value: "s3"}, + corev1.EnvVar{Name: "BUBU_STORAGE_S3_BUCKET", Value: s3Config.Bucket}, + ) + if s3Config.Region != "" { + container.Env = append(container.Env, corev1.EnvVar{Name: "BUBU_STORAGE_S3_REGION", Value: s3Config.Region}) + } + if s3Config.Endpoint != "" { + container.Env = append(container.Env, corev1.EnvVar{Name: "BUBU_STORAGE_S3_ENDPOINT", Value: s3Config.Endpoint}) + } + + if s3Config.Authentication.SecretRef != nil { + secretName := s3Config.Authentication.SecretRef.Name + logger.Info("Using S3 secret reference for authentication", "secretName", secretName) + container.EnvFrom = append(container.EnvFrom, corev1.EnvFromSource{ + SecretRef: &corev1.SecretEnvSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: secretName}, + }, + }) + } +} + +func (r *RealtimeEngramReconciler) configureRealtimeTLS(ctx context.Context, engram *v1alpha1.Engram, podSpec *corev1.PodTemplateSpec) { + if podSpec == nil { + return + } if sec := getTLSSecretName(&engram.ObjectMeta); sec != "" { - if err := r.maybeConfigureTLSEnvAndMounts(ctx, engram.Namespace, sec, &podSpec, 0); err != nil { + if err := r.maybeConfigureTLSEnvAndMounts(ctx, engram.Namespace, sec, podSpec, 0); err != nil { logging.NewReconcileLogger(ctx, "realtime_engram").WithValues("engram", engram.Name).Error(err, "Failed to configure TLS mounts/envs for Deployment") } } +} +func (r *RealtimeEngramReconciler) newRealtimeDeployment(ctx context.Context, engram *v1alpha1.Engram, labels map[string]string, replicas int32, podSpec corev1.PodTemplateSpec) *appsv1.Deployment { dep := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ Name: engram.Name, @@ -726,9 +765,7 @@ func (r *RealtimeEngramReconciler) deploymentForEngram(ctx context.Context, engr }, Spec: appsv1.DeploymentSpec{ Replicas: &replicas, - Selector: &metav1.LabelSelector{ - MatchLabels: labels, - }, + Selector: &metav1.LabelSelector{MatchLabels: labels}, Template: podSpec, }, } @@ -770,6 +807,7 @@ func (r *RealtimeEngramReconciler) serviceForEngram(ctx context.Context, engram // SetupWithManager sets up the controller with the Manager. func (r *RealtimeEngramReconciler) SetupWithManager(mgr ctrl.Manager, opts controller.Options) error { return ctrl.NewControllerManagedBy(mgr). + Named("realtime-engram"). For(&v1alpha1.Engram{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). Owns(&appsv1.Deployment{}). Owns(&appsv1.StatefulSet{}). 
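Note on the hunk above: it swaps the long inline env list for a precomputed envVars slice, but the assembly site itself falls outside the diff context. The lines below are a rough sketch of how the new helpers are presumably wired together inside deploymentForEngram; names such as engramConfig are assumptions for illustration, not taken from this change.

    // Sketch only — assumed composition; the actual call site is elided from this diff.
    // buildRealtimeBaseEnv and buildConfigEnvVars are the helpers introduced above.
    envVars := buildRealtimeBaseEnv(engramConfig)            // mode, gRPC tuning, downward-API vars
    envVars = append(envVars, buildConfigEnvVars(engram)...) // BUBU_CONFIG from spec.with, when present
    // the pod template then sets Env: envVars, as shown in the earlier hunk

Keeping the base env in one helper concentrates the gRPC tuning knobs in a single place, and the downward-API fields go through downwardEnvVar instead of repeating EnvVarSource literals per variable.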
diff --git a/internal/controller/runs/dag.go b/internal/controller/runs/dag.go index 2b6e580..f2deb9d 100644 --- a/internal/controller/runs/dag.go +++ b/internal/controller/runs/dag.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "regexp" + "strings" runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" bubuv1alpha1 "github.com/bubustack/bobrapet/api/v1alpha1" @@ -13,6 +14,7 @@ import ( "github.com/bubustack/bobrapet/pkg/conditions" "github.com/bubustack/bobrapet/pkg/enums" "github.com/bubustack/bobrapet/pkg/logging" + "github.com/bubustack/bobrapet/pkg/metrics" "github.com/bubustack/bobrapet/pkg/patch" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -137,21 +139,36 @@ func (r *DAGReconciler) checkCompletionOrFailure(ctx context.Context, srun *runs func (r *DAGReconciler) persistStepStates(ctx context.Context, srun *runsv1alpha1.StoryRun) error { return patch.RetryableStatusPatch(ctx, r.Client, srun, func(obj client.Object) { sr := obj.(*runsv1alpha1.StoryRun) - sr.Status.StepStates = srun.Status.StepStates + sr.Status.StepStates = cloneStepStates(srun.Status.StepStates) }) } func (r *DAGReconciler) persistMergedStates(ctx context.Context, srun *runsv1alpha1.StoryRun) error { return patch.RetryableStatusPatch(ctx, r.Client, srun, func(obj client.Object) { sr := obj.(*runsv1alpha1.StoryRun) + if sr.Status.StepStates == nil { + sr.Status.StepStates = make(map[string]runsv1alpha1.StepState, len(srun.Status.StepStates)) + } for k, v := range srun.Status.StepStates { sr.Status.StepStates[k] = v } }) } +func cloneStepStates(in map[string]runsv1alpha1.StepState) map[string]runsv1alpha1.StepState { + if len(in) == 0 { + return make(map[string]runsv1alpha1.StepState) + } + out := make(map[string]runsv1alpha1.StepState, len(in)) + for k, v := range in { + out[k] = v + } + return out +} + func (r *DAGReconciler) syncStateFromStepRuns(ctx context.Context, srun *runsv1alpha1.StoryRun) (*runsv1alpha1.StepRunList, error) { log := logging.NewReconcileLogger(ctx, "storyrun-dag-sync") + r.initStepStatesIfNeeded(srun) var stepRunList runsv1alpha1.StepRunList if err := r.List(ctx, &stepRunList, client.InNamespace(srun.Namespace), client.MatchingLabels{"bubustack.io/storyrun": srun.Name}); err != nil { log.Error(err, "Failed to list StepRuns") @@ -168,9 +185,6 @@ func (r *DAGReconciler) syncStateFromStepRuns(ctx context.Context, srun *runsv1a Phase: sr.Status.Phase, Message: sr.Status.LastFailureMsg, } - // DEPRECATED: Do not propagate StepRun outputs into the parent StoryRun. - // This was causing a storage hazard where the StoryRun could exceed etcd's - // size limit. Step outputs are now resolved on-demand from StepRun objects. 
} } return &stepRunList, nil @@ -231,6 +245,7 @@ func (r *DAGReconciler) checkSyncSubStories(ctx context.Context, srun *runsv1alp func (r *DAGReconciler) findAndLaunchReadySteps(ctx context.Context, srun *runsv1alpha1.StoryRun, story *bubuv1alpha1.Story, completedSteps, runningSteps map[string]bool, priorStepOutputs map[string]any) ([]*bubuv1alpha1.Step, []*bubuv1alpha1.Step, error) { log := logging.NewReconcileLogger(ctx, "storyrun-dag-launcher") + r.initStepStatesIfNeeded(srun) dependencies, _ := buildDependencyGraphs(story.Spec.Steps) storyRunInputs, _ := getStoryRunInputs(srun) @@ -256,7 +271,9 @@ func (r *DAGReconciler) findAndLaunchReadySteps(ctx context.Context, srun *runsv // Propagate the error up to the main reconcile loop to trigger backoff return nil, nil, fmt.Errorf("failed to execute step %s: %w", step.Name, err) } - srun.Status.StepStates[step.Name] = runsv1alpha1.StepState{Phase: enums.PhaseRunning} + if state, ok := srun.Status.StepStates[step.Name]; !ok || state.Phase == "" { + srun.Status.StepStates[step.Name] = runsv1alpha1.StepState{Phase: enums.PhaseRunning} + } } return readySteps, skippedSteps, nil @@ -316,7 +333,15 @@ func collectOutputsFromStepRuns(stepRunList *runsv1alpha1.StepRunList, outputs m log.Error(err, "Failed to unmarshal output from prior StepRun during fallback", "step", sr.Spec.StepID) continue } - outputs[sr.Spec.StepID] = map[string]any{"outputs": outputData} + stepContext := map[string]any{ + "outputs": outputData, + "output": outputData, + } + if len(sr.Status.Manifest) > 0 { + stepContext["manifest"] = sr.Status.Manifest + applyManifestPlaceholders(outputData, sr.Status.Manifest) + } + outputs[sr.Spec.StepID] = stepContext } } } @@ -342,8 +367,95 @@ func collectOutputsFromSubStories(ctx context.Context, c client.Client, srun *ru log.Error(err, "Failed to unmarshal output from sub-StoryRun", "step", stepID) continue } - outputs[stepID] = map[string]any{"outputs": outputData} + stepContext := map[string]any{ + "outputs": outputData, + "output": outputData, + } + outputs[stepID] = stepContext + } + } +} + +func applyManifestPlaceholders(outputs map[string]any, manifest map[string]runsv1alpha1.StepManifestData) { + if outputs == nil || len(manifest) == 0 { + return + } + + for path, data := range manifest { + if path == "" || path == manifestRootPath { + applyRootManifestMetadata(outputs, data) + continue + } + if strings.Contains(path, "[") { + continue + } + + if sample, ok := decodeManifestSample(data.Sample); ok { + ensurePathValue(outputs, path, sample) + continue + } + + if data.Length != nil { + length := int(*data.Length) + if length < 0 { + length = 0 + } + placeholder := make([]any, length) + ensurePathValue(outputs, path, placeholder) + continue + } + + if data.Exists != nil && *data.Exists { + ensurePathValue(outputs, path, map[string]any{}) + } + } +} + +func applyRootManifestMetadata(outputs map[string]any, data runsv1alpha1.StepManifestData) { + if outputs == nil { + return + } + if data.Length != nil { + outputs[cel.ManifestLengthKey] = *data.Length + } +} + +func decodeManifestSample(raw *runtime.RawExtension) (any, bool) { + if raw == nil || len(raw.Raw) == 0 { + return nil, false + } + var out any + if err := json.Unmarshal(raw.Raw, &out); err != nil { + return nil, false + } + return out, true +} + +func ensurePathValue(root map[string]any, path string, value any) { + segments := strings.Split(path, ".") + current := root + for i, segment := range segments { + if segment == "" { + continue + } + if i == len(segments)-1 { + if _, 
exists := current[segment]; !exists { + current[segment] = value + } + return + } + next, ok := current[segment] + if !ok { + sub := make(map[string]any) + current[segment] = sub + current = sub + continue + } + nextMap, ok := next.(map[string]any) + if !ok { + return } + current = nextMap } } @@ -596,6 +708,7 @@ func (r *DAGReconciler) setStoryRunPhase(ctx context.Context, srun *runsv1alpha1 sr.Status.FinishedAt = &now if sr.Status.StartedAt != nil { sr.Status.Duration = now.Sub(sr.Status.StartedAt.Time).String() + metrics.RecordStoryRunMetrics(sr.Namespace, sr.Spec.StoryRef.Name, string(phase), now.Sub(sr.Status.StartedAt.Time)) } } cm.SetCondition(&sr.Status.Conditions, conditions.ConditionReady, metav1.ConditionTrue, conditions.ReasonCompleted, message) @@ -605,6 +718,7 @@ func (r *DAGReconciler) setStoryRunPhase(ctx context.Context, srun *runsv1alpha1 sr.Status.FinishedAt = &now if sr.Status.StartedAt != nil { sr.Status.Duration = now.Sub(sr.Status.StartedAt.Time).String() + metrics.RecordStoryRunMetrics(sr.Namespace, sr.Spec.StoryRef.Name, string(phase), now.Sub(sr.Status.StartedAt.Time)) } } cm.SetCondition(&sr.Status.Conditions, conditions.ConditionReady, metav1.ConditionFalse, conditions.ReasonExecutionFailed, message) diff --git a/internal/controller/runs/dag_test.go b/internal/controller/runs/dag_test.go new file mode 100644 index 0000000..4027380 --- /dev/null +++ b/internal/controller/runs/dag_test.go @@ -0,0 +1,117 @@ +package runs + +import ( + "context" + "testing" + "time" + + runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" + bubuv1alpha1 "github.com/bubustack/bobrapet/api/v1alpha1" + "github.com/bubustack/bobrapet/pkg/cel" + "github.com/bubustack/bobrapet/pkg/enums" +) + +func TestApplyManifestPlaceholdersInsertsLengthPlaceholders(t *testing.T) { + outputs := map[string]any{ + "$bubuStorageRef": "outputs/ref.json", + } + length := int64(3) + manifest := map[string]runsv1alpha1.StepManifestData{ + "tools": { + Length: &length, + }, + } + + applyManifestPlaceholders(outputs, manifest) + + val, ok := outputs["tools"] + if !ok { + t.Fatalf("expected tools placeholder to be present") + } + slice, ok := val.([]any) + if !ok { + t.Fatalf("expected tools placeholder to be []any, got %T", val) + } + if len(slice) != int(length) { + t.Fatalf("expected placeholder length %d, got %d", length, len(slice)) + } +} + +func TestApplyManifestPlaceholdersAnnotatesRootLength(t *testing.T) { + outputs := map[string]any{} + length := int64(7) + manifest := map[string]runsv1alpha1.StepManifestData{ + manifestRootPath: { + Length: &length, + }, + } + + applyManifestPlaceholders(outputs, manifest) + + val, ok := outputs[cel.ManifestLengthKey] + if !ok { + t.Fatalf("expected manifest length metadata to be present") + } + intVal, ok := val.(int64) + if !ok { + t.Fatalf("expected manifest length metadata to be int64, got %T", val) + } + if intVal != length { + t.Fatalf("expected manifest length %d, got %d", length, intVal) + } +} + +type celNopLogger struct{} + +func (celNopLogger) CacheHit(string, string) {} +func (celNopLogger) EvaluationStart(string, string) {} +func (celNopLogger) EvaluationError(error, string, string, time.Duration) {} +func (celNopLogger) EvaluationSuccess(string, string, time.Duration, any) {} + +func TestFindAndLaunchReadyStepsInitializesStepStates(t *testing.T) { + eval, err := cel.New(celNopLogger{}) + if err != nil { + t.Fatalf("failed to create evaluator: %v", err) + } + t.Cleanup(eval.Close) + + reconciler := &DAGReconciler{CEL: eval} + + ifCondition := 
"false" + story := &bubuv1alpha1.Story{ + Spec: bubuv1alpha1.StorySpec{ + Steps: []bubuv1alpha1.Step{{ + Name: "skip-me", + Type: enums.StepTypeSetData, + If: &ifCondition, + }}, + }, + } + + srun := &runsv1alpha1.StoryRun{} + ready, skipped, err := reconciler.findAndLaunchReadySteps( + context.Background(), + srun, + story, + map[string]bool{}, + map[string]bool{}, + map[string]any{}, + ) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(ready) != 0 { + t.Fatalf("expected no ready steps, got %d", len(ready)) + } + if len(skipped) != 1 { + t.Fatalf("expected exactly one skipped step, got %d", len(skipped)) + } + + state, ok := srun.Status.StepStates["skip-me"] + if !ok { + t.Fatalf("expected skip-me to have a StepState entry") + } + if state.Phase != enums.PhaseSkipped { + t.Fatalf("expected step to be marked skipped, got phase %s", state.Phase) + } +} diff --git a/internal/controller/runs/rbac.go b/internal/controller/runs/rbac.go index 77d0dbe..b7bafca 100644 --- a/internal/controller/runs/rbac.go +++ b/internal/controller/runs/rbac.go @@ -9,7 +9,7 @@ import ( "github.com/bubustack/bobrapet/pkg/logging" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -31,7 +31,17 @@ func NewRBACManager(k8sClient client.Client, scheme *runtime.Scheme) *RBACManage // Reconcile ensures the necessary ServiceAccount, Role, and RoleBinding exist for the StoryRun. func (r *RBACManager) Reconcile(ctx context.Context, storyRun *runsv1alpha1.StoryRun) error { log := logging.NewReconcileLogger(ctx, "storyrun-rbac") - story, _ := r.getStoryForRun(ctx, storyRun) + story, err := r.getStoryForRun(ctx, storyRun) + if err != nil { + if apierrors.IsNotFound(err) { + log.WithStoryRun(storyRun).Info("Parent Story not found; continuing RBAC reconciliation without story-scoped annotations") + } else { + log.WithStoryRun(storyRun).Error(err, "Failed to fetch parent Story for RBAC provisioning") + return fmt.Errorf("failed to get Story %s for StoryRun %s: %w", + storyRun.Spec.StoryRef.ToNamespacedName(storyRun), storyRun.Name, err) + } + story = nil + } saName := fmt.Sprintf("%s-engram-runner", storyRun.Name) if err := r.reconcileServiceAccount(ctx, storyRun, story, saName, log); err != nil { @@ -67,34 +77,43 @@ func (r *RBACManager) reconcileServiceAccount(ctx context.Context, storyRun *run } func (r *RBACManager) reconcileRole(ctx context.Context, storyRun *runsv1alpha1.StoryRun, saName string, log *logging.ReconcileLogger) error { - role := &rbacv1.Role{ - ObjectMeta: metav1.ObjectMeta{Name: saName, Namespace: storyRun.Namespace}, - Rules: []rbacv1.PolicyRule{{APIGroups: []string{"runs.bubustack.io"}, Resources: []string{"stepruns"}, Verbs: []string{"get", "watch"}}, {APIGroups: []string{"runs.bubustack.io"}, Resources: []string{"stepruns/status"}, Verbs: []string{"patch", "update"}}}, - } - if err := controllerutil.SetOwnerReference(storyRun, role, r.Scheme); err != nil { - return fmt.Errorf("failed to set owner reference on Role: %w", err) + role := &rbacv1.Role{ObjectMeta: metav1.ObjectMeta{Name: saName, Namespace: storyRun.Namespace}} + op, err := controllerutil.CreateOrUpdate(ctx, r.Client, role, func() error { + role.Rules = []rbacv1.PolicyRule{ + {APIGroups: []string{"runs.bubustack.io"}, Resources: []string{"stepruns"}, Verbs: []string{"get", "watch"}}, + {APIGroups: 
[]string{"runs.bubustack.io"}, Resources: []string{"stepruns/status"}, Verbs: []string{"patch", "update"}}, + } + return controllerutil.SetOwnerReference(storyRun, role, r.Scheme) + }) + if err != nil { + return fmt.Errorf("failed to create or update Role: %w", err) } - if err := r.Create(ctx, role); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("failed to create Role: %w", err) - } else if err == nil { - log.Info("Created Role for Engram runner", "role", role.Name) + if op != controllerutil.OperationResultNone { + log.Info("Reconciled Role for Engram runner", "role", role.Name, "operation", op) } return nil } func (r *RBACManager) reconcileRoleBinding(ctx context.Context, storyRun *runsv1alpha1.StoryRun, saName string, log *logging.ReconcileLogger) error { - rb := &rbacv1.RoleBinding{ - ObjectMeta: metav1.ObjectMeta{Name: saName, Namespace: storyRun.Namespace}, - Subjects: []rbacv1.Subject{{Kind: "ServiceAccount", Name: saName, Namespace: storyRun.Namespace}}, - RoleRef: rbacv1.RoleRef{APIGroup: "rbac.authorization.k8s.io", Kind: "Role", Name: saName}, - } - if err := controllerutil.SetOwnerReference(storyRun, rb, r.Scheme); err != nil { - return fmt.Errorf("failed to set owner reference on RoleBinding: %w", err) + rb := &rbacv1.RoleBinding{ObjectMeta: metav1.ObjectMeta{Name: saName, Namespace: storyRun.Namespace}} + op, err := controllerutil.CreateOrUpdate(ctx, r.Client, rb, func() error { + rb.Subjects = []rbacv1.Subject{{ + Kind: "ServiceAccount", + Name: saName, + Namespace: storyRun.Namespace, + }} + rb.RoleRef = rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "Role", + Name: saName, + } + return controllerutil.SetOwnerReference(storyRun, rb, r.Scheme) + }) + if err != nil { + return fmt.Errorf("failed to create or update RoleBinding: %w", err) } - if err := r.Create(ctx, rb); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("failed to create RoleBinding: %w", err) - } else if err == nil { - log.Info("Created RoleBinding for Engram runner", "roleBinding", rb.Name) + if op != controllerutil.OperationResultNone { + log.Info("Reconciled RoleBinding for Engram runner", "roleBinding", rb.Name, "operation", op) } return nil } diff --git a/internal/controller/runs/step_executor.go b/internal/controller/runs/step_executor.go index 2af26ce..11cf693 100644 --- a/internal/controller/runs/step_executor.go +++ b/internal/controller/runs/step_executor.go @@ -1,15 +1,25 @@ package runs import ( + "bytes" "context" "encoding/json" "fmt" + "reflect" + "regexp" + "sort" + "strconv" + "strings" runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" bubuv1alpha1 "github.com/bubustack/bobrapet/api/v1alpha1" + "github.com/bubustack/bobrapet/internal/config" + "github.com/bubustack/bobrapet/internal/controller/mergeutil" + "github.com/bubustack/bobrapet/internal/controller/naming" "github.com/bubustack/bobrapet/pkg/cel" "github.com/bubustack/bobrapet/pkg/enums" "github.com/bubustack/bobrapet/pkg/logging" + "github.com/bubustack/bobrapet/pkg/observability" refs "github.com/bubustack/bobrapet/pkg/refs" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -17,22 +27,59 @@ import ( "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + "go.opentelemetry.io/otel/attribute" ) // StepExecutor is responsible for executing individual steps in a StoryRun. 
type StepExecutor struct { client.Client - Scheme *runtime.Scheme - CEL *cel.Evaluator + Scheme *runtime.Scheme + CEL *cel.Evaluator + ConfigResolver *config.Resolver } +var ( + manifestLenPattern = regexp.MustCompile(`len\(\s*steps\.([a-zA-Z0-9_\-]+)\.output((?:\.[a-zA-Z0-9_\-]+|\[[^]]+\])*)\s*\)`) + manifestPathPattern = regexp.MustCompile(`steps\.([a-zA-Z0-9_\-]+)\.output((?:\.[a-zA-Z0-9_\-]+|\[[^]]+\])*)`) + stepAliasPattern = regexp.MustCompile(`steps\.([a-zA-Z0-9_\-]+)`) +) + +const ( + manifestRootPath = "$" + defaultMaxLoopIterations = 100 +) + // NewStepExecutor creates a new StepExecutor. -func NewStepExecutor(k8sClient client.Client, scheme *runtime.Scheme, celEval *cel.Evaluator) *StepExecutor { - return &StepExecutor{Client: k8sClient, Scheme: scheme, CEL: celEval} +func NewStepExecutor(k8sClient client.Client, scheme *runtime.Scheme, celEval *cel.Evaluator, cfgResolver *config.Resolver) *StepExecutor { + return &StepExecutor{Client: k8sClient, Scheme: scheme, CEL: celEval, ConfigResolver: cfgResolver} } // Execute determines the step type and calls the appropriate execution method. -func (e *StepExecutor) Execute(ctx context.Context, srun *runsv1alpha1.StoryRun, story *bubuv1alpha1.Story, step *bubuv1alpha1.Step, vars map[string]any) error { +func (e *StepExecutor) Execute(ctx context.Context, srun *runsv1alpha1.StoryRun, story *bubuv1alpha1.Story, step *bubuv1alpha1.Step, vars map[string]any) (err error) { + ensureStoryStepStateMap(srun) + stepType := string(step.Type) + if stepType == "" { + stepType = "primitive" + if step.Ref != nil { + stepType = "engram" + } + } + ctx, span := observability.StartSpan(ctx, "StepExecutor.Execute", + attribute.String("namespace", srun.Namespace), + attribute.String("storyrun", srun.Name), + attribute.String("story", story.Name), + attribute.String("step", step.Name), + attribute.String("step_type", stepType), + attribute.Int("needs.count", len(step.Needs)), + ) + defer span.End() + defer func() { + if err != nil { + span.RecordError(err) + } + }() + // An Engram step is defined by the presence of the 'ref' field. if step.Ref != nil { return e.executeEngramStep(ctx, srun, story, step) @@ -41,7 +88,7 @@ func (e *StepExecutor) Execute(ctx context.Context, srun *runsv1alpha1.StoryRun, // If 'ref' is not present, a 'type' must be specified. 
switch step.Type { case enums.StepTypeExecuteStory: - return e.executeStoryStep(ctx, srun, step) + return e.executeStoryStep(ctx, srun, step, vars) case enums.StepTypeLoop: return e.executeLoopStep(ctx, srun, story, step, vars) case enums.StepTypeParallel: @@ -60,73 +107,143 @@ func (e *StepExecutor) Execute(ctx context.Context, srun *runsv1alpha1.StoryRun, } func (e *StepExecutor) executeEngramStep(ctx context.Context, srun *runsv1alpha1.StoryRun, story *bubuv1alpha1.Story, step *bubuv1alpha1.Step) error { - stepName := fmt.Sprintf("%s-%s", srun.Name, step.Name) + stepName := naming.Compose(srun.Name, step.Name) + requestedManifest := e.computeManifestRequests(story, step.Name) + desiredTimeout, desiredRetry := extractExecutionOverrides(step) + var stepRun runsv1alpha1.StepRun err := e.Get(ctx, types.NamespacedName{Name: stepName, Namespace: srun.Namespace}, &stepRun) + if errors.IsNotFound(err) { + return e.createEngramStepRun(ctx, srun, story, step, stepName, desiredTimeout, desiredRetry, requestedManifest) + } + if err != nil { + return err + } - if err != nil && errors.IsNotFound(err) { - if step.Ref == nil { - return fmt.Errorf("step '%s' is missing a 'ref'", step.Name) - } + return e.patchEngramStepRun(ctx, &stepRun, desiredTimeout, desiredRetry, requestedManifest) +} - // Get the engram to merge the 'with' blocks - var engram bubuv1alpha1.Engram - if err := e.Get(ctx, step.Ref.ToNamespacedName(srun), &engram); err != nil { - return fmt.Errorf("failed to get engram '%s' for step '%s': %w", step.Ref.Name, step.Name, err) - } +func extractExecutionOverrides(step *bubuv1alpha1.Step) (string, *bubuv1alpha1.RetryPolicy) { + if step.Execution == nil { + return "", nil + } - mergedWith, err := e.mergeWithBlocks(engram.Spec.With, step.With) - if err != nil { - return fmt.Errorf("failed to merge 'with' blocks for step '%s': %w", step.Name, err) - } + var timeout string + if step.Execution.Timeout != nil { + timeout = *step.Execution.Timeout + } - stepRun = runsv1alpha1.StepRun{ - ObjectMeta: metav1.ObjectMeta{ - Name: stepName, - Namespace: srun.Namespace, - Labels: map[string]string{ - "bubustack.io/storyrun": srun.Name, - "bubustack.io/story-name": story.Name, - }, + var retry *bubuv1alpha1.RetryPolicy + if step.Execution.Retry != nil { + retry = step.Execution.Retry.DeepCopy() + } + return timeout, retry +} + +func (e *StepExecutor) createEngramStepRun( + ctx context.Context, + srun *runsv1alpha1.StoryRun, + story *bubuv1alpha1.Story, + step *bubuv1alpha1.Step, + stepName string, + timeout string, + retry *bubuv1alpha1.RetryPolicy, + requestedManifest []runsv1alpha1.ManifestRequest, +) error { + if step.Ref == nil { + return fmt.Errorf("step '%s' is missing a 'ref'", step.Name) + } + + var engram bubuv1alpha1.Engram + if err := e.Get(ctx, step.Ref.ToNamespacedName(srun), &engram); err != nil { + return fmt.Errorf("failed to get engram '%s' for step '%s': %w", step.Ref.Name, step.Name, err) + } + + mergedWith, err := mergeutil.MergeWithBlocks(engram.Spec.With, step.With) + if err != nil { + return fmt.Errorf("failed to merge 'with' blocks for step '%s': %w", step.Name, err) + } + + stepRun := runsv1alpha1.StepRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: stepName, + Namespace: srun.Namespace, + Labels: map[string]string{ + "bubustack.io/storyrun": srun.Name, + "bubustack.io/story-name": story.Name, }, - Spec: runsv1alpha1.StepRunSpec{ - StoryRunRef: refs.StoryRunReference{ - ObjectReference: refs.ObjectReference{Name: srun.Name}, - }, - StepID: step.Name, - EngramRef: step.Ref, - Input: 
mergedWith, + }, + Spec: runsv1alpha1.StepRunSpec{ + StoryRunRef: refs.StoryRunReference{ + ObjectReference: refs.ObjectReference{Name: srun.Name}, }, + StepID: step.Name, + EngramRef: step.Ref, + Input: mergedWith, + Timeout: timeout, + Retry: retry, + RequestedManifest: requestedManifest, + }, + } + + if err := controllerutil.SetControllerReference(srun, &stepRun, e.Scheme); err != nil { + return err + } + return e.Create(ctx, &stepRun) +} + +func (e *StepExecutor) patchEngramStepRun( + ctx context.Context, + stepRun *runsv1alpha1.StepRun, + timeout string, + retry *bubuv1alpha1.RetryPolicy, + requestedManifest []runsv1alpha1.ManifestRequest, +) error { + before := stepRun.DeepCopy() + needsPatch := false + + if stepRun.Spec.Timeout != timeout { + stepRun.Spec.Timeout = timeout + needsPatch = true + } + + if !reflect.DeepEqual(stepRun.Spec.Retry, retry) { + if retry == nil { + stepRun.Spec.Retry = nil + } else { + stepRun.Spec.Retry = retry.DeepCopy() } - if err := controllerutil.SetControllerReference(srun, &stepRun, e.Scheme); err != nil { - return err - } - return e.Create(ctx, &stepRun) + needsPatch = true + } + + if len(requestedManifest) > 0 && !manifestRequestsEqual(stepRun.Spec.RequestedManifest, requestedManifest) { + stepRun.Spec.RequestedManifest = requestedManifest + needsPatch = true + } + + if !needsPatch { + return nil } - return err // Return other errors or nil if found + + return e.Patch(ctx, stepRun, client.MergeFrom(before)) } -func (e *StepExecutor) executeLoopStep(ctx context.Context, srun *runsv1alpha1.StoryRun, _ *bubuv1alpha1.Story, step *bubuv1alpha1.Step, vars map[string]any) error { +func (e *StepExecutor) executeLoopStep(ctx context.Context, srun *runsv1alpha1.StoryRun, story *bubuv1alpha1.Story, step *bubuv1alpha1.Step, vars map[string]any) error { type loopWith struct { Items json.RawMessage `json:"items"` Template bubuv1alpha1.Step `json:"template"` } - var config loopWith - if err := json.Unmarshal(step.With.Raw, &config); err != nil { + var loopCfg loopWith + if err := json.Unmarshal(step.With.Raw, &loopCfg); err != nil { return fmt.Errorf("failed to parse 'with' for loop step '%s': %w", step.Name, err) } - var itemsMap any - if err := json.Unmarshal(config.Items, &itemsMap); err != nil { - // fallback to string - var str string - if err := json.Unmarshal(config.Items, &str); err != nil { - return fmt.Errorf("failed to parse 'items' for loop step '%s': %w", step.Name, err) - } - itemsMap = map[string]any{"value": str} + var itemsSpec any + if err := json.Unmarshal(loopCfg.Items, &itemsSpec); err != nil { + return fmt.Errorf("failed to parse 'items' for loop step '%s': %w", step.Name, err) } - resolvedItemsRaw, err := e.CEL.ResolveWithInputs(ctx, itemsMap.(map[string]any), vars) + resolvedItemsRaw, err := e.resolveLoopItems(ctx, itemsSpec, vars) if err != nil { return fmt.Errorf("failed to resolve 'items' for loop step '%s': %w", step.Name, err) } @@ -136,14 +253,19 @@ func (e *StepExecutor) executeLoopStep(ctx context.Context, srun *runsv1alpha1.S return fmt.Errorf("'items' in loop step '%s' did not resolve to a list: %w", step.Name, err) } - const maxIterations = 100 + maxIterations := defaultMaxLoopIterations + if e.ConfigResolver != nil { + if cfg := e.ConfigResolver.GetOperatorConfig(); cfg != nil && cfg.Controller.MaxLoopIterations > 0 { + maxIterations = cfg.Controller.MaxLoopIterations + } + } if len(resolvedItems) > maxIterations { return fmt.Errorf("loop step '%s' exceeds maximum of %d iterations", step.Name, maxIterations) } childStepRunNames 
:= make([]string, 0, len(resolvedItems)) for i, item := range resolvedItems { - childStepName := fmt.Sprintf("%s-%s-%d", srun.Name, step.Name, i) + childStepName := naming.Compose(srun.Name, step.Name, strconv.Itoa(i)) childStepRunNames = append(childStepRunNames, childStepName) loopVars := map[string]any{ @@ -153,18 +275,12 @@ func (e *StepExecutor) executeLoopStep(ctx context.Context, srun *runsv1alpha1.S "index": i, } - var withMap any - if config.Template.With != nil { - if err := json.Unmarshal(config.Template.With.Raw, &withMap); err != nil { - return fmt.Errorf("failed to parse 'with' for loop template in step '%s': %w", step.Name, err) - } - } - - resolvedWith, err := e.CEL.ResolveWithInputs(ctx, withMap.(map[string]any), loopVars) + resolvedWithBytes, err := e.resolveTemplateWith(ctx, loopCfg.Template.With, loopVars) if err != nil { return fmt.Errorf("failed to resolve 'with' for loop iteration %d in step '%s': %w", i, step.Name, err) } - resolvedWithBytes, _ := json.Marshal(resolvedWith) + + requestedManifest := e.computeManifestRequests(story, loopCfg.Template.Name) stepRun := &runsv1alpha1.StepRun{ ObjectMeta: metav1.ObjectMeta{ @@ -173,16 +289,14 @@ func (e *StepExecutor) executeLoopStep(ctx context.Context, srun *runsv1alpha1.S Labels: map[string]string{"bubustack.io/storyrun": srun.Name, "bubustack.io/parent-step": step.Name}, }, Spec: runsv1alpha1.StepRunSpec{ - StoryRunRef: refs.StoryRunReference{ObjectReference: refs.ObjectReference{Name: srun.Name}}, - StepID: config.Template.Name, - EngramRef: config.Template.Ref, - Input: &runtime.RawExtension{Raw: resolvedWithBytes}, + StoryRunRef: refs.StoryRunReference{ObjectReference: refs.ObjectReference{Name: srun.Name}}, + StepID: loopCfg.Template.Name, + EngramRef: loopCfg.Template.Ref, + Input: resolvedWithBytes, + RequestedManifest: requestedManifest, }, } - if err := controllerutil.SetControllerReference(srun, stepRun, e.Scheme); err != nil { - return err - } - if err := e.Create(ctx, stepRun); err != nil && !errors.IsAlreadyExists(err) { + if err := e.createOrUpdateChildStepRun(ctx, srun, stepRun); err != nil { return err } } @@ -195,32 +309,26 @@ func (e *StepExecutor) executeLoopStep(ctx context.Context, srun *runsv1alpha1.S return nil } -func (e *StepExecutor) executeParallelStep(ctx context.Context, srun *runsv1alpha1.StoryRun, _ *bubuv1alpha1.Story, step *bubuv1alpha1.Step, vars map[string]any) error { +func (e *StepExecutor) executeParallelStep(ctx context.Context, srun *runsv1alpha1.StoryRun, story *bubuv1alpha1.Story, step *bubuv1alpha1.Step, vars map[string]any) error { type parallelWith struct { Steps []bubuv1alpha1.Step `json:"steps"` } - var config parallelWith - if err := json.Unmarshal(step.With.Raw, &config); err != nil { + var parallelCfg parallelWith + if err := json.Unmarshal(step.With.Raw, ¶llelCfg); err != nil { return fmt.Errorf("failed to parse 'with' for parallel step '%s': %w", step.Name, err) } - childStepRunNames := make([]string, 0, len(config.Steps)) - for _, childStep := range config.Steps { - childStepName := fmt.Sprintf("%s-%s-%s", srun.Name, step.Name, childStep.Name) + childStepRunNames := make([]string, 0, len(parallelCfg.Steps)) + for _, childStep := range parallelCfg.Steps { + childStepName := naming.Compose(srun.Name, step.Name, childStep.Name) childStepRunNames = append(childStepRunNames, childStepName) - var withMap any - if childStep.With != nil { - if err := json.Unmarshal(childStep.With.Raw, &withMap); err != nil { - return fmt.Errorf("failed to parse 'with' for parallel branch '%s' 
in step '%s': %w", childStep.Name, step.Name, err) - } - } - - resolvedWith, err := e.CEL.ResolveWithInputs(ctx, withMap.(map[string]any), vars) + resolvedWithBytes, err := e.resolveTemplateWith(ctx, childStep.With, vars) if err != nil { return fmt.Errorf("failed to resolve 'with' for parallel branch '%s' in step '%s': %w", childStep.Name, step.Name, err) } - resolvedWithBytes, _ := json.Marshal(resolvedWith) + + requestedManifest := e.computeManifestRequests(story, childStep.Name) stepRun := &runsv1alpha1.StepRun{ ObjectMeta: metav1.ObjectMeta{ @@ -229,16 +337,14 @@ func (e *StepExecutor) executeParallelStep(ctx context.Context, srun *runsv1alph Labels: map[string]string{"bubustack.io/storyrun": srun.Name, "bubustack.io/parent-step": step.Name}, }, Spec: runsv1alpha1.StepRunSpec{ - StoryRunRef: refs.StoryRunReference{ObjectReference: refs.ObjectReference{Name: srun.Name}}, - StepID: childStep.Name, - EngramRef: childStep.Ref, - Input: &runtime.RawExtension{Raw: resolvedWithBytes}, + StoryRunRef: refs.StoryRunReference{ObjectReference: refs.ObjectReference{Name: srun.Name}}, + StepID: childStep.Name, + EngramRef: childStep.Ref, + Input: resolvedWithBytes, + RequestedManifest: requestedManifest, }, } - if err := controllerutil.SetControllerReference(srun, stepRun, e.Scheme); err != nil { - return err - } - if err := e.Create(ctx, stepRun); err != nil && !errors.IsAlreadyExists(err) { + if err := e.createOrUpdateChildStepRun(ctx, srun, stepRun); err != nil { return err } } @@ -251,6 +357,124 @@ func (e *StepExecutor) executeParallelStep(ctx context.Context, srun *runsv1alph return nil } +func (e *StepExecutor) resolveLoopItems(ctx context.Context, raw any, vars map[string]any) (any, error) { + switch typed := raw.(type) { + case map[string]any: + resolved, err := e.CEL.ResolveWithInputs(ctx, typed, vars) + if err != nil { + return nil, err + } + if val, ok := resolved["items"]; ok { + return val, nil + } + if val, ok := resolved["value"]; ok { + return val, nil + } + if len(resolved) == 1 { + for _, v := range resolved { + return v, nil + } + } + return resolved, nil + case []any: + return typed, nil + case string: + resolved, err := e.CEL.ResolveWithInputs(ctx, map[string]any{"value": typed}, vars) + if err != nil { + return nil, err + } + return resolved["value"], nil + case nil: + return nil, fmt.Errorf("'items' cannot be null") + default: + return typed, nil + } +} + +func (e *StepExecutor) createOrUpdateChildStepRun(ctx context.Context, srun *runsv1alpha1.StoryRun, desired *runsv1alpha1.StepRun) error { + if err := controllerutil.SetControllerReference(srun, desired, e.Scheme); err != nil { + return err + } + + if err := e.Create(ctx, desired); err != nil { + if !errors.IsAlreadyExists(err) { + return err + } + + var existing runsv1alpha1.StepRun + if getErr := e.Get(ctx, types.NamespacedName{Name: desired.Name, Namespace: desired.Namespace}, &existing); getErr != nil { + return getErr + } + + before := existing.DeepCopy() + updated := false + + if existing.Spec.StepID != desired.Spec.StepID { + existing.Spec.StepID = desired.Spec.StepID + updated = true + } + if !reflect.DeepEqual(existing.Spec.StoryRunRef, desired.Spec.StoryRunRef) { + existing.Spec.StoryRunRef = desired.Spec.StoryRunRef + updated = true + } + if !reflect.DeepEqual(existing.Spec.EngramRef, desired.Spec.EngramRef) { + if desired.Spec.EngramRef == nil { + existing.Spec.EngramRef = nil + } else { + clone := *desired.Spec.EngramRef + existing.Spec.EngramRef = &clone + } + updated = true + } + if 
!rawExtensionsEqual(existing.Spec.Input, desired.Spec.Input) { + existing.Spec.Input = cloneRawExtension(desired.Spec.Input) + updated = true + } + if !manifestRequestsEqual(existing.Spec.RequestedManifest, desired.Spec.RequestedManifest) { + existing.Spec.RequestedManifest = copyManifestRequests(desired.Spec.RequestedManifest) + updated = true + } + + ownerBefore := before.GetOwnerReferences() + if err := controllerutil.SetControllerReference(srun, &existing, e.Scheme); err != nil { + return err + } + ownerChanged := !reflect.DeepEqual(ownerBefore, existing.GetOwnerReferences()) + + if updated || ownerChanged { + if patchErr := e.Patch(ctx, &existing, client.MergeFrom(before)); patchErr != nil { + return patchErr + } + } + + return nil + } + return nil +} + +func (e *StepExecutor) resolveTemplateWith(ctx context.Context, raw *runtime.RawExtension, vars map[string]any) (*runtime.RawExtension, error) { + if raw == nil || len(raw.Raw) == 0 { + return nil, nil + } + + withVars := make(map[string]any) + if err := json.Unmarshal(raw.Raw, &withVars); err != nil { + return nil, fmt.Errorf("failed to parse 'with' block: %w", err) + } + + resolved, err := e.CEL.ResolveWithInputs(ctx, withVars, vars) + if err != nil { + return nil, err + } + + encoded, err := json.Marshal(resolved) + if err != nil { + return nil, fmt.Errorf("failed to marshal resolved 'with' block: %w", err) + } + + return &runtime.RawExtension{Raw: encoded}, nil +} + func (e *StepExecutor) executeStopStep(ctx context.Context, srun *runsv1alpha1.StoryRun, step *bubuv1alpha1.Step) error { log := logging.NewReconcileLogger(ctx, "step-executor").WithValues("storyrun", srun.Name, "step", step.Name) @@ -258,83 +482,413 @@ func (e *StepExecutor) executeStopStep(ctx context.Context, srun *runsv1alpha1.S Phase enums.Phase `json:"phase"` Message string `json:"message"` } - var config stopWith + var stopCfg stopWith if step.With != nil { - if err := json.Unmarshal(step.With.Raw, &config); err != nil { + if err := json.Unmarshal(step.With.Raw, &stopCfg); err != nil { return fmt.Errorf("failed to parse 'with' for stop step '%s': %w", step.Name, err) } } - if config.Phase == "" { - config.Phase = enums.PhaseSucceeded + if stopCfg.Phase == "" { + stopCfg.Phase = enums.PhaseSucceeded + } + if stopCfg.Message == "" { + stopCfg.Message = fmt.Sprintf("Story execution stopped by step '%s' with phase '%s'", step.Name, stopCfg.Phase) + } + + log.Info("Executing stop step", "phase", stopCfg.Phase, "message", stopCfg.Message) + srun.Status.Phase = stopCfg.Phase + srun.Status.Message = stopCfg.Message + srun.Status.StepStates[step.Name] = runsv1alpha1.StepState{Phase: stopCfg.Phase, Message: stopCfg.Message} + return nil +} + +func (e *StepExecutor) executeStoryStep(ctx context.Context, srun *runsv1alpha1.StoryRun, step *bubuv1alpha1.Step, vars map[string]any) error { + log := logging.NewReconcileLogger(ctx, "step-executor").WithValues("storyrun", srun.Name, "step", step.Name) + + ensureStoryStepStateMap(srun) + + cfg, err := parseExecuteStoryConfig(step) + if err != nil { + return err + } + + targetStory, err := e.getTargetStory(ctx, srun, cfg.storyRef, step.Name) + if err != nil { + return err + } + + inputs, err := e.resolveStoryStepInputs(ctx, cfg.rawInputs, vars, step.Name) + if err != nil { + return err } - if config.Message == "" { - config.Message = fmt.Sprintf("Story execution stopped by step '%s' with phase '%s'", step.Name, config.Phase) + + subRun, err := e.ensureSubStoryRun(ctx, srun, step, targetStory, inputs, cfg.waitForCompletion, log) + if 
err != nil { + return err } - log.Info("Executing stop step", "phase", config.Phase, "message", config.Message) - srun.Status.Phase = config.Phase - srun.Status.Message = config.Message + updateStoryStepState(srun, step, subRun, cfg.waitForCompletion) return nil } -func (e *StepExecutor) executeStoryStep(ctx context.Context, srun *runsv1alpha1.StoryRun, step *bubuv1alpha1.Step) error { - log := logging.NewReconcileLogger(ctx, "step-executor").WithValues("storyrun", srun.Name) +type executeStoryConfig struct { + storyRef *refs.ObjectReference + waitForCompletion bool + rawInputs *runtime.RawExtension +} + +func parseExecuteStoryConfig(step *bubuv1alpha1.Step) (executeStoryConfig, error) { + if step.With == nil { + return executeStoryConfig{}, fmt.Errorf("story step '%s' requires a 'with' block", step.Name) + } type executeStoryWith struct { - StoryRef *refs.ObjectReference `json:"storyRef"` + StoryRef *refs.ObjectReference `json:"storyRef"` + WaitForCompletion *bool `json:"waitForCompletion,omitempty"` + With *runtime.RawExtension `json:"with,omitempty"` } - var with executeStoryWith - if step.With != nil { - if err := json.Unmarshal(step.With.Raw, &with); err != nil { - return fmt.Errorf("failed to unmarshal step 'with' block for story step '%s': %w", step.Name, err) + var raw executeStoryWith + if err := json.Unmarshal(step.With.Raw, &raw); err != nil { + return executeStoryConfig{}, fmt.Errorf("failed to unmarshal step 'with' block for story step '%s': %w", step.Name, err) + } + if raw.StoryRef == nil || raw.StoryRef.Name == "" { + return executeStoryConfig{}, fmt.Errorf("story step '%s' is missing a 'storyRef' in its 'with' block", step.Name) + } + wait := true + if raw.WaitForCompletion != nil { + wait = *raw.WaitForCompletion + } + return executeStoryConfig{ + storyRef: raw.StoryRef, + waitForCompletion: wait, + rawInputs: raw.With, + }, nil +} + +func ensureStoryStepStateMap(srun *runsv1alpha1.StoryRun) { + if srun.Status.StepStates == nil { + srun.Status.StepStates = make(map[string]runsv1alpha1.StepState) + } +} + +func (e *StepExecutor) getTargetStory(ctx context.Context, srun *runsv1alpha1.StoryRun, ref *refs.ObjectReference, stepName string) (*bubuv1alpha1.Story, error) { + var targetStory bubuv1alpha1.Story + if err := e.Get(ctx, ref.ToNamespacedName(srun), &targetStory); err != nil { + return nil, fmt.Errorf("failed to get story '%s' for story step '%s': %w", ref.Name, stepName, err) + } + return &targetStory, nil +} + +func (e *StepExecutor) resolveStoryStepInputs(ctx context.Context, raw *runtime.RawExtension, vars map[string]any, stepName string) (*runtime.RawExtension, error) { + if raw == nil { + return nil, nil + } + resolved, err := e.resolveTemplateWith(ctx, raw, vars) + if err != nil { + return nil, fmt.Errorf("failed to resolve 'with' block for story step '%s': %w", stepName, err) + } + return resolved, nil +} + +func (e *StepExecutor) ensureSubStoryRun( + ctx context.Context, + srun *runsv1alpha1.StoryRun, + step *bubuv1alpha1.Step, + targetStory *bubuv1alpha1.Story, + inputs *runtime.RawExtension, + waitForCompletion bool, + log *logging.ControllerLogger, +) (*runsv1alpha1.StoryRun, error) { + subRunName := naming.Compose(srun.Name, step.Name) + if state, ok := srun.Status.StepStates[step.Name]; ok && state.SubStoryRunName != "" { + subRunName = state.SubStoryRunName + } + + subRunKey := types.NamespacedName{Name: subRunName, Namespace: srun.Namespace} + subRun := &runsv1alpha1.StoryRun{} + if err := e.Get(ctx, subRunKey, subRun); err == nil { + return subRun, nil 
+ } else if !errors.IsNotFound(err) { + return nil, fmt.Errorf("failed to get sub-StoryRun '%s' for step '%s': %w", subRunName, step.Name, err) + } + + ns := targetStory.Namespace + storyRef := refs.StoryReference{ + ObjectReference: refs.ObjectReference{ + Name: targetStory.Name, + Namespace: &ns, + }, + UID: &targetStory.UID, + } + + newSubRun := &runsv1alpha1.StoryRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: subRunName, + Namespace: srun.Namespace, + Labels: map[string]string{ + "bubustack.io/parent-storyrun": srun.Name, + "bubustack.io/parent-step": step.Name, + }, + }, + Spec: runsv1alpha1.StoryRunSpec{ + StoryRef: storyRef, + }, + } + if inputs != nil { + newSubRun.Spec.Inputs = inputs + } + + if err := controllerutil.SetControllerReference(srun, newSubRun, e.Scheme); err != nil { + return nil, fmt.Errorf("failed to set owner reference on sub-StoryRun '%s': %w", subRunName, err) + } + + if err := e.Create(ctx, newSubRun); err != nil { + if !errors.IsAlreadyExists(err) { + return nil, fmt.Errorf("failed to create sub-StoryRun '%s': %w", subRunName, err) + } + if err := e.Get(ctx, subRunKey, subRun); err != nil { + return nil, fmt.Errorf("failed to fetch sub-StoryRun '%s' after AlreadyExists: %w", subRunName, err) } + return subRun, nil } - if with.StoryRef == nil { - return fmt.Errorf("story step '%s' is missing a 'storyRef' in 'with' block", step.Name) + log.Info("Created sub-StoryRun", "name", subRunName, "waitForCompletion", waitForCompletion) + return newSubRun, nil +} + +func updateStoryStepState(srun *runsv1alpha1.StoryRun, step *bubuv1alpha1.Step, subRun *runsv1alpha1.StoryRun, waitForCompletion bool) { + state := srun.Status.StepStates[step.Name] + state.SubStoryRunName = subRun.Name + + if waitForCompletion { + if subRun.Status.Phase.IsTerminal() && subRun.Status.Phase != "" { + state.Phase = subRun.Status.Phase + state.Message = subRun.Status.Message + if state.Message == "" { + state.Message = fmt.Sprintf("Sub-story run '%s' completed with phase %s", subRun.Name, subRun.Status.Phase) + } + } else { + state.Phase = enums.PhaseRunning + state.Message = fmt.Sprintf("Waiting for sub-story run '%s' to complete", subRun.Name) + } + } else { + state.Phase = enums.PhaseSucceeded + state.Message = fmt.Sprintf("Launched sub-story run '%s'", subRun.Name) } - var story bubuv1alpha1.Story - if err := e.Get(ctx, with.StoryRef.ToNamespacedName(srun), &story); err != nil { - return fmt.Errorf("failed to get story '%s' for story step '%s': %w", with.StoryRef.Name, step.Name, err) + srun.Status.StepStates[step.Name] = state +} + +func (e *StepExecutor) computeManifestRequests(story *bubuv1alpha1.Story, targetStep string) []runsv1alpha1.ManifestRequest { + if story == nil || targetStep == "" { + return nil } - log.Info("Executing story step", "story", story.Name) + pathOps := gatherManifestPathOperations(story, targetStep) - srun.Status.StepStates[step.Name] = runsv1alpha1.StepState{Phase: enums.PhaseSucceeded, Message: "Sub-story execution is not yet implemented."} + if len(pathOps) == 0 { + return nil + } - return nil + paths := sortedManifestPaths(pathOps) + return buildManifestRequests(paths, pathOps) } -func (e *StepExecutor) mergeWithBlocks(engramWith, stepWith *runtime.RawExtension) (*runtime.RawExtension, error) { - if engramWith == nil { - return stepWith, nil +func gatherManifestPathOperations(story *bubuv1alpha1.Story, targetStep string) map[string]map[runsv1alpha1.ManifestOperation]struct{} { + pathOps := make(map[string]map[runsv1alpha1.ManifestOperation]struct{}) + 
addFromString := func(text string) { + collectManifestPathsFromString(targetStep, text, pathOps) } - if stepWith == nil { - return engramWith, nil + + addStepLevelManifestExpressions(story, addFromString) + addStoryLevelManifestExpressions(story, addFromString) + return pathOps +} + +func addStepLevelManifestExpressions(story *bubuv1alpha1.Story, add func(string)) { + for i := range story.Spec.Steps { + step := &story.Spec.Steps[i] + if step.If != nil { + add(*step.If) + } + if step.With != nil && len(step.With.Raw) > 0 { + add(string(step.With.Raw)) + } } +} - var engramMap, stepMap map[string]any - if err := json.Unmarshal(engramWith.Raw, &engramMap); err != nil { - return nil, fmt.Errorf("failed to unmarshal engram 'with' block: %w", err) +func addStoryLevelManifestExpressions(story *bubuv1alpha1.Story, add func(string)) { + if story.Spec.Output != nil && len(story.Spec.Output.Raw) > 0 { + add(string(story.Spec.Output.Raw)) } - if err := json.Unmarshal(stepWith.Raw, &stepMap); err != nil { - return nil, fmt.Errorf("failed to unmarshal step 'with' block: %w", err) + if story.Spec.Policy != nil && story.Spec.Policy.With != nil && len(story.Spec.Policy.With.Raw) > 0 { + add(string(story.Spec.Policy.With.Raw)) } +} - for k, v := range stepMap { - engramMap[k] = v +func sortedManifestPaths(pathOps map[string]map[runsv1alpha1.ManifestOperation]struct{}) []string { + paths := make([]string, 0, len(pathOps)) + for path := range pathOps { + paths = append(paths, path) } + sort.Strings(paths) + return paths +} - mergedBytes, err := json.Marshal(engramMap) - if err != nil { - return nil, fmt.Errorf("failed to marshal merged 'with' block: %w", err) +func buildManifestRequests(paths []string, pathOps map[string]map[runsv1alpha1.ManifestOperation]struct{}) []runsv1alpha1.ManifestRequest { + requests := make([]runsv1alpha1.ManifestRequest, 0, len(paths)) + orderedOps := []runsv1alpha1.ManifestOperation{runsv1alpha1.ManifestOperationExists, runsv1alpha1.ManifestOperationLength} + for _, path := range paths { + opsSet := pathOps[path] + ops := make([]runsv1alpha1.ManifestOperation, 0, len(opsSet)) + for _, cand := range orderedOps { + if _, ok := opsSet[cand]; ok { + ops = append(ops, cand) + } + } + requests = append(requests, runsv1alpha1.ManifestRequest{Path: path, Operations: ops}) } + return requests +} - return &runtime.RawExtension{Raw: mergedBytes}, nil +func collectManifestPathsFromString(targetStep, text string, pathOps map[string]map[runsv1alpha1.ManifestOperation]struct{}) { + if text == "" { + return + } + + text = replaceStepAliases(text) + + if text == "" { + return + } + + aliasTarget := sanitizeStepIdentifier(targetStep) + + for _, match := range manifestLenPattern.FindAllStringSubmatch(text, -1) { + if len(match) < 3 { + continue + } + if match[1] != aliasTarget { + continue + } + path := normaliseManifestPath(match[2]) + addManifestOperation(pathOps, path, runsv1alpha1.ManifestOperationExists) + addManifestOperation(pathOps, path, runsv1alpha1.ManifestOperationLength) + } + + for _, match := range manifestPathPattern.FindAllStringSubmatch(text, -1) { + if len(match) < 3 { + continue + } + if match[1] != aliasTarget { + continue + } + path := normaliseManifestPath(match[2]) + addManifestOperation(pathOps, path, runsv1alpha1.ManifestOperationExists) + } +} + +func normaliseManifestPath(suffix string) string { + path := strings.TrimPrefix(suffix, ".") + if path == "" { + return manifestRootPath + } + return path +} + +func addManifestOperation(pathOps 
map[string]map[runsv1alpha1.ManifestOperation]struct{}, path string, op runsv1alpha1.ManifestOperation) { + if pathOps == nil { + return + } + ops, ok := pathOps[path] + if !ok { + ops = make(map[runsv1alpha1.ManifestOperation]struct{}) + pathOps[path] = ops + } + ops[op] = struct{}{} +} + +func replaceStepAliases(input string) string { + return stepAliasPattern.ReplaceAllStringFunc(input, func(match string) string { + submatches := stepAliasPattern.FindStringSubmatch(match) + if len(submatches) != 2 { + return match + } + original := submatches[1] + alias := sanitizeStepIdentifier(original) + if alias == original { + return match + } + return strings.Replace(match, original, alias, 1) + }) +} + +func sanitizeStepIdentifier(name string) string { + var b strings.Builder + b.Grow(len(name)) + for _, r := range name { + switch { + case r >= 'a' && r <= 'z': + b.WriteRune(r) + case r >= 'A' && r <= 'Z': + b.WriteRune(r) + case r >= '0' && r <= '9': + b.WriteRune(r) + case r == '_': + b.WriteRune(r) + default: + b.WriteRune('_') + } + } + return b.String() +} + +func (e *StepExecutor) mergeWithBlocks(engramWith, stepWith *runtime.RawExtension) (*runtime.RawExtension, error) { + return mergeutil.MergeWithBlocks(engramWith, stepWith) +} + +func manifestRequestsEqual(a, b []runsv1alpha1.ManifestRequest) bool { + if len(a) != len(b) { + return false + } + + toMap := func(reqs []runsv1alpha1.ManifestRequest) map[string]map[runsv1alpha1.ManifestOperation]struct{} { + out := make(map[string]map[runsv1alpha1.ManifestOperation]struct{}, len(reqs)) + for _, req := range reqs { + set := make(map[runsv1alpha1.ManifestOperation]struct{}, len(req.Operations)) + for _, op := range req.Operations { + set[op] = struct{}{} + } + out[req.Path] = set + } + return out + } + + mapA := toMap(a) + mapB := toMap(b) + + if len(mapA) != len(mapB) { + return false + } + + for path, opsA := range mapA { + opsB, ok := mapB[path] + if !ok { + return false + } + if len(opsA) != len(opsB) { + return false + } + for op := range opsA { + if _, ok := opsB[op]; !ok { + return false + } + } + } + return true } func coerceToList(input any) ([]any, error) { @@ -353,3 +907,43 @@ func coerceToList(input any) ([]any, error) { return nil, fmt.Errorf("input '%v' is not a list or string", input) } } + +func rawExtensionsEqual(a, b *runtime.RawExtension) bool { + switch { + case a == nil && b == nil: + return true + case a == nil || b == nil: + return false + default: + return bytes.Equal(a.Raw, b.Raw) + } +} + +func cloneRawExtension(src *runtime.RawExtension) *runtime.RawExtension { + if src == nil { + return nil + } + clone := &runtime.RawExtension{} + if len(src.Raw) > 0 { + clone.Raw = append([]byte(nil), src.Raw...) 
+ } + clone.Object = src.Object + return clone +} + +func copyManifestRequests(src []runsv1alpha1.ManifestRequest) []runsv1alpha1.ManifestRequest { + if len(src) == 0 { + return nil + } + + out := make([]runsv1alpha1.ManifestRequest, len(src)) + for i := range src { + out[i].Path = src[i].Path + if len(src[i].Operations) > 0 { + ops := make([]runsv1alpha1.ManifestOperation, len(src[i].Operations)) + copy(ops, src[i].Operations) + out[i].Operations = ops + } + } + return out +} diff --git a/internal/controller/runs/step_executor_test.go b/internal/controller/runs/step_executor_test.go new file mode 100644 index 0000000..7b3aa84 --- /dev/null +++ b/internal/controller/runs/step_executor_test.go @@ -0,0 +1,366 @@ +package runs + +import ( + "context" + "encoding/json" + "testing" + "time" + + runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" + bubuv1alpha1 "github.com/bubustack/bobrapet/api/v1alpha1" + "github.com/bubustack/bobrapet/pkg/cel" + "github.com/bubustack/bobrapet/pkg/enums" + "github.com/bubustack/bobrapet/pkg/refs" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestComputeManifestRequestsHandlesHyphenatedSteps(t *testing.T) { + story := &bubuv1alpha1.Story{ + Spec: bubuv1alpha1.StorySpec{ + Steps: []bubuv1alpha1.Step{ + {Name: "list-tools", Ref: &refs.EngramReference{ObjectReference: refs.ObjectReference{Name: "dummy"}}}, + {Name: "create-issue", Ref: &refs.EngramReference{ObjectReference: refs.ObjectReference{Name: "dummy"}}, + If: ptrString("len(steps.list-tools.output.tools) > 0"), + }, + }, + }, + } + + var executor StepExecutor + requests := executor.computeManifestRequests(story, "list-tools") + if len(requests) == 0 { + t.Fatalf("expected manifest requests for create-issue, got none") + } + + found := false + for _, req := range requests { + if req.Path == "output.tools" || req.Path == "tools" { + found = true + break + } + } + if !found { + t.Fatalf("expected manifest request for output.tools, got %#v", requests) + } +} + +func TestExecuteStoryStepCreatesSubStoryRunAndWaitsForCompletion(t *testing.T) { + env := newExecuteStoryTestEnv(t, "child-story", "parent-run") + step := env.newExecuteStoryStep("invoke-sub", true, map[string]any{"foo": "{{ inputs.foo }}"}) + if err := env.run(step, env.vars(map[string]any{"foo": "bar"})); err != nil { + t.Fatalf("executeStoryStep returned error: %v", err) + } + + state := env.stepState("invoke-sub") + if state.Phase != enums.PhaseRunning { + t.Fatalf("expected phase Running, got %s", state.Phase) + } + if state.SubStoryRunName != "parent-run-invoke-sub" { + t.Fatalf("expected subStoryRunName 'parent-run-invoke-sub', got %q", state.SubStoryRunName) + } + + subRun := env.assertSubRunMetadata(state.SubStoryRunName) + env.assertSubRunInputs(subRun, map[string]any{"foo": "bar"}) +} + +func TestExecuteStoryStepAsyncMarksSucceeded(t *testing.T) { + env := newExecuteStoryTestEnv(t, "async-child", "parent-async") + step := env.newExecuteStoryStep("fire-and-forget", false, nil) + if err := env.run(step, env.vars(nil)); err != nil { + t.Fatalf("executeStoryStep returned error: %v", err) + } + + state := env.stepState("fire-and-forget") + if state.Phase != enums.PhaseSucceeded { + t.Fatalf("expected phase Succeeded for async executeStory, got %s", state.Phase) + } + if state.SubStoryRunName != 
"parent-async-fire-and-forget" { + t.Fatalf("expected SubStoryRunName to be set, got %q", state.SubStoryRunName) + } +} + +func TestMergeWithBlocksDeepMergesNestedObjects(t *testing.T) { + executor := StepExecutor{} + + engramWith := &runtime.RawExtension{Raw: []byte(`{"auth":{"headers":{"x-api-key":"default"},"retry":3},"timeout":10}`)} + stepWith := &runtime.RawExtension{Raw: []byte(`{"auth":{"headers":{"x-trace-id":"abc"}},"timeout":20}`)} + + merged, err := executor.mergeWithBlocks(engramWith, stepWith) + if err != nil { + t.Fatalf("mergeWithBlocks returned error: %v", err) + } + var mergedMap map[string]any + if err := json.Unmarshal(merged.Raw, &mergedMap); err != nil { + t.Fatalf("failed to unmarshal merged result: %v", err) + } + auth, ok := mergedMap["auth"].(map[string]any) + if !ok { + t.Fatalf("expected auth map, got %#v", mergedMap["auth"]) + } + headers, ok := auth["headers"].(map[string]any) + if !ok { + t.Fatalf("expected headers map, got %#v", auth["headers"]) + } + if headers["x-api-key"] != "default" { + t.Fatalf("expected to preserve default api key, got %#v", headers["x-api-key"]) + } + if headers["x-trace-id"] != "abc" { + t.Fatalf("expected to apply override header, got %#v", headers["x-trace-id"]) + } + if auth["retry"] != float64(3) { + t.Fatalf("expected to keep retry value, got %#v", auth["retry"]) + } + if mergedMap["timeout"] != float64(20) { + t.Fatalf("expected timeout override 20, got %#v", mergedMap["timeout"]) + } +} + +func TestExecuteEngramStepSetsTimeoutAndRetryFromStep(t *testing.T) { + scheme := runtime.NewScheme() + utilruntime.Must(runsv1alpha1.AddToScheme(scheme)) + utilruntime.Must(bubuv1alpha1.AddToScheme(scheme)) + + engram := &bubuv1alpha1.Engram{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker", + Namespace: "default", + }, + Spec: bubuv1alpha1.EngramSpec{ + TemplateRef: refs.EngramTemplateReference{Name: "worker-template"}, + }, + } + + step := bubuv1alpha1.Step{ + Name: "process", + Ref: &refs.EngramReference{ + ObjectReference: refs.ObjectReference{Name: engram.Name}, + }, + Execution: &bubuv1alpha1.ExecutionOverrides{ + Timeout: ptrString("45s"), + Retry: &bubuv1alpha1.RetryPolicy{ + MaxRetries: ptrInt32(5), + }, + }, + } + + story := &bubuv1alpha1.Story{ + ObjectMeta: metav1.ObjectMeta{ + Name: "example-story", + Namespace: "default", + }, + Spec: bubuv1alpha1.StorySpec{ + Steps: []bubuv1alpha1.Step{step}, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(engram). 
+ Build() + + executor := StepExecutor{ + Client: fakeClient, + Scheme: scheme, + } + + storyRun := &runsv1alpha1.StoryRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "run-1", + Namespace: "default", + }, + Status: runsv1alpha1.StoryRunStatus{ + StepStates: make(map[string]runsv1alpha1.StepState), + }, + } + + if err := executor.executeEngramStep(context.Background(), storyRun, story, &story.Spec.Steps[0]); err != nil { + t.Fatalf("executeEngramStep returned error: %v", err) + } + + var created runsv1alpha1.StepRun + if err := fakeClient.Get(context.Background(), types.NamespacedName{Name: "run-1-process", Namespace: "default"}, &created); err != nil { + t.Fatalf("failed to fetch created StepRun: %v", err) + } + + if created.Spec.Timeout != "45s" { + t.Fatalf("expected timeout '45s', got %q", created.Spec.Timeout) + } + if created.Spec.Retry == nil || created.Spec.Retry.MaxRetries == nil || *created.Spec.Retry.MaxRetries != 5 { + t.Fatalf("expected retry maxRetries 5, got %#v", created.Spec.Retry) + } +} + +type nopLogger struct{} + +func (nopLogger) CacheHit(string, string) {} +func (nopLogger) EvaluationStart(string, string) {} +func (nopLogger) EvaluationError(error, string, string, time.Duration) {} +func (nopLogger) EvaluationSuccess(string, string, time.Duration, any) {} + +func ptrString(v string) *string { return &v } + +func ptrInt32(v int32) *int32 { return &v } + +type executeStoryTestEnv struct { + t *testing.T + ctx context.Context + eval *cel.Evaluator + client client.Client + executor StepExecutor + storyRun *runsv1alpha1.StoryRun + childStoryName string + childStoryUID string +} + +func newExecuteStoryTestEnv(t *testing.T, childStoryName, storyRunName string) *executeStoryTestEnv { + t.Helper() + + ctx := context.Background() + eval, err := cel.New(nopLogger{}) + if err != nil { + t.Fatalf("failed to create evaluator: %v", err) + } + t.Cleanup(func() { eval.Close() }) + + scheme := runtime.NewScheme() + utilruntime.Must(runsv1alpha1.AddToScheme(scheme)) + utilruntime.Must(bubuv1alpha1.AddToScheme(scheme)) + + childUID := types.UID(childStoryName + "-uid") + childStory := &bubuv1alpha1.Story{ + ObjectMeta: metav1.ObjectMeta{ + Name: childStoryName, + Namespace: "default", + }, + Spec: bubuv1alpha1.StorySpec{ + Steps: []bubuv1alpha1.Step{ + {Name: "noop", Ref: &refs.EngramReference{ObjectReference: refs.ObjectReference{Name: "dummy"}}}, + }, + }, + } + childStory.UID = childUID + + cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(childStory).Build() + + executor := StepExecutor{ + Client: cl, + Scheme: scheme, + CEL: eval, + } + + storyRun := &runsv1alpha1.StoryRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: storyRunName, + Namespace: "default", + }, + Status: runsv1alpha1.StoryRunStatus{StepStates: make(map[string]runsv1alpha1.StepState)}, + } + storyRun.UID = types.UID(storyRunName + "-uid") + + return &executeStoryTestEnv{ + t: t, + ctx: ctx, + eval: eval, + client: cl, + executor: executor, + storyRun: storyRun, + childStoryName: childStoryName, + childStoryUID: string(childUID), + } +} + +func (e *executeStoryTestEnv) vars(inputs map[string]any) map[string]any { + e.t.Helper() + if inputs == nil { + inputs = map[string]any{} + } + return map[string]any{ + "inputs": inputs, + "steps": map[string]any{}, + } +} + +func (e *executeStoryTestEnv) newExecuteStoryStep(stepName string, waitForCompletion bool, with map[string]any) *bubuv1alpha1.Step { + e.t.Helper() + payload := map[string]any{ + "storyRef": map[string]any{"name": e.childStoryName}, + } + if 
!waitForCompletion { + payload["waitForCompletion"] = false + } + if with != nil { + payload["with"] = with + } + raw, err := json.Marshal(payload) + if err != nil { + e.t.Fatalf("failed to marshal executeStory payload: %v", err) + } + return &bubuv1alpha1.Step{ + Name: stepName, + Type: enums.StepTypeExecuteStory, + With: &runtime.RawExtension{Raw: raw}, + } +} + +func (e *executeStoryTestEnv) run(step *bubuv1alpha1.Step, vars map[string]any) error { + e.t.Helper() + return e.executor.Execute(e.ctx, e.storyRun, &bubuv1alpha1.Story{}, step, vars) +} + +func (e *executeStoryTestEnv) stepState(stepName string) runsv1alpha1.StepState { + e.t.Helper() + state, ok := e.storyRun.Status.StepStates[stepName] + if !ok { + e.t.Fatalf("expected step state for %s to be set", stepName) + } + return state +} + +func (e *executeStoryTestEnv) fetchSubRun(name string) *runsv1alpha1.StoryRun { + e.t.Helper() + var sub runsv1alpha1.StoryRun + if err := e.client.Get(e.ctx, types.NamespacedName{Name: name, Namespace: e.storyRun.Namespace}, &sub); err != nil { + e.t.Fatalf("failed to fetch sub-StoryRun %s: %v", name, err) + } + return &sub +} + +func (e *executeStoryTestEnv) assertSubRunMetadata(name string) *runsv1alpha1.StoryRun { + e.t.Helper() + subRun := e.fetchSubRun(name) + if subRun.Spec.StoryRef.Name != e.childStoryName { + e.t.Fatalf("expected sub-StoryRun to reference %s, got %s", e.childStoryName, subRun.Spec.StoryRef.Name) + } + if subRun.Spec.StoryRef.UID == nil || string(*subRun.Spec.StoryRef.UID) != e.childStoryUID { + e.t.Fatalf("expected sub-StoryRun StoryRef UID to be %s, got %#v", e.childStoryUID, subRun.Spec.StoryRef.UID) + } + if len(subRun.OwnerReferences) != 1 || subRun.OwnerReferences[0].Name != e.storyRun.Name { + e.t.Fatalf("expected sub-StoryRun to have parent owner reference") + } + return subRun +} + +func (e *executeStoryTestEnv) assertSubRunInputs(subRun *runsv1alpha1.StoryRun, expected map[string]any) { + e.t.Helper() + if subRun.Spec.Inputs == nil { + e.t.Fatalf("expected sub-StoryRun inputs to be set") + } + var inputs map[string]any + if err := json.Unmarshal(subRun.Spec.Inputs.Raw, &inputs); err != nil { + e.t.Fatalf("failed to unmarshal sub-StoryRun inputs: %v", err) + } + for key, want := range expected { + got, ok := inputs[key] + if !ok { + e.t.Fatalf("expected propagated input %q to be present", key) + } + if got != want { + e.t.Fatalf("expected propagated input %q=%#v, got %#v", key, want, got) + } + } +} diff --git a/internal/controller/runs/steprun_controller.go b/internal/controller/runs/steprun_controller.go index 4c7d49c..5318b0d 100644 --- a/internal/controller/runs/steprun_controller.go +++ b/internal/controller/runs/steprun_controller.go @@ -20,6 +20,8 @@ import ( "context" "encoding/json" "fmt" + "reflect" + "sort" "time" batchv1 "k8s.io/api/batch/v1" @@ -27,6 +29,7 @@ import ( "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -39,6 +42,7 @@ import ( runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" "github.com/bubustack/bobrapet/api/v1alpha1" "github.com/bubustack/bobrapet/internal/config" + "github.com/bubustack/bobrapet/internal/controller/secretutil" "github.com/bubustack/bobrapet/pkg/cel" "github.com/bubustack/bobrapet/pkg/conditions" "github.com/bubustack/bobrapet/pkg/enums" @@ -52,6 +56,8 @@ const ( StepRunFinalizer = 
"steprun.bubustack.io/finalizer" ) +const stepRunEngramIndexField = "spec.engramRef.key" + // StepRunReconciler reconciles a StepRun object type StepRunReconciler struct { config.ControllerDependencies @@ -68,7 +74,6 @@ type StepRunReconciler struct { // +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch // +kubebuilder:rbac:groups="",resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch -// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch // +kubebuilder:rbac:groups=core,resources=pods/log,verbs=get func (r *StepRunReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) { @@ -184,7 +189,7 @@ func (r *StepRunReconciler) reconcileJobExecution(ctx context.Context, step *run // Handle CEL evaluation blockage by requeueing if evalBlocked, ok := err.(*cel.ErrEvaluationBlocked); ok { stepLogger.Info("CEL evaluation blocked, requeueing", "reason", evalBlocked.Reason) - return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + return ctrl.Result{RequeueAfter: r.nextRequeueDelay()}, nil } stepLogger.Error(err, "Failed to create Job for StepRun") return ctrl.Result{}, err @@ -332,7 +337,7 @@ func (r *StepRunReconciler) reconcileDelete(ctx context.Context, step *runsv1alp return ctrl.Result{}, err } // Requeue to wait for the job to be fully deleted. - return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + return ctrl.Result{RequeueAfter: r.nextRequeueDelay()}, nil } // If the job is not found, it's safe to remove the finalizer. @@ -349,22 +354,46 @@ func (r *StepRunReconciler) reconcileDelete(ctx context.Context, step *runsv1alp // If the job still exists (is being deleted), requeue. 
stepLogger.Info("Owned Job is still terminating, waiting for it to be deleted") - return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + return ctrl.Result{RequeueAfter: r.nextRequeueDelay()}, nil } func (r *StepRunReconciler) createJobForStep(ctx context.Context, srun *runsv1alpha1.StepRun, engram *v1alpha1.Engram, engramTemplate *catalogv1alpha1.EngramTemplate) (*batchv1.Job, error) { stepLogger := logging.NewControllerLogger(ctx, "steprun").WithStepRun(srun) - story, storyRun, resolvedConfig, inputBytes, stepTimeout, err := r.prepareExecutionContext(ctx, srun, engram, engramTemplate, stepLogger) + story, storyRun, resolvedConfig, inputBytes, stepTimeout, downstreamTargets, err := r.prepareExecutionContext(ctx, srun, engram, engramTemplate, stepLogger) if err != nil { return nil, err } - secretEnvVars, volumes, volumeMounts := r.setupSecrets(ctx, resolvedConfig, engramTemplate) - envVars := r.buildBaseEnvVars(srun, story, inputBytes, stepTimeout) + if err := r.ensureDownstreamTargets(ctx, srun, downstreamTargets, stepLogger); err != nil { + return nil, err + } + + executionMode := "batch" + if len(downstreamTargets) > 0 { + executionMode = "hybrid" + } + + secretEnvVars, secretEnvFrom, volumes, volumeMounts := r.setupSecrets(ctx, resolvedConfig, engramTemplate) + envVars := r.buildBaseEnvVars(srun, story, inputBytes, stepTimeout, executionMode) envVars = r.appendGRPCTuningEnv(envVars) envVars = r.appendStorageEnv(envVars, resolvedConfig, stepLogger) + // If S3 auth secret is referenced (Story or Operator defaults), attach it via EnvFrom so AWS SDK can read + if resolvedConfig.Storage != nil && resolvedConfig.Storage.S3 != nil && resolvedConfig.Storage.S3.Authentication.SecretRef != nil { + secretEnvFrom = append(secretEnvFrom, corev1.EnvFromSource{ + SecretRef: &corev1.SecretEnvSource{LocalObjectReference: corev1.LocalObjectReference{ + Name: resolvedConfig.Storage.S3.Authentication.SecretRef.Name, + }}, + }) + } + + if tlsSecret := getTLSSecretName(&engram.ObjectMeta); tlsSecret != "" { + if err := r.configureTLSEnvAndMounts(ctx, engram.Namespace, tlsSecret, &volumes, &volumeMounts, &envVars); err != nil { + stepLogger.Error(err, "Failed to configure TLS for StepRun job", "secret", tlsSecret) + } + } + activeDeadlineSeconds := int64((stepTimeout + 2*time.Minute).Seconds()) startedAt := metav1.Now() if srun.Status.StartedAt != nil { @@ -372,11 +401,20 @@ func (r *StepRunReconciler) createJobForStep(ctx context.Context, srun *runsv1al } r.addStartedAtEnv(&envVars, startedAt) envVars = append(envVars, secretEnvVars...) 
+ // Provide engram name to the pod so adapter-derived sidecars can follow naming convention + envVars = append(envVars, corev1.EnvVar{Name: "BUBU_ENGRAM_NAME", Value: engram.Name}) if engram.Spec.With != nil && len(engram.Spec.With.Raw) > 0 { envVars = append(envVars, corev1.EnvVar{Name: "BUBU_CONFIG", Value: string(engram.Spec.With.Raw)}) } + if len(srun.Spec.RequestedManifest) > 0 { + manifestBytes, err := json.Marshal(srun.Spec.RequestedManifest) + if err != nil { + return nil, fmt.Errorf("failed to marshal manifest spec: %w", err) + } + envVars = append(envVars, corev1.EnvVar{Name: "BUBU_MANIFEST_SPEC", Value: string(manifestBytes)}) + } - job := r.buildJobSpec(srun, resolvedConfig, envVars, volumes, volumeMounts, activeDeadlineSeconds) + job := r.buildJobSpec(srun, resolvedConfig, envVars, secretEnvFrom, volumes, volumeMounts, activeDeadlineSeconds) if err := controllerutil.SetControllerReference(srun, job, r.Scheme); err != nil { return nil, err } @@ -384,52 +422,63 @@ func (r *StepRunReconciler) createJobForStep(ctx context.Context, srun *runsv1al return job, nil } -func (r *StepRunReconciler) prepareExecutionContext(ctx context.Context, srun *runsv1alpha1.StepRun, engram *v1alpha1.Engram, engramTemplate *catalogv1alpha1.EngramTemplate, stepLogger *logging.ControllerLogger) (*v1alpha1.Story, *runsv1alpha1.StoryRun, *config.ResolvedExecutionConfig, []byte, time.Duration, error) { +func (r *StepRunReconciler) prepareExecutionContext(ctx context.Context, srun *runsv1alpha1.StepRun, engram *v1alpha1.Engram, engramTemplate *catalogv1alpha1.EngramTemplate, stepLogger *logging.ControllerLogger) (*v1alpha1.Story, *runsv1alpha1.StoryRun, *config.ResolvedExecutionConfig, []byte, time.Duration, []runsv1alpha1.DownstreamTarget, error) { story, err := r.getStoryForStep(ctx, srun) if err != nil { - return nil, nil, nil, nil, 0, fmt.Errorf("failed to get story for step: %w", err) + return nil, nil, nil, nil, 0, nil, fmt.Errorf("failed to get story for step: %w", err) } storyRun, err := r.getParentStoryRun(ctx, srun) if err != nil { - return nil, nil, nil, nil, 0, fmt.Errorf("failed to get parent storyrun: %w", err) + return nil, nil, nil, nil, 0, nil, fmt.Errorf("failed to get parent storyrun: %w", err) } - resolvedConfig, err := r.ConfigResolver.ResolveExecutionConfig(ctx, srun, story, engram, engramTemplate) + resolvedConfig, err := r.ConfigResolver.ResolveExecutionConfig(ctx, srun, story, engram, engramTemplate, nil) if err != nil { - return nil, nil, nil, nil, 0, fmt.Errorf("failed to resolve execution config for step '%s': %w", srun.Name, err) + return nil, nil, nil, nil, 0, nil, fmt.Errorf("failed to resolve execution config for step '%s': %w", srun.Name, err) } stepLogger.Info("Resolved ServiceAccountName", "sa", resolvedConfig.ServiceAccountName) storyRunInputs, err := r.getStoryRunInputs(ctx, storyRun) if err != nil { - return nil, nil, nil, nil, 0, fmt.Errorf("failed to get storyrun inputs: %w", err) + return nil, nil, nil, nil, 0, nil, fmt.Errorf("failed to get storyrun inputs: %w", err) } stepOutputs, err := getPriorStepOutputs(ctx, r.Client, storyRun, nil) if err != nil { - return nil, nil, nil, nil, 0, fmt.Errorf("failed to get prior step outputs: %w", err) + return nil, nil, nil, nil, 0, nil, fmt.Errorf("failed to get prior step outputs: %w", err) } with := map[string]any{} if srun.Spec.Input != nil { if err := json.Unmarshal(srun.Spec.Input.Raw, &with); err != nil { - return nil, nil, nil, nil, 0, fmt.Errorf("failed to unmarshal step 'with' block: %w", err) + return nil, nil, nil, 
nil, 0, nil, fmt.Errorf("failed to unmarshal step 'with' block: %w", err) } } vars := map[string]any{"inputs": storyRunInputs, "steps": stepOutputs} resolvedInputs, err := r.CELEvaluator.ResolveWithInputs(ctx, with, vars) if err != nil { - return nil, nil, nil, nil, 0, fmt.Errorf("failed to resolve inputs with CEL: %w", err) + return nil, nil, nil, nil, 0, nil, fmt.Errorf("failed to resolve inputs with CEL: %w", err) } inputBytes, err := json.Marshal(resolvedInputs) if err != nil { - return nil, nil, nil, nil, 0, fmt.Errorf("failed to marshal resolved inputs: %w", err) + return nil, nil, nil, nil, 0, nil, fmt.Errorf("failed to marshal resolved inputs: %w", err) + } + stepTimeout := r.computeStepTimeout(srun, story, stepLogger) + downstreamTargets, err := r.computeDownstreamTargets(ctx, story, srun) + if err != nil { + return nil, nil, nil, nil, 0, nil, err } - stepTimeout := r.computeStepTimeout(story, stepLogger) - return story, storyRun, resolvedConfig, inputBytes, stepTimeout, nil + return story, storyRun, resolvedConfig, inputBytes, stepTimeout, downstreamTargets, nil } -func (r *StepRunReconciler) computeStepTimeout(story *v1alpha1.Story, stepLogger *logging.ControllerLogger) time.Duration { +func (r *StepRunReconciler) computeStepTimeout(srun *runsv1alpha1.StepRun, story *v1alpha1.Story, stepLogger *logging.ControllerLogger) time.Duration { stepTimeout := r.ConfigResolver.GetOperatorConfig().Controller.DefaultStepTimeout if stepTimeout == 0 { stepTimeout = 30 * time.Minute } + if srun != nil && srun.Spec.Timeout != "" { + if parsedTimeout, err := time.ParseDuration(srun.Spec.Timeout); err == nil && parsedTimeout > 0 { + return parsedTimeout + } else if err != nil { + stepLogger.Error(err, "Invalid step timeout on StepRun, using defaults", "rawTimeout", srun.Spec.Timeout) + } + } if story.Spec.Policy != nil && story.Spec.Policy.Timeouts != nil && story.Spec.Policy.Timeouts.Step != nil { if parsedTimeout, err := time.ParseDuration(*story.Spec.Policy.Timeouts.Step); err == nil && parsedTimeout > 0 { stepTimeout = parsedTimeout @@ -441,7 +490,98 @@ func (r *StepRunReconciler) computeStepTimeout(story *v1alpha1.Story, stepLogger return stepTimeout } -func (r *StepRunReconciler) buildBaseEnvVars(srun *runsv1alpha1.StepRun, story *v1alpha1.Story, inputBytes []byte, stepTimeout time.Duration) []corev1.EnvVar { +func (r *StepRunReconciler) computeDownstreamTargets(ctx context.Context, story *v1alpha1.Story, srun *runsv1alpha1.StepRun) ([]runsv1alpha1.DownstreamTarget, error) { + if story == nil || srun == nil { + return nil, nil + } + + _, dependents := buildDependencyGraphs(story.Spec.Steps) + dependentSet := dependents[srun.Spec.StepID] + if len(dependentSet) == 0 { + return nil, nil + } + + stepIndex := make(map[string]*v1alpha1.Step, len(story.Spec.Steps)) + for i := range story.Spec.Steps { + step := &story.Spec.Steps[i] + stepIndex[step.Name] = step + } + + port := r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig.DefaultGRPCPort + dependentNames := make([]string, 0, len(dependentSet)) + for name := range dependentSet { + dependentNames = append(dependentNames, name) + } + sort.Strings(dependentNames) + + log := logging.NewControllerLogger(ctx, "steprun-hybrid").WithStepRun(srun) + seenEndpoints := make(map[string]struct{}) + targets := make([]runsv1alpha1.DownstreamTarget, 0, len(dependentNames)) + + for _, depName := range dependentNames { + depStep := stepIndex[depName] + if depStep == nil || depStep.Ref == nil { + continue + } + + depEngram := 
&v1alpha1.Engram{} + key := depStep.Ref.ToNamespacedName(srun) + if err := r.Get(ctx, key, depEngram); err != nil { + if errors.IsNotFound(err) { + log.Info("Skipping downstream target; referenced engram not found", "dependentStep", depName, "engram", key.Name) + continue + } + return nil, fmt.Errorf("failed to resolve downstream engram '%s' for step '%s': %w", key.Name, depName, err) + } + + mode := depEngram.Spec.Mode + if mode == "" { + mode = enums.WorkloadModeJob + } + if mode != enums.WorkloadModeDeployment && mode != enums.WorkloadModeStatefulSet { + continue + } + + endpoint := fmt.Sprintf("%s.%s.svc:%d", depEngram.Name, depEngram.Namespace, port) + if _, exists := seenEndpoints[endpoint]; exists { + continue + } + seenEndpoints[endpoint] = struct{}{} + + targets = append(targets, runsv1alpha1.DownstreamTarget{ + GRPCTarget: &runsv1alpha1.GRPCTarget{Endpoint: endpoint}, + }) + } + + return targets, nil +} + +func (r *StepRunReconciler) ensureDownstreamTargets(ctx context.Context, srun *runsv1alpha1.StepRun, desired []runsv1alpha1.DownstreamTarget, logger *logging.ControllerLogger) error { + if reflect.DeepEqual(srun.Spec.DownstreamTargets, desired) { + return nil + } + + before := srun.DeepCopy() + if len(desired) == 0 { + srun.Spec.DownstreamTargets = nil + } else { + srun.Spec.DownstreamTargets = desired + } + + if err := r.Patch(ctx, srun, client.MergeFrom(before)); err != nil { + logger.Error(err, "Failed to update StepRun downstream targets") + return err + } + + if len(desired) > 0 { + logger.Info("Updated StepRun downstream targets", "count", len(desired)) + } else { + logger.Info("Cleared StepRun downstream targets") + } + return nil +} + +func (r *StepRunReconciler) buildBaseEnvVars(srun *runsv1alpha1.StepRun, story *v1alpha1.Story, inputBytes []byte, stepTimeout time.Duration, executionMode string) []corev1.EnvVar { envVars := []corev1.EnvVar{ {Name: "BUBU_STORY_NAME", Value: story.Name}, {Name: "BUBU_STORYRUN_ID", Value: srun.Spec.StoryRunRef.Name}, @@ -449,11 +589,17 @@ func (r *StepRunReconciler) buildBaseEnvVars(srun *runsv1alpha1.StepRun, story * {Name: "BUBU_STEPRUN_NAME", Value: srun.Name}, {Name: "BUBU_STEPRUN_NAMESPACE", Value: srun.Namespace}, {Name: "BUBU_INPUTS", Value: string(inputBytes)}, - {Name: "BUBU_EXECUTION_MODE", Value: "batch"}, + {Name: "BUBU_EXECUTION_MODE", Value: executionMode}, + {Name: "BUBU_HYBRID_BRIDGE", Value: boolToString(executionMode == "hybrid")}, {Name: "BUBU_GRPC_PORT", Value: fmt.Sprintf("%d", r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig.DefaultGRPCPort)}, {Name: "BUBU_MAX_INLINE_SIZE", Value: fmt.Sprintf("%d", r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig.DefaultMaxInlineSize)}, {Name: "BUBU_STORAGE_TIMEOUT", Value: fmt.Sprintf("%ds", r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig.DefaultStorageTimeoutSeconds)}, {Name: "BUBU_STEP_TIMEOUT", Value: stepTimeout.String()}, + {Name: "BUBU_MAX_RECURSION_DEPTH", Value: "64"}, + // Downward API: expose pod metadata to containers via env + {Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{FieldPath: "metadata.name"}}}, + {Name: "POD_NAMESPACE", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{FieldPath: "metadata.namespace"}}}, + {Name: "SERVICE_ACCOUNT_NAME", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{FieldPath: "spec.serviceAccountName"}}}, } return envVars } @@ -498,7 +644,14 @@ func (r *StepRunReconciler) 
addStartedAtEnv(envVars *[]corev1.EnvVar, startedAt *envVars = append(*envVars, corev1.EnvVar{Name: "BUBU_STARTED_AT", Value: startedAt.Format(time.RFC3339Nano)}) } -func (r *StepRunReconciler) buildJobSpec(srun *runsv1alpha1.StepRun, resolvedConfig *config.ResolvedExecutionConfig, envVars []corev1.EnvVar, volumes []corev1.Volume, volumeMounts []corev1.VolumeMount, activeDeadlineSeconds int64) *batchv1.Job { +func boolToString(v bool) string { + if v { + return "true" + } + return "false" +} + +func (r *StepRunReconciler) buildJobSpec(srun *runsv1alpha1.StepRun, resolvedConfig *config.ResolvedExecutionConfig, envVars []corev1.EnvVar, envFrom []corev1.EnvFromSource, volumes []corev1.Volume, volumeMounts []corev1.VolumeMount, activeDeadlineSeconds int64) *batchv1.Job { return &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: srun.Name, @@ -535,6 +688,7 @@ func (r *StepRunReconciler) buildJobSpec(srun *runsv1alpha1.StepRun, resolvedCon ReadinessProbe: resolvedConfig.ReadinessProbe, StartupProbe: resolvedConfig.StartupProbe, Env: envVars, + EnvFrom: envFrom, VolumeMounts: volumeMounts, }}, }, @@ -591,9 +745,10 @@ func (r *StepRunReconciler) handleJobStatus(ctx context.Context, step *runsv1alp // Check if SDK already patched status (won the race) // SDK writes Phase=Failed with detailed error context (timeout details, dehydration errors, etc.) - if step.Status.Phase == enums.PhaseFailed { + if step.Status.Phase == enums.PhaseFailed || step.Status.Phase == enums.PhaseTimeout { stepLogger.Info("Job failed but SDK already updated StepRun status; preserving SDK's error details", - "podExitCode", exitCode) + "podExitCode", exitCode, + "currentPhase", step.Status.Phase) // Enhance status with exit code if SDK didn't provide it (SDK may crash before os.Exit) if exitCode != 0 && step.Status.ExitCode == 0 { @@ -614,15 +769,21 @@ func (r *StepRunReconciler) handleJobStatus(ctx context.Context, step *runsv1alp // SDK didn't write status yet (crashed before patching or no RBAC) // Operator patches with generic message as fallback - stepLogger.Info("Job failed and SDK did not update status; applying fallback Failed status", + stepLogger.Info("Job failed and SDK did not update status; applying fallback status", "podExitCode", exitCode) // Use the original reconcile context. If it has timed out, the patch will fail and // the entire reconcile will be retried, which is the correct and safe behavior for // ensuring this terminal state is eventually recorded. if err := patch.RetryableStatusPatch(ctx, r.Client, step, func(obj client.Object) { sr := obj.(*runsv1alpha1.StepRun) - sr.Status.Phase = enums.PhaseFailed - sr.Status.LastFailureMsg = "Job execution failed. Check pod logs for details." + phase := enums.PhaseFailed + failureMsg := "Job execution failed. Check pod logs for details." + if exitCode == 124 { + phase = enums.PhaseTimeout + failureMsg = "Job execution timed out. Check step timeout configuration and pod logs." 
+ } + sr.Status.Phase = phase + sr.Status.LastFailureMsg = failureMsg if exitCode != 0 { sr.Status.ExitCode = int32(exitCode) sr.Status.ExitClass = classifyExitCode(exitCode) @@ -701,6 +862,31 @@ func classifyExitCode(code int) enums.ExitClass { } } +func (r *StepRunReconciler) nextRequeueDelay() time.Duration { + const fallbackBase = 5 * time.Second + const fallbackMax = 30 * time.Second + + operatorCfg := r.ConfigResolver.GetOperatorConfig() + base := fallbackBase + max := fallbackMax + if operatorCfg != nil { + if operatorCfg.Controller.RequeueBaseDelay > 0 { + base = operatorCfg.Controller.RequeueBaseDelay + } + if operatorCfg.Controller.RequeueMaxDelay > 0 { + max = operatorCfg.Controller.RequeueMaxDelay + } + } + if max < base { + max = base + } + delay := wait.Jitter(base, 0.5) + if delay > max { + delay = max + } + return delay +} + func (r *StepRunReconciler) getStoryForStep(ctx context.Context, step *runsv1alpha1.StepRun) (*v1alpha1.Story, error) { storyRun := &runsv1alpha1.StoryRun{} storyRunKey := types.NamespacedName{Name: step.Spec.StoryRunRef.Name, Namespace: step.Namespace} @@ -731,7 +917,8 @@ func (r *StepRunReconciler) SetupWithManager(mgr ctrl.Manager, opts controller.O func (r *StepRunReconciler) mapEngramToStepRuns(ctx context.Context, obj client.Object) []reconcile.Request { log := logging.NewReconcileLogger(ctx, "steprun-mapper").WithValues("engram", obj.GetName()) var stepRuns runsv1alpha1.StepRunList - if err := r.List(ctx, &stepRuns, client.InNamespace(obj.GetNamespace()), client.MatchingFields{"spec.engramRef": obj.GetName()}); err != nil { + indexKey := fmt.Sprintf("%s/%s", obj.GetNamespace(), obj.GetName()) + if err := r.List(ctx, &stepRuns, client.MatchingFields{stepRunEngramIndexField: indexKey}); err != nil { log.Error(err, "failed to list stepruns for engram") return nil } @@ -770,7 +957,8 @@ func (r *StepRunReconciler) mapEngramTemplateToStepRuns(ctx context.Context, obj var allRequests []reconcile.Request for _, engram := range engrams.Items { var stepRuns runsv1alpha1.StepRunList - if err := r.List(ctx, &stepRuns, client.InNamespace(engram.GetNamespace()), client.MatchingFields{"spec.engramRef": engram.GetName()}); err != nil { + indexKey := fmt.Sprintf("%s/%s", engram.GetNamespace(), engram.GetName()) + if err := r.List(ctx, &stepRuns, client.MatchingFields{stepRunEngramIndexField: indexKey}); err != nil { log.Error(err, "failed to list stepruns for engram", "engram", engram.GetName()) continue // Continue to the next engram } @@ -793,89 +981,92 @@ func (r *StepRunReconciler) mapEngramTemplateToStepRuns(ctx context.Context, obj // setupSecrets resolves the secret mappings from the Engram and prepares the necessary // volumes, volume mounts, and environment variables for the pod. 
-func (r *StepRunReconciler) setupSecrets(_ context.Context, resolvedConfig *config.ResolvedExecutionConfig, engramTemplate *catalogv1alpha1.EngramTemplate) ([]corev1.EnvVar, []corev1.Volume, []corev1.VolumeMount) { - var envVars []corev1.EnvVar - var volumes []corev1.Volume - var volumeMounts []corev1.VolumeMount - // Note: we no longer return EnvFromSource; secrets are either mounted as files or injected via explicit env vars +func (r *StepRunReconciler) setupSecrets(_ context.Context, resolvedConfig *config.ResolvedExecutionConfig, engramTemplate *catalogv1alpha1.EngramTemplate) ([]corev1.EnvVar, []corev1.EnvFromSource, []corev1.Volume, []corev1.VolumeMount) { + if resolvedConfig.Secrets == nil || engramTemplate == nil || engramTemplate.Spec.SecretSchema == nil { + return nil, nil, nil, nil + } - if resolvedConfig.Secrets == nil || engramTemplate.Spec.SecretSchema == nil { - return envVars, volumes, volumeMounts + artifacts := secretutil.BuildArtifacts(engramTemplate.Spec.SecretSchema, resolvedConfig.Secrets) + return artifacts.EnvVars, artifacts.EnvFrom, artifacts.Volumes, artifacts.VolumeMounts +} + +func (r *StepRunReconciler) configureTLSEnvAndMounts( + ctx context.Context, + namespace, secretName string, + volumes *[]corev1.Volume, + volumeMounts *[]corev1.VolumeMount, + envVars *[]corev1.EnvVar, +) error { + var secret corev1.Secret + if err := r.Get(ctx, types.NamespacedName{Namespace: namespace, Name: secretName}, &secret); err != nil { + return err } - for logicalName, actualSecretName := range resolvedConfig.Secrets { - secretDef, ok := engramTemplate.Spec.SecretSchema[logicalName] - if !ok { - // Engram is trying to map a secret that the template doesn't define. - // We should probably log this, but it's not a fatal error for the reconcile. - continue - } + const volumeName = "engram-tls" + if !volumeExists(*volumes, volumeName) { + *volumes = append(*volumes, corev1.Volume{ + Name: volumeName, + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{SecretName: secretName}, + }, + }) + } - sdkSecretKey := fmt.Sprintf("BUBU_SECRET_%s", logicalName) + const mountPath = "/var/run/tls" + if !volumeMountExists(*volumeMounts, volumeName, mountPath) { + *volumeMounts = append(*volumeMounts, corev1.VolumeMount{ + Name: volumeName, + MountPath: mountPath, + ReadOnly: true, + }) + } - switch secretDef.MountType { - case enums.SecretMountTypeFile: - // 1. Create the Volume pointing to the actual k8s Secret - volumeName := fmt.Sprintf("secret-%s", logicalName) - volumes = append(volumes, corev1.Volume{ - Name: volumeName, - VolumeSource: corev1.VolumeSource{ - Secret: &corev1.SecretVolumeSource{ - SecretName: actualSecretName, - }, - }, - }) + appendEnvIfMissing(envVars, corev1.EnvVar{Name: "BUBU_GRPC_TLS_CERT_FILE", Value: mountPath + "/tls.crt"}) + appendEnvIfMissing(envVars, corev1.EnvVar{Name: "BUBU_GRPC_TLS_KEY_FILE", Value: mountPath + "/tls.key"}) + appendEnvIfMissing(envVars, corev1.EnvVar{Name: "BUBU_GRPC_CA_FILE", Value: mountPath + "/ca.crt"}) + appendEnvIfMissing(envVars, corev1.EnvVar{Name: "BUBU_GRPC_CLIENT_CERT_FILE", Value: mountPath + "/tls.crt"}) + appendEnvIfMissing(envVars, corev1.EnvVar{Name: "BUBU_GRPC_CLIENT_KEY_FILE", Value: mountPath + "/tls.key"}) + appendEnvIfMissing(envVars, corev1.EnvVar{Name: "BUBU_GRPC_REQUIRE_TLS", Value: "true"}) - // 2. 
Create the VolumeMount to mount it into the container - mountPath := secretDef.MountPath - if mountPath == "" { - mountPath = fmt.Sprintf("/etc/bubu/secrets/%s", logicalName) // Default mount path - } - volumeMounts = append(volumeMounts, corev1.VolumeMount{ - Name: volumeName, - MountPath: mountPath, - ReadOnly: true, - }) + return nil +} - // 3. Add the env var for the SDK to discover it - envVars = append(envVars, corev1.EnvVar{ - Name: sdkSecretKey, - Value: fmt.Sprintf("file:%s", mountPath), - }) +func volumeExists(volumes []corev1.Volume, name string) bool { + for i := range volumes { + if volumes[i].Name == name { + return true + } + } + return false +} - case enums.SecretMountTypeEnv: - // For each key defined in the template, create an EnvVar that sources - // its value directly from the specified Kubernetes secret. - for _, key := range secretDef.ExpectedKeys { - prefix := secretDef.EnvPrefix - if prefix == "" { - prefix = fmt.Sprintf("%s_", logicalName) // Default prefix - } - envVars = append(envVars, corev1.EnvVar{ - Name: prefix + key, - ValueFrom: &corev1.EnvVarSource{ - SecretKeyRef: &corev1.SecretKeySelector{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: actualSecretName, - }, - Key: key, - }, - }, - }) - } +func volumeMountExists(mounts []corev1.VolumeMount, name, mountPath string) bool { + for i := range mounts { + if mounts[i].Name == name && mounts[i].MountPath == mountPath { + return true + } + } + return false +} - // Add the discovery env var for the SDK. The SDK will now find the - // environment variables directly, as they are populated at the same time. - sdkValue := fmt.Sprintf("env:%s", secretDef.EnvPrefix) - if secretDef.EnvPrefix == "" { - sdkValue = fmt.Sprintf("env:%s_", logicalName) - } - envVars = append(envVars, corev1.EnvVar{ - Name: sdkSecretKey, - Value: sdkValue, - }) +func appendEnvIfMissing(envVars *[]corev1.EnvVar, envVar corev1.EnvVar) { + for _, existing := range *envVars { + if existing.Name == envVar.Name { + return } } + *envVars = append(*envVars, envVar) +} - return envVars, volumes, volumeMounts +func getTLSSecretName(obj *metav1.ObjectMeta) string { + if obj == nil { + return "" + } + if v, ok := obj.Annotations["engram.bubustack.io/tls-secret"]; ok && v != "" { + return v + } + if v, ok := obj.Annotations["bubustack.io/tls-secret"]; ok && v != "" { + return v + } + return "" } diff --git a/internal/controller/runs/story_crd_validation_test.go b/internal/controller/runs/story_crd_validation_test.go new file mode 100644 index 0000000..c353f8d --- /dev/null +++ b/internal/controller/runs/story_crd_validation_test.go @@ -0,0 +1,129 @@ +package runs + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" + bubuv1alpha1 "github.com/bubustack/bobrapet/api/v1alpha1" + steprunwebhook "github.com/bubustack/bobrapet/internal/webhook/runs/v1alpha1" + storywebhook "github.com/bubustack/bobrapet/internal/webhook/v1alpha1" + "github.com/bubustack/bobrapet/pkg/enums" + "github.com/bubustack/bobrapet/pkg/refs" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var _ = Describe("Story CRD validations", func() { + const namespace = "default" + + It("rejects duplicate step names", func() { + story := &bubuv1alpha1.Story{ + ObjectMeta: metav1.ObjectMeta{ + Name: "validation-duplicate-names", + Namespace: namespace, + }, + Spec: bubuv1alpha1.StorySpec{ + Steps: []bubuv1alpha1.Step{ + { + Name: "duplicate", + Type: enums.StepTypeSetData, + }, + { + Name: "duplicate", + Type: enums.StepTypeSetData, + }, + }, + }, + } + + validator := &storywebhook.StoryCustomValidator{} + _, err := validator.ValidateCreate(context.Background(), story.DeepCopyObject()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("duplicate step name")) + }) + + It("rejects self-referential dependencies", func() { + story := &bubuv1alpha1.Story{ + ObjectMeta: metav1.ObjectMeta{ + Name: "validation-self-dependency", + Namespace: namespace, + }, + Spec: bubuv1alpha1.StorySpec{ + Steps: []bubuv1alpha1.Step{ + { + Name: "self", + Type: enums.StepTypeSetData, + Needs: []string{"self"}, + }, + }, + }, + } + + validator := &storywebhook.StoryCustomValidator{} + _, err := validator.ValidateCreate(context.Background(), story.DeepCopyObject()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("cannot depend on itself")) + }) + + It("rejects cycles by requiring dependencies to reference earlier steps", func() { + story := &bubuv1alpha1.Story{ + ObjectMeta: metav1.ObjectMeta{ + Name: "validation-cycle", + Namespace: namespace, + }, + Spec: bubuv1alpha1.StorySpec{ + Steps: []bubuv1alpha1.Step{ + { + Name: "first", + Type: enums.StepTypeSetData, + Needs: []string{"second"}, + }, + { + Name: "second", + Type: enums.StepTypeSetData, + }, + }, + }, + } + + validator := &storywebhook.StoryCustomValidator{} + _, err := validator.ValidateCreate(context.Background(), story.DeepCopyObject()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("must be declared before")) + }) +}) + +var _ = Describe("StepRun CRD validations", func() { + const namespace = "default" + + It("rejects status.needs entries referencing the StepRun itself", func() { + stepRun := &runsv1alpha1.StepRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "self-referencing-step-run", + Namespace: namespace, + }, + Spec: runsv1alpha1.StepRunSpec{ + StoryRunRef: refs.StoryRunReference{ + ObjectReference: refs.ObjectReference{ + Name: "parent-storyrun", + }, + }, + StepID: "step-id", + }, + } + + validator := &steprunwebhook.StepRunCustomValidator{} + _, err := validator.ValidateCreate(context.Background(), stepRun.DeepCopyObject()) + Expect(err).NotTo(HaveOccurred()) + + oldObj := stepRun.DeepCopyObject() + newStepRun := stepRun.DeepCopy() + newStepRun.Status.Needs = []string{"self-referencing-step-run"} + + _, err = validator.ValidateUpdate(context.Background(), oldObj, newStepRun.DeepCopyObject()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("cannot reference the StepRun itself")) + }) +}) diff --git a/internal/controller/runs/storyrun_controller.go b/internal/controller/runs/storyrun_controller.go index 
4551597..7c088ea 100644 --- a/internal/controller/runs/storyrun_controller.go +++ b/internal/controller/runs/storyrun_controller.go @@ -23,19 +23,24 @@ import ( "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "go.opentelemetry.io/otel/attribute" + runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" bubushv1alpha1 "github.com/bubustack/bobrapet/api/v1alpha1" "github.com/bubustack/bobrapet/internal/config" + "github.com/bubustack/bobrapet/internal/controller/naming" "github.com/bubustack/bobrapet/pkg/conditions" "github.com/bubustack/bobrapet/pkg/enums" "github.com/bubustack/bobrapet/pkg/logging" "github.com/bubustack/bobrapet/pkg/metrics" + "github.com/bubustack/bobrapet/pkg/observability" "github.com/bubustack/bobrapet/pkg/patch" ) @@ -59,17 +64,33 @@ type StoryRunReconciler struct { // +kubebuilder:rbac:groups=runs.bubustack.io,resources=storyruns,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=runs.bubustack.io,resources=storyruns/status,verbs=get;update;patch // +kubebuilder:rbac:groups=runs.bubustack.io,resources=storyruns/finalizers,verbs=update -// +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=create;get;watch;list -// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles,verbs=create;get;watch;list -// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=create;get;watch;list +// +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=create;get;watch;list;update;patch +// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles,verbs=create;get;watch;list;update;patch +// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=create;get;watch;list;update;patch // +kubebuilder:rbac:groups=runs.bubustack.io,resources=stepruns,verbs=get;list;watch;create;update;patch // +kubebuilder:rbac:groups=bubustack.io,resources=stories,verbs=get;list;watch // +kubebuilder:rbac:groups=bubustack.io,resources=engrams,verbs=get;list;watch func (r *StoryRunReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) { + ctx, span := observability.StartSpan(ctx, "StoryRunReconciler.Reconcile", + attribute.String("namespace", req.Namespace), + attribute.String("storyrun", req.Name), + ) + defer span.End() + log := logging.NewReconcileLogger(ctx, "storyrun").WithValues("storyrun", req.NamespacedName) startTime := time.Now() - defer func() { metrics.RecordControllerReconcile("storyrun", time.Since(startTime), err) }() + defer func() { + if err != nil { + span.RecordError(err) + } + requeue := res.RequeueAfter > 0 + span.SetAttributes( + attribute.Bool("requeue", requeue), + attribute.String("requeue_after", res.RequeueAfter.String()), + ) + metrics.RecordControllerReconcile("storyrun", time.Since(startTime), err) + }() ctx, cancel := r.withReconcileTimeout(ctx) defer cancel() @@ -78,6 +99,10 @@ func (r *StoryRunReconciler) Reconcile(ctx context.Context, req ctrl.Request) (r if err := r.Get(ctx, req.NamespacedName, &srun); err != nil { return ctrl.Result{}, client.IgnoreNotFound(err) } + span.SetAttributes( + attribute.String("story", srun.Spec.StoryRef.Name), + attribute.String("phase", string(srun.Status.Phase)), + ) if handled, err := 
r.guardOversizedInputs(ctx, &srun, log); handled || err != nil { return ctrl.Result{}, err @@ -168,6 +193,34 @@ func (r *StoryRunReconciler) ensureFinalizer(ctx context.Context, srun *runsv1al return nil } +func (r *StoryRunReconciler) nextRequeueDelay() time.Duration { + const fallbackBase = 5 * time.Second + const fallbackMax = 30 * time.Second + + cfg := r.ConfigResolver.GetOperatorConfig() + if cfg == nil { + return wait.Jitter(fallbackBase, 0.5) + } + + base := cfg.Controller.RequeueBaseDelay + if base <= 0 { + base = fallbackBase + } + max := cfg.Controller.RequeueMaxDelay + if max <= 0 { + max = fallbackMax + } + if max < base { + max = base + } + + delay := wait.Jitter(base, 0.5) + if delay > max { + delay = max + } + return delay +} + func (r *StoryRunReconciler) getStoryOrWait(ctx context.Context, srun *runsv1alpha1.StoryRun, log *logging.ControllerLogger) (bool, ctrl.Result, *bubushv1alpha1.Story, error) { story, err := r.getStoryForRun(ctx, srun) if err == nil { @@ -192,14 +245,11 @@ func (r *StoryRunReconciler) getStoryOrWait(ctx context.Context, srun *runsv1alp log.Error(statusErr, "Failed to update StoryRun status while waiting for Story") return true, ctrl.Result{}, nil, statusErr } - return true, ctrl.Result{RequeueAfter: 15 * time.Second}, nil, nil + return true, ctrl.Result{RequeueAfter: r.nextRequeueDelay()}, nil, nil } log.Error(err, "Failed to get Story for StoryRun") - if updateErr := r.setStoryRunPhase(ctx, srun, enums.PhaseFailed, fmt.Sprintf("failed to get story: %v", err)); updateErr != nil { - log.Error(updateErr, "Failed to set StoryRun status to Failed") - return true, ctrl.Result{}, nil, updateErr - } - return true, ctrl.Result{}, nil, nil + // Propagate the error so controller-runtime retries instead of marking the run failed on a transient API error. + return true, ctrl.Result{}, nil, err } func (r *StoryRunReconciler) handleStreamingIfNeeded(ctx context.Context, srun *runsv1alpha1.StoryRun, story *bubushv1alpha1.Story, log *logging.ControllerLogger) (bool, error) { @@ -256,7 +306,7 @@ func (r *StoryRunReconciler) reconcileStreamingStoryRun(ctx context.Context, sru // Create or update the run-specific engram runEngram := &bubushv1alpha1.Engram{ ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", srun.Name, step.Name), + Name: naming.Compose(srun.Name, step.Name), Namespace: srun.Namespace, }, } @@ -307,6 +357,8 @@ func (r *StoryRunReconciler) reconcileDelete(ctx context.Context, srun *runsv1al // For a simple MVP, we will just delete the StepRuns. // A more advanced implementation might try to gracefully cancel them. + hasDependents := false + for _, sr := range stepRunList.Items { if sr.DeletionTimestamp.IsZero() { if err := r.Delete(ctx, &sr); err != nil { @@ -315,10 +367,31 @@ func (r *StoryRunReconciler) reconcileDelete(ctx context.Context, srun *runsv1al } log.Info("Deleted child StepRun", "stepRun", sr.Name) } + hasDependents = true + } + + // Clean up any sub-storyruns spawned by executeStory steps. 
+ var childStoryRuns runsv1alpha1.StoryRunList + if err := r.List(ctx, &childStoryRuns, client.InNamespace(srun.Namespace), client.MatchingLabels{"bubustack.io/parent-storyrun": srun.Name}); err != nil { + log.Error(err, "Failed to list child StoryRuns for cleanup") + return ctrl.Result{}, err + } + for _, child := range childStoryRuns.Items { + if child.UID == srun.UID { + continue + } + if child.DeletionTimestamp.IsZero() { + if err := r.Delete(ctx, &child, client.PropagationPolicy(metav1.DeletePropagationBackground)); err != nil { + log.Error(err, "Failed to delete child StoryRun during cleanup", "childStoryRun", child.Name) + return ctrl.Result{}, err + } + log.Info("Deleted child StoryRun", "childStoryRun", child.Name) + } + hasDependents = true } // Once all children are gone, remove the finalizer. - if len(stepRunList.Items) == 0 { + if !hasDependents { if controllerutil.ContainsFinalizer(srun, StoryRunFinalizer) { mergePatch := client.MergeFrom(srun.DeepCopy()) controllerutil.RemoveFinalizer(srun, StoryRunFinalizer) @@ -332,7 +405,7 @@ func (r *StoryRunReconciler) reconcileDelete(ctx context.Context, srun *runsv1al } } else { // If children still exist, requeue to check on them later. - return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + return ctrl.Result{RequeueAfter: r.nextRequeueDelay()}, nil } return ctrl.Result{}, nil @@ -392,7 +465,7 @@ func (r *StoryRunReconciler) setStoryRunPhase(ctx context.Context, srun *runsv1a // SetupWithManager sets up the controller with the Manager. func (r *StoryRunReconciler) SetupWithManager(mgr ctrl.Manager, opts controller.Options) error { r.rbacManager = NewRBACManager(mgr.GetClient(), mgr.GetScheme()) - stepExecutor := NewStepExecutor(mgr.GetClient(), mgr.GetScheme(), &r.CELEvaluator) + stepExecutor := NewStepExecutor(mgr.GetClient(), mgr.GetScheme(), &r.CELEvaluator, r.ConfigResolver) r.dagReconciler = NewDAGReconciler(mgr.GetClient(), &r.CELEvaluator, stepExecutor, r.ConfigResolver) r.Recorder = mgr.GetEventRecorderFor("storyrun-controller") diff --git a/internal/controller/secretutil/secret_artifacts.go b/internal/controller/secretutil/secret_artifacts.go new file mode 100644 index 0000000..831faa7 --- /dev/null +++ b/internal/controller/secretutil/secret_artifacts.go @@ -0,0 +1,138 @@ +package secretutil + +import ( + "fmt" + + catalogv1alpha1 "github.com/bubustack/bobrapet/api/catalog/v1alpha1" + "github.com/bubustack/bobrapet/pkg/enums" + corev1 "k8s.io/api/core/v1" +) + +// Artifacts captures the Kubernetes objects required to surface secrets inside pods. +type Artifacts struct { + EnvVars []corev1.EnvVar + EnvFrom []corev1.EnvFromSource + Volumes []corev1.Volume + VolumeMounts []corev1.VolumeMount +} + +// BuildArtifacts converts template secret definitions combined with resolved user mappings +// into Kubernetes primitives that can be attached to a pod. 
+func BuildArtifacts(secretSchema map[string]catalogv1alpha1.SecretDefinition, mappings map[string]string) Artifacts { + if len(mappings) == 0 || len(secretSchema) == 0 { + return Artifacts{} + } + + result := Artifacts{} + for logicalName, actualSecretName := range mappings { + definition, ok := secretSchema[logicalName] + if !ok { + continue + } + sdkSecretKey := fmt.Sprintf("BUBU_SECRET_%s", logicalName) + applySecret(&result, logicalName, actualSecretName, sdkSecretKey, definition) + } + return result +} + +func applySecret(artifacts *Artifacts, logicalName, actualSecretName, sdkSecretKey string, definition catalogv1alpha1.SecretDefinition) { + switch definition.MountType { + case enums.SecretMountTypeFile: + applyFileSecret(artifacts, logicalName, actualSecretName, sdkSecretKey, definition) + case enums.SecretMountTypeEnv: + applyEnvSecret(artifacts, logicalName, actualSecretName, sdkSecretKey, definition, true, true) + case enums.SecretMountTypeBoth: + applyFileSecret(artifacts, logicalName, actualSecretName, sdkSecretKey, definition) + applyEnvSecret(artifacts, logicalName, actualSecretName, sdkSecretKey, definition, true, false) + default: + applyEnvSecret(artifacts, logicalName, actualSecretName, sdkSecretKey, definition, true, true) + } +} + +func applyFileSecret(artifacts *Artifacts, logicalName, actualSecretName, sdkSecretKey string, definition catalogv1alpha1.SecretDefinition) { + volumeName := fmt.Sprintf("secret-%s", logicalName) + mountPath := definition.MountPath + if mountPath == "" { + mountPath = fmt.Sprintf("/etc/bubu/secrets/%s", logicalName) + } + + artifacts.Volumes = append(artifacts.Volumes, corev1.Volume{ + Name: volumeName, + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{SecretName: actualSecretName}, + }, + }) + artifacts.VolumeMounts = append(artifacts.VolumeMounts, corev1.VolumeMount{ + Name: volumeName, + MountPath: mountPath, + ReadOnly: true, + }) + artifacts.EnvVars = append(artifacts.EnvVars, + corev1.EnvVar{Name: sdkSecretKey, Value: fmt.Sprintf("file:%s", mountPath)}, + ) +} + +func applyEnvSecret( + artifacts *Artifacts, + logicalName, + actualSecretName, + sdkSecretKey string, + definition catalogv1alpha1.SecretDefinition, + exposeNameForPrefix bool, + exposeNameForKeys bool, +) { + if len(definition.ExpectedKeys) > 0 { + addExplicitKeyEnvVars(artifacts, logicalName, actualSecretName, sdkSecretKey, definition, exposeNameForKeys) + return + } + prefix := resolvePrefix(logicalName, definition) + artifacts.EnvFrom = append(artifacts.EnvFrom, corev1.EnvFromSource{ + Prefix: prefix, + SecretRef: &corev1.SecretEnvSource{LocalObjectReference: corev1.LocalObjectReference{Name: actualSecretName}}, + }) + artifacts.EnvVars = append(artifacts.EnvVars, + corev1.EnvVar{Name: sdkSecretKey, Value: fmt.Sprintf("env:%s", prefix)}, + ) + if exposeNameForPrefix { + artifacts.EnvVars = append(artifacts.EnvVars, + corev1.EnvVar{Name: fmt.Sprintf("%s_NAME", sdkSecretKey), Value: actualSecretName}, + ) + } +} + +func addExplicitKeyEnvVars(artifacts *Artifacts, logicalName, actualSecretName, sdkSecretKey string, definition catalogv1alpha1.SecretDefinition, exposeSecretName bool) { + prefix := definition.EnvPrefix + if prefix == "" { + prefix = fmt.Sprintf("%s_", logicalName) + } + for _, key := range definition.ExpectedKeys { + artifacts.EnvVars = append(artifacts.EnvVars, corev1.EnvVar{ + Name: prefix + key, + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: 
actualSecretName}, + Key: key, + }, + }, + }) + } + + sdkValue := fmt.Sprintf("env:%s", definition.EnvPrefix) + if definition.EnvPrefix == "" { + sdkValue = fmt.Sprintf("env:%s_", logicalName) + } + artifacts.EnvVars = append(artifacts.EnvVars, corev1.EnvVar{Name: sdkSecretKey, Value: sdkValue}) + + if exposeSecretName { + artifacts.EnvVars = append(artifacts.EnvVars, + corev1.EnvVar{Name: fmt.Sprintf("%s_NAME", sdkSecretKey), Value: actualSecretName}, + ) + } +} + +func resolvePrefix(logicalName string, definition catalogv1alpha1.SecretDefinition) string { + if definition.EnvPrefix != "" { + return definition.EnvPrefix + } + return fmt.Sprintf("%s_", logicalName) +} diff --git a/internal/controller/story_controller.go b/internal/controller/story_controller.go index dd49cfe..51b620b 100644 --- a/internal/controller/story_controller.go +++ b/internal/controller/story_controller.go @@ -29,6 +29,7 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/client-go/tools/record" @@ -43,9 +44,13 @@ import ( catalogv1alpha1 "github.com/bubustack/bobrapet/api/catalog/v1alpha1" bubuv1alpha1 "github.com/bubustack/bobrapet/api/v1alpha1" "github.com/bubustack/bobrapet/internal/config" + "github.com/bubustack/bobrapet/internal/controller/mergeutil" + "github.com/bubustack/bobrapet/internal/controller/naming" + "github.com/bubustack/bobrapet/internal/controller/secretutil" "github.com/bubustack/bobrapet/pkg/metrics" "github.com/bubustack/bobrapet/pkg/refs" "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/utils/ptr" ) const ( @@ -53,6 +58,11 @@ const ( StoryFinalizer = "story.bubustack.io/finalizer" ) +const ( + storyStepEngramIndexField = "spec.steps.ref.key" + storyStepStoryIndexField = "spec.steps.storyRef.key" +) + type executeStoryWith struct { StoryRef refs.StoryReference `json:"storyRef"` } @@ -66,9 +76,9 @@ type StoryReconciler struct { // +kubebuilder:rbac:groups=bubustack.io,resources=stories,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=bubustack.io,resources=stories/status,verbs=get;update;patch // +kubebuilder:rbac:groups=bubustack.io,resources=stories/finalizers,verbs=update -// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch // +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. 
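Before the story_controller.go changes below, here is an illustrative unit-test sketch (not part of this patch) for the new secretutil.BuildArtifacts helper introduced above. It exercises only the file-mount branch; the file name and the logical secret name "api" are hypothetical, and the import paths assume the module layout used elsewhere in this diff.

// secret_artifacts_sketch_test.go (hypothetical) — checks that a file-mounted
// secret definition yields one volume, one mount, and a "file:" discovery env var.
package secretutil_test

import (
	"testing"

	catalogv1alpha1 "github.com/bubustack/bobrapet/api/catalog/v1alpha1"
	"github.com/bubustack/bobrapet/internal/controller/secretutil"
	"github.com/bubustack/bobrapet/pkg/enums"
)

func TestBuildArtifactsFileMountSketch(t *testing.T) {
	schema := map[string]catalogv1alpha1.SecretDefinition{
		// "api" is a hypothetical logical secret name; MountType and MountPath
		// are the SecretDefinition fields read by BuildArtifacts.
		"api": {MountType: enums.SecretMountTypeFile, MountPath: "/etc/bubu/secrets/api"},
	}
	mappings := map[string]string{"api": "my-api-secret"} // logical name -> actual Secret name

	artifacts := secretutil.BuildArtifacts(schema, mappings)

	if len(artifacts.Volumes) != 1 || len(artifacts.VolumeMounts) != 1 {
		t.Fatalf("expected one volume and one mount, got %d/%d", len(artifacts.Volumes), len(artifacts.VolumeMounts))
	}
	if len(artifacts.EnvVars) != 1 || artifacts.EnvVars[0].Value != "file:/etc/bubu/secrets/api" {
		t.Fatalf("expected a single file: discovery env var, got %#v", artifacts.EnvVars)
	}
}

For env-mounted secrets without ExpectedKeys, the same helper instead appends an EnvFrom source with the resolved prefix plus "env:<prefix>" and "<BUBU_SECRET_*>_NAME" discovery variables, so the assertions would change accordingly.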
@@ -186,7 +196,7 @@ func (r *StoryReconciler) cleanupOwnedResources(ctx context.Context, story *bubu // Delete Deployments for _, step := range story.Spec.Steps { if step.Ref != nil { - deploymentName := fmt.Sprintf("%s-%s", story.Name, step.Name) + deploymentName := naming.Compose(story.Name, step.Name) deployment := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ Name: deploymentName, @@ -199,7 +209,7 @@ func (r *StoryReconciler) cleanupOwnedResources(ctx context.Context, story *bubu } // Delete Services - serviceName := fmt.Sprintf("%s-%s", story.Name, step.Name) + serviceName := naming.Compose(story.Name, step.Name) service := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ Name: serviceName, @@ -221,7 +231,8 @@ func (r *StoryReconciler) reconcilePerStoryStreaming(ctx context.Context, story logger := log.FromContext(ctx) logger.Info("Reconciling workloads for streaming Story with PerStory strategy") - for _, step := range story.Spec.Steps { + for i := range story.Spec.Steps { + step := &story.Spec.Steps[i] if step.Ref == nil { continue // This step is not an engram, so nothing to deploy. } @@ -241,17 +252,32 @@ func (r *StoryReconciler) reconcilePerStoryStreaming(ctx context.Context, story logger.Error(err, "Failed to get EngramTemplate for streaming step", "engramTemplate", engram.Spec.TemplateRef.Name) return err } - resolved, err := r.ConfigResolver.ResolveExecutionConfig(ctx, nil, story, &engram, template) + resolved, err := r.ConfigResolver.ResolveExecutionConfig(ctx, nil, story, &engram, template, step) if err != nil { logger.Error(err, "Failed to resolve execution config for streaming step") return err } - deployment := r.deploymentForStreamingStepWithConfig(story, &step, &engram, resolved) + if step.Secrets != nil { + if resolved.Secrets == nil { + resolved.Secrets = make(map[string]string, len(step.Secrets)) + } + for k, v := range step.Secrets { + resolved.Secrets[k] = v + } + } + + mergedWith, err := mergeutil.MergeWithBlocks(engram.Spec.With, step.With) + if err != nil { + logger.Error(err, "Failed to merge 'with' block for streaming step", "step", step.Name) + return err + } + + deployment := r.deploymentForStreamingStepWithConfig(story, step, template, resolved, mergedWith) if err := r.reconcileOwnedDeployment(ctx, story, deployment); err != nil { return err } - service := r.serviceForStreamingStepWithConfig(story, &step, &engram, resolved) + service := r.serviceForStreamingStepWithConfig(story, step, &engram, resolved) if err := r.reconcileOwnedService(ctx, story, service); err != nil { return err } @@ -333,8 +359,8 @@ func (r *StoryReconciler) reconcileOwnedService(ctx context.Context, owner *bubu return nil } -func (r *StoryReconciler) deploymentForStreamingStepWithConfig(story *bubuv1alpha1.Story, step *bubuv1alpha1.Step, _ *bubuv1alpha1.Engram, cfg *config.ResolvedExecutionConfig) *appsv1.Deployment { - name := fmt.Sprintf("%s-%s", story.Name, step.Name) +func (r *StoryReconciler) deploymentForStreamingStepWithConfig(story *bubuv1alpha1.Story, step *bubuv1alpha1.Step, template *catalogv1alpha1.EngramTemplate, cfg *config.ResolvedExecutionConfig, mergedWith *runtime.RawExtension) *appsv1.Deployment { + name := naming.Compose(story.Name, step.Name) labels := map[string]string{ "app.kubernetes.io/name": "bobrapet-streaming-engram", "app.kubernetes.io/managed-by": "story-controller", @@ -342,7 +368,37 @@ func (r *StoryReconciler) deploymentForStreamingStepWithConfig(story *bubuv1alph "bubustack.io/step": step.Name, } replicas := int32(1) - return &appsv1.Deployment{ + 
podSpec := corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: labels}, + Spec: corev1.PodSpec{ + ServiceAccountName: cfg.ServiceAccountName, + AutomountServiceAccountToken: ptr.To(cfg.AutomountServiceAccountToken), + SecurityContext: cfg.ToPodSecurityContext(), + Containers: []corev1.Container{{ + Name: "engram", + Image: cfg.Image, + ImagePullPolicy: cfg.ImagePullPolicy, + LivenessProbe: cfg.LivenessProbe, + ReadinessProbe: cfg.ReadinessProbe, + StartupProbe: cfg.StartupProbe, + SecurityContext: cfg.ToContainerSecurityContext(), + Resources: cfg.Resources, + Ports: []corev1.ContainerPort{{ + Name: "grpc", + ContainerPort: int32(r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig.DefaultGRPCPort), + }}, + }}, + }, + } + envVars := buildRealtimeBaseEnv(r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig) + if mergedWith != nil && len(mergedWith.Raw) > 0 { + envVars = append(envVars, corev1.EnvVar{Name: "BUBU_CONFIG", Value: string(mergedWith.Raw)}) + } + podSpec.Spec.Containers[0].Env = append(podSpec.Spec.Containers[0].Env, envVars...) + applyStorageEnv(cfg, &podSpec.Spec.Containers[0]) + applySecretArtifacts(template, cfg, &podSpec) + + dep := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: story.Namespace, @@ -351,51 +407,88 @@ func (r *StoryReconciler) deploymentForStreamingStepWithConfig(story *bubuv1alph Spec: appsv1.DeploymentSpec{ Replicas: &replicas, Selector: &metav1.LabelSelector{MatchLabels: labels}, - Template: corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{Labels: labels}, - Spec: corev1.PodSpec{ - ServiceAccountName: cfg.ServiceAccountName, - Containers: []corev1.Container{{ - Name: "engram", - Image: cfg.Image, - ImagePullPolicy: cfg.ImagePullPolicy, - LivenessProbe: cfg.LivenessProbe, - ReadinessProbe: cfg.ReadinessProbe, - StartupProbe: cfg.StartupProbe, - Ports: []corev1.ContainerPort{{ - Name: "grpc", - ContainerPort: int32(r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig.DefaultGRPCPort), - }}, - Env: []corev1.EnvVar{{Name: "BUBU_EXECUTION_MODE", Value: "streaming"}}, - }}, - }, - }, + Template: podSpec, }, } + return dep } -func (r *StoryReconciler) serviceForStreamingStepWithConfig(story *bubuv1alpha1.Story, step *bubuv1alpha1.Step, _ *bubuv1alpha1.Engram, _ *config.ResolvedExecutionConfig) *corev1.Service { - name := fmt.Sprintf("%s-%s", story.Name, step.Name) - labels := map[string]string{ +func (r *StoryReconciler) serviceForStreamingStepWithConfig(story *bubuv1alpha1.Story, step *bubuv1alpha1.Step, _ *bubuv1alpha1.Engram, cfg *config.ResolvedExecutionConfig) *corev1.Service { + name := naming.Compose(story.Name, step.Name) + selectorLabels := map[string]string{ "bubustack.io/story": story.Name, "bubustack.io/step": step.Name, } + serviceLabels := make(map[string]string, len(selectorLabels)+len(cfg.ServiceLabels)) + for k, v := range selectorLabels { + serviceLabels[k] = v + } + for k, v := range cfg.ServiceLabels { + serviceLabels[k] = v + } + serviceAnnotations := make(map[string]string, len(cfg.ServiceAnnotations)) + for k, v := range cfg.ServiceAnnotations { + serviceAnnotations[k] = v + } + ports := make([]corev1.ServicePort, len(cfg.ServicePorts)) + if len(cfg.ServicePorts) > 0 { + copy(ports, cfg.ServicePorts) + } else { + ports = []corev1.ServicePort{{ + Protocol: corev1.ProtocolTCP, + Port: int32(r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig.DefaultGRPCPort), + TargetPort: 
intstr.FromString("grpc"), + }} + } return &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: story.Namespace, + Name: name, + Namespace: story.Namespace, + Labels: serviceLabels, + Annotations: serviceAnnotations, }, Spec: corev1.ServiceSpec{ - Selector: labels, - Ports: []corev1.ServicePort{{ - Protocol: corev1.ProtocolTCP, - Port: int32(r.ConfigResolver.GetOperatorConfig().Controller.Engram.EngramControllerConfig.DefaultGRPCPort), - TargetPort: intstr.FromString("grpc"), - }}, + Selector: selectorLabels, + Ports: ports, }, } } +func applyStorageEnv(cfg *config.ResolvedExecutionConfig, container *corev1.Container) { + if container == nil || cfg == nil || cfg.Storage == nil || cfg.Storage.S3 == nil { + return + } + s3Config := cfg.Storage.S3 + container.Env = append(container.Env, + corev1.EnvVar{Name: "BUBU_STORAGE_PROVIDER", Value: "s3"}, + corev1.EnvVar{Name: "BUBU_STORAGE_S3_BUCKET", Value: s3Config.Bucket}, + ) + if s3Config.Region != "" { + container.Env = append(container.Env, corev1.EnvVar{Name: "BUBU_STORAGE_S3_REGION", Value: s3Config.Region}) + } + if s3Config.Endpoint != "" { + container.Env = append(container.Env, corev1.EnvVar{Name: "BUBU_STORAGE_S3_ENDPOINT", Value: s3Config.Endpoint}) + } + if s3Config.Authentication.SecretRef != nil { + container.EnvFrom = append(container.EnvFrom, corev1.EnvFromSource{ + SecretRef: &corev1.SecretEnvSource{LocalObjectReference: corev1.LocalObjectReference{Name: s3Config.Authentication.SecretRef.Name}}, + }) + } +} + +func applySecretArtifacts(template *catalogv1alpha1.EngramTemplate, cfg *config.ResolvedExecutionConfig, podSpec *corev1.PodTemplateSpec) { + if podSpec == nil || len(podSpec.Spec.Containers) == 0 || template == nil || cfg == nil || template.Spec.SecretSchema == nil { + return + } + artifacts := secretutil.BuildArtifacts(template.Spec.SecretSchema, cfg.Secrets) + podSpec.Spec.Volumes = append(podSpec.Spec.Volumes, artifacts.Volumes...) + + container := &podSpec.Spec.Containers[0] + container.Env = append(container.Env, artifacts.EnvVars...) + container.EnvFrom = append(container.EnvFrom, artifacts.EnvFrom...) + container.VolumeMounts = append(container.VolumeMounts, artifacts.VolumeMounts...) +} + func (r *StoryReconciler) validateEngramReferences(ctx context.Context, story *bubuv1alpha1.Story) error { for i, step := range story.Spec.Steps { if step.Ref != nil { // This is an Engram step. 
@@ -447,7 +540,21 @@ func (r *StoryReconciler) SetupWithManager(mgr ctrl.Manager, opts controller.Opt r.Recorder = mgr.GetEventRecorderFor("story-controller") mapEngramToStories := func(ctx context.Context, obj client.Object) []reconcile.Request { var stories bubuv1alpha1.StoryList - if err := r.List(ctx, &stories, client.InNamespace(obj.GetNamespace()), client.MatchingFields{"spec.steps.ref.name": obj.GetName()}); err != nil { + indexKey := fmt.Sprintf("%s/%s", obj.GetNamespace(), obj.GetName()) + if err := r.List(ctx, &stories, client.MatchingFields{storyStepEngramIndexField: indexKey}); err != nil { + return nil + } + reqs := make([]reconcile.Request, 0, len(stories.Items)) + for i := range stories.Items { + reqs = append(reqs, reconcile.Request{NamespacedName: types.NamespacedName{Name: stories.Items[i].Name, Namespace: stories.Items[i].Namespace}}) + } + return reqs + } + + mapStoryToStories := func(ctx context.Context, obj client.Object) []reconcile.Request { + var stories bubuv1alpha1.StoryList + indexKey := fmt.Sprintf("%s/%s", obj.GetNamespace(), obj.GetName()) + if err := r.List(ctx, &stories, client.MatchingFields{storyStepStoryIndexField: indexKey}); err != nil { return nil } reqs := make([]reconcile.Request, 0, len(stories.Items)) @@ -460,6 +567,7 @@ func (r *StoryReconciler) SetupWithManager(mgr ctrl.Manager, opts controller.Opt return ctrl.NewControllerManagedBy(mgr). For(&bubuv1alpha1.Story{}). Watches(&bubuv1alpha1.Engram{}, handler.EnqueueRequestsFromMapFunc(mapEngramToStories)). + Watches(&bubuv1alpha1.Story{}, handler.EnqueueRequestsFromMapFunc(mapStoryToStories)). WithOptions(opts). Complete(r) } diff --git a/internal/setup/indexing.go b/internal/setup/indexing.go index 7b2dde7..bd78d6b 100644 --- a/internal/setup/indexing.go +++ b/internal/setup/indexing.go @@ -2,6 +2,8 @@ package setup import ( "context" + "encoding/json" + "fmt" "os" "sigs.k8s.io/controller-runtime/pkg/client" @@ -11,6 +13,8 @@ import ( catalogv1alpha1 "github.com/bubustack/bobrapet/api/catalog/v1alpha1" runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" bubushv1alpha1 "github.com/bubustack/bobrapet/api/v1alpha1" + "github.com/bubustack/bobrapet/pkg/enums" + "github.com/bubustack/bobrapet/pkg/refs" ) var setupLog = log.Log.WithName("setup") @@ -23,7 +27,7 @@ func SetupIndexers(ctx context.Context, mgr manager.Manager) { mustIndexField(ctx, mgr, &bubushv1alpha1.Engram{}, "spec.templateRef.name", extractEngramTemplateName, "failed to index Engram spec.templateRef.name") - mustIndexField(ctx, mgr, &runsv1alpha1.StepRun{}, "spec.engramRef", extractStepRunEngramRef, "failed to index StepRun spec.engramRef.name") + mustIndexField(ctx, mgr, &runsv1alpha1.StepRun{}, "spec.engramRef.key", extractStepRunEngramRef, "failed to index StepRun spec.engramRef.key") mustIndexField(ctx, mgr, &runsv1alpha1.StepRun{}, "spec.storyRunRef.name", extractStepRunStoryRunRef, "failed to index StepRun spec.storyRunRef.name") @@ -35,7 +39,8 @@ func SetupIndexers(ctx context.Context, mgr manager.Manager) { mustIndexField(ctx, mgr, &bubushv1alpha1.Impulse{}, "spec.storyRef.name", extractImpulseStoryRefName, "failed to index Impulse spec.storyRef.name") - mustIndexField(ctx, mgr, &bubushv1alpha1.Story{}, "spec.steps.ref.name", extractStoryStepEngramRefs, "failed to index Story spec.steps.ref.name") + mustIndexField(ctx, mgr, &bubushv1alpha1.Story{}, "spec.steps.ref.key", extractStoryStepEngramRefs, "failed to index Story spec.steps.ref.key") + mustIndexField(ctx, mgr, &bubushv1alpha1.Story{}, "spec.steps.storyRef.key", 
extractStoryExecuteStoryRefs, "failed to index Story spec.steps.storyRef.key") mustIndexField(ctx, mgr, &catalogv1alpha1.EngramTemplate{}, "spec.description", extractEngramTemplateDescription, "failed to index EngramTemplate spec.description") @@ -70,7 +75,8 @@ func extractStepRunEngramRef(rawObj client.Object) []string { if stepRun.Spec.EngramRef == nil || stepRun.Spec.EngramRef.Name == "" { return nil } - return []string{stepRun.Spec.EngramRef.Name} + namespace := refs.ResolveNamespace(stepRun, &stepRun.Spec.EngramRef.ObjectReference) + return []string{namespacedKey(namespace, stepRun.Spec.EngramRef.Name)} } func extractStepRunStoryRunRef(rawObj client.Object) []string { @@ -122,7 +128,8 @@ func extractStoryStepEngramRefs(rawObj client.Object) []string { for i := range story.Spec.Steps { step := &story.Spec.Steps[i] if step.Ref != nil && step.Ref.Name != "" { - nameSet[step.Ref.Name] = struct{}{} + ns := refs.ResolveNamespace(story, &step.Ref.ObjectReference) + nameSet[namespacedKey(ns, step.Ref.Name)] = struct{}{} } } if len(nameSet) == 0 { @@ -135,6 +142,45 @@ func extractStoryStepEngramRefs(rawObj client.Object) []string { return out } +func extractStoryExecuteStoryRefs(rawObj client.Object) []string { + story := rawObj.(*bubushv1alpha1.Story) + if len(story.Spec.Steps) == 0 { + return nil + } + nameSet := make(map[string]struct{}) + for i := range story.Spec.Steps { + step := &story.Spec.Steps[i] + if step.Type != enums.StepTypeExecuteStory || step.With == nil || len(step.With.Raw) == 0 { + continue + } + var withBlock struct { + StoryRef struct { + Name string `json:"name"` + Namespace string `json:"namespace,omitempty"` + } `json:"storyRef"` + } + if err := json.Unmarshal(step.With.Raw, &withBlock); err != nil { + continue + } + if withBlock.StoryRef.Name == "" { + continue + } + targetNamespace := story.Namespace + if withBlock.StoryRef.Namespace != "" { + targetNamespace = withBlock.StoryRef.Namespace + } + nameSet[namespacedKey(targetNamespace, withBlock.StoryRef.Name)] = struct{}{} + } + if len(nameSet) == 0 { + return nil + } + out := make([]string, 0, len(nameSet)) + for n := range nameSet { + out = append(out, n) + } + return out +} + func extractEngramTemplateDescription(obj client.Object) []string { template := obj.(*catalogv1alpha1.EngramTemplate) if template.Spec.Description != "" { @@ -146,3 +192,7 @@ func extractEngramTemplateDescription(obj client.Object) []string { func extractEngramTemplateVersion(obj client.Object) []string { return []string{obj.(*catalogv1alpha1.EngramTemplate).Spec.Version} } + +func namespacedKey(namespace, name string) string { + return fmt.Sprintf("%s/%s", namespace, name) +} diff --git a/internal/webhook/runs/v1alpha1/steprun_webhook.go b/internal/webhook/runs/v1alpha1/steprun_webhook.go index 7a45db5..055d7f7 100644 --- a/internal/webhook/runs/v1alpha1/steprun_webhook.go +++ b/internal/webhook/runs/v1alpha1/steprun_webhook.go @@ -20,6 +20,7 @@ import ( "context" "encoding/json" "fmt" + "reflect" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" @@ -29,6 +30,7 @@ import ( runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" "github.com/bubustack/bobrapet/internal/config" + webhookshared "github.com/bubustack/bobrapet/internal/webhook/v1alpha1" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -37,24 +39,60 @@ import ( var steprunlog = logf.Log.WithName("steprun-resource") type StepRunWebhook struct { - Client client.Client - Config *config.ControllerConfig + Client client.Client + Config *config.ControllerConfig 
+	ConfigManager *config.OperatorConfigManager
 }
 
 func (wh *StepRunWebhook) SetupWebhookWithManager(mgr ctrl.Manager) error {
 	wh.Client = mgr.GetClient()
-	operatorConfigManager := config.NewOperatorConfigManager(mgr.GetClient(), "bobrapet-system", "bobrapet-operator-config")
-	wh.Config = operatorConfigManager.GetControllerConfig()
 	return ctrl.NewWebhookManagedBy(mgr).
 		For(&runsv1alpha1.StepRun{}).
+		WithDefaulter(&StepRunCustomDefaulter{
+			Config:        wh.Config,
+			ConfigManager: wh.ConfigManager,
+		}).
 		WithValidator(&StepRunCustomValidator{
-			Client: wh.Client,
-			Config: wh.Config,
+			Client:        wh.Client,
+			Config:        wh.Config,
+			ConfigManager: wh.ConfigManager,
 		}).
 		Complete()
 }
 
+type StepRunCustomDefaulter struct {
+	Config        *config.ControllerConfig
+	ConfigManager *config.OperatorConfigManager
+}
+
+var _ webhook.CustomDefaulter = &StepRunCustomDefaulter{}
+
+func (d *StepRunCustomDefaulter) controllerConfig() *config.ControllerConfig {
+	if d.ConfigManager != nil {
+		if cfg := d.ConfigManager.GetControllerConfig(); cfg != nil {
+			return cfg
+		}
+	}
+	if d.Config != nil {
+		return d.Config
+	}
+	return config.DefaultControllerConfig()
+}
+
+func (d *StepRunCustomDefaulter) Default(_ context.Context, obj runtime.Object) error {
+	steprun, ok := obj.(*runsv1alpha1.StepRun)
+	if !ok {
+		return fmt.Errorf("expected a StepRun object but got %T", obj)
+	}
+	steprunlog.Info("Defaulting StepRun", "name", steprun.GetName())
+
+	cfg := d.controllerConfig()
+	steprun.Spec.Retry = webhookshared.ResolveRetryPolicy(cfg, steprun.Spec.Retry)
+
+	return nil
+}
+
 // TODO(user): change verbs to "verbs=create;update;delete" if you want to enable deletion validation.
 // NOTE: The 'path' attribute must follow a specific pattern and should not be modified directly here.
 // Modifying the path for an invalid path can cause API server errors; failing to locate the webhook.
@@ -66,12 +104,25 @@ func (wh *StepRunWebhook) SetupWebhookWithManager(mgr ctrl.Manager) error {
 // NOTE: The +kubebuilder:object:generate=false marker prevents controller-gen from generating DeepCopy methods,
 // as this struct is used only for temporary operations and does not need to be deeply copied.
 type StepRunCustomValidator struct {
-	Client client.Client
-	Config *config.ControllerConfig
+	Client        client.Client
+	Config        *config.ControllerConfig
+	ConfigManager *config.OperatorConfigManager
 }
 
 var _ webhook.CustomValidator = &StepRunCustomValidator{}
 
+func (v *StepRunCustomValidator) controllerConfig() *config.ControllerConfig {
+	if v.ConfigManager != nil {
+		if cfg := v.ConfigManager.GetControllerConfig(); cfg != nil {
+			return cfg
+		}
+	}
+	if v.Config != nil {
+		return v.Config
+	}
+	return config.DefaultControllerConfig()
+}
+
 // ValidateCreate implements webhook.CustomValidator so a webhook will be registered for the type StepRun.
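// As an illustrative example (not from the original patch, with g and t standing in for
// any non-nil targets), both of the following downstream target shapes are rejected by
// the validation below:
//
//	sr.Spec.DownstreamTargets = []runsv1alpha1.DownstreamTarget{{}}                            // neither grpc nor terminate
//	sr.Spec.DownstreamTargets = []runsv1alpha1.DownstreamTarget{{GRPCTarget: g, Terminate: t}} // both set
//
// each failing with "spec.downstreamTargets[0] must set exactly one of grpc or terminate";
// status updates that lower status.observedGeneration are likewise rejected on update.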
func (v *StepRunCustomValidator) ValidateCreate(_ context.Context, obj runtime.Object) (admission.Warnings, error) { steprun, ok := obj.(*runsv1alpha1.StepRun) @@ -83,6 +134,9 @@ func (v *StepRunCustomValidator) ValidateCreate(_ context.Context, obj runtime.O if err := v.validateStepRun(steprun); err != nil { return nil, err } + if err := validateStepRunStatus(steprun); err != nil { + return nil, err + } return nil, nil } @@ -94,6 +148,28 @@ func (v *StepRunCustomValidator) ValidateUpdate(_ context.Context, oldObj, newOb } steprunlog.Info("Validation for StepRun upon update", "name", steprun.GetName()) + // Allow metadata-only updates during deletion (e.g., finalizer removal) + if steprun.DeletionTimestamp != nil { + return nil, nil + } + + if oldSr, ok := oldObj.(*runsv1alpha1.StepRun); ok { + if err := ensureStepRunObservedGenerationMonotonic(oldSr, steprun); err != nil { + return nil, err + } + } + + if err := validateStepRunStatus(steprun); err != nil { + return nil, err + } + + if oldSr, ok := oldObj.(*runsv1alpha1.StepRun); ok { + // Skip further validation if only status changed + if reflect.DeepEqual(oldSr.Spec, steprun.Spec) { + return nil, nil + } + } + if err := v.validateStepRun(steprun); err != nil { return nil, err } @@ -119,14 +195,18 @@ func (v *StepRunCustomValidator) validateStepRun(sr *runsv1alpha1.StepRun) error if err := requireBasicFields(sr); err != nil { return err } - maxBytes := pickMaxInlineBytes(v.Config) + cfg := v.controllerConfig() + maxBytes := pickMaxInlineBytes(cfg) if err := validateInputs(sr, maxBytes); err != nil { return err } if err := validateStatusOutput(sr, maxBytes); err != nil { return err } - return validateTotalSize(sr) + if err := validateTotalSize(sr); err != nil { + return err + } + return validateDownstreamTargets(sr.Spec.DownstreamTargets) } func requireBasicFields(sr *runsv1alpha1.StepRun) error { @@ -140,6 +220,9 @@ func requireBasicFields(sr *runsv1alpha1.StepRun) error { } func pickMaxInlineBytes(cfg *config.ControllerConfig) int { + if cfg == nil { + cfg = config.DefaultControllerConfig() + } maxBytes := cfg.Engram.EngramControllerConfig.DefaultMaxInlineSize if maxBytes == 0 { maxBytes = 1024 @@ -171,6 +254,31 @@ func validateStatusOutput(sr *runsv1alpha1.StepRun, maxBytes int) error { return nil } +func validateDownstreamTargets(targets []runsv1alpha1.DownstreamTarget) error { + for idx, tgt := range targets { + count := 0 + if tgt.GRPCTarget != nil { + count++ + } + if tgt.Terminate != nil { + count++ + } + if count != 1 { + return fmt.Errorf("spec.downstreamTargets[%d] must set exactly one of grpc or terminate", idx) + } + } + return nil +} + +func ensureStepRunObservedGenerationMonotonic(oldSR, newSR *runsv1alpha1.StepRun) error { + oldGen := oldSR.Status.ObservedGeneration + newGen := newSR.Status.ObservedGeneration + if oldGen > 0 && newGen > 0 && newGen < oldGen { + return fmt.Errorf("status.observedGeneration must be monotonically increasing (old=%d new=%d)", oldGen, newGen) + } + return nil +} + func validateTotalSize(sr *runsv1alpha1.StepRun) error { rawSR, err := json.Marshal(sr) if err != nil { @@ -182,3 +290,12 @@ func validateTotalSize(sr *runsv1alpha1.StepRun) error { } return nil } + +func validateStepRunStatus(sr *runsv1alpha1.StepRun) error { + for _, name := range sr.Status.Needs { + if name == sr.Name { + return fmt.Errorf("status.needs cannot reference the StepRun itself") + } + } + return nil +} diff --git a/internal/webhook/runs/v1alpha1/steprun_webhook_test.go 
b/internal/webhook/runs/v1alpha1/steprun_webhook_test.go index 1cc1039..ff310e0 100644 --- a/internal/webhook/runs/v1alpha1/steprun_webhook_test.go +++ b/internal/webhook/runs/v1alpha1/steprun_webhook_test.go @@ -17,11 +17,16 @@ limitations under the License. package v1alpha1 import ( + "context" + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" - // TODO (user): Add any additional imports if needed + bubuv1alpha1 "github.com/bubustack/bobrapet/api/v1alpha1" + "github.com/bubustack/bobrapet/internal/config" + "github.com/bubustack/bobrapet/pkg/enums" + "github.com/bubustack/bobrapet/pkg/refs" ) var _ = Describe("StepRun Webhook", func() { @@ -29,43 +34,111 @@ var _ = Describe("StepRun Webhook", func() { obj *runsv1alpha1.StepRun oldObj *runsv1alpha1.StepRun validator StepRunCustomValidator + defaulter StepRunCustomDefaulter ) BeforeEach(func() { obj = &runsv1alpha1.StepRun{} oldObj = &runsv1alpha1.StepRun{} - validator = StepRunCustomValidator{} + cfg := config.DefaultControllerConfig() + validator = StepRunCustomValidator{Config: cfg} + defaulter = StepRunCustomDefaulter{Config: cfg} Expect(validator).NotTo(BeNil(), "Expected validator to be initialized") Expect(oldObj).NotTo(BeNil(), "Expected oldObj to be initialized") Expect(obj).NotTo(BeNil(), "Expected obj to be initialized") - // TODO (user): Add any setup logic common to all tests }) - AfterEach(func() { - // TODO (user): Add any teardown logic common to all tests - }) + newStepRun := func(withTargets []runsv1alpha1.DownstreamTarget) *runsv1alpha1.StepRun { + return &runsv1alpha1.StepRun{ + Spec: runsv1alpha1.StepRunSpec{ + StoryRunRef: refs.StoryRunReference{ + ObjectReference: refs.ObjectReference{ + Name: "storyrun-1", + }, + }, + StepID: "step-a", + DownstreamTargets: withTargets, + }, + } + } Context("When creating or updating StepRun under Validating Webhook", func() { - // TODO (user): Add logic for validating webhooks - // Example: - // It("Should deny creation if a required field is missing", func() { - // By("simulating an invalid creation scenario") - // obj.SomeRequiredField = "" - // Expect(validator.ValidateCreate(ctx, obj)).Error().To(HaveOccurred()) - // }) - // - // It("Should admit creation if all required fields are present", func() { - // By("simulating an invalid creation scenario") - // obj.SomeRequiredField = "valid_value" - // Expect(validator.ValidateCreate(ctx, obj)).To(BeNil()) - // }) - // - // It("Should validate updates correctly", func() { - // By("simulating a valid update scenario") - // oldObj.SomeRequiredField = "updated_value" - // obj.SomeRequiredField = "updated_value" - // Expect(validator.ValidateUpdate(ctx, oldObj, obj)).To(BeNil()) - // }) + It("allows a single grpc downstream target", func() { + sr := newStepRun([]runsv1alpha1.DownstreamTarget{ + {GRPCTarget: &runsv1alpha1.GRPCTarget{Endpoint: "example:5000"}}, + }) + Expect(validator.validateStepRun(sr)).To(Succeed()) + }) + + It("allows a single terminate downstream target", func() { + sr := newStepRun([]runsv1alpha1.DownstreamTarget{ + {Terminate: &runsv1alpha1.TerminateTarget{StopMode: enums.StopModeSuccess}}, + }) + Expect(validator.validateStepRun(sr)).To(Succeed()) + }) + + It("rejects a downstream target that sets both grpc and terminate", func() { + sr := newStepRun([]runsv1alpha1.DownstreamTarget{ + { + GRPCTarget: &runsv1alpha1.GRPCTarget{Endpoint: "example:5000"}, + Terminate: &runsv1alpha1.TerminateTarget{StopMode: enums.StopModeSuccess}, + }, + }) + 
Expect(validator.validateStepRun(sr)).To(MatchError(ContainSubstring("spec.downstreamTargets[0] must set exactly one of grpc or terminate"))) + }) + + It("rejects a downstream target that sets neither grpc nor terminate", func() { + sr := newStepRun([]runsv1alpha1.DownstreamTarget{{}}) + Expect(validator.validateStepRun(sr)).To(MatchError(ContainSubstring("spec.downstreamTargets[0] must set exactly one of grpc or terminate"))) + }) + + It("ValidateCreate surfaces downstream target errors", func() { + obj = newStepRun([]runsv1alpha1.DownstreamTarget{{}}) + _, err := validator.ValidateCreate(context.Background(), obj) + Expect(err).To(MatchError(ContainSubstring("spec.downstreamTargets[0] must set exactly one of grpc or terminate"))) + }) + + It("ValidateUpdate surfaces downstream target errors when spec changes", func() { + obj = newStepRun([]runsv1alpha1.DownstreamTarget{{}}) + oldObj = newStepRun(nil) + _, err := validator.ValidateUpdate(context.Background(), oldObj, obj) + Expect(err).To(MatchError(ContainSubstring("spec.downstreamTargets[0] must set exactly one of grpc or terminate"))) + }) + + It("rejects observedGeneration regressions on status updates", func() { + oldObj = newStepRun(nil) + oldObj.Status.ObservedGeneration = 5 + obj = oldObj.DeepCopy() + obj.Status.ObservedGeneration = 4 + _, err := validator.ValidateUpdate(context.Background(), oldObj, obj) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("status.observedGeneration")) + }) + }) + + Context("Defaulting StepRun retry policy", func() { + It("sets defaults when retry is nil", func() { + sr := newStepRun(nil) + sr.Spec.Retry = nil + Expect(defaulter.Default(context.Background(), sr)).To(Succeed()) + Expect(sr.Spec.Retry).NotTo(BeNil()) + Expect(sr.Spec.Retry.MaxRetries).NotTo(BeNil()) + Expect(sr.Spec.Retry.Delay).NotTo(BeNil()) + Expect(sr.Spec.Retry.Backoff).NotTo(BeNil()) + }) + + It("fills missing retry fields", func() { + sr := newStepRun(nil) + empty := "" + sr.Spec.Retry = &bubuv1alpha1.RetryPolicy{ + MaxRetries: nil, + Delay: &empty, + } + Expect(defaulter.Default(context.Background(), sr)).To(Succeed()) + Expect(sr.Spec.Retry.MaxRetries).NotTo(BeNil()) + Expect(sr.Spec.Retry.Delay).NotTo(BeNil()) + Expect(*sr.Spec.Retry.Delay).NotTo(BeEmpty()) + }) }) }) diff --git a/internal/webhook/runs/v1alpha1/storyrun_webhook.go b/internal/webhook/runs/v1alpha1/storyrun_webhook.go index dd72c9b..422ec0c 100644 --- a/internal/webhook/runs/v1alpha1/storyrun_webhook.go +++ b/internal/webhook/runs/v1alpha1/storyrun_webhook.go @@ -19,6 +19,7 @@ package v1alpha1 import ( "context" "fmt" + "reflect" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -41,25 +42,20 @@ import ( var storyrunlog = logf.Log.WithName("storyrun-resource") type StoryRunWebhook struct { - Client client.Client - Config *config.ControllerConfig + Client client.Client + Config *config.ControllerConfig + ConfigManager *config.OperatorConfigManager } func (wh *StoryRunWebhook) SetupWebhookWithManager(mgr ctrl.Manager) error { wh.Client = mgr.GetClient() - // Initialize operator config for validation knobs - operatorConfigManager := config.NewOperatorConfigManager( - mgr.GetClient(), - "bobrapet-system", - "bobrapet-operator-config", - ) - wh.Config = operatorConfigManager.GetControllerConfig() return ctrl.NewWebhookManagedBy(mgr). For(&runsv1alpha1.StoryRun{}). 
WithValidator(&StoryRunCustomValidator{ - Client: mgr.GetClient(), - Config: wh.Config, + Client: mgr.GetClient(), + Config: wh.Config, + ConfigManager: wh.ConfigManager, }). Complete() } @@ -75,12 +71,25 @@ func (wh *StoryRunWebhook) SetupWebhookWithManager(mgr ctrl.Manager) error { // NOTE: The +kubebuilder:object:generate=false marker prevents controller-gen from generating DeepCopy methods, // as this struct is used only for temporary operations and does not need to be deeply copied. type StoryRunCustomValidator struct { - Client client.Client - Config *config.ControllerConfig + Client client.Client + Config *config.ControllerConfig + ConfigManager *config.OperatorConfigManager } var _ webhook.CustomValidator = &StoryRunCustomValidator{} +func (v *StoryRunCustomValidator) controllerConfig() *config.ControllerConfig { + if v.ConfigManager != nil { + if cfg := v.ConfigManager.GetControllerConfig(); cfg != nil { + return cfg + } + } + if v.Config != nil { + return v.Config + } + return config.DefaultControllerConfig() +} + // ValidateCreate implements webhook.CustomValidator so a webhook will be registered for the type StoryRun. func (v *StoryRunCustomValidator) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { storyrun, ok := obj.(*runsv1alpha1.StoryRun) @@ -103,6 +112,21 @@ func (v *StoryRunCustomValidator) ValidateUpdate(ctx context.Context, oldObj, ne } storyrunlog.Info("Validation for StoryRun upon update", "name", storyrun.GetName()) + // Allow metadata-only updates during deletion (e.g., finalizer removal) + if storyrun.DeletionTimestamp != nil { + return nil, nil + } + + // Skip validation if the spec hasn't changed (typical for metadata-only updates) + if oldSr, ok := oldObj.(*runsv1alpha1.StoryRun); ok { + if err := ensureStoryRunObservedGenerationMonotonic(oldSr, storyrun); err != nil { + return nil, err + } + if reflect.DeepEqual(oldSr.Spec, storyrun.Spec) { + return nil, nil + } + } + if err := v.validateStoryRun(ctx, storyrun); err != nil { return nil, err } @@ -131,7 +155,8 @@ func (v *StoryRunCustomValidator) validateStoryRun(ctx context.Context, sr *runs if err != nil { return err } - if err := validateInputsShapeAndSize(v.Config, sr); err != nil { + cfg := v.controllerConfig() + if err := validateInputsShapeAndSize(cfg, sr); err != nil { return err } return validateInputsSchema(story, sr) @@ -202,3 +227,12 @@ func validateInputsSchema(story *bubuv1alpha1.Story, sr *runsv1alpha1.StoryRun) } return nil } + +func ensureStoryRunObservedGenerationMonotonic(oldSR, newSR *runsv1alpha1.StoryRun) error { + oldGen := oldSR.Status.ObservedGeneration + newGen := newSR.Status.ObservedGeneration + if oldGen > 0 && newGen > 0 && newGen < oldGen { + return fmt.Errorf("status.observedGeneration must be monotonically increasing (old=%d new=%d)", oldGen, newGen) + } + return nil +} diff --git a/internal/webhook/runs/v1alpha1/storyrun_webhook_test.go b/internal/webhook/runs/v1alpha1/storyrun_webhook_test.go index aa8f66e..09d5ab4 100644 --- a/internal/webhook/runs/v1alpha1/storyrun_webhook_test.go +++ b/internal/webhook/runs/v1alpha1/storyrun_webhook_test.go @@ -17,11 +17,12 @@ limitations under the License. package v1alpha1 import ( + "context" + . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" - // TODO (user): Add any additional imports if needed ) var _ = Describe("StoryRun Webhook", func() { @@ -45,27 +46,16 @@ var _ = Describe("StoryRun Webhook", func() { // TODO (user): Add any teardown logic common to all tests }) - Context("When creating or updating StoryRun under Validating Webhook", func() { - // TODO (user): Add logic for validating webhooks - // Example: - // It("Should deny creation if a required field is missing", func() { - // By("simulating an invalid creation scenario") - // obj.SomeRequiredField = "" - // Expect(validator.ValidateCreate(ctx, obj)).Error().To(HaveOccurred()) - // }) - // - // It("Should admit creation if all required fields are present", func() { - // By("simulating an invalid creation scenario") - // obj.SomeRequiredField = "valid_value" - // Expect(validator.ValidateCreate(ctx, obj)).To(BeNil()) - // }) - // - // It("Should validate updates correctly", func() { - // By("simulating a valid update scenario") - // oldObj.SomeRequiredField = "updated_value" - // obj.SomeRequiredField = "updated_value" - // Expect(validator.ValidateUpdate(ctx, oldObj, obj)).To(BeNil()) - // }) + Context("status invariants", func() { + It("rejects observedGeneration regressions on status updates", func() { + old := &runsv1alpha1.StoryRun{} + old.Status.ObservedGeneration = 9 + updated := old.DeepCopy() + updated.Status.ObservedGeneration = 8 + _, err := validator.ValidateUpdate(context.Background(), old, updated) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("status.observedGeneration")) + }) }) }) diff --git a/internal/webhook/v1alpha1/retry_defaults.go b/internal/webhook/v1alpha1/retry_defaults.go new file mode 100644 index 0000000..b8f650f --- /dev/null +++ b/internal/webhook/v1alpha1/retry_defaults.go @@ -0,0 +1,40 @@ +package v1alpha1 + +import ( + "time" + + bubuv1alpha1 "github.com/bubustack/bobrapet/api/v1alpha1" + "github.com/bubustack/bobrapet/internal/config" + "github.com/bubustack/bobrapet/pkg/enums" +) + +// ResolveRetryPolicy applies controller defaults to a retry policy when it is omitted. 
+func ResolveRetryPolicy(cfg *config.ControllerConfig, policy *bubuv1alpha1.RetryPolicy) *bubuv1alpha1.RetryPolicy {
+	if policy == nil {
+		policy = &bubuv1alpha1.RetryPolicy{}
+	}
+
+	if policy.MaxRetries == nil {
+		maxRetries := int32(3)
+		if cfg != nil && cfg.MaxRetries > 0 {
+			maxRetries = int32(cfg.MaxRetries)
+		}
+		policy.MaxRetries = &maxRetries
+	}
+
+	if policy.Delay == nil || *policy.Delay == "" {
+		delay := time.Second
+		if cfg != nil && cfg.ExponentialBackoffBase > 0 {
+			delay = cfg.ExponentialBackoffBase
+		}
+		delayStr := delay.String()
+		policy.Delay = &delayStr
+	}
+
+	if policy.Backoff == nil {
+		strategy := enums.BackoffStrategyExponential
+		policy.Backoff = &strategy
+	}
+
+	return policy
+}
diff --git a/internal/webhook/v1alpha1/story_webhook.go b/internal/webhook/v1alpha1/story_webhook.go
index 16a26fe..8188c23 100644
--- a/internal/webhook/v1alpha1/story_webhook.go
+++ b/internal/webhook/v1alpha1/story_webhook.go
@@ -20,10 +20,12 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"reflect"
+	"sort"
+	"strings"
 	"time"
 
-	"github.com/xeipuuv/gojsonschema"
-	"k8s.io/apimachinery/pkg/api/errors"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -45,21 +47,24 @@ var storylog = logf.Log.WithName("story-resource")
 
 // StoryWebhook sets up the webhook for Story in the manager.
 type StoryWebhook struct {
 	client.Client
-	Config *config.ControllerConfig
+	Config        *config.ControllerConfig
+	ConfigManager *config.OperatorConfigManager
 }
 
 // SetupWebhookWithManager registers the webhook for Story in the manager.
 func (wh *StoryWebhook) SetupWebhookWithManager(mgr ctrl.Manager) error {
 	wh.Client = mgr.GetClient()
-	operatorConfigManager := config.NewOperatorConfigManager(mgr.GetClient(), "bobrapet-system", "bobrapet-operator-config")
-	wh.Config = operatorConfigManager.GetControllerConfig()
 	return ctrl.NewWebhookManagedBy(mgr).For(&bubushv1alpha1.Story{}).
 		WithValidator(&StoryCustomValidator{
-			Client: wh.Client,
-			Config: wh.Config,
+			Client:        wh.Client,
+			Config:        wh.Config,
+			ConfigManager: wh.ConfigManager,
+		}).
+		WithDefaulter(&StoryCustomDefaulter{
+			Config:        wh.Config,
+			ConfigManager: wh.ConfigManager,
 		}).
-		WithDefaulter(&StoryCustomDefaulter{}).
 		Complete()
 }
 
@@ -71,11 +76,24 @@ func (wh *StoryWebhook) SetupWebhookWithManager(mgr ctrl.Manager) error {
 // NOTE: The +kubebuilder:object:generate=false marker prevents controller-gen from generating DeepCopy methods,
 // as it is used only for temporary operations and does not need to be deeply copied.
 type StoryCustomDefaulter struct {
-	// TODO(user): Add more fields as needed for defaulting
+	Config        *config.ControllerConfig
+	ConfigManager *config.OperatorConfigManager
 }
 
 var _ webhook.CustomDefaulter = &StoryCustomDefaulter{}
 
+func (d *StoryCustomDefaulter) controllerConfig() *config.ControllerConfig {
+	if d.ConfigManager != nil {
+		if cfg := d.ConfigManager.GetControllerConfig(); cfg != nil {
+			return cfg
+		}
+	}
+	if d.Config != nil {
+		return d.Config
+	}
+	return config.DefaultControllerConfig()
+}
+
 // Default implements webhook.CustomDefaulter so a webhook will be registered for the Kind Story.
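+// Defaulting currently fills in retry settings: when spec.policy is set, the step retry
+// policy under policy.retries is completed via ResolveRetryPolicy, and every step that
+// declares execution overrides gets its execution.retry completed the same way; steps
+// without execution overrides are left untouched.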
func (d *StoryCustomDefaulter) Default(_ context.Context, obj runtime.Object) error { story, ok := obj.(*bubushv1alpha1.Story) @@ -85,6 +103,19 @@ func (d *StoryCustomDefaulter) Default(_ context.Context, obj runtime.Object) er } storylog.Info("Defaulting for Story", "name", story.GetName()) + cfg := d.controllerConfig() + if story.Spec.Policy != nil { + if story.Spec.Policy.Retries == nil { + story.Spec.Policy.Retries = &bubushv1alpha1.StoryRetries{} + } + story.Spec.Policy.Retries.StepRetryPolicy = ResolveRetryPolicy(cfg, story.Spec.Policy.Retries.StepRetryPolicy) + } + for i := range story.Spec.Steps { + if story.Spec.Steps[i].Execution != nil { + story.Spec.Steps[i].Execution.Retry = ResolveRetryPolicy(cfg, story.Spec.Steps[i].Execution.Retry) + } + } + return nil } @@ -98,12 +129,25 @@ func (d *StoryCustomDefaulter) Default(_ context.Context, obj runtime.Object) er // NOTE: The +kubebuilder:object:generate=false marker prevents controller-gen from generating DeepCopy methods, // as this struct is used only for temporary operations and does not need to be deeply copied. type StoryCustomValidator struct { - Client client.Client - Config *config.ControllerConfig + Client client.Client + Config *config.ControllerConfig + ConfigManager *config.OperatorConfigManager } var _ webhook.CustomValidator = &StoryCustomValidator{} +func (v *StoryCustomValidator) controllerConfig() *config.ControllerConfig { + if v.ConfigManager != nil { + if cfg := v.ConfigManager.GetControllerConfig(); cfg != nil { + return cfg + } + } + if v.Config != nil { + return v.Config + } + return config.DefaultControllerConfig() +} + // ValidateCreate implements webhook.CustomValidator so a webhook will be registered for the type Story. func (v *StoryCustomValidator) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { story, ok := obj.(*bubushv1alpha1.Story) @@ -129,6 +173,18 @@ func (v *StoryCustomValidator) ValidateUpdate(ctx context.Context, oldObj, newOb } storylog.Info("Validation for Story upon update", "name", story.GetName()) + // Allow metadata-only updates during deletion (e.g., finalizer removal) + if story.DeletionTimestamp != nil { + return nil, nil + } + + // Skip validation if the spec hasn't changed + if oldStory, ok := oldObj.(*bubushv1alpha1.Story); ok { + if reflect.DeepEqual(oldStory.Spec, story.Spec) { + return nil, nil + } + } + webhookCtx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() @@ -158,7 +214,8 @@ func (v *StoryCustomValidator) validateStory(ctx context.Context, story *bubushv if err := validateStorySize(story); err != nil { return err } - maxSize := pickStoryMaxWithSize(v.Config) + cfg := v.controllerConfig() + maxSize := pickStoryMaxWithSize(cfg) if err := validateOutputSize(story, maxSize); err != nil { return err } @@ -168,6 +225,12 @@ func (v *StoryCustomValidator) validateStory(ctx context.Context, story *bubushv if err := validateNeedsExistence(story); err != nil { return err } + if err := validateStepGraphAcyclic(story); err != nil { + return err + } + if err := v.validateExecuteStoryReferences(ctx, story); err != nil { + return err + } return nil } @@ -184,6 +247,9 @@ func validateStorySize(story *bubushv1alpha1.Story) error { } func pickStoryMaxWithSize(cfg *config.ControllerConfig) int { + if cfg == nil { + cfg = config.DefaultControllerConfig() + } maxSize := cfg.MaxStoryWithBlockSizeBytes if maxSize <= 0 { maxSize = config.DefaultControllerConfig().MaxStoryWithBlockSizeBytes @@ -239,7 +305,8 @@ func validatePrimitiveShapes(s 
*bubushv1alpha1.Step) error { } var withConfig struct { StoryRef struct { - Name string `json:"name"` + Name string `json:"name"` + Namespace string `json:"namespace,omitempty"` } `json:"storyRef"` } if err := json.Unmarshal(s.With.Raw, &withConfig); err != nil { @@ -252,6 +319,47 @@ func validatePrimitiveShapes(s *bubushv1alpha1.Step) error { return nil } +func (v *StoryCustomValidator) validateExecuteStoryReferences(ctx context.Context, story *bubushv1alpha1.Story) error { + if v.Client == nil { + return nil + } + for i := range story.Spec.Steps { + step := &story.Spec.Steps[i] + if step.Type != enums.StepTypeExecuteStory || step.With == nil { + continue + } + var withConfig struct { + StoryRef struct { + Name string `json:"name"` + Namespace string `json:"namespace,omitempty"` + } `json:"storyRef"` + } + if err := json.Unmarshal(step.With.Raw, &withConfig); err != nil { + return fmt.Errorf("step '%s' has an invalid 'with' block for type 'executeStory': %w", step.Name, err) + } + targetNamespace := story.Namespace + if withConfig.StoryRef.Namespace != "" { + targetNamespace = withConfig.StoryRef.Namespace + } + if withConfig.StoryRef.Name == "" { + continue + } + if targetNamespace == story.Namespace && withConfig.StoryRef.Name == story.Name { + return fmt.Errorf("step '%s' of type 'executeStory' cannot reference the same story", step.Name) + } + + var target bubushv1alpha1.Story + key := types.NamespacedName{Namespace: targetNamespace, Name: withConfig.StoryRef.Name} + if err := v.Client.Get(ctx, key, &target); err != nil { + if apierrors.IsNotFound(err) { + return fmt.Errorf("step '%s' of type 'executeStory' references Story '%s/%s' which does not exist", step.Name, targetNamespace, withConfig.StoryRef.Name) + } + return fmt.Errorf("failed to validate executeStory reference for step '%s': %w", step.Name, err) + } + } + return nil +} + func validateNeedsExistence(story *bubushv1alpha1.Story) error { stepNames := make(map[string]struct{}, len(story.Spec.Steps)) for i := range story.Spec.Steps { @@ -268,6 +376,92 @@ func validateNeedsExistence(story *bubushv1alpha1.Story) error { return nil } +func validateStepGraphAcyclic(story *bubushv1alpha1.Story) error { + if len(story.Spec.Steps) == 0 { + return nil + } + + indegree, edges, err := buildStepDependencyGraph(story) + if err != nil { + return err + } + + blocked := detectStepGraphCycles(indegree, edges) + if len(blocked) > 0 { + sort.Strings(blocked) + return fmt.Errorf("story contains a dependency cycle involving step(s): %s", strings.Join(blocked, ", ")) + } + return nil +} + +func buildStepDependencyGraph(story *bubushv1alpha1.Story) (map[string]int, map[string][]string, error) { + indegree := make(map[string]int, len(story.Spec.Steps)) + edges := make(map[string][]string, len(story.Spec.Steps)) + index := make(map[string]int, len(story.Spec.Steps)) + + for i := range story.Spec.Steps { + name := story.Spec.Steps[i].Name + indegree[name] = 0 + index[name] = i + } + + for i := range story.Spec.Steps { + step := &story.Spec.Steps[i] + seen := make(map[string]struct{}, len(step.Needs)) + for _, dep := range step.Needs { + if dep == step.Name { + return nil, nil, fmt.Errorf("step '%s' cannot depend on itself", step.Name) + } + if depIdx, ok := index[dep]; ok && depIdx >= i { + return nil, nil, fmt.Errorf("step '%s' dependency '%s' must be declared before the step", step.Name, dep) + } + if _, dup := seen[dep]; dup { + continue + } + seen[dep] = struct{}{} + edges[dep] = append(edges[dep], step.Name) + indegree[step.Name]++ + } + } + + 
return indegree, edges, nil +} + +func detectStepGraphCycles(indegree map[string]int, edges map[string][]string) []string { + queue := make([]string, 0, len(indegree)) + for name, deg := range indegree { + if deg == 0 { + queue = append(queue, name) + } + } + + processed := 0 + for len(queue) > 0 { + current := queue[0] + queue = queue[1:] + processed++ + + for _, child := range edges[current] { + indegree[child]-- + if indegree[child] == 0 { + queue = append(queue, child) + } + } + } + + if processed == len(indegree) { + return nil + } + + blocked := make([]string, 0, len(indegree)) + for name, deg := range indegree { + if deg > 0 { + blocked = append(blocked, name) + } + } + return blocked +} + func (v *StoryCustomValidator) validateEngramStep(ctx context.Context, namespace string, step *bubushv1alpha1.Step) error { // Fetch the Engram var engram bubushv1alpha1.Engram @@ -276,7 +470,7 @@ func (v *StoryCustomValidator) validateEngramStep(ctx context.Context, namespace engramKey.Namespace = *step.Ref.Namespace } if err := v.Client.Get(ctx, engramKey, &engram); err != nil { - if errors.IsNotFound(err) { + if apierrors.IsNotFound(err) { return fmt.Errorf("step '%s' references engram '%s' which does not exist in namespace '%s'", step.Name, engramKey.Name, engramKey.Namespace) } return fmt.Errorf("failed to get engram for step '%s': %w", step.Name, err) @@ -285,26 +479,16 @@ func (v *StoryCustomValidator) validateEngramStep(ctx context.Context, namespace // Fetch the EngramTemplate var template v1alpha1.EngramTemplate if err := v.Client.Get(ctx, types.NamespacedName{Name: engram.Spec.TemplateRef.Name, Namespace: ""}, &template); err != nil { - if errors.IsNotFound(err) { + if apierrors.IsNotFound(err) { return fmt.Errorf("step '%s' references engram '%s' which in turn references EngramTemplate '%s' that was not found", step.Name, engram.Name, engram.Spec.TemplateRef.Name) } return fmt.Errorf("failed to get EngramTemplate for step '%s': %w", step.Name, err) } - // Validate the step's 'with' block against the template's inputSchema. + // Validate the step's 'with' block against the template's inputSchema using shared validator. if step.With != nil && len(step.With.Raw) > 0 && template.Spec.InputSchema != nil && len(template.Spec.InputSchema.Raw) > 0 { - schemaLoader := gojsonschema.NewStringLoader(string(template.Spec.InputSchema.Raw)) - documentLoader := gojsonschema.NewStringLoader(string(step.With.Raw)) - result, err := gojsonschema.Validate(schemaLoader, documentLoader) - if err != nil { - return fmt.Errorf("step '%s': error validating 'with' block against EngramTemplate schema: %w", step.Name, err) - } - if !result.Valid() { - var errs []string - for _, desc := range result.Errors() { - errs = append(errs, desc.String()) - } - return fmt.Errorf("step '%s': 'with' block is invalid against EngramTemplate schema: %v", step.Name, errs) + if err := validateJSONAgainstSchema(step.With.Raw, template.Spec.InputSchema.Raw, "EngramTemplate"); err != nil { + return fmt.Errorf("step '%s': %w", step.Name, err) } } diff --git a/internal/webhook/v1alpha1/story_webhook_test.go b/internal/webhook/v1alpha1/story_webhook_test.go index 2d0f4b0..d111631 100644 --- a/internal/webhook/v1alpha1/story_webhook_test.go +++ b/internal/webhook/v1alpha1/story_webhook_test.go @@ -1,87 +1,134 @@ -/* -Copyright 2025 BubuStack. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - package v1alpha1 import ( + "encoding/json" + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + bubushv1alpha1 "github.com/bubustack/bobrapet/api/v1alpha1" - // TODO (user): Add any additional imports if needed + "github.com/bubustack/bobrapet/internal/config" + "github.com/bubustack/bobrapet/pkg/enums" ) var _ = Describe("Story Webhook", func() { var ( - obj *bubushv1alpha1.Story - oldObj *bubushv1alpha1.Story validator StoryCustomValidator defaulter StoryCustomDefaulter ) BeforeEach(func() { - obj = &bubushv1alpha1.Story{} - oldObj = &bubushv1alpha1.Story{} - validator = StoryCustomValidator{} - Expect(validator).NotTo(BeNil(), "Expected validator to be initialized") - defaulter = StoryCustomDefaulter{} - Expect(defaulter).NotTo(BeNil(), "Expected defaulter to be initialized") - Expect(oldObj).NotTo(BeNil(), "Expected oldObj to be initialized") - Expect(obj).NotTo(BeNil(), "Expected obj to be initialized") - // TODO (user): Add any setup logic common to all tests + cfg := config.DefaultControllerConfig() + validator = StoryCustomValidator{Client: k8sClient, Config: cfg} + defaulter = StoryCustomDefaulter{Config: cfg} + Expect(defaulter).NotTo(BeNil()) }) - AfterEach(func() { - // TODO (user): Add any teardown logic common to all tests - }) + Context("defaulting", func() { + It("applies retry defaults to story policy and steps", func() { + story := minimalStory("defaults") + story.Spec.Policy = &bubushv1alpha1.StoryPolicy{} + story.Spec.Steps[0].Execution = &bubushv1alpha1.ExecutionOverrides{} - Context("When creating Story under Defaulting Webhook", func() { - // TODO (user): Add logic for defaulting webhooks - // Example: - // It("Should apply defaults when a required field is empty", func() { - // By("simulating a scenario where defaults should be applied") - // obj.SomeFieldWithDefault = "" - // By("calling the Default method to apply defaults") - // defaulter.Default(ctx, obj) - // By("checking that the default values are set") - // Expect(obj.SomeFieldWithDefault).To(Equal("default_value")) - // }) + Expect(defaulter.Default(ctx, story)).To(Succeed()) + Expect(story.Spec.Policy.Retries).NotTo(BeNil()) + Expect(story.Spec.Policy.Retries.StepRetryPolicy).NotTo(BeNil()) + Expect(story.Spec.Steps[0].Execution.Retry).NotTo(BeNil()) + }) }) - Context("When creating or updating Story under Validating Webhook", func() { - // TODO (user): Add logic for validating webhooks - // Example: - // It("Should deny creation if a required field is missing", func() { - // By("simulating an invalid creation scenario") - // obj.SomeRequiredField = "" - // Expect(validator.ValidateCreate(ctx, obj)).Error().To(HaveOccurred()) - // }) - // - // It("Should admit creation if all required fields are present", func() { - // By("simulating an invalid creation scenario") - // obj.SomeRequiredField = "valid_value" - // Expect(validator.ValidateCreate(ctx, obj)).To(BeNil()) - // }) - // - // It("Should validate updates correctly", func() { - // By("simulating a valid update scenario") - // oldObj.SomeRequiredField = 
"updated_value" - // obj.SomeRequiredField = "updated_value" - // Expect(validator.ValidateUpdate(ctx, oldObj, obj)).To(BeNil()) - // }) - }) + Context("executeStory validation", func() { + It("rejects executeStory steps that reference missing stories", func() { + story := minimalStory("parent") + story.Spec.Steps = append(story.Spec.Steps, bubushv1alpha1.Step{ + Name: "invoke-child", + Type: enums.StepTypeExecuteStory, + With: mustRawExtension(map[string]any{ + "storyRef": map[string]any{"name": "missing"}, + }), + }) + + _, err := validator.ValidateCreate(ctx, story) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("does not exist")) + }) + It("rejects executeStory steps that reference the same story", func() { + story := minimalStory("loop") + story.Spec.Steps = append(story.Spec.Steps, bubushv1alpha1.Step{ + Name: "self", + Type: enums.StepTypeExecuteStory, + With: mustRawExtension(map[string]any{ + "storyRef": map[string]any{"name": "loop"}, + }), + }) + + _, err := validator.ValidateCreate(ctx, story) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("cannot reference the same story")) + }) + + It("allows executeStory steps that reference existing stories", func() { + child := minimalStory("child") + Expect(k8sClient.Create(ctx, child)).To(Succeed()) + DeferCleanup(func() { + _ = k8sClient.Delete(ctx, child) + }) + + story := minimalStory("parent") + story.Spec.Steps = append(story.Spec.Steps, bubushv1alpha1.Step{ + Name: "invoke-child", + Type: enums.StepTypeExecuteStory, + With: mustRawExtension(map[string]any{ + "storyRef": map[string]any{"name": "child"}, + }), + }) + + warnings, err := validator.ValidateCreate(ctx, story) + Expect(err).NotTo(HaveOccurred()) + Expect(warnings).To(BeNil()) + }) + }) }) + +func minimalStory(name string) *bubushv1alpha1.Story { + return &bubushv1alpha1.Story{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "default", + Name: name, + }, + Spec: bubushv1alpha1.StorySpec{ + Steps: []bubushv1alpha1.Step{{ + Name: "prepare", + Type: enums.StepTypeSetData, + }}, + }, + } +} + +func mustRawExtension(payload any) *runtime.RawExtension { + raw, err := toRawExtension(payload) + Expect(err).NotTo(HaveOccurred()) + return raw +} + +func toRawExtension(payload any) (*runtime.RawExtension, error) { + if payload == nil { + return nil, nil + } + switch v := payload.(type) { + case *runtime.RawExtension: + return v.DeepCopy(), nil + case []byte: + return &runtime.RawExtension{Raw: append([]byte(nil), v...)}, nil + default: + data, err := json.Marshal(payload) + if err != nil { + return nil, err + } + return &runtime.RawExtension{Raw: data}, nil + } +} diff --git a/internal/webhook/v1alpha1/validate_helpers.go b/internal/webhook/v1alpha1/validate_helpers.go index aa25f0c..d78ea87 100644 --- a/internal/webhook/v1alpha1/validate_helpers.go +++ b/internal/webhook/v1alpha1/validate_helpers.go @@ -1,6 +1,7 @@ package v1alpha1 import ( + "encoding/json" "fmt" "github.com/xeipuuv/gojsonschema" @@ -42,7 +43,17 @@ func enforceMaxBytes(field string, raw []byte, max int) error { } func validateJSONAgainstSchema(doc []byte, schema []byte, schemaName string) error { - schemaLoader := gojsonschema.NewStringLoader(string(schema)) + // Normalize schema to support field-level required booleans (required: true) + // by translating them into standard JSON Schema required arrays at the + // appropriate object levels. 
This allows template authors to mark + // properties as required inline without manually maintaining a separate + // parent-level required list. + normalizedSchema, err := normalizeSchemaBytes(schema) + if err != nil { + return fmt.Errorf("error validating against %s schema: failed to normalize schema: %w", schemaName, err) + } + + schemaLoader := gojsonschema.NewStringLoader(string(normalizedSchema)) documentLoader := gojsonschema.NewStringLoader(string(doc)) result, err := gojsonschema.Validate(schemaLoader, documentLoader) if err != nil { @@ -57,3 +68,180 @@ func validateJSONAgainstSchema(doc []byte, schema []byte, schemaName string) err } return nil } + +// normalizeSchemaBytes takes a JSON-encoded schema and rewrites any +// property-level `required: true` flags into the parent object's `required` +// array, recursively. It preserves all other keywords and structure. +func normalizeSchemaBytes(schema []byte) ([]byte, error) { + if len(schema) == 0 { + return schema, nil + } + var root any + if err := json.Unmarshal(schema, &root); err != nil { + return nil, err + } + normalized := normalizeSchemaNode(root) + out, err := json.Marshal(normalized) + if err != nil { + return nil, err + } + return out, nil +} + +// normalizeSchemaNode walks an arbitrary JSON value. For any object with a +// `properties` map, it collects boolean `required: true` flags from its child +// property schemas and moves them into the object's `required` array. +// It also recurses into common nested schema locations like items, +// additionalProperties, allOf/anyOf/oneOf, $defs/definitions and +// patternProperties. +func normalizeSchemaNode(node any) any { + switch typed := node.(type) { + case map[string]any: + return normalizeObjectSchema(typed) + case []any: + for i := range typed { + typed[i] = normalizeSchemaNode(typed[i]) + } + return typed + default: + return node + } +} + +func normalizeObjectSchema(obj map[string]any) map[string]any { + requiredSet := liftInlineRequiredFlags(obj) + mergeRequiredSet(obj, requiredSet) + normalizeNestedSchemaLocations(obj) + return obj +} + +func liftInlineRequiredFlags(obj map[string]any) map[string]struct{} { + props, hasProps := obj["properties"].(map[string]any) + if !hasProps { + return nil + } + + requiredSet := map[string]struct{}{} + for propName, rawChild := range props { + cleaned := stripBooleanRequired(rawChild, propName, requiredSet) + props[propName] = normalizeSchemaNode(cleaned) + } + if len(requiredSet) == 0 { + return nil + } + return requiredSet +} + +func stripBooleanRequired(node any, propName string, requiredSet map[string]struct{}) any { + childMap, ok := node.(map[string]any) + if !ok { + return node + } + if raw, has := childMap["required"]; has { + if b, ok := raw.(bool); ok { + if b { + requiredSet[propName] = struct{}{} + } + delete(childMap, "required") + } + } + return childMap +} + +func mergeRequiredSet(obj map[string]any, requiredSet map[string]struct{}) { + if len(requiredSet) == 0 { + return + } + + existingList := extractExistingRequired(obj) + seen := make(map[string]struct{}, len(existingList)) + for _, name := range existingList { + seen[name] = struct{}{} + } + + for name := range requiredSet { + if _, already := seen[name]; !already { + existingList = append(existingList, name) + } + } + + out := make([]any, 0, len(existingList)) + for _, name := range existingList { + out = append(out, name) + } + obj["required"] = out +} + +func extractExistingRequired(obj map[string]any) []string { + raw, has := obj["required"] + if !has { + 
return nil + } + + switch typed := raw.(type) { + case []any: + var result []string + for _, v := range typed { + if s, ok := v.(string); ok { + result = append(result, s) + } + } + return result + case []string: + return append([]string{}, typed...) + default: + return nil + } +} + +func normalizeNestedSchemaLocations(obj map[string]any) { + normalizeItemsNode(obj) + normalizeSingleSchemaField(obj, "additionalProperties") + normalizeMapOfSchemas(obj, "patternProperties") + normalizeSchemaSlice(obj, "allOf") + normalizeSchemaSlice(obj, "anyOf") + normalizeSchemaSlice(obj, "oneOf") + normalizeMapOfSchemas(obj, "definitions") + normalizeMapOfSchemas(obj, "$defs") + normalizeSingleSchemaField(obj, "not") +} + +func normalizeItemsNode(obj map[string]any) { + items, has := obj["items"] + if !has { + return + } + switch typed := items.(type) { + case map[string]any, []any: + obj["items"] = normalizeSchemaNode(typed) + } +} + +func normalizeSingleSchemaField(obj map[string]any, key string) { + if raw, has := obj[key]; has { + if schemaMap, ok := raw.(map[string]any); ok { + obj[key] = normalizeSchemaNode(schemaMap) + } + } +} + +func normalizeMapOfSchemas(obj map[string]any, key string) { + raw, has := obj[key].(map[string]any) + if !has { + return + } + for k, v := range raw { + raw[k] = normalizeSchemaNode(v) + } +} + +func normalizeSchemaSlice(obj map[string]any, key string) { + raw, has := obj[key].([]any) + if !has { + return + } + for i := range raw { + raw[i] = normalizeSchemaNode(raw[i]) + } + obj[key] = raw +} diff --git a/pkg/cache/storyrun_cache.go b/pkg/cache/storyrun_cache.go deleted file mode 100644 index deea59c..0000000 --- a/pkg/cache/storyrun_cache.go +++ /dev/null @@ -1,43 +0,0 @@ -package cache - -import ( - "sync" - - runsv1alpha1 "github.com/bubustack/bobrapet/api/runs/v1alpha1" - "k8s.io/apimachinery/pkg/types" -) - -// StoryRunCache is a simple thread-safe in-memory cache for StoryRun objects. -type StoryRunCache struct { - mu sync.RWMutex - store map[types.NamespacedName]*runsv1alpha1.StoryRun -} - -// NewStoryRunCache creates a new StoryRunCache. -func NewStoryRunCache() *StoryRunCache { - return &StoryRunCache{ - store: make(map[types.NamespacedName]*runsv1alpha1.StoryRun), - } -} - -// Get retrieves a StoryRun from the cache. -func (c *StoryRunCache) Get(name types.NamespacedName) (*runsv1alpha1.StoryRun, bool) { - c.mu.RLock() - defer c.mu.RUnlock() - srun, found := c.store[name] - return srun, found -} - -// AddOrUpdate adds or updates a StoryRun in the cache. -func (c *StoryRunCache) AddOrUpdate(srun *runsv1alpha1.StoryRun) { - c.mu.Lock() - defer c.mu.Unlock() - c.store[types.NamespacedName{Name: srun.Name, Namespace: srun.Namespace}] = srun -} - -// Delete removes a StoryRun from the cache. 
diff --git a/pkg/cel/evaluator.go b/pkg/cel/evaluator.go
index b1d1ea5..7a40ca0 100644
--- a/pkg/cel/evaluator.go
+++ b/pkg/cel/evaluator.go
@@ -3,16 +3,26 @@ package cel

 import (
     "context"
     "fmt"
+    "math"
     "reflect"
+    "regexp"
     "strings"
+    "unicode"

     "github.com/google/cel-go/cel"
+    "github.com/google/cel-go/common/types"
+    "github.com/google/cel-go/common/types/ref"
+    "github.com/google/cel-go/common/types/traits"
     "github.com/google/cel-go/ext"
     "google.golang.org/protobuf/types/known/structpb"
+
+    "go.opentelemetry.io/otel/attribute"
+
     "github.com/bubustack/bobrapet/pkg/observability"
 )
+
+const ManifestLengthKey = "__bubu_manifest_len"
+
 // celEnvLib provides custom functions to the CEL environment.
 type celEnvLib struct{}

@@ -38,6 +48,7 @@ func New(logger observability.Logger) (*Evaluator, error) {
         cel.Lib(celEnvLib{}),
         ext.Strings(),
         ext.Encoders(),
+        lenFunction(),
         // Declare top-level variables that can be used in expressions.
         cel.Variable("inputs", cel.MapType(cel.StringType, cel.AnyType)),
         cel.Variable("steps", cel.MapType(cel.StringType, cel.AnyType)),
@@ -60,11 +71,14 @@ func (e *Evaluator) Close() {

 // EvaluateWhenCondition evaluates a step's `when` condition.
 func (e *Evaluator) EvaluateWhenCondition(ctx context.Context, when string, vars map[string]any) (bool, error) {
+    ctx, span := observability.StartSpan(ctx, "cel.EvaluateWhenCondition", attribute.String("expression_type", "when"))
+    defer span.End()
     if strings.TrimSpace(when) == "" {
         return true, nil
     }

-    program, err := e.compile(ctx, when, "when")
+    sanitized := sanitizeCELExpression(when)
+    program, err := e.compile(ctx, sanitized, "when")
     if err != nil {
         return false, fmt.Errorf("failed to compile 'when' expression: %w", err)
     }
@@ -88,6 +102,8 @@ func (e *Evaluator) ResolveWithInputs(
     with map[string]any,
     vars map[string]any,
 ) (map[string]any, error) {
+    ctx, span := observability.StartSpan(ctx, "cel.ResolveWithInputs", attribute.String("expression_type", "with"))
+    defer span.End()
     if with == nil {
         return nil, nil
     }
@@ -102,7 +118,8 @@ func (e *Evaluator) ResolveWithInputs(
         if strings.HasPrefix(strVal, "{{") && strings.HasSuffix(strVal, "}}") {
             expr := strings.TrimSpace(strVal[2 : len(strVal)-2])

-            program, err := e.compile(ctx, expr, "with")
+            sanitized := sanitizeCELExpression(expr)
+            program, err := e.compile(ctx, sanitized, "with")
             if err != nil {
                 return nil, fmt.Errorf("failed to compile 'with' expression for key '%s': %w", key, err)
             }
@@ -126,9 +143,184 @@ func (e *Evaluator) ResolveWithInputs(
 }

 func (e *Evaluator) compile(ctx context.Context, expr, exprType string) (cel.Program, error) {
+    ctx, span := observability.StartSpan(ctx, "cel.compile", attribute.String("expression_type", exprType))
+    defer span.End()
     cached, err := e.cache.CompileAndCache(ctx, expr, exprType)
     if err != nil {
         return nil, err
     }
     return cached.Program, nil
 }
+
+func lenFunction() cel.EnvOption {
+    return cel.Function("len",
+        cel.Overload(
+            "len_dyn",
+            []*cel.Type{cel.AnyType},
+            cel.IntType,
+            cel.UnaryBinding(func(value ref.Val) ref.Val {
+                length, err := celLengthFromRefVal(value)
+                if err != nil {
+                    return types.NewErr("%s", err.Error())
+                }
+                return types.Int(length)
+            }),
+        ),
+    )
+}
+
+func celLengthFromRefVal(value ref.Val) (int64, error) {
+    if value == nil || value == types.NullValue {
+        return 0, nil
+    }
+
+    if mapper, ok := value.(traits.Mapper); ok {
+        if length, ok := manifestLengthFromMapper(mapper); ok {
+            return length, nil
+        }
+    }
+
+    if sizer, ok := value.(traits.Sizer); ok {
+        if length, ok := lengthFromSizer(sizer); ok {
+            return length, nil
+        }
+    }
+
+    if length, ok := lengthFromNativeValue(value.Value()); ok {
+        return length, nil
+    }
+
+    return 0, fmt.Errorf("len: unsupported argument type %s", value.Type())
+}
+
+func manifestLengthFromMapper(mapper traits.Mapper) (int64, bool) {
+    manifestVal, found := mapper.Find(types.String(ManifestLengthKey))
+    if !found || manifestVal == nil || manifestVal == types.NullValue {
+        return 0, false
+    }
+    return coerceRefValToInt64(manifestVal)
+}
+
+func lengthFromSizer(sizer traits.Sizer) (int64, bool) {
+    size := sizer.Size()
+    if size == nil || size == types.NullValue {
+        return 0, true
+    }
+    if intVal, ok := size.(types.Int); ok {
+        return int64(intVal), true
+    }
+    if native, err := size.ConvertToNative(reflect.TypeOf(int64(0))); err == nil {
+        if typed, ok := native.(int64); ok {
+            return typed, true
+        }
+    }
+    if converted, err := size.ConvertToNative(reflect.TypeOf(int(0))); err == nil {
+        if typed, ok := converted.(int); ok {
+            return int64(typed), true
+        }
+    }
+    return coerceToInt64(size.Value())
+}
+
+func lengthFromNativeValue(value any) (int64, bool) {
+    switch v := value.(type) {
+    case string:
+        return int64(len(v)), true
+    case []byte:
+        return int64(len(v)), true
+    case []any:
+        return int64(len(v)), true
+    case map[string]any:
+        if length, ok := coerceToInt64(v[ManifestLengthKey]); ok {
+            return length, true
+        }
+        return int64(len(v)), true
+    case map[any]any:
+        if length, ok := coerceToInt64(v[ManifestLengthKey]); ok {
+            return length, true
+        }
+        return int64(len(v)), true
+    default:
+        return 0, false
+    }
+}
+
+func coerceRefValToInt64(val ref.Val) (int64, bool) {
+    if val == nil || val == types.NullValue {
+        return 0, false
+    }
+    if intVal, ok := val.(types.Int); ok {
+        return int64(intVal), true
+    }
+    if native, err := val.ConvertToNative(reflect.TypeOf(int64(0))); err == nil {
+        if typed, ok := native.(int64); ok {
+            return typed, true
+        }
+    }
+    return coerceToInt64(val.Value())
+}
+
+func coerceToInt64(value any) (int64, bool) {
+    switch v := value.(type) {
+    case int:
+        return int64(v), true
+    case int32:
+        return int64(v), true
+    case int64:
+        return v, true
+    case float32:
+        return int64(v), true
+    case float64:
+        return int64(v), true
+    case uint:
+        return int64(v), true
+    case uint32:
+        return int64(v), true
+    case uint64:
+        if v > math.MaxInt64 {
+            return 0, false
+        }
+        return int64(v), true
+    case types.Int:
+        return int64(v), true
+    case *int64:
+        if v == nil {
+            return 0, false
+        }
+        return *v, true
+    default:
+        return 0, false
+    }
+}
+
+var (
+    stepReferencePattern = regexp.MustCompile(`steps\.([A-Za-z0-9_\-]+)`)
+)
+
+func sanitizeCELExpression(expr string) string {
+    return stepReferencePattern.ReplaceAllStringFunc(expr, func(match string) string {
+        submatches := stepReferencePattern.FindStringSubmatch(match)
+        if len(submatches) != 2 {
+            return match
+        }
+        original := submatches[1]
+        alias := sanitizeIdentifier(original)
+        if alias == original {
+            return match
+        }
+        return strings.Replace(match, original, alias, 1)
+    })
+}
+
+func sanitizeIdentifier(name string) string {
+    var b strings.Builder
+    b.Grow(len(name))
+    for _, r := range name {
+        if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' {
+            b.WriteRune(r)
+        } else {
+            b.WriteRune('_')
+        }
+    }
+    return b.String()
+}
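
Illustrative note, not part of the diff: the sanitizer only rewrites the step-name segment of a `steps.<name>` reference, since a hyphen is not valid inside a CEL identifier and would otherwise parse as subtraction. A minimal sketch of the rewrite:

// Illustrative only: what sanitizeCELExpression produces before compilation.
expr := `len(steps.list-tools.output.tools) > 0`
sanitized := sanitizeCELExpression(expr)
// sanitized == `len(steps.list_tools.output.tools) > 0`
// The evaluation vars must therefore expose the step context under both
// spellings ("list-tools" and "list_tools"), as the tests below do.
_ = sanitized
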
diff --git a/pkg/cel/evaluator_test.go b/pkg/cel/evaluator_test.go
new file mode 100644
index 0000000..19e71e7
--- /dev/null
+++ b/pkg/cel/evaluator_test.go
@@ -0,0 +1,77 @@
+package cel
+
+import (
+    "context"
+    "testing"
+    "time"
+)
+
+type nopLogger struct{}
+
+func (nopLogger) CacheHit(string, string)                              {}
+func (nopLogger) EvaluationStart(string, string)                       {}
+func (nopLogger) EvaluationError(error, string, string, time.Duration) {}
+func (nopLogger) EvaluationSuccess(string, string, time.Duration, any) {}
+
+func TestEvaluateWhenConditionHandlesHyphenatedStepNames(t *testing.T) {
+    eval, err := New(nopLogger{})
+    if err != nil {
+        t.Fatalf("failed to create evaluator: %v", err)
+    }
+    defer eval.Close()
+
+    tools := []any{"issue.create"}
+    stepContext := map[string]any{
+        "outputs": map[string]any{"tools": tools},
+        "output":  map[string]any{"tools": tools},
+    }
+
+    vars := map[string]any{
+        "inputs": map[string]any{},
+        "steps": map[string]any{
+            "list-tools": stepContext,
+            "list_tools": stepContext,
+        },
+    }
+
+    ok, err := eval.EvaluateWhenCondition(context.Background(), `len(steps.list-tools.output.tools) > 0`, vars)
+    if err != nil {
+        t.Fatalf("unexpected evaluation error: %v", err)
+    }
+    if !ok {
+        t.Fatalf("expected condition to evaluate to true")
+    }
+}
+
+func TestLenUsesManifestLengthMetadata(t *testing.T) {
+    eval, err := New(nopLogger{})
+    if err != nil {
+        t.Fatalf("failed to create evaluator: %v", err)
+    }
+    defer eval.Close()
+
+    stepContext := map[string]any{
+        "outputs": map[string]any{
+            ManifestLengthKey: int64(4),
+        },
+        "output": map[string]any{
+            ManifestLengthKey: int64(4),
+        },
+    }
+
+    vars := map[string]any{
+        "inputs": map[string]any{},
+        "steps": map[string]any{
+            "collect-data": stepContext,
+            "collect_data": stepContext,
+        },
+    }
+
+    ok, err := eval.EvaluateWhenCondition(context.Background(), `len(steps.collect-data.output) == 4`, vars)
+    if err != nil {
+        t.Fatalf("unexpected evaluation error: %v", err)
+    }
+    if !ok {
+        t.Fatalf("expected manifest-backed length to evaluate to true")
+    }
+}
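
Illustrative note, not part of the diff: the controller-side step that copies StepRunStatus.Manifest lengths into the `__bubu_manifest_len` placeholder is not shown in this change. The sketch below is one plausible shape; the helper name, its location, and its imports are assumptions.

// Hypothetical wiring: project StepManifestData (from StepRunStatus.Manifest)
// into a CEL-visible placeholder so len(steps.<name>.output) resolves without
// hydrating the offloaded blob. ManifestLengthKey is the constant added above.
func stepOutputPlaceholder(m runsv1alpha1.StepManifestData) map[string]any {
    placeholder := map[string]any{}
    if m.Length != nil {
        placeholder[ManifestLengthKey] = *m.Length
    }
    return placeholder
}
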
diff --git a/pkg/cel/expressions.go b/pkg/cel/expressions.go
index b0d84da..52b6ee5 100644
--- a/pkg/cel/expressions.go
+++ b/pkg/cel/expressions.go
@@ -149,7 +149,7 @@ func (r *ExpressionResolver) resolveStepsExpression(ctx context.Context, express
     err := r.client.List(ctx, &stepRuns,
         client.InNamespace(r.namespace),
         client.MatchingLabels{
-            "bobrapet.bubustack.io/story-run": r.storyRun,
+            "bubustack.io/storyrun": r.storyRun,
         })
     if err != nil {
@@ -303,7 +303,7 @@ func (r *ExpressionResolver) CheckDependencies(ctx context.Context, dependencies
     err := r.client.List(ctx, &stepRuns,
         client.InNamespace(r.namespace),
         client.MatchingLabels{
-            "bobrapet.bubustack.io/story-run": r.storyRun,
+            "bubustack.io/storyrun": r.storyRun,
         })
     if err != nil {
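
Illustrative note, not part of the diff: both lookups now select on the shorter label key, so whatever creates StepRuns must stamp the same key. That creation site is not included in this change; the snippet below is an assumption about its shape.

// Assumed creation-side labelling that the new selector relies on.
stepRun.Labels = map[string]string{
    "bubustack.io/storyrun": storyRun.Name,
}
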
diff --git a/pkg/observability/tracing.go b/pkg/observability/tracing.go
new file mode 100644
index 0000000..b25fd2b
--- /dev/null
+++ b/pkg/observability/tracing.go
@@ -0,0 +1,32 @@
+package observability
+
+import (
+    "context"
+    "sync/atomic"
+
+    "go.opentelemetry.io/otel"
+    "go.opentelemetry.io/otel/attribute"
+    "go.opentelemetry.io/otel/trace"
+)
+
+var tracingEnabled atomic.Bool
+
+// EnableTracing toggles OpenTelemetry span emission for shared helpers.
+func EnableTracing(enabled bool) {
+    tracingEnabled.Store(enabled)
+}
+
+// TracingEnabled reports whether spans should be emitted.
+func TracingEnabled() bool {
+    return tracingEnabled.Load()
+}
+
+// StartSpan creates a tracer span when tracing is enabled and returns the context/span pair.
+// When tracing is disabled, it returns the original context and a no-op span.
+func StartSpan(ctx context.Context, name string, attrs ...attribute.KeyValue) (context.Context, trace.Span) {
+    if !TracingEnabled() {
+        return ctx, trace.SpanFromContext(ctx)
+    }
+    tracer := otel.Tracer("github.com/bubustack/bobrapet")
+    return tracer.Start(ctx, name, trace.WithAttributes(attrs...))
+}
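
Illustrative note, not part of the diff: callers only pay for span creation once the flag is switched on. A minimal usage sketch, assuming an OTel tracer provider has already been registered during operator start-up (that wiring is not shown here):

// Once at start-up, after the tracer provider is installed:
observability.EnableTracing(true)

// In shared helpers, the same pattern the CEL evaluator uses above:
ctx, span := observability.StartSpan(ctx, "cel.compile",
    attribute.String("expression_type", "when"))
defer span.End()
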