From 84d01382ee2eea32bcca93780a92f20ce6d4a7f8 Mon Sep 17 00:00:00 2001 From: ajanikow <12255597+ajanikow@users.noreply.github.com> Date: Tue, 16 Nov 2021 10:09:08 +0000 Subject: [PATCH] [Feature] Add Startup Probe support --- CHANGELOG.md | 1 + pkg/apis/deployment/v1/server_group_spec.go | 5 + pkg/deployment/images.go | 4 +- pkg/deployment/pod/probes.go | 12 +- .../resources/pod_creator_arangod.go | 27 ++-- .../resources/pod_creator_probes.go | 150 +++++++++++++++++- pkg/deployment/resources/pod_creator_sync.go | 19 ++- pkg/deployment/rotation/arangod_containers.go | 15 ++ pkg/util/k8sutil/interfaces/pod_creator.go | 2 +- pkg/util/k8sutil/pods.go | 3 +- pkg/util/k8sutil/probes/probes.go | 20 +-- 11 files changed, 228 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f97ac4a63..da85b4f98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Add Graceful shutdown as finalizer (supports kubectl delete) - Add Watch to Lifecycle command - Add Topology Discovery +- Add Support for StartupProbe ## [1.2.4](https://github.com/arangodb/kube-arangodb/tree/1.2.4) (2021-10-22) - Replace `beta.kubernetes.io/arch` Pod label with `kubernetes.io/arch` using Silent Rotation diff --git a/pkg/apis/deployment/v1/server_group_spec.go b/pkg/apis/deployment/v1/server_group_spec.go index fc3e18a72..d5ddd8bd4 100644 --- a/pkg/apis/deployment/v1/server_group_spec.go +++ b/pkg/apis/deployment/v1/server_group_spec.go @@ -261,6 +261,11 @@ type ServerGroupProbesSpec struct { ReadinessProbeDisabled *bool `json:"readinessProbeDisabled,omitempty"` // ReadinessProbeSpec override readiness probe configuration ReadinessProbeSpec *ServerGroupProbeSpec `json:"readinessProbeSpec,omitempty"` + + // StartupProbeDisabled if true startupProbes are disabled + StartupProbeDisabled *bool `json:"startupProbeDisabled,omitempty"` + // StartupProbeSpec override startup probe configuration + StartupProbeSpec *ServerGroupProbeSpec `json:"startupProbeSpec,omitempty"` } // 
GetReadinessProbeDisabled returns in proper manner readiness probe flag with backward compatibility. diff --git a/pkg/deployment/images.go b/pkg/deployment/images.go index 9b85a2b12..baf96094a 100644 --- a/pkg/deployment/images.go +++ b/pkg/deployment/images.go @@ -405,8 +405,8 @@ func (a *ContainerIdentity) GetPorts() []core.ContainerPort { } } -func (a *ContainerIdentity) GetProbes() (*core.Probe, *core.Probe, error) { - return nil, nil, nil +func (a *ContainerIdentity) GetProbes() (*core.Probe, *core.Probe, *core.Probe, error) { + return nil, nil, nil, nil } func (a *ContainerIdentity) GetResourceRequirements() core.ResourceRequirements { diff --git a/pkg/deployment/pod/probes.go b/pkg/deployment/pod/probes.go index d32d2c862..afbc9028c 100644 --- a/pkg/deployment/pod/probes.go +++ b/pkg/deployment/pod/probes.go @@ -39,37 +39,47 @@ func LivenessSpec(group api.ServerGroup) Probe { return probeMap[group].liveness } +func StartupSpec(group api.ServerGroup) Probe { + return probeMap[group].startup +} + type Probe struct { CanBeEnabled, EnabledByDefault bool } type probes struct { - liveness, readiness Probe + liveness, readiness, startup Probe } // probeMap defines default values and if Probe can be enabled var probeMap = map[api.ServerGroup]probes{ api.ServerGroupSingle: { + startup: newProbe(true, false), liveness: newProbe(true, true), readiness: newProbe(true, true), }, api.ServerGroupAgents: { + startup: newProbe(true, false), liveness: newProbe(true, true), readiness: newProbe(true, false), }, api.ServerGroupDBServers: { + startup: newProbe(true, false), liveness: newProbe(true, true), readiness: newProbe(true, false), }, api.ServerGroupCoordinators: { + startup: newProbe(true, false), liveness: newProbe(true, false), readiness: newProbe(true, true), }, api.ServerGroupSyncMasters: { + startup: newProbe(true, false), liveness: newProbe(true, true), readiness: newProbe(false, false), }, api.ServerGroupSyncWorkers: { + startup: newProbe(true, false), liveness: 
newProbe(true, true), readiness: newProbe(false, false), }, diff --git a/pkg/deployment/resources/pod_creator_arangod.go b/pkg/deployment/resources/pod_creator_arangod.go index fed294ff9..90aad9ffb 100644 --- a/pkg/deployment/resources/pod_creator_arangod.go +++ b/pkg/deployment/resources/pod_creator_arangod.go @@ -136,17 +136,22 @@ func (a *ArangoDContainer) GetSecurityContext() *core.SecurityContext { return a.groupSpec.SecurityContext.NewSecurityContext() } -func (a *ArangoDContainer) GetProbes() (*core.Probe, *core.Probe, error) { - var liveness, readiness *core.Probe +func (a *ArangoDContainer) GetProbes() (*core.Probe, *core.Probe, *core.Probe, error) { + var liveness, readiness, startup *core.Probe probeLivenessConfig, err := a.resources.getLivenessProbe(a.spec, a.group, a.imageInfo.ArangoDBVersion) if err != nil { - return nil, nil, err + return nil, nil, nil, err } probeReadinessConfig, err := a.resources.getReadinessProbe(a.spec, a.group, a.imageInfo.ArangoDBVersion) if err != nil { - return nil, nil, err + return nil, nil, nil, err + } + + probeStartupConfig, err := a.resources.getStartupProbe(a.spec, a.group, a.imageInfo.ArangoDBVersion) + if err != nil { + return nil, nil, nil, err } if probeLivenessConfig != nil { @@ -157,7 +162,11 @@ func (a *ArangoDContainer) GetProbes() (*core.Probe, *core.Probe, error) { readiness = probeReadinessConfig.Create() } - return liveness, readiness, nil + if probeStartupConfig != nil { + startup = probeStartupConfig.Create() + } + + return liveness, readiness, startup, nil } func (a *ArangoDContainer) GetImage() string { @@ -602,8 +611,8 @@ func (a *ArangoUpgradeContainer) GetName() string { } // GetProbes returns no probes for the ArangoD upgrade container. 
-func (a *ArangoUpgradeContainer) GetProbes() (*core.Probe, *core.Probe, error) { - return nil, nil, nil +func (a *ArangoUpgradeContainer) GetProbes() (*core.Probe, *core.Probe, *core.Probe, error) { + return nil, nil, nil, nil } // GetArgs returns list of arguments for the ArangoD version check container. @@ -622,6 +631,6 @@ func (a *ArangoVersionCheckContainer) GetName() string { } // GetProbes returns no probes for the ArangoD version check container. -func (a *ArangoVersionCheckContainer) GetProbes() (*core.Probe, *core.Probe, error) { - return nil, nil, nil +func (a *ArangoVersionCheckContainer) GetProbes() (*core.Probe, *core.Probe, *core.Probe, error) { + return nil, nil, nil, nil } diff --git a/pkg/deployment/resources/pod_creator_probes.go b/pkg/deployment/resources/pod_creator_probes.go index d713978a8..f6dcb5d24 100644 --- a/pkg/deployment/resources/pod_creator_probes.go +++ b/pkg/deployment/resources/pod_creator_probes.go @@ -49,7 +49,7 @@ type Probe interface { } type probeCheckBuilder struct { - liveness, readiness probeBuilder + liveness, readiness, startup probeBuilder } type probeBuilder func(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) @@ -118,6 +118,36 @@ func (r *Resources) getLivenessProbe(spec api.DeploymentSpec, group api.ServerGr return config, nil } +func (r *Resources) getStartupProbe(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) { + if !r.isStartupProbeEnabled(spec, group) { + return nil, nil + } + + builders := r.probeBuilders() + + builder, ok := builders[group] + if !ok { + return nil, nil + } + + config, err := builder.startup(spec, group, version) + if err != nil { + return nil, err + } + + groupSpec := spec.GetServerGroupSpec(group) + + if config == nil || !groupSpec.HasProbesSpec() { // builder may yield no probe; never call SetSpec on nil + return config, nil + } + + probeSpec := groupSpec.GetProbesSpec() + + config.SetSpec(probeSpec.StartupProbeSpec) + + return config, nil +} + func (r *Resources) 
isReadinessProbeEnabled(spec api.DeploymentSpec, group api.ServerGroup) bool { probe := pod.ReadinessSpec(group) @@ -146,29 +176,49 @@ func (r *Resources) isLivenessProbeEnabled(spec api.DeploymentSpec, group api.Se return probe.CanBeEnabled && probe.EnabledByDefault } +func (r *Resources) isStartupProbeEnabled(spec api.DeploymentSpec, group api.ServerGroup) bool { + probe := pod.StartupSpec(group) + + groupSpec := spec.GetServerGroupSpec(group) + + if groupSpec.HasProbesSpec() { + if p := groupSpec.GetProbesSpec().StartupProbeDisabled; p != nil { + return !*p && probe.CanBeEnabled + } + } + + return probe.CanBeEnabled && probe.EnabledByDefault +} + func (r *Resources) probeBuilders() map[api.ServerGroup]probeCheckBuilder { return map[api.ServerGroup]probeCheckBuilder{ api.ServerGroupSingle: { + startup: r.probeBuilderStartupCoreSelect(), liveness: r.probeBuilderLivenessCoreSelect(), readiness: r.probeBuilderReadinessCoreSelect(), }, api.ServerGroupAgents: { + startup: r.probeBuilderStartupCoreSelect(), liveness: r.probeBuilderLivenessCoreSelect(), readiness: r.probeBuilderReadinessSimpleCoreSelect(), }, api.ServerGroupDBServers: { + startup: r.probeBuilderStartupCoreSelect(), liveness: r.probeBuilderLivenessCoreSelect(), readiness: r.probeBuilderReadinessSimpleCoreSelect(), }, api.ServerGroupCoordinators: { + startup: r.probeBuilderStartupCoreSelect(), liveness: r.probeBuilderLivenessCoreSelect(), readiness: r.probeBuilderReadinessCoreSelect(), }, api.ServerGroupSyncMasters: { + startup: r.probeBuilderStartupSync, liveness: r.probeBuilderLivenessSync, readiness: nilProbeBuilder, }, api.ServerGroupSyncWorkers: { + startup: r.probeBuilderStartupSync, liveness: r.probeBuilderLivenessSync, readiness: nilProbeBuilder, }, @@ -207,6 +257,14 @@ func (r *Resources) probeBuilderLivenessCoreSelect() probeBuilder { return r.probeBuilderLivenessCore } +func (r *Resources) probeBuilderStartupCoreSelect() probeBuilder { + if features.JWTRotation().Enabled() { + return 
r.probeBuilderStartupCoreOperator + } + + return r.probeBuilderStartupCore +} + func (r *Resources) probeBuilderLivenessCoreOperator(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) { args, err := r.probeCommand(spec, "/_api/version") if err != nil { @@ -218,6 +276,29 @@ func (r *Resources) probeBuilderLivenessCoreOperator(spec api.DeploymentSpec, gr }, nil } +func (r *Resources) probeBuilderStartupCoreOperator(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) { + args, err := r.probeCommand(spec, "/_api/version") + if err != nil { + return nil, err + } + + var retries int32 + + switch group { + case api.ServerGroupDBServers: + retries = 6 * 60 * 60 / 5 // Wait 6 hours for wal replay + default: + retries = 60 + } + + return &probes.CMDProbeConfig{ + Command: args, + FailureThreshold: retries, + PeriodSeconds: 5, + InitialDelaySeconds: 1, + }, nil +} + func (r *Resources) probeBuilderLivenessCore(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) { authorization := "" if spec.IsAuthenticated() { @@ -237,6 +318,38 @@ func (r *Resources) probeBuilderLivenessCore(spec api.DeploymentSpec, group api. 
}, nil } +func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) { + + var retries int32 + + switch group { + case api.ServerGroupDBServers: + retries = 6 * 60 * 60 / 5 // Wait 6 hours for wal replay + default: + retries = 60 + } + + authorization := "" + if spec.IsAuthenticated() { + secretData, err := r.getJWTSecret(spec) + if err != nil { + return nil, errors.WithStack(err) + } + authorization, err = jwt.CreateArangodJwtAuthorizationHeaderAllowedPaths(secretData, "kube-arangodb", []string{"/_api/version"}) + if err != nil { + return nil, errors.WithStack(err) + } + } + return &probes.HTTPProbeConfig{ + LocalPath: "/_api/version", + Secure: spec.IsSecure(), + Authorization: authorization, + FailureThreshold: retries, + PeriodSeconds: 5, + InitialDelaySeconds: 1, + }, nil +} + func (r *Resources) probeBuilderReadinessSimpleCoreSelect() probeBuilder { if features.JWTRotation().Enabled() { return r.probeBuilderReadinessSimpleCoreOperator @@ -363,3 +476,38 @@ func (r *Resources) probeBuilderLivenessSync(spec api.DeploymentSpec, group api. 
Port: port, }, nil } + +func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) { + authorization := "" + port := k8sutil.ArangoSyncMasterPort + if group == api.ServerGroupSyncWorkers { + port = k8sutil.ArangoSyncWorkerPort + } + if spec.Sync.Monitoring.GetTokenSecretName() != "" { + // Use monitoring token + token, err := r.getSyncMonitoringToken(spec) + if err != nil { + return nil, errors.WithStack(err) + } + authorization = "bearer " + token + } else if group == api.ServerGroupSyncMasters { + // Fall back to JWT secret + secretData, err := r.getSyncJWTSecret(spec) + if err != nil { + return nil, errors.WithStack(err) + } + authorization, err = jwt.CreateArangodJwtAuthorizationHeaderAllowedPaths(secretData, "kube-arangodb", []string{"/_api/version"}) + if err != nil { + return nil, errors.WithStack(err) + } + } else { + // Don't have a probe + return nil, nil + } + return &probes.HTTPProbeConfig{ + LocalPath: "/_api/version", + Secure: spec.Sync.TLS.IsSecure(), + Authorization: authorization, + Port: port, + }, nil +} diff --git a/pkg/deployment/resources/pod_creator_sync.go b/pkg/deployment/resources/pod_creator_sync.go index 1a466441a..e6ef2ae1e 100644 --- a/pkg/deployment/resources/pod_creator_sync.go +++ b/pkg/deployment/resources/pod_creator_sync.go @@ -105,17 +105,22 @@ func (a *ArangoSyncContainer) GetSecurityContext() *core.SecurityContext { return a.groupSpec.SecurityContext.NewSecurityContext() } -func (a *ArangoSyncContainer) GetProbes() (*core.Probe, *core.Probe, error) { - var liveness, readiness *core.Probe +func (a *ArangoSyncContainer) GetProbes() (*core.Probe, *core.Probe, *core.Probe, error) { + var liveness, readiness, startup *core.Probe probeLivenessConfig, err := a.resources.getLivenessProbe(a.spec, a.group, a.imageInfo.ArangoDBVersion) if err != nil { - return nil, nil, err + return nil, nil, nil, err } probeReadinessConfig, err := 
a.resources.getReadinessProbe(a.spec, a.group, a.imageInfo.ArangoDBVersion) if err != nil { - return nil, nil, err + return nil, nil, nil, err + } + + probeStartupConfig, err := a.resources.getStartupProbe(a.spec, a.group, a.imageInfo.ArangoDBVersion) + if err != nil { + return nil, nil, nil, err } if probeLivenessConfig != nil { @@ -126,7 +131,11 @@ func (a *ArangoSyncContainer) GetProbes() (*core.Probe, *core.Probe, error) { readiness = probeReadinessConfig.Create() } - return liveness, readiness, nil + if probeStartupConfig != nil { + startup = probeStartupConfig.Create() + } + + return liveness, readiness, startup, nil } func (a *ArangoSyncContainer) GetResourceRequirements() core.ResourceRequirements { diff --git a/pkg/deployment/rotation/arangod_containers.go b/pkg/deployment/rotation/arangod_containers.go index 396735b91..1991a2341 100644 --- a/pkg/deployment/rotation/arangod_containers.go +++ b/pkg/deployment/rotation/arangod_containers.go @@ -72,6 +72,11 @@ func containersCompare(_ api.DeploymentSpec, _ api.ServerGroup, spec, status *co mode = mode.And(SilentRotation) } } + + if !areProbesEqual(ac.StartupProbe, bc.StartupProbe) { + bc.StartupProbe = ac.StartupProbe + mode = mode.And(SilentRotation) + } } else { if ac.Image != bc.Image { // Image changed @@ -196,3 +201,13 @@ func getEnvs(e []core.EnvVar) map[string]core.EnvVar { return m } + +func areProbesEqual(a, b *core.Probe) bool { + if a == nil && b == nil { + return true + } + if a == nil || b == nil { + return false + } + return equality.Semantic.DeepEqual(a, b) +} diff --git a/pkg/util/k8sutil/interfaces/pod_creator.go b/pkg/util/k8sutil/interfaces/pod_creator.go index a3486bdf5..be91c7e6c 100644 --- a/pkg/util/k8sutil/interfaces/pod_creator.go +++ b/pkg/util/k8sutil/interfaces/pod_creator.go @@ -68,7 +68,7 @@ type ContainerCreator interface { GetArgs() ([]string, error) GetName() string GetExecutor() string - GetProbes() (*core.Probe, *core.Probe, error) + GetProbes() (*core.Probe, *core.Probe, 
*core.Probe, error) GetResourceRequirements() core.ResourceRequirements GetLifecycle() (*core.Lifecycle, error) GetImagePullPolicy() core.PullPolicy diff --git a/pkg/util/k8sutil/pods.go b/pkg/util/k8sutil/pods.go index 9aefe9b50..ffad91243 100644 --- a/pkg/util/k8sutil/pods.go +++ b/pkg/util/k8sutil/pods.go @@ -415,7 +415,7 @@ func ExtractPodResourceRequirement(resources core.ResourceRequirements) core.Res // NewContainer creates a container for specified creator func NewContainer(containerCreator interfaces.ContainerCreator) (core.Container, error) { - liveness, readiness, err := containerCreator.GetProbes() + liveness, readiness, startup, err := containerCreator.GetProbes() if err != nil { return core.Container{}, err } @@ -439,6 +439,7 @@ func NewContainer(containerCreator interfaces.ContainerCreator) (core.Container, Resources: containerCreator.GetResourceRequirements(), LivenessProbe: liveness, ReadinessProbe: readiness, + StartupProbe: startup, Lifecycle: lifecycle, ImagePullPolicy: containerCreator.GetImagePullPolicy(), SecurityContext: containerCreator.GetSecurityContext(), diff --git a/pkg/util/k8sutil/probes/probes.go b/pkg/util/k8sutil/probes/probes.go index 643ad3176..089e17395 100644 --- a/pkg/util/k8sutil/probes/probes.go +++ b/pkg/util/k8sutil/probes/probes.go @@ -87,11 +87,11 @@ func (config HTTPProbeConfig) Create() *core.Probe { HTTPHeaders: headers, }, }, - InitialDelaySeconds: defaultInt32(config.InitialDelaySeconds, 15*60), // Wait 15min before first probe - TimeoutSeconds: defaultInt32(config.TimeoutSeconds, 2), // Timeout of each probe is 2s - PeriodSeconds: defaultInt32(config.PeriodSeconds, 60), // Interval between probes is 10s - SuccessThreshold: defaultInt32(config.SuccessThreshold, 1), // Single probe is enough to indicate success - FailureThreshold: defaultInt32(config.FailureThreshold, 10), // Need 10 failed probes to consider a failed state + InitialDelaySeconds: defaultInt32(config.InitialDelaySeconds, 900), // Wait 15min before 
first probe + TimeoutSeconds: defaultInt32(config.TimeoutSeconds, 2), // Timeout of each probe is 2s + PeriodSeconds: defaultInt32(config.PeriodSeconds, 60), // Interval between probes is 60s + SuccessThreshold: defaultInt32(config.SuccessThreshold, 1), // Single probe is enough to indicate success + FailureThreshold: defaultInt32(config.FailureThreshold, 10), // Need 10 failed probes to consider a failed state } } @@ -126,11 +126,11 @@ func (config CMDProbeConfig) Create() *core.Probe { Command: config.Command, }, }, - InitialDelaySeconds: defaultInt32(config.InitialDelaySeconds, 15*60), // Wait 15min before first probe - TimeoutSeconds: defaultInt32(config.TimeoutSeconds, 2), // Timeout of each probe is 2s - PeriodSeconds: defaultInt32(config.PeriodSeconds, 60), // Interval between probes is 10s - SuccessThreshold: defaultInt32(config.SuccessThreshold, 1), // Single probe is enough to indicate success - FailureThreshold: defaultInt32(config.FailureThreshold, 10), // Need 10 failed probes to consider a failed state + InitialDelaySeconds: defaultInt32(config.InitialDelaySeconds, 900), // Wait 15min before first probe + TimeoutSeconds: defaultInt32(config.TimeoutSeconds, 2), // Timeout of each probe is 2s + PeriodSeconds: defaultInt32(config.PeriodSeconds, 60), // Interval between probes is 60s + SuccessThreshold: defaultInt32(config.SuccessThreshold, 1), // Single probe is enough to indicate success + FailureThreshold: defaultInt32(config.FailureThreshold, 10), // Need 10 failed probes to consider a failed state } }