From a08eed74b37efb7509e39578930a6dc2c7e7b9da Mon Sep 17 00:00:00 2001
From: jwierzbo
Date: Tue, 17 May 2022 16:15:45 +0200
Subject: [PATCH 1/3] GT-26 Add RestartPolicyAlways to ArangoDeployment in order to restart ArangoDB on failure

---
 CHANGELOG.md                                 |  1 +
 README.md                                    |  1 +
 .../features/restart_policy_always.go        | 37 +++++++++++++++++
 pkg/deployment/images.go                     |  4 ++
 pkg/deployment/resources/pod_cleanup.go      |  5 +++
 .../resources/pod_creator_arangod.go         |  7 ++++
 pkg/deployment/resources/pod_creator_sync.go | 30 +++++++-------
 pkg/deployment/resources/pod_inspector.go    | 40 +++++++++++++++----
 pkg/util/k8sutil/events.go                   |  2 +-
 pkg/util/k8sutil/interfaces/pod_creator.go   |  1 +
 pkg/util/k8sutil/pods.go                     |  2 +-
 11 files changed, 107 insertions(+), 23 deletions(-)
 create mode 100644 pkg/deployment/features/restart_policy_always.go

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f6747a5ab..6b989ab5e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 - (Feature) (ACS) Add Resource plan
 - (Feature) Allow raw json value for license token-v2
 - (Update) Replace `beta.kubernetes.io/arch` to `kubernetes.io/arch` in Operator Chart
+- (Feature) Add RestartPolicyAlways to ArangoDeployment in order to restart ArangoDB on failure
 
 ## [1.2.12](https://github.com/arangodb/kube-arangodb/tree/1.2.12) (2022-05-10)
 - (Feature) Add CoreV1 Endpoints Inspector
diff --git a/README.md b/README.md
index 7db01aa0b..eb7091ecf 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,7 @@ Feature-wise production readiness table:
 | Operator Internal Metrics Exporter | 1.2.0 | >= 3.7.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A |
 | Operator Internal Metrics Exporter | 1.2.3 | >= 3.7.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | It is always enabled |
 | Operator Ephemeral Volumes | 1.2.2 | >= 3.7.0 | Community, Enterprise | Alpha | False | --deployment.feature.ephemeral-volumes | N/A |
+| Pod RestartPolicyAlways | 1.2.13 | >= 3.7.0 | Community, Enterprise | Alpha | False | --deployment.feature.restart-policy-always | N/A |
 
 ## Release notes for 0.3.16
diff --git a/pkg/deployment/features/restart_policy_always.go b/pkg/deployment/features/restart_policy_always.go
new file mode 100644
index 000000000..48a6b397c
--- /dev/null
+++ b/pkg/deployment/features/restart_policy_always.go
@@ -0,0 +1,37 @@
+//
+// DISCLAIMER
+//
+// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright holder is ArangoDB GmbH, Cologne, Germany
+//
+
+package features
+
+func init() {
+	registerFeature(restartPolicyAlways)
+}
+
+var restartPolicyAlways = &feature{
+	name:               "restart-policy-always",
+	description:        "Allow to restart containers with always restart policy",
+	version:            "3.6.0",
+	enterpriseRequired: false,
+	enabledByDefault:   true,
+}
+
+func RestartPolicyAlways() Feature {
+	return restartPolicyAlways
+}
diff --git a/pkg/deployment/images.go b/pkg/deployment/images.go
index 5d6a1da1c..ee8cd5b08 100644
--- a/pkg/deployment/images.go
+++ b/pkg/deployment/images.go
@@ -279,6 +279,10 @@ func (i *ImageUpdatePod) GetContainerCreator() interfaces.ContainerCreator {
 	return i.containerCreator
 }
 
+func (i *ImageUpdatePod) GetRestartPolicy() core.RestartPolicy {
+	return core.RestartPolicyNever
+}
+
 func (i *ImageUpdatePod) GetAffinityRole() string {
 	return ""
 }
diff --git a/pkg/deployment/resources/pod_cleanup.go b/pkg/deployment/resources/pod_cleanup.go
index b057415b0..4e6c15bc7 100644
--- a/pkg/deployment/resources/pod_cleanup.go
+++ b/pkg/deployment/resources/pod_cleanup.go
@@ -69,6 +69,11 @@ func (r *Resources) CleanupTerminatedPods(ctx context.Context) (util.Interval, e
 			return nil
 		}
 
+		if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !k8sutil.IsPodMarkedForDeletion(pod) {
+			// The pod restart (failure case) will be handled by the kubelet.
+			return nil
+		}
+
 		// Check member termination condition
 		if !memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
 			if !group.IsStateless() {
diff --git a/pkg/deployment/resources/pod_creator_arangod.go b/pkg/deployment/resources/pod_creator_arangod.go
index 424baf521..a6f7a9f98 100644
--- a/pkg/deployment/resources/pod_creator_arangod.go
+++ b/pkg/deployment/resources/pod_creator_arangod.go
@@ -493,6 +493,13 @@ func (m *MemberArangoDPod) GetContainerCreator() interfaces.ContainerCreator {
 	}
 }
 
+func (m *MemberArangoDPod) GetRestartPolicy() core.RestartPolicy {
+	if features.RestartPolicyAlways().Enabled() {
+		return core.RestartPolicyAlways
+	}
+	return core.RestartPolicyNever
+}
+
 func (m *MemberArangoDPod) createMetricsExporterSidecarInternalExporter() (*core.Container, error) {
 	image := m.GetContainerCreator().GetImage()
 
diff --git a/pkg/deployment/resources/pod_creator_sync.go b/pkg/deployment/resources/pod_creator_sync.go
index bba2afa11..131b55f32 100644
--- a/pkg/deployment/resources/pod_creator_sync.go
+++ b/pkg/deployment/resources/pod_creator_sync.go
@@ -24,24 +24,19 @@ import (
 	"context"
 	"math"
 
-	"github.com/arangodb/kube-arangodb/pkg/util/globals"
-
-	"github.com/arangodb/kube-arangodb/pkg/util/errors"
-
-	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
-
-	"github.com/arangodb/kube-arangodb/pkg/util/collection"
-
-	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/interfaces"
-
-	"github.com/arangodb/kube-arangodb/pkg/deployment/pod"
-
-	"github.com/arangodb/kube-arangodb/pkg/util/constants"
-
 	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
 	"github.com/arangodb/kube-arangodb/pkg/apis/shared"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/features"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/pod"
+	"github.com/arangodb/kube-arangodb/pkg/util/collection"
+	"github.com/arangodb/kube-arangodb/pkg/util/constants"
+	"github.com/arangodb/kube-arangodb/pkg/util/errors"
+	"github.com/arangodb/kube-arangodb/pkg/util/globals"
 	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
+	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/interfaces"
+
+	core "k8s.io/api/core/v1"
+	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
 const (
@@ -312,6 +307,13 @@ func (m *MemberSyncPod) GetContainerCreator() interfaces.ContainerCreator {
 	}
 }
 
+func (m *MemberSyncPod) GetRestartPolicy() core.RestartPolicy {
+	if features.RestartPolicyAlways().Enabled() {
+		return core.RestartPolicyAlways
+	}
+	return core.RestartPolicyNever
+}
+
 // Init initializes the arangosync pod.
 func (m *MemberSyncPod) Init(ctx context.Context, cachedStatus interfaces.Inspector, pod *core.Pod) error {
 	terminationGracePeriodSeconds := int64(math.Ceil(m.groupSpec.GetTerminationGracePeriod(m.group).Seconds()))
diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go
index 73e8fb2fe..b6294b4f2 100644
--- a/pkg/deployment/resources/pod_inspector.go
+++ b/pkg/deployment/resources/pod_inspector.go
@@ -23,22 +23,20 @@ package resources
 import (
 	"context"
 	"fmt"
+	"strings"
 	"time"
 
-	"github.com/arangodb/kube-arangodb/pkg/deployment/patch"
-
-	"github.com/arangodb/kube-arangodb/pkg/util/errors"
-	inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
-
+	core "k8s.io/api/core/v1"
 	v1 "k8s.io/api/core/v1"
 	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"strings"
-
 	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/patch"
 	"github.com/arangodb/kube-arangodb/pkg/metrics"
 	"github.com/arangodb/kube-arangodb/pkg/util"
+	"github.com/arangodb/kube-arangodb/pkg/util/errors"
 	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
+	inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
 	podv1 "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector/pod/v1"
 )
 
@@ -106,6 +104,18 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 				log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Terminated to true: Pod Succeeded")
 				updateMemberStatusNeeded = true
 				nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
+
+				if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
+					containerStatus, exist := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
+					if exist && containerStatus.LastTerminationState.Terminated != nil {
+						lastTermination := containerStatus.LastTerminationState.Terminated.FinishedAt
+						if memberStatus.RecentTerminationsSince(lastTermination.Time) == 0 {
+							memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, lastTermination)
+							wasTerminated = true
+						}
+					}
+				}
+
 				if !wasTerminated {
 					// Record termination time
 					now := meta.Now()
@@ -171,6 +181,18 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 				log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Terminated to true: Pod Failed")
 				updateMemberStatusNeeded = true
 				nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
+
+				if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
+					containerStatus, exist := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
+					if exist && containerStatus.LastTerminationState.Terminated != nil {
+						lastTermination := containerStatus.LastTerminationState.Terminated.FinishedAt
+						if memberStatus.RecentTerminationsSince(lastTermination.Time) == 0 {
+							memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, lastTermination)
+							wasTerminated = true
+						}
+					}
+				}
+
 				if !wasTerminated {
 					// Record termination time
 					now := meta.Now()
@@ -237,6 +259,10 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 					memberStatus.Conditions.Update(api.ConditionTypeServing, true, "Pod Serving", "")) {
 					log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Ready, Started & Serving to true")
 
+					if pod.Spec.RestartPolicy == core.RestartPolicyAlways && memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
+						memberStatus.Conditions.Update(api.ConditionTypeTerminated, false, "Pod Successfully Restarted", "")
+					}
+
 					if status.Topology.IsTopologyOwned(memberStatus.Topology) {
 						nodes, err := cachedStatus.Node().V1()
 						if err == nil {
diff --git a/pkg/util/k8sutil/events.go b/pkg/util/k8sutil/events.go
index c034544a2..26c6febbb 100644
--- a/pkg/util/k8sutil/events.go
+++ b/pkg/util/k8sutil/events.go
@@ -151,7 +151,7 @@ func NewAccessPackageDeletedEvent(apiObject APIObject, apSecretName string) *Eve
 	return event
 }
 
-// NewPlanTimeoutEvent creates an event indicating that an item on a reconciliation plan has been added
+// NewPlanAppendEvent creates an event indicating that an item on a reconciliation plan has been added
 func NewPlanAppendEvent(apiObject APIObject, itemType, memberID, role, reason string) *Event {
 	event := newDeploymentEvent(apiObject)
 	event.Type = v1.EventTypeNormal
diff --git a/pkg/util/k8sutil/interfaces/pod_creator.go b/pkg/util/k8sutil/interfaces/pod_creator.go
index d9b6c3543..520d09af8 100644
--- a/pkg/util/k8sutil/interfaces/pod_creator.go
+++ b/pkg/util/k8sutil/interfaces/pod_creator.go
@@ -51,6 +51,7 @@ type PodCreator interface {
 	GetPodAntiAffinity() *core.PodAntiAffinity
 	GetPodAffinity() *core.PodAffinity
 	GetNodeAffinity() *core.NodeAffinity
+	GetRestartPolicy() core.RestartPolicy
 	GetContainerCreator() ContainerCreator
 	GetImagePullSecrets() []string
 	IsDeploymentMode() bool
diff --git a/pkg/util/k8sutil/pods.go b/pkg/util/k8sutil/pods.go
index e0bf8bba2..f8a6fa42d 100644
--- a/pkg/util/k8sutil/pods.go
+++ b/pkg/util/k8sutil/pods.go
@@ -526,7 +526,7 @@ func NewPod(deploymentName, role, id, podName string, podCreator interfaces.PodC
 		Spec: core.PodSpec{
 			Hostname:           hostname,
 			Subdomain:          CreateHeadlessServiceName(deploymentName),
-			RestartPolicy:      core.RestartPolicyNever,
+			RestartPolicy:      podCreator.GetRestartPolicy(),
 			Tolerations:        podCreator.GetTolerations(),
 			ServiceAccountName: podCreator.GetServiceAccountName(),
 			NodeSelector:       podCreator.GetNodeSelector(),

From 1bae1a2d5b4423ce4997ce13e8aaef1b70339c47 Mon Sep 17 00:00:00 2001
From: jwierzbo
Date: Tue, 24 May 2022 23:42:19 +0200
Subject: [PATCH 2/3] handle sidecar restart

---
 .../action_runtime_container_image_update.go | 16 +++-
 pkg/deployment/resources/pod_cleanup.go      |  9 ++-
 pkg/deployment/resources/pod_inspector.go    | 73 +++++++++++++------
 pkg/deployment/rotation/check.go             |  2 +
 4 files changed, 71 insertions(+), 29 deletions(-)

diff --git a/pkg/deployment/reconcile/action_runtime_container_image_update.go b/pkg/deployment/reconcile/action_runtime_container_image_update.go
index e5abc018f..7475a3bb2 100644
--- a/pkg/deployment/reconcile/action_runtime_container_image_update.go
+++ b/pkg/deployment/reconcile/action_runtime_container_image_update.go
@@ -22,6 +22,8 @@ package reconcile
 
 import (
 	"context"
+	core "k8s.io/api/core/v1"
+	"time"
 
 	"github.com/arangodb/kube-arangodb/pkg/deployment/rotation"
 	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
@@ -257,9 +259,21 @@ func (a actionRuntimeContainerImageUpdate) CheckProgress(ctx context.Context) (b
 			return false, false, nil
 		}
 
-		// Pod wont get up and running
+		// Pod won't get up and running
 		return true, false, errors.Newf("Container %s failed during image replacement: (%d) %s: %s", name, s.ExitCode, s.Reason, s.Message)
 	} else if s := cstatus.State.Waiting; s != nil {
+		if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
+			lastTermination := cstatus.LastTerminationState.Terminated
+			if lastTermination != nil {
+				allowedRestartPeriod := time.Now().Add(time.Second * -20)
+				if lastTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
+					return true, false, errors.Newf("Container %s continuously failed during image replacement: (%d) %s: %s", name, lastTermination.ExitCode, lastTermination.Reason, lastTermination.Message)
+				} else {
+					a.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
+				}
+			}
+		}
+
 		// Pod is still pulling image or pending for pod start
 		return false, false, nil
 	} else if s := cstatus.State.Running; s != nil {
diff --git a/pkg/deployment/resources/pod_cleanup.go b/pkg/deployment/resources/pod_cleanup.go
index 4e6c15bc7..abb01f093 100644
--- a/pkg/deployment/resources/pod_cleanup.go
+++ b/pkg/deployment/resources/pod_cleanup.go
@@ -69,10 +69,11 @@ func (r *Resources) CleanupTerminatedPods(ctx context.Context) (util.Interval, e
 			return nil
 		}
 
-		if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !k8sutil.IsPodMarkedForDeletion(pod) {
-			// The pod restart (failure case) will be handled by the kubelet.
-			return nil
-		}
+		// todo: we don't need it
+		/* if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !k8sutil.IsPodMarkedForDeletion(pod) && !memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
+			// The pod restart (failure case) will be handled by the kubelet.
+			return nil
+		}*/
 
 		// Check member termination condition
 		if !memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go
index b6294b4f2..298184267 100644
--- a/pkg/deployment/resources/pod_inspector.go
+++ b/pkg/deployment/resources/pod_inspector.go
@@ -100,21 +100,33 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 		if k8sutil.IsPodSucceeded(pod, coreContainers) {
 			// Pod has terminated with exit code 0.
 			wasTerminated := memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated)
-			if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
-				log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Terminated to true: Pod Succeeded")
-				updateMemberStatusNeeded = true
-				nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
+			markAsTerminated := true
+
+			if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
+				containerStatus, exist := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
+				if exist && containerStatus.State.Terminated != nil {
+					termination := containerStatus.State.Terminated.FinishedAt
+					if memberStatus.RecentTerminationsSince(termination.Time) == 0 {
+						memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, termination)
+						wasTerminated = true
+						markAsTerminated = false
+					}
 
-			if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
-				containerStatus, exist := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
-				if exist && containerStatus.LastTerminationState.Terminated != nil {
-					lastTermination := containerStatus.LastTerminationState.Terminated.FinishedAt
-					if memberStatus.RecentTerminationsSince(lastTermination.Time) == 0 {
-						memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, lastTermination)
-						wasTerminated = true
-					}
+					lastTermination := containerStatus.LastTerminationState.Terminated
+					allowedRestartPeriod := time.Now().Add(time.Second * -10)
+					if lastTermination != nil && lastTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
+						log.Debug().Str("pod-name", pod.GetName()).Msg("pod is continuously restarting - we will terminate it")
+						markAsTerminated = true
+					} else {
+						log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
 					}
 				}
+			}
+
+			if markAsTerminated && memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
+				log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Terminated to true: Pod Succeeded")
+				updateMemberStatusNeeded = true
+				nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
 
 				if !wasTerminated {
 					// Record termination time
@@ -125,7 +137,31 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 		} else if k8sutil.IsPodFailed(pod, coreContainers) {
 			// Pod has terminated with at least 1 container with a non-zero exit code.
 			wasTerminated := memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated)
-			if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
+			markAsTerminated := true
+
+			if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
+				containerStatus, exist := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
+				if exist && containerStatus.State.Terminated != nil {
+					termination := containerStatus.State.Terminated.FinishedAt
+					if memberStatus.RecentTerminationsSince(termination.Time) == 0 {
+						memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, termination)
+						wasTerminated = true
+						markAsTerminated = false
+					}
+
+					lastTermination := containerStatus.LastTerminationState.Terminated
+					allowedRestartPeriod := time.Now().Add(time.Second * -10)
+					if lastTermination != nil && lastTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
+						log.Debug().Str("pod-name", pod.GetName()).Msg("pod is continuously restarting - we will terminate it")
+						markAsTerminated = true
+					} else {
+						log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
+					}
+
+				}
+			}
+
+			if markAsTerminated && memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
 				if containers := k8sutil.GetFailedContainerNames(pod.Status.InitContainerStatuses); len(containers) > 0 {
 					for _, container := range containers {
 						switch container {
@@ -182,17 +218,6 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 				updateMemberStatusNeeded = true
 				nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
 
-				if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
-					containerStatus, exist := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
-					if exist && containerStatus.LastTerminationState.Terminated != nil {
-						lastTermination := containerStatus.LastTerminationState.Terminated.FinishedAt
-						if memberStatus.RecentTerminationsSince(lastTermination.Time) == 0 {
-							memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, lastTermination)
-							wasTerminated = true
-						}
-					}
-				}
-
 				if !wasTerminated {
 					// Record termination time
 					now := meta.Now()
diff --git a/pkg/deployment/rotation/check.go b/pkg/deployment/rotation/check.go
index 35742e042..015f89bd2 100644
--- a/pkg/deployment/rotation/check.go
+++ b/pkg/deployment/rotation/check.go
@@ -126,6 +126,8 @@ func IsRotationRequired(log zerolog.Logger, acs sutil.ACS, spec api.DeploymentSp
 
 	if mode, plan, err := compare(log, spec, member, group, specTemplate, statusTemplate); err != nil {
 		return SkippedRotation, nil, "", err
+	} else if mode == SkippedRotation {
+		return mode, plan, "No rotation needed", nil
 	} else {
 		return mode, plan, "Pod needs rotation", nil
 	}

From 561b5235ce9dcb7e5884d1d3dc95bd7889935f10 Mon Sep 17 00:00:00 2001
From: jwierzbo
Date: Wed, 25 May 2022 22:39:11 +0200
Subject: [PATCH 3/3] handle server restart

---
 .../features/restart_policy_always.go        |  2 +-
 .../action_runtime_container_image_update.go | 10 +--
 pkg/deployment/resilience/member_failure.go  |  2 +
 pkg/deployment/resources/pod_cleanup.go      |  6 --
 pkg/deployment/resources/pod_inspector.go    | 74 ++++++++-----------
 5 files changed, 38 insertions(+), 56 deletions(-)

diff --git a/pkg/deployment/features/restart_policy_always.go b/pkg/deployment/features/restart_policy_always.go
index 48a6b397c..dfe09ccfa 100644
--- a/pkg/deployment/features/restart_policy_always.go
+++ b/pkg/deployment/features/restart_policy_always.go
@@ -29,7 +29,7 @@ var restartPolicyAlways = &feature{
 	description:        "Allow to restart containers with always restart policy",
 	version:            "3.6.0",
 	enterpriseRequired: false,
-	enabledByDefault:   true,
+	enabledByDefault:   false,
 }
 
 func RestartPolicyAlways() Feature {
diff --git a/pkg/deployment/reconcile/action_runtime_container_image_update.go b/pkg/deployment/reconcile/action_runtime_container_image_update.go
index 7475a3bb2..24f0010a6 100644
--- a/pkg/deployment/reconcile/action_runtime_container_image_update.go
+++ b/pkg/deployment/reconcile/action_runtime_container_image_update.go
@@ -22,16 +22,16 @@ package reconcile
 
 import (
 	"context"
-	core "k8s.io/api/core/v1"
 	"time"
 
+	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
 	"github.com/arangodb/kube-arangodb/pkg/deployment/rotation"
+	"github.com/arangodb/kube-arangodb/pkg/util/errors"
 	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
+
 	"github.com/rs/zerolog"
+	core "k8s.io/api/core/v1"
 	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
-	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
-	"github.com/arangodb/kube-arangodb/pkg/util/errors"
 )
 
 func init() {
@@ -267,7 +267,7 @@ func (a actionRuntimeContainerImageUpdate) CheckProgress(ctx context.Context) (b
 			if lastTermination != nil {
 				allowedRestartPeriod := time.Now().Add(time.Second * -20)
 				if lastTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
-					return true, false, errors.Newf("Container %s continuously failed during image replacement: (%d) %s: %s", name, lastTermination.ExitCode, lastTermination.Reason, lastTermination.Message)
+					return true, false, errors.Newf("Container %s continuously failing during image replacement: (%d) %s: %s", name, lastTermination.ExitCode, lastTermination.Reason, lastTermination.Message)
 				} else {
 					a.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
 				}
diff --git a/pkg/deployment/resilience/member_failure.go b/pkg/deployment/resilience/member_failure.go
index d39be701d..66abe4dbe 100644
--- a/pkg/deployment/resilience/member_failure.go
+++ b/pkg/deployment/resilience/member_failure.go
@@ -158,6 +158,8 @@ func (r *Resilience) isMemberFailureAcceptable(ctx context.Context, group api.Se
 	case api.ServerGroupSyncMasters, api.ServerGroupSyncWorkers:
 		// Sync masters & workers can be replaced at will
 		return true, "", nil
+	case api.ServerGroupSingle:
+		return false, "ServerGroupSingle can not be marked as failed", nil
 	default:
 		// TODO
 		return false, "TODO", nil
diff --git a/pkg/deployment/resources/pod_cleanup.go b/pkg/deployment/resources/pod_cleanup.go
index abb01f093..b057415b0 100644
--- a/pkg/deployment/resources/pod_cleanup.go
+++ b/pkg/deployment/resources/pod_cleanup.go
@@ -69,12 +69,6 @@ func (r *Resources) CleanupTerminatedPods(ctx context.Context) (util.Interval, e
 			return nil
 		}
 
-		// todo: we don't need it
-		/* if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !k8sutil.IsPodMarkedForDeletion(pod) && !memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
-			// The pod restart (failure case) will be handled by the kubelet.
-			return nil
-		}*/
-
 		// Check member termination condition
 		if !memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
 			if !group.IsStateless() {
diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go
index 298184267..93aad2c2a 100644
--- a/pkg/deployment/resources/pod_inspector.go
+++ b/pkg/deployment/resources/pod_inspector.go
@@ -46,11 +46,36 @@ var (
 )
 
 const (
-	podScheduleTimeout              = time.Minute                 // How long we allow the schedule to take scheduling a pod.
+	podScheduleTimeout       = time.Minute       // How long we allow the schedule to take scheduling a pod.
+	terminationRestartPeriod = time.Second * -30 // If previous pod termination happened less than this time ago,
+	// we will mark the pod as scheduled for termination
 	recheckSoonPodInspectorInterval = util.Interval(time.Second) // Time between Pod inspection if we think something will change soon
 	maxPodInspectorInterval         = util.Interval(time.Hour)   // Maximum time between Pod inspection (if nothing else happens)
 )
 
+func (r *Resources) handleRestartedPod(pod *core.Pod, memberStatus *api.MemberStatus, wasTerminated, markAsTerminated *bool) {
+	containerStatus, exist := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
+	if exist && containerStatus.State.Terminated != nil {
+		// do not record termination time again in the code below
+		*wasTerminated = true
+
+		termination := containerStatus.State.Terminated.FinishedAt
+		if memberStatus.RecentTerminationsSince(termination.Time) == 0 {
+			memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, termination)
+		}
+
+		previousTermination := containerStatus.LastTerminationState.Terminated
+		allowedRestartPeriod := time.Now().Add(terminationRestartPeriod)
+		if previousTermination != nil && !previousTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
+			r.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is continuously restarting - we will terminate it")
+			*markAsTerminated = true
+		} else {
+			*markAsTerminated = false
+			r.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
+		}
+	}
+}
+
 // InspectPods lists all pods that belong to the given deployment and updates
 // the member status of the deployment accordingly.
 // Returns: Interval_till_next_inspection, error
@@ -102,25 +127,8 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 			wasTerminated := memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated)
 			markAsTerminated := true
 
-			if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
-				containerStatus, exist := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
-				if exist && containerStatus.State.Terminated != nil {
-					termination := containerStatus.State.Terminated.FinishedAt
-					if memberStatus.RecentTerminationsSince(termination.Time) == 0 {
-						memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, termination)
-						wasTerminated = true
-						markAsTerminated = false
-					}
-
-					lastTermination := containerStatus.LastTerminationState.Terminated
-					allowedRestartPeriod := time.Now().Add(time.Second * -10)
-					if lastTermination != nil && lastTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
-						log.Debug().Str("pod-name", pod.GetName()).Msg("pod is continuously restarting - we will terminate it")
-						markAsTerminated = true
-					} else {
-						log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
-					}
-				}
+			if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !wasTerminated {
+				r.handleRestartedPod(pod, &memberStatus, &wasTerminated, &markAsTerminated)
 			}
 
 			if markAsTerminated && memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
@@ -139,26 +147,8 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 			wasTerminated := memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated)
 			markAsTerminated := true
 
-			if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
-				containerStatus, exist := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
-				if exist && containerStatus.State.Terminated != nil {
-					termination := containerStatus.State.Terminated.FinishedAt
-					if memberStatus.RecentTerminationsSince(termination.Time) == 0 {
-						memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, termination)
-						wasTerminated = true
-						markAsTerminated = false
-					}
-
-					lastTermination := containerStatus.LastTerminationState.Terminated
-					allowedRestartPeriod := time.Now().Add(time.Second * -10)
-					if lastTermination != nil && lastTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
-						log.Debug().Str("pod-name", pod.GetName()).Msg("pod is continuously restarting - we will terminate it")
-						markAsTerminated = true
-					} else {
-						log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
-					}
-
-				}
-			}
+			if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !wasTerminated {
+				r.handleRestartedPod(pod, &memberStatus, &wasTerminated, &markAsTerminated)
+			}
 
 			if markAsTerminated && memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
@@ -284,10 +274,6 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 					memberStatus.Conditions.Update(api.ConditionTypeServing, true, "Pod Serving", "")) {
 					log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Ready, Started & Serving to true")
 
-					if pod.Spec.RestartPolicy == core.RestartPolicyAlways && memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
-						memberStatus.Conditions.Update(api.ConditionTypeTerminated, false, "Pod Successfully Restarted", "")
-					}
-
 					if status.Topology.IsTopologyOwned(memberStatus.Topology) {
 						nodes, err := cachedStatus.Node().V1()
 						if err == nil {