From e3f19461f85a33248e163c9e95079698fe85a374 Mon Sep 17 00:00:00 2001 From: Tomasz Mielech Date: Tue, 12 Jul 2022 10:23:53 +0200 Subject: [PATCH 1/2] restore failed member high plan --- CHANGELOG.md | 1 + pkg/deployment/reconcile/plan_builder_high.go | 3 +- .../reconcile/plan_builder_member_recovery.go | 128 ++++++++++++++++++ .../reconcile/plan_builder_normal.go | 88 +----------- pkg/deployment/reconcile/plan_builder_test.go | 12 +- pkg/deployment/reconcile/reconciler.go | 2 +- 6 files changed, 144 insertions(+), 90 deletions(-) create mode 100644 pkg/deployment/reconcile/plan_builder_member_recovery.go diff --git a/CHANGELOG.md b/CHANGELOG.md index d0bd8ac7f..65b1d63bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ - (Feature) Member restarts metric - (Bugfix) Infinite loop fix in ArangoD AsyncClient - (Bugfix) Add Panic Handler +- (Feature) Recreation member in the high plan ## [1.2.13](https://github.com/arangodb/kube-arangodb/tree/1.2.13) (2022-06-07) - (Bugfix) Fix arangosync members state inspection diff --git a/pkg/deployment/reconcile/plan_builder_high.go b/pkg/deployment/reconcile/plan_builder_high.go index d1303f46e..7a996e0a7 100644 --- a/pkg/deployment/reconcile/plan_builder_high.go +++ b/pkg/deployment/reconcile/plan_builder_high.go @@ -59,7 +59,8 @@ func (r *Reconciler) createHighPlan(ctx context.Context, apiObject k8sutil.APIOb ApplyWithBackOff(BackOffCheck, time.Minute, r.emptyPlanBuilder)). Apply(r.createBackupInProgressConditionPlan). // Discover backups always Apply(r.createMaintenanceConditionPlan). // Discover maintenance always - Apply(r.cleanupConditions) // Cleanup Conditions + Apply(r.cleanupConditions). // Cleanup Conditions + ApplyIfEmpty(r.createMemberFailedRestoreHighPlan) return q.Plan(), q.BackOff(), true } diff --git a/pkg/deployment/reconcile/plan_builder_member_recovery.go b/pkg/deployment/reconcile/plan_builder_member_recovery.go new file mode 100644 index 000000000..737b35d49 --- /dev/null +++ b/pkg/deployment/reconcile/plan_builder_member_recovery.go @@ -0,0 +1,128 @@ +// +// DISCLAIMER +// +// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// + +package reconcile + +import ( + "context" + + api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" + "github.com/arangodb/kube-arangodb/pkg/deployment/actions" + "github.com/arangodb/kube-arangodb/pkg/deployment/agency" + "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" +) + +// createMemberFailedRestoreNormalPlan returns only actions which are not recreate member. +func (r *Reconciler) createMemberFailedRestoreNormalPlan(ctx context.Context, apiObject k8sutil.APIObject, + spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan { + condition := func(a api.Action) bool { + return a.Type != api.ActionTypeRecreateMember + } + + return r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context).Filter(condition) +} + +// createMemberFailedRestoreHighPlan returns only recreate member actions. +func (r *Reconciler) createMemberFailedRestoreHighPlan(ctx context.Context, apiObject k8sutil.APIObject, + spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan { + condition := func(a api.Action) bool { + return a.Type == api.ActionTypeRecreateMember + } + + return r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context).Filter(condition) +} + +func (r *Reconciler) createMemberFailedRestoreInternal(_ context.Context, _ k8sutil.APIObject, spec api.DeploymentSpec, + status api.DeploymentStatus, context PlanBuilderContext) api.Plan { + var plan api.Plan + + // Fetch agency plan. + agencyState, agencyOK := context.GetAgencyCache() + + // Check for members in failed state. + status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error { + failed := 0 + for _, m := range members { + if m.Phase == api.MemberPhaseFailed { + failed++ + } + } + for _, m := range members { + if m.Phase != api.MemberPhaseFailed || len(plan) > 0 { + continue + } + + memberLog := r.log.Str("id", m.ID).Str("role", group.AsRole()) + + if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster { + if !agencyOK { + // If agency is down DBServers should not be touched. + memberLog.Info("Agency state is not present") + continue + } + + if c := spec.DBServers.GetCount(); c <= len(members)-failed { + // There are more or equal alive members than current count. A member should not be recreated. + continue + } + + if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) { + // DBServer still exists in agency plan! Will not be removed, but needs to be recreated. + memberLog.Info("Recreating DBServer - it cannot be removed gracefully") + plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m)) + + continue + } + // From here on, DBServer can be recreated. + } + + switch group { + case api.ServerGroupAgents: + // For agents just recreate member do not rotate ID, do not remove PVC or service. + memberLog.Info("Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss") + plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m)) + case api.ServerGroupSingle: + // Do not remove data for single. + memberLog.Info("Restoring old member. Rotation for single servers is not safe") + plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m)) + default: + if spec.GetAllowMemberRecreation(group) { + memberLog.Info("Creating member replacement plan because member has failed") + plan = append(plan, + actions.NewAction(api.ActionTypeRemoveMember, group, m), + actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")), + actions.NewAction(api.ActionTypeWaitForMemberUp, group, withPredefinedMember(api.MemberIDPreviousAction)), + ) + } else { + memberLog.Info("Restoring old member. Recreation is disabled for group") + plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m)) + } + } + } + return nil + }) + + if len(plan) == 0 && !agencyOK { + r.log.Warn("unable to build further plan without access to agency") + plan = append(plan, actions.NewClusterAction(api.ActionTypeIdle)) + } + + return plan +} diff --git a/pkg/deployment/reconcile/plan_builder_normal.go b/pkg/deployment/reconcile/plan_builder_normal.go index a6a8cdf7d..f8072782f 100644 --- a/pkg/deployment/reconcile/plan_builder_normal.go +++ b/pkg/deployment/reconcile/plan_builder_normal.go @@ -24,8 +24,6 @@ import ( "context" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" - "github.com/arangodb/kube-arangodb/pkg/deployment/actions" - "github.com/arangodb/kube-arangodb/pkg/deployment/agency" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) @@ -50,7 +48,7 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API // Check for scale up ApplyIfEmpty(r.createScaleUPMemberPlan). // Check for failed members - ApplyIfEmpty(r.createMemberFailedRestorePlan). + ApplyIfEmpty(r.createMemberFailedRestoreNormalPlan). // Check for scale up/down ApplyIfEmpty(r.createScaleMemberPlan). // Update status @@ -86,90 +84,6 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API return q.Plan(), q.BackOff(), true } -func (r *Reconciler) createMemberFailedRestorePlan(ctx context.Context, apiObject k8sutil.APIObject, - spec api.DeploymentSpec, status api.DeploymentStatus, - context PlanBuilderContext) api.Plan { - var plan api.Plan - - // Fetch agency plan - agencyState, agencyOK := context.GetAgencyCache() - - // Check for members in failed state - status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error { - failed := 0 - for _, m := range members { - if m.Phase == api.MemberPhaseFailed { - failed++ - } - } - for _, m := range members { - if m.Phase != api.MemberPhaseFailed || len(plan) > 0 { - continue - } - - memberLog := r.log.Str("id", m.ID).Str("role", group.AsRole()) - - if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster { - // Do pre check for DBServers. If agency is down DBServers should not be touch - if !agencyOK { - memberLog.Info("Agency state is not present") - continue - } - - if c := spec.DBServers.GetCount(); c <= len(members)-failed { - // We have more or equal alive members than current count, we should not recreate this member - continue - } - - if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) { - // DBServer still exists in agency plan! Will not be removed, but needs to be recreated - memberLog.Info("Recreating DBServer - it cannot be removed gracefully") - plan = append(plan, - actions.NewAction(api.ActionTypeRecreateMember, group, m)) - continue - } - - // Everything is fine, proceed - } - - switch group { - case api.ServerGroupAgents: - // For agents just recreate member do not rotate ID, do not remove PVC or service - memberLog.Info("Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss") - plan = append(plan, - actions.NewAction(api.ActionTypeRecreateMember, group, m)) - case api.ServerGroupSingle: - // Do not remove data for singles - memberLog.Info("Restoring old member. Rotation for single servers is not safe") - plan = append(plan, - actions.NewAction(api.ActionTypeRecreateMember, group, m)) - default: - if spec.GetAllowMemberRecreation(group) { - memberLog.Info("Creating member replacement plan because member has failed") - plan = append(plan, - actions.NewAction(api.ActionTypeRemoveMember, group, m), - actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")), - ) - } else { - memberLog.Info("Restoring old member. Recreation is disabled for group") - plan = append(plan, - actions.NewAction(api.ActionTypeRecreateMember, group, m)) - } - } - } - return nil - }) - - // Ensure that we were able to get agency info - if len(plan) == 0 && !agencyOK { - r.log.Warn("unable to build further plan without access to agency") - plan = append(plan, - actions.NewClusterAction(api.ActionTypeIdle)) - } - - return plan -} - func (r *Reconciler) createRemoveCleanedDBServersPlan(ctx context.Context, apiObject k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan { diff --git a/pkg/deployment/reconcile/plan_builder_test.go b/pkg/deployment/reconcile/plan_builder_test.go index 16a0dd26d..479fb6e8f 100644 --- a/pkg/deployment/reconcile/plan_builder_test.go +++ b/pkg/deployment/reconcile/plan_builder_test.go @@ -1036,8 +1036,14 @@ func TestCreatePlan(t *testing.T) { } ad.Status.Members.Agents[0].Phase = api.MemberPhaseFailed ad.Status.Members.Agents[0].ID = "id" + for i := range ad.Status.Members.Coordinators { + ad.Status.Members.Coordinators[i].Phase = api.MemberPhaseCreated + } + for i := range ad.Status.Members.DBServers { + ad.Status.Members.DBServers[i].Phase = api.MemberPhaseCreated + } }, - ExpectedPlan: []api.Action{ + ExpectedHighPlan: []api.Action{ actions.NewAction(api.ActionTypeRecreateMember, api.ServerGroupAgents, withPredefinedMember("id")), }, ExpectedLog: "Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss", @@ -1057,6 +1063,8 @@ func TestCreatePlan(t *testing.T) { ExpectedPlan: []api.Action{ actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupCoordinators, withPredefinedMember("id")), actions.NewAction(api.ActionTypeAddMember, api.ServerGroupCoordinators, withPredefinedMember("")), + actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupCoordinators, + withPredefinedMember(api.MemberIDPreviousAction)), }, ExpectedLog: "Creating member replacement plan because member has failed", }, @@ -1075,6 +1083,8 @@ func TestCreatePlan(t *testing.T) { ExpectedPlan: []api.Action{ actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, withPredefinedMember("id")), actions.NewAction(api.ActionTypeAddMember, api.ServerGroupDBServers, withPredefinedMember("")), + actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupDBServers, + withPredefinedMember(api.MemberIDPreviousAction)), }, ExpectedLog: "Creating member replacement plan because member has failed", }, diff --git a/pkg/deployment/reconcile/reconciler.go b/pkg/deployment/reconcile/reconciler.go index 252935c27..80b4c1e72 100644 --- a/pkg/deployment/reconcile/reconciler.go +++ b/pkg/deployment/reconcile/reconciler.go @@ -76,7 +76,7 @@ func (r *Reconciler) CheckDeployment(ctx context.Context) error { } if err := cache.Client().Kubernetes().CoreV1().Secrets(cache.Namespace()).Delete(ctx, m.PodName, meta.DeleteOptions{}); err != nil { - r.log.Err(err).Error("Failed to delete pod") + r.log.Err(err).Error("Failed to delete secret") } m.Phase = api.MemberPhaseNone From 305c884f98613df1bd7317a14857c7afe7b48e17 Mon Sep 17 00:00:00 2001 From: ajanikow <12255597+ajanikow@users.noreply.github.com> Date: Sat, 16 Jul 2022 05:40:35 +0000 Subject: [PATCH 2/2] Move plan build higher in order --- pkg/deployment/reconcile/plan_builder_high.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/deployment/reconcile/plan_builder_high.go b/pkg/deployment/reconcile/plan_builder_high.go index baff55ea4..a10de4a00 100644 --- a/pkg/deployment/reconcile/plan_builder_high.go +++ b/pkg/deployment/reconcile/plan_builder_high.go @@ -57,11 +57,11 @@ func (r *Reconciler) createHighPlan(ctx context.Context, apiObject k8sutil.APIOb ApplyIfEmptyWithBackOff(LicenseCheck, 30*time.Second, r.updateClusterLicense). ApplyIfEmpty(r.createTopologyMemberConditionPlan). ApplyIfEmpty(r.createRebalancerCheckPlan). + ApplyIfEmpty(r.createMemberFailedRestoreHighPlan). ApplyWithBackOff(BackOffCheck, time.Minute, r.emptyPlanBuilder)). Apply(r.createBackupInProgressConditionPlan). // Discover backups always Apply(r.createMaintenanceConditionPlan). // Discover maintenance always - Apply(r.cleanupConditions). // Cleanup Conditions - ApplyIfEmpty(r.createMemberFailedRestoreHighPlan) + Apply(r.cleanupConditions) // Cleanup Conditions return q.Plan(), q.BackOff(), true }