diff --git a/CHANGELOG.md b/CHANGELOG.md index 788e5c58c..e06e4441a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - (Feature) Move PVC resize action to high-priority plan - (Feature) Remove forgotten ArangoDB jobs during restart - (Feature) Add support for managed services +- (Feature) Recreate failed members in the high-priority plan ## [1.2.14](https://github.com/arangodb/kube-arangodb/tree/1.2.14) (2022-07-14) - (Feature) Add ArangoSync TLS based rotation diff --git a/pkg/deployment/reconcile/plan_builder_high.go b/pkg/deployment/reconcile/plan_builder_high.go index e11653653..a10de4a00 100644 --- a/pkg/deployment/reconcile/plan_builder_high.go +++ b/pkg/deployment/reconcile/plan_builder_high.go @@ -57,6 +57,7 @@ func (r *Reconciler) createHighPlan(ctx context.Context, apiObject k8sutil.APIOb ApplyIfEmptyWithBackOff(LicenseCheck, 30*time.Second, r.updateClusterLicense). ApplyIfEmpty(r.createTopologyMemberConditionPlan). ApplyIfEmpty(r.createRebalancerCheckPlan). + ApplyIfEmpty(r.createMemberFailedRestoreHighPlan). ApplyWithBackOff(BackOffCheck, time.Minute, r.emptyPlanBuilder)). Apply(r.createBackupInProgressConditionPlan). // Discover backups always Apply(r.createMaintenanceConditionPlan). // Discover maintenance always diff --git a/pkg/deployment/reconcile/plan_builder_member_recovery.go b/pkg/deployment/reconcile/plan_builder_member_recovery.go new file mode 100644 index 000000000..737b35d49 --- /dev/null +++ b/pkg/deployment/reconcile/plan_builder_member_recovery.go @@ -0,0 +1,128 @@ +// +// DISCLAIMER +// +// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// + +package reconcile + +import ( + "context" + + api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" + "github.com/arangodb/kube-arangodb/pkg/deployment/actions" + "github.com/arangodb/kube-arangodb/pkg/deployment/agency" + "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" +) + +// createMemberFailedRestoreNormalPlan returns the failed-member restore plan with all RecreateMember actions filtered out. +func (r *Reconciler) createMemberFailedRestoreNormalPlan(ctx context.Context, apiObject k8sutil.APIObject, + spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan { + condition := func(a api.Action) bool { + return a.Type != api.ActionTypeRecreateMember + } + + return r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context).Filter(condition) +} + +// createMemberFailedRestoreHighPlan returns only the RecreateMember actions of the failed-member restore plan. +func (r *Reconciler) createMemberFailedRestoreHighPlan(ctx context.Context, apiObject k8sutil.APIObject, + spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan { + condition := func(a api.Action) bool { + return a.Type == api.ActionTypeRecreateMember + } + + return r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context).Filter(condition) +} + +func (r *Reconciler) createMemberFailedRestoreInternal(_ context.Context, _ k8sutil.APIObject, spec api.DeploymentSpec, + status api.DeploymentStatus, context PlanBuilderContext) api.Plan { + var plan api.Plan + + // Fetch the agency state from the agency cache. 
+ agencyState, agencyOK := context.GetAgencyCache() + + // Check for members in failed state. + status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error { + failed := 0 + for _, m := range members { + if m.Phase == api.MemberPhaseFailed { + failed++ + } + } + for _, m := range members { + if m.Phase != api.MemberPhaseFailed || len(plan) > 0 { + continue + } + + memberLog := r.log.Str("id", m.ID).Str("role", group.AsRole()) + + if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster { + if !agencyOK { + // If agency is down DBServers should not be touched. + memberLog.Info("Agency state is not present") + continue + } + + if c := spec.DBServers.GetCount(); c <= len(members)-failed { + // There are more or equal alive members than current count. A member should not be recreated. + continue + } + + if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) { + // DBServer still exists in agency plan! Will not be removed, but needs to be recreated. + memberLog.Info("Recreating DBServer - it cannot be removed gracefully") + plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m)) + + continue + } + // From here on, DBServer can be recreated. + } + + switch group { + case api.ServerGroupAgents: + // For agents just recreate member do not rotate ID, do not remove PVC or service. + memberLog.Info("Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss") + plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m)) + case api.ServerGroupSingle: + // Do not remove data for single. + memberLog.Info("Restoring old member. 
Rotation for single servers is not safe") + plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m)) + default: + if spec.GetAllowMemberRecreation(group) { + memberLog.Info("Creating member replacement plan because member has failed") + plan = append(plan, + actions.NewAction(api.ActionTypeRemoveMember, group, m), + actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")), + actions.NewAction(api.ActionTypeWaitForMemberUp, group, withPredefinedMember(api.MemberIDPreviousAction)), + ) + } else { + memberLog.Info("Restoring old member. Recreation is disabled for group") + plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m)) + } + } + } + return nil + }) + + if len(plan) == 0 && !agencyOK { + r.log.Warn("unable to build further plan without access to agency") + plan = append(plan, actions.NewClusterAction(api.ActionTypeIdle)) + } + + return plan +} diff --git a/pkg/deployment/reconcile/plan_builder_normal.go b/pkg/deployment/reconcile/plan_builder_normal.go index a057b355c..5204be255 100644 --- a/pkg/deployment/reconcile/plan_builder_normal.go +++ b/pkg/deployment/reconcile/plan_builder_normal.go @@ -24,8 +24,6 @@ import ( "context" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" - "github.com/arangodb/kube-arangodb/pkg/deployment/actions" - "github.com/arangodb/kube-arangodb/pkg/deployment/agency" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) @@ -50,7 +48,7 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API // Check for scale up ApplyIfEmpty(r.createScaleUPMemberPlan). // Check for failed members - ApplyIfEmpty(r.createMemberFailedRestorePlan). + ApplyIfEmpty(r.createMemberFailedRestoreNormalPlan). // Check for scale up/down ApplyIfEmpty(r.createScaleMemberPlan). 
// Update status @@ -86,90 +84,6 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API return q.Plan(), q.BackOff(), true } -func (r *Reconciler) createMemberFailedRestorePlan(ctx context.Context, apiObject k8sutil.APIObject, - spec api.DeploymentSpec, status api.DeploymentStatus, - context PlanBuilderContext) api.Plan { - var plan api.Plan - - // Fetch agency plan - agencyState, agencyOK := context.GetAgencyCache() - - // Check for members in failed state - status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error { - failed := 0 - for _, m := range members { - if m.Phase == api.MemberPhaseFailed { - failed++ - } - } - for _, m := range members { - if m.Phase != api.MemberPhaseFailed || len(plan) > 0 { - continue - } - - memberLog := r.log.Str("id", m.ID).Str("role", group.AsRole()) - - if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster { - // Do pre check for DBServers. If agency is down DBServers should not be touch - if !agencyOK { - memberLog.Info("Agency state is not present") - continue - } - - if c := spec.DBServers.GetCount(); c <= len(members)-failed { - // We have more or equal alive members than current count, we should not recreate this member - continue - } - - if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) { - // DBServer still exists in agency plan! Will not be removed, but needs to be recreated - memberLog.Info("Recreating DBServer - it cannot be removed gracefully") - plan = append(plan, - actions.NewAction(api.ActionTypeRecreateMember, group, m)) - continue - } - - // Everything is fine, proceed - } - - switch group { - case api.ServerGroupAgents: - // For agents just recreate member do not rotate ID, do not remove PVC or service - memberLog.Info("Restoring old member. 
For agency members recreation of PVC is not supported - to prevent DataLoss") - plan = append(plan, - actions.NewAction(api.ActionTypeRecreateMember, group, m)) - case api.ServerGroupSingle: - // Do not remove data for singles - memberLog.Info("Restoring old member. Rotation for single servers is not safe") - plan = append(plan, - actions.NewAction(api.ActionTypeRecreateMember, group, m)) - default: - if spec.GetAllowMemberRecreation(group) { - memberLog.Info("Creating member replacement plan because member has failed") - plan = append(plan, - actions.NewAction(api.ActionTypeRemoveMember, group, m), - actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")), - ) - } else { - memberLog.Info("Restoring old member. Recreation is disabled for group") - plan = append(plan, - actions.NewAction(api.ActionTypeRecreateMember, group, m)) - } - } - } - return nil - }) - - // Ensure that we were able to get agency info - if len(plan) == 0 && !agencyOK { - r.log.Warn("unable to build further plan without access to agency") - plan = append(plan, - actions.NewClusterAction(api.ActionTypeIdle)) - } - - return plan -} - func (r *Reconciler) createRemoveCleanedDBServersPlan(ctx context.Context, apiObject k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan { diff --git a/pkg/deployment/reconcile/plan_builder_test.go b/pkg/deployment/reconcile/plan_builder_test.go index 25534db5a..8440b448b 100644 --- a/pkg/deployment/reconcile/plan_builder_test.go +++ b/pkg/deployment/reconcile/plan_builder_test.go @@ -1017,8 +1017,14 @@ func TestCreatePlan(t *testing.T) { } ad.Status.Members.Agents[0].Phase = api.MemberPhaseFailed ad.Status.Members.Agents[0].ID = "id" + for i := range ad.Status.Members.Coordinators { + ad.Status.Members.Coordinators[i].Phase = api.MemberPhaseCreated + } + for i := range ad.Status.Members.DBServers { + ad.Status.Members.DBServers[i].Phase = api.MemberPhaseCreated + } }, - ExpectedPlan: 
[]api.Action{ + ExpectedHighPlan: []api.Action{ actions.NewAction(api.ActionTypeRecreateMember, api.ServerGroupAgents, withPredefinedMember("id")), }, ExpectedLog: "Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss", @@ -1038,6 +1044,8 @@ func TestCreatePlan(t *testing.T) { ExpectedPlan: []api.Action{ actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupCoordinators, withPredefinedMember("id")), actions.NewAction(api.ActionTypeAddMember, api.ServerGroupCoordinators, withPredefinedMember("")), + actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupCoordinators, + withPredefinedMember(api.MemberIDPreviousAction)), }, ExpectedLog: "Creating member replacement plan because member has failed", }, @@ -1056,6 +1064,8 @@ func TestCreatePlan(t *testing.T) { ExpectedPlan: []api.Action{ actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, withPredefinedMember("id")), actions.NewAction(api.ActionTypeAddMember, api.ServerGroupDBServers, withPredefinedMember("")), + actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupDBServers, + withPredefinedMember(api.MemberIDPreviousAction)), }, ExpectedLog: "Creating member replacement plan because member has failed", }, diff --git a/pkg/deployment/reconcile/reconciler.go b/pkg/deployment/reconcile/reconciler.go index 252935c27..80b4c1e72 100644 --- a/pkg/deployment/reconcile/reconciler.go +++ b/pkg/deployment/reconcile/reconciler.go @@ -76,7 +76,7 @@ func (r *Reconciler) CheckDeployment(ctx context.Context) error { } if err := cache.Client().Kubernetes().CoreV1().Secrets(cache.Namespace()).Delete(ctx, m.PodName, meta.DeleteOptions{}); err != nil { - r.log.Err(err).Error("Failed to delete pod") + r.log.Err(err).Error("Failed to delete secret") } m.Phase = api.MemberPhaseNone