Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- (Feature) Move PVC resize action to high-priority plan
- (Feature) Remove forgotten ArangoDB jobs during restart
- (Feature) Add support for managed services
- (Feature) Recreate member in the high-priority plan

## [1.2.14](https://github.com/arangodb/kube-arangodb/tree/1.2.14) (2022-07-14)
- (Feature) Add ArangoSync TLS based rotation
Expand Down
1 change: 1 addition & 0 deletions pkg/deployment/reconcile/plan_builder_high.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ func (r *Reconciler) createHighPlan(ctx context.Context, apiObject k8sutil.APIOb
ApplyIfEmptyWithBackOff(LicenseCheck, 30*time.Second, r.updateClusterLicense).
ApplyIfEmpty(r.createTopologyMemberConditionPlan).
ApplyIfEmpty(r.createRebalancerCheckPlan).
ApplyIfEmpty(r.createMemberFailedRestoreHighPlan).
ApplyWithBackOff(BackOffCheck, time.Minute, r.emptyPlanBuilder)).
Apply(r.createBackupInProgressConditionPlan). // Discover backups always
Apply(r.createMaintenanceConditionPlan). // Discover maintenance always
Expand Down
128 changes: 128 additions & 0 deletions pkg/deployment/reconcile/plan_builder_member_recovery.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//

package reconcile

import (
"context"

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)

// createMemberFailedRestoreNormalPlan builds the failed-member restore plan and keeps every
// action except RecreateMember ones, which are left to createMemberFailedRestoreHighPlan.
func (r *Reconciler) createMemberFailedRestoreNormalPlan(ctx context.Context, apiObject k8sutil.APIObject,
	spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
	fullPlan := r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context)

	return fullPlan.Filter(func(action api.Action) bool {
		return action.Type != api.ActionTypeRecreateMember
	})
}

// createMemberFailedRestoreHighPlan builds the failed-member restore plan and keeps only the
// RecreateMember actions; everything else is left to createMemberFailedRestoreNormalPlan.
func (r *Reconciler) createMemberFailedRestoreHighPlan(ctx context.Context, apiObject k8sutil.APIObject,
	spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
	fullPlan := r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context)

	return fullPlan.Filter(func(action api.Action) bool {
		return action.Type == api.ActionTypeRecreateMember
	})
}

// createMemberFailedRestoreInternal builds the plan for a member in the Failed phase.
//
// Members are visited group by group; once the plan holds any action the remaining members
// are skipped, so one call plans for at most one failed member.
//
// A failed DBServer of a cluster deployment is pre-checked first:
//   - without agency state it is left untouched,
//   - if the members which are not failed already cover the requested DBServer count it is left untouched,
//   - if the agency plan still references it, it is recreated.
//
// Every other failed member is either recreated (agents, single servers, groups with member
// recreation disallowed) or replaced through RemoveMember, AddMember and WaitForMemberUp.
//
// When nothing was planned and the agency state is unavailable, a single Idle cluster action is returned.
func (r *Reconciler) createMemberFailedRestoreInternal(_ context.Context, _ k8sutil.APIObject, spec api.DeploymentSpec,
	status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
	var result api.Plan

	// Fetch agency state once; it is consulted for cluster DBServers and for the Idle fallback.
	agencyState, agencyOK := context.GetAgencyCache()

	status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
		// Count failed members up front: the DBServer pre-check compares the remaining ones with the spec count.
		failedCount := 0
		for i := range members {
			if members[i].Phase == api.MemberPhaseFailed {
				failedCount++
			}
		}
		notFailed := len(members) - failedCount

		for _, member := range members {
			// Only failed members are of interest, and only while nothing has been planned yet.
			if len(result) > 0 || member.Phase != api.MemberPhaseFailed {
				continue
			}

			memberLog := r.log.Str("id", member.ID).Str("role", group.AsRole())

			// restoreInPlace plans the recreation of the current member.
			restoreInPlace := func() {
				result = append(result, actions.NewAction(api.ActionTypeRecreateMember, group, member))
			}

			if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster {
				switch {
				case !agencyOK:
					// If agency is down DBServers should not be touched.
					memberLog.Info("Agency state is not present")
					continue
				case spec.DBServers.GetCount() <= notFailed:
					// Enough members which are not failed exist for the requested count. Do not recreate.
					continue
				case agencyState.Plan.Collections.IsDBServerPresent(agency.Server(member.ID)):
					// DBServer still exists in agency plan! Will not be removed, but needs to be recreated.
					memberLog.Info("Recreating DBServer - it cannot be removed gracefully")
					restoreInPlace()
					continue
				}
				// No pre-check matched, DBServer is handled like any other member below.
			}

			switch {
			case group == api.ServerGroupAgents:
				// For agents just recreate member: do not rotate ID, do not remove PVC or service.
				memberLog.Info("Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss")
				restoreInPlace()
			case group == api.ServerGroupSingle:
				// Do not remove data for single.
				memberLog.Info("Restoring old member. Rotation for single servers is not safe")
				restoreInPlace()
			case !spec.GetAllowMemberRecreation(group):
				memberLog.Info("Restoring old member. Recreation is disabled for group")
				restoreInPlace()
			default:
				memberLog.Info("Creating member replacement plan because member has failed")
				result = append(result,
					actions.NewAction(api.ActionTypeRemoveMember, group, member),
					actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")),
					actions.NewAction(api.ActionTypeWaitForMemberUp, group, withPredefinedMember(api.MemberIDPreviousAction)),
				)
			}
		}
		return nil
	})

	if agencyOK || len(result) > 0 {
		return result
	}

	// Nothing planned and no agency state: plan an Idle cluster action.
	r.log.Warn("unable to build further plan without access to agency")
	return append(result, actions.NewClusterAction(api.ActionTypeIdle))
}
88 changes: 1 addition & 87 deletions pkg/deployment/reconcile/plan_builder_normal.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ import (
"context"

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)

Expand All @@ -50,7 +48,7 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API
// Check for scale up
ApplyIfEmpty(r.createScaleUPMemberPlan).
// Check for failed members
ApplyIfEmpty(r.createMemberFailedRestorePlan).
ApplyIfEmpty(r.createMemberFailedRestoreNormalPlan).
// Check for scale up/down
ApplyIfEmpty(r.createScaleMemberPlan).
// Update status
Expand Down Expand Up @@ -86,90 +84,6 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API
return q.Plan(), q.BackOff(), true
}

// createMemberFailedRestorePlan builds the plan for a member in the Failed phase.
// Once the plan holds any action the remaining members are skipped, so a single call
// plans for at most one failed member. The member is either recreated or replaced by a
// RemoveMember followed by an AddMember action. If nothing was planned and the agency
// state is unavailable, a single Idle cluster action is returned.
func (r *Reconciler) createMemberFailedRestorePlan(ctx context.Context, apiObject k8sutil.APIObject,
spec api.DeploymentSpec, status api.DeploymentStatus,
context PlanBuilderContext) api.Plan {
var plan api.Plan

// Fetch agency plan
agencyState, agencyOK := context.GetAgencyCache()

// Check for members in failed state
status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
// Count the failed members of this group; the DBServer pre-check below needs it.
failed := 0
for _, m := range members {
if m.Phase == api.MemberPhaseFailed {
failed++
}
}
for _, m := range members {
// Skip members which are not failed, and everything once an action has been planned.
if m.Phase != api.MemberPhaseFailed || len(plan) > 0 {
continue
}

memberLog := r.log.Str("id", m.ID).Str("role", group.AsRole())

if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster {
// Do pre-check for DBServers. If agency is down DBServers should not be touched.
if !agencyOK {
memberLog.Info("Agency state is not present")
continue
}

if c := spec.DBServers.GetCount(); c <= len(members)-failed {
// There are more or equal alive members than the current count, this member should not be recreated.
continue
}

if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) {
// DBServer still exists in agency plan! Will not be removed, but needs to be recreated.
memberLog.Info("Recreating DBServer - it cannot be removed gracefully")
plan = append(plan,
actions.NewAction(api.ActionTypeRecreateMember, group, m))
continue
}

// Everything is fine, proceed with the generic handling below.
}

switch group {
case api.ServerGroupAgents:
// For agents just recreate member: do not rotate ID, do not remove PVC or service.
memberLog.Info("Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss")
plan = append(plan,
actions.NewAction(api.ActionTypeRecreateMember, group, m))
case api.ServerGroupSingle:
// Do not remove data for singles.
memberLog.Info("Restoring old member. Rotation for single servers is not safe")
plan = append(plan,
actions.NewAction(api.ActionTypeRecreateMember, group, m))
default:
// Remaining groups: replace the member when recreation is allowed for the group, otherwise recreate it.
if spec.GetAllowMemberRecreation(group) {
memberLog.Info("Creating member replacement plan because member has failed")
plan = append(plan,
actions.NewAction(api.ActionTypeRemoveMember, group, m),
actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")),
)
} else {
memberLog.Info("Restoring old member. Recreation is disabled for group")
plan = append(plan,
actions.NewAction(api.ActionTypeRecreateMember, group, m))
}
}
}
return nil
})

// Nothing was planned and the agency state is unavailable: plan an Idle cluster action.
if len(plan) == 0 && !agencyOK {
r.log.Warn("unable to build further plan without access to agency")
plan = append(plan,
actions.NewClusterAction(api.ActionTypeIdle))
}

return plan
}

func (r *Reconciler) createRemoveCleanedDBServersPlan(ctx context.Context, apiObject k8sutil.APIObject,
spec api.DeploymentSpec, status api.DeploymentStatus,
context PlanBuilderContext) api.Plan {
Expand Down
12 changes: 11 additions & 1 deletion pkg/deployment/reconcile/plan_builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1017,8 +1017,14 @@ func TestCreatePlan(t *testing.T) {
}
ad.Status.Members.Agents[0].Phase = api.MemberPhaseFailed
ad.Status.Members.Agents[0].ID = "id"
for i := range ad.Status.Members.Coordinators {
ad.Status.Members.Coordinators[i].Phase = api.MemberPhaseCreated
}
for i := range ad.Status.Members.DBServers {
ad.Status.Members.DBServers[i].Phase = api.MemberPhaseCreated
}
},
ExpectedPlan: []api.Action{
ExpectedHighPlan: []api.Action{
actions.NewAction(api.ActionTypeRecreateMember, api.ServerGroupAgents, withPredefinedMember("id")),
},
ExpectedLog: "Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss",
Expand All @@ -1038,6 +1044,8 @@ func TestCreatePlan(t *testing.T) {
ExpectedPlan: []api.Action{
actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupCoordinators, withPredefinedMember("id")),
actions.NewAction(api.ActionTypeAddMember, api.ServerGroupCoordinators, withPredefinedMember("")),
actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupCoordinators,
withPredefinedMember(api.MemberIDPreviousAction)),
},
ExpectedLog: "Creating member replacement plan because member has failed",
},
Expand All @@ -1056,6 +1064,8 @@ func TestCreatePlan(t *testing.T) {
ExpectedPlan: []api.Action{
actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, withPredefinedMember("id")),
actions.NewAction(api.ActionTypeAddMember, api.ServerGroupDBServers, withPredefinedMember("")),
actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupDBServers,
withPredefinedMember(api.MemberIDPreviousAction)),
},
ExpectedLog: "Creating member replacement plan because member has failed",
},
Expand Down
2 changes: 1 addition & 1 deletion pkg/deployment/reconcile/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ func (r *Reconciler) CheckDeployment(ctx context.Context) error {
}

if err := cache.Client().Kubernetes().CoreV1().Secrets(cache.Namespace()).Delete(ctx, m.PodName, meta.DeleteOptions{}); err != nil {
r.log.Err(err).Error("Failed to delete pod")
r.log.Err(err).Error("Failed to delete secret")
}
m.Phase = api.MemberPhaseNone

Expand Down