diff --git a/pkg/deployment/reconcile/action_remove_member.go b/pkg/deployment/reconcile/action_remove_member.go index 613bc4fbe..08df4bba0 100644 --- a/pkg/deployment/reconcile/action_remove_member.go +++ b/pkg/deployment/reconcile/action_remove_member.go @@ -70,6 +70,22 @@ func (a *actionRemoveMember) Start(ctx context.Context) (bool, error) { if err := arangod.RemoveServerFromCluster(ctx, client.Connection(), driver.ServerID(m.ID)); err != nil { if !driver.IsNotFound(err) && !driver.IsPreconditionFailed(err) { return false, maskAny(errors.Wrapf(err, "Failed to remove server from cluster: %#v", err)) + } else if driver.IsPreconditionFailed(err) { + cluster, err := client.Cluster(ctx) + if err != nil { + return false, maskAny(errors.Wrapf(err, "Failed to obtain cluster: %#v", err)) + } + health, err := cluster.Health(ctx) + if err != nil { + return false, maskAny(errors.Wrapf(err, "Failed to obtain cluster health: %#v", err)) + } + // We don't care if not found + if record, ok := health.Health[driver.ServerID(m.ID)]; ok { + if record.Status != driver.ServerStatusFailed { + return false, maskAny(fmt.Errorf("can not remove server from cluster. Not yet terminated. Retry later")) + } + a.log.Warn().Msg("dbserver is failed but still in use") + } } else { a.log.Warn().Msgf("ignoring error: %s", err.Error()) } diff --git a/pkg/deployment/resources/pod_termination.go b/pkg/deployment/resources/pod_termination.go index 6eff6c80e..8e065681a 100644 --- a/pkg/deployment/resources/pod_termination.go +++ b/pkg/deployment/resources/pod_termination.go @@ -123,6 +123,12 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol log.Debug().Msg("Pod is already failed, safe to remove dbserver pod") return nil } + // If pod is not member of cluster, do nothing + if !memberStatus.Conditions.IsTrue(api.ConditionTypeMemberOfCluster) { + log.Debug().Msg("Pod is not member of cluster") + return nil + } + // Inspect deployment deletion state apiObject := r.context.GetAPIObject() if apiObject.GetDeletionTimestamp() != nil { @@ -154,6 +160,11 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol dbserverDataWillBeGone = true } + // Once decided to drain the member, never go back + if memberStatus.Phase == api.MemberPhaseCreated { + dbserverDataWillBeGone = true + } + // Is this a simple pod restart? if !dbserverDataWillBeGone { log.Debug().Msg("Pod is just being restarted, safe to remove dbserver pod") @@ -230,11 +241,15 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol return maskAny(err) } if jobStatus.IsFailed() { - log.Warn().Str("reason", jobStatus.Reason()).Msg("Cleanout Job failed. Aborting plan") + log.Warn().Str("reason", jobStatus.Reason()).Msg("Cleanout Job failed") // Revert cleanout state memberStatus.Phase = api.MemberPhaseCreated memberStatus.CleanoutJobID = "" - return maskAny(fmt.Errorf("Clean out server job failed")) + if err := updateMember(memberStatus); err != nil { + return maskAny(err) + } + log.Error().Msg("Cleanout server job failed, continue anyway") + return nil } }