Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions pkg/deployment/reconcile/action_remove_member.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,22 @@ func (a *actionRemoveMember) Start(ctx context.Context) (bool, error) {
if err := arangod.RemoveServerFromCluster(ctx, client.Connection(), driver.ServerID(m.ID)); err != nil {
if !driver.IsNotFound(err) && !driver.IsPreconditionFailed(err) {
return false, maskAny(errors.Wrapf(err, "Failed to remove server from cluster: %#v", err))
} else if driver.IsPreconditionFailed(err) {
cluster, err := client.Cluster(ctx)
if err != nil {
return false, maskAny(errors.Wrapf(err, "Failed to obtain cluster: %#v", err))
}
health, err := cluster.Health(ctx)
if err != nil {
return false, maskAny(errors.Wrapf(err, "Failed to obtain cluster health: %#v", err))
}
// If the server has no health record there is nothing to check; only a record that is not yet failed blocks removal
if record, ok := health.Health[driver.ServerID(m.ID)]; ok {
if record.Status != driver.ServerStatusFailed {
return false, maskAny(fmt.Errorf("can not remove server from cluster. Not yet terminated. Retry later"))
}
a.log.Warn().Msg("dbserver is failed but still in use")
}
} else {
a.log.Warn().Msgf("ignoring error: %s", err.Error())
}
Expand Down
19 changes: 17 additions & 2 deletions pkg/deployment/resources/pod_termination.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
log.Debug().Msg("Pod is already failed, safe to remove dbserver pod")
return nil
}
// If the pod is not a member of the cluster, do nothing
if !memberStatus.Conditions.IsTrue(api.ConditionTypeMemberOfCluster) {
log.Debug().Msg("Pod is not member of cluster")
return nil
}

// Inspect deployment deletion state
apiObject := r.context.GetAPIObject()
if apiObject.GetDeletionTimestamp() != nil {
Expand Down Expand Up @@ -154,6 +160,11 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
dbserverDataWillBeGone = true
}

// Once decided to drain the member, never go back: a member that is already
// in the drain phase keeps being treated as a member whose data will be gone.
// MemberPhaseCreated must not be tested here; it is the phase restored when a
// cleanout is reverted, and matching it would make every plain restart of a
// created member take the data-loss path.
if memberStatus.Phase == api.MemberPhaseDrain {
	dbserverDataWillBeGone = true
}

// Is this a simple pod restart?
if !dbserverDataWillBeGone {
log.Debug().Msg("Pod is just being restarted, safe to remove dbserver pod")
Expand Down Expand Up @@ -230,11 +241,15 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
return maskAny(err)
}
if jobStatus.IsFailed() {
log.Warn().Str("reason", jobStatus.Reason()).Msg("Cleanout Job failed. Aborting plan")
log.Warn().Str("reason", jobStatus.Reason()).Msg("Cleanout Job failed")
// Revert cleanout state
memberStatus.Phase = api.MemberPhaseCreated
memberStatus.CleanoutJobID = ""
return maskAny(fmt.Errorf("Clean out server job failed"))
if err := updateMember(memberStatus); err != nil {
return maskAny(err)
}
log.Error().Msg("Cleanout server job failed, continue anyway")
return nil
}
}

Expand Down