diff --git a/pkg/apis/deployment/v1alpha/member_status_list.go b/pkg/apis/deployment/v1alpha/member_status_list.go
index b6bf0fff9..4f88cd453 100644
--- a/pkg/apis/deployment/v1alpha/member_status_list.go
+++ b/pkg/apis/deployment/v1alpha/member_status_list.go
@@ -25,8 +25,10 @@ package v1alpha
 import (
 	"math/rand"
 	"sort"
+	"time"
 
 	"github.com/pkg/errors"
+	v1 "k8s.io/api/core/v1"
 )
 
 // MemberStatusList is a list of MemberStatus entries
@@ -178,3 +180,27 @@ func (l MemberStatusList) MembersReady() int {
 func (l MemberStatusList) AllMembersReady() bool {
 	return len(l) == l.MembersReady()
 }
+
+// AllConditionTrueSince returns true if all members satisfy the condition since the given period
+func (l MemberStatusList) AllConditionTrueSince(cond ConditionType, status v1.ConditionStatus, period time.Duration) bool {
+	for _, x := range l {
+		if c, ok := x.Conditions.Get(cond); ok {
+			if c.Status == status && c.LastTransitionTime.Time.Add(period).Before(time.Now()) {
+				continue
+			}
+		}
+		return false
+	}
+
+	return true
+}
+
+// AllFailed returns true if all members are failed
+func (l MemberStatusList) AllFailed() bool {
+	for _, x := range l {
+		if !x.Phase.IsFailed() {
+			return false
+		}
+	}
+	return true
+}
diff --git a/pkg/deployment/resilience/member_failure.go b/pkg/deployment/resilience/member_failure.go
index fb67492f4..c9d81f3ac 100644
--- a/pkg/deployment/resilience/member_failure.go
+++ b/pkg/deployment/resilience/member_failure.go
@@ -49,11 +49,20 @@ func (r *Resilience) CheckMemberFailure() error {
 			Str("id", m.ID).
 			Str("role", group.AsRole()).
 			Logger()
-		// Check current state
-		if m.Phase != api.MemberPhaseCreated {
-			// Phase is not Created, so we're not looking further.
+
+		// Check if there are Members with Phase Upgrading or Rotation but no plan
+		switch m.Phase {
+		case api.MemberPhaseNone:
 			continue
+		case api.MemberPhaseUpgrading, api.MemberPhaseRotating, api.MemberPhaseCleanOut:
+			if len(status.Plan) == 0 {
+				log.Error().Msgf("No plan but member is in phase %s - marking as failed", m.Phase)
+				m.Phase = api.MemberPhaseFailed
+				status.Members.Update(m, group)
+				updateStatusNeeded = true
+			}
 		}
+
 		// Check if pod is ready
 		if m.Conditions.IsTrue(api.ConditionTypeReady) {
 			// Pod is now ready, so we're not looking further
diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go
index 44ef74f0b..2093da95f 100644
--- a/pkg/deployment/resources/pod_inspector.go
+++ b/pkg/deployment/resources/pod_inspector.go
@@ -237,6 +237,19 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
 	allMembersReady := status.Members.AllMembersReady(spec.GetMode(), spec.Sync.IsEnabled())
 	status.Conditions.Update(api.ConditionTypeReady, allMembersReady, "", "")
 
+	if spec.GetMode().HasCoordinators() && status.Members.Coordinators.AllFailed() {
+		log.Error().Msg("All coordinators failed - reset")
+		for _, m := range status.Members.Coordinators {
+			if err := r.context.DeletePod(m.PodName); err != nil {
+				log.Error().Err(err).Msg("Failed to delete pod")
+			}
+			m.Phase = api.MemberPhaseNone
+			if err := status.Members.Update(m, api.ServerGroupCoordinators); err != nil {
+				log.Error().Err(err).Msg("Failed to update member")
+			}
+		}
+	}
+
 	// Update conditions
 	if len(podNamesWithScheduleTimeout) > 0 {
 		if status.Conditions.Update(api.ConditionTypePodSchedulingFailure, true,
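
Reviewer note, not part of the patch: the new AllConditionTrueSince helper only counts a condition as satisfied when it has the wanted status and its last transition happened at least `period` ago. A minimal standalone sketch of that time-window check follows; the function name conditionHeldSince and the statusMatches parameter are illustrative only and do not exist in the repository.

```go
package main

import (
	"fmt"
	"time"
)

// conditionHeldSince mirrors the per-member test used by AllConditionTrueSince:
// the condition must already have the desired status, and the last transition
// must be older than the given period (i.e. the status has been held that long).
func conditionHeldSince(statusMatches bool, lastTransition time.Time, period time.Duration) bool {
	return statusMatches && lastTransition.Add(period).Before(time.Now())
}

func main() {
	transitionedTenMinAgo := time.Now().Add(-10 * time.Minute)

	// Held for 10 minutes satisfies a 5-minute window...
	fmt.Println(conditionHeldSince(true, transitionedTenMinAgo, 5*time.Minute)) // true

	// ...but not a 30-minute window, and never when the status does not match.
	fmt.Println(conditionHeldSince(true, transitionedTenMinAgo, 30*time.Minute)) // false
	fmt.Println(conditionHeldSince(false, transitionedTenMinAgo, 5*time.Minute)) // false
}
```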