Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions pkg/apis/deployment/v1alpha/member_status_list.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ package v1alpha
import (
"math/rand"
"sort"
"time"

"github.com/pkg/errors"
v1 "k8s.io/api/core/v1"
)

// MemberStatusList is a list of MemberStatus entries
Expand Down Expand Up @@ -178,3 +180,27 @@ func (l MemberStatusList) MembersReady() int {
func (l MemberStatusList) AllMembersReady() bool {
	// All members are ready exactly when the ready count equals the list length.
	ready := l.MembersReady()
	return ready == len(l)
}

// AllConditionTrueSince returns true if every member in the list carries the
// condition cond with the given status and that condition's last transition
// happened more than period ago.
//
// A member that does not have the condition at all, has it with a different
// status, or changed to that status less than period ago makes the result
// false. An empty list yields true.
func (l MemberStatusList) AllConditionTrueSince(cond ConditionType, status v1.ConditionStatus, period time.Duration) bool {
	// Read the clock once so that every member is judged against the same
	// instant; a transition must lie strictly before this cutoff.
	cutoff := time.Now().Add(-period)

	for _, x := range l {
		c, ok := x.Conditions.Get(cond)
		if !ok || c.Status != status {
			return false
		}
		if !c.LastTransitionTime.Time.Before(cutoff) {
			// The status matches, but has not been held long enough yet.
			return false
		}
	}

	return true
}

// AllFailed reports whether the phase of every member in the list is a failed
// phase. An empty list yields true, since no member contradicts the claim.
func (l MemberStatusList) AllFailed() bool {
	for _, member := range l {
		if member.Phase.IsFailed() {
			continue
		}
		// Found a member that has not failed.
		return false
	}
	return true
}
15 changes: 12 additions & 3 deletions pkg/deployment/resilience/member_failure.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,20 @@ func (r *Resilience) CheckMemberFailure() error {
Str("id", m.ID).
Str("role", group.AsRole()).
Logger()
// Check current state
if m.Phase != api.MemberPhaseCreated {
// Phase is not Created, so we're not looking further.

// Check if there are members in phase Upgrading, Rotating or CleanOut while there is no plan
switch m.Phase {
case api.MemberPhaseNone:
continue
case api.MemberPhaseUpgrading, api.MemberPhaseRotating, api.MemberPhaseCleanOut:
if len(status.Plan) == 0 {
log.Error().Msgf("No plan but member is in phase %s - marking as failed", m.Phase)
m.Phase = api.MemberPhaseFailed
status.Members.Update(m, group)
updateStatusNeeded = true
}
}

// Check if pod is ready
if m.Conditions.IsTrue(api.ConditionTypeReady) {
// Pod is now ready, so we're not looking further
Expand Down
13 changes: 13 additions & 0 deletions pkg/deployment/resources/pod_inspector.go
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,19 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
allMembersReady := status.Members.AllMembersReady(spec.GetMode(), spec.Sync.IsEnabled())
status.Conditions.Update(api.ConditionTypeReady, allMembersReady, "", "")

if spec.GetMode().HasCoordinators() && status.Members.Coordinators.AllFailed() {
log.Error().Msg("All coordinators failed - reset")
for _, m := range status.Members.Coordinators {
if err := r.context.DeletePod(m.PodName); err != nil {
log.Error().Err(err).Msg("Failed to delete pod")
}
m.Phase = api.MemberPhaseNone
if err := status.Members.Update(m, api.ServerGroupCoordinators); err != nil {
log.Error().Err(err).Msg("Failed to update member")
}
}
}

// Update conditions
if len(podNamesWithScheduleTimeout) > 0 {
if status.Conditions.Update(api.ConditionTypePodSchedulingFailure, true,
Expand Down