diff --git a/pkg/deployment/deployment_inspector.go b/pkg/deployment/deployment_inspector.go
index bac433f7d..5fca6d70c 100644
--- a/pkg/deployment/deployment_inspector.go
+++ b/pkg/deployment/deployment_inspector.go
@@ -121,6 +121,12 @@ func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval
 		d.CreateEvent(k8sutil.NewErrorEvent("Member failure detection failed", err, d.apiObject))
 	}
 
+	// Immediate actions
+	if err := d.reconciler.CheckDeployment(); err != nil {
+		hasError = true
+		d.CreateEvent(k8sutil.NewErrorEvent("Reconciler immediate actions failed", err, d.apiObject))
+	}
+
 	// Create scale/update plan
 	if err := d.reconciler.CreatePlan(); err != nil {
 		hasError = true
diff --git a/pkg/deployment/reconcile/reconciler.go b/pkg/deployment/reconcile/reconciler.go
index 7cd53ba15..804baa06a 100644
--- a/pkg/deployment/reconcile/reconciler.go
+++ b/pkg/deployment/reconcile/reconciler.go
@@ -22,7 +22,10 @@
 
 package reconcile
 
-import "github.com/rs/zerolog"
+import (
+	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
+	"github.com/rs/zerolog"
+)
 
 // Reconciler is the service that takes care of bring the a deployment
 // in line with its (changed) specification.
@@ -38,3 +41,45 @@ func NewReconciler(log zerolog.Logger, context Context) *Reconciler {
 		context: context,
 	}
 }
+
+// CheckDeployment looks for obviously broken deployment state and repairs it
+// right away. It only acts when the deployment mode has coordinators:
+//   - no coordinator members at all: one member is created immediately and a
+//     creation failure is returned to the caller;
+//   - all coordinators failed: every coordinator pod is deleted and the member
+//     phase is set to MemberPhaseNone; failures are logged, not returned.
+//
+// NOTE(review): the phase reset is applied to the status value obtained from
+// GetStatus and nothing here writes it back - confirm it is persisted.
+func (r *Reconciler) CheckDeployment() error {
+	spec := r.context.GetSpec()
+	// Only the status itself is needed; the second result is discarded.
+	status, _ := r.context.GetStatus()
+	if !spec.GetMode().HasCoordinators() {
+		return nil
+	}
+
+	coordinators := status.Members.Coordinators
+	switch {
+	case len(coordinators) == 0:
+		// No more coordinators! Take immediate action.
+		r.log.Error().Msg("No Coordinator members! Create one member immediately")
+		if _, err := r.context.CreateMember(api.ServerGroupCoordinators, ""); err != nil {
+			return err
+		}
+	case coordinators.AllFailed():
+		r.log.Error().Msg("All coordinators failed - reset")
+		for _, member := range coordinators {
+			// Best effort: keep resetting the remaining members on failure.
+			if err := r.context.DeletePod(member.PodName); err != nil {
+				r.log.Error().Err(err).Msg("Failed to delete pod")
+			}
+			member.Phase = api.MemberPhaseNone
+			if err := status.Members.Update(member, api.ServerGroupCoordinators); err != nil {
+				r.log.Error().Err(err).Msg("Failed to update member")
+			}
+		}
+	}
+
+	return nil
+}
diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go
index 207c37e62..33e0dd1a0 100644
--- a/pkg/deployment/resources/pod_inspector.go
+++ b/pkg/deployment/resources/pod_inspector.go
@@ -243,19 +243,6 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
 	allMembersReady := status.Members.AllMembersReady(spec.GetMode(), spec.Sync.IsEnabled())
 	status.Conditions.Update(api.ConditionTypeReady, allMembersReady, "", "")
 
-	if spec.GetMode().HasCoordinators() && status.Members.Coordinators.AllFailed() {
-		log.Error().Msg("All coordinators failed - reset")
-		for _, m := range status.Members.Coordinators {
-			if err := r.context.DeletePod(m.PodName); err != nil {
-				log.Error().Err(err).Msg("Failed to delete pod")
-			}
-			m.Phase = api.MemberPhaseNone
-			if err := status.Members.Update(m, api.ServerGroupCoordinators); err != nil {
-				log.Error().Err(err).Msg("Failed to update member")
-			}
-		}
-	}
-
 	// Update conditions
 	if len(podNamesWithScheduleTimeout) > 0 {
 		if status.Conditions.Update(api.ConditionTypePodSchedulingFailure, true,