Skip to content

Commit ec1f360

Browse files
authored
Merge pull request #362 from arangodb/bug-fix/all-coordinators-gone-fix-2
Coordinators
2 parents 241e91a + 515c453 commit ec1f360

File tree

3 files changed

+41
-14
lines changed

3 files changed

+41
-14
lines changed

pkg/deployment/deployment_inspector.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,12 @@ func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval
121121
d.CreateEvent(k8sutil.NewErrorEvent("Member failure detection failed", err, d.apiObject))
122122
}
123123

124+
// Immediate actions
125+
if err := d.reconciler.CheckDeployment(); err != nil {
126+
hasError = true
127+
d.CreateEvent(k8sutil.NewErrorEvent("Reconciler immediate actions failed", err, d.apiObject))
128+
}
129+
124130
// Create scale/update plan
125131
if err := d.reconciler.CreatePlan(); err != nil {
126132
hasError = true

pkg/deployment/reconcile/reconciler.go

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@
2222

2323
package reconcile
2424

25-
import "github.com/rs/zerolog"
25+
import (
26+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
27+
"github.com/rs/zerolog"
28+
)
2629

2730
// Reconciler is the service that takes care of bringing a deployment
2831
// in line with its (changed) specification.
@@ -38,3 +41,34 @@ func NewReconciler(log zerolog.Logger, context Context) *Reconciler {
3841
context: context,
3942
}
4043
}
44+
45+
// CheckDeployment checks for obviously broken things and fixes them immediately
//
// It reads the deployment spec and status from the reconciler context and,
// when the deployment mode has coordinators, handles two situations:
//   - the status lists no coordinator members: one coordinator member is
//     created via the context, and a creation error is returned to the caller;
//   - every listed coordinator has failed: each coordinator's pod is deleted
//     and the member's phase is reset to api.MemberPhaseNone. Errors in this
//     branch are logged only and do not abort the loop.
//
// In every other case it returns nil.
46+
func (r *Reconciler) CheckDeployment() error {
47+
spec := r.context.GetSpec()
48+
// The second value returned by GetStatus is discarded here.
// NOTE(review): presumably a status version token - confirm against Context.
status, _ := r.context.GetStatus()
49+
50+
if spec.GetMode().HasCoordinators() {
51+
// Check if there are coordinators
52+
if len(status.Members.Coordinators) == 0 {
53+
// No more coordinators! Take immediate action
54+
r.log.Error().Msg("No Coordinator members! Create one member immediately")
55+
// The empty string is the second argument to CreateMember.
// NOTE(review): presumably lets the context pick the member ID - confirm.
_, err := r.context.CreateMember(api.ServerGroupCoordinators, "")
56+
if err != nil {
57+
return err
58+
}
59+
} else if status.Members.Coordinators.AllFailed() {
60+
r.log.Error().Msg("All coordinators failed - reset")
61+
for _, m := range status.Members.Coordinators {
62+
// Best effort: a failed pod deletion is logged and the member is still reset.
if err := r.context.DeletePod(m.PodName); err != nil {
63+
r.log.Error().Err(err).Msg("Failed to delete pod")
64+
}
65+
m.Phase = api.MemberPhaseNone
66+
// NOTE(review): this updates the local status value only; no call that
// writes the status back through the context is visible in this function.
// Confirm that the change is persisted elsewhere.
if err := status.Members.Update(m, api.ServerGroupCoordinators); err != nil {
67+
r.log.Error().Err(err).Msg("Failed to update member")
68+
}
69+
}
70+
}
71+
}
72+
73+
return nil
74+
}

pkg/deployment/resources/pod_inspector.go

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -243,19 +243,6 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
243243
allMembersReady := status.Members.AllMembersReady(spec.GetMode(), spec.Sync.IsEnabled())
244244
status.Conditions.Update(api.ConditionTypeReady, allMembersReady, "", "")
245245

246-
if spec.GetMode().HasCoordinators() && status.Members.Coordinators.AllFailed() {
247-
log.Error().Msg("All coordinators failed - reset")
248-
for _, m := range status.Members.Coordinators {
249-
if err := r.context.DeletePod(m.PodName); err != nil {
250-
log.Error().Err(err).Msg("Failed to delete pod")
251-
}
252-
m.Phase = api.MemberPhaseNone
253-
if err := status.Members.Update(m, api.ServerGroupCoordinators); err != nil {
254-
log.Error().Err(err).Msg("Failed to update member")
255-
}
256-
}
257-
}
258-
259246
// Update conditions
260247
if len(podNamesWithScheduleTimeout) > 0 {
261248
if status.Conditions.Update(api.ConditionTypePodSchedulingFailure, true,

0 commit comments

Comments
 (0)