From 45adb01f2628b202c3501ab54036d8c01eb0199d Mon Sep 17 00:00:00 2001 From: ajanikow <12255597+ajanikow@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:10:10 +0000 Subject: [PATCH 1/4] [Bugfix] [Platform] Ensure Inventory picks active leader --- CHANGELOG.md | 1 + .../reconcile/plan_builder_license.go | 61 ++++++++++++------- 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c63544737..f47e3301b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ - (Feature) (License) Activation API Integration - (Feature) (Platform) Chart & Service Kubernetes Events - (Feature) (Platform) Registry Secret +- (Bugfix) (Platform) Ensure Inventory picks active leader ## [1.3.1](https://github.com/arangodb/kube-arangodb/tree/1.3.1) (2025-10-07) - (Documentation) Add ArangoPlatformStorage Docs & Examples diff --git a/pkg/deployment/reconcile/plan_builder_license.go b/pkg/deployment/reconcile/plan_builder_license.go index 5ac97b75b..f96dd468e 100644 --- a/pkg/deployment/reconcile/plan_builder_license.go +++ b/pkg/deployment/reconcile/plan_builder_license.go @@ -117,6 +117,41 @@ func (r *Reconciler) updateClusterLicenseDiscover(spec api.DeploymentSpec, conte return "", errors.Errorf("Unable to discover License mode") } +func (r *Reconciler) updateClusterLicenseMember(spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) (api.DeploymentStatusMemberElement, bool) { + members := status.Members.AsListInGroups(arangod.GroupsWithLicenseV2()...).Filter(func(a api.DeploymentStatusMemberElement) bool { + i := a.Member.Image + if i == nil { + return false + } + + return i.ArangoDBVersion.CompareTo("3.9.0") >= 0 && i.Enterprise + }) + + if spec.Mode.Get() == api.DeploymentModeActiveFailover { + cache := context.ACS().CurrentClusterCache() + + // For AF is different + members = members.Filter(func(a api.DeploymentStatusMemberElement) bool { + pod, ok := cache.Pod().V1().GetSimple(a.Member.Pod.GetName()) + if !ok { + return false + } + + if _, ok := pod.Labels[k8sutil.LabelKeyArangoLeader]; ok { + return true + } + + return false + }) + } + + if len(members) == 0 { + return api.DeploymentStatusMemberElement{}, false + } + + return members[0], true +} + func (r *Reconciler) updateClusterLicenseKey(ctx context.Context, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan { l, err := k8sutil.GetLicenseFromSecret(context.ACS().CurrentClusterCache(), spec.License.GetSecretName()) if err != nil { @@ -129,23 +164,14 @@ func (r *Reconciler) updateClusterLicenseKey(ctx context.Context, spec api.Deplo return nil } - members := status.Members.AsListInGroups(arangod.GroupsWithLicenseV2()...).Filter(func(a api.DeploymentStatusMemberElement) bool { - i := a.Member.Image - if i == nil { - return false - } - - return i.ArangoDBVersion.CompareTo("3.9.0") >= 0 && i.Enterprise - }) + member, ok := r.updateClusterLicenseMember(spec, status, context) - if len(members) == 0 { + if !ok { // No member found to take this action r.log.Trace("No enterprise member in version 3.9.0 or above") return nil } - member := members[0] - ctxChild, cancel := globals.GetGlobals().Timeouts().ArangoD().WithTimeout(ctx) defer cancel() @@ -187,23 +213,14 @@ func (r *Reconciler) updateClusterLicenseAPI(ctx context.Context, spec api.Deplo return nil } - members := status.Members.AsListInGroups(api.ServerGroupCoordinators, api.ServerGroupSingle).Filter(func(a api.DeploymentStatusMemberElement) bool { - i := a.Member.Image - if i == nil { - return false - } - - return i.ArangoDBVersion.CompareTo("3.9.0") >= 0 && i.Enterprise - }) + member, ok := r.updateClusterLicenseMember(spec, status, context) - if len(members) == 0 { + if !ok { // No member found to take this action r.log.Trace("No enterprise member in version 3.9.0 or above") return nil } - member := members[0] - ctxChild, cancel := globals.GetGlobals().Timeouts().ArangoD().WithTimeout(ctx) defer cancel() From e96d10546e16ccf615ec9bd32b6d6c995e145eb4 Mon Sep 17 00:00:00 2001 From: ajanikow <12255597+ajanikow@users.noreply.github.com> Date: Thu, 6 Nov 2025 07:44:48 +0000 Subject: [PATCH 2/4] Iter --- .../reconcile/plan_builder_license.go | 33 ++-- pkg/deployment/resources/pod_inspector.go | 9 + pkg/deployment/resources/pod_leader.go | 176 +----------------- pkg/util/k8sutil/services.go | 2 +- 4 files changed, 23 insertions(+), 197 deletions(-) diff --git a/pkg/deployment/reconcile/plan_builder_license.go b/pkg/deployment/reconcile/plan_builder_license.go index f96dd468e..837828222 100644 --- a/pkg/deployment/reconcile/plan_builder_license.go +++ b/pkg/deployment/reconcile/plan_builder_license.go @@ -22,6 +22,7 @@ package reconcile import ( "context" + "math/rand" "time" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" @@ -117,7 +118,7 @@ func (r *Reconciler) updateClusterLicenseDiscover(spec api.DeploymentSpec, conte return "", errors.Errorf("Unable to discover License mode") } -func (r *Reconciler) updateClusterLicenseMember(spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) (api.DeploymentStatusMemberElement, bool) { +func (r *Reconciler) updateClusterLicenseMember(status api.DeploymentStatus) (api.DeploymentStatusMemberElement, bool) { members := status.Members.AsListInGroups(arangod.GroupsWithLicenseV2()...).Filter(func(a api.DeploymentStatusMemberElement) bool { i := a.Member.Image if i == nil { @@ -125,31 +126,19 @@ func (r *Reconciler) updateClusterLicenseMember(spec api.DeploymentSpec, status } return i.ArangoDBVersion.CompareTo("3.9.0") >= 0 && i.Enterprise + }).Filter(func(a api.DeploymentStatusMemberElement) bool { + return a.Member.Conditions.IsTrue(api.ConditionTypeReady) }) - if spec.Mode.Get() == api.DeploymentModeActiveFailover { - cache := context.ACS().CurrentClusterCache() - - // For AF is different - members = members.Filter(func(a api.DeploymentStatusMemberElement) bool { - pod, ok := cache.Pod().V1().GetSimple(a.Member.Pod.GetName()) - if !ok { - return false - } - - if _, ok := pod.Labels[k8sutil.LabelKeyArangoLeader]; ok { - return true - } - - return false - }) - } - if len(members) == 0 { return api.DeploymentStatusMemberElement{}, false } - return members[0], true + if len(members) == 1 { + return members[0], true + } + + return members[rand.Intn(len(members))], true } func (r *Reconciler) updateClusterLicenseKey(ctx context.Context, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan { @@ -164,7 +153,7 @@ func (r *Reconciler) updateClusterLicenseKey(ctx context.Context, spec api.Deplo return nil } - member, ok := r.updateClusterLicenseMember(spec, status, context) + member, ok := r.updateClusterLicenseMember(status) if !ok { // No member found to take this action @@ -213,7 +202,7 @@ func (r *Reconciler) updateClusterLicenseAPI(ctx context.Context, spec api.Deplo return nil } - member, ok := r.updateClusterLicenseMember(spec, status, context) + member, ok := r.updateClusterLicenseMember(status) if !ok { // No member found to take this action diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go index 0c27e612d..486f21e75 100644 --- a/pkg/deployment/resources/pod_inspector.go +++ b/pkg/deployment/resources/pod_inspector.go @@ -253,6 +253,15 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter } } + if k8sutil.IsPodReady(pod) && spec.Mode.Get() == api.DeploymentModeActiveFailover && features.FailoverLeadership().Enabled() { + if v, ok := pod.Labels[k8sutil.LabelKeyArangoLeader]; !ok || v != k8sutil.LabelValueArangoActive { + pod.Labels[k8sutil.LabelKeyArangoLeader] = k8sutil.LabelValueArangoActive + if err := r.context.ApplyPatchOnPod(ctx, pod, patch.ItemReplace(patch.NewPath("metadata", "labels"), pod.Labels)); err != nil { + log.Str("pod-name", pod.GetName()).Err(err).Error("Unable to update labels") + } + } + } + if memberStatus.Conditions.IsTrue(api.ConditionTypeActive) { if v, ok := pod.Labels[k8sutil.LabelKeyArangoActive]; !ok || v != k8sutil.LabelValueArangoActive { pod.Labels[k8sutil.LabelKeyArangoActive] = k8sutil.LabelValueArangoActive diff --git a/pkg/deployment/resources/pod_leader.go b/pkg/deployment/resources/pod_leader.go index 9af5b167a..d33f16230 100644 --- a/pkg/deployment/resources/pod_leader.go +++ b/pkg/deployment/resources/pod_leader.go @@ -22,17 +22,12 @@ package resources import ( "context" - goStrings "strings" - "sync" core "k8s.io/api/core/v1" meta "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" - "github.com/arangodb/kube-arangodb/pkg/deployment/features" "github.com/arangodb/kube-arangodb/pkg/deployment/patch" - "github.com/arangodb/kube-arangodb/pkg/util/arangod" "github.com/arangodb/kube-arangodb/pkg/util/errors" "github.com/arangodb/kube-arangodb/pkg/util/globals" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" @@ -46,7 +41,7 @@ import ( // consequentially service will not point to any pod. // It works only in active fail-over mode. func (r *Resources) EnsureLeader(ctx context.Context, cachedStatus inspectorInterface.Inspector) error { - if r.context.GetSpec().GetMode() != api.DeploymentModeActiveFailover { + if !r.context.GetSpec().GetMode().HasAgents() { return nil } @@ -129,7 +124,7 @@ func (r *Resources) EnsureLeader(ctx context.Context, cachedStatus inspectorInte return err } else { if !c { - return r.ensureSingleServerLeader(ctx, cachedStatus) + return nil } return errors.Reconcile() @@ -150,170 +145,3 @@ func (r *Resources) EnsureLeader(ctx context.Context, cachedStatus inspectorInte // The service has been created. return errors.Reconcile() } - -// getSingleServerLeaderID returns ids of a single server leaders. -func (r *Resources) getSingleServerLeaderID(ctx context.Context) ([]string, error) { - status := r.context.GetStatus() - var mutex sync.Mutex - var leaderIDs []string - var anyError error - - ctxCancel, cancel := context.WithCancel(ctx) - defer cancel() - - var wg sync.WaitGroup - for _, m := range status.Members.Single { - wg.Add(1) - go func(id string) { - defer wg.Done() - err := globals.GetGlobalTimeouts().ArangoD().RunWithTimeout(ctxCancel, func(ctxChild context.Context) error { - c, err := r.context.GetMembersState().GetMemberClient(id) - if err != nil { - return err - } - - if available, err := arangod.IsServerAvailable(ctxChild, c); err != nil { - return err - } else if !available { - return errors.New("not available") - } - - mutex.Lock() - leaderIDs = append(leaderIDs, id) - mutex.Unlock() - return nil - }) - - if err != nil { - mutex.Lock() - anyError = err - mutex.Unlock() - } - }(m.ID) - } - wg.Wait() - - if len(leaderIDs) > 0 { - return leaderIDs, nil - } - - if anyError != nil { - return nil, errors.WithMessagef(anyError, "unable to get a leader") - } - - return nil, errors.New("unable to get a leader") -} - -// setSingleServerLeadership adds or removes leadership label on a single server pod. -func (r *Resources) ensureSingleServerLeader(ctx context.Context, cachedStatus inspectorInterface.Inspector) error { - changed := false - - enabled := features.FailoverLeadership().Enabled() - var leaderID string - if enabled { - leaderIDs, err := r.getSingleServerLeaderID(ctx) - if err != nil { - return err - } - - if len(leaderIDs) == 1 { - leaderID = leaderIDs[0] - } else if len(leaderIDs) > 1 { - r.log.Error("multiple leaders found: %s. Blocking traffic to the deployment services", goStrings.Join(leaderIDs, ", ")) - } - } - - status := r.context.GetStatus() - for _, m := range status.Members.Single { - pod, exist := cachedStatus.Pod().V1().GetSimple(m.Pod.GetName()) - if !exist { - continue - } - - labels := pod.GetLabels() - if enabled && m.ID == leaderID { - if value, ok := labels[k8sutil.LabelKeyArangoLeader]; ok && value == "true" { - // Single server is available, and it has a leader label. - continue - } - - labels = addLabel(labels, k8sutil.LabelKeyArangoLeader, "true") - } else { - if _, ok := labels[k8sutil.LabelKeyArangoLeader]; !ok { - // Single server is not available, and it does not have a leader label. - continue - } - - delete(labels, k8sutil.LabelKeyArangoLeader) - } - - err := r.context.ApplyPatchOnPod(ctx, pod, patch.ItemReplace(patch.NewPath("metadata", "labels"), labels)) - if err != nil { - return errors.WithMessagef(err, "unable to change leader label for pod %s", m.Pod.GetName()) - } - changed = true - } - - if changed { - return errors.Reconcile() - } - - return r.ensureSingleServerLeaderServices(ctx, cachedStatus) -} - -// ensureSingleServerLeaderServices adds a leadership label to deployment service and external deployment service. -func (r *Resources) ensureSingleServerLeaderServices(ctx context.Context, cachedStatus inspectorInterface.Inspector) error { - // Add a leadership label to deployment service and external deployment service. - deploymentName := r.context.GetAPIObject().GetName() - changed := false - services := []string{ - k8sutil.CreateDatabaseClientServiceName(deploymentName), - k8sutil.CreateDatabaseExternalAccessServiceName(deploymentName), - } - - enabled := features.FailoverLeadership().Enabled() - for _, svcName := range services { - svc, exists := cachedStatus.Service().V1().GetSimple(svcName) - if !exists { - // It will be created later with a leadership label. - continue - } - selector := svc.Spec.Selector - if enabled { - if v, ok := selector[k8sutil.LabelKeyArangoLeader]; ok && v == "true" { - // It is already OK. - continue - } - - selector = addLabel(selector, k8sutil.LabelKeyArangoLeader, "true") - } else { - if _, ok := selector[k8sutil.LabelKeyArangoLeader]; !ok { - // Service does not have a leader label, and it should not have. - continue - } - - delete(selector, k8sutil.LabelKeyArangoLeader) - } - - parser := patch.Patch([]patch.Item{patch.ItemReplace(patch.NewPath("spec", "selector"), selector)}) - data, err := parser.Marshal() - if err != nil { - return errors.WithMessagef(err, "unable to marshal labels for service %s", svcName) - } - - err = globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctx, func(ctxChild context.Context) error { - _, err := cachedStatus.ServicesModInterface().V1().Patch(ctxChild, svcName, types.JSONPatchType, data, meta.PatchOptions{}) - return err - }) - if err != nil { - return errors.WithMessagef(err, "unable to patch labels for service %s", svcName) - } - changed = true - } - - if changed { - return errors.Reconcile() - } - - return nil -} diff --git a/pkg/util/k8sutil/services.go b/pkg/util/k8sutil/services.go index ed97109d7..af3b67a45 100644 --- a/pkg/util/k8sutil/services.go +++ b/pkg/util/k8sutil/services.go @@ -72,7 +72,7 @@ func CreateExporterClientServiceName(deploymentName string) string { // CreateAgentLeaderServiceName returns the name of the service used to access a leader agent. func CreateAgentLeaderServiceName(deploymentName string) string { - return deploymentName + "-agent" + return deploymentName + "-agent-leader" } // CreateExporterService From e29c9b54108848abc109d3492916ef93a5d9711c Mon Sep 17 00:00:00 2001 From: ajanikow <12255597+ajanikow@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:16:03 +0000 Subject: [PATCH 3/4] Iter --- pkg/deployment/resources/pod_inspector.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go index 486f21e75..32d712f4a 100644 --- a/pkg/deployment/resources/pod_inspector.go +++ b/pkg/deployment/resources/pod_inspector.go @@ -253,7 +253,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter } } - if k8sutil.IsPodReady(pod) && spec.Mode.Get() == api.DeploymentModeActiveFailover && features.FailoverLeadership().Enabled() { + if k8sutil.IsPodReady(pod) && spec.Mode.Get() == api.DeploymentModeActiveFailover && features.FailoverLeadership().Enabled() && group == api.ServerGroupSingle { if v, ok := pod.Labels[k8sutil.LabelKeyArangoLeader]; !ok || v != k8sutil.LabelValueArangoActive { pod.Labels[k8sutil.LabelKeyArangoLeader] = k8sutil.LabelValueArangoActive if err := r.context.ApplyPatchOnPod(ctx, pod, patch.ItemReplace(patch.NewPath("metadata", "labels"), pod.Labels)); err != nil { From b2f8362fd5ce41b9202855cf75bd01118c3f96df Mon Sep 17 00:00:00 2001 From: ajanikow <12255597+ajanikow@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:58:03 +0000 Subject: [PATCH 4/4] Iter --- pkg/deployment/resources/pod_inspector.go | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go index 32d712f4a..19768ff8e 100644 --- a/pkg/deployment/resources/pod_inspector.go +++ b/pkg/deployment/resources/pod_inspector.go @@ -253,11 +253,21 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter } } - if k8sutil.IsPodReady(pod) && spec.Mode.Get() == api.DeploymentModeActiveFailover && features.FailoverLeadership().Enabled() && group == api.ServerGroupSingle { - if v, ok := pod.Labels[k8sutil.LabelKeyArangoLeader]; !ok || v != k8sutil.LabelValueArangoActive { - pod.Labels[k8sutil.LabelKeyArangoLeader] = k8sutil.LabelValueArangoActive - if err := r.context.ApplyPatchOnPod(ctx, pod, patch.ItemReplace(patch.NewPath("metadata", "labels"), pod.Labels)); err != nil { - log.Str("pod-name", pod.GetName()).Err(err).Error("Unable to update labels") + if spec.Mode.Get() == api.DeploymentModeActiveFailover && features.FailoverLeadership().Enabled() && group == api.ServerGroupSingle { + s, ok := r.context.GetMembersState().MemberState(memberStatus.ID) + if ok && s.IsServing() { + if v, ok := pod.Labels[k8sutil.LabelKeyArangoLeader]; !ok || v != k8sutil.LabelValueArangoActive { + pod.Labels[k8sutil.LabelKeyArangoLeader] = k8sutil.LabelValueArangoActive + if err := r.context.ApplyPatchOnPod(ctx, pod, patch.ItemReplace(patch.NewPath("metadata", "labels"), pod.Labels)); err != nil { + log.Str("pod-name", pod.GetName()).Err(err).Error("Unable to update labels") + } + } + } else { + if _, ok := pod.Labels[k8sutil.LabelKeyArangoLeader]; ok { + delete(pod.Labels, k8sutil.LabelKeyArangoLeader) + if err := r.context.ApplyPatchOnPod(ctx, pod, patch.ItemReplace(patch.NewPath("metadata", "labels"), pod.Labels)); err != nil { + log.Str("pod-name", pod.GetName()).Err(err).Error("Unable to update labels") + } } } }