From f6ad843652c1c73e6d26ea08dde2055e62e4bcbf Mon Sep 17 00:00:00 2001 From: Anton Borisov Date: Tue, 21 Apr 2026 16:03:42 +0100 Subject: [PATCH] [helm] scheduling primitives --- helm/templates/sts-coordinator.yaml | 16 ++ helm/templates/sts-tablet.yaml | 16 ++ helm/tests/scheduling_test.yaml | 209 ++++++++++++++++++ helm/values.yaml | 29 +++ .../install-deploy/deploying-with-helm.md | 80 +++++++ 5 files changed, 350 insertions(+) create mode 100644 helm/tests/scheduling_test.yaml diff --git a/helm/templates/sts-coordinator.yaml b/helm/templates/sts-coordinator.yaml index d1c25b2074..6288929a0b 100644 --- a/helm/templates/sts-coordinator.yaml +++ b/helm/templates/sts-coordinator.yaml @@ -46,6 +46,22 @@ spec: serviceAccountName: {{ .Values.serviceAccount.name | default (include "fluss.fullname" .) }} {{- end }} {{- include "fluss.imagePullSecrets" . | nindent 6 }} + {{- with .Values.coordinator.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.coordinator.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.coordinator.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.coordinator.topologySpreadConstraints }} + topologySpreadConstraints: + {{- toYaml . | nindent 8 }} + {{- end }} {{- with .Values.coordinator.initContainers }} initContainers: {{- toYaml . | nindent 8 }} diff --git a/helm/templates/sts-tablet.yaml b/helm/templates/sts-tablet.yaml index 83ed7057b9..60abb11005 100644 --- a/helm/templates/sts-tablet.yaml +++ b/helm/templates/sts-tablet.yaml @@ -46,6 +46,22 @@ spec: serviceAccountName: {{ .Values.serviceAccount.name | default (include "fluss.fullname" .) }} {{- end }} {{- include "fluss.imagePullSecrets" . | nindent 6 }} + {{- with .Values.tablet.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tablet.nodeSelector }} + nodeSelector: + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- with .Values.tablet.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tablet.topologySpreadConstraints }} + topologySpreadConstraints: + {{- toYaml . | nindent 8 }} + {{- end }} {{- with .Values.tablet.initContainers }} initContainers: {{- toYaml . | nindent 8 }} diff --git a/helm/tests/scheduling_test.yaml b/helm/tests/scheduling_test.yaml new file mode 100644 index 0000000000..7c52155449 --- /dev/null +++ b/helm/tests/scheduling_test.yaml @@ -0,0 +1,209 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +suite: scheduling-defaults +templates: + - templates/sts-tablet.yaml + - templates/sts-coordinator.yaml +tests: + - it: does not render scheduling fields by default for tablet + template: templates/sts-tablet.yaml + asserts: + - isNull: + path: spec.template.spec.affinity + - isNull: + path: spec.template.spec.nodeSelector + - isNull: + path: spec.template.spec.tolerations + - isNull: + path: spec.template.spec.topologySpreadConstraints + - it: does not render scheduling fields by default for coordinator + template: templates/sts-coordinator.yaml + asserts: + - isNull: + path: spec.template.spec.affinity + - isNull: + path: spec.template.spec.nodeSelector + - isNull: + path: spec.template.spec.tolerations + - isNull: + path: spec.template.spec.topologySpreadConstraints + +--- + +suite: tablet-affinity +templates: + - templates/sts-tablet.yaml +tests: + - it: renders pod anti-affinity for tablet server + set: + tablet.affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/component: tablet + topologyKey: kubernetes.io/hostname + asserts: + - equal: + path: spec.template.spec.affinity.podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecution[0].topologyKey + value: kubernetes.io/hostname + - equal: + path: spec.template.spec.affinity.podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecution[0].labelSelector.matchLabels["app.kubernetes.io/component"] + value: tablet + +--- + +suite: coordinator-affinity +templates: + - templates/sts-coordinator.yaml +tests: + - it: renders pod anti-affinity for coordinator server + set: + coordinator.affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/component: coordinator + topologyKey: kubernetes.io/hostname + asserts: + - equal: + path: 
spec.template.spec.affinity.podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecution[0].weight + value: 100 + - equal: + path: spec.template.spec.affinity.podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecution[0].podAffinityTerm.topologyKey + value: kubernetes.io/hostname + +--- + +suite: tablet-node-selector +templates: + - templates/sts-tablet.yaml +tests: + - it: renders node selector for tablet server + set: + tablet.nodeSelector: + disktype: ssd + workload: fluss + asserts: + - equal: + path: spec.template.spec.nodeSelector.disktype + value: ssd + - equal: + path: spec.template.spec.nodeSelector.workload + value: fluss + +--- + +suite: tablet-tolerations +templates: + - templates/sts-tablet.yaml +tests: + - it: renders tolerations for tablet server + set: + tablet.tolerations: + - key: dedicated + operator: Equal + value: fluss + effect: NoSchedule + asserts: + - contains: + path: spec.template.spec.tolerations + content: + key: dedicated + operator: Equal + value: fluss + effect: NoSchedule + +--- + +suite: tablet-topology-spread-constraints +templates: + - templates/sts-tablet.yaml +tests: + - it: renders topology spread constraints for tablet server + set: + tablet.topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule + labelSelector: + matchLabels: + app.kubernetes.io/component: tablet + asserts: + - contains: + path: spec.template.spec.topologySpreadConstraints + content: + maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule + labelSelector: + matchLabels: + app.kubernetes.io/component: tablet + +--- + +suite: coordinator-node-selector-and-tolerations +templates: + - templates/sts-coordinator.yaml +tests: + - it: renders node selector for coordinator server + set: + coordinator.nodeSelector: + workload: fluss + asserts: + - equal: + path: spec.template.spec.nodeSelector.workload + value: fluss + - it: renders tolerations for coordinator 
server + set: + coordinator.tolerations: + - key: dedicated + operator: Equal + value: fluss + effect: NoSchedule + asserts: + - contains: + path: spec.template.spec.tolerations + content: + key: dedicated + operator: Equal + value: fluss + effect: NoSchedule + - it: renders topology spread constraints for coordinator server + set: + coordinator.topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/component: coordinator + asserts: + - contains: + path: spec.template.spec.topologySpreadConstraints + content: + maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/component: coordinator diff --git a/helm/values.yaml b/helm/values.yaml index f306f633c2..c92d5ff943 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -54,6 +54,31 @@ tablet: enabled: false # minAvailable: 1 # maxUnavailable: 1 + # Pod scheduling configuration + affinity: {} + # Example: spread tablet server pods across availability zones and nodes. + # If multiple Fluss releases share the cluster, also scope by instance via + # `app.kubernetes.io/instance: <release-name>` to avoid cross-release anti-affinity. 
+ # affinity: + # podAntiAffinity: + # preferredDuringSchedulingIgnoredDuringExecution: + # - weight: 100 + # podAffinityTerm: + # topologyKey: topology.kubernetes.io/zone + # labelSelector: + # matchLabels: + # app.kubernetes.io/name: fluss + # app.kubernetes.io/component: tablet + # - weight: 50 + # podAffinityTerm: + # topologyKey: kubernetes.io/hostname + # labelSelector: + # matchLabels: + # app.kubernetes.io/name: fluss + # app.kubernetes.io/component: tablet + nodeSelector: {} + tolerations: [] + topologySpreadConstraints: [] coordinator: numberOfReplicas: 1 @@ -72,6 +97,10 @@ coordinator: enabled: false # minAvailable: 1 # maxUnavailable: 1 + affinity: {} + nodeSelector: {} + tolerations: [] + topologySpreadConstraints: [] # Fluss listener configurations listeners: diff --git a/website/docs/install-deploy/deploying-with-helm.md b/website/docs/install-deploy/deploying-with-helm.md index fb5c910395..9b59060199 100644 --- a/website/docs/install-deploy/deploying-with-helm.md +++ b/website/docs/install-deploy/deploying-with-helm.md @@ -240,6 +240,19 @@ It is recommended to set these explicitly in production. 
|-----------|-------------|---------| | `tablet.numberOfReplicas` | Number of TabletServer replicas to deploy | `3` | +### Scheduling Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `tablet.affinity` | Affinity rules for TabletServer pods | `{}` | +| `tablet.nodeSelector` | Node selector for TabletServer pods | `{}` | +| `tablet.tolerations` | Tolerations for TabletServer pods | `[]` | +| `tablet.topologySpreadConstraints` | Topology spread constraints for TabletServer pods | `[]` | +| `coordinator.affinity` | Affinity rules for CoordinatorServer pods | `{}` | +| `coordinator.nodeSelector` | Node selector for CoordinatorServer pods | `{}` | +| `coordinator.tolerations` | Tolerations for CoordinatorServer pods | `[]` | +| `coordinator.topologySpreadConstraints` | Topology spread constraints for CoordinatorServer pods | `[]` | + ### Storage Parameters | Parameter | Description | Default | @@ -487,6 +500,73 @@ configurationOverrides: remote.data.dir: "s3://my-bucket/fluss-data" ``` +### Pod Scheduling + +By default, Kubernetes may schedule all tablet server pods on the same node. With replication factor 3, a single node failure could take out all replicas simultaneously, causing data loss for segments not yet tiered to remote storage. + +Use pod anti-affinity to spread tablet server pods across availability zones and nodes: + +```yaml +tablet: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + topologyKey: topology.kubernetes.io/zone + labelSelector: + matchLabels: + app.kubernetes.io/name: fluss + app.kubernetes.io/component: tablet + - weight: 50 + podAffinityTerm: + topologyKey: kubernetes.io/hostname + labelSelector: + matchLabels: + app.kubernetes.io/name: fluss + app.kubernetes.io/component: tablet +``` + +This configuration prioritizes zone-level spreading (weight 100) while also avoiding co-location on the same node (weight 50). 
For stricter guarantees, use `requiredDuringSchedulingIgnoredDuringExecution` instead — but note that pods will stay pending if no suitable node is available. + +If multiple Fluss releases share the cluster, also scope the selector by instance via `app.kubernetes.io/instance: <release-name>` to avoid cross-release anti-affinity. + +Alternatively, use `topologySpreadConstraints` for even distribution across failure domains: + +```yaml +tablet: + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/name: fluss + app.kubernetes.io/component: tablet + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/name: fluss + app.kubernetes.io/component: tablet +``` + +You can also pin pods to specific nodes using `nodeSelector` or allow scheduling on tainted nodes with `tolerations`: + +```yaml +tablet: + nodeSelector: + workload: fluss + tolerations: + - key: dedicated + operator: Equal + value: fluss + effect: NoSchedule +``` + +The same scheduling fields are available for coordinator servers under `coordinator.affinity`, `coordinator.nodeSelector`, `coordinator.tolerations`, and `coordinator.topologySpreadConstraints`. + ### Loading Filesystem Plugins via Init Containers Fluss discovers filesystem plugins at startup by scanning subdirectories under `$FLUSS_HOME/plugins/`.