diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bc2e5f99..74fd461f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ - Split & Unify Lifecycle management functionality - Drop support for ArangoDB <= 3.5 (versions already EOL) - Add new admin commands to fetch agency dump and agency state +- Add Graceful shutdown as finalizer (supports kubectl delete) ## [1.2.4](https://github.com/arangodb/kube-arangodb/tree/1.2.4) (2021-10-22) - Replace `beta.kubernetes.io/arch` Pod label with `kubernetes.io/arch` using Silent Rotation diff --git a/go.mod b/go.mod index 0927d6ae3..1be1c32e1 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/arangodb/kube-arangodb go 1.16 replace ( - github.com/arangodb/go-driver => github.com/arangodb/go-driver v0.0.0-20210621075908-e7a6fa0cbd18 + github.com/arangodb/go-driver => github.com/arangodb/go-driver v1.2.1 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring => github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.46.0 github.com/prometheus-operator/prometheus-operator/pkg/client => github.com/prometheus-operator/prometheus-operator/pkg/client v0.46.0 github.com/stretchr/testify => github.com/stretchr/testify v1.5.1 @@ -25,8 +25,8 @@ replace ( require ( github.com/arangodb-helper/go-certificates v0.0.0-20180821055445-9fca24fc2680 github.com/arangodb/arangosync-client v0.7.0 - github.com/arangodb/go-driver v0.0.0-20210621075908-e7a6fa0cbd18 - github.com/arangodb/go-driver/v2 v2.0.0-20211001173946-eafa9b638e13 + github.com/arangodb/go-driver v1.2.1 + github.com/arangodb/go-driver/v2 v2.0.0-20211021031401-d92dcd5a4c83 github.com/arangodb/go-upgrade-rules v0.0.0-20180809110947-031b4774ff21 github.com/cenkalti/backoff v2.2.1+incompatible github.com/dchest/uniuri v0.0.0-20160212164326-8902c56451e9 diff --git a/go.sum b/go.sum index ed5f4228f..80caf9c4a 100644 --- a/go.sum +++ b/go.sum @@ -44,8 +44,10 @@ github.com/arangodb/arangosync-client v0.7.0 h1:3vLOVnMyr5vGlPA0OHxJL9Wyy49JJwN0 github.com/arangodb/arangosync-client v0.7.0/go.mod h1:g+JcxH3C63wKaJPnPr9nggYoGbt/bYCWpfcRG0NSodY= github.com/arangodb/go-driver v0.0.0-20210621075908-e7a6fa0cbd18 h1:3J0tqp5eQ8ptGOeeu7vo92RKf24bOA7MFy0z3uPiTWg= github.com/arangodb/go-driver v0.0.0-20210621075908-e7a6fa0cbd18/go.mod h1:3NUekcRLpgheFIGEwcOvxilEW73MV1queNKW58k7sdc= -github.com/arangodb/go-driver/v2 v2.0.0-20211001173946-eafa9b638e13 h1:5egTRo3Met3xXUVj/Pbn1gXeY2C4bQZycJoHSnndfig= -github.com/arangodb/go-driver/v2 v2.0.0-20211001173946-eafa9b638e13/go.mod h1:X3uG4XbfQS35AjsFJLwNLyA6UZofNV5ufe2KoNxcMO0= +github.com/arangodb/go-driver v1.2.1 h1:HREDHhDmzdIWxHmfkfTESbYUnRjESjPh4WUuXq7FZa8= +github.com/arangodb/go-driver v1.2.1/go.mod h1:zdDkJJnCj8DAkfbtIjIXnsTrWIiy6VhP3Vy14p+uQeY= +github.com/arangodb/go-driver/v2 v2.0.0-20211021031401-d92dcd5a4c83 h1:PCbi3alUFastUw6InBKGEXqniveJJcQuMYspubJMRS8= +github.com/arangodb/go-driver/v2 v2.0.0-20211021031401-d92dcd5a4c83/go.mod h1:B8byYwvt1mDOQzpjiMuDTP5jOif/Y5dcEJtkdvPB7HY= github.com/arangodb/go-upgrade-rules v0.0.0-20180809110947-031b4774ff21 h1:+W7D5ttxi/Ygh/39vialtypE23p9KI7P0J2qtoqUV4w= github.com/arangodb/go-upgrade-rules v0.0.0-20180809110947-031b4774ff21/go.mod h1:RkPIG6JJ2pcJUoymc18NxAJGraZd+iAEVnOTDjZey/w= github.com/arangodb/go-velocypack v0.0.0-20200318135517-5af53c29c67e h1:Xg+hGrY2LcQBbxd0ZFdbGSyRKTYMZCfBbw/pMJFOk1g= @@ -184,6 +186,7 @@ github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXP github.com/golang-jwt/jwt v3.2.1+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I= github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY= github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= diff --git a/lifecycle.go b/lifecycle.go index 54e626761..6626b9a6e 100644 --- a/lifecycle.go +++ b/lifecycle.go @@ -25,10 +25,16 @@ package main import ( "context" "io" + "net" "os" "path/filepath" + "strconv" "time" + "github.com/arangodb/kube-arangodb/pkg/backup/utils" + + "github.com/arangodb/kube-arangodb/pkg/util/retry" + "github.com/arangodb/kube-arangodb/pkg/version" "github.com/spf13/cobra" @@ -54,6 +60,10 @@ var ( Run: cmdLifecyclePreStopRunFinalizer, Hidden: true, } + cmdLifecyclePreStopPort = &cobra.Command{ + Use: "port", + Hidden: true, + } cmdLifecycleCopy = &cobra.Command{ Use: "copy", Run: cmdLifecycleCopyRun, @@ -68,7 +78,15 @@ var ( func init() { cmdMain.AddCommand(cmdLifecycle) - cmdLifecyclePreStop.AddCommand(cmdLifecyclePreStopFinalizers) + var preStopPort cmdLifecyclePreStopRunPort + + cmdLifecyclePreStopPort.RunE = preStopPort.run + + f := cmdLifecyclePreStopPort.Flags() + + f.DurationVar(&preStopPort.timeout, "timeout", 6*60*time.Minute, "PreStopTimeout") + + cmdLifecyclePreStop.AddCommand(cmdLifecyclePreStopFinalizers, cmdLifecyclePreStopPort) cmdLifecycle.AddCommand(cmdLifecyclePreStop) cmdLifecycle.AddCommand(cmdLifecycleCopy) @@ -162,3 +180,63 @@ func cmdLifecycleCopyRun(cmd *cobra.Command, args []string) { cliLog.Info().Msgf("Executable copied to %s", targetPath) } + +type cmdLifecyclePreStopRunPort struct { + timeout time.Duration +} + +// Wait until port 8529 is closed. +func (c *cmdLifecyclePreStopRunPort) run(cmd *cobra.Command, args []string) error { + address := net.JoinHostPort("127.0.0.1", strconv.Itoa(k8sutil.ArangoPort)) + + // Get environment + namespace := os.Getenv(constants.EnvOperatorPodNamespace) + if len(namespace) == 0 { + cliLog.Fatal().Msgf("%s environment variable missing", constants.EnvOperatorPodNamespace) + } + name := os.Getenv(constants.EnvOperatorPodName) + if len(name) == 0 { + cliLog.Fatal().Msgf("%s environment variable missing", constants.EnvOperatorPodName) + } + + // Create kubernetes client + kubecli, err := k8sutil.NewKubeClient() + if err != nil { + cliLog.Fatal().Err(err).Msg("Failed to create Kubernetes client") + } + + pods := kubecli.CoreV1().Pods(namespace) + + recentErrors := 0 + + return retry.NewTimeout(func() error { + conn, err := net.DialTimeout("tcp", address, 500*time.Millisecond) + + if err != nil { + return retry.Interrput() + } + + conn.Close() + + p, err := pods.Get(context.Background(), name, metav1.GetOptions{}) + if k8sutil.IsNotFound(err) { + cliLog.Warn().Msg("Pod not found") + return nil + } else if err != nil { + recentErrors++ + cliLog.Error().Err(err).Msg("Failed to get pod") + if recentErrors > 20 { + cliLog.Fatal().Err(err).Msg("Too many recent errors") + return nil + } + } else { + // We got our pod + finalizers := utils.StringList(p.GetFinalizers()) + if !finalizers.Has(constants.FinalizerPodGracefulShutdown) { + return retry.Interrput() + } + } + + return nil + }).Timeout(125*time.Millisecond, c.timeout) +} diff --git a/pkg/apis/deployment/v1/plan.go b/pkg/apis/deployment/v1/plan.go index 55922cedf..aa0248367 100644 --- a/pkg/apis/deployment/v1/plan.go +++ b/pkg/apis/deployment/v1/plan.go @@ -73,6 +73,8 @@ const ( ActionTypeShutdownMember ActionType = "ShutdownMember" // ActionTypeResignLeadership causes a member to resign leadership. ActionTypeResignLeadership ActionType = "ResignLeadership" + // ActionTypeKillMemberPod causes a pod to get delete request. It also waits until Delay finalizer will be removed. + ActionTypeKillMemberPod ActionType = "KillMemberPod" // ActionTypeRotateMember causes a member to be shutdown and have it's pod removed. ActionTypeRotateMember ActionType = "RotateMember" // ActionTypeRotateStartMember causes a member to be shutdown and have it's pod removed. Do not wait to pod recover. diff --git a/pkg/apis/deployment/v1/server_group.go b/pkg/apis/deployment/v1/server_group.go index 0baefb9b7..ddaa9ed8a 100644 --- a/pkg/apis/deployment/v1/server_group.go +++ b/pkg/apis/deployment/v1/server_group.go @@ -148,6 +148,8 @@ func (g ServerGroup) DefaultTerminationGracePeriod() time.Duration { return time.Minute case ServerGroupDBServers: return time.Hour + case ServerGroupCoordinators: + return time.Hour default: return time.Second * 30 } diff --git a/pkg/apis/deployment/v1/server_group_spec.go b/pkg/apis/deployment/v1/server_group_spec.go index 30c22eb87..fc3e18a72 100644 --- a/pkg/apis/deployment/v1/server_group_spec.go +++ b/pkg/apis/deployment/v1/server_group_spec.go @@ -682,3 +682,16 @@ func (s *ServerGroupSpec) GetEntrypoint(defaultEntrypoint string) string { return *s.Entrypoint } + +// GetShutdownDelay returns defined or default Group ShutdownDelay in seconds +func (s ServerGroupSpec) GetShutdownDelay(group ServerGroup) int { + if s.ShutdownDelay == nil { + switch group { + case ServerGroupCoordinators: + return 3 + default: + return 0 + } + } + return *s.ShutdownDelay +} diff --git a/pkg/apis/deployment/v2alpha1/deployment_metrics_spec.go b/pkg/apis/deployment/v2alpha1/deployment_metrics_spec.go index c1fdf0b67..5bee1ac1e 100644 --- a/pkg/apis/deployment/v2alpha1/deployment_metrics_spec.go +++ b/pkg/apis/deployment/v2alpha1/deployment_metrics_spec.go @@ -41,6 +41,7 @@ func (m MetricsMode) New() *MetricsMode { return &m } +// deprecated func (m MetricsMode) GetMetricsEndpoint() string { switch m { case MetricsModeInternal: @@ -51,9 +52,12 @@ func (m MetricsMode) GetMetricsEndpoint() string { } const ( + // deprecated // MetricsModeExporter exporter mode for old exporter type MetricsModeExporter MetricsMode = "exporter" - MetricsModeSidecar MetricsMode = "sidecar" + // deprecated + MetricsModeSidecar MetricsMode = "sidecar" + // deprecated MetricsModeInternal MetricsMode = "internal" ) @@ -67,12 +71,14 @@ func (m *MetricsMode) Get() MetricsMode { // MetricsSpec contains spec for arangodb exporter type MetricsSpec struct { - Enabled *bool `json:"enabled,omitempty"` + Enabled *bool `json:"enabled,omitempty"` + // deprecated Image *string `json:"image,omitempty"` Authentication MetricsAuthenticationSpec `json:"authentication,omitempty"` Resources v1.ResourceRequirements `json:"resources,omitempty"` - Mode *MetricsMode `json:"mode,omitempty"` - TLS *bool `json:"tls,omitempty"` + // deprecated + Mode *MetricsMode `json:"mode,omitempty"` + TLS *bool `json:"tls,omitempty"` ServiceMonitor *MetricsServiceMonitorSpec `json:"serviceMonitor,omitempty"` @@ -100,11 +106,13 @@ func (s *MetricsSpec) IsEnabled() bool { return util.BoolOrDefault(s.Enabled, false) } +// deprecated // HasImage returns whether a image was specified or not func (s *MetricsSpec) HasImage() bool { return s.Image != nil } +// deprecated // GetImage returns the Image or empty string func (s *MetricsSpec) GetImage() string { return util.StringOrDefault(s.Image) diff --git a/pkg/apis/deployment/v2alpha1/plan.go b/pkg/apis/deployment/v2alpha1/plan.go index 3d737e9fc..f388f53d4 100644 --- a/pkg/apis/deployment/v2alpha1/plan.go +++ b/pkg/apis/deployment/v2alpha1/plan.go @@ -73,6 +73,8 @@ const ( ActionTypeShutdownMember ActionType = "ShutdownMember" // ActionTypeResignLeadership causes a member to resign leadership. ActionTypeResignLeadership ActionType = "ResignLeadership" + // ActionTypeKillMemberPod causes a pod to get delete request. It also waits until Delay finalizer will be removed. + ActionTypeKillMemberPod ActionType = "KillMemberPod" // ActionTypeRotateMember causes a member to be shutdown and have it's pod removed. ActionTypeRotateMember ActionType = "RotateMember" // ActionTypeRotateStartMember causes a member to be shutdown and have it's pod removed. Do not wait to pod recover. diff --git a/pkg/apis/deployment/v2alpha1/server_group.go b/pkg/apis/deployment/v2alpha1/server_group.go index 1087b2a44..5aa77f1de 100644 --- a/pkg/apis/deployment/v2alpha1/server_group.go +++ b/pkg/apis/deployment/v2alpha1/server_group.go @@ -148,6 +148,8 @@ func (g ServerGroup) DefaultTerminationGracePeriod() time.Duration { return time.Minute case ServerGroupDBServers: return time.Hour + case ServerGroupCoordinators: + return time.Hour default: return time.Second * 30 } diff --git a/pkg/apis/deployment/v2alpha1/server_group_containers.go b/pkg/apis/deployment/v2alpha1/server_group_containers.go new file mode 100644 index 000000000..d6621adf3 --- /dev/null +++ b/pkg/apis/deployment/v2alpha1/server_group_containers.go @@ -0,0 +1,35 @@ +// +// DISCLAIMER +// +// Copyright 2016-2021 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// + +package v2alpha1 + +const ( + ServerGroupReservedContainerNameServer = "server" + ServerGroupReservedContainerNameExporter = "exporter" +) + +func IsReservedServerGroupContainerName(name string) bool { + switch name { + case ServerGroupReservedContainerNameServer, ServerGroupReservedContainerNameExporter: + return true + default: + return false + } +} diff --git a/pkg/apis/deployment/v2alpha1/server_group_spec.go b/pkg/apis/deployment/v2alpha1/server_group_spec.go index a107c1f0a..fb18be22a 100644 --- a/pkg/apis/deployment/v2alpha1/server_group_spec.go +++ b/pkg/apis/deployment/v2alpha1/server_group_spec.go @@ -682,3 +682,16 @@ func (s *ServerGroupSpec) GetEntrypoint(defaultEntrypoint string) string { return *s.Entrypoint } + +// GetShutdownDelay returns defined or default Group ShutdownDelay in seconds +func (s ServerGroupSpec) GetShutdownDelay(group ServerGroup) int { + if s.ShutdownDelay == nil { + switch group { + case ServerGroupCoordinators: + return 3 + default: + return 0 + } + } + return *s.ShutdownDelay +} diff --git a/pkg/deployment/deployment_core_test.go b/pkg/deployment/deployment_core_test.go index b8947db5f..b704eabe8 100644 --- a/pkg/deployment/deployment_core_test.go +++ b/pkg/deployment/deployment_core_test.go @@ -890,67 +890,12 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { }, }, }, - { - Name: "Agent Pod can not have metrics exporter", - ArangoDeployment: &api.ArangoDeployment{ - Spec: api.DeploymentSpec{ - Image: util.NewString(testImage), - Authentication: noAuthentication, - TLS: noTLS, - Metrics: metricsSpec, - }, - }, - Helper: func(t *testing.T, deployment *Deployment, testCase *testCaseStruct) { - deployment.status.last = api.DeploymentStatus{ - Members: api.DeploymentStatusMembers{ - Agents: api.MemberStatusList{ - firstAgentStatus, - }, - }, - Images: createTestImages(false), - } - - testCase.createTestPodData(deployment, api.ServerGroupAgents, firstAgentStatus) - testCase.ExpectedPod.ObjectMeta.Labels[k8sutil.LabelKeyArangoExporter] = testYes - }, - ExpectedEvent: "member agent is created", - ExpectedPod: core.Pod{ - Spec: core.PodSpec{ - Volumes: []core.Volume{ - k8sutil.CreateVolumeEmptyDir(k8sutil.ArangodVolumeName), - k8sutil.CreateVolumeWithSecret(k8sutil.ExporterJWTVolumeName, testExporterToken), - }, - Containers: []core.Container{ - { - Name: k8sutil.ServerContainerName, - Image: testImage, - Command: createTestCommandForAgent(firstAgentStatus.ID, false, false, false), - Ports: createTestPorts(), - VolumeMounts: []core.VolumeMount{ - k8sutil.ArangodVolumeMount(), - }, - Resources: emptyResources, - LivenessProbe: createTestLivenessProbe(httpProbe, false, "", k8sutil.ArangoPort), - ImagePullPolicy: core.PullIfNotPresent, - SecurityContext: securityContext.NewSecurityContext(), - }, - testArangodbInternalExporterContainer(false, emptyResources), - }, - RestartPolicy: core.RestartPolicyNever, - TerminationGracePeriodSeconds: &defaultAgentTerminationTimeout, - Hostname: testDeploymentName + "-" + api.ServerGroupAgentsString + "-" + firstAgentStatus.ID, - Subdomain: testDeploymentName + "-int", - Affinity: k8sutil.CreateAffinity(testDeploymentName, api.ServerGroupAgentsString, - false, ""), - }, - }, - }, { Name: "DBserver Pod with internal metrics exporter", ArangoDeployment: &api.ArangoDeployment{ Spec: api.DeploymentSpec{ Image: util.NewString(testImage), - Authentication: noAuthentication, + Authentication: authenticationSpec, TLS: noTLS, Metrics: metricsSpec, }, @@ -967,6 +912,12 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { testCase.createTestPodData(deployment, api.ServerGroupDBServers, firstDBServerStatus) testCase.ExpectedPod.ObjectMeta.Labels[k8sutil.LabelKeyArangoExporter] = testYes + + authorization, err := createTestToken(deployment, testCase, []string{"/_api/version"}) + require.NoError(t, err) + + testCase.ExpectedPod.Spec.Containers[0].LivenessProbe = createTestLivenessProbe(httpProbe, false, + authorization, k8sutil.ArangoPort) }, ExpectedEvent: "member dbserver is created", ExpectedPod: core.Pod{ @@ -974,22 +925,23 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { Volumes: []core.Volume{ k8sutil.CreateVolumeEmptyDir(k8sutil.ArangodVolumeName), k8sutil.CreateVolumeWithSecret(k8sutil.ExporterJWTVolumeName, testExporterToken), + k8sutil.CreateVolumeWithSecret(k8sutil.ClusterJWTSecretVolumeName, testJWTSecretName), }, Containers: []core.Container{ { Name: k8sutil.ServerContainerName, Image: testImage, - Command: createTestCommandForDBServer(firstDBServerStatus.ID, false, false, false), + Command: createTestCommandForDBServer(firstDBServerStatus.ID, false, true, false), Ports: createTestPorts(), VolumeMounts: []core.VolumeMount{ k8sutil.ArangodVolumeMount(), + k8sutil.ClusterJWTVolumeMount(), }, Resources: emptyResources, - LivenessProbe: createTestLivenessProbe(httpProbe, false, "", k8sutil.ArangoPort), ImagePullPolicy: core.PullIfNotPresent, SecurityContext: securityContext.NewSecurityContext(), }, - testArangodbInternalExporterContainer(false, emptyResources), + testArangodbInternalExporterContainer(false, true, emptyResources), }, RestartPolicy: core.RestartPolicyNever, TerminationGracePeriodSeconds: &defaultDBServerTerminationTimeout, @@ -1005,7 +957,7 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { ArangoDeployment: &api.ArangoDeployment{ Spec: api.DeploymentSpec{ Image: util.NewString(testImage), - Authentication: noAuthentication, + Authentication: authenticationSpec, TLS: noTLS, Metrics: api.MetricsSpec{ Enabled: util.NewBool(true), @@ -1029,6 +981,12 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { testCase.createTestPodData(deployment, api.ServerGroupDBServers, firstDBServerStatus) testCase.ExpectedPod.ObjectMeta.Labels[k8sutil.LabelKeyArangoExporter] = testYes + + authorization, err := createTestToken(deployment, testCase, []string{"/_api/version"}) + require.NoError(t, err) + + testCase.ExpectedPod.Spec.Containers[0].LivenessProbe = createTestLivenessProbe(httpProbe, false, + authorization, k8sutil.ArangoPort) }, ExpectedEvent: "member dbserver is created", ExpectedPod: core.Pod{ @@ -1036,22 +994,23 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { Volumes: []core.Volume{ k8sutil.CreateVolumeEmptyDir(k8sutil.ArangodVolumeName), k8sutil.CreateVolumeWithSecret(k8sutil.ExporterJWTVolumeName, testExporterToken), + k8sutil.CreateVolumeWithSecret(k8sutil.ClusterJWTSecretVolumeName, testJWTSecretName), }, Containers: []core.Container{ { Name: k8sutil.ServerContainerName, Image: testImage, - Command: createTestCommandForDBServer(firstDBServerStatus.ID, false, false, false), + Command: createTestCommandForDBServer(firstDBServerStatus.ID, false, true, false), Ports: createTestPorts(), VolumeMounts: []core.VolumeMount{ k8sutil.ArangodVolumeMount(), + k8sutil.ClusterJWTVolumeMount(), }, Resources: emptyResources, - LivenessProbe: createTestLivenessProbe(httpProbe, false, "", k8sutil.ArangoPort), ImagePullPolicy: core.PullIfNotPresent, SecurityContext: securityContext.NewSecurityContext(), }, - testArangodbInternalExporterContainer(false, k8sutil.ExtractPodResourceRequirement(resourcesUnfiltered)), + testArangodbInternalExporterContainer(false, true, k8sutil.ExtractPodResourceRequirement(resourcesUnfiltered)), }, RestartPolicy: core.RestartPolicyNever, TerminationGracePeriodSeconds: &defaultDBServerTerminationTimeout, @@ -1067,7 +1026,7 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { ArangoDeployment: &api.ArangoDeployment{ Spec: api.DeploymentSpec{ Image: util.NewString(testImage), - Authentication: noAuthentication, + Authentication: authenticationSpec, TLS: noTLS, Metrics: metricsSpec, Lifecycle: api.LifecycleSpec{ @@ -1087,6 +1046,12 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { testCase.createTestPodData(deployment, api.ServerGroupDBServers, firstDBServerStatus) testCase.ExpectedPod.ObjectMeta.Labels[k8sutil.LabelKeyArangoExporter] = testYes + + authorization, err := createTestToken(deployment, testCase, []string{"/_api/version"}) + require.NoError(t, err) + + testCase.ExpectedPod.Spec.Containers[0].LivenessProbe = createTestLivenessProbe(httpProbe, false, + authorization, k8sutil.ArangoPort) }, config: Config{ OperatorImage: testImageOperator, @@ -1097,6 +1062,7 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { Volumes: []core.Volume{ k8sutil.CreateVolumeEmptyDir(k8sutil.ArangodVolumeName), k8sutil.CreateVolumeWithSecret(k8sutil.ExporterJWTVolumeName, testExporterToken), + k8sutil.CreateVolumeWithSecret(k8sutil.ClusterJWTSecretVolumeName, testJWTSecretName), k8sutil.LifecycleVolume(), }, InitContainers: []core.Container{ @@ -1106,19 +1072,19 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { { Name: k8sutil.ServerContainerName, Image: testImage, - Command: createTestCommandForDBServer(firstDBServerStatus.ID, false, false, false), + Command: createTestCommandForDBServer(firstDBServerStatus.ID, false, true, false), Ports: createTestPorts(), VolumeMounts: []core.VolumeMount{ k8sutil.ArangodVolumeMount(), k8sutil.LifecycleVolumeMount(), + k8sutil.ClusterJWTVolumeMount(), }, Resources: emptyResources, - Lifecycle: createTestLifecycle(), - LivenessProbe: createTestLivenessProbe(httpProbe, false, "", k8sutil.ArangoPort), + Lifecycle: createTestLifecycle(api.ServerGroupAgents), ImagePullPolicy: core.PullIfNotPresent, SecurityContext: securityContext.NewSecurityContext(), }, - testArangodbInternalExporterContainer(false, emptyResources), + testArangodbInternalExporterContainer(false, true, emptyResources), }, RestartPolicy: core.RestartPolicyNever, TerminationGracePeriodSeconds: &defaultDBServerTerminationTimeout, @@ -1134,7 +1100,7 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { ArangoDeployment: &api.ArangoDeployment{ Spec: api.DeploymentSpec{ Image: util.NewString(testImage), - Authentication: noAuthentication, + Authentication: authenticationSpec, TLS: noTLS, Metrics: metricsSpec, }, @@ -1151,6 +1117,12 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { testCase.createTestPodData(deployment, api.ServerGroupDBServers, firstDBServerStatus) testCase.ExpectedPod.ObjectMeta.Labels[k8sutil.LabelKeyArangoExporter] = testYes + + authorization, err := createTestToken(deployment, testCase, []string{"/_api/version"}) + require.NoError(t, err) + + testCase.ExpectedPod.Spec.Containers[0].LivenessProbe = createTestLivenessProbe(httpProbe, false, + authorization, k8sutil.ArangoPort) }, config: Config{ OperatorImage: testImageOperator, @@ -1161,6 +1133,7 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { Volumes: []core.Volume{ k8sutil.CreateVolumeEmptyDir(k8sutil.ArangodVolumeName), k8sutil.CreateVolumeWithSecret(k8sutil.ExporterJWTVolumeName, testExporterToken), + k8sutil.CreateVolumeWithSecret(k8sutil.ClusterJWTSecretVolumeName, testJWTSecretName), }, InitContainers: []core.Container{ createTestLifecycleContainer(emptyResources), @@ -1170,19 +1143,19 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { { Name: k8sutil.ServerContainerName, Image: testImage, - Command: createTestCommandForDBServer(firstDBServerStatus.ID, false, false, false), + Command: createTestCommandForDBServer(firstDBServerStatus.ID, false, true, false), Ports: createTestPorts(), VolumeMounts: []core.VolumeMount{ k8sutil.ArangodVolumeMount(), k8sutil.LifecycleVolumeMount(), + k8sutil.ClusterJWTVolumeMount(), }, Resources: emptyResources, - Lifecycle: createTestLifecycle(), - LivenessProbe: createTestLivenessProbe(httpProbe, false, "", k8sutil.ArangoPort), + Lifecycle: createTestLifecycle(api.ServerGroupAgents), ImagePullPolicy: core.PullIfNotPresent, SecurityContext: securityContext.NewSecurityContext(), }, - testArangodbInternalExporterContainer(false, emptyResources), + testArangodbInternalExporterContainer(false, true, emptyResources), }, RestartPolicy: core.RestartPolicyNever, TerminationGracePeriodSeconds: &defaultDBServerTerminationTimeout, @@ -1326,14 +1299,14 @@ func TestEnsurePod_ArangoDB_Core(t *testing.T) { runTestCases(t, testCases...) } -func testArangodbInternalExporterContainer(secure bool, resources core.ResourceRequirements, ports ...int32) core.Container { +func testArangodbInternalExporterContainer(secure, auth bool, resources core.ResourceRequirements, ports ...int32) core.Container { var port int32 = k8sutil.ArangoExporterPort if len(ports) > 0 { port = ports[0] } - return core.Container{ + c := core.Container{ Name: k8sutil.ExporterContainerName, Image: testImage, Command: createTestInternalExporterCommand(secure, port), @@ -1356,9 +1329,14 @@ func testArangodbInternalExporterContainer(secure bool, resources core.ResourceR }, VolumeMounts: []core.VolumeMount{ k8sutil.LifecycleVolumeMount(), - k8sutil.ExporterJWTVolumeMount(), }, } + + if auth { + c.VolumeMounts = append(c.VolumeMounts, k8sutil.ExporterJWTVolumeMount()) + } + + return c } func createTestInternalExporterCommand(secure bool, port int32) []string { diff --git a/pkg/deployment/deployment_encryption_test.go b/pkg/deployment/deployment_encryption_test.go index f8a881ab2..2fa3d4823 100644 --- a/pkg/deployment/deployment_encryption_test.go +++ b/pkg/deployment/deployment_encryption_test.go @@ -157,7 +157,7 @@ func TestEnsurePod_ArangoDB_Encryption(t *testing.T) { Image: testImage, Command: createTestCommandForDBServer(firstDBServerStatus.ID, true, true, true), Ports: createTestPorts(), - Lifecycle: createTestLifecycle(), + Lifecycle: createTestLifecycle(api.ServerGroupAgents), LivenessProbe: createTestLivenessProbe(httpProbe, false, "", k8sutil.ArangoPort), ImagePullPolicy: core.PullIfNotPresent, SecurityContext: securityContext.NewSecurityContext(), @@ -171,7 +171,7 @@ func TestEnsurePod_ArangoDB_Encryption(t *testing.T) { Resources: emptyResources, }, func() core.Container { - c := testArangodbInternalExporterContainer(true, emptyResources) + c := testArangodbInternalExporterContainer(true, true, emptyResources) c.VolumeMounts = append(c.VolumeMounts, k8sutil.TlsKeyfileVolumeMount()) return c }(), diff --git a/pkg/deployment/deployment_metrics_test.go b/pkg/deployment/deployment_metrics_test.go index 7ec3ad11d..257970060 100644 --- a/pkg/deployment/deployment_metrics_test.go +++ b/pkg/deployment/deployment_metrics_test.go @@ -69,7 +69,6 @@ func TestEnsurePod_Metrics(t *testing.T) { Spec: core.PodSpec{ Volumes: []core.Volume{ k8sutil.CreateVolumeEmptyDir(k8sutil.ArangodVolumeName), - k8sutil.CreateVolumeWithSecret(k8sutil.ExporterJWTVolumeName, testExporterToken), }, Containers: []core.Container{ { @@ -85,7 +84,7 @@ func TestEnsurePod_Metrics(t *testing.T) { ImagePullPolicy: core.PullIfNotPresent, SecurityContext: securityContext.NewSecurityContext(), }, - testArangodbInternalExporterContainer(false, emptyResources, 9999), + testArangodbInternalExporterContainer(false, false, emptyResources, 9999), }, RestartPolicy: core.RestartPolicyNever, TerminationGracePeriodSeconds: &defaultDBServerTerminationTimeout, @@ -130,7 +129,6 @@ func TestEnsurePod_Metrics(t *testing.T) { Spec: core.PodSpec{ Volumes: []core.Volume{ k8sutil.CreateVolumeEmptyDir(k8sutil.ArangodVolumeName), - k8sutil.CreateVolumeWithSecret(k8sutil.ExporterJWTVolumeName, testExporterToken), }, Containers: []core.Container{ { @@ -146,7 +144,7 @@ func TestEnsurePod_Metrics(t *testing.T) { ImagePullPolicy: core.PullIfNotPresent, SecurityContext: securityContext.NewSecurityContext(), }, - testArangodbInternalExporterContainer(false, emptyResources), + testArangodbInternalExporterContainer(false, false, emptyResources), }, RestartPolicy: core.RestartPolicyNever, TerminationGracePeriodSeconds: &defaultDBServerTerminationTimeout, @@ -191,7 +189,6 @@ func TestEnsurePod_Metrics(t *testing.T) { Spec: core.PodSpec{ Volumes: []core.Volume{ k8sutil.CreateVolumeEmptyDir(k8sutil.ArangodVolumeName), - k8sutil.CreateVolumeWithSecret(k8sutil.ExporterJWTVolumeName, testExporterToken), }, Containers: []core.Container{ { @@ -217,7 +214,7 @@ func TestEnsurePod_Metrics(t *testing.T) { ImagePullPolicy: core.PullIfNotPresent, SecurityContext: securityContext.NewSecurityContext(), }, - testArangodbInternalExporterContainer(false, emptyResources), + testArangodbInternalExporterContainer(false, false, emptyResources), }, RestartPolicy: core.RestartPolicyNever, TerminationGracePeriodSeconds: &defaultDBServerTerminationTimeout, @@ -262,7 +259,6 @@ func TestEnsurePod_Metrics(t *testing.T) { Spec: core.PodSpec{ Volumes: []core.Volume{ k8sutil.CreateVolumeEmptyDir(k8sutil.ArangodVolumeName), - k8sutil.CreateVolumeWithSecret(k8sutil.ExporterJWTVolumeName, testExporterToken), }, Containers: []core.Container{ { @@ -288,7 +284,7 @@ func TestEnsurePod_Metrics(t *testing.T) { ImagePullPolicy: core.PullIfNotPresent, SecurityContext: securityContext.NewSecurityContext(), }, - testArangodbInternalExporterContainer(false, emptyResources), + testArangodbInternalExporterContainer(false, false, emptyResources), }, RestartPolicy: core.RestartPolicyNever, TerminationGracePeriodSeconds: &defaultAgentTerminationTimeout, diff --git a/pkg/deployment/deployment_pod_sync_test.go b/pkg/deployment/deployment_pod_sync_test.go index 88c6614ca..47d12e467 100644 --- a/pkg/deployment/deployment_pod_sync_test.go +++ b/pkg/deployment/deployment_pod_sync_test.go @@ -262,7 +262,7 @@ func TestEnsurePod_Sync_Master(t *testing.T) { k8sutil.CreateEnvFieldPath(constants.EnvOperatorNodeNameArango, "spec.nodeName"), }, ImagePullPolicy: core.PullIfNotPresent, - Lifecycle: createTestLifecycle(), + Lifecycle: createTestLifecycle(api.ServerGroupSyncMasters), Resources: resourcesUnfiltered, SecurityContext: securityContext.NewSecurityContext(), VolumeMounts: []core.VolumeMount{ @@ -356,7 +356,7 @@ func TestEnsurePod_Sync_Master(t *testing.T) { }, Resources: emptyResources, ImagePullPolicy: core.PullIfNotPresent, - Lifecycle: createTestLifecycle(), + Lifecycle: createTestLifecycle(api.ServerGroupSyncMasters), SecurityContext: securityContext.NewSecurityContext(), VolumeMounts: []core.VolumeMount{ k8sutil.LifecycleVolumeMount(), @@ -454,7 +454,7 @@ func TestEnsurePod_Sync_Worker(t *testing.T) { k8sutil.CreateEnvFieldPath(constants.EnvOperatorNodeNameArango, "spec.nodeName"), }, ImagePullPolicy: core.PullIfNotPresent, - Lifecycle: createTestLifecycle(), + Lifecycle: createTestLifecycle(api.ServerGroupSyncMasters), Resources: k8sutil.ExtractPodResourceRequirement(resourcesUnfiltered), SecurityContext: securityContext.NewSecurityContext(), VolumeMounts: []core.VolumeMount{ diff --git a/pkg/deployment/deployment_run_test.go b/pkg/deployment/deployment_run_test.go index b79774fba..62ed80036 100644 --- a/pkg/deployment/deployment_run_test.go +++ b/pkg/deployment/deployment_run_test.go @@ -124,6 +124,11 @@ func runTestCase(t *testing.T, testCase testCaseStruct) { require.Equal(t, testCase.Features.EncryptionRotation, *features.EncryptionRotation().EnabledPointer()) *features.JWTRotation().EnabledPointer() = testCase.Features.JWTRotation *features.TLSSNI().EnabledPointer() = testCase.Features.TLSSNI + if g := testCase.Features.Graceful; g != nil { + *features.GracefulShutdown().EnabledPointer() = *g + } else { + *features.GracefulShutdown().EnabledPointer() = features.GracefulShutdown().EnabledByDefault() + } *features.TLSRotation().EnabledPointer() = testCase.Features.TLSRotation } diff --git a/pkg/deployment/deployment_suite_test.go b/pkg/deployment/deployment_suite_test.go index dd8092ccd..6984ced96 100644 --- a/pkg/deployment/deployment_suite_test.go +++ b/pkg/deployment/deployment_suite_test.go @@ -79,6 +79,7 @@ const ( type testCaseFeatures struct { TLSSNI, TLSRotation, JWTRotation, EncryptionRotation bool + Graceful *bool } type testCaseStruct struct { @@ -100,8 +101,12 @@ func createTestTLSVolume(serverGroupString, ID string) core.Volume { k8sutil.CreateTLSKeyfileSecretName(testDeploymentName, serverGroupString, ID)) } -func createTestLifecycle() *core.Lifecycle { - lifecycle, _ := k8sutil.NewLifecycle() +func createTestLifecycle(group api.ServerGroup) *core.Lifecycle { + if group.IsArangosync() { + lifecycle, _ := k8sutil.NewLifecycleFinalizers() + return lifecycle + } + lifecycle, _ := k8sutil.NewLifecyclePort() return lifecycle } @@ -586,9 +591,14 @@ func finalizers(group api.ServerGroup) []string { var finalizers []string switch group { case api.ServerGroupAgents: - finalizers = append(finalizers, constants.FinalizerPodAgencyServing) + finalizers = append(finalizers, constants.FinalizerPodGracefulShutdown) + case api.ServerGroupCoordinators: + finalizers = append(finalizers, constants.FinalizerDelayPodTermination) + finalizers = append(finalizers, constants.FinalizerPodGracefulShutdown) case api.ServerGroupDBServers: - finalizers = append(finalizers, constants.FinalizerPodDrainDBServer) + finalizers = append(finalizers, constants.FinalizerPodGracefulShutdown) + case api.ServerGroupSingle: + finalizers = append(finalizers, constants.FinalizerPodGracefulShutdown) } return finalizers @@ -732,7 +742,7 @@ func addLifecycle(name string, uuidRequired bool, license string, group api.Serv } if len(p.Spec.Containers) > 0 { - p.Spec.Containers[0].Lifecycle = createTestLifecycle() + p.Spec.Containers[0].Lifecycle = createTestLifecycle(api.ServerGroupAgents) } if len(p.Spec.Containers) > 0 { diff --git a/pkg/deployment/features/graceful.go b/pkg/deployment/features/graceful.go new file mode 100644 index 000000000..7c0059662 --- /dev/null +++ b/pkg/deployment/features/graceful.go @@ -0,0 +1,40 @@ +// +// DISCLAIMER +// +// Copyright 2016-2021 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Adam Janikowski +// Author Tomasz Mielech +// + +package features + +func init() { + registerFeature(gracefulShutdown) +} + +var gracefulShutdown = &feature{ + name: "graceful-shutdown", + description: "Define graceful shutdown, using finalizers, is enabled", + version: "3.6.0", + enterpriseRequired: false, + enabledByDefault: true, +} + +func GracefulShutdown() Feature { + return gracefulShutdown +} diff --git a/pkg/deployment/reconcile/action_kill_member_pod.go b/pkg/deployment/reconcile/action_kill_member_pod.go new file mode 100644 index 000000000..59c781eac --- /dev/null +++ b/pkg/deployment/reconcile/action_kill_member_pod.go @@ -0,0 +1,111 @@ +// +// DISCLAIMER +// +// Copyright 2020-2021 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// Author Tomasz Mielech +// + +package reconcile + +import ( + "context" + + "github.com/arangodb/kube-arangodb/pkg/backup/utils" + "github.com/arangodb/kube-arangodb/pkg/deployment/features" + "github.com/arangodb/kube-arangodb/pkg/util/constants" + + api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" + "github.com/rs/zerolog" +) + +func init() { + registerAction(api.ActionTypeKillMemberPod, newKillMemberPodAction) +} + +// newKillMemberPodAction creates a new Action that implements the given +// planned KillMemberPod action. +func newKillMemberPodAction(log zerolog.Logger, action api.Action, actionCtx ActionContext) Action { + a := &actionKillMemberPod{} + + a.actionImpl = newActionImplDefRef(log, action, actionCtx, defaultTimeout) + + return a +} + +// actionKillMemberPod implements an KillMemberPod. +type actionKillMemberPod struct { + // actionImpl implement timeout and member id functions + actionImpl +} + +// Start performs the start of the action. +// Returns true if the action is completely finished, false in case +// the start time needs to be recorded and a ready condition needs to be checked. +func (a *actionKillMemberPod) Start(ctx context.Context) (bool, error) { + if !features.GracefulShutdown().Enabled() { + return true, nil + } + + log := a.log + m, ok := a.actionCtx.GetMemberStatusByID(a.action.MemberID) + if !ok { + log.Error().Msg("No such member") + return true, nil + } + + if err := a.actionCtx.DeletePod(ctx, m.PodName); err != nil { + log.Error().Err(err).Msg("Unable to kill pod") + return true, nil + } + + return false, nil +} + +// CheckProgress checks the progress of the action. +// Returns: ready, abort, error. +func (a *actionKillMemberPod) CheckProgress(ctx context.Context) (bool, bool, error) { + if !features.GracefulShutdown().Enabled() { + return true, false, nil + } + + log := a.log + m, ok := a.actionCtx.GetMemberStatusByID(a.action.MemberID) + if !ok { + log.Error().Msg("No such member") + return true, false, nil + } + + p, ok := a.actionCtx.GetCachedStatus().Pod(m.PodName) + if !ok { + log.Error().Msg("No such member") + return true, false, nil + } + + l := utils.StringList(p.Finalizers) + + if !l.Has(constants.FinalizerPodGracefulShutdown) { + return true, false, nil + } + + if l.Has(constants.FinalizerDelayPodTermination) { + return false, false, nil + } + + return true, false, nil +} diff --git a/pkg/deployment/reconcile/helper_shutdown.go b/pkg/deployment/reconcile/helper_shutdown.go index 463e86122..64615f679 100644 --- a/pkg/deployment/reconcile/helper_shutdown.go +++ b/pkg/deployment/reconcile/helper_shutdown.go @@ -26,6 +26,8 @@ package reconcile import ( "context" + "github.com/arangodb/kube-arangodb/pkg/deployment/features" + api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" "github.com/arangodb/kube-arangodb/pkg/util/arangod" "github.com/arangodb/kube-arangodb/pkg/util/errors" @@ -34,6 +36,10 @@ import ( ) func getShutdownHelper(a *api.Action, ctx ActionContext, log zerolog.Logger) ActionCore { + if features.GracefulShutdown().Enabled() { + return shutdownHelperAPI{action: a, actionCtx: ctx, log: log} + } + serverGroup := ctx.GetSpec().GetServerGroupSpec(a.Group) switch serverGroup.ShutdownMethod.Get() { @@ -66,9 +72,12 @@ func (s shutdownHelperAPI) Start(ctx context.Context) (bool, error) { return true, nil } // Remove finalizers, so Kubernetes will quickly terminate the pod - if err := s.actionCtx.RemovePodFinalizers(ctx, m.PodName); err != nil { - return false, errors.WithStack(err) + if !features.GracefulShutdown().Enabled() { + if err := s.actionCtx.RemovePodFinalizers(ctx, m.PodName); err != nil { + return false, errors.WithStack(err) + } } + if group.IsArangod() { // Invoke shutdown endpoint ctxChild, cancel := context.WithTimeout(ctx, arangod.GetRequestTimeout()) @@ -82,7 +91,7 @@ func (s shutdownHelperAPI) Start(ctx context.Context) (bool, error) { log.Debug().Bool("removeFromCluster", removeFromCluster).Msg("Shutting down member") ctxChild, cancel = context.WithTimeout(ctx, shutdownTimeout) defer cancel() - if err := c.Shutdown(ctxChild, removeFromCluster); err != nil { + if err := c.ShutdownV2(ctxChild, removeFromCluster, true); err != nil { // Shutdown failed. Let's check if we're already done if ready, _, err := s.CheckProgress(ctxChild); err == nil && ready { // We're done diff --git a/pkg/deployment/reconcile/helper_wrap.go b/pkg/deployment/reconcile/helper_wrap.go index 2a9766486..741f4e96a 100644 --- a/pkg/deployment/reconcile/helper_wrap.go +++ b/pkg/deployment/reconcile/helper_wrap.go @@ -61,6 +61,7 @@ func cleanOutMember(group api.ServerGroup, m api.MemberStatus) api.Plan { ) } plan = append(plan, + api.NewAction(api.ActionTypeKillMemberPod, group, m.ID), api.NewAction(api.ActionTypeShutdownMember, group, m.ID), api.NewAction(api.ActionTypeRemoveMember, group, m.ID), ) diff --git a/pkg/deployment/reconcile/plan_builder_storage.go b/pkg/deployment/reconcile/plan_builder_storage.go index a236d7167..cf393c4f2 100644 --- a/pkg/deployment/reconcile/plan_builder_storage.go +++ b/pkg/deployment/reconcile/plan_builder_storage.go @@ -83,6 +83,7 @@ func createRotateServerStoragePlan(ctx context.Context, api.NewAction(api.ActionTypeMarkToRemoveMember, group, m.ID)) } else if group == api.ServerGroupAgents { plan = append(plan, + api.NewAction(api.ActionTypeKillMemberPod, group, m.ID), api.NewAction(api.ActionTypeShutdownMember, group, m.ID), api.NewAction(api.ActionTypeRemoveMember, group, m.ID), api.NewAction(api.ActionTypeAddMember, group, m.ID), @@ -187,6 +188,7 @@ func pvcResizePlan(log zerolog.Logger, group api.ServerGroup, groupSpec api.Serv case api.PVCResizeModeRotate: return api.Plan{ api.NewAction(api.ActionTypeResignLeadership, group, memberID), + api.NewAction(api.ActionTypeKillMemberPod, group, memberID), api.NewAction(api.ActionTypeRotateStartMember, group, memberID), api.NewAction(api.ActionTypePVCResize, group, memberID), api.NewAction(api.ActionTypePVCResized, group, memberID), diff --git a/pkg/deployment/reconcile/plan_builder_test.go b/pkg/deployment/reconcile/plan_builder_test.go index 6e9a3c925..7deb42717 100644 --- a/pkg/deployment/reconcile/plan_builder_test.go +++ b/pkg/deployment/reconcile/plan_builder_test.go @@ -507,11 +507,13 @@ func TestCreatePlanActiveFailoverScale(t *testing.T) { } newPlan, changed = createNormalPlan(ctx, log, depl, nil, spec, status, inspector.NewEmptyInspector(), c) assert.True(t, changed) - require.Len(t, newPlan, 2) // Note: Downscaling is only down 1 at a time - assert.Equal(t, api.ActionTypeShutdownMember, newPlan[0].Type) - assert.Equal(t, api.ActionTypeRemoveMember, newPlan[1].Type) + require.Len(t, newPlan, 3) // Note: Downscaling is only down 1 at a time + assert.Equal(t, api.ActionTypeKillMemberPod, newPlan[0].Type) + assert.Equal(t, api.ActionTypeShutdownMember, newPlan[1].Type) + assert.Equal(t, api.ActionTypeRemoveMember, newPlan[2].Type) assert.Equal(t, api.ServerGroupSingle, newPlan[0].Group) assert.Equal(t, api.ServerGroupSingle, newPlan[1].Group) + assert.Equal(t, api.ServerGroupSingle, newPlan[2].Group) } // TestCreatePlanClusterScale creates a `cluster` deployment to test the creating of scaling plan. @@ -609,17 +611,21 @@ func TestCreatePlanClusterScale(t *testing.T) { spec.Coordinators.Count = util.NewInt(1) newPlan, changed = createNormalPlan(ctx, log, depl, nil, spec, status, inspector.NewEmptyInspector(), c) assert.True(t, changed) - require.Len(t, newPlan, 5) // Note: Downscaling is done 1 at a time + require.Len(t, newPlan, 7) // Note: Downscaling is done 1 at a time assert.Equal(t, api.ActionTypeCleanOutMember, newPlan[0].Type) - assert.Equal(t, api.ActionTypeShutdownMember, newPlan[1].Type) - assert.Equal(t, api.ActionTypeRemoveMember, newPlan[2].Type) - assert.Equal(t, api.ActionTypeShutdownMember, newPlan[3].Type) - assert.Equal(t, api.ActionTypeRemoveMember, newPlan[4].Type) + assert.Equal(t, api.ActionTypeKillMemberPod, newPlan[1].Type) + assert.Equal(t, api.ActionTypeShutdownMember, newPlan[2].Type) + assert.Equal(t, api.ActionTypeRemoveMember, newPlan[3].Type) + assert.Equal(t, api.ActionTypeKillMemberPod, newPlan[4].Type) + assert.Equal(t, api.ActionTypeShutdownMember, newPlan[5].Type) + assert.Equal(t, api.ActionTypeRemoveMember, newPlan[6].Type) assert.Equal(t, api.ServerGroupDBServers, newPlan[0].Group) assert.Equal(t, api.ServerGroupDBServers, newPlan[1].Group) assert.Equal(t, api.ServerGroupDBServers, newPlan[2].Group) - assert.Equal(t, api.ServerGroupCoordinators, newPlan[3].Group) + assert.Equal(t, api.ServerGroupDBServers, newPlan[3].Group) assert.Equal(t, api.ServerGroupCoordinators, newPlan[4].Group) + assert.Equal(t, api.ServerGroupCoordinators, newPlan[5].Group) + assert.Equal(t, api.ServerGroupCoordinators, newPlan[6].Group) } type LastLogRecord struct { @@ -792,6 +798,7 @@ func TestCreatePlan(t *testing.T) { ad.Status.Members.Agents[0].PersistentVolumeClaimName = pvcName }, ExpectedPlan: []api.Action{ + api.NewAction(api.ActionTypeKillMemberPod, api.ServerGroupAgents, ""), api.NewAction(api.ActionTypeShutdownMember, api.ServerGroupAgents, ""), api.NewAction(api.ActionTypeRemoveMember, api.ServerGroupAgents, ""), api.NewAction(api.ActionTypeAddMember, api.ServerGroupAgents, ""), @@ -988,6 +995,7 @@ func TestCreatePlan(t *testing.T) { }, ExpectedPlan: []api.Action{ api.NewAction(api.ActionTypeCleanOutMember, api.ServerGroupDBServers, "id"), + api.NewAction(api.ActionTypeKillMemberPod, api.ServerGroupDBServers, ""), api.NewAction(api.ActionTypeShutdownMember, api.ServerGroupDBServers, ""), api.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, ""), }, @@ -1006,6 +1014,7 @@ func TestCreatePlan(t *testing.T) { }, ExpectedPlan: []api.Action{ api.NewAction(api.ActionTypeCleanOutMember, api.ServerGroupDBServers, "id"), + api.NewAction(api.ActionTypeKillMemberPod, api.ServerGroupDBServers, ""), api.NewAction(api.ActionTypeShutdownMember, api.ServerGroupDBServers, ""), api.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, ""), }, diff --git a/pkg/deployment/reconcile/plan_builder_utils.go b/pkg/deployment/reconcile/plan_builder_utils.go index 10c7036e8..009d838e9 100644 --- a/pkg/deployment/reconcile/plan_builder_utils.go +++ b/pkg/deployment/reconcile/plan_builder_utils.go @@ -44,6 +44,7 @@ func createRotateMemberPlan(log zerolog.Logger, member api.MemberStatus, plan := api.Plan{ api.NewAction(api.ActionTypeCleanTLSKeyfileCertificate, group, member.ID, "Remove server keyfile and enforce renewal/recreation"), api.NewAction(api.ActionTypeResignLeadership, group, member.ID, reason), + api.NewAction(api.ActionTypeKillMemberPod, group, member.ID, reason), api.NewAction(api.ActionTypeRotateMember, group, member.ID, reason), api.NewAction(api.ActionTypeWaitForMemberUp, group, member.ID), api.NewAction(api.ActionTypeWaitForMemberInSync, group, member.ID), diff --git a/pkg/deployment/resources/pod_creator_arangod.go b/pkg/deployment/resources/pod_creator_arangod.go index 07fda3674..a075c2325 100644 --- a/pkg/deployment/resources/pod_creator_arangod.go +++ b/pkg/deployment/resources/pod_creator_arangod.go @@ -27,6 +27,8 @@ import ( "math" "os" + "github.com/arangodb/kube-arangodb/pkg/deployment/features" + "github.com/arangodb/kube-arangodb/pkg/deployment/topology" "github.com/arangodb/kube-arangodb/pkg/util/collection" @@ -193,7 +195,10 @@ func (a *ArangoDContainer) GetResourceRequirements() core.ResourceRequirements { } func (a *ArangoDContainer) GetLifecycle() (*core.Lifecycle, error) { - return k8sutil.NewLifecycle() + if features.GracefulShutdown().Enabled() { + return k8sutil.NewLifecyclePort() + } + return k8sutil.NewLifecycleFinalizers() } func (a *ArangoDContainer) GetImagePullPolicy() core.PullPolicy { @@ -348,7 +353,7 @@ func (m *MemberArangoDPod) GetVolumes() ([]core.Volume, []core.VolumeMount) { if m.spec.Metrics.IsEnabled() { token := m.spec.Metrics.GetJWTTokenSecretName() - if token != "" { + if m.spec.Authentication.IsAuthenticated() && token != "" { vol := k8sutil.CreateVolumeWithSecret(k8sutil.ExporterJWTVolumeName, token) volumes.AddVolume(vol) } @@ -460,15 +465,20 @@ func (m *MemberArangoDPod) GetInitContainers(cachedStatus interfaces.Inspector) func (m *MemberArangoDPod) GetFinalizers() []string { var finalizers []string - if d := m.spec.GetServerGroupSpec(m.group).ShutdownDelay; d != nil { + + if d := m.spec.GetServerGroupSpec(m.group).GetShutdownDelay(m.group); d != 0 { finalizers = append(finalizers, constants.FinalizerDelayPodTermination) } - switch m.group { - case api.ServerGroupAgents: - finalizers = append(finalizers, constants.FinalizerPodAgencyServing) - case api.ServerGroupDBServers: - finalizers = append(finalizers, constants.FinalizerPodDrainDBServer) + if features.GracefulShutdown().Enabled() { + finalizers = append(finalizers, constants.FinalizerPodGracefulShutdown) // No need for other finalizers, quorum will be managed + } else { + switch m.group { + case api.ServerGroupAgents: + finalizers = append(finalizers, constants.FinalizerPodAgencyServing) + case api.ServerGroupDBServers: + finalizers = append(finalizers, constants.FinalizerPodDrainDBServer) + } } return finalizers @@ -502,7 +512,7 @@ func (m *MemberArangoDPod) createMetricsExporterSidecarInternalExporter() (*core return nil, err } - if m.spec.Metrics.GetJWTTokenSecretName() != "" { + if m.spec.Authentication.IsAuthenticated() && m.spec.Metrics.GetJWTTokenSecretName() != "" { c.VolumeMounts = append(c.VolumeMounts, k8sutil.ExporterJWTVolumeMount()) } diff --git a/pkg/deployment/resources/pod_creator_sync.go b/pkg/deployment/resources/pod_creator_sync.go index 676c722d1..f1b8d3cb8 100644 --- a/pkg/deployment/resources/pod_creator_sync.go +++ b/pkg/deployment/resources/pod_creator_sync.go @@ -113,7 +113,7 @@ func (a *ArangoSyncContainer) GetResourceRequirements() core.ResourceRequirement } func (a *ArangoSyncContainer) GetLifecycle() (*core.Lifecycle, error) { - return k8sutil.NewLifecycle() + return k8sutil.NewLifecycleFinalizers() } func (a *ArangoSyncContainer) GetImagePullPolicy() core.PullPolicy { diff --git a/pkg/deployment/resources/pod_finalizers.go b/pkg/deployment/resources/pod_finalizers.go index 667b7af58..208091cca 100644 --- a/pkg/deployment/resources/pod_finalizers.go +++ b/pkg/deployment/resources/pod_finalizers.go @@ -78,6 +78,12 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu } else { log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove Pod finalizer yet") } + case constants.FinalizerPodGracefulShutdown: + // We are in graceful shutdown, only one way to remove it is when container is already dead + if isServerContainerDead { + log.Debug().Msg("Server Container is dead, removing finalizer") + removalList = append(removalList, f) + } case constants.FinalizerDelayPodTermination: if isServerContainerDead { log.Debug().Msg("Server Container is dead, removing finalizer") @@ -93,11 +99,12 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu log.Error().Str("finalizer", f).Msg("Delay finalizer") groupSpec := r.context.GetSpec().GetServerGroupSpec(group) - d := time.Duration(util.IntOrDefault(groupSpec.ShutdownDelay, 0)) * time.Second if t := p.ObjectMeta.DeletionTimestamp; t != nil { - e := p.ObjectMeta.DeletionTimestamp.Time.Sub(time.Now().Add(d)) - log.Error().Str("finalizer", f).Dur("left", e).Msg("Delay finalizer status") - if e < 0 { + d := time.Duration(groupSpec.GetShutdownDelay(group)) * time.Second + gr := time.Duration(util.Int64OrDefault(p.ObjectMeta.GetDeletionGracePeriodSeconds(), 0)) * time.Second + e := t.Time.Add(-1 * gr).Sub(time.Now().Add(-1 * d)) + log.Error().Str("finalizer", f).Str("left", e.String()).Msg("Delay finalizer status") + if e < 0 || d == 0 { removalList = append(removalList, f) } } else { diff --git a/pkg/deployment/rotation/check.go b/pkg/deployment/rotation/check.go index 3a3431cd5..d2eefdb9a 100644 --- a/pkg/deployment/rotation/check.go +++ b/pkg/deployment/rotation/check.go @@ -25,6 +25,8 @@ package rotation import ( "github.com/arangodb/kube-arangodb/pkg/apis/deployment" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" + "github.com/arangodb/kube-arangodb/pkg/backup/utils" + "github.com/arangodb/kube-arangodb/pkg/util/constants" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector" "github.com/rs/zerolog" @@ -52,17 +54,7 @@ func (m Mode) And(b Mode) Mode { // CheckPossible returns true if rotation is possible func CheckPossible(member api.MemberStatus) bool { - if !member.Phase.IsReady() { - // Skip rotation when we are not ready - return false - } - - if member.Conditions.IsTrue(api.ConditionTypeTerminated) || member.Conditions.IsTrue(api.ConditionTypeTerminating) { - // Termination in progress, nothing to do - return false - } - - return true + return !member.Conditions.IsTrue(api.ConditionTypeTerminated) } func IsRotationRequired(log zerolog.Logger, cachedStatus inspectorInterface.Inspector, spec api.DeploymentSpec, member api.MemberStatus, group api.ServerGroup, pod *core.Pod, specTemplate, statusTemplate *api.ArangoMemberPodTemplate) (mode Mode, plan api.Plan, reason string, err error) { @@ -71,6 +63,16 @@ func IsRotationRequired(log zerolog.Logger, cachedStatus inspectorInterface.Insp // Set default mode for return value mode = SkippedRotation + // We are under termination + if member.Conditions.IsTrue(api.ConditionTypeTerminating) || (pod != nil && pod.DeletionTimestamp != nil) { + if l := utils.StringList(pod.Finalizers); l.Has(constants.FinalizerPodGracefulShutdown) && !l.Has(constants.FinalizerDelayPodTermination) { + reason = "Recreation enforced by deleted state" + mode = EnforcedRotation + } + + return + } + if !CheckPossible(member) { // Check is not possible due to improper state of member return diff --git a/pkg/util/constants/constants.go b/pkg/util/constants/constants.go index 82ad8abc7..b7742d526 100644 --- a/pkg/util/constants/constants.go +++ b/pkg/util/constants/constants.go @@ -50,6 +50,7 @@ const ( FinalizerDeplReplStopSync = "replication.database.arangodb.com/stop-sync" // Finalizer added to ArangoDeploymentReplication, indicating the need to stop synchronization FinalizerPodAgencyServing = "agent.database.arangodb.com/agency-serving" // Finalizer added to Agents, indicating the need for keeping enough agents alive FinalizerPodDrainDBServer = "dbserver.database.arangodb.com/drain" // Finalizer added to DBServers, indicating the need for draining that dbserver + FinalizerPodGracefulShutdown = "database.arangodb.com/graceful-shutdown" // Finalizer added to All members, indicating the need for graceful shutdown FinalizerPVCMemberExists = "pvc.database.arangodb.com/member-exists" // Finalizer added to PVCs, indicating the need to keep is as long as its member exists FinalizerDelayPodTermination = "pod.database.arangodb.com/delay" // Finalizer added to Pod, delays termination diff --git a/pkg/util/k8sutil/lifecycle.go b/pkg/util/k8sutil/lifecycle.go index 172c0fadf..9c5d06cb0 100644 --- a/pkg/util/k8sutil/lifecycle.go +++ b/pkg/util/k8sutil/lifecycle.go @@ -62,8 +62,18 @@ func InitLifecycleContainer(image string, resources *core.ResourceRequirements, return c, nil } +// NewLifecycleFinalizers creates a lifecycle structure with preStop handler which wait for finalizers to be removed. +func NewLifecycleFinalizers() (*core.Lifecycle, error) { + return NewLifecycle("finalizers") +} + +// NewLifecyclePort creates a lifecycle structure with preStop handler which wait for port to be closed. +func NewLifecyclePort() (*core.Lifecycle, error) { + return NewLifecycle("port") +} + // NewLifecycle creates a lifecycle structure with preStop handler. -func NewLifecycle() (*core.Lifecycle, error) { +func NewLifecycle(t string) (*core.Lifecycle, error) { binaryPath, err := os.Executable() if err != nil { return nil, errors.WithStack(err) @@ -72,7 +82,7 @@ func NewLifecycle() (*core.Lifecycle, error) { lifecycle := &core.Lifecycle{ PreStop: &core.Handler{ Exec: &core.ExecAction{ - Command: append([]string{exePath}, "lifecycle", "preStop"), + Command: append([]string{exePath}, "lifecycle", "preStop", t), }, }, } diff --git a/pkg/util/k8sutil/pods.go b/pkg/util/k8sutil/pods.go index e1f6d87fc..ea0bb3b56 100644 --- a/pkg/util/k8sutil/pods.go +++ b/pkg/util/k8sutil/pods.go @@ -154,11 +154,11 @@ func IsContainerRunning(pod *core.Pod, name string) bool { continue } - if c.State.Terminated != nil { + if c.State.Running == nil { return false - } else { - return true } + + return true } return false }