diff --git a/pkg/deployment/reconcile/action.go b/pkg/deployment/reconcile/action.go index e3b0ded79..b06356a15 100644 --- a/pkg/deployment/reconcile/action.go +++ b/pkg/deployment/reconcile/action.go @@ -24,6 +24,7 @@ package reconcile import ( "context" + "time" ) // Action executes a single Plan item. @@ -35,4 +36,6 @@ type Action interface { // CheckProgress checks the progress of the action. // Returns true if the action is completely finished, false otherwise. CheckProgress(ctx context.Context) (bool, error) + // Timeout returns the amount of time after which this action will timeout. + Timeout() time.Duration } diff --git a/pkg/deployment/reconcile/action_add_member.go b/pkg/deployment/reconcile/action_add_member.go index cf96ca726..c63225a3f 100644 --- a/pkg/deployment/reconcile/action_add_member.go +++ b/pkg/deployment/reconcile/action_add_member.go @@ -24,6 +24,7 @@ package reconcile import ( "context" + "time" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/rs/zerolog" @@ -64,3 +65,8 @@ func (a *actionAddMember) CheckProgress(ctx context.Context) (bool, error) { // Nothing todo return true, nil } + +// Timeout returns the amount of time after which this action will timeout. +func (a *actionAddMember) Timeout() time.Duration { + return addMemberTimeout +} diff --git a/pkg/deployment/reconcile/action_cleanout_member.go b/pkg/deployment/reconcile/action_cleanout_member.go index eb9d9fbc6..cb4b86fe7 100644 --- a/pkg/deployment/reconcile/action_cleanout_member.go +++ b/pkg/deployment/reconcile/action_cleanout_member.go @@ -24,6 +24,7 @@ package reconcile import ( "context" + "time" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/rs/zerolog" @@ -114,3 +115,8 @@ func (a *actionCleanoutMember) CheckProgress(ctx context.Context) (bool, error) // Cleanout completed return true, nil } + +// Timeout returns the amount of time after which this action will timeout. 
+func (a *actionCleanoutMember) Timeout() time.Duration { + return cleanoutMemberTimeout +} diff --git a/pkg/deployment/reconcile/action_remove_member.go b/pkg/deployment/reconcile/action_remove_member.go index feb5ac8c6..64cda8ccb 100644 --- a/pkg/deployment/reconcile/action_remove_member.go +++ b/pkg/deployment/reconcile/action_remove_member.go @@ -24,6 +24,7 @@ package reconcile import ( "context" + "time" "github.com/pkg/errors" "github.com/rs/zerolog" @@ -94,3 +95,8 @@ func (a *actionRemoveMember) CheckProgress(ctx context.Context) (bool, error) { // Nothing todo return true, nil } + +// Timeout returns the amount of time after which this action will timeout. +func (a *actionRemoveMember) Timeout() time.Duration { + return removeMemberTimeout +} diff --git a/pkg/deployment/reconcile/action_renew_tls_certificate.go b/pkg/deployment/reconcile/action_renew_tls_certificate.go index 284394a0f..99e0ffc93 100644 --- a/pkg/deployment/reconcile/action_renew_tls_certificate.go +++ b/pkg/deployment/reconcile/action_renew_tls_certificate.go @@ -24,6 +24,7 @@ package reconcile import ( "context" + "time" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/rs/zerolog" @@ -69,3 +70,8 @@ func (a *renewTLSCertificateAction) Start(ctx context.Context) (bool, error) { func (a *renewTLSCertificateAction) CheckProgress(ctx context.Context) (bool, error) { return true, nil } + +// Timeout returns the amount of time after which this action will timeout. 
+func (a *renewTLSCertificateAction) Timeout() time.Duration { + return renewTLSCertificateTimeout +} diff --git a/pkg/deployment/reconcile/action_rotate_member.go b/pkg/deployment/reconcile/action_rotate_member.go index 84af5c8bf..c829be15d 100644 --- a/pkg/deployment/reconcile/action_rotate_member.go +++ b/pkg/deployment/reconcile/action_rotate_member.go @@ -24,6 +24,7 @@ package reconcile import ( "context" + "time" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/rs/zerolog" @@ -116,3 +117,8 @@ func (a *actionRotateMember) CheckProgress(ctx context.Context) (bool, error) { } return true, nil } + +// Timeout returns the amount of time after which this action will timeout. +func (a *actionRotateMember) Timeout() time.Duration { + return rotateMemberTimeout +} diff --git a/pkg/deployment/reconcile/action_shutdown_member.go b/pkg/deployment/reconcile/action_shutdown_member.go index bb0ec47ca..833295d18 100644 --- a/pkg/deployment/reconcile/action_shutdown_member.go +++ b/pkg/deployment/reconcile/action_shutdown_member.go @@ -111,3 +111,8 @@ func (a *actionShutdownMember) CheckProgress(ctx context.Context) (bool, error) // Member still not shutdown, retry soon return false, nil } + +// Timeout returns the amount of time after which this action will timeout. 
+func (a *actionShutdownMember) Timeout() time.Duration { + return shutdownMemberTimeout +} diff --git a/pkg/deployment/reconcile/action_upgrade_member.go b/pkg/deployment/reconcile/action_upgrade_member.go index a9ec6564d..793470b0a 100644 --- a/pkg/deployment/reconcile/action_upgrade_member.go +++ b/pkg/deployment/reconcile/action_upgrade_member.go @@ -24,6 +24,7 @@ package reconcile import ( "context" + "time" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/rs/zerolog" @@ -126,3 +127,8 @@ func (a *actionUpgradeMember) CheckProgress(ctx context.Context) (bool, error) { } return isUpgrading, nil } + +// Timeout returns the amount of time after which this action will timeout. +func (a *actionUpgradeMember) Timeout() time.Duration { + return upgradeMemberTimeout +} diff --git a/pkg/deployment/reconcile/action_wait_for_member_up.go b/pkg/deployment/reconcile/action_wait_for_member_up.go index e95bc286b..9675ae293 100644 --- a/pkg/deployment/reconcile/action_wait_for_member_up.go +++ b/pkg/deployment/reconcile/action_wait_for_member_up.go @@ -24,6 +24,7 @@ package reconcile import ( "context" + "time" driver "github.com/arangodb/go-driver" "github.com/arangodb/go-driver/agency" @@ -164,3 +165,8 @@ func (a *actionWaitForMemberUp) checkProgressArangoSync(ctx context.Context) (bo } return true, nil } + +// Timeout returns the amount of time after which this action will timeout. +func (a *actionWaitForMemberUp) Timeout() time.Duration { + return waitForMemberUpTimeout +} diff --git a/pkg/deployment/reconcile/context.go b/pkg/deployment/reconcile/context.go index 342de904b..72771c651 100644 --- a/pkg/deployment/reconcile/context.go +++ b/pkg/deployment/reconcile/context.go @@ -54,6 +54,9 @@ type Context interface { GetAgencyClients(ctx context.Context, predicate func(id string) bool) ([]driver.Connection, error) // GetSyncServerClient returns a cached client for a specific arangosync server. 
GetSyncServerClient(ctx context.Context, group api.ServerGroup, id string) (client.API, error) + // CreateEvent creates a given event. + // On error, the error is logged. + CreateEvent(evt *v1.Event) // CreateMember adds a new member to the given group. // If ID is non-empty, it will be used, otherwise a new ID is created. CreateMember(group api.ServerGroup, id string) error diff --git a/pkg/deployment/reconcile/plan_executor.go b/pkg/deployment/reconcile/plan_executor.go index ca0291e38..59a113ddf 100644 --- a/pkg/deployment/reconcile/plan_executor.go +++ b/pkg/deployment/reconcile/plan_executor.go @@ -25,11 +25,13 @@ package reconcile import ( "context" "fmt" + "time" + "github.com/rs/zerolog" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" - "github.com/rs/zerolog" + "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) // ExecutePlan tries to execute the plan as far as possible. @@ -106,7 +108,21 @@ func (d *Reconciler) ExecutePlan(ctx context.Context) (bool, error) { } log.Debug().Bool("ready", ready).Msg("Action CheckProgress completed") if !ready { - // Not ready check, come back soon + // Not ready yet, check timeout + deadline := planAction.CreationTime.Add(action.Timeout()) + if time.Now().After(deadline) { + // Timeout has expired + log.Warn().Msg("Action not finished in time. Removing the entire plan") + d.context.CreateEvent(k8sutil.NewPlanTimeoutEvent(d.context.GetAPIObject(), string(planAction.Type), planAction.MemberID, planAction.Group.AsRole())) + // Replace plan with empty one and save it. 
+ status.Plan = api.Plan{} + if err := d.context.UpdateStatus(status); err != nil { + log.Debug().Err(err).Msg("Failed to update CR status") + return false, maskAny(err) + } + return true, nil + } + // Timeout not yet expired, come back soon return true, nil } // Continue with next action diff --git a/pkg/deployment/reconcile/timeouts.go b/pkg/deployment/reconcile/timeouts.go new file mode 100644 index 000000000..f7c31a3ce --- /dev/null +++ b/pkg/deployment/reconcile/timeouts.go @@ -0,0 +1,36 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package reconcile + +import "time" + +const ( + addMemberTimeout = time.Minute * 5 + cleanoutMemberTimeout = time.Hour * 12 + removeMemberTimeout = time.Minute * 15 + renewTLSCertificateTimeout = time.Minute * 30 + rotateMemberTimeout = time.Minute * 30 + shutdownMemberTimeout = time.Minute * 30 + upgradeMemberTimeout = time.Hour * 6 + waitForMemberUpTimeout = time.Minute * 15 +) diff --git a/pkg/deployment/resources/context.go b/pkg/deployment/resources/context.go index 3cb1636fc..386e72646 100644 --- a/pkg/deployment/resources/context.go +++ b/pkg/deployment/resources/context.go @@ -62,7 +62,7 @@ type Context interface { GetLifecycleImage() string // GetNamespace returns the namespace that contains the deployment GetNamespace() string - // createEvent creates a given event. + // CreateEvent creates a given event. // On error, the error is logged. CreateEvent(evt *v1.Event) // GetOwnedPods returns a list of all pods owned by the deployment. diff --git a/pkg/util/k8sutil/events.go b/pkg/util/k8sutil/events.go index 93fae7e83..6a03d9028 100644 --- a/pkg/util/k8sutil/events.go +++ b/pkg/util/k8sutil/events.go @@ -145,6 +145,16 @@ func NewAccessPackageDeletedEvent(apiObject APIObject, apSecretName string) *v1. return event } +// NewPlanTimeoutEvent creates an event indicating that an item on a reconciliation plan did not +// finish before its deadline. +func NewPlanTimeoutEvent(apiObject APIObject, itemType, memberID, role string) *v1.Event { + event := newDeploymentEvent(apiObject) + event.Type = v1.EventTypeNormal + event.Reason = "Reconciliation Plan Timeout" + event.Message = fmt.Sprintf("A plan item of type %s for member %s with role %s did not finish in time", itemType, memberID, role) + return event +} + // NewErrorEvent creates an even of type error. func NewErrorEvent(reason string, err error, apiObject APIObject) *v1.Event { event := newDeploymentEvent(apiObject)