diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go
index 42963f35..1c0642d3 100644
--- a/cmd/node-termination-handler.go
+++ b/cmd/node-termination-handler.go
@@ -53,6 +53,13 @@ func main() {
         log.Fatal().Err(err).Msg("Failed to parse cli args,")
     }
 
+    if nthConfig.JsonLogging {
+        log.Logger = zerolog.New(os.Stderr).With().Timestamp().Logger()
+        printJsonConfigArgs(nthConfig)
+    } else {
+        printHumanConfigArgs(nthConfig)
+    }
+
     err = webhook.ValidateWebhookConfig(nthConfig)
     if err != nil {
         log.Fatal().Err(err).Msg("Webhook validation failed,")
@@ -62,10 +69,6 @@ func main() {
         log.Fatal().Err(err).Msg("Unable to instantiate a node for various kubernetes node functions,")
     }
 
-    if nthConfig.JsonLogging {
-        log.Logger = zerolog.New(os.Stderr).With().Timestamp().Logger()
-    }
-
     metrics, err := observability.InitMetrics(nthConfig.EnablePrometheus, nthConfig.PrometheusPort)
     if err != nil {
         log.Fatal().Err(err).Msg("Unable to instantiate observability metrics,")
@@ -79,7 +82,7 @@ func main() {
     if nthConfig.EnableScheduledEventDraining {
         err = handleRebootUncordon(nthConfig.NodeName, interruptionEventStore, *node)
         if err != nil {
-            log.Log().Msgf("Unable to complete the uncordon after reboot workflow on startup: %v", err)
+            log.Log().Err(err).Msg("Unable to complete the uncordon after reboot workflow on startup")
         }
     }
 
@@ -100,13 +103,13 @@ func main() {
 
     for _, fn := range monitoringFns {
         go func(monitor monitor.Monitor) {
-            log.Log().Msgf("Started monitoring for %s events", monitor.Kind())
+            log.Log().Str("event_type", monitor.Kind()).Msg("Started monitoring for events")
             var previousErr error
             var duplicateErrCount int
             for range time.Tick(time.Second * 2) {
                 err := monitor.Monitor()
                 if err != nil {
-                    log.Log().Msgf("There was a problem monitoring for %s events: %v", monitor.Kind(), err)
+                    log.Log().Str("event_type", monitor.Kind()).Err(err).Msg("There was a problem monitoring for events")
                     metrics.ErrorEventsInc(monitor.Kind())
                     if err == previousErr {
                         duplicateErrCount++
@@ -178,7 +181,7 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int
             log.Log().Msg("Uncordoning the node due to a cancellation event")
             err := node.Uncordon(nodeName)
             if err != nil {
-                log.Log().Msgf("Uncordoning the node failed: %v", err)
+                log.Log().Err(err).Msg("Uncordoning the node failed")
             }
             metrics.NodeActionsInc("uncordon", nodeName, err)
 
@@ -196,7 +199,7 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto
     if drainEvent.PreDrainTask != nil {
         err := drainEvent.PreDrainTask(*drainEvent, node)
         if err != nil {
-            log.Log().Msgf("There was a problem executing the pre-drain task: %v", err)
+            log.Log().Err(err).Msg("There was a problem executing the pre-drain task")
         }
         metrics.NodeActionsInc("pre-drain", nodeName, err)
     }
@@ -204,18 +207,18 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto
     if nthConfig.CordonOnly {
         err := node.Cordon(nodeName)
         if err != nil {
-            log.Log().Msgf("There was a problem while trying to cordon the node: %v", err)
+            log.Log().Err(err).Msg("There was a problem while trying to cordon the node")
             os.Exit(1)
         }
-        log.Log().Msgf("Node %q successfully cordoned.", nodeName)
+        log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned")
         metrics.NodeActionsInc("cordon", nodeName, err)
     } else {
         err := node.CordonAndDrain(nodeName)
         if err != nil {
-            log.Log().Msgf("There was a problem while trying to cordon and drain the node: %v", err)
+            log.Log().Err(err).Msg("There was a problem while trying to cordon and drain the node")
             os.Exit(1)
         }
-        log.Log().Msgf("Node %q successfully cordoned and drained.", nodeName)
+        log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned and drained")
         metrics.NodeActionsInc("cordon-and-drain", nodeName, err)
     }
 
@@ -229,3 +232,74 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto
 func logFormatLevel(interface{}) string {
     return ""
 }
+
+func printJsonConfigArgs(config config.Config) {
+    // manually setting fields instead of using log.Log().Interface() to use snake_case instead of PascalCase
+    // intentionally did not log webhook configuration as there may be secrets
+    log.Log().
+        Bool("dry_run", config.DryRun).
+        Str("node_name", config.NodeName).
+        Str("metadata_url", config.MetadataURL).
+        Str("kubernetes_service_host", config.KubernetesServiceHost).
+        Str("kubernetes_service_port", config.KubernetesServicePort).
+        Bool("delete_local_data", config.DeleteLocalData).
+        Bool("ignore_daemon_sets", config.IgnoreDaemonSets).
+        Int("pod_termination_grace_period", config.PodTerminationGracePeriod).
+        Int("node_termination_grace_period", config.NodeTerminationGracePeriod).
+        Bool("enable_scheduled_event_draining", config.EnableScheduledEventDraining).
+        Bool("enable_spot_interruption_draining", config.EnableSpotInterruptionDraining).
+        Int("metadata_tries", config.MetadataTries).
+        Bool("cordon_only", config.CordonOnly).
+        Bool("taint_node", config.TaintNode).
+        Bool("json_logging", config.JsonLogging).
+        Str("webhook_proxy", config.WebhookProxy).
+        Str("uptime_from_file", config.UptimeFromFile).
+        Bool("enable_prometheus_server", config.EnablePrometheus).
+        Int("prometheus_server_port", config.PrometheusPort).
+        Msg("aws-node-termination-handler arguments")
+}
+
+func printHumanConfigArgs(config config.Config) {
+    // intentionally did not log webhook configuration as there may be secrets
+    log.Log().Msgf(
+        "aws-node-termination-handler arguments: \n"+
+            "\tdry-run: %t,\n"+
+            "\tnode-name: %s,\n"+
+            "\tmetadata-url: %s,\n"+
+            "\tkubernetes-service-host: %s,\n"+
+            "\tkubernetes-service-port: %s,\n"+
+            "\tdelete-local-data: %t,\n"+
+            "\tignore-daemon-sets: %t,\n"+
+            "\tpod-termination-grace-period: %d,\n"+
+            "\tnode-termination-grace-period: %d,\n"+
+            "\tenable-scheduled-event-draining: %t,\n"+
+            "\tenable-spot-interruption-draining: %t,\n"+
+            "\tmetadata-tries: %d,\n"+
+            "\tcordon-only: %t,\n"+
+            "\ttaint-node: %t,\n"+
+            "\tjson-logging: %t,\n"+
+            "\twebhook-proxy: %s,\n"+
+            "\tuptime-from-file: %s,\n"+
+            "\tenable-prometheus-server: %t,\n"+
+            "\tprometheus-server-port: %d,\n",
+        config.DryRun,
+        config.NodeName,
+        config.MetadataURL,
+        config.KubernetesServiceHost,
+        config.KubernetesServicePort,
+        config.DeleteLocalData,
+        config.IgnoreDaemonSets,
+        config.PodTerminationGracePeriod,
+        config.NodeTerminationGracePeriod,
+        config.EnableScheduledEventDraining,
+        config.EnableSpotInterruptionDraining,
+        config.MetadataTries,
+        config.CordonOnly,
+        config.TaintNode,
+        config.JsonLogging,
+        config.WebhookProxy,
+        config.UptimeFromFile,
+        config.EnablePrometheus,
+        config.PrometheusPort,
+    )
+}
diff --git a/pkg/config/config.go b/pkg/config/config.go
index 1da23c4e..d43996bf 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -146,49 +146,6 @@ func ParseCliArgs() (config Config, err error) {
     os.Setenv(kubernetesServiceHostConfigKey, config.KubernetesServiceHost)
     os.Setenv(kubernetesServicePortConfigKey, config.KubernetesServicePort)
 
-    // intentionally did not log webhook configuration as there may be secrets
-    fmt.Printf(
-        "aws-node-termination-handler arguments: \n"+
-            "\tdry-run: %t,\n"+
-            "\tnode-name: %s,\n"+
-            "\tmetadata-url: %s,\n"+
-            "\tkubernetes-service-host: %s,\n"+
-            "\tkubernetes-service-port: %s,\n"+
-            "\tdelete-local-data: %t,\n"+
-            "\tignore-daemon-sets: %t,\n"+
-            "\tpod-termination-grace-period: %d,\n"+
-            "\tnode-termination-grace-period: %d,\n"+
-            "\tenable-scheduled-event-draining: %t,\n"+
-            "\tenable-spot-interruption-draining: %t,\n"+
-            "\tmetadata-tries: %d,\n"+
-            "\tcordon-only: %t,\n"+
-            "\ttaint-node: %t,\n"+
-            "\tjson-logging: %t,\n"+
-            "\twebhook-proxy: %s,\n"+
-            "\tuptime-from-file: %s,\n"+
-            "\tenable-prometheus-server: %t,\n"+
-            "\tprometheus-server-port: %d,\n",
-        config.DryRun,
-        config.NodeName,
-        config.MetadataURL,
-        config.KubernetesServiceHost,
-        config.KubernetesServicePort,
-        config.DeleteLocalData,
-        config.IgnoreDaemonSets,
-        config.PodTerminationGracePeriod,
-        config.NodeTerminationGracePeriod,
-        config.EnableScheduledEventDraining,
-        config.EnableSpotInterruptionDraining,
-        config.MetadataTries,
-        config.CordonOnly,
-        config.TaintNode,
-        config.JsonLogging,
-        config.WebhookProxy,
-        config.UptimeFromFile,
-        config.EnablePrometheus,
-        config.PrometheusPort,
-    )
-
     return config, err
 }
diff --git a/pkg/ec2metadata/ec2metadata.go b/pkg/ec2metadata/ec2metadata.go
index 935fafe7..f5d17404 100644
--- a/pkg/ec2metadata/ec2metadata.go
+++ b/pkg/ec2metadata/ec2metadata.go
@@ -190,7 +190,7 @@ func (e *Service) Request(contextPath string) (*http.Response, error) {
         if err != nil {
             e.v2Token = ""
             e.tokenTTL = -1
-            log.Log().Msgf("Unable to retrieve an IMDSv2 token, continuing with IMDSv1: %v", err)
+            log.Log().Err(err).Msg("Unable to retrieve an IMDSv2 token, continuing with IMDSv1")
         } else {
             e.v2Token = token
             e.tokenTTL = ttl
@@ -293,7 +293,7 @@ func (e *Service) GetNodeMetadata() NodeMetadata {
     metadata.LocalIP, _ = e.GetMetadataInfo(LocalIPPath)
     metadata.AvailabilityZone, _ = e.GetMetadataInfo(AZPlacementPath)
 
-    log.Log().Msgf("Startup Metadata Retrieved: %+v", metadata)
+    log.Log().Interface("metadata", metadata).Msg("Startup Metadata Retrieved")
     return metadata
 }
diff --git a/pkg/monitor/scheduledevent/scheduled-event-monitor.go b/pkg/monitor/scheduledevent/scheduled-event-monitor.go
index 6fc406fa..1072a49e 100644
--- a/pkg/monitor/scheduledevent/scheduled-event-monitor.go
+++ b/pkg/monitor/scheduledevent/scheduled-event-monitor.go
@@ -94,7 +94,7 @@ func (m ScheduledEventMonitor) checkForScheduledEvents() ([]monitor.Interruption
         notAfter, err := time.Parse(scheduledEventDateFormat, scheduledEvent.NotAfter)
         if err != nil {
             notAfter = notBefore
-            log.Log().Msgf("Unable to parse scheduled event end time, continuing: %v", err)
+            log.Log().Err(err).Msg("Unable to parse scheduled event end time, continuing")
         }
         events = append(events, monitor.InterruptionEvent{
             EventID:     scheduledEvent.EventID,
diff --git a/pkg/monitor/scheduledevent/scheduled-event-monitor_internal_test.go b/pkg/monitor/scheduledevent/scheduled-event-monitor_internal_test.go
index 233cf6d1..1e061b39 100644
--- a/pkg/monitor/scheduledevent/scheduled-event-monitor_internal_test.go
+++ b/pkg/monitor/scheduledevent/scheduled-event-monitor_internal_test.go
@@ -19,6 +19,8 @@ import (
     "testing"
     "time"
 
+    "github.com/rs/zerolog/log"
+
     "github.com/aws/aws-node-termination-handler/pkg/config"
     "github.com/aws/aws-node-termination-handler/pkg/monitor"
     "github.com/aws/aws-node-termination-handler/pkg/node"
@@ -46,8 +48,8 @@ func getDrainHelper(client *fake.Clientset) *drain.Helper {
         IgnoreAllDaemonSets: true,
         DeleteLocalData:     true,
         Timeout:             time.Duration(120) * time.Second,
-        Out:                 os.Stdout,
-        ErrOut:              os.Stderr,
+        Out:                 log.Logger,
+        ErrOut:              log.Logger,
     }
 }
diff --git a/pkg/monitor/spotitn/spot-itn-monitor_internal_test.go b/pkg/monitor/spotitn/spot-itn-monitor_internal_test.go
index dc22473a..6d808b9e 100644
--- a/pkg/monitor/spotitn/spot-itn-monitor_internal_test.go
+++ b/pkg/monitor/spotitn/spot-itn-monitor_internal_test.go
@@ -14,10 +14,11 @@ package spotitn
 import (
-    "os"
     "testing"
     "time"
 
+    "github.com/rs/zerolog/log"
+
     "github.com/aws/aws-node-termination-handler/pkg/config"
     "github.com/aws/aws-node-termination-handler/pkg/monitor"
     "github.com/aws/aws-node-termination-handler/pkg/node"
@@ -39,8 +40,8 @@ func getSpotDrainHelper(client *fake.Clientset) *drain.Helper {
         IgnoreAllDaemonSets: true,
         DeleteLocalData:     true,
         Timeout:             time.Duration(120) * time.Second,
-        Out:                 os.Stdout,
-        ErrOut:              os.Stderr,
+        Out:                 log.Logger,
+        ErrOut:              log.Logger,
     }
 }
diff --git a/pkg/node/node.go b/pkg/node/node.go
index b95c2adb..95d1b0e8 100644
--- a/pkg/node/node.go
+++ b/pkg/node/node.go
@@ -16,7 +16,6 @@ package node
 import (
     "encoding/json"
     "fmt"
-    "os"
     "strconv"
     "strings"
     "time"
@@ -86,7 +85,7 @@ func NewWithValues(nthConfig config.Config, drainHelper *drain.Helper, uptime up
 // CordonAndDrain will cordon the node and evict pods based on the config
 func (n Node) CordonAndDrain(nodeName string) error {
     if n.nthConfig.DryRun {
-        log.Log().Msgf("Node %s would have been cordoned and drained, but dry-run flag was set", nodeName)
+        log.Log().Str("node_name", nodeName).Msg("Node would have been cordoned and drained, but dry-run flag was set")
         return nil
     }
     log.Log().Msg("Cordoning the node")
@@ -106,7 +105,7 @@ func (n Node) CordonAndDrain(nodeName string) error {
 // Cordon will add a NoSchedule on the node
 func (n Node) Cordon(nodeName string) error {
     if n.nthConfig.DryRun {
-        log.Log().Msgf("Node %s would have been cordoned, but dry-run flag was set", nodeName)
+        log.Log().Str("node_name", nodeName).Msg("Node would have been cordoned, but dry-run flag was set")
         return nil
     }
     node, err := n.fetchKubernetesNode(nodeName)
@@ -123,7 +122,7 @@ func (n Node) Cordon(nodeName string) error {
 // Uncordon will remove the NoSchedule on the node
 func (n Node) Uncordon(nodeName string) error {
     if n.nthConfig.DryRun {
-        log.Log().Msgf("Node %s would have been uncordoned, but dry-run flag was set", nodeName)
+        log.Log().Str("node_name", nodeName).Msg("Node would have been uncordoned, but dry-run flag was set")
         return nil
     }
     node, err := n.fetchKubernetesNode(nodeName)
@@ -178,7 +177,7 @@ func (n Node) GetEventID(nodeName string) (string, error) {
     }
     val, ok := node.Labels[EventIDLabelKey]
     if n.nthConfig.DryRun && !ok {
-        log.Log().Msgf("Would have returned Error: Event ID Lable %s was not found on the node, but dry-run flag was set", EventIDLabelKey)
+        log.Log().Msgf("Would have returned Error: 'Event ID Label %s was not found on the node', but dry-run flag was set", EventIDLabelKey)
         return "", nil
     }
     if !ok {
@@ -408,8 +407,8 @@ func getDrainHelper(nthConfig config.Config) (*drain.Helper, error) {
         IgnoreAllDaemonSets: nthConfig.IgnoreDaemonSets,
         DeleteLocalData:     nthConfig.DeleteLocalData,
         Timeout:             time.Duration(nthConfig.NodeTerminationGracePeriod) * time.Second,
-        Out:                 os.Stdout,
-        ErrOut:              os.Stderr,
+        Out:                 log.Logger,
+        ErrOut:              log.Logger,
     }
     if nthConfig.DryRun {
         return drainHelper, nil
@@ -450,7 +449,10 @@ func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, e
             // Get the newest version of the node.
             freshNode, err = client.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{})
             if err != nil || freshNode == nil {
-                log.Log().Msgf("Error while adding %v taint on node %v: %v", taintKey, node.Name, err)
+                log.Log().
+                    Str("taint_key", taintKey).
+                    Str("node_name", node.Name).
+                    Msg("Error while adding taint on node")
                 return fmt.Errorf("failed to get node %v: %v", node.Name, err)
             }
         }
@@ -471,10 +473,16 @@ func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, e
         }
         if err != nil {
-            log.Log().Msgf("Error while adding %v taint on node %v: %v", taintKey, node.Name, err)
+            log.Log().
+                Str("taint_key", taintKey).
+                Str("node_name", node.Name).
+                Msg("Error while adding taint on node")
             return err
         }
-        log.Log().Msgf("Successfully added %v on node %v", taintKey, node.Name)
+        log.Log().
+            Str("taint_key", taintKey).
+            Str("node_name", node.Name).
+            Msg("Successfully added taint on node")
         return nil
     }
 }
@@ -482,7 +490,11 @@ func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, e
 func addTaintToSpec(node *corev1.Node, taintKey string, taintValue string, effect corev1.TaintEffect) bool {
     for _, taint := range node.Spec.Taints {
         if taint.Key == taintKey {
-            log.Log().Msgf("%v already present on node %v, taint: %v", taintKey, node.Name, taint)
+            log.Log().
+                Str("taint_key", taintKey).
+                Interface("taint", taint).
+                Str("node_name", node.Name).
+                Msg("Taint key already present on node")
             return false
         }
     }
@@ -504,14 +516,16 @@ func removeTaint(node *corev1.Node, client kubernetes.Interface, taintKey string
             // Get the newest version of the node.
             freshNode, err = client.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{})
             if err != nil || freshNode == nil {
-                log.Log().Msgf("Error while adding %v taint on node %v: %v", taintKey, node.Name, err)
                 return false, fmt.Errorf("failed to get node %v: %v", node.Name, err)
             }
         }
         newTaints := make([]corev1.Taint, 0)
         for _, taint := range freshNode.Spec.Taints {
             if taint.Key == taintKey {
-                log.Log().Msgf("Releasing taint %+v on node %v", taint, node.Name)
+                log.Log().
+                    Interface("taint", taint).
+                    Str("node_name", node.Name).
+                    Msg("Releasing taint on node")
             } else {
                 newTaints = append(newTaints, taint)
             }
@@ -535,10 +549,16 @@ func removeTaint(node *corev1.Node, client kubernetes.Interface, taintKey string
         }
         if err != nil {
-            log.Log().Msgf("Error while releasing %v taint on node %v: %v", taintKey, node.Name, err)
+            log.Log().
+                Str("taint_key", taintKey).
+                Str("node_name", node.Name).
+                Msg("Error while releasing taint on node")
             return false, err
         }
-        log.Log().Msgf("Successfully released %v on node %v", taintKey, node.Name)
+        log.Log().
+            Str("taint_key", taintKey).
+            Str("node_name", node.Name).
+            Msg("Successfully released taint on node")
         return true, nil
     }
 }
diff --git a/pkg/node/node_internal_test.go b/pkg/node/node_internal_test.go
index f17abdc6..5d2ba4d0 100644
--- a/pkg/node/node_internal_test.go
+++ b/pkg/node/node_internal_test.go
@@ -21,6 +21,8 @@ import (
     "testing"
     "time"
 
+    "github.com/rs/zerolog/log"
+
     "github.com/aws/aws-node-termination-handler/pkg/config"
     h "github.com/aws/aws-node-termination-handler/pkg/test"
     "github.com/aws/aws-node-termination-handler/pkg/uptime"
@@ -53,8 +55,8 @@ func getTestDrainHelper(client *fake.Clientset) *drain.Helper {
         IgnoreAllDaemonSets: true,
         DeleteLocalData:     true,
         Timeout:             time.Duration(120) * time.Second,
-        Out:                 os.Stdout,
-        ErrOut:              os.Stderr,
+        Out:                 log.Logger,
+        ErrOut:              log.Logger,
     }
 }
diff --git a/pkg/node/node_test.go b/pkg/node/node_test.go
index 5bc6636a..0deed198 100644
--- a/pkg/node/node_test.go
+++ b/pkg/node/node_test.go
@@ -20,6 +20,8 @@ import (
     "testing"
     "time"
 
+    "github.com/rs/zerolog/log"
+
     "github.com/aws/aws-node-termination-handler/pkg/config"
     "github.com/aws/aws-node-termination-handler/pkg/node"
     h "github.com/aws/aws-node-termination-handler/pkg/test"
@@ -46,8 +48,8 @@ func getDrainHelper(client *fake.Clientset) *drain.Helper {
         IgnoreAllDaemonSets: true,
         DeleteLocalData:     true,
         Timeout:             time.Duration(120) * time.Second,
-        Out:                 os.Stdout,
-        ErrOut:              os.Stderr,
+        Out:                 log.Logger,
+        ErrOut:              log.Logger,
     }
 }
diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go
index c8742dc0..0f2b1ba6 100644
--- a/pkg/webhook/webhook.go
+++ b/pkg/webhook/webhook.go
@@ -41,7 +41,10 @@ func Post(additionalInfo ec2metadata.NodeMetadata, event *monitor.InterruptionEv
     if nthConfig.WebhookTemplateFile != "" {
         content, err := ioutil.ReadFile(nthConfig.WebhookTemplateFile)
         if err != nil {
-            log.Log().Msgf("Webhook Error: Could not read template file %s - %s", nthConfig.WebhookTemplateFile, err)
+            log.Log().
+                Str("webhook_template_file", nthConfig.WebhookTemplateFile).
+                Err(err).
+                Msg("Webhook Error: Could not read template file")
             return
         }
         webhookTemplateContent = string(content)
@@ -52,7 +55,7 @@ func Post(additionalInfo ec2metadata.NodeMetadata, event *monitor.InterruptionEv
 
     webhookTemplate, err := template.New("message").Parse(webhookTemplateContent)
     if err != nil {
-        log.Log().Msgf("Webhook Error: Template parsing failed - %s", err)
+        log.Log().Err(err).Msg("Webhook Error: Template parsing failed")
         return
     }
 
@@ -61,20 +64,20 @@ func Post(additionalInfo ec2metadata.NodeMetadata, event *monitor.InterruptionEv
     var byteBuffer bytes.Buffer
     err = webhookTemplate.Execute(&byteBuffer, combined)
     if err != nil {
-        log.Log().Msgf("Webhook Error: Template execution failed - %s", err)
+        log.Log().Err(err).Msg("Webhook Error: Template execution failed")
         return
     }
 
     request, err := http.NewRequest("POST", nthConfig.WebhookURL, &byteBuffer)
     if err != nil {
-        log.Log().Msgf("Webhook Error: Http NewRequest failed - %s", err)
+        log.Log().Err(err).Msg("Webhook Error: Http NewRequest failed")
         return
     }
 
     headerMap := make(map[string]interface{})
     err = json.Unmarshal([]byte(nthConfig.WebhookHeaders), &headerMap)
     if err != nil {
-        log.Log().Msgf("Webhook Error: Header Unmarshal failed - %s", err)
+        log.Log().Err(err).Msg("Webhook Error: Header Unmarshal failed")
         return
     }
     for key, value := range headerMap {
@@ -99,14 +102,14 @@ func Post(additionalInfo ec2metadata.NodeMetadata, event *monitor.InterruptionEv
     }
     response, err := client.Do(request)
     if err != nil {
-        log.Log().Msgf("Webhook Error: Client Do failed - %s", err)
+        log.Log().Err(err).Msg("Webhook Error: Client Do failed")
         return
     }
     defer response.Body.Close()
 
     if response.StatusCode < 200 || response.StatusCode > 299 {
-        log.Log().Msgf("Webhook Error: Received Status Code %d", response.StatusCode)
+        log.Log().Int("status_code", response.StatusCode).Msg("Webhook Error: Received Non-Successful Status Code")
         return
     }
 
diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go
index d54fc979..03864c6e 100644
--- a/pkg/webhook/webhook_test.go
+++ b/pkg/webhook/webhook_test.go
@@ -46,7 +46,7 @@ func parseScheduledEventTime(inputTime string) time.Time {
 func getExpectedMessage(event *monitor.InterruptionEvent) string {
     webhookTemplate, err := template.New("").Parse(testWebhookTemplate)
     if err != nil {
-        log.Log().Msgf("Webhook Error: Template parsing failed - %s", err)
+        log.Log().Err(err).Msg("Webhook Error: Template parsing failed")
         return ""
     }