Reformat logs to support proper JSON logging (#255)
* Reformat logs to support proper JSON logging

* Make the log keys snake_case

* change webhook template log
haugenj committed Sep 17, 2020
1 parent f90e90c commit 37bae7f
Showing 11 changed files with 152 additions and 91 deletions.
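
The heart of the change, repeated across the files below, is swapping printf-style Msgf calls for zerolog's structured field builders, so that when JSON logging is enabled each piece of context becomes its own snake_case key instead of being baked into the message string. A minimal, self-contained sketch of the two styles (the event type and error values are illustrative, not taken from the handler):

package main

import (
	"errors"
	"os"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

func main() {
	// JSON logging: every field becomes a key in the emitted JSON object.
	log.Logger = zerolog.New(os.Stderr).With().Timestamp().Logger()

	err := errors.New("connection refused")

	// Old style: context is interpolated into the message string.
	log.Log().Msgf("There was a problem monitoring for %s events: %v", "SPOT_ITN", err)

	// New style: context lives in dedicated snake_case fields.
	log.Log().Str("event_type", "SPOT_ITN").Err(err).Msg("There was a problem monitoring for events")
}

With the JSON logger installed, the second call emits a line along the lines of {"event_type":"SPOT_ITN","error":"connection refused","message":"There was a problem monitoring for events",...}, which log pipelines can index without parsing message text.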
100 changes: 87 additions & 13 deletions cmd/node-termination-handler.go
@@ -53,6 +53,13 @@ func main() {
log.Fatal().Err(err).Msg("Failed to parse cli args,")
}

if nthConfig.JsonLogging {
log.Logger = zerolog.New(os.Stderr).With().Timestamp().Logger()
printJsonConfigArgs(nthConfig)
} else {
printHumanConfigArgs(nthConfig)
}

err = webhook.ValidateWebhookConfig(nthConfig)
if err != nil {
log.Fatal().Err(err).Msg("Webhook validation failed,")
@@ -62,10 +69,6 @@ func main() {
log.Fatal().Err(err).Msg("Unable to instantiate a node for various kubernetes node functions,")
}

if nthConfig.JsonLogging {
log.Logger = zerolog.New(os.Stderr).With().Timestamp().Logger()
}

metrics, err := observability.InitMetrics(nthConfig.EnablePrometheus, nthConfig.PrometheusPort)
if err != nil {
log.Fatal().Err(err).Msg("Unable to instantiate observability metrics,")
@@ -79,7 +82,7 @@ func main() {
if nthConfig.EnableScheduledEventDraining {
err = handleRebootUncordon(nthConfig.NodeName, interruptionEventStore, *node)
if err != nil {
log.Log().Msgf("Unable to complete the uncordon after reboot workflow on startup: %v", err)
log.Log().Err(err).Msg("Unable to complete the uncordon after reboot workflow on startup")
}
}

@@ -100,13 +103,13 @@ func main() {

for _, fn := range monitoringFns {
go func(monitor monitor.Monitor) {
log.Log().Msgf("Started monitoring for %s events", monitor.Kind())
log.Log().Str("event_type", monitor.Kind()).Msg("Started monitoring for events")
var previousErr error
var duplicateErrCount int
for range time.Tick(time.Second * 2) {
err := monitor.Monitor()
if err != nil {
log.Log().Msgf("There was a problem monitoring for %s events: %v", monitor.Kind(), err)
log.Log().Str("event_type", monitor.Kind()).Err(err).Msg("There was a problem monitoring for events")
metrics.ErrorEventsInc(monitor.Kind())
if err == previousErr {
duplicateErrCount++
@@ -178,7 +181,7 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int
log.Log().Msg("Uncordoning the node due to a cancellation event")
err := node.Uncordon(nodeName)
if err != nil {
log.Log().Msgf("Uncordoning the node failed: %v", err)
log.Log().Err(err).Msg("Uncordoning the node failed")
}
metrics.NodeActionsInc("uncordon", nodeName, err)

@@ -196,26 +199,26 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto
if drainEvent.PreDrainTask != nil {
err := drainEvent.PreDrainTask(*drainEvent, node)
if err != nil {
log.Log().Msgf("There was a problem executing the pre-drain task: %v", err)
log.Log().Err(err).Msg("There was a problem executing the pre-drain task")
}
metrics.NodeActionsInc("pre-drain", nodeName, err)
}

if nthConfig.CordonOnly {
err := node.Cordon(nodeName)
if err != nil {
log.Log().Msgf("There was a problem while trying to cordon the node: %v", err)
log.Log().Err(err).Msg("There was a problem while trying to cordon the node")
os.Exit(1)
}
log.Log().Msgf("Node %q successfully cordoned.", nodeName)
log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned")
metrics.NodeActionsInc("cordon", nodeName, err)
} else {
err := node.CordonAndDrain(nodeName)
if err != nil {
log.Log().Msgf("There was a problem while trying to cordon and drain the node: %v", err)
log.Log().Err(err).Msg("There was a problem while trying to cordon and drain the node")
os.Exit(1)
}
log.Log().Msgf("Node %q successfully cordoned and drained.", nodeName)
log.Log().Str("node_name", nodeName).Msg("Node successfully cordoned and drained")
metrics.NodeActionsInc("cordon-and-drain", nodeName, err)
}

@@ -229,3 +232,74 @@ func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Sto
func logFormatLevel(interface{}) string {
return ""
}

func printJsonConfigArgs(config config.Config) {
// manually setting fields instead of using log.Log().Interface() to use snake_case instead of PascalCase
// intentionally did not log webhook configuration as there may be secrets
log.Log().
Bool("dry_run", config.DryRun).
Str("node_name", config.NodeName).
Str("metadata_url", config.MetadataURL).
Str("kubernetes_service_host", config.KubernetesServiceHost).
Str("kubernetes_service_port", config.KubernetesServicePort).
Bool("delete_local_data", config.DeleteLocalData).
Bool("ignore_daemon_sets", config.IgnoreDaemonSets).
Int("pod_termination_grace_period", config.PodTerminationGracePeriod).
Int("node_termination_grace_period", config.NodeTerminationGracePeriod).
Bool("enable_scheduled_event_draining", config.EnableScheduledEventDraining).
Bool("enable_spot_interruption_draining", config.EnableSpotInterruptionDraining).
Int("metadata_tries", config.MetadataTries).
Bool("cordon_only", config.CordonOnly).
Bool("taint_node", config.TaintNode).
Bool("json_logging", config.JsonLogging).
Str("webhook_proxy", config.WebhookProxy).
Str("uptime_from_file", config.UptimeFromFile).
Bool("enable_prometheus_server", config.EnablePrometheus).
Int("prometheus_server_port", config.PrometheusPort).
Msg("aws-node-termination-handler arguments")
}

func printHumanConfigArgs(config config.Config) {
// intentionally did not log webhook configuration as there may be secrets
log.Log().Msgf(
"aws-node-termination-handler arguments: \n"+
"\tdry-run: %t,\n"+
"\tnode-name: %s,\n"+
"\tmetadata-url: %s,\n"+
"\tkubernetes-service-host: %s,\n"+
"\tkubernetes-service-port: %s,\n"+
"\tdelete-local-data: %t,\n"+
"\tignore-daemon-sets: %t,\n"+
"\tpod-termination-grace-period: %d,\n"+
"\tnode-termination-grace-period: %d,\n"+
"\tenable-scheduled-event-draining: %t,\n"+
"\tenable-spot-interruption-draining: %t,\n"+
"\tmetadata-tries: %d,\n"+
"\tcordon-only: %t,\n"+
"\ttaint-node: %t,\n"+
"\tjson-logging: %t,\n"+
"\twebhook-proxy: %s,\n"+
"\tuptime-from-file: %s,\n"+
"\tenable-prometheus-server: %t,\n"+
"\tprometheus-server-port: %d,\n",
config.DryRun,
config.NodeName,
config.MetadataURL,
config.KubernetesServiceHost,
config.KubernetesServicePort,
config.DeleteLocalData,
config.IgnoreDaemonSets,
config.PodTerminationGracePeriod,
config.NodeTerminationGracePeriod,
config.EnableScheduledEventDraining,
config.EnableSpotInterruptionDraining,
config.MetadataTries,
config.CordonOnly,
config.TaintNode,
config.JsonLogging,
config.WebhookProxy,
config.UptimeFromFile,
config.EnablePrometheus,
config.PrometheusPort,
)
}
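
The comment at the top of printJsonConfigArgs is worth unpacking: zerolog's Interface() JSON-marshals the value it is given, so a struct whose exported fields carry no json tags would surface PascalCase keys, while listing the fields by hand is what yields snake_case. A short sketch of the difference, where exampleConfig is a hypothetical stand-in rather than the real config.Config:

package main

import (
	"os"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

// exampleConfig is a hypothetical stand-in for config.Config: exported fields, no json tags.
type exampleConfig struct {
	DryRun   bool
	NodeName string
}

func main() {
	log.Logger = zerolog.New(os.Stderr).With().Timestamp().Logger()
	cfg := exampleConfig{DryRun: true, NodeName: "ip-10-0-0-1"}

	// Interface() mirrors the Go field names:
	// {"config":{"DryRun":true,"NodeName":"ip-10-0-0-1"},...}
	log.Log().Interface("config", cfg).Msg("aws-node-termination-handler arguments")

	// Hand-written fields give flat snake_case keys, as printJsonConfigArgs does:
	// {"dry_run":true,"node_name":"ip-10-0-0-1",...}
	log.Log().
		Bool("dry_run", cfg.DryRun).
		Str("node_name", cfg.NodeName).
		Msg("aws-node-termination-handler arguments")
}

The hand-written variant costs a few lines but keeps the log schema stable even if the Go struct fields are later renamed or tagged.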
43 changes: 0 additions & 43 deletions pkg/config/config.go
@@ -146,49 +146,6 @@ func ParseCliArgs() (config Config, err error) {
os.Setenv(kubernetesServiceHostConfigKey, config.KubernetesServiceHost)
os.Setenv(kubernetesServicePortConfigKey, config.KubernetesServicePort)

// intentionally did not log webhook configuration as there may be secrets
fmt.Printf(
"aws-node-termination-handler arguments: \n"+
"\tdry-run: %t,\n"+
"\tnode-name: %s,\n"+
"\tmetadata-url: %s,\n"+
"\tkubernetes-service-host: %s,\n"+
"\tkubernetes-service-port: %s,\n"+
"\tdelete-local-data: %t,\n"+
"\tignore-daemon-sets: %t,\n"+
"\tpod-termination-grace-period: %d,\n"+
"\tnode-termination-grace-period: %d,\n"+
"\tenable-scheduled-event-draining: %t,\n"+
"\tenable-spot-interruption-draining: %t,\n"+
"\tmetadata-tries: %d,\n"+
"\tcordon-only: %t,\n"+
"\ttaint-node: %t,\n"+
"\tjson-logging: %t,\n"+
"\twebhook-proxy: %s,\n"+
"\tuptime-from-file: %s,\n"+
"\tenable-prometheus-server: %t,\n"+
"\tprometheus-server-port: %d,\n",
config.DryRun,
config.NodeName,
config.MetadataURL,
config.KubernetesServiceHost,
config.KubernetesServicePort,
config.DeleteLocalData,
config.IgnoreDaemonSets,
config.PodTerminationGracePeriod,
config.NodeTerminationGracePeriod,
config.EnableScheduledEventDraining,
config.EnableSpotInterruptionDraining,
config.MetadataTries,
config.CordonOnly,
config.TaintNode,
config.JsonLogging,
config.WebhookProxy,
config.UptimeFromFile,
config.EnablePrometheus,
config.PrometheusPort,
)

return config, err
}

4 changes: 2 additions & 2 deletions pkg/ec2metadata/ec2metadata.go
@@ -190,7 +190,7 @@ func (e *Service) Request(contextPath string) (*http.Response, error) {
if err != nil {
e.v2Token = ""
e.tokenTTL = -1
log.Log().Msgf("Unable to retrieve an IMDSv2 token, continuing with IMDSv1: %v", err)
log.Log().Err(err).Msg("Unable to retrieve an IMDSv2 token, continuing with IMDSv1")
} else {
e.v2Token = token
e.tokenTTL = ttl
@@ -293,7 +293,7 @@ func (e *Service) GetNodeMetadata() NodeMetadata {
metadata.LocalIP, _ = e.GetMetadataInfo(LocalIPPath)
metadata.AvailabilityZone, _ = e.GetMetadataInfo(AZPlacementPath)

log.Log().Msgf("Startup Metadata Retrieved: %+v", metadata)
log.Log().Interface("metadata", metadata).Msg("Startup Metadata Retrieved")

return metadata
}
2 changes: 1 addition & 1 deletion pkg/monitor/scheduledevent/scheduled-event-monitor.go
@@ -94,7 +94,7 @@ func (m ScheduledEventMonitor) checkForScheduledEvents() ([]monitor.Interruption
notAfter, err := time.Parse(scheduledEventDateFormat, scheduledEvent.NotAfter)
if err != nil {
notAfter = notBefore
log.Log().Msgf("Unable to parse scheduled event end time, continuing: %v", err)
log.Log().Err(err).Msg("Unable to parse scheduled event end time, continuing")
}
events = append(events, monitor.InterruptionEvent{
EventID: scheduledEvent.EventID,
@@ -19,6 +19,8 @@ import (
"testing"
"time"

"github.com/rs/zerolog/log"

"github.com/aws/aws-node-termination-handler/pkg/config"
"github.com/aws/aws-node-termination-handler/pkg/monitor"
"github.com/aws/aws-node-termination-handler/pkg/node"
@@ -46,8 +48,8 @@ func getDrainHelper(client *fake.Clientset) *drain.Helper {
IgnoreAllDaemonSets: true,
DeleteLocalData: true,
Timeout: time.Duration(120) * time.Second,
Out: os.Stdout,
ErrOut: os.Stderr,
Out: log.Logger,
ErrOut: log.Logger,
}
}

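The test helper above (and the spot ITN helper in the next file) can hand log.Logger to drain.Helper's Out and ErrOut fields because zerolog.Logger implements io.Writer, so each write from the drain machinery becomes a log line of its own; with JSON logging enabled that keeps drain output inside valid JSON lines rather than raw text on stdout/stderr. A minimal sketch of that property, independent of the drain package (the pod name is made up):

package main

import (
	"fmt"
	"io"
	"os"

	"github.com/rs/zerolog"
)

func main() {
	logger := zerolog.New(os.Stderr).With().Timestamp().Logger()

	// zerolog.Logger satisfies io.Writer, so it can stand in wherever a
	// writer is expected -- for example drain.Helper's Out/ErrOut fields.
	var w io.Writer = logger

	// Each Write call is emitted as its own log event.
	fmt.Fprintln(w, "evicting pod default/my-app-12345")
}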
7 changes: 4 additions & 3 deletions pkg/monitor/spotitn/spot-itn-monitor_internal_test.go
@@ -14,10 +14,11 @@
package spotitn

import (
"os"
"testing"
"time"

"github.com/rs/zerolog/log"

"github.com/aws/aws-node-termination-handler/pkg/config"
"github.com/aws/aws-node-termination-handler/pkg/monitor"
"github.com/aws/aws-node-termination-handler/pkg/node"
@@ -39,8 +40,8 @@ func getSpotDrainHelper(client *fake.Clientset) *drain.Helper {
IgnoreAllDaemonSets: true,
DeleteLocalData: true,
Timeout: time.Duration(120) * time.Second,
Out: os.Stdout,
ErrOut: os.Stderr,
Out: log.Logger,
ErrOut: log.Logger,
}
}

