Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 34 additions & 10 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -1207,15 +1207,26 @@ func getCPUTargeterConfig(v *viper.Viper) (tracker.TargeterConfig, error) {
}
}

func getDiskSpaceConfig(v *viper.Viper) (requiredAvailableDiskSpace uint64, warningThresholdAvailableDiskSpace uint64, err error) {
requiredAvailableDiskSpace = v.GetUint64(SystemTrackerRequiredAvailableDiskSpaceKey)
warningThresholdAvailableDiskSpace = v.GetUint64(SystemTrackerWarningThresholdAvailableDiskSpaceKey)
switch {
case warningThresholdAvailableDiskSpace < requiredAvailableDiskSpace:
return 0, 0, fmt.Errorf("%q (%d) < %q (%d)", SystemTrackerWarningThresholdAvailableDiskSpaceKey, warningThresholdAvailableDiskSpace, SystemTrackerRequiredAvailableDiskSpaceKey, requiredAvailableDiskSpace)
default:
return requiredAvailableDiskSpace, warningThresholdAvailableDiskSpace, nil
}
// getResourceAvailableConfig reads a pair of resource-availability thresholds
// from [v] and checks that they are ordered consistently.
//
// On success it returns, in order:
//   - the value stored under [requiredKey], under which the node may shut down
//   - the value stored under [warningKey], under which the node may report
//     unhealthy
//
// If the warning threshold is smaller than the required threshold, both
// returned thresholds are zero and the error names the two offending keys
// together with their values.
func getResourceAvailableConfig(
	v *viper.Viper,
	requiredKey string,
	warningKey string,
) (uint64, uint64, error) {
	var (
		required = v.GetUint64(requiredKey)
		warning  = v.GetUint64(warningKey)
	)

	// The warning threshold must be at least as large as the required one.
	if warning >= required {
		return required, warning, nil
	}
	return 0, 0, fmt.Errorf("%q (%d) < %q (%d)", warningKey, warning, requiredKey, required)
}

func getDiskTargeterConfig(v *viper.Viper) (tracker.TargeterConfig, error) {
Expand Down Expand Up @@ -1479,7 +1490,20 @@ func GetNodeConfig(v *viper.Viper) (node.Config, error) {
nodeConfig.SystemTrackerCPUHalflife = v.GetDuration(SystemTrackerCPUHalflifeKey)
nodeConfig.SystemTrackerDiskHalflife = v.GetDuration(SystemTrackerDiskHalflifeKey)

nodeConfig.RequiredAvailableDiskSpace, nodeConfig.WarningThresholdAvailableDiskSpace, err = getDiskSpaceConfig(v)
nodeConfig.RequiredAvailableMemory, nodeConfig.WarningThresholdAvailableMemory, err = getResourceAvailableConfig(
v,
SystemTrackerRequiredAvailableMemoryKey,
SystemTrackerWarningThresholdAvailableMemoryKey,
)
if err != nil {
return node.Config{}, err
}

nodeConfig.RequiredAvailableDiskSpace, nodeConfig.WarningThresholdAvailableDiskSpace, err = getResourceAvailableConfig(
v,
SystemTrackerRequiredAvailableDiskSpaceKey,
SystemTrackerWarningThresholdAvailableDiskSpaceKey,
)
if err != nil {
return node.Config{}, err
}
Expand Down
4 changes: 3 additions & 1 deletion config/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,9 @@ func addNodeFlags(fs *pflag.FlagSet) {
fs.Duration(SystemTrackerProcessingHalflifeKey, 15*time.Second, "Halflife to use for the processing requests tracker. Larger halflife --> usage metrics change more slowly")
fs.Duration(SystemTrackerCPUHalflifeKey, 15*time.Second, "Halflife to use for the cpu tracker. Larger halflife --> cpu usage metrics change more slowly")
fs.Duration(SystemTrackerDiskHalflifeKey, time.Minute, "Halflife to use for the disk tracker. Larger halflife --> disk usage metrics change more slowly")
fs.Uint64(SystemTrackerRequiredAvailableDiskSpaceKey, units.GiB/2, "Minimum number of available bytes on disk, under which the node will shutdown.")
fs.Uint64(SystemTrackerRequiredAvailableMemoryKey, units.GiB/2, "Minimum amount of available memory, under which the node may shutdown.")
fs.Uint64(SystemTrackerWarningThresholdAvailableMemoryKey, units.GiB, fmt.Sprintf("Warning threshold for the amount of available memory, under which the node will be considered unhealthy. Must be >= [%s]", SystemTrackerRequiredAvailableMemoryKey))
fs.Uint64(SystemTrackerRequiredAvailableDiskSpaceKey, units.GiB/2, "Minimum number of available bytes on disk, under which the node may shutdown.")
fs.Uint64(SystemTrackerWarningThresholdAvailableDiskSpaceKey, units.GiB, fmt.Sprintf("Warning threshold for the number of available bytes on disk, under which the node will be considered unhealthy. Must be >= [%s]", SystemTrackerRequiredAvailableDiskSpaceKey))

// CPU management
Expand Down
2 changes: 2 additions & 0 deletions config/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ const (
SystemTrackerProcessingHalflifeKey = "system-tracker-processing-halflife"
SystemTrackerCPUHalflifeKey = "system-tracker-cpu-halflife"
SystemTrackerDiskHalflifeKey = "system-tracker-disk-halflife"
SystemTrackerRequiredAvailableMemoryKey = "system-tracker-memory-required-available"
SystemTrackerWarningThresholdAvailableMemoryKey = "system-tracker-memory-warning-threshold-available"
SystemTrackerRequiredAvailableDiskSpaceKey = "system-tracker-disk-required-available-space"
SystemTrackerWarningThresholdAvailableDiskSpaceKey = "system-tracker-disk-warning-threshold-available-space"
DiskVdrAllocKey = "throttler-inbound-disk-validator-alloc"
Expand Down
3 changes: 3 additions & 0 deletions node/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,9 @@ type Config struct {

DiskTargeterConfig tracker.TargeterConfig `json:"diskTargeterConfig"`

RequiredAvailableMemory uint64 `json:"requiredAvailableMemory"`
WarningThresholdAvailableMemory uint64 `json:"warningThresholdAvailableMemory"`

RequiredAvailableDiskSpace uint64 `json:"requiredAvailableDiskSpace"`
WarningThresholdAvailableDiskSpace uint64 `json:"warningThresholdAvailableDiskSpace"`

Expand Down
61 changes: 56 additions & 5 deletions node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -1046,9 +1046,51 @@ func (n *Node) initHealthAPI() error {
return fmt.Errorf("couldn't register database health check: %w", err)
}

memorySpaceCheck := health.CheckerFunc(func(context.Context) (interface{}, error) {
	// Confirm that the node has enough memory to continue operating.
	//
	// - Below [RequiredAvailableMemory] the check logs an error and reports
	//   unhealthy. The node is not shut down yet; see the TODO below.
	// - Below [WarningThresholdAvailableMemory] the check only reports
	//   unhealthy.
	//
	// The used/available byte counts are always returned as the check's
	// details, whether or not an error is reported.

	usedMemoryBytes := n.resourceManager.MemoryUsage()
	availableMemoryBytes := n.resourceManager.AvailableMemoryBytes()

	var err error
	if availableMemoryBytes < n.Config.RequiredAvailableMemory {
		// TODO: log a FATAL and shutdown the node
		n.Log.Error("critically low on memory",
			zap.Uint64("usedMemory", usedMemoryBytes),
			zap.Uint64("availableMemory", availableMemoryBytes),
		)
		err = fmt.Errorf(
			"remaining available memory (%d) is below minimum required available memory (%d) when using (%d)",
			availableMemoryBytes,
			n.Config.RequiredAvailableMemory,
			usedMemoryBytes,
		)
	} else if availableMemoryBytes < n.Config.WarningThresholdAvailableMemory {
		// Report the memory warning threshold that was actually compared
		// against, not the disk-space one.
		err = fmt.Errorf(
			"remaining available memory (%d) is below the warning threshold of available memory (%d) when using (%d)",
			availableMemoryBytes,
			n.Config.WarningThresholdAvailableMemory,
			usedMemoryBytes,
		)
	}

	return map[string]interface{}{
		"usedMemoryBytes":      usedMemoryBytes,
		"availableMemoryBytes": availableMemoryBytes,
	}, err
})

err = n.health.RegisterHealthCheck("memory", memorySpaceCheck, health.GlobalTag)
if err != nil {
return fmt.Errorf("couldn't register memory resource health check: %w", err)
}

diskSpaceCheck := health.CheckerFunc(func(context.Context) (interface{}, error) {
// confirm that the node has enough disk space to continue operating
// if there is too little disk space remaining, first report unhealthy and then shutdown the node
// Confirm that the node has enough disk space to continue operating.
// If there is too little disk space remaining, first report unhealthy
// and then shutdown the node.

availableDiskBytes := n.resourceTracker.DiskTracker().AvailableDiskBytes()

Expand All @@ -1058,9 +1100,17 @@ func (n *Node) initHealthAPI() error {
zap.Uint64("remainingDiskBytes", availableDiskBytes),
)
go n.Shutdown(1)
err = fmt.Errorf("remaining available disk space (%d) is below minimum required available space (%d)", availableDiskBytes, n.Config.RequiredAvailableDiskSpace)
err = fmt.Errorf(
"remaining available disk space (%d) is below minimum required available space (%d)",
availableDiskBytes,
n.Config.RequiredAvailableDiskSpace,
)
} else if availableDiskBytes < n.Config.WarningThresholdAvailableDiskSpace {
err = fmt.Errorf("remaining available disk space (%d) is below the warning threshold of disk space (%d)", availableDiskBytes, n.Config.WarningThresholdAvailableDiskSpace)
err = fmt.Errorf(
"remaining available disk space (%d) is below the warning threshold of disk space (%d)",
availableDiskBytes,
n.Config.WarningThresholdAvailableDiskSpace,
)
}

return map[string]interface{}{
Expand All @@ -1070,7 +1120,7 @@ func (n *Node) initHealthAPI() error {

err = n.health.RegisterHealthCheck("diskspace", diskSpaceCheck, health.GlobalTag)
if err != nil {
return fmt.Errorf("couldn't register resource health check: %w", err)
return fmt.Errorf("couldn't register disk resource health check: %w", err)
}

handler, err := health.NewGetAndPostHandler(n.Log, healthChecker)
Expand Down Expand Up @@ -1197,6 +1247,7 @@ func (n *Node) initVdrs() validators.Set {
// Initialize [n.resourceManager].
func (n *Node) initResourceManager(reg prometheus.Registerer) error {
n.resourceManager = resource.NewManager(
n.Log,
n.Config.DatabaseConfig.Path,
n.Config.SystemTrackerFrequency,
n.Config.SystemTrackerCPUHalflife,
Expand Down
28 changes: 28 additions & 0 deletions utils/resource/mock_user.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions utils/resource/no_usage.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ func (noUsage) CPUUsage() float64 {
return 0
}

// MemoryUsage always returns 0: noUsage reports no memory in use.
func (noUsage) MemoryUsage() uint64 {
return 0
}

// AvailableMemoryBytes always returns math.MaxUint64, the largest
// representable amount of available memory.
// NOTE(review): presumably chosen so that any "available < threshold" check
// never trips for noUsage; confirm against callers.
func (noUsage) AvailableMemoryBytes() uint64 {
return math.MaxUint64
}

// DiskUsage always returns 0 for both of its results: noUsage reports no
// disk usage.
func (noUsage) DiskUsage() (float64, float64) {
return 0, 0
}
Expand Down
Loading