diff --git a/config/config.go b/config/config.go index 82a8bd4540e6..dfed4c5141f6 100644 --- a/config/config.go +++ b/config/config.go @@ -1138,14 +1138,17 @@ func getCPUTargeterConfig(v *viper.Viper) (tracker.TargeterConfig, error) { } } -func getDiskSpaceConfig(v *viper.Viper) (requiredAvailableDiskSpace uint64, warningThresholdAvailableDiskSpace uint64, err error) { +func getDiskSpaceConfig(v *viper.Viper) (requiredAvailableDiskSpace uint64, warningThresholdAvailableDiskSpace uint64, warningThresholdAvailableDiskSpacePercentage uint64, err error) { requiredAvailableDiskSpace = v.GetUint64(SystemTrackerRequiredAvailableDiskSpaceKey) warningThresholdAvailableDiskSpace = v.GetUint64(SystemTrackerWarningThresholdAvailableDiskSpaceKey) + warningThresholdAvailableDiskSpacePercentage = v.GetUint64(SystemTrackerWarnThreshAvailDiskSpacePercentageKey) switch { + case warningThresholdAvailableDiskSpacePercentage > 50: + return 0, 0, 0, fmt.Errorf("%q (%d) must be in [0, 50]", SystemTrackerWarnThreshAvailDiskSpacePercentageKey, warningThresholdAvailableDiskSpacePercentage) case warningThresholdAvailableDiskSpace < requiredAvailableDiskSpace: - return 0, 0, fmt.Errorf("%q (%d) < %q (%d)", SystemTrackerWarningThresholdAvailableDiskSpaceKey, warningThresholdAvailableDiskSpace, SystemTrackerRequiredAvailableDiskSpaceKey, requiredAvailableDiskSpace) + return 0, 0, 0, fmt.Errorf("%q (%d) < %q (%d)", SystemTrackerWarningThresholdAvailableDiskSpaceKey, warningThresholdAvailableDiskSpace, SystemTrackerRequiredAvailableDiskSpaceKey, requiredAvailableDiskSpace) default: - return requiredAvailableDiskSpace, warningThresholdAvailableDiskSpace, nil + return requiredAvailableDiskSpace, warningThresholdAvailableDiskSpace, warningThresholdAvailableDiskSpacePercentage, nil } } @@ -1400,7 +1403,7 @@ func GetNodeConfig(v *viper.Viper) (node.Config, error) { nodeConfig.SystemTrackerCPUHalflife = v.GetDuration(SystemTrackerCPUHalflifeKey) nodeConfig.SystemTrackerDiskHalflife = 
v.GetDuration(SystemTrackerDiskHalflifeKey) - nodeConfig.RequiredAvailableDiskSpace, nodeConfig.WarningThresholdAvailableDiskSpace, err = getDiskSpaceConfig(v) + nodeConfig.RequiredAvailableDiskSpace, nodeConfig.WarningThresholdAvailableDiskSpace, nodeConfig.WarningThresholdAvailableDiskSpacePercentage, err = getDiskSpaceConfig(v) if err != nil { return node.Config{}, err } diff --git a/config/flags.go b/config/flags.go index 0ea270ec5cc0..a3e66adfb6b7 100644 --- a/config/flags.go +++ b/config/flags.go @@ -359,6 +359,7 @@ func addNodeFlags(fs *pflag.FlagSet) { fs.Duration(SystemTrackerDiskHalflifeKey, time.Minute, "Halflife to use for the disk tracker. Larger halflife --> disk usage metrics change more slowly") fs.Uint64(SystemTrackerRequiredAvailableDiskSpaceKey, 10*units.GiB, "Minimum number of available bytes on disk, under which the node will shutdown.") fs.Uint64(SystemTrackerWarningThresholdAvailableDiskSpaceKey, 200*units.GiB, fmt.Sprintf("Warning threshold for the number of available bytes on disk, under which the node will be considered unhealthy. Must be >= [%s]", SystemTrackerRequiredAvailableDiskSpaceKey)) + fs.Uint64(SystemTrackerWarnThreshAvailDiskSpacePercentageKey, 3, "Warning threshold for the percentage (between 0 and 50) of available disk space, under which the node will be considered unhealthy.") // CPU management fs.Float64(CPUVdrAllocKey, float64(runtime.NumCPU()), "Maximum number of CPUs to allocate for use by validators. 
Value should be in range [0, total core count]") diff --git a/config/keys.go b/config/keys.go index b02e4d8d2e7b..2a411a31453d 100644 --- a/config/keys.go +++ b/config/keys.go @@ -193,6 +193,7 @@ const ( SystemTrackerDiskHalflifeKey = "system-tracker-disk-halflife" SystemTrackerRequiredAvailableDiskSpaceKey = "system-tracker-disk-required-available-space" SystemTrackerWarningThresholdAvailableDiskSpaceKey = "system-tracker-disk-warning-threshold-available-space" + SystemTrackerWarnThreshAvailDiskSpacePercentageKey = "system-tracker-disk-warning-threshold-available-space-percentage" DiskVdrAllocKey = "throttler-inbound-disk-validator-alloc" DiskMaxNonVdrUsageKey = "throttler-inbound-disk-max-non-validator-usage" DiskMaxNonVdrNodeUsageKey = "throttler-inbound-disk-max-non-validator-node-usage" diff --git a/config/node/config.go b/config/node/config.go index 79c5479143b4..1eb41c5d9cd1 100644 --- a/config/node/config.go +++ b/config/node/config.go @@ -205,8 +205,9 @@ type Config struct { DiskTargeterConfig tracker.TargeterConfig `json:"diskTargeterConfig"` - RequiredAvailableDiskSpace uint64 `json:"requiredAvailableDiskSpace"` - WarningThresholdAvailableDiskSpace uint64 `json:"warningThresholdAvailableDiskSpace"` + RequiredAvailableDiskSpace uint64 `json:"requiredAvailableDiskSpace"` + WarningThresholdAvailableDiskSpace uint64 `json:"warningThresholdAvailableDiskSpace"` + WarningThresholdAvailableDiskSpacePercentage uint64 `json:"warningThresholdAvailableDiskSpacePercentage"` TraceConfig trace.Config `json:"traceConfig"` diff --git a/node/node.go b/node/node.go index 2bf4e09058f4..edf06515cd13 100644 --- a/node/node.go +++ b/node/node.go @@ -1457,20 +1457,31 @@ func (n *Node) initHealthAPI() error { // if there is too little disk space remaining, first report unhealthy and then shutdown the node availableDiskBytes := n.resourceTracker.DiskTracker().AvailableDiskBytes() + availableDiskPercentage := n.resourceTracker.DiskTracker().AvailableDiskPercentage() - var err 
error + var diskSpaceErrors []error if availableDiskBytes < n.Config.RequiredAvailableDiskSpace { n.Log.Fatal("low on disk space. Shutting down...", zap.Uint64("remainingDiskBytes", availableDiskBytes), ) go n.Shutdown(1) - err = fmt.Errorf("remaining available disk space (%d) is below minimum required available space (%d)", availableDiskBytes, n.Config.RequiredAvailableDiskSpace) + err := fmt.Errorf("remaining available disk space (%d) is below minimum required available space (%d)", availableDiskBytes, n.Config.RequiredAvailableDiskSpace) + diskSpaceErrors = append(diskSpaceErrors, err) } else if availableDiskBytes < n.Config.WarningThresholdAvailableDiskSpace { - err = fmt.Errorf("remaining available disk space (%d) is below the warning threshold of disk space (%d)", availableDiskBytes, n.Config.WarningThresholdAvailableDiskSpace) + err := fmt.Errorf("remaining available disk space (%d) is below the warning threshold of disk space (%d)", availableDiskBytes, n.Config.WarningThresholdAvailableDiskSpace) + diskSpaceErrors = append(diskSpaceErrors, err) } + if availableDiskPercentage < n.Config.WarningThresholdAvailableDiskSpacePercentage { + err := fmt.Errorf("remaining available disk space percentage (%d%%) is below the warning threshold of disk space percentage (%d%%)", availableDiskPercentage, n.Config.WarningThresholdAvailableDiskSpacePercentage) + diskSpaceErrors = append(diskSpaceErrors, err) + } + + err := errors.Join(diskSpaceErrors...) // local err; nil when no threshold was crossed
+ return map[string]interface{}{ - "availableDiskBytes": availableDiskBytes, + "availableDiskBytes": availableDiskBytes, + "availableDiskPercentage": availableDiskPercentage, }, err }) diff --git a/snow/networking/tracker/resource_tracker.go b/snow/networking/tracker/resource_tracker.go index a76cea1c3e2a..65d7019ce2c5 100644 --- a/snow/networking/tracker/resource_tracker.go +++ b/snow/networking/tracker/resource_tracker.go @@ -36,6 +36,7 @@ type Tracker interface { type DiskTracker interface { Tracker AvailableDiskBytes() uint64 + AvailableDiskPercentage() uint64 } // ResourceTracker is an interface for tracking peers' usage of resources @@ -150,6 +151,16 @@ func (t *diskResourceTracker) AvailableDiskBytes() uint64 { return bytesAvailable } +func (t *diskResourceTracker) AvailableDiskPercentage() uint64 { + rt := t.t + rt.lock.Lock() + defer rt.lock.Unlock() + + percentageAvailable := rt.resources.AvailableDiskPercentage() + rt.metrics.diskPercentageAvailable.Set(float64(percentageAvailable)) + return percentageAvailable +} + func (t *diskResourceTracker) TotalUsage() float64 { realReadUsage, _ := t.t.resources.DiskUsage() return realReadUsage @@ -286,11 +297,12 @@ func (rt *resourceTracker) prune(now time.Time) { } type trackerMetrics struct { - processingTimeMetric prometheus.Gauge - cpuMetric prometheus.Gauge - diskReadsMetric prometheus.Gauge - diskWritesMetric prometheus.Gauge - diskSpaceAvailable prometheus.Gauge + processingTimeMetric prometheus.Gauge + cpuMetric prometheus.Gauge + diskReadsMetric prometheus.Gauge + diskWritesMetric prometheus.Gauge + diskSpaceAvailable prometheus.Gauge + diskPercentageAvailable prometheus.Gauge } func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) { @@ -315,6 +327,10 @@ func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) { Name: "disk_available_space", Help: "Available space remaining (bytes) on the database volume", }), + diskPercentageAvailable: 
prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "disk_available_percentage", + Help: "Percentage of database volume available", + }), } err := errors.Join( reg.Register(m.processingTimeMetric), @@ -322,6 +338,7 @@ func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) { reg.Register(m.diskReadsMetric), reg.Register(m.diskWritesMetric), reg.Register(m.diskSpaceAvailable), + reg.Register(m.diskPercentageAvailable), ) return m, err } diff --git a/utils/resource/no_usage.go b/utils/resource/no_usage.go index 47c754a9f041..da09f53fa403 100644 --- a/utils/resource/no_usage.go +++ b/utils/resource/no_usage.go @@ -10,6 +10,10 @@ var NoUsage User = noUsage{} type noUsage struct{} +func (noUsage) AvailableDiskPercentage() uint64 { + return 100 +} + func (noUsage) CPUUsage() float64 { return 0 } diff --git a/utils/resource/resourcemock/user.go b/utils/resource/resourcemock/user.go index d18ae4dbcf0b..a420c5e1329f 100644 --- a/utils/resource/resourcemock/user.go +++ b/utils/resource/resourcemock/user.go @@ -53,6 +53,20 @@ func (mr *UserMockRecorder) AvailableDiskBytes() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AvailableDiskBytes", reflect.TypeOf((*User)(nil).AvailableDiskBytes)) } +// AvailableDiskPercentage mocks base method. +func (m *User) AvailableDiskPercentage() uint64 { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "AvailableDiskPercentage") + ret0, _ := ret[0].(uint64) + return ret0 +} + +// AvailableDiskPercentage indicates an expected call of AvailableDiskPercentage. +func (mr *UserMockRecorder) AvailableDiskPercentage() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AvailableDiskPercentage", reflect.TypeOf((*User)(nil).AvailableDiskPercentage)) +} + // CPUUsage mocks base method. 
func (m *User) CPUUsage() float64 { m.ctrl.T.Helper() diff --git a/utils/resource/usage.go b/utils/resource/usage.go index 5fb02840da1f..6e68f269ec94 100644 --- a/utils/resource/usage.go +++ b/utils/resource/usage.go @@ -41,6 +41,9 @@ type DiskUser interface { // returns number of bytes available in the db volume AvailableDiskBytes() uint64 + + // returns percentage free in the db volume + AvailableDiskPercentage() uint64 } type User interface { @@ -82,6 +85,8 @@ type manager struct { availableDiskBytes uint64 + availableDiskPercent uint64 + closeOnce sync.Once onClose chan struct{} } @@ -132,6 +137,13 @@ func (m *manager) AvailableDiskBytes() uint64 { return m.availableDiskBytes } +func (m *manager) AvailableDiskPercentage() uint64 { + m.usageLock.RLock() + defer m.usageLock.RUnlock() + + return m.availableDiskPercent +} + func (m *manager) TrackProcess(pid int) { p, err := process.NewProcess(int32(pid)) if err != nil { @@ -174,7 +186,7 @@ func (m *manager) update(diskPath string, frequency, cpuHalflife, diskHalflife t currentScaledReadUsage := newDiskWeight * currentReadUsage currentScaledWriteUsage := newDiskWeight * currentWriteUsage - availableBytes, getBytesErr := storage.AvailableBytes(diskPath) + availableBytes, availablePercentage, getBytesErr := storage.AvailableBytes(diskPath) if getBytesErr != nil { m.log.Verbo("failed to lookup resource", zap.String("resource", "system disk"), @@ -190,6 +202,7 @@ func (m *manager) update(diskPath string, frequency, cpuHalflife, diskHalflife t if getBytesErr == nil { m.availableDiskBytes = availableBytes + m.availableDiskPercent = availablePercentage } m.usageLock.Unlock() diff --git a/utils/storage/storage_openbsd.go b/utils/storage/storage_openbsd.go index 3b335e580666..3d6ae1c45397 100644 --- a/utils/storage/storage_openbsd.go +++ b/utils/storage/storage_openbsd.go @@ -6,14 +6,23 @@ package storage -import "syscall" +import ( + "errors" + "syscall" +) -func AvailableBytes(storagePath string) (uint64, error) { +var 
errZeroAvailableBytes = errors.New("available blocks is reported as 0") + +func AvailableBytes(storagePath string) (uint64, uint64, error) { var stat syscall.Statfs_t err := syscall.Statfs(storagePath, &stat) if err != nil { - return 0, err + return 0, 0, err + } + if stat.F_blocks == 0 { + return 0, 0, errZeroAvailableBytes } - avail := uint64(stat.F_bavail) * uint64(stat.F_bsize) - return avail, nil + avail := uint64(stat.F_bavail) * uint64(stat.F_bsize) // F_bavail is a signed int64 on OpenBSD + percentage := uint64(stat.F_bavail) * 100 / stat.F_blocks + return avail, percentage, nil } diff --git a/utils/storage/storage_unix.go b/utils/storage/storage_unix.go index fa23655e7d85..f70e1910fabb 100644 --- a/utils/storage/storage_unix.go +++ b/utils/storage/storage_unix.go @@ -6,14 +6,23 @@ package storage -import "syscall" +import ( + "errors" + "syscall" +) -func AvailableBytes(storagePath string) (uint64, error) { +var errZeroAvailableBytes = errors.New("available blocks is reported as 0") + +func AvailableBytes(storagePath string) (uint64, uint64, error) { var stat syscall.Statfs_t err := syscall.Statfs(storagePath, &stat) if err != nil { - return 0, err + return 0, 0, err + } + if stat.Blocks == 0 { + return 0, 0, errZeroAvailableBytes } avail := stat.Bavail * uint64(stat.Bsize) - return avail, nil + percentage := stat.Bavail * 100 / stat.Blocks + return avail, percentage, nil }