From ef7f392eb8f2a918f9305483275e444991759f6d Mon Sep 17 00:00:00 2001 From: Junfan Zhang Date: Mon, 7 Aug 2023 14:12:10 +0800 Subject: [PATCH] [#1074] feat: Introduce the metric of `local_storage_service_used_space` (#1075) ### What changes were proposed in this pull request? Currently, the `local_storage_used_space` metric reports the total used space of the whole disks. When the shuffle server is co-located with a Yarn NodeManager, this is not enough, since that figure covers the whole disk rather than only the data written by Uniffle. So this PR renames the existing metric to `local_storage_whole_disk_used_space` and introduces the metric `local_storage_service_used_space` to show the space used by Uniffle itself. ### Why are the changes needed? Fix: #1074 ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? 1. UTs --- .../uniffle/server/LocalStorageChecker.java | 41 +++++++++++-- .../uniffle/server/ShuffleServerMetrics.java | 12 +++- .../server/LocalStorageCheckerTest.java | 60 +++++++++++++++++++ .../uniffle/server/StorageCheckerTest.java | 12 ++-- 4 files changed, 110 insertions(+), 15 deletions(-) create mode 100644 server/src/test/java/org/apache/uniffle/server/LocalStorageCheckerTest.java diff --git a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java index cbd9afd41f..c4151fa3af 100644 --- a/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java +++ b/server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java @@ -78,7 +78,8 @@ public LocalStorageChecker(ShuffleServerConf conf, List storages) public boolean checkIsHealthy() { AtomicInteger num = new AtomicInteger(0); AtomicLong totalSpace = new AtomicLong(0L); - AtomicLong usedSpace = new AtomicLong(0L); + AtomicLong wholeDiskUsedSpace = new AtomicLong(0L); + AtomicLong serviceUsedSpace = new AtomicLong(0L); AtomicInteger corruptedDirs = new AtomicInteger(0); CountDownLatch cdl = new CountDownLatch(storageInfos.size()); storageInfos @@ -93,7 +94,8 @@ public boolean checkIsHealthy() { } 
totalSpace.addAndGet(getTotalSpace(storageInfo.storageDir)); - usedSpace.addAndGet(getUsedSpace(storageInfo.storageDir)); + wholeDiskUsedSpace.addAndGet(getWholeDiskUsedSpace(storageInfo.storageDir)); + serviceUsedSpace.addAndGet(getServiceUsedSpace(storageInfo.storageDir)); if (storageInfo.checkIsSpaceEnough()) { num.incrementAndGet(); @@ -106,11 +108,12 @@ public boolean checkIsHealthy() { LOG.error("Failed to check local storage!"); } ShuffleServerMetrics.gaugeLocalStorageTotalSpace.set(totalSpace.get()); - ShuffleServerMetrics.gaugeLocalStorageUsedSpace.set(usedSpace.get()); + ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.set(wholeDiskUsedSpace.get()); + ShuffleServerMetrics.gaugeLocalStorageServiceUsedSpace.set(serviceUsedSpace.get()); ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.set(storageInfos.size()); ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.set(corruptedDirs.get()); ShuffleServerMetrics.gaugeLocalStorageUsedSpaceRatio.set( - usedSpace.get() * 1.0 / totalSpace.get()); + wholeDiskUsedSpace.get() * 1.0 / totalSpace.get()); if (storageInfos.isEmpty()) { if (isHealthy) { @@ -143,10 +146,36 @@ long getTotalSpace(File file) { // Only for testing @VisibleForTesting - long getUsedSpace(File file) { + long getWholeDiskUsedSpace(File file) { return file.getTotalSpace() - file.getUsableSpace(); } + protected static long getServiceUsedSpace(File storageDir) { + if (storageDir == null || !storageDir.exists()) { + return 0; + } + + if (storageDir.isFile()) { + return storageDir.length(); + } + + File[] files = storageDir.listFiles(); + if (files == null) { + return 0; + } + + long totalUsage = 0; + for (File file : files) { + if (file.isFile()) { + totalUsage += file.length(); + } else { + totalUsage += getServiceUsedSpace(file); + } + } + + return totalUsage; + } + // todo: This function will be integrated to MultiStorageManager, currently we only support disk // check. 
class StorageInfo { @@ -167,7 +196,7 @@ boolean checkIsSpaceEnough() { this.isHealthy = false; return false; } - double usagePercent = getUsedSpace(storageDir) * 100.0 / getTotalSpace(storageDir); + double usagePercent = getWholeDiskUsedSpace(storageDir) * 100.0 / getTotalSpace(storageDir); if (isHealthy) { if (Double.compare(usagePercent, diskMaxUsagePercentage) >= 0) { isHealthy = false; diff --git a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java index 0a8b5431fe..f7334b75ca 100644 --- a/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java +++ b/server/src/main/java/org/apache/uniffle/server/ShuffleServerMetrics.java @@ -61,7 +61,9 @@ public class ShuffleServerMetrics { private static final String LOCAL_STORAGE_TOTAL_DIRS_NUM = "local_storage_total_dirs_num"; private static final String LOCAL_STORAGE_CORRUPTED_DIRS_NUM = "local_storage_corrupted_dirs_num"; private static final String LOCAL_STORAGE_TOTAL_SPACE = "local_storage_total_space"; - private static final String LOCAL_STORAGE_USED_SPACE = "local_storage_used_space"; + private static final String LOCAL_STORAGE_WHOLE_DISK_USED_SPACE = + "local_storage_whole_disk_used_space"; + private static final String LOCAL_STORAGE_SERVICE_USED_SPACE = "local_storage_service_used_space"; private static final String LOCAL_STORAGE_USED_SPACE_RATIO = "local_storage_used_space_ratio"; private static final String IS_HEALTHY = "is_healthy"; @@ -143,7 +145,8 @@ public class ShuffleServerMetrics { public static Gauge.Child gaugeLocalStorageTotalDirsNum; public static Gauge.Child gaugeLocalStorageCorruptedDirsNum; public static Gauge.Child gaugeLocalStorageTotalSpace; - public static Gauge.Child gaugeLocalStorageUsedSpace; + public static Gauge.Child gaugeLocalStorageWholeDiskUsedSpace; + public static Gauge.Child gaugeLocalStorageServiceUsedSpace; public static Gauge.Child gaugeLocalStorageUsedSpaceRatio; 
public static Gauge.Child gaugeIsHealthy; @@ -290,7 +293,10 @@ private static void setUpMetrics() { gaugeLocalStorageCorruptedDirsNum = metricsManager.addLabeledGauge(LOCAL_STORAGE_CORRUPTED_DIRS_NUM); gaugeLocalStorageTotalSpace = metricsManager.addLabeledGauge(LOCAL_STORAGE_TOTAL_SPACE); - gaugeLocalStorageUsedSpace = metricsManager.addLabeledGauge(LOCAL_STORAGE_USED_SPACE); + gaugeLocalStorageWholeDiskUsedSpace = + metricsManager.addLabeledGauge(LOCAL_STORAGE_WHOLE_DISK_USED_SPACE); + gaugeLocalStorageServiceUsedSpace = + metricsManager.addLabeledGauge(LOCAL_STORAGE_SERVICE_USED_SPACE); gaugeLocalStorageUsedSpaceRatio = metricsManager.addLabeledGauge(LOCAL_STORAGE_USED_SPACE_RATIO); diff --git a/server/src/test/java/org/apache/uniffle/server/LocalStorageCheckerTest.java b/server/src/test/java/org/apache/uniffle/server/LocalStorageCheckerTest.java new file mode 100644 index 0000000000..e5eaba11d6 --- /dev/null +++ b/server/src/test/java/org/apache/uniffle/server/LocalStorageCheckerTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.uniffle.server; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class LocalStorageCheckerTest { + + @Test + public void testGetUniffleUsedSpace(@TempDir File tempDir) throws IOException { + File file1 = createTempFile(tempDir, "file1.txt", 1000); + File file2 = createTempFile(tempDir, "file2.txt", 2000); + File subdir1 = createTempSubDirectory(tempDir, "subdir1"); + File file3 = createTempFile(subdir1, "file3.txt", 500); + File subdir2 = createTempSubDirectory(subdir1, "subdir2"); + File file4 = createTempFile(subdir2, "file4.txt", 1500); + + // Call the method to calculate disk usage + long calculatedUsage = LocalStorageChecker.getServiceUsedSpace(tempDir); + + // The expected total usage should be the sum of file1 + file2 + file3 + file4 + long expectedUsage = file1.length() + file2.length() + file3.length() + file4.length(); + + // Assert that the calculated result matches the expected value + Assertions.assertEquals(expectedUsage, calculatedUsage); + } + + private File createTempFile(File directory, String fileName, long fileSize) throws IOException { + File file = new File(directory, fileName); + Files.write(file.toPath(), new byte[(int) fileSize]); + return file; + } + + private File createTempSubDirectory(File parentDirectory, String directoryName) { + File subDir = new File(parentDirectory, directoryName); + subDir.mkdirs(); + return subDir; + } +} diff --git a/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java b/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java index 632a8abc5b..71f97ebbaa 100644 --- a/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java +++ b/server/src/test/java/org/apache/uniffle/server/StorageCheckerTest.java @@ -69,7 +69,7 @@ public void checkTest(@TempDir File baseDir) throws Exception 
{ assertTrue(checker.checkIsHealthy()); assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get()); - assertEquals(600, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get()); + assertEquals(600, ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.get()); assertEquals(0.2, ShuffleServerMetrics.gaugeLocalStorageUsedSpaceRatio.get()); assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get()); assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get()); @@ -77,14 +77,14 @@ public void checkTest(@TempDir File baseDir) throws Exception { callTimes++; assertTrue(checker.checkIsHealthy()); assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get()); - assertEquals(1400, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get()); + assertEquals(1400, ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.get()); assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get()); assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get()); callTimes++; assertFalse(checker.checkIsHealthy()); assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get()); - assertEquals(2100, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get()); + assertEquals(2100, ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.get()); assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get()); assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get()); @@ -94,7 +94,7 @@ public void checkTest(@TempDir File baseDir) throws Exception { checker = new MockStorageChecker(conf, storages); assertFalse(checker.checkIsHealthy()); assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get()); - assertEquals(1600, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get()); + assertEquals(1600, ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.get()); assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get()); assertEquals(0, 
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get()); @@ -102,7 +102,7 @@ public void checkTest(@TempDir File baseDir) throws Exception { checker.checkIsHealthy(); assertTrue(checker.checkIsHealthy()); assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get()); - assertEquals(250, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get()); + assertEquals(250, ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.get()); assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get()); assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get()); @@ -133,7 +133,7 @@ long getTotalSpace(File file) { // we mock this method, and will return different values according // to call times. @Override - long getUsedSpace(File file) { + long getWholeDiskUsedSpace(File file) { long result = 0; switch (file.getName()) { case "st1":