Skip to content

Commit

Permalink
[#1074] feat: Introduce the metric of `local_storage_service_used_spa…
Browse files Browse the repository at this point in the history
…ce` (#1075)

### What changes were proposed in this pull request?

Currently, the `local_storage_used_space` metric reports the total used size of the whole disks.
However, when the shuffle server is co-located with a YARN NodeManager, this is not enough.

So this PR introduces the `local_storage_service_used_space` metric to
show the space used by Uniffle itself.

### Why are the changes needed?

Fix: #1074 

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

1. UTs
  • Loading branch information
zuston committed Aug 7, 2023
1 parent 19e7496 commit ef7f392
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ public LocalStorageChecker(ShuffleServerConf conf, List<LocalStorage> storages)
public boolean checkIsHealthy() {
AtomicInteger num = new AtomicInteger(0);
AtomicLong totalSpace = new AtomicLong(0L);
AtomicLong usedSpace = new AtomicLong(0L);
AtomicLong wholeDiskUsedSpace = new AtomicLong(0L);
AtomicLong serviceUsedSpace = new AtomicLong(0L);
AtomicInteger corruptedDirs = new AtomicInteger(0);
CountDownLatch cdl = new CountDownLatch(storageInfos.size());
storageInfos
Expand All @@ -93,7 +94,8 @@ public boolean checkIsHealthy() {
}

totalSpace.addAndGet(getTotalSpace(storageInfo.storageDir));
usedSpace.addAndGet(getUsedSpace(storageInfo.storageDir));
wholeDiskUsedSpace.addAndGet(getWholeDiskUsedSpace(storageInfo.storageDir));
serviceUsedSpace.addAndGet(getServiceUsedSpace(storageInfo.storageDir));

if (storageInfo.checkIsSpaceEnough()) {
num.incrementAndGet();
Expand All @@ -106,11 +108,12 @@ public boolean checkIsHealthy() {
LOG.error("Failed to check local storage!");
}
ShuffleServerMetrics.gaugeLocalStorageTotalSpace.set(totalSpace.get());
ShuffleServerMetrics.gaugeLocalStorageUsedSpace.set(usedSpace.get());
ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.set(wholeDiskUsedSpace.get());
ShuffleServerMetrics.gaugeLocalStorageServiceUsedSpace.set(serviceUsedSpace.get());
ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.set(storageInfos.size());
ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.set(corruptedDirs.get());
ShuffleServerMetrics.gaugeLocalStorageUsedSpaceRatio.set(
usedSpace.get() * 1.0 / totalSpace.get());
wholeDiskUsedSpace.get() * 1.0 / totalSpace.get());

if (storageInfos.isEmpty()) {
if (isHealthy) {
Expand Down Expand Up @@ -143,10 +146,36 @@ long getTotalSpace(File file) {

// Package-private so that tests can override it with mocked values
@VisibleForTesting
long getUsedSpace(File file) {
long getWholeDiskUsedSpace(File file) {
  // Used space of the partition that holds `file`: its total size minus the
  // bytes the JVM reports as usable on it. Unlike getServiceUsedSpace, this
  // counts everything on the partition, not only files under `file`.
  final long partitionTotal = file.getTotalSpace();
  final long partitionUsable = file.getUsableSpace();
  return partitionTotal - partitionUsable;
}

/**
 * Sums the lengths of all regular files located at or below {@code storageDir}.
 *
 * <p>The value is based on {@link File#length()}, i.e. the logical size of each file, and
 * sub-directories are walked recursively.
 *
 * @param storageDir the file or directory to measure; may be {@code null}
 * @return the accumulated length in bytes; {@code 0} when {@code storageDir} is {@code null},
 *     does not exist, or its entries cannot be listed
 */
protected static long getServiceUsedSpace(File storageDir) {
  if (storageDir == null) {
    return 0;
  }
  if (!storageDir.exists()) {
    return 0;
  }
  if (storageDir.isFile()) {
    // A plain file contributes exactly its own length.
    return storageDir.length();
  }

  final File[] children = storageDir.listFiles();
  long usedBytes = 0;
  // listFiles() yields null when the entries cannot be listed; treat that as empty.
  if (children != null) {
    for (int i = 0; i < children.length; i++) {
      final File child = children[i];
      usedBytes += child.isFile() ? child.length() : getServiceUsedSpace(child);
    }
  }
  return usedBytes;
}

// todo: This function will be integrated to MultiStorageManager, currently we only support disk
// check.
class StorageInfo {
Expand All @@ -167,7 +196,7 @@ boolean checkIsSpaceEnough() {
this.isHealthy = false;
return false;
}
double usagePercent = getUsedSpace(storageDir) * 100.0 / getTotalSpace(storageDir);
double usagePercent = getWholeDiskUsedSpace(storageDir) * 100.0 / getTotalSpace(storageDir);
if (isHealthy) {
if (Double.compare(usagePercent, diskMaxUsagePercentage) >= 0) {
isHealthy = false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ public class ShuffleServerMetrics {
private static final String LOCAL_STORAGE_TOTAL_DIRS_NUM = "local_storage_total_dirs_num";
private static final String LOCAL_STORAGE_CORRUPTED_DIRS_NUM = "local_storage_corrupted_dirs_num";
private static final String LOCAL_STORAGE_TOTAL_SPACE = "local_storage_total_space";
private static final String LOCAL_STORAGE_USED_SPACE = "local_storage_used_space";
private static final String LOCAL_STORAGE_WHOLE_DISK_USED_SPACE =
"local_storage_whole_disk_used_space";
private static final String LOCAL_STORAGE_SERVICE_USED_SPACE = "local_storage_service_used_space";
private static final String LOCAL_STORAGE_USED_SPACE_RATIO = "local_storage_used_space_ratio";

private static final String IS_HEALTHY = "is_healthy";
Expand Down Expand Up @@ -143,7 +145,8 @@ public class ShuffleServerMetrics {
public static Gauge.Child gaugeLocalStorageTotalDirsNum;
public static Gauge.Child gaugeLocalStorageCorruptedDirsNum;
public static Gauge.Child gaugeLocalStorageTotalSpace;
public static Gauge.Child gaugeLocalStorageUsedSpace;
public static Gauge.Child gaugeLocalStorageWholeDiskUsedSpace;
public static Gauge.Child gaugeLocalStorageServiceUsedSpace;
public static Gauge.Child gaugeLocalStorageUsedSpaceRatio;

public static Gauge.Child gaugeIsHealthy;
Expand Down Expand Up @@ -290,7 +293,10 @@ private static void setUpMetrics() {
gaugeLocalStorageCorruptedDirsNum =
metricsManager.addLabeledGauge(LOCAL_STORAGE_CORRUPTED_DIRS_NUM);
gaugeLocalStorageTotalSpace = metricsManager.addLabeledGauge(LOCAL_STORAGE_TOTAL_SPACE);
gaugeLocalStorageUsedSpace = metricsManager.addLabeledGauge(LOCAL_STORAGE_USED_SPACE);
gaugeLocalStorageWholeDiskUsedSpace =
metricsManager.addLabeledGauge(LOCAL_STORAGE_WHOLE_DISK_USED_SPACE);
gaugeLocalStorageServiceUsedSpace =
metricsManager.addLabeledGauge(LOCAL_STORAGE_SERVICE_USED_SPACE);
gaugeLocalStorageUsedSpaceRatio =
metricsManager.addLabeledGauge(LOCAL_STORAGE_USED_SPACE_RATIO);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.uniffle.server;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

/** Tests for the static disk-usage helper of {@link LocalStorageChecker}. */
public class LocalStorageCheckerTest {

  /**
   * Builds a small tree (two files at the root, one in a sub-directory and one in a nested
   * sub-directory) and checks that the reported usage equals the sum of the file lengths.
   */
  @Test
  public void testGetUniffleUsedSpace(@TempDir File tempDir) throws IOException {
    // Root level files.
    File rootFileA = createTempFile(tempDir, "file1.txt", 1000);
    File rootFileB = createTempFile(tempDir, "file2.txt", 2000);

    // First level of nesting.
    File firstLevelDir = createTempSubDirectory(tempDir, "subdir1");
    File firstLevelFile = createTempFile(firstLevelDir, "file3.txt", 500);

    // Second level of nesting.
    File secondLevelDir = createTempSubDirectory(firstLevelDir, "subdir2");
    File secondLevelFile = createTempFile(secondLevelDir, "file4.txt", 1500);

    // Directories themselves are not expected to contribute; only the four files do.
    long expectedUsage = 0;
    for (File written : new File[] {rootFileA, rootFileB, firstLevelFile, secondLevelFile}) {
      expectedUsage += written.length();
    }

    long calculatedUsage = LocalStorageChecker.getServiceUsedSpace(tempDir);

    Assertions.assertEquals(expectedUsage, calculatedUsage);
  }

  /** Writes {@code fileSize} zero bytes to {@code fileName} under {@code directory}. */
  private File createTempFile(File directory, String fileName, long fileSize) throws IOException {
    File target = new File(directory, fileName);
    byte[] zeros = new byte[(int) fileSize];
    Files.write(target.toPath(), zeros);
    return target;
  }

  /** Creates (including missing parents) and returns the named child directory. */
  private File createTempSubDirectory(File parentDirectory, String directoryName) {
    File child = new File(parentDirectory, directoryName);
    child.mkdirs();
    return child;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -69,22 +69,22 @@ public void checkTest(@TempDir File baseDir) throws Exception {

assertTrue(checker.checkIsHealthy());
assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
assertEquals(600, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
assertEquals(600, ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.get());
assertEquals(0.2, ShuffleServerMetrics.gaugeLocalStorageUsedSpaceRatio.get());
assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());

callTimes++;
assertTrue(checker.checkIsHealthy());
assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
assertEquals(1400, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
assertEquals(1400, ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.get());
assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());

callTimes++;
assertFalse(checker.checkIsHealthy());
assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
assertEquals(2100, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
assertEquals(2100, ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.get());
assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());

Expand All @@ -94,15 +94,15 @@ public void checkTest(@TempDir File baseDir) throws Exception {
checker = new MockStorageChecker(conf, storages);
assertFalse(checker.checkIsHealthy());
assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
assertEquals(1600, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
assertEquals(1600, ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.get());
assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());

callTimes++;
checker.checkIsHealthy();
assertTrue(checker.checkIsHealthy());
assertEquals(3000, ShuffleServerMetrics.gaugeLocalStorageTotalSpace.get());
assertEquals(250, ShuffleServerMetrics.gaugeLocalStorageUsedSpace.get());
assertEquals(250, ShuffleServerMetrics.gaugeLocalStorageWholeDiskUsedSpace.get());
assertEquals(3, ShuffleServerMetrics.gaugeLocalStorageTotalDirsNum.get());
assertEquals(0, ShuffleServerMetrics.gaugeLocalStorageCorruptedDirsNum.get());

Expand Down Expand Up @@ -133,7 +133,7 @@ long getTotalSpace(File file) {
// we mock this method, and will return different values according
// to call times.
@Override
long getUsedSpace(File file) {
long getWholeDiskUsedSpace(File file) {
long result = 0;
switch (file.getName()) {
case "st1":
Expand Down

0 comments on commit ef7f392

Please sign in to comment.