From 07c97227412cefb3f7c293bea8cdffbddd42ba04 Mon Sep 17 00:00:00 2001
From: sarvekshayr
Date: Fri, 8 May 2026 13:13:44 +0530
Subject: [PATCH] HDDS-15150. Datanode scanner should not mark container as
 UNHEALTHY when FD exhausted

---
Reviewer notes (text between the "---" separator and the diffstat is not
part of the commit message):
 * handleUnhealthyScanResult() now returns before any logging when
   ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(result) is true.
   NOTE(review): the skip path added here logs nothing; consider a warning
   so that skipped scans are visible to operators.
 * Detection is a case-insensitive substring match for "too many open
   files" on each message in the exception's cause chain.
   NOTE(review): this presumably relies on the platform emitting that exact
   English text - confirm for the supported platforms.
 * A scan result without errors is reported as "not only FD exhaustion"
   (scanErrorsAreOnlyTooManyOpenFiles returns false).

 .../ozoneimpl/ContainerScanHelper.java        |  7 ++
 .../ozoneimpl/ScanTransientIOUtil.java        | 59 +++++++++++++
 .../TestBackgroundContainerDataScanner.java   | 23 ++++++
 ...estBackgroundContainerMetadataScanner.java | 22 +++++
 .../ozoneimpl/TestScanTransientIOUtil.java    | 82 +++++++++++++++++++
 5 files changed, 193 insertions(+)
 create mode 100644 hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ScanTransientIOUtil.java
 create mode 100644 hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestScanTransientIOUtil.java

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanHelper.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanHelper.java
index 4c4a45c55d4a..aa299d4d76f9 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanHelper.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanHelper.java
@@ -115,7 +115,14 @@ public void scanMetadata(Container container)
     logScanCompleted(containerData, now);
   }
 
+  /**
+   * Marks container UNHEALTHY when the scan reports real errors.
+   * If every scan error is related to file-descriptor exhaustion, return without marking container unhealthy.
+   */
   public void handleUnhealthyScanResult(ContainerData containerData, ScanResult result) throws IOException {
+    if (ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(result)) {
+      return;
+    }
     long containerID = containerData.getContainerID();
     log.error("Corruption detected in container [{}]. Marking it UNHEALTHY. {}", containerID, result);
     if (log.isDebugEnabled()) {
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ScanTransientIOUtil.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ScanTransientIOUtil.java
new file mode 100644
index 000000000000..1c2c84e11397
--- /dev/null
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ScanTransientIOUtil.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.container.ozoneimpl;
+
+import java.util.Locale;
+import org.apache.hadoop.ozone.container.common.interfaces.ScanResult;
+
+/**
+ * Utility to catch transient scan failures (typically related to file-descriptor exhaustion)
+ * that should not be treated as container data corruption.
+ */
+public final class ScanTransientIOUtil {
+
+  private static final String TOO_MANY_OPEN_FILES = "too many open files";
+
+  private ScanTransientIOUtil() {
+  }
+
+  /**
+   * Returns true when every scan error is related to file-descriptor exhaustion.
+   * Each error's exception chain is checked via {@link #isTooManyOpenFiles(Throwable)}.
+   */
+  public static boolean scanErrorsAreOnlyTooManyOpenFiles(ScanResult scanResult) {
+    if (!scanResult.hasErrors()) {
+      return false;
+    }
+    return scanResult.getErrors().stream()
+        .allMatch(scanError -> isTooManyOpenFiles(scanError.getException()));
+  }
+
+  public static boolean isTooManyOpenFiles(Throwable throwable) {
+    for (Throwable cause = throwable; cause != null; cause = cause.getCause()) {
+      String message = cause.getMessage();
+      if (message != null && containsTooManyOpenFiles(message)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  private static boolean containsTooManyOpenFiles(String text) {
+    return text.toLowerCase(Locale.ROOT).contains(TOO_MANY_OPEN_FILES);
+  }
+}
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java
index 535982422545..2219e4952bd9 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java
@@ -44,6 +44,7 @@
 import java.io.IOException;
 import java.time.Duration;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Optional;
 import java.util.concurrent.Callable;
 import java.util.concurrent.CountDownLatch;
@@ -56,12 +57,14 @@
 import org.apache.hadoop.hdfs.util.Canceler;
 import org.apache.hadoop.hdfs.util.DataTransferThrottler;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.ozone.container.checksum.ContainerMerkleTreeWriter;
 import org.apache.hadoop.ozone.container.common.impl.ContainerData;
 import org.apache.hadoop.ozone.container.common.interfaces.Container;
 import org.apache.hadoop.ozone.container.common.interfaces.ScanResult;
 import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
 import org.apache.hadoop.ozone.container.metadata.DatanodeSchemaThreeDBDefinition;
 import org.apache.hadoop.ozone.container.metadata.DatanodeStoreSchemaThreeImpl;
+import org.apache.hadoop.ozone.container.ozoneimpl.ContainerScanError.FailureType;
 import org.apache.ozone.test.GenericTestUtils;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -402,4 +405,24 @@ public void testMerkleTreeWritten() throws Exception {
           .updateContainerChecksum(eq(container.getContainerData().getContainerID()), any());
     }
   }
+
+  /**
+   * When data scan reports only "too many open files" errors due to file-descriptor exhaustion,
+   * the container must not be marked UNHEALTHY.
+   */
+  @Test
+  public void testDataScanOnlyTooManyOpenFilesDoesNotMarkUnhealthy() throws Exception {
+    Container container = mockKeyValueContainer();
+    IOException ex = new IOException("Too many open files");
+    DataScanResult scanResult = DataScanResult.fromErrors(Collections.singletonList(
+        new ContainerScanError(FailureType.CORRUPT_CHUNK, new File("."), ex)),
+        new ContainerMerkleTreeWriter());
+    when(container.scanData(any(DataTransferThrottler.class), any(Canceler.class))).thenReturn(scanResult);
+
+    setContainers(container, healthy);
+    scanner.runIteration();
+
+    verify(controller, never()).markContainerUnhealthy(anyLong(), any(ScanResult.class));
+    assertEquals(0, scanner.getMetrics().getNumUnHealthyContainers());
+  }
 }
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java
index 9b6c6aed3f05..d57ff8330dcc 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java
@@ -38,8 +38,10 @@
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 
+import java.io.File;
 import java.io.IOException;
 import java.time.Duration;
+import java.util.Collections;
 import java.util.Optional;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
@@ -49,6 +51,7 @@
 import org.apache.hadoop.ozone.container.common.interfaces.Container;
 import org.apache.hadoop.ozone.container.common.interfaces.ScanResult;
 import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
+import org.apache.hadoop.ozone.container.ozoneimpl.ContainerScanError.FailureType;
 import org.apache.ozone.test.GenericTestUtils;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -256,4 +259,23 @@ public void testShutdownDuringScan() throws Exception {
     // The container should remain healthy.
     verifyContainerMarkedUnhealthy(healthy, never());
   }
+
+  /**
+   * When metadata scan reports only "too many open files" errors due to file-descriptor exhaustion,
+   * the container must not be marked UNHEALTHY.
+   */
+  @Test
+  public void testMetadataScanOnlyTooManyOpenFilesDoesNotMarkUnhealthy() throws Exception {
+    Container container = mockKeyValueContainer();
+    IOException emf = new IOException("Too many open files");
+    MetadataScanResult scanResult = MetadataScanResult.fromErrors(Collections.singletonList(
+        new ContainerScanError(FailureType.CORRUPT_CONTAINER_FILE, new File("."), emf)));
+    when(container.scanMetaData()).thenReturn(scanResult);
+
+    setContainers(container, healthy);
+    scanner.runIteration();
+
+    verify(controller, never()).markContainerUnhealthy(anyLong(), any(ScanResult.class));
+    assertEquals(0, scanner.getMetrics().getNumUnHealthyContainers());
+  }
 }
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestScanTransientIOUtil.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestScanTransientIOUtil.java
new file mode 100644
index 000000000000..a92603eb21a1
--- /dev/null
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestScanTransientIOUtil.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.container.ozoneimpl;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.file.FileSystemException;
+import java.util.Arrays;
+import java.util.Collections;
+import org.apache.hadoop.ozone.container.ozoneimpl.ContainerScanError.FailureType;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for {@link ScanTransientIOUtil}.
+ */
+public class TestScanTransientIOUtil {
+
+  @Test
+  public void detectsTooManyOpenFilesInFileSystemException() {
+    assertTrue(ScanTransientIOUtil.isTooManyOpenFiles(new FileSystemException(null, null, "Too many open files")));
+  }
+
+  @Test
+  public void detectsTooManyOpenFilesInFileNotFoundExceptionMessage() {
+    String msg = "/data/container/metadata/16341719.container (Too many open files)";
+    assertTrue(ScanTransientIOUtil.isTooManyOpenFiles(new FileNotFoundException(msg)));
+  }
+
+  @Test
+  public void detectsTooManyOpenFilesInMessageCauseChain() {
+    IOException throwable = new IOException("Too many open files");
+    assertTrue(ScanTransientIOUtil.isTooManyOpenFiles(new IOException(throwable)));
+  }
+
+  @Test
+  public void rejectsUnrelatedIOException() {
+    assertFalse(ScanTransientIOUtil.isTooManyOpenFiles(new IOException("disk full")));
+  }
+
+  @Test
+  public void scanErrorsOnlyTooManyOpenFilesReturnsTrue() {
+    IOException ex = new IOException("Too many open files");
+    MetadataScanResult scanResult = MetadataScanResult.fromErrors(Collections.singletonList(
+        new ContainerScanError(FailureType.CORRUPT_CONTAINER_FILE, new File("."), ex)));
+    assertTrue(ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(scanResult));
+  }
+
+  @Test
+  public void scanErrorsMixedReturnsFalse() {
+    IOException ioException = new IOException("Too many open files");
+    FileNotFoundException fileNotFoundException = new FileNotFoundException("missing");
+    MetadataScanResult scanResult = MetadataScanResult.fromErrors(Arrays.asList(
+        new ContainerScanError(FailureType.CORRUPT_CHUNK, new File("."), ioException),
+        new ContainerScanError(FailureType.MISSING_CONTAINER_FILE, new File("."), fileNotFoundException)));
+    assertFalse(ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(scanResult));
+  }
+
+  @Test
+  public void emptyScanResult() {
+    assertFalse(ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(
+        MetadataScanResult.fromErrors(Collections.emptyList())));
+  }
+}