Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,14 @@ public void scanMetadata(Container<?> container)
logScanCompleted(containerData, now);
}

/**
* Marks container UNHEALTHY when the scan reports real errors.
* If every scan error is related to file-descriptor exhaustion, return without marking container unhealthy.
*/
public void handleUnhealthyScanResult(ContainerData containerData, ScanResult result) throws IOException {
if (ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(result)) {
return;
}
long containerID = containerData.getContainerID();
Comment on lines +123 to 126
log.error("Corruption detected in container [{}]. Marking it UNHEALTHY. {}", containerID, result);
if (log.isDebugEnabled()) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.ozone.container.ozoneimpl;

import java.util.Locale;
import org.apache.hadoop.ozone.container.common.interfaces.ScanResult;

/**
* Utility to catch transient scan failures (typically related to file-descriptor exhaustion)
* that should not be treated as container data corruption.
*/
public final class ScanTransientIOUtil {

private static final String TOO_MANY_OPEN_FILES = "too many open files";

private ScanTransientIOUtil() {
}

/**
* Returns true when every scan error is related to file-descriptor exhaustion.
* Each error's exception chain is checked via {@link #isTooManyOpenFiles(Throwable)}.
*/
public static boolean scanErrorsAreOnlyTooManyOpenFiles(ScanResult scanResult) {
if (!scanResult.hasErrors()) {
return false;
}
return scanResult.getErrors().stream()
.allMatch(scanError -> isTooManyOpenFiles(scanError.getException()));
}

public static boolean isTooManyOpenFiles(Throwable throwable) {
for (Throwable cause = throwable; cause != null; cause = cause.getCause()) {
String message = cause.getMessage();
if (message != null && containsTooManyOpenFiles(message)) {
return true;
Comment on lines +48 to +50
}
}
return false;
}
Comment on lines +46 to +54

private static boolean containsTooManyOpenFiles(String text) {
return text.toLowerCase(Locale.ROOT).contains(TOO_MANY_OPEN_FILES);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import java.io.IOException;
import java.time.Duration;
import java.util.Arrays;
import java.util.Collections;
import java.util.Optional;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
Expand All @@ -56,12 +57,14 @@
import org.apache.hadoop.hdfs.util.Canceler;
import org.apache.hadoop.hdfs.util.DataTransferThrottler;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.ozone.container.checksum.ContainerMerkleTreeWriter;
import org.apache.hadoop.ozone.container.common.impl.ContainerData;
import org.apache.hadoop.ozone.container.common.interfaces.Container;
import org.apache.hadoop.ozone.container.common.interfaces.ScanResult;
import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
import org.apache.hadoop.ozone.container.metadata.DatanodeSchemaThreeDBDefinition;
import org.apache.hadoop.ozone.container.metadata.DatanodeStoreSchemaThreeImpl;
import org.apache.hadoop.ozone.container.ozoneimpl.ContainerScanError.FailureType;
import org.apache.ozone.test.GenericTestUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
Expand Down Expand Up @@ -402,4 +405,24 @@ public void testMerkleTreeWritten() throws Exception {
.updateContainerChecksum(eq(container.getContainerData().getContainerID()), any());
}
}

/**
 * A data scan whose only reported error is a "too many open files" failure (file-descriptor
 * exhaustion) must leave the container alone: it is not marked UNHEALTHY and the unhealthy
 * metric stays at zero.
 */
@Test
public void testDataScanOnlyTooManyOpenFilesDoesNotMarkUnhealthy() throws Exception {
  Container<?> target = mockKeyValueContainer();

  // Build a scan result holding exactly one error, caused by descriptor exhaustion.
  IOException fdExhausted = new IOException("Too many open files");
  ContainerScanError transientError =
      new ContainerScanError(FailureType.CORRUPT_CHUNK, new File("."), fdExhausted);
  DataScanResult transientOnlyResult = DataScanResult.fromErrors(
      Collections.singletonList(transientError), new ContainerMerkleTreeWriter());
  when(target.scanData(any(DataTransferThrottler.class), any(Canceler.class)))
      .thenReturn(transientOnlyResult);

  // Run one scanner pass over this container plus the known-healthy one.
  setContainers(target, healthy);
  scanner.runIteration();

  // Nothing may have been reported as unhealthy.
  verify(controller, never()).markContainerUnhealthy(anyLong(), any(ScanResult.class));
  assertEquals(0, scanner.getMetrics().getNumUnHealthyContainers());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,10 @@
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import java.io.File;
import java.io.IOException;
import java.time.Duration;
import java.util.Collections;
import java.util.Optional;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
Expand All @@ -49,6 +51,7 @@
import org.apache.hadoop.ozone.container.common.interfaces.Container;
import org.apache.hadoop.ozone.container.common.interfaces.ScanResult;
import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
import org.apache.hadoop.ozone.container.ozoneimpl.ContainerScanError.FailureType;
import org.apache.ozone.test.GenericTestUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
Expand Down Expand Up @@ -256,4 +259,23 @@ public void testShutdownDuringScan() throws Exception {
// The container should remain healthy.
verifyContainerMarkedUnhealthy(healthy, never());
}

/**
 * A metadata scan whose only reported error is a "too many open files" failure
 * (file-descriptor exhaustion) must leave the container alone: it is not marked UNHEALTHY and
 * the unhealthy metric stays at zero.
 */
@Test
public void testMetadataScanOnlyTooManyOpenFilesDoesNotMarkUnhealthy() throws Exception {
  Container<?> target = mockKeyValueContainer();

  // Build a scan result holding exactly one error, caused by descriptor exhaustion.
  IOException fdExhausted = new IOException("Too many open files");
  ContainerScanError transientError =
      new ContainerScanError(FailureType.CORRUPT_CONTAINER_FILE, new File("."), fdExhausted);
  MetadataScanResult transientOnlyResult =
      MetadataScanResult.fromErrors(Collections.singletonList(transientError));
  when(target.scanMetaData()).thenReturn(transientOnlyResult);

  // Run one scanner pass over this container plus the known-healthy one.
  setContainers(target, healthy);
  scanner.runIteration();

  // Nothing may have been reported as unhealthy.
  verify(controller, never()).markContainerUnhealthy(anyLong(), any(ScanResult.class));
  assertEquals(0, scanner.getMetrics().getNumUnHealthyContainers());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.ozone.container.ozoneimpl;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.FileSystemException;
import java.util.Arrays;
import java.util.Collections;
import org.apache.hadoop.ozone.container.ozoneimpl.ContainerScanError.FailureType;
import org.junit.jupiter.api.Test;

/**
 * Tests for {@link ScanTransientIOUtil}: detection of the "too many open files" message on a
 * single exception and through a cause chain, and classification of whole scan results.
 */
public class TestScanTransientIOUtil {

  /** The message is carried as the reason of a {@link FileSystemException}. */
  @Test
  public void detectsTooManyOpenFilesInFileSystemException() {
    FileSystemException fsException = new FileSystemException(null, null, "Too many open files");
    assertTrue(ScanTransientIOUtil.isTooManyOpenFiles(fsException));
  }

  /** The message is embedded after a file path in a {@link FileNotFoundException}. */
  @Test
  public void detectsTooManyOpenFilesInFileNotFoundExceptionMessage() {
    FileNotFoundException notFound = new FileNotFoundException(
        "/data/container/metadata/16341719.container (Too many open files)");
    assertTrue(ScanTransientIOUtil.isTooManyOpenFiles(notFound));
  }

  /** The message sits on the cause rather than on the outermost exception. */
  @Test
  public void detectsTooManyOpenFilesInMessageCauseChain() {
    IOException rootCause = new IOException("Too many open files");
    IOException wrapper = new IOException(rootCause);
    assertTrue(ScanTransientIOUtil.isTooManyOpenFiles(wrapper));
  }

  /** An I/O failure with an unrelated message is not classified as descriptor exhaustion. */
  @Test
  public void rejectsUnrelatedIOException() {
    IOException unrelated = new IOException("disk full");
    assertFalse(ScanTransientIOUtil.isTooManyOpenFiles(unrelated));
  }

  /** A result whose single error is descriptor exhaustion is classified as transient-only. */
  @Test
  public void scanErrorsOnlyTooManyOpenFilesReturnsTrue() {
    IOException fdExhausted = new IOException("Too many open files");
    ContainerScanError onlyError =
        new ContainerScanError(FailureType.CORRUPT_CONTAINER_FILE, new File("."), fdExhausted);
    MetadataScanResult result = MetadataScanResult.fromErrors(Collections.singletonList(onlyError));
    assertTrue(ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(result));
  }

  /** One unrelated error alongside a descriptor-exhaustion error makes the result non-transient. */
  @Test
  public void scanErrorsMixedReturnsFalse() {
    IOException fdExhausted = new IOException("Too many open files");
    FileNotFoundException missingFile = new FileNotFoundException("missing");
    ContainerScanError transientError =
        new ContainerScanError(FailureType.CORRUPT_CHUNK, new File("."), fdExhausted);
    ContainerScanError realError =
        new ContainerScanError(FailureType.MISSING_CONTAINER_FILE, new File("."), missingFile);
    MetadataScanResult result =
        MetadataScanResult.fromErrors(Arrays.asList(transientError, realError));
    assertFalse(ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(result));
  }

  /** A result with no errors at all is not reported as "only too many open files". */
  @Test
  public void emptyScanResult() {
    MetadataScanResult noErrors = MetadataScanResult.fromErrors(Collections.emptyList());
    assertFalse(ScanTransientIOUtil.scanErrorsAreOnlyTooManyOpenFiles(noErrors));
  }
}
Loading