HDFS-9231. fsck doesn't list correct file path when Bad Replicas/Blocks are in a snapshot. (Xiao Chen via Yongjun Zhang)

Yongjun Zhang committed Oct 28, 2015
1 parent bf8e452 commit 97913f4
Showing 8 changed files with 227 additions and 14 deletions.
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (3 additions, 0 deletions)
@@ -2167,6 +2167,9 @@ Release 2.8.0 - UNRELEASED


HDFS-9268. fuse_dfs chown crashes when uid is passed as -1 (cmccabe)


HDFS-9231. fsck doesn't list correct file path when Bad Replicas/Blocks
are in a snapshot. (Xiao Chen via Yongjun Zhang)

Release 2.7.2 - UNRELEASED


INCOMPATIBLE CHANGES
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirSnapshotOp.java

@@ -29,9 +29,13 @@
import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectorySnapshottableFeature;
import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
import org.apache.hadoop.hdfs.util.ReadOnlyList;
import org.apache.hadoop.util.ChunkedArrayList;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;


class FSDirSnapshotOp {
@@ -159,6 +163,40 @@ static SnapshotDiffReport getSnapshotDiffReport(FSDirectory fsd,
return diffs;
}


/** Get the full snapshot paths of a given file under the given
* snapshottable directories.
* @param fsd the FSDirectory used to check that each candidate snapshot path exists
* @param lsf a list of snapshottable features
* @param file full path of the file
* @return collection of full paths of the snapshot copies of the file
*/
static Collection<String> getSnapshotFiles(FSDirectory fsd,
List<DirectorySnapshottableFeature> lsf,
String file) throws IOException {
ArrayList<String> snaps = new ArrayList<String>();
for (DirectorySnapshottableFeature sf : lsf) {
// for each snapshottable dir e.g. /dir1, /dir2
final ReadOnlyList<Snapshot> lsnap = sf.getSnapshotList();
for (Snapshot s : lsnap) {
// for each snapshot name under snapshottable dir
// e.g. /dir1/.snapshot/s1, /dir1/.snapshot/s2
final String dirName = s.getRoot().getRootFullPathName();
if (!file.startsWith(dirName)) {
// file not in current snapshot root dir, no need to check other snaps
break;
}
String snapname = s.getRoot().getFullPathName();
if (dirName.equals(Path.SEPARATOR)) { // handle rootDir
snapname += Path.SEPARATOR;
}
snapname += file.substring(file.indexOf(dirName) + dirName.length());
if (fsd.getFSNamesystem().getFileInfo(snapname, true) != null) {
snaps.add(snapname);
}
}
}
return snaps;
}
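
A small standalone sketch (not part of the patch; class name and values are illustrative) of the path arithmetic used above: a corrupt file's live path is mapped into a snapshot by replacing the snapshottable directory prefix with the snapshot root, with a special case when the snapshottable directory is the filesystem root.

// Illustrative sketch only -- mirrors the string handling in getSnapshotFiles().
public class SnapshotPathSketch {
  private static final String SEPARATOR = "/";

  // e.g. snapshotRoot="/dir1/.snapshot/s1", dir="/dir1", file="/dir1/sub/f"
  // yields "/dir1/.snapshot/s1/sub/f"
  static String toSnapshotPath(String snapshotRoot, String dir, String file) {
    String snapPath = snapshotRoot;
    if (dir.equals(SEPARATOR)) {
      // the snapshottable dir is the root dir; keep the trailing separator
      snapPath += SEPARATOR;
    }
    return snapPath + file.substring(dir.length());
  }

  public static void main(String[] args) {
    System.out.println(toSnapshotPath("/dir1/.snapshot/s1", "/dir1", "/dir1/sub/f"));
    System.out.println(toSnapshotPath("/.snapshot/s1", "/", "/sub/f"));
  }
}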

/**
* Delete a snapshot of a snapshottable directory
* @param snapshotRoot The snapshottable directory
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -192,6 +192,7 @@
import org.apache.hadoop.hdfs.protocol.RollingUpgradeException;
import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo;
import org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException;
import org.apache.hadoop.hdfs.protocol.SnapshotException;
import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure;
@@ -230,6 +231,7 @@
import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectorySnapshottableFeature;
import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
@@ -6311,6 +6313,79 @@ public String getCorruptFiles() {
return JSON.toString(list);
}


@Override // NameNodeMXBean
public long getNumberOfSnapshottableDirs() {
return snapshotManager.getNumSnapshottableDirs();
}

/**
* Get the list of corrupt blocks and their corresponding full file paths,
* including paths of snapshot copies under the given snapshottable
* directories.
* @param path Restrict corrupt files to this portion of namespace.
* @param snapshottableDirs Snapshottable directories. Passing in null
* will return only corrupt files that are not in any snapshot.
* @param cookieTab Support for continuation; cookieTab tells where
* to start from.
* @return a list in which each entry describes a corrupt file/block
* @throws IOException
*/
List<String> listCorruptFileBlocksWithSnapshot(String path,
List<String> snapshottableDirs, String[] cookieTab) throws IOException {
final Collection<CorruptFileBlockInfo> corruptFileBlocks =
listCorruptFileBlocks(path, cookieTab);
List<String> list = new ArrayList<String>();

// Precalculate snapshottableFeature list
List<DirectorySnapshottableFeature> lsf = new ArrayList<>();
if (snapshottableDirs != null) {
for (String snap : snapshottableDirs) {
final INode isnap = getFSDirectory().getINode(snap, false);
final DirectorySnapshottableFeature sf =
isnap.asDirectory().getDirectorySnapshottableFeature();
if (sf == null) {
throw new SnapshotException(
"Directory is not a snapshottable directory: " + snap);
}
lsf.add(sf);
}
}

for (CorruptFileBlockInfo c : corruptFileBlocks) {
if (getFileInfo(c.path, true) != null) {
list.add(c.toString());
}
final Collection<String> snaps = FSDirSnapshotOp
.getSnapshotFiles(getFSDirectory(), lsf, c.path);
if (snaps != null) {
for (String snap : snaps) {
// follow the syntax of CorruptFileBlockInfo#toString()
list.add(c.block.getBlockName() + "\t" + snap);
}
}
}
return list;
}
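
A hedged usage sketch for the new method (assumptions: the class lives in the same package as FSNamesystem because both new helpers are package-private, and a live FSNamesystem instance is obtained elsewhere, e.g. from a MiniDFSCluster in a test). It mirrors the call that NamenodeFsck makes later in this commit.

package org.apache.hadoop.hdfs.server.namenode;

import java.io.IOException;
import java.util.List;

// Illustrative sketch only -- not part of the patch.
class CorruptListingSketch {
  // includeSnapshots mirrors fsck's -includeSnapshots flag.
  static void printCorrupt(FSNamesystem fsn, String path, boolean includeSnapshots)
      throws IOException {
    String[] cookieTab = new String[] { null };  // continuation cookie, as in NamenodeFsck
    List<String> snapshottableDirs =
        includeSnapshots ? fsn.getSnapshottableDirs() : null;
    List<String> entries =
        fsn.listCorruptFileBlocksWithSnapshot(path, snapshottableDirs, cookieTab);
    for (String entry : entries) {
      // each entry is "<block name>\t<full path>"; snapshot copies contain /.snapshot/
      System.out.println(entry);
    }
  }
}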

/**
* Get the list of snapshottable directories.
* @return The list of all the current snapshottable directories
* @see #getSnapshottableDirListing()
* @throws IOException
*/
List<String> getSnapshottableDirs() throws IOException {
List<String> snapshottableDirs = new ArrayList<String>();
final FSPermissionChecker pc = getFSDirectory().getPermissionChecker();
final String user = pc.isSuperUser() ? null : pc.getUser();
final SnapshottableDirectoryStatus[] snapDirs =
snapshotManager.getSnapshottableDirListing(user);
if (snapDirs != null) {
for (SnapshottableDirectoryStatus sds : snapDirs) {
snapshottableDirs.add(sds.getFullPath().toString());
}
}
return snapshottableDirs;
}

@Override //NameNodeMXBean
public int getDistinctVersionCount() {
return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeMXBean.java

@@ -161,6 +161,13 @@ public interface NameNodeMXBean {
*/
public long getNumberOfMissingBlocksWithReplicationFactorOne();


/**
* Gets the total number of snapshottable dirs in the system.
*
* @return the total number of snapshottable dirs in the system
*/
public long getNumberOfSnapshottableDirs();
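
A hedged sketch of reading this value as a JMX attribute (assumptions: the code runs inside the NameNode's JVM, e.g. in a MiniDFSCluster-based test, and the object name follows the NameNode's standard NameNodeInfo MXBean registration). The same attribute is what the dfshealth.html change below consumes via the /jmx endpoint.

import java.lang.management.ManagementFactory;
import javax.management.MBeanServer;
import javax.management.ObjectName;

// Illustrative sketch only -- not part of the patch.
public class SnapshottableDirsJmx {
  public static void main(String[] args) throws Exception {
    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    ObjectName nnInfo = new ObjectName("Hadoop:service=NameNode,name=NameNodeInfo");
    // the getter getNumberOfSnapshottableDirs() surfaces as this attribute name
    long dirs = (Long) mbs.getAttribute(nnInfo, "NumberOfSnapshottableDirs");
    System.out.println("Snapshottable dirs: " + dirs);
  }
}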

/**
* Gets the number of threads.
*
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java

@@ -60,7 +60,6 @@
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
- import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataEncryptionKeyFactory;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.DataEncryptionKey;
@@ -346,13 +345,7 @@ public void fsck() {
namenode.getNamesystem().logFsckEvent(path, remoteAddress);


if (snapshottableDirs != null) {
- SnapshottableDirectoryStatus[] snapshotDirs = namenode.getRpcServer()
-     .getSnapshottableDirListing();
- if (snapshotDirs != null) {
-   for (SnapshottableDirectoryStatus dir : snapshotDirs) {
-     snapshottableDirs.add(dir.getFullPath().toString());
-   }
- }
+ snapshottableDirs = namenode.getNamesystem().getSnapshottableDirs();
}


final HdfsFileStatus file = namenode.getRpcServer().getFileInfo(path);
@@ -424,9 +417,10 @@ public void fsck() {
}


private void listCorruptFileBlocks() throws IOException {
- Collection<FSNamesystem.CorruptFileBlockInfo> corruptFiles = namenode.
-     getNamesystem().listCorruptFileBlocks(path, currentCookie);
- int numCorruptFiles = corruptFiles.size();
+ final List<String> corruptBlocksFiles = namenode.getNamesystem()
+     .listCorruptFileBlocksWithSnapshot(path, snapshottableDirs,
+         currentCookie);
+ int numCorruptFiles = corruptBlocksFiles.size();
String filler;
if (numCorruptFiles > 0) {
filler = Integer.toString(numCorruptFiles);
@@ -436,8 +430,8 @@ private void listCorruptFileBlocks() throws IOException {
filler = "no more"; filler = "no more";
} }
out.println("Cookie:\t" + currentCookie[0]); out.println("Cookie:\t" + currentCookie[0]);
- for (FSNamesystem.CorruptFileBlockInfo c : corruptFiles) {
-   out.println(c.toString());
+ for (String s : corruptBlocksFiles) {
+   out.println(s);
}
out.println("\n\nThe filesystem under path '" + path + "' has " + filler
+ " CORRUPT files");
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/Snapshot.java

@@ -184,6 +184,14 @@ public ContentSummaryComputationContext computeContentSummary(
public String getFullPathName() {
return getSnapshotPath(getParent().getFullPathName(), getLocalName());
}

/**
* Get the full path name of the root directory of this snapshot.
* @return full path to the root directory of the snapshot
*/
public String getRootFullPathName() {
return getParent().getFullPathName();
}
}
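
For clarity, a hedged illustration (values are hypothetical) of what the two Root path accessors return:

// For a snapshot "s1" taken on the snapshottable directory "/dir1":
//   root.getFullPathName()     -> "/dir1/.snapshot/s1"  (path of the snapshot itself)
//   root.getRootFullPathName() -> "/dir1"               (path of the snapshotted directory)
// FSDirSnapshotOp#getSnapshotFiles uses the latter to decide whether a corrupt
// file's path falls under a given snapshottable directory.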


/** Snapshot ID. */
hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.html

@@ -117,7 +117,7 @@
{.}<br/>
{/CorruptFiles}
</div>
- <p>Please check the logs or run fsck in order to identify the missing blocks. See the Hadoop FAQ for common causes and potential solutions.</p>
+ <p>Please check the logs or run fsck in order to identify the missing blocks.{@if cond="{NumberOfSnapshottableDirs} > 0"} Please run fsck with -includeSnapshots in order to see detailed reports about snapshots.{/if} See the Hadoop FAQ for common causes and potential solutions.</p>
</div>
{/if}
{/nn}
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFsck.java

@@ -1792,4 +1792,92 @@ public void testECFsck() throws Exception {
if (cluster != null) { cluster.shutdown(); }
}
}

/**
* Test that corrupted files in snapshots are listed with their full snapshot path.
*/
@Test
public void testFsckListCorruptSnapshotFiles() throws Exception {
Configuration conf = new Configuration();
conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000);
conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_INTERVAL_KEY, 1);
DistributedFileSystem hdfs = null;
final short REPL_FACTOR = 1;

MiniDFSCluster cluster = null;
try {
int numFiles = 3;
int numSnapshots = 0;
cluster = new MiniDFSCluster.Builder(conf).build();
cluster.waitActive();
hdfs = cluster.getFileSystem();
DFSTestUtil util = new DFSTestUtil.Builder().
setName("testGetCorruptFiles").setNumFiles(numFiles).setMaxLevels(1).
setMaxSize(1024).build();

util.createFiles(hdfs, "/corruptData", (short) 1);
final Path fp = new Path("/corruptData/file");
util.createFile(hdfs, fp, 1024, REPL_FACTOR, 1000L);
numFiles++;
util.waitReplication(hdfs, "/corruptData", (short) 1);

hdfs.allowSnapshot(new Path("/corruptData"));
hdfs.createSnapshot(new Path("/corruptData"), "mySnapShot");
numSnapshots = numFiles;

String outStr =
runFsck(conf, 0, false, "/corruptData", "-list-corruptfileblocks");
System.out.println("1. good fsck out: " + outStr);
assertTrue(outStr.contains("has 0 CORRUPT files"));
// delete the blocks
final String bpid = cluster.getNamesystem().getBlockPoolId();
for (int i=0; i<numFiles; i++) {
for (int j=0; j<=1; j++) {
File storageDir = cluster.getInstanceStorageDir(i, j);
File data_dir = MiniDFSCluster.getFinalizedDir(storageDir, bpid);
List<File> metadataFiles = MiniDFSCluster.getAllBlockMetadataFiles(
data_dir);
if (metadataFiles == null)
continue;
for (File metadataFile : metadataFiles) {
File blockFile = Block.metaToBlockFile(metadataFile);
assertTrue("Cannot remove file.", blockFile.delete());
assertTrue("Cannot remove file.", metadataFile.delete());
}
}
}
// Delete file when it has a snapshot
hdfs.delete(fp, false);
numFiles--;

// wait for the namenode to see the corruption
final NamenodeProtocols namenode = cluster.getNameNodeRpc();
CorruptFileBlocks corruptFileBlocks = namenode
.listCorruptFileBlocks("/corruptData", null);
int numCorrupt = corruptFileBlocks.getFiles().length;
while (numCorrupt == 0) {
Thread.sleep(1000);
corruptFileBlocks = namenode
.listCorruptFileBlocks("/corruptData", null);
numCorrupt = corruptFileBlocks.getFiles().length;
}

// with -includeSnapshots all files are reported
outStr = runFsck(conf, -1, true, "/corruptData",
"-list-corruptfileblocks", "-includeSnapshots");
System.out.println("2. bad fsck include snapshot out: " + outStr);
assertTrue(outStr
.contains("has " + (numFiles + numSnapshots) + " CORRUPT files"));
assertTrue(outStr.contains("/.snapshot/"));

// without -includeSnapshots only non-snapshots are reported
outStr =
runFsck(conf, -1, true, "/corruptData", "-list-corruptfileblocks");
System.out.println("3. bad fsck exclude snapshot out: " + outStr);
assertTrue(outStr.contains("has " + numFiles + " CORRUPT files"));
assertFalse(outStr.contains("/.snapshot/"));
} finally {
if (cluster != null) {cluster.shutdown();}
}
}
}
