From 14086ac08759e1a3387334b7c5a4b85b7ec54412 Mon Sep 17 00:00:00 2001 From: Shrinidhi Talpankar Date: Mon, 11 May 2026 15:46:40 +0530 Subject: [PATCH 1/3] Add regression tests for missing HFileLink back-references after cloning merged-region snapshot - Add TestCloneSnapshotFromClientAfterMergingRegion: integration test that reproduces the data-loss bug where HFileLinkCleaner deletes archived pre-merge HFiles because RestoreSnapshotHelper.restoreReferenceFile() creates HFileLink Reference files in the clone without writing back-references to the archive directory. - Add TestHFileLinkCleaner.testHFileLinkReferenceFileProtectsArchivedHFile: unit test verifying that HFileLinkCleaner treats an HFileLink Reference file as a live forward link and does not delete the archived HFile it points to. --- ...tFromClientAfterMergingRegionTestBase.java | 320 ++++++++++++++++++ ...eSnapshotFromClientAfterMergingRegion.java | 51 +++ .../master/cleaner/TestHFileLinkCleaner.java | 53 ++- 3 files changed, 423 insertions(+), 1 deletion(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterMergingRegionTestBase.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestCloneSnapshotFromClientAfterMergingRegion.java diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterMergingRegionTestBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterMergingRegionTestBase.java new file mode 100644 index 000000000000..ae013ee2e8d3 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterMergingRegionTestBase.java @@ -0,0 +1,320 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.client; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.master.assignment.RegionStates; +import org.apache.hadoop.hbase.master.cleaner.HFileCleaner; +import org.apache.hadoop.hbase.master.cleaner.TimeToLiveHFileCleaner; +import org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner; +import org.apache.hadoop.hbase.regionserver.CompactedHFilesDischarger; +import org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.io.HFileLink; +import org.apache.hadoop.hbase.regionserver.HStore; +import org.apache.hadoop.hbase.regionserver.HStoreFile; +import org.apache.hadoop.hbase.regionserver.StoreFileInfo; +import org.apache.hadoop.hbase.snapshot.SnapshotTestingUtils; +import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.HFileArchiveUtil; +import org.junit.jupiter.api.TestTemplate; + +public class CloneSnapshotFromClientAfterMergingRegionTestBase + extends CloneSnapshotFromClientTestBase { + protected 
CloneSnapshotFromClientAfterMergingRegionTestBase(int numReplicas) { + super(numReplicas); + } + + /** + * Regression test for the bug where HFiles referenced by HFileLink Reference files in a cloned + * table are deleted from the archive by {@code HFileLinkCleaner} after the pre-merge regions are + * GC'd. + * + *

<p>Sequence that triggers the bug: + *
<ol> + *
<li>Flush the table so every region has HFiles on disk.</li> + *
<li>Merge two regions — the merged result gets reference files pointing at the pre-merge + * regions' HFiles, which are immediately moved to the archive by the merge procedure.</li> + *
<li>Snapshot the table while the merged result still holds reference files (compaction + * disabled).</li> + *
<li>Clone the snapshot — {@code RestoreSnapshotHelper.restoreReferenceFile()} creates + * HFileLink Reference files in the clone pointing at the pre-merge HFiles, but does NOT + * create the back-references that {@code HFileLinkCleaner} relies on.</li> + *
<li>Delete the snapshot so {@code SnapshotHFileCleaner} no longer protects the archived + * HFiles, then force a cache refresh to evict stale entries.</li> + *
<li>Re-enable compaction and compact the parent table to resolve the merged result's + * reference files.</li> + *
<li>{@code HFileLinkCleaner} deletes the archived pre-merge HFiles because there are no + * back-references — breaking every read on the cloned table.</li> + *
</ol> + * + *
<p>

Without the fix, the final {@code getAvailablePath} call throws + * {@code FileNotFoundException}. + */ + @TestTemplate + public void testCloneSnapshotAfterMergingRegionAndGC() throws Exception { + admin.catalogJanitorSwitch(false); + TableName clonedTableName = + TableName.valueOf(getValidMethodName() + "-" + EnvironmentEdgeManager.currentTime()); + try { + // Step 1: flush all regions so each has HFiles on disk before the merge. + // The merge procedure creates reference files in the merged result that point to these HFiles, + // which are then moved to the archive. Without on-disk HFiles there is nothing to reference. + admin.flush(tableName); + setCompactionsEnabled(false); + // Step 2: merge two adjacent regions. Compaction is globally disabled in setupConfiguration() + // so the merged result will retain reference files pointing at the pre-merge HFiles. + // Two regions are sufficient to reproduce the bug: any clone region whose store files are + // HFileLink References (created by RestoreSnapshotHelper.restoreReferenceFile) has no + // back-references, so HFileLinkCleaner deletes the archived pre-merge HFiles and the + // clone's region fails to open. + RegionStates regionStates = + TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates(); + List openBefore = + regionStates.getRegionByStateOfTable(tableName).get(RegionState.State.OPEN); + + List defaultRegions = admin.getRegions(tableName); + RegionReplicaUtil.removeNonDefaultRegions(defaultRegions); + List preMergeRegions = new ArrayList<>(defaultRegions.subList(0, 2)); + + admin.mergeRegionsAsync( + new byte[][] { preMergeRegions.get(0).getEncodedNameAsBytes(), + preMergeRegions.get(1).getEncodedNameAsBytes() }, + false); + + // After merging 2 primaries into 1, the open count drops by numReplicas. 
+ int expectedOpen = openBefore.size() - numReplicas; + TEST_UTIL.waitFor(60_000, () -> { + List open = + regionStates.getRegionByStateOfTable(tableName).get(RegionState.State.OPEN); + return open != null && open.size() == expectedOpen; + }); + + // Sanity-check: the merged result must have reference files. + FileSystem fs = FileSystem.get(TEST_UTIL.getConfiguration()); + Path tableDir = CommonFSUtils.getTableDir( + CommonFSUtils.getRootDir(TEST_UTIL.getConfiguration()), tableName); + assertTrue(hasReferenceFiles(fs, tableDir), + "Merged result must have reference files before taking the snapshot"); + + // Step 3: snapshot the table while the merged result still holds reference files. + admin.snapshot(snapshotName2, tableName); + + // Step 4: clone the snapshot. RestoreSnapshotHelper.restoreReferenceFile() will create + // HFileLink Reference files in the clone pointing at the pre-merge HFiles, but will NOT + // create back-references in the archive — this is the bug being tested. + admin.cloneSnapshot(snapshotName2, clonedTableName); + SnapshotTestingUtils.waitForTableToBeOnline(TEST_UTIL, clonedTableName); + + // The clone must have reference files pointing at the pre-merge regions' HFiles. + Path cloneTableDir = CommonFSUtils.getTableDir( + CommonFSUtils.getRootDir(TEST_UTIL.getConfiguration()), clonedTableName); + assertTrue(hasReferenceFiles(fs, cloneTableDir), + "Clone must have HFileLink Reference files pointing at the pre-merge HFiles"); + + // Step 5: delete ALL snapshots so SnapshotHFileCleaner no longer protects the archived + // pre-merge HFiles. snapshotName1 (taken by @BeforeEach before the merge) directly + // references A's and B's HFiles; leaving it would cause SnapshotHFileCleaner to protect + // those files even after snapshotName2 is gone, masking the bug entirely. + // Force a cache refresh so stale entries are evicted immediately; without it the cache + // may still hold the pre-merge HFile names and incorrectly protect them. 
+ SnapshotTestingUtils.deleteAllSnapshots(admin); + + HFileCleaner hfileCleaner = + TEST_UTIL.getMiniHBaseCluster().getMaster().getHFileCleaner(); + for (Object delegate : hfileCleaner.getDelegatesForTesting()) { + if (delegate instanceof SnapshotHFileCleaner) { + ((SnapshotHFileCleaner) delegate).getFileCacheForTesting() + .triggerCacheRefreshForTesting(); + break; + } + } + + // Step 6: re-enable compaction and compact the parent table so the merged result resolves + // its reference files into standalone HFiles. + setCompactionsEnabled(true); + admin.majorCompact(tableName); + + // Wait for compaction to commit AND for HDFS to reflect the new state. The two operations + // must happen in the same loop iteration because HStore.hasReferences() checks both current + // store files AND compacted files: until discharge clears the compacted list, hasReferences() + // always returns true even after compaction commits. By running the synchronous discharger + // first in each iteration, we clear the compacted list once compaction commits, causing + // hasReferences() to return false on the same iteration — giving CatalogJanitor a clean + // HDFS view (no reference files) when it runs. + // The foundMergedRegion guard prevents a premature true return in the brief window between + // the merge completing and the merged result being assigned to any RS. 
+ TEST_UTIL.waitFor(60_000, () -> { + boolean foundMergedRegion = false; + for (int i = 0; i < TEST_UTIL.getHBaseCluster().getNumLiveRegionServers(); i++) { + CompactedHFilesDischarger discharger = + TEST_UTIL.getHBaseCluster().getRegionServer(i).getCompactedHFilesDischarger(); + boolean prev = discharger.setUseExecutor(false); + try { + discharger.chore(); + } finally { + discharger.setUseExecutor(prev); + } + for (HRegion region : + TEST_UTIL.getHBaseCluster().getRegionServer(i).getRegions(tableName)) { + if (region.getRegionInfo().getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID) { + continue; + } + foundMergedRegion = true; + for (HStore store : region.getStores()) { + if (store.hasReferences()) { + return false; + } + } + } + } + return foundMergedRegion; + }); + + // Step 7: CatalogJanitor GCs the pre-merge regions (A and B), archiving their HFiles. + // The merge procedure calls AssignmentManager.markRegionAsMerged() which calls + // regionStates.deleteRegion() for each parent — so A and B are fully removed from + // RegionStates (not put into MERGED state). We therefore use preMergeRegions directly + // rather than querying RegionState.State.MERGED (which always returns an empty list). + admin.catalogJanitorSwitch(true); + TEST_UTIL.getMiniHBaseCluster().getMaster().getCatalogJanitor().choreForTesting(); + TEST_UTIL.waitFor(60_000, () -> { + try { + for (RegionInfo parent : preMergeRegions) { + if (fs.exists(new Path(tableDir, parent.getEncodedName()))) { + return false; + } + } + return true; + } catch (IOException e) { + return false; + } + }); + + // Step 8: expire the archive TTL and run HFileCleaner. + // Without the fix, HFileLinkCleaner sees no back-references for the pre-merge HFiles and + // deletes them, breaking every read on the cloned table. 
+ Path archivePath = HFileArchiveUtil.getArchivePath(TEST_UTIL.getConfiguration()); + long expiredTime = + EnvironmentEdgeManager.currentTime() - TimeToLiveHFileCleaner.DEFAULT_TTL * 1000; + setFileTimesRecursively(fs, archivePath, expiredTime); + hfileCleaner.triggerCleanerNow().get(); + + // Step 9: the clone must still be fully readable — + // its HFileLink References must still resolve. + for (HRegion region : TEST_UTIL.getHBaseCluster().getRegions(clonedTableName)) { + for (HStore store : region.getStores()) { + for (HStoreFile sf : store.getStorefiles()) { + Path path = sf.getPath(); + if (sf.isReference() && StoreFileInfo.isReference(path)) { + Path refPath = StoreFileInfo.getReferredToFile(path); + if (HFileLink.isHFileLink(refPath)) { + HFileLink link = + HFileLink.buildFromHFileLinkPattern(TEST_UTIL.getConfiguration(), refPath); + Path actualPath = link.getAvailablePath(fs); + assertTrue(fs.exists(actualPath), "Actual file does not exist: " + actualPath); + } + } + } + } + } + + // Step 10: disable and re-enable the clone table to force all region servers to close their + // existing HFile handles and open fresh ones. Without the fix, the region open fails with + // FileNotFoundException because HFileLink.getAvailablePath() cannot find the archived HFile + // at any of its candidate locations (live, archive, .tmp, mobdir) — it was deleted by + // HFileLinkCleaner. The table therefore never becomes available within the timeout. + admin.disableTable(clonedTableName); + admin.enableTable(clonedTableName); + verifyRowCount(TEST_UTIL, clonedTableName, snapshot1Rows); + } finally { + setCompactionsEnabled(true); + admin.catalogJanitorSwitch(true); + if (admin.tableExists(clonedTableName)) { + TEST_UTIL.deleteTable(clonedTableName); + } + } + } + + /** + * Returns true if any live region of the table at {@code tableDir} contains a reference file + * (standard split/merge reference or HFileLink Reference). 
+ */ + private boolean hasReferenceFiles(FileSystem fs, Path tableDir) throws IOException { + if (!fs.exists(tableDir)) { + return false; + } + for (FileStatus regionStatus : fs.listStatus(tableDir)) { + if (!regionStatus.isDirectory()) { + continue; + } + String regionDirName = regionStatus.getPath().getName(); + if (regionDirName.startsWith(".")) { + continue; // skip .tmp and similar special directories + } + FileStatus[] familyDirs = fs.listStatus(regionStatus.getPath()); + if (familyDirs == null) { + continue; + } + for (FileStatus familyStatus : familyDirs) { + if (!familyStatus.isDirectory()) { + continue; + } + String familyDirName = familyStatus.getPath().getName(); + if (familyDirName.startsWith(".") || familyDirName.equals("recovered.edits")) { + continue; + } + FileStatus[] storeFiles = fs.listStatus(familyStatus.getPath()); + if (storeFiles == null) { + continue; + } + for (FileStatus fileStatus : storeFiles) { + if (StoreFileInfo.isReference(fileStatus.getPath())) { + return true; + } + } + } + } + return false; + } + + private void setCompactionsEnabled(boolean enabled) { + for (int i = 0; i < TEST_UTIL.getHBaseCluster().getNumLiveRegionServers(); i++) { + TEST_UTIL.getHBaseCluster().getRegionServer(i) + .getCompactSplitThread().setCompactionsEnabled(enabled); + } + } + + private void setFileTimesRecursively(FileSystem fs, Path path, long time) throws IOException { + fs.setTimes(path, time, -1); + if (fs.isDirectory(path)) { + for (FileStatus child : fs.listStatus(path)) { + setFileTimesRecursively(fs, child.getPath(), time); + } + } + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestCloneSnapshotFromClientAfterMergingRegion.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestCloneSnapshotFromClientAfterMergingRegion.java new file mode 100644 index 000000000000..cc9f3f2b578c --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestCloneSnapshotFromClientAfterMergingRegion.java 
@@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.client; + +import org.apache.hadoop.hbase.HBaseParameterizedTestTemplate; +import org.apache.hadoop.hbase.master.cleaner.HFileCleaner; +import org.apache.hadoop.hbase.master.cleaner.HFileLinkCleaner; +import org.apache.hadoop.hbase.testclassification.ClientTests; +import org.apache.hadoop.hbase.testclassification.LargeTests; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Tag; + +@Tag(LargeTests.TAG) +@Tag(ClientTests.TAG) +@HBaseParameterizedTestTemplate(name = "{index}: regionReplication={0}") +public class TestCloneSnapshotFromClientAfterMergingRegion + extends CloneSnapshotFromClientAfterMergingRegionTestBase { + + public TestCloneSnapshotFromClientAfterMergingRegion(int numReplicas) { + super(numReplicas); + } + + @BeforeAll + public static void setUpBeforeClass() throws Exception { + setupConfiguration(); + // HFileLinkCleaner is not in the default plugin list but is required to exercise the + // back-reference mechanism: without it, TimeToLiveHFileCleaner would delete ALL archived + // files after TTL regardless of back-references, masking the real bug. 
+ String existing = + TEST_UTIL.getConfiguration().get(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS, ""); + TEST_UTIL.getConfiguration().set(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS, + existing.isEmpty() ? HFileLinkCleaner.class.getName() + : existing + "," + HFileLinkCleaner.class.getName()); + TEST_UTIL.startMiniCluster(3); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestHFileLinkCleaner.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestHFileLinkCleaner.java index ffd12ba34235..0f3f57386234 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestHFileLinkCleaner.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestHFileLinkCleaner.java @@ -24,6 +24,7 @@ import java.io.IOException; import java.util.Collections; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.regionserver.StoreFileInfo; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -144,7 +145,7 @@ private void createLink(StoreFileTracker sft, boolean createBackReference) throw fs.mkdirs(familyLinkPath); hfileLink = sft.createHFileLink(hri.getTable(), hri.getEncodedName(), hfileName, createBackReference); - hfileLinkName = hfileName; + hfileLinkName = HFileLink.createHFileLinkName(hri.getTable(), hri.getEncodedName(), hfileName); linkBackRefDir = HFileLink.getBackReferencesDir(archiveStoreDir, hfileName); assertTrue(fs.exists(linkBackRefDir)); backRefs = fs.listStatus(linkBackRefDir); @@ -211,6 +212,56 @@ public void testHFileLinkEmptyBackReferenceDirectory() throws Exception { assertFalse(fs.exists(linkBackRefDir), "back reference directory should be deleted"); } + /** + * Verify that an HFileLink Reference file — created by + * {@code RestoreSnapshotHelper.restoreReferenceFile()} when cloning a snapshot that contains + * split/merge reference files — protects the archived HFile from deletion by 
+ * {@link HFileLinkCleaner}. + * + * <p>

Unlike a normal clone (which calls {@code HFileLink.create()} and gets a zero-byte HFileLink + * file plus a back-reference), {@code restoreReferenceFile()} writes only a Reference file whose + * base name is an HFileLink name (e.g. {@code srcTable=srcRegion-hfile.cloneRegion}). The + * back-reference points to a path that {@code HFileLinkCleaner} reconstructs as a zero-byte + * HFileLink file — but that file does not exist. The cleaner must also look for Reference files + * at that path before concluding the forward link is gone. + */ + @Test + public void testHFileLinkReferenceFileProtectsArchivedHFile() throws Exception { + // @Before created: archived HFile, zero-byte HFileLink, and back-reference. + // restoreReferenceFile() does NOT create a zero-byte HFileLink file; it creates a Reference + // file whose base name is the HFileLink name. Simulate that here. + + // Step 1: remove the zero-byte HFileLink (restoreReferenceFile doesn't create one). + assertTrue(fs.delete(new Path(familyLinkPath, hfileLinkName), false)); + + // Step 2: create the HFileLink Reference file that restoreReferenceFile() produces. + // Name pattern: =-. + String hfileLinkRefName = hfileLinkName + "." + hriLink.getEncodedName(); + Path hfileLinkRefPath = new Path(familyLinkPath, hfileLinkRefName); + fs.createNewFile(hfileLinkRefPath); + // Sanity-check: the naming must satisfy StoreFileInfo.isReference() so HBase treats it as a + // reference file, not a plain HFile. + assertTrue(StoreFileInfo.isReference(hfileLinkRefPath), + "HFileLink Reference name must be recognized as a reference file"); + + // The back-reference dir+file already exists from @Before. + // The cleaner must NOT delete the archived HFile while the Reference file still exists. 
+ cleaner.chore(); + assertTrue(fs.exists(hfilePath), + "Archived HFile must be protected while an HFileLink Reference file exists in the clone"); + + // Step 3: simulate clone-table compaction — the Reference file is replaced by a real HFile + // and removed from the store. + assertTrue(fs.delete(hfileLinkRefPath, false)); + + // The back-reference's forward link is now gone; the cleaner should mark the back-ref + // deletable and remove it. + cleaner.chore(); + assertFalse(fs.exists(linkBackRef), + "Back-reference should be removed after HFileLink Reference file is gone"); + // Archived HFile cleanup (requires TTL to expire) is validated by @After. + } + private static Path getFamilyDirPath(final Path rootDir, final TableName table, final String region, final String family) { return new Path(new Path(CommonFSUtils.getTableDir(rootDir, table), region), family); From ee05a337d7e13d58a7971a74f0c8e91a9649021c Mon Sep 17 00:00:00 2001 From: Thibault Deutsch Date: Sun, 22 Mar 2026 19:09:42 +0000 Subject: [PATCH 2/3] Prevent HFileLinkCleaner from deleting archived pre-merge HFiles referenced by clone MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a snapshot is taken while a merged region still holds reference files, and that snapshot is then cloned, RestoreSnapshotHelper.restoreReferenceFile() creates HFileLink Reference files in the clone (e.g. srcTable=srcRegion-hfile.cloneRegion) but does not write back-references to the archive directory. As a result: 1. CatalogJanitor GCs the pre-merge regions, archiving their HFiles. 2. HFileLinkCleaner sees no back-references for those archived HFiles and deletes them. 3. Every subsequent region open on the cloned table fails with FileNotFoundException. Fix with two complementary changes: - RestoreSnapshotHelper: write a back-reference to the archive when creating an HFileLink Reference file, so HFileLinkCleaner knows the archived HFile is still live. 
- HFileLinkCleaner: when evaluating a back-reference, also check for HFileLink Reference files (glob .*) in the link directory — these are created by restoreReferenceFile() and protect the archived HFile even when no zero-byte HFileLink file exists alongside them. --- .../master/cleaner/HFileLinkCleaner.java | 12 +- .../hbase/snapshot/RestoreSnapshotHelper.java | 16 ++ ...romClientAfterSplittingRegionTestBase.java | 211 ++++++++++++++++++ 3 files changed, 238 insertions(+), 1 deletion(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/HFileLinkCleaner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/HFileLinkCleaner.java index 3b137ebc1a9f..f3bdf8935848 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/HFileLinkCleaner.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/HFileLinkCleaner.java @@ -79,7 +79,17 @@ public boolean isFileDeletable(FileStatus fStat) { } hfilePath = HFileLink.getHFileFromBackReference(CommonFSUtils.getRootDir(getConf()), filePath); - return !fs.exists(hfilePath); + if (fs.exists(hfilePath)) { + return false; + } + // Also protect HFileLink Reference files created by + // RestoreSnapshotHelper.restoreReferenceFile(). These are named + // . and live in the same directory as the + // zero-byte HFileLink. The zero-byte file does not exist (only the Reference + // file does), so the fs.exists() check above is insufficient. 
+ FileStatus[] refFiles = + fs.globStatus(new Path(hfilePath.getParent(), hfilePath.getName() + ".*")); + return refFiles == null || refFiles.length == 0; } catch (IOException e) { if (LOG.isDebugEnabled()) { LOG.debug("Couldn't verify if the referenced file still exists, keep it just in case: " diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/RestoreSnapshotHelper.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/RestoreSnapshotHelper.java index f0f1ba3899ae..093c5a9d1506 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/RestoreSnapshotHelper.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/RestoreSnapshotHelper.java @@ -42,6 +42,7 @@ import org.apache.hadoop.hbase.backup.HFileArchiver; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; +import org.apache.hadoop.hbase.util.HFileArchiveUtil; import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.ConnectionFactory; import org.apache.hadoop.hbase.client.RegionInfo; @@ -795,6 +796,21 @@ private StoreFileInfo restoreReferenceFile(final Path familyDir, final RegionInf refLink = HFileLink.createHFileLinkName(snapshotTable, snapshotRegionName, fileName); linkPath = new Path(familyDir, HFileLink.createHFileLinkName(snapshotTable, regionInfo.getEncodedName(), hfileName)); + // Write a back-reference so HFileLinkCleaner knows the archived parent HFile is still + // referenced by this clone region. Without this, once CatalogJanitor GCs the split-parent + // region (archiving its HFiles), HFileLinkCleaner sees no back-references and deletes + // them — breaking every read on the cloned table. 
+ Path archiveStoreDir = + HFileArchiveUtil.getStoreArchivePath(conf, snapshotTable, snapshotRegionName, + familyDir.getName()); + Path backRefDir = HFileLink.getBackReferencesDir(archiveStoreDir, fileName); + fs.mkdirs(backRefDir); + String containingCloneRegion = + Bytes.toString(regionsMap.get(regionInfo.getEncodedNameAsBytes())); + if (containingCloneRegion == null) containingCloneRegion = regionInfo.getEncodedName(); + String backRefName = HFileLink.createBackReferenceName( + tableDesc.getTableName().getNameAsString(), containingCloneRegion); + fs.createNewFile(new Path(backRefDir, backRefName)); } Path outPath = new Path(familyDir, refLink + '.' + clonedRegionName); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterSplittingRegionTestBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterSplittingRegionTestBase.java index 5027da6762f9..72b54a7e1141 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterSplittingRegionTestBase.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterSplittingRegionTestBase.java @@ -20,17 +20,36 @@ import static org.awaitility.Awaitility.await; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; import java.time.Duration; import java.util.List; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.assignment.RegionStates; +import org.apache.hadoop.hbase.master.cleaner.HFileCleaner; +import org.apache.hadoop.hbase.master.cleaner.TimeToLiveHFileCleaner; +import 
org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner; +import org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.io.HFileLink; +import org.apache.hadoop.hbase.regionserver.HStore; +import org.apache.hadoop.hbase.regionserver.HStoreFile; +import org.apache.hadoop.hbase.regionserver.StoreFileInfo; +import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; +import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerFactory; +import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.snapshot.SnapshotTestingUtils; +import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.junit.jupiter.api.TestTemplate; +import org.apache.hadoop.hbase.util.HFileArchiveUtil; +import org.junit.Test; public class CloneSnapshotFromClientAfterSplittingRegionTestBase extends CloneSnapshotFromClientTestBase { @@ -106,6 +125,198 @@ public void testCloneSnapshotAfterSplittingRegion() throws IOException, Interrup } } + /** + * Regression test for the bug where HFiles referenced by HFileLink Reference files in a cloned + * table are deleted from the archive by {@code HFileLinkCleaner} after the split-parent region + * is GC'd. + * + *

<p>Sequence that triggers the bug: + *
<ol> + *
<li>Split a region — daughters get split reference files pointing at the parent's HFiles.</li> + *
<li>Snapshot the table while reference files still exist (compaction disabled).</li> + *
<li>Clone the snapshot — {@code RestoreSnapshotHelper.restoreReferenceFile()} creates + * HFileLink Reference files in the clone pointing at the split parent's HFiles, but does + * NOT create the back-references that {@code HFileLinkCleaner} relies on.</li> + *
<li>Re-enable compaction and compact the daughter regions to resolve their references.</li> + *
<li>CatalogJanitor GC archives the split-parent region's HFiles.</li> + *
<li>{@code HFileLinkCleaner} deletes the archived HFiles because there are no + * back-references — breaking every read on the cloned table.</li> + *
</ol> + * + *
<p>

Without the fix, the final {@code verifyRowCount} call throws + * {@code FileNotFoundException}. + */ + @TestTemplate + public void testCloneSnapshotAfterSplittingRegionAndGC() throws Exception { + // Disable CatalogJanitor so GC doesn't race with snapshot/clone creation. + admin.catalogJanitorSwitch(false); + TableName clonedTableName = + TableName.valueOf(getValidMethodName() + "-" + EnvironmentEdgeManager.currentTime()); + try { + // Step 1: split a region. Compaction is globally disabled in setupConfiguration(), so the + // daughter regions will retain split reference files pointing at the parent's HFiles. + int numRegionsBefore = admin.getRegions(tableName).size(); + splitRegion(); + // waitUntilAllRegionsAssigned() also visits the split parent row in hbase:meta, which stays + // in SPLIT state permanently and would cause a 60 s timeout. Use RegionStates directly to + // count only OPEN regions (daughters), excluding the split parent. + RegionStates regionStates = + TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates(); + int expectedOpenCount = numRegionsBefore + numReplicas; + TEST_UTIL.waitFor(30_000, () -> { + List open = + regionStates.getRegionByStateOfTable(tableName).get(RegionState.State.OPEN); + return open != null && open.size() >= expectedOpenCount; + }); + + // Step 2: snapshot while reference files still exist. + admin.snapshot(snapshotName2, tableName); + + // Step 3: clone the snapshot. + admin.cloneSnapshot(snapshotName2, clonedTableName); + SnapshotTestingUtils.waitForTableToBeOnline(TEST_UTIL, clonedTableName); + verifyRowCount(TEST_UTIL, clonedTableName, snapshot1Rows); + + // Step 4: re-enable compaction and compact the daughter regions to resolve references. + setCompactionsEnabled(true); + admin.majorCompact(tableName); + // Wait until all daughter regions have no reference files. 
+ FileSystem fs = FileSystem.get(TEST_UTIL.getConfiguration()); + Path tableDir = CommonFSUtils.getTableDir( + CommonFSUtils.getRootDir(TEST_UTIL.getConfiguration()), tableName); + TEST_UTIL.waitFor(60_000, () -> { + try { + for (int i = 0; i < TEST_UTIL.getHBaseCluster().getNumLiveRegionServers(); i++) { + HRegionServer rs = TEST_UTIL.getHBaseCluster().getRegionServer(i); + rs.getCompactedHFilesDischarger().chore(); + for (HRegion region : rs.getRegions(tableName)) { + RegionInfo ri = region.getRegionInfo(); + if (ri.getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID) { + continue; // secondary replicas share the primary's HFiles; no own directory + } + HRegionFileSystem regionFs = HRegionFileSystem.openRegionFromFileSystem( + TEST_UTIL.getConfiguration(), fs, tableDir, ri, true); + for (Path familyDir : FSUtils.getFamilyDirs(fs, + new Path(tableDir, ri.getEncodedName()))) { + org.apache.hadoop.hbase.regionserver.StoreContext storeContext = + org.apache.hadoop.hbase.regionserver.StoreContext.getBuilder() + .withColumnFamilyDescriptor( + org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder + .of(familyDir.getName())) + .withRegionFileSystem(regionFs).withFamilyStoreDirectoryPath(familyDir) + .build(); + if (StoreFileTrackerFactory.create(TEST_UTIL.getConfiguration(), false, + storeContext).hasReferences()) { + return false; + } + } + } + } + return true; + } catch (IOException e) { + return false; + } + }); + + // Step 5: let CatalogJanitor GC the split-parent region (archives its HFiles). + admin.catalogJanitorSwitch(true); + TEST_UTIL.getMiniHBaseCluster().getMaster().getCatalogJanitor().choreForTesting(); + // Wait for parent region directory to be removed from the live table dir. 
+ List splitParents = + regionStates.getRegionByStateOfTable(tableName).get(RegionState.State.SPLIT); + TEST_UTIL.waitFor(30_000, () -> { + try { + for (RegionInfo parent : splitParents) { + if (fs.exists(new Path(tableDir, parent.getEncodedName()))) { + return false; + } + } + return true; + } catch (IOException e) { + return false; + } + }); + + // Step 6: delete the snapshot so SnapshotHFileCleaner no longer protects the archived + // HFiles, then expire the archive TTL and run HFileCleaner. + admin.deleteSnapshot(snapshotName2); + // SnapshotFileCache only self-refreshes when it encounters a file NOT in its stale cache. + // After deletion the parent HFile names are still in the stale cache (because + // SnapshotReferenceUtil.getHFileNames() added them via the daughters' reference files), so + // the cache would never self-refresh for those files. Force a refresh now. + HFileCleaner hfileCleaner = + TEST_UTIL.getMiniHBaseCluster().getMaster().getHFileCleaner(); + // getDelegatesForTesting() uses an unchecked cast internally, so iterate as Object + // to avoid a ClassCastException from the implicit checkcast the compiler inserts. + for (Object delegate : hfileCleaner.getDelegatesForTesting()) { + if (delegate instanceof SnapshotHFileCleaner) { + ((SnapshotHFileCleaner) delegate).getFileCacheForTesting() + .triggerCacheRefreshForTesting(); + break; + } + } + Path archivePath = HFileArchiveUtil.getArchivePath(TEST_UTIL.getConfiguration()); + long expiredTime = + EnvironmentEdgeManager.currentTime() - TimeToLiveHFileCleaner.DEFAULT_TTL * 1000; + setFileTimesRecursively(fs, archivePath, expiredTime); + TEST_UTIL.getMiniHBaseCluster().getMaster().getHFileCleaner().triggerCleanerNow().get(); + +// Step 7: the clone must still be fully readable — +// its HFileLink References must still resolve. 
+ for (HRegion region : TEST_UTIL.getHBaseCluster().getRegions(clonedTableName)) { + for (HStore store : region.getStores()) { + for (HStoreFile sf : store.getStorefiles()) { + Path path = sf.getPath(); + if (sf.isReference() && StoreFileInfo.isReference(path)) { + // Reference files point to HFileLinks, resolve the actual path + Path refPath = StoreFileInfo.getReferredToFile(path); + // If the referred file is an HFileLink, resolve it further + if (HFileLink.isHFileLink(refPath)) { + HFileLink link = HFileLink.buildFromHFileLinkPattern( + TEST_UTIL.getConfiguration(), refPath); + Path actualPath = link.getAvailablePath(fs); + + // Check if this file exists + assertTrue(fs.exists(actualPath),"Actual file does not exist: " + actualPath); + } + } + } + } + } + + // Step 8: disable and re-enable the clone table to force all region servers to close their + // existing HFile handles and open fresh ones. Without the fix, the region open fails with + // FileNotFoundException because HFileLink.getAvailablePath() cannot find the archived HFile + // at any of its candidate locations (live, archive, .tmp, mobdir) — it was deleted by + // HFileLinkCleaner. The table therefore never becomes available within the timeout. 
+ admin.disableTable(clonedTableName); + admin.enableTable(clonedTableName); + verifyRowCount(TEST_UTIL, clonedTableName, snapshot1Rows); + } finally { + setCompactionsEnabled(true); + admin.catalogJanitorSwitch(true); + if (admin.tableExists(clonedTableName)) { + TEST_UTIL.deleteTable(clonedTableName); + } + } + } + + private void setCompactionsEnabled(boolean enabled) { + for (int i = 0; i < TEST_UTIL.getHBaseCluster().getNumLiveRegionServers(); i++) { + TEST_UTIL.getHBaseCluster().getRegionServer(i) + .getCompactSplitThread().setCompactionsEnabled(enabled); + } + } + + private void setFileTimesRecursively(FileSystem fs, Path path, long time) throws IOException { + fs.setTimes(path, time, -1); + if (fs.isDirectory(path)) { + for (FileStatus child : fs.listStatus(path)) { + setFileTimesRecursively(fs, child.getPath(), time); + } + } + } + @TestTemplate public void testCloneSnapshotBeforeSplittingRegionAndDroppingTable() throws IOException, InterruptedException { From d5d8a98e638860870672e7b7b0543e6059a14730 Mon Sep 17 00:00:00 2001 From: Shrinidhi Talpankar Date: Wed, 13 May 2026 14:52:31 +0530 Subject: [PATCH 3/3] Reverting changes to CloneSnapshotFromClientAfterSplittingRegionTestBase.java in previous commit --- ...romClientAfterSplittingRegionTestBase.java | 211 ------------------ 1 file changed, 211 deletions(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterSplittingRegionTestBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterSplittingRegionTestBase.java index 72b54a7e1141..5027da6762f9 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterSplittingRegionTestBase.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/CloneSnapshotFromClientAfterSplittingRegionTestBase.java @@ -20,36 +20,17 @@ import static org.awaitility.Awaitility.await; import static org.junit.jupiter.api.Assertions.assertEquals; 
import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; import java.time.Duration; import java.util.List; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.assignment.RegionStates; -import org.apache.hadoop.hbase.master.cleaner.HFileCleaner; -import org.apache.hadoop.hbase.master.cleaner.TimeToLiveHFileCleaner; -import org.apache.hadoop.hbase.master.snapshot.SnapshotHFileCleaner; -import org.apache.hadoop.hbase.regionserver.HRegion; -import org.apache.hadoop.hbase.io.HFileLink; -import org.apache.hadoop.hbase.regionserver.HStore; -import org.apache.hadoop.hbase.regionserver.HStoreFile; -import org.apache.hadoop.hbase.regionserver.StoreFileInfo; -import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; -import org.apache.hadoop.hbase.regionserver.HRegionServer; -import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerFactory; -import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.snapshot.SnapshotTestingUtils; -import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.junit.jupiter.api.TestTemplate; -import org.apache.hadoop.hbase.util.HFileArchiveUtil; -import org.junit.Test; public class CloneSnapshotFromClientAfterSplittingRegionTestBase extends CloneSnapshotFromClientTestBase { @@ -125,198 +106,6 @@ public void testCloneSnapshotAfterSplittingRegion() throws IOException, Interrup } } - /** - * Regression test for the bug where HFiles referenced by HFileLink Reference files in a cloned - * table are deleted from the archive by {@code HFileLinkCleaner} after the split-parent region - * is GC'd. - * - *

<p>Sequence that triggers the bug:
-   * <ol>
-   *   <li>Split a region — daughters get split reference files pointing at the parent's HFiles.</li>
-   *   <li>Snapshot the table while reference files still exist (compaction disabled).</li>
-   *   <li>Clone the snapshot — {@code RestoreSnapshotHelper.restoreReferenceFile()} creates
-   *       HFileLink Reference files in the clone pointing at the split parent's HFiles, but does
-   *       NOT create the back-references that {@code HFileLinkCleaner} relies on.</li>
-   *   <li>Re-enable compaction and compact the daughter regions to resolve their references.</li>
-   *   <li>CatalogJanitor GC archives the split-parent region's HFiles.</li>
-   *   <li>{@code HFileLinkCleaner} deletes the archived HFiles because there are no
-   *       back-references — breaking every read on the cloned table.</li>
-   * </ol>
-   *
-   * <p>

Without the fix, the final {@code verifyRowCount} call throws - * {@code FileNotFoundException}. - */ - @TestTemplate - public void testCloneSnapshotAfterSplittingRegionAndGC() throws Exception { - // Disable CatalogJanitor so GC doesn't race with snapshot/clone creation. - admin.catalogJanitorSwitch(false); - TableName clonedTableName = - TableName.valueOf(getValidMethodName() + "-" + EnvironmentEdgeManager.currentTime()); - try { - // Step 1: split a region. Compaction is globally disabled in setupConfiguration(), so the - // daughter regions will retain split reference files pointing at the parent's HFiles. - int numRegionsBefore = admin.getRegions(tableName).size(); - splitRegion(); - // waitUntilAllRegionsAssigned() also visits the split parent row in hbase:meta, which stays - // in SPLIT state permanently and would cause a 60 s timeout. Use RegionStates directly to - // count only OPEN regions (daughters), excluding the split parent. - RegionStates regionStates = - TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates(); - int expectedOpenCount = numRegionsBefore + numReplicas; - TEST_UTIL.waitFor(30_000, () -> { - List open = - regionStates.getRegionByStateOfTable(tableName).get(RegionState.State.OPEN); - return open != null && open.size() >= expectedOpenCount; - }); - - // Step 2: snapshot while reference files still exist. - admin.snapshot(snapshotName2, tableName); - - // Step 3: clone the snapshot. - admin.cloneSnapshot(snapshotName2, clonedTableName); - SnapshotTestingUtils.waitForTableToBeOnline(TEST_UTIL, clonedTableName); - verifyRowCount(TEST_UTIL, clonedTableName, snapshot1Rows); - - // Step 4: re-enable compaction and compact the daughter regions to resolve references. - setCompactionsEnabled(true); - admin.majorCompact(tableName); - // Wait until all daughter regions have no reference files. 
- FileSystem fs = FileSystem.get(TEST_UTIL.getConfiguration()); - Path tableDir = CommonFSUtils.getTableDir( - CommonFSUtils.getRootDir(TEST_UTIL.getConfiguration()), tableName); - TEST_UTIL.waitFor(60_000, () -> { - try { - for (int i = 0; i < TEST_UTIL.getHBaseCluster().getNumLiveRegionServers(); i++) { - HRegionServer rs = TEST_UTIL.getHBaseCluster().getRegionServer(i); - rs.getCompactedHFilesDischarger().chore(); - for (HRegion region : rs.getRegions(tableName)) { - RegionInfo ri = region.getRegionInfo(); - if (ri.getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID) { - continue; // secondary replicas share the primary's HFiles; no own directory - } - HRegionFileSystem regionFs = HRegionFileSystem.openRegionFromFileSystem( - TEST_UTIL.getConfiguration(), fs, tableDir, ri, true); - for (Path familyDir : FSUtils.getFamilyDirs(fs, - new Path(tableDir, ri.getEncodedName()))) { - org.apache.hadoop.hbase.regionserver.StoreContext storeContext = - org.apache.hadoop.hbase.regionserver.StoreContext.getBuilder() - .withColumnFamilyDescriptor( - org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder - .of(familyDir.getName())) - .withRegionFileSystem(regionFs).withFamilyStoreDirectoryPath(familyDir) - .build(); - if (StoreFileTrackerFactory.create(TEST_UTIL.getConfiguration(), false, - storeContext).hasReferences()) { - return false; - } - } - } - } - return true; - } catch (IOException e) { - return false; - } - }); - - // Step 5: let CatalogJanitor GC the split-parent region (archives its HFiles). - admin.catalogJanitorSwitch(true); - TEST_UTIL.getMiniHBaseCluster().getMaster().getCatalogJanitor().choreForTesting(); - // Wait for parent region directory to be removed from the live table dir. 
- List splitParents = - regionStates.getRegionByStateOfTable(tableName).get(RegionState.State.SPLIT); - TEST_UTIL.waitFor(30_000, () -> { - try { - for (RegionInfo parent : splitParents) { - if (fs.exists(new Path(tableDir, parent.getEncodedName()))) { - return false; - } - } - return true; - } catch (IOException e) { - return false; - } - }); - - // Step 6: delete the snapshot so SnapshotHFileCleaner no longer protects the archived - // HFiles, then expire the archive TTL and run HFileCleaner. - admin.deleteSnapshot(snapshotName2); - // SnapshotFileCache only self-refreshes when it encounters a file NOT in its stale cache. - // After deletion the parent HFile names are still in the stale cache (because - // SnapshotReferenceUtil.getHFileNames() added them via the daughters' reference files), so - // the cache would never self-refresh for those files. Force a refresh now. - HFileCleaner hfileCleaner = - TEST_UTIL.getMiniHBaseCluster().getMaster().getHFileCleaner(); - // getDelegatesForTesting() uses an unchecked cast internally, so iterate as Object - // to avoid a ClassCastException from the implicit checkcast the compiler inserts. - for (Object delegate : hfileCleaner.getDelegatesForTesting()) { - if (delegate instanceof SnapshotHFileCleaner) { - ((SnapshotHFileCleaner) delegate).getFileCacheForTesting() - .triggerCacheRefreshForTesting(); - break; - } - } - Path archivePath = HFileArchiveUtil.getArchivePath(TEST_UTIL.getConfiguration()); - long expiredTime = - EnvironmentEdgeManager.currentTime() - TimeToLiveHFileCleaner.DEFAULT_TTL * 1000; - setFileTimesRecursively(fs, archivePath, expiredTime); - TEST_UTIL.getMiniHBaseCluster().getMaster().getHFileCleaner().triggerCleanerNow().get(); - -// Step 7: the clone must still be fully readable — -// its HFileLink References must still resolve. 
- for (HRegion region : TEST_UTIL.getHBaseCluster().getRegions(clonedTableName)) { - for (HStore store : region.getStores()) { - for (HStoreFile sf : store.getStorefiles()) { - Path path = sf.getPath(); - if (sf.isReference() && StoreFileInfo.isReference(path)) { - // Reference files point to HFileLinks, resolve the actual path - Path refPath = StoreFileInfo.getReferredToFile(path); - // If the referred file is an HFileLink, resolve it further - if (HFileLink.isHFileLink(refPath)) { - HFileLink link = HFileLink.buildFromHFileLinkPattern( - TEST_UTIL.getConfiguration(), refPath); - Path actualPath = link.getAvailablePath(fs); - - // Check if this file exists - assertTrue(fs.exists(actualPath),"Actual file does not exist: " + actualPath); - } - } - } - } - } - - // Step 8: disable and re-enable the clone table to force all region servers to close their - // existing HFile handles and open fresh ones. Without the fix, the region open fails with - // FileNotFoundException because HFileLink.getAvailablePath() cannot find the archived HFile - // at any of its candidate locations (live, archive, .tmp, mobdir) — it was deleted by - // HFileLinkCleaner. The table therefore never becomes available within the timeout. 
- admin.disableTable(clonedTableName); - admin.enableTable(clonedTableName); - verifyRowCount(TEST_UTIL, clonedTableName, snapshot1Rows); - } finally { - setCompactionsEnabled(true); - admin.catalogJanitorSwitch(true); - if (admin.tableExists(clonedTableName)) { - TEST_UTIL.deleteTable(clonedTableName); - } - } - } - - private void setCompactionsEnabled(boolean enabled) { - for (int i = 0; i < TEST_UTIL.getHBaseCluster().getNumLiveRegionServers(); i++) { - TEST_UTIL.getHBaseCluster().getRegionServer(i) - .getCompactSplitThread().setCompactionsEnabled(enabled); - } - } - - private void setFileTimesRecursively(FileSystem fs, Path path, long time) throws IOException { - fs.setTimes(path, time, -1); - if (fs.isDirectory(path)) { - for (FileStatus child : fs.listStatus(path)) { - setFileTimesRecursively(fs, child.getPath(), time); - } - } - } - @TestTemplate public void testCloneSnapshotBeforeSplittingRegionAndDroppingTable() throws IOException, InterruptedException {