From 682aa696777415012af3cc696190fbbd6fc199c0 Mon Sep 17 00:00:00 2001 From: Toshihiro Suzuki Date: Tue, 13 Feb 2018 01:55:07 +0900 Subject: [PATCH] HBASE-19893 restore_snapshot is broken in master branch when region splits Signed-off-by: tedyu --- .../hbase/favored/FavoredNodesManager.java | 32 ++++--- .../hbase/master/assignment/RegionStates.java | 5 +- .../procedure/RestoreSnapshotProcedure.java | 85 ++++++++++++++++--- .../hbase/snapshot/RestoreSnapshotHelper.java | 3 +- .../hadoop/hbase/HBaseTestingUtility.java | 36 +++++--- .../client/TestRestoreSnapshotFromClient.java | 29 ++++++- 6 files changed, 151 insertions(+), 39 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/favored/FavoredNodesManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/favored/FavoredNodesManager.java index f5c1d91cde93..67d4071f8a51 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/favored/FavoredNodesManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/favored/FavoredNodesManager.java @@ -269,21 +269,25 @@ public synchronized Map> getReplicaLoad(List regionInfoList) { - for (RegionInfo hri : regionInfoList) { - List favNodes = getFavoredNodes(hri); - if (favNodes != null) { - if (primaryRSToRegionMap.containsKey(favNodes.get(PRIMARY.ordinal()))) { - primaryRSToRegionMap.get(favNodes.get(PRIMARY.ordinal())).remove(hri); - } - if (secondaryRSToRegionMap.containsKey(favNodes.get(SECONDARY.ordinal()))) { - secondaryRSToRegionMap.get(favNodes.get(SECONDARY.ordinal())).remove(hri); - } - if (teritiaryRSToRegionMap.containsKey(favNodes.get(TERTIARY.ordinal()))) { - teritiaryRSToRegionMap.get(favNodes.get(TERTIARY.ordinal())).remove(hri); - } - globalFavoredNodesAssignmentPlan.removeFavoredNodes(hri); + public synchronized void deleteFavoredNodesForRegion(RegionInfo regionInfo) { + List favNodes = getFavoredNodes(regionInfo); + if (favNodes != null) { + if (primaryRSToRegionMap.containsKey(favNodes.get(PRIMARY.ordinal()))) { + primaryRSToRegionMap.get(favNodes.get(PRIMARY.ordinal())).remove(regionInfo); + } + if (secondaryRSToRegionMap.containsKey(favNodes.get(SECONDARY.ordinal()))) { + secondaryRSToRegionMap.get(favNodes.get(SECONDARY.ordinal())).remove(regionInfo); } + if (teritiaryRSToRegionMap.containsKey(favNodes.get(TERTIARY.ordinal()))) { + teritiaryRSToRegionMap.get(favNodes.get(TERTIARY.ordinal())).remove(regionInfo); + } + globalFavoredNodesAssignmentPlan.removeFavoredNodes(regionInfo); + } + } + + public synchronized void deleteFavoredNodesForRegions(Collection regionInfoList) { + for (RegionInfo regionInfo : regionInfoList) { + deleteFavoredNodesForRegion(regionInfo); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java index e4b49affe297..b1b23715ee88 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java @@ -516,6 +516,10 @@ public void deleteRegion(final RegionInfo regionInfo) { } } + public void deleteRegions(final List regionInfos) { + regionInfos.forEach(this::deleteRegion); + } + ArrayList getTableRegionStateNodes(final TableName tableName) { final ArrayList regions = new ArrayList(); for (RegionStateNode node: regionsMap.tailMap(tableName.getName()).values()) { @@ -769,7 +773,6 @@ public void logSplit(final ServerName serverName) { setServerState(serverName, ServerState.OFFLINE); } - @VisibleForTesting public void updateRegionState(final RegionInfo regionInfo, final State state) { final RegionStateNode regionNode = getOrCreateRegionStateNode(regionInfo); synchronized (regionNode) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RestoreSnapshotProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RestoreSnapshotProcedure.java index 09f62596a262..f542449b6915 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RestoreSnapshotProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RestoreSnapshotProcedure.java @@ -34,11 +34,15 @@ import org.apache.hadoop.hbase.TableNotFoundException; import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.errorhandling.ForeignException; import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher; +import org.apache.hadoop.hbase.favored.FavoredNodesManager; import org.apache.hadoop.hbase.master.MasterFileSystem; import org.apache.hadoop.hbase.master.MetricsSnapshot; +import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.monitoring.MonitoredTask; import org.apache.hadoop.hbase.monitoring.TaskMonitor; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; @@ -391,7 +395,7 @@ private void restoreSnapshot(final MasterProcedureEnv env) throws IOException { env.getMasterServices().getConfiguration(), fs, manifest, - modifiedTableDescriptor, + modifiedTableDescriptor, rootDir, monitorException, getMonitorStatus()); @@ -419,12 +423,13 @@ private void restoreSnapshot(final MasterProcedureEnv env) throws IOException { private void updateMETA(final MasterProcedureEnv env) throws IOException { try { Connection conn = env.getMasterServices().getConnection(); + int regionReplication = modifiedTableDescriptor.getRegionReplication(); // 1. Prepare to restore getMonitorStatus().setStatus("Preparing to restore each region"); - // 2. Applies changes to hbase:meta - // (2.1). Removes the current set of regions from META + // 2. Applies changes to hbase:meta and in-memory states + // (2.1). Removes the current set of regions from META and in-memory states // // By removing also the regions to restore (the ones present both in the snapshot // and in the current state) we ensure that no extra fields are present in META @@ -433,26 +438,25 @@ private void updateMETA(final MasterProcedureEnv env) throws IOException { // that are not correct after the restore. if (regionsToRemove != null) { MetaTableAccessor.deleteRegions(conn, regionsToRemove); + deleteRegionsFromInMemoryStates(regionsToRemove, env, regionReplication); } - // (2.2). Add the new set of regions to META + // (2.2). Add the new set of regions to META and in-memory states // // At this point the old regions are no longer present in META. // and the set of regions present in the snapshot will be written to META. // All the information in hbase:meta are coming from the .regioninfo of each region present // in the snapshot folder. if (regionsToAdd != null) { - MetaTableAccessor.addRegionsToMeta( - conn, - regionsToAdd, - modifiedTableDescriptor.getRegionReplication()); + MetaTableAccessor.addRegionsToMeta(conn, regionsToAdd, regionReplication); + addRegionsToInMemoryStates(regionsToAdd, env, regionReplication); } if (regionsToRestore != null) { - MetaTableAccessor.overwriteRegions( - conn, - regionsToRestore, - modifiedTableDescriptor.getRegionReplication()); + MetaTableAccessor.overwriteRegions(conn, regionsToRestore, regionReplication); + + deleteRegionsFromInMemoryStates(regionsToRestore, env, regionReplication); + addRegionsToInMemoryStates(regionsToRestore, env, regionReplication); } RestoreSnapshotHelper.RestoreMetaChanges metaChanges = @@ -479,6 +483,63 @@ private void updateMETA(final MasterProcedureEnv env) throws IOException { monitorStatus.getCompletionTimestamp() - monitorStatus.getStartTime()); } + /** + * Delete regions from in-memory states + * @param regionInfos regions to delete + * @param env MasterProcedureEnv + * @param regionReplication the number of region replications + */ + private void deleteRegionsFromInMemoryStates(List regionInfos, + MasterProcedureEnv env, int regionReplication) { + FavoredNodesManager fnm = env.getMasterServices().getFavoredNodesManager(); + + env.getAssignmentManager().getRegionStates().deleteRegions(regionInfos); + env.getMasterServices().getServerManager().removeRegions(regionInfos); + if (fnm != null) { + fnm.deleteFavoredNodesForRegions(regionInfos); + } + + // For region replicas + if (regionReplication > 1) { + for (RegionInfo regionInfo : regionInfos) { + for (int i = 1; i < regionReplication; i++) { + RegionInfo regionInfoForReplica = + RegionReplicaUtil.getRegionInfoForReplica(regionInfo, i); + env.getAssignmentManager().getRegionStates().deleteRegion(regionInfoForReplica); + env.getMasterServices().getServerManager().removeRegion(regionInfoForReplica); + if (fnm != null) { + fnm.deleteFavoredNodesForRegion(regionInfoForReplica); + } + } + } + } + } + + /** + * Add regions to in-memory states + * @param regionInfos regions to add + * @param env MasterProcedureEnv + * @param regionReplication the number of region replications + */ + private void addRegionsToInMemoryStates(List regionInfos, MasterProcedureEnv env, + int regionReplication) { + AssignmentManager am = env.getAssignmentManager(); + for (RegionInfo regionInfo : regionInfos) { + if (regionInfo.isSplit()) { + am.getRegionStates().updateRegionState(regionInfo, RegionState.State.SPLIT); + } else { + am.getRegionStates().updateRegionState(regionInfo, RegionState.State.CLOSED); + + // For region replicas + for (int i = 1; i < regionReplication; i++) { + RegionInfo regionInfoForReplica = + RegionReplicaUtil.getRegionInfoForReplica(regionInfo, i); + am.getRegionStates().updateRegionState(regionInfoForReplica, RegionState.State.CLOSED); + } + } + } + } + private void restoreSnapshotAcl(final MasterProcedureEnv env) throws IOException { if (restoreAcl && snapshot.hasUsersAndPermissions() && snapshot.getUsersAndPermissions() != null && SnapshotDescriptionUtils diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/RestoreSnapshotHelper.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/RestoreSnapshotHelper.java index 179dfe5b2e45..7edf73492523 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/RestoreSnapshotHelper.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/RestoreSnapshotHelper.java @@ -229,7 +229,8 @@ private RestoreMetaChanges restoreHdfsRegions(final ThreadPoolExecutor exec) thr if (regionNames.contains(regionName)) { LOG.info("region to restore: " + regionName); regionNames.remove(regionName); - metaChanges.addRegionToRestore(regionInfo); + metaChanges.addRegionToRestore(ProtobufUtil.toRegionInfo(regionManifests.get(regionName) + .getRegionInfo())); } else { LOG.info("region to remove: " + regionName); metaChanges.addRegionToRemove(regionInfo); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java index 03d5d76c0f26..b938d2834f26 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java @@ -2411,6 +2411,19 @@ public List getMetaTableRows(TableName tableName) throws IOException { return rows; } + /** + * Returns all regions of the specified table + * + * @param tableName the table name + * @return all regions of the specified table + * @throws IOException when getting the regions fails. + */ + private List getRegions(TableName tableName) throws IOException { + try (Admin admin = getConnection().getAdmin()) { + return admin.getRegions(tableName); + } + } + /* * Find any other region server which is different from the one identified by parameter * @param rs @@ -2429,9 +2442,6 @@ public HRegionServer getOtherRegionServer(HRegionServer rs) { /** * Tool to get the reference to the region server object that holds the * region of the specified user table. - * It first searches for the meta rows that contain the region of the - * specified table, then gets the index of that RS, and finally retrieves - * the RS's reference. * @param tableName user table to lookup in hbase:meta * @return region server that holds it, null if the row doesn't exist * @throws IOException @@ -2439,21 +2449,27 @@ public HRegionServer getOtherRegionServer(HRegionServer rs) { */ public HRegionServer getRSForFirstRegionInTable(TableName tableName) throws IOException, InterruptedException { - List metaRows = getMetaTableRows(tableName); - if (metaRows == null || metaRows.isEmpty()) { + List regions = getRegions(tableName); + if (regions == null || regions.isEmpty()) { return null; } - LOG.debug("Found " + metaRows.size() + " rows for table " + - tableName); - byte [] firstrow = metaRows.get(0); - LOG.debug("FirstRow=" + Bytes.toString(firstrow)); + LOG.debug("Found " + regions.size() + " regions for table " + + tableName); + + byte[] firstRegionName = regions.stream() + .filter(r -> !r.isOffline()) + .map(RegionInfo::getRegionName) + .findFirst() + .orElseThrow(() -> new IOException("online regions not found in table " + tableName)); + + LOG.debug("firstRegionName=" + Bytes.toString(firstRegionName)); long pause = getConfiguration().getLong(HConstants.HBASE_CLIENT_PAUSE, HConstants.DEFAULT_HBASE_CLIENT_PAUSE); int numRetries = getConfiguration().getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER); RetryCounter retrier = new RetryCounter(numRetries+1, (int)pause, TimeUnit.MICROSECONDS); while(retrier.shouldRetry()) { - int index = getMiniHBaseCluster().getServerWith(firstrow); + int index = getMiniHBaseCluster().getServerWith(firstRegionName); if (index != -1) { return getMiniHBaseCluster().getRegionServerThreads().get(index).getRegionServer(); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestRestoreSnapshotFromClient.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestRestoreSnapshotFromClient.java index eb8b20e124fb..07044ee37932 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestRestoreSnapshotFromClient.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestRestoreSnapshotFromClient.java @@ -25,7 +25,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseClassTestRule; @@ -307,6 +306,12 @@ public void testRestoreSnapshotAfterSplittingRegions() throws IOException, Inter // Take a snapshot admin.snapshot(snapshotName1, tableName); + // Load more data + SnapshotTestingUtils.loadData(TEST_UTIL, tableName, 500, FAMILY); + + // Split the second region + splitRegion(regionInfos.get(1)); + // Restore the snapshot admin.disableTable(tableName); admin.restoreSnapshot(snapshotName1); @@ -315,6 +320,28 @@ public void testRestoreSnapshotAfterSplittingRegions() throws IOException, Inter verifyRowCount(TEST_UTIL, tableName, snapshot1Rows); } + @Test + public void testRestoreSnapshotAfterTruncate() throws Exception { + TableName tableName = TableName.valueOf("testRestoreSnapshotAfterTruncate"); + SnapshotTestingUtils.createTable(TEST_UTIL, tableName, getNumReplicas(), FAMILY); + SnapshotTestingUtils.loadData(TEST_UTIL, tableName, 500, FAMILY); + int numOfRows = 0; + + try (Table table = TEST_UTIL.getConnection().getTable(tableName)) { + numOfRows = countRows(table); + } + // take snapshot + admin.snapshot("snap", tableName); + admin.disableTable(tableName); + admin.truncateTable(tableName, false); + admin.disableTable(tableName); + admin.restoreSnapshot("snap"); + + admin.enableTable(tableName); + verifyRowCount(TEST_UTIL, tableName, numOfRows); + SnapshotTestingUtils.verifyReplicasCameOnline(tableName, admin, getNumReplicas()); + } + // ========================================================================== // Helpers // ==========================================================================