diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/AbstractCreateIndexCommand.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/AbstractCreateIndexCommand.java index 5b5b401af8a..9187abb521d 100644 --- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/AbstractCreateIndexCommand.java +++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/AbstractCreateIndexCommand.java @@ -60,7 +60,7 @@ public abstract class AbstractCreateIndexCommand extends AbstractIndexCommand { this.columns = copyOrNull(columns); } - protected abstract CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int creationCatalogVersion); + protected abstract CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int zoneId, int creationCatalogVersion); @Override public List get(Catalog catalog) { @@ -84,7 +84,9 @@ public List get(Catalog catalog) { } return List.of( - new NewIndexEntry(createDescriptor(catalog.objectIdGenState(), table.id(), catalog.version() + 1), schemaName), + new NewIndexEntry( + createDescriptor(catalog.objectIdGenState(), table.id(), table.zoneId(), catalog.version() + 1), schemaName + ), new ObjectIdGenUpdateEntry(1) ); } diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateHashIndexCommand.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateHashIndexCommand.java index 45662b39503..44dc6e2fd68 100644 --- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateHashIndexCommand.java +++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateHashIndexCommand.java @@ -48,9 +48,9 @@ private CreateHashIndexCommand(String schemaName, String indexName, String table } @Override - protected CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int creationCatalogVersion) { + protected CatalogIndexDescriptor 
createDescriptor(int indexId, int tableId, int zoneId, int creationCatalogVersion) { return new CatalogHashIndexDescriptor( - indexId, indexName, tableId, unique, creationCatalogVersion, columns + indexId, indexName, tableId, unique, creationCatalogVersion, zoneId, columns ); } diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateSortedIndexCommand.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateSortedIndexCommand.java index 34c9eb9e9a0..b1cc4dbb65a 100644 --- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateSortedIndexCommand.java +++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateSortedIndexCommand.java @@ -61,7 +61,7 @@ private CreateSortedIndexCommand(String schemaName, String indexName, String tab } @Override - protected CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int creationCatalogVersion) { + protected CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int zoneId, int creationCatalogVersion) { var indexColumnDescriptors = new ArrayList(columns.size()); for (int i = 0; i < columns.size(); i++) { @@ -71,7 +71,7 @@ protected CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int } return new CatalogSortedIndexDescriptor( - indexId, indexName, tableId, unique, creationCatalogVersion, indexColumnDescriptors + indexId, indexName, tableId, unique, creationCatalogVersion, zoneId, indexColumnDescriptors ); } diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateTableCommand.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateTableCommand.java index 565cdce0bfc..f043c2d8a5a 100644 --- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateTableCommand.java +++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateTableCommand.java @@ -147,7 
+147,7 @@ public List get(Catalog catalog) { ensureNoTableIndexOrSysViewExistsWithGivenName(schema, indexName); int txWaitCatalogVersion = catalog.version() + 1; - CatalogIndexDescriptor pkIndex = createIndexDescriptor(txWaitCatalogVersion, indexName, pkIndexId, tableId); + CatalogIndexDescriptor pkIndex = createIndexDescriptor(txWaitCatalogVersion, indexName, pkIndexId, tableId, zone.id()); return List.of( new NewTableEntry(table, schemaName), @@ -200,7 +200,13 @@ private void validate() { } } - private CatalogIndexDescriptor createIndexDescriptor(int txWaitCatalogVersion, String indexName, int pkIndexId, int tableId) { + private CatalogIndexDescriptor createIndexDescriptor( + int txWaitCatalogVersion, + String indexName, + int pkIndexId, + int tableId, + int zoneId + ) { CatalogIndexDescriptor pkIndex; if (primaryKey instanceof TableSortedPrimaryKey) { @@ -221,6 +227,7 @@ private CatalogIndexDescriptor createIndexDescriptor(int txWaitCatalogVersion, S true, AVAILABLE, txWaitCatalogVersion, + zoneId, indexColumns ); } else if (primaryKey instanceof TableHashPrimaryKey) { @@ -232,6 +239,7 @@ private CatalogIndexDescriptor createIndexDescriptor(int txWaitCatalogVersion, S true, AVAILABLE, txWaitCatalogVersion, + zoneId, hashPrimaryKey.columns() ); } else { diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptor.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptor.java index e251a6016fb..a25ab5b88e1 100644 --- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptor.java +++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptor.java @@ -45,11 +45,20 @@ public class CatalogHashIndexDescriptor extends CatalogIndexDescriptor { * @param unique Unique flag. 
* @param txWaitCatalogVersion Catalog version used in special index status updates to wait for RW transactions, started before * this version, to finish. + * @param zoneId Zone id where table for the index is presented. * @param columns A list of indexed columns. Must not contains duplicates. * @throws IllegalArgumentException If columns list contains duplicates. */ - public CatalogHashIndexDescriptor(int id, String name, int tableId, boolean unique, int txWaitCatalogVersion, List columns) { - this(id, name, tableId, unique, CatalogIndexStatus.REGISTERED, txWaitCatalogVersion, columns, INITIAL_CAUSALITY_TOKEN); + public CatalogHashIndexDescriptor( + int id, + String name, + int tableId, + boolean unique, + int txWaitCatalogVersion, + int zoneId, + List columns + ) { + this(id, name, tableId, unique, CatalogIndexStatus.REGISTERED, txWaitCatalogVersion, zoneId, columns, INITIAL_CAUSALITY_TOKEN); } /** @@ -72,9 +81,10 @@ public CatalogHashIndexDescriptor( boolean unique, CatalogIndexStatus status, int txWaitCatalogVersion, + int zoneId, List columns ) { - this(id, name, tableId, unique, status, txWaitCatalogVersion, columns, INITIAL_CAUSALITY_TOKEN); + this(id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, columns, INITIAL_CAUSALITY_TOKEN); } /** @@ -98,10 +108,11 @@ private CatalogHashIndexDescriptor( boolean unique, CatalogIndexStatus status, int txWaitCatalogVersion, + int zoneId, List columns, long causalityToken ) { - super(CatalogIndexDescriptorType.HASH, id, name, tableId, unique, status, txWaitCatalogVersion, causalityToken); + super(CatalogIndexDescriptorType.HASH, id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, causalityToken); this.columns = List.copyOf(Objects.requireNonNull(columns, "columns")); } @@ -126,9 +137,10 @@ public CatalogHashIndexDescriptor readFrom(IgniteDataInput input) throws IOExcep boolean unique = input.readBoolean(); CatalogIndexStatus status = CatalogIndexStatus.forId(input.readByte()); int 
txWaitCatalogVersion = input.readInt(); + int zoneId = input.readInt(); List columns = readStringCollection(input, ArrayList::new); - return new CatalogHashIndexDescriptor(id, name, tableId, unique, status, txWaitCatalogVersion, columns, updateToken); + return new CatalogHashIndexDescriptor(id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, columns, updateToken); } @Override @@ -140,6 +152,7 @@ public void writeTo(CatalogHashIndexDescriptor descriptor, IgniteDataOutput outp output.writeBoolean(descriptor.unique()); output.writeByte(descriptor.status().id()); output.writeInt(descriptor.txWaitCatalogVersion()); + output.writeInt(descriptor.zoneId()); writeStringCollection(descriptor.columns(), output); } } diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogIndexDescriptor.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogIndexDescriptor.java index 93a1affae6f..9a011be3706 100644 --- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogIndexDescriptor.java +++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogIndexDescriptor.java @@ -39,14 +39,18 @@ public abstract class CatalogIndexDescriptor extends CatalogObjectDescriptor { /** Index descriptor type. */ private final CatalogIndexDescriptorType indexType; + /** Zone id where table for the index is presented. 
*/ + private final int zoneId; + CatalogIndexDescriptor(CatalogIndexDescriptorType indexType, int id, String name, int tableId, boolean unique, - CatalogIndexStatus status, int txWaitCatalogVersion, long causalityToken) { + CatalogIndexStatus status, int txWaitCatalogVersion, int zoneId, long causalityToken) { super(id, Type.INDEX, name, causalityToken); this.indexType = indexType; this.tableId = tableId; this.unique = unique; this.status = Objects.requireNonNull(status, "status"); this.txWaitCatalogVersion = txWaitCatalogVersion; + this.zoneId = zoneId; } /** Gets table ID. */ @@ -72,6 +76,11 @@ public int txWaitCatalogVersion() { return txWaitCatalogVersion; } + /** Return zone id where table for the index is presented. */ + public int zoneId() { + return zoneId; + } + /** Returns catalog index descriptor type. */ public CatalogIndexDescriptorType indexType() { return indexType; diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptor.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptor.java index 6697c162596..e06907511bd 100644 --- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptor.java +++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptor.java @@ -46,6 +46,7 @@ public class CatalogSortedIndexDescriptor extends CatalogIndexDescriptor { * @param unique Unique flag. * @param txWaitCatalogVersion Catalog version used in special index status updates to wait for RW transactions, started before * this version, to finish. + * @param zoneId Zone id where table for the index is presented. * @param columns A list of columns descriptors. * @throws IllegalArgumentException If columns list contains duplicates or columns size doesn't match the collations size. 
*/ @@ -55,9 +56,10 @@ public CatalogSortedIndexDescriptor( int tableId, boolean unique, int txWaitCatalogVersion, + int zoneId, List columns ) { - this(id, name, tableId, unique, REGISTERED, txWaitCatalogVersion, columns); + this(id, name, tableId, unique, REGISTERED, txWaitCatalogVersion, zoneId, columns); } /** @@ -80,9 +82,10 @@ public CatalogSortedIndexDescriptor( boolean unique, CatalogIndexStatus status, int txWaitCatalogVersion, + int zoneId, List columns ) { - this(id, name, tableId, unique, status, txWaitCatalogVersion, columns, INITIAL_CAUSALITY_TOKEN); + this(id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, columns, INITIAL_CAUSALITY_TOKEN); } /** @@ -106,10 +109,11 @@ private CatalogSortedIndexDescriptor( boolean unique, CatalogIndexStatus status, int txWaitCatalogVersion, + int zoneId, List columns, long causalityToken ) { - super(CatalogIndexDescriptorType.SORTED, id, name, tableId, unique, status, txWaitCatalogVersion, causalityToken); + super(CatalogIndexDescriptorType.SORTED, id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, causalityToken); this.columns = Objects.requireNonNull(columns, "columns"); } @@ -134,9 +138,10 @@ public CatalogSortedIndexDescriptor readFrom(IgniteDataInput input) throws IOExc boolean unique = input.readBoolean(); CatalogIndexStatus status = CatalogIndexStatus.forId(input.readByte()); int txWaitCatalogVersion = input.readInt(); + int zoneId = input.readInt(); List columns = readList(CatalogIndexColumnDescriptor.SERIALIZER, input); - return new CatalogSortedIndexDescriptor(id, name, tableId, unique, status, txWaitCatalogVersion, columns, updateToken); + return new CatalogSortedIndexDescriptor(id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, columns, updateToken); } @Override @@ -148,6 +153,7 @@ public void writeTo(CatalogSortedIndexDescriptor descriptor, IgniteDataOutput ou output.writeBoolean(descriptor.unique()); output.writeByte(descriptor.status().id()); 
output.writeInt(descriptor.txWaitCatalogVersion()); + output.writeInt(descriptor.zoneId()); writeList(descriptor.columns(), CatalogIndexColumnDescriptor.SERIALIZER, output); } } diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/AbstractChangeIndexStatusEntry.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/AbstractChangeIndexStatusEntry.java index 6c6baac45fa..884440dc21c 100644 --- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/AbstractChangeIndexStatusEntry.java +++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/AbstractChangeIndexStatusEntry.java @@ -111,6 +111,7 @@ private static CatalogIndexDescriptor updateHashIndexStatus( index.unique(), newStatus, txWaitCatalogVersion, + index.zoneId(), index.columns() ); } @@ -125,6 +126,7 @@ private static CatalogIndexDescriptor updateSortedIndexStatus( index.unique(), newStatus, txWaitCatalogVersion, + index.zoneId(), index.columns() ); } diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/RenameIndexEntry.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/RenameIndexEntry.java index 1b2278f21da..f1ec0f2400a 100644 --- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/RenameIndexEntry.java +++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/RenameIndexEntry.java @@ -101,6 +101,7 @@ private CatalogIndexDescriptor changeHashIndexName(CatalogHashIndexDescriptor in index.unique(), index.status(), index.txWaitCatalogVersion(), + index.zoneId(), index.columns() ); } @@ -113,6 +114,7 @@ private CatalogIndexDescriptor changeSortedIndexName(CatalogSortedIndexDescripto index.unique(), index.status(), index.txWaitCatalogVersion(), + index.zoneId(), index.columns() ); } diff --git 
a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/AbstractChangeIndexStatusCommandValidationTest.java b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/AbstractChangeIndexStatusCommandValidationTest.java index c40cf4d5409..7db0c9132e0 100644 --- a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/AbstractChangeIndexStatusCommandValidationTest.java +++ b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/AbstractChangeIndexStatusCommandValidationTest.java @@ -88,6 +88,7 @@ void exceptionIsThrownIfIndexHasInvalidPreviousStatus(CatalogIndexStatus invalid false, invalidPreviousIndexStatus, version, + 0, List.of(columnName) ) }, diff --git a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/CatalogUtilsTest.java b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/CatalogUtilsTest.java index a1e24c4bb84..210b1e2bedc 100644 --- a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/CatalogUtilsTest.java +++ b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/CatalogUtilsTest.java @@ -367,6 +367,7 @@ void testReplaceIndex() { fooIndex.unique(), fooIndex.status(), fooIndex.txWaitCatalogVersion(), + fooIndex.zoneId(), fooIndex.columns() ); diff --git a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptorTest.java b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptorTest.java index d6153169ec2..b8b565975dc 100644 --- a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptorTest.java +++ b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptorTest.java @@ -27,7 +27,7 @@ class CatalogHashIndexDescriptorTest { @Test void toStringContainsTypeAndFields() { - var descriptor = new 
CatalogHashIndexDescriptor(1, "index1", 2, false, 3, List.of("col")); + var descriptor = new CatalogHashIndexDescriptor(1, "index1", 2, false, 3, 0, List.of("col")); String toString = descriptor.toString(); @@ -36,5 +36,6 @@ void toStringContainsTypeAndFields() { assertThat(toString, containsString("name=index1")); assertThat(toString, containsString("tableId=2")); assertThat(toString, containsString("status=REGISTERED")); + assertThat(toString, containsString("zoneId=0")); } } diff --git a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptorTest.java b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptorTest.java index 979ff659ed6..c461d6a009d 100644 --- a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptorTest.java +++ b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptorTest.java @@ -27,7 +27,7 @@ class CatalogSortedIndexDescriptorTest { @Test void toStringContainsTypeAndFields() { - var descriptor = new CatalogSortedIndexDescriptor(1, "index1", 2, false, 3, List.of()); + var descriptor = new CatalogSortedIndexDescriptor(1, "index1", 2, false, 3, 0, List.of()); String toString = descriptor.toString(); @@ -36,5 +36,6 @@ void toStringContainsTypeAndFields() { assertThat(toString, containsString("name=index1")); assertThat(toString, containsString("tableId=2")); assertThat(toString, containsString("status=REGISTERED")); + assertThat(toString, containsString("zoneId=0")); } } diff --git a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/storage/CatalogEntrySerializationTest.java b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/storage/CatalogEntrySerializationTest.java index ac0e23b8edc..c346fd4acd3 100644 --- a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/storage/CatalogEntrySerializationTest.java +++ 
b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/storage/CatalogEntrySerializationTest.java @@ -452,12 +452,12 @@ private static CatalogSortedIndexDescriptor newSortedIndexDescriptor(String name CatalogIndexColumnDescriptor idxCol4 = new CatalogIndexColumnDescriptor("C4", CatalogColumnCollation.ASC_NULLS_LAST); return new CatalogSortedIndexDescriptor( - 1, name, 12, false, CatalogIndexStatus.AVAILABLE, 1, List.of(idxCol1, idxCol2, idxCol3, idxCol4)); + 1, name, 12, false, CatalogIndexStatus.AVAILABLE, 1, 0, List.of(idxCol1, idxCol2, idxCol3, idxCol4)); } private static CatalogHashIndexDescriptor newHashIndexDescriptor(String name) { return new CatalogHashIndexDescriptor( - 1, name, 12, true, CatalogIndexStatus.REGISTERED, 1, List.of("C1", "C2")); + 1, name, 12, true, CatalogIndexStatus.REGISTERED, 1, 0, List.of("C1", "C2")); } private static CatalogTableDescriptor newTableDescriptor(String name, List columns) { diff --git a/modules/client-handler/src/main/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTracker.java b/modules/client-handler/src/main/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTracker.java index ee040d01591..514b496a4a2 100644 --- a/modules/client-handler/src/main/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTracker.java +++ b/modules/client-handler/src/main/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTracker.java @@ -45,6 +45,7 @@ import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.table.LongPriorityQueue; import org.apache.ignite.internal.table.distributed.schema.SchemaSyncService; import org.apache.ignite.internal.util.ExceptionUtils; @@ -291,11 +292,13 @@ void stop() { private void 
onPrimaryReplicaChanged(PrimaryReplicaEventParameters primaryReplicaEvent) { inBusyLock(busyLock, () -> { - if (!(primaryReplicaEvent.groupId() instanceof TablePartitionId)) { + if (!(primaryReplicaEvent.groupId() instanceof ZonePartitionId)) { return; } - TablePartitionId tablePartitionId = (TablePartitionId) primaryReplicaEvent.groupId(); + ZonePartitionId zonePartitionId = (ZonePartitionId) primaryReplicaEvent.groupId(); + + TablePartitionId tablePartitionId = new TablePartitionId(zonePartitionId.tableId(), zonePartitionId.partitionId()); updatePrimaryReplica(tablePartitionId, primaryReplicaEvent.startTime(), primaryReplicaEvent.leaseholder()); }); diff --git a/modules/client-handler/src/test/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTrackerTest.java b/modules/client-handler/src/test/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTrackerTest.java index 01e1f6b0b2a..01a643ed823 100644 --- a/modules/client-handler/src/test/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTrackerTest.java +++ b/modules/client-handler/src/test/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTrackerTest.java @@ -43,6 +43,8 @@ class ClientPrimaryReplicaTrackerTest extends BaseIgniteAbstractTest { private static final int TABLE_ID = 123; + private static final int ZONE_ID = 1234; + private ClientPrimaryReplicaTracker tracker; private FakePlacementDriver driver; @@ -52,7 +54,7 @@ class ClientPrimaryReplicaTrackerTest extends BaseIgniteAbstractTest { @BeforeEach public void setUp() throws Exception { driver = new FakePlacementDriver(PARTITIONS); - driver.setReplicas(List.of("s1", "s2"), TABLE_ID, 1); + driver.setReplicas(List.of("s1", "s2"), TABLE_ID, ZONE_ID, 1); InternalTable internalTable = mock(InternalTable.class); when(internalTable.partitions()).thenReturn(PARTITIONS); @@ -90,7 +92,7 @@ public void testUpdateByEvent() { tracker.start(); assertEquals(1, tracker.maxStartTime()); - driver.updateReplica("s3", TABLE_ID, 0, 2); + 
driver.updateReplica("s3", TABLE_ID, ZONE_ID, 0, 2); assertEquals(2, tracker.maxStartTime()); @@ -102,11 +104,11 @@ public void testUpdateByEvent() { @Test public void testNullReplicas() { - driver.updateReplica(null, TABLE_ID, 0, 2); + driver.updateReplica(null, TABLE_ID, ZONE_ID, 0, 2); tracker.start(); assertEquals(1, tracker.maxStartTime()); - driver.updateReplica(null, TABLE_ID, 1, 2); + driver.updateReplica(null, TABLE_ID, ZONE_ID, 1, 2); assertEquals(2, tracker.maxStartTime()); @@ -136,10 +138,10 @@ public void testOldEventsAreIgnoredByLeaseStartTime() { tracker.start(); tracker.primaryReplicasAsync(TABLE_ID, null).join(); // Start tracking the table. - driver.updateReplica("update-1", TABLE_ID, 0, 10); - driver.updateReplica("old-update-2", TABLE_ID, 0, 5); - driver.updateReplica("update-3", TABLE_ID, 0, 15); - driver.updateReplica("old-update-4", TABLE_ID, 0, 14); + driver.updateReplica("update-1", TABLE_ID, ZONE_ID, 0, 10); + driver.updateReplica("old-update-2", TABLE_ID, ZONE_ID, 0, 5); + driver.updateReplica("update-3", TABLE_ID, ZONE_ID, 0, 15); + driver.updateReplica("old-update-4", TABLE_ID, ZONE_ID, 0, 14); assertEquals(15, tracker.maxStartTime()); diff --git a/modules/client-handler/src/testFixtures/java/org/apache/ignite/client/handler/FakePlacementDriver.java b/modules/client-handler/src/testFixtures/java/org/apache/ignite/client/handler/FakePlacementDriver.java index 5853de37f18..7d6adf45e1a 100644 --- a/modules/client-handler/src/testFixtures/java/org/apache/ignite/client/handler/FakePlacementDriver.java +++ b/modules/client-handler/src/testFixtures/java/org/apache/ignite/client/handler/FakePlacementDriver.java @@ -22,16 +22,19 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; import org.apache.ignite.internal.event.AbstractEventProducer; import org.apache.ignite.internal.hlc.HybridTimestamp; +import 
org.apache.ignite.internal.lang.IgniteInternalException; import org.apache.ignite.internal.placementdriver.PlacementDriver; import org.apache.ignite.internal.placementdriver.ReplicaMeta; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; import org.apache.ignite.internal.replicator.ReplicationGroupId; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; /** * Fake placement driver. @@ -56,21 +59,21 @@ public void returnError(boolean returnError) { /** * Sets all primary replicas. */ - public void setReplicas(List replicas, int tableId, long leaseStartTime) { + public void setReplicas(List replicas, int tableId, int zoneId, long leaseStartTime) { assert replicas.size() == partitions; for (int partition = 0; partition < replicas.size(); partition++) { String replica = replicas.get(partition); - updateReplica(replica, tableId, partition, leaseStartTime); + updateReplica(replica, tableId, zoneId, partition, leaseStartTime); } } /** * Sets primary replica for the given partition. 
*/ - public void updateReplica(String replica, int tableId, int partition, long leaseStartTime) { + public void updateReplica(String replica, int tableId, int zoneId, int partition, long leaseStartTime) { primaryReplicas.set(partition, getReplicaMeta(replica, leaseStartTime)); - TablePartitionId groupId = new TablePartitionId(tableId, partition); + ZonePartitionId groupId = new ZonePartitionId(zoneId, tableId, partition); PrimaryReplicaEventParameters params = new PrimaryReplicaEventParameters( 0, @@ -93,6 +96,16 @@ public CompletableFuture awaitPrimaryReplica(ReplicationGroupId gro : CompletableFuture.completedFuture(primaryReplicas.get(id.partitionId())); } + @Override + public CompletableFuture awaitPrimaryReplicaForTable( + ReplicationGroupId groupId, + HybridTimestamp timestamp, + long timeout, + TimeUnit unit + ) { + throw new IgniteInternalException("Not implemented yet."); + } + @Override public CompletableFuture getPrimaryReplica(ReplicationGroupId replicationGroupId, HybridTimestamp timestamp) { return awaitPrimaryReplica(replicationGroupId, timestamp, 0, TimeUnit.MILLISECONDS); @@ -125,6 +138,25 @@ public HybridTimestamp getStartTime() { public HybridTimestamp getExpirationTime() { return HybridTimestamp.MAX_VALUE; } + + @Override + public Set subgroups() { + return Set.of(); + } }; } + + @Override + public CompletableFuture addSubgroups( + ZonePartitionId zoneId, + Long enlistmentConsistencyToken, + Set subGrps + ) { + return nullCompletedFuture(); + } + + @Override + public ReplicaMeta getLeaseMeta(ReplicationGroupId grpId) { + return null; + } } diff --git a/modules/client/src/test/java/org/apache/ignite/client/PartitionAwarenessTest.java b/modules/client/src/test/java/org/apache/ignite/client/PartitionAwarenessTest.java index 8ac8a715615..2bb4e8a29c1 100644 --- a/modules/client/src/test/java/org/apache/ignite/client/PartitionAwarenessTest.java +++ b/modules/client/src/test/java/org/apache/ignite/client/PartitionAwarenessTest.java @@ -86,6 +86,8 @@ 
public class PartitionAwarenessTest extends AbstractClientTest { private static final AtomicInteger nextTableId = new AtomicInteger(101); + private static final int zoneId = 1234; + /** * Before all. */ @@ -651,7 +653,7 @@ private static void initPrimaryReplicas(FakePlacementDriver placementDriver, @Nu replicas = defaultReplicas(); } - placementDriver.setReplicas(replicas, nextTableId.get() - 1, leaseStartTime); + placementDriver.setReplicas(replicas, nextTableId.get() - 1, zoneId, leaseStartTime); } private static List defaultReplicas() { diff --git a/modules/client/src/test/java/org/apache/ignite/client/fakes/FakeInternalTable.java b/modules/client/src/test/java/org/apache/ignite/client/fakes/FakeInternalTable.java index 8016953e97f..0811a1f0603 100644 --- a/modules/client/src/test/java/org/apache/ignite/client/fakes/FakeInternalTable.java +++ b/modules/client/src/test/java/org/apache/ignite/client/fakes/FakeInternalTable.java @@ -116,6 +116,11 @@ public int partitionId(BinaryRowEx row) { return 0; } + @Override + public int zoneId() { + return 123; + } + @Override public CompletableFuture get(BinaryRowEx keyRow, @Nullable InternalTransaction tx) { return completedFuture(getImpl(keyRow.tupleSlice(), keyRow)); diff --git a/modules/compute/src/main/java/org/apache/ignite/internal/compute/IgniteComputeImpl.java b/modules/compute/src/main/java/org/apache/ignite/internal/compute/IgniteComputeImpl.java index 0bd0d64e36f..5f7beff0d4c 100644 --- a/modules/compute/src/main/java/org/apache/ignite/internal/compute/IgniteComputeImpl.java +++ b/modules/compute/src/main/java/org/apache/ignite/internal/compute/IgniteComputeImpl.java @@ -50,7 +50,7 @@ import org.apache.ignite.compute.task.ComputeJobRunner; import org.apache.ignite.internal.hlc.HybridClock; import org.apache.ignite.internal.placementdriver.PlacementDriver; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import 
org.apache.ignite.internal.table.IgniteTablesInternal; import org.apache.ignite.internal.table.TableViewInternal; import org.apache.ignite.internal.util.CompletableFutures; @@ -320,9 +320,9 @@ private CompletableFuture primaryReplicaForPartitionByMappedKey } private CompletableFuture primaryReplicaForPartition(TableViewInternal table, int partitionIndex) { - TablePartitionId tablePartitionId = new TablePartitionId(table.tableId(), partitionIndex); + ZonePartitionId zonePartitionId = new ZonePartitionId(table.internalTable().zoneId(), table.tableId(), partitionIndex); - return placementDriver.awaitPrimaryReplica(tablePartitionId, clock.now(), 30, TimeUnit.SECONDS) + return placementDriver.awaitPrimaryReplicaForTable(zonePartitionId, clock.now(), 30, TimeUnit.SECONDS) .thenApply(replicaMeta -> { if (replicaMeta != null && replicaMeta.getLeaseholderId() != null) { return topologyService.getById(replicaMeta.getLeaseholderId()); diff --git a/modules/compute/src/main/java/org/apache/ignite/internal/compute/NextColocatedWorkerSelector.java b/modules/compute/src/main/java/org/apache/ignite/internal/compute/NextColocatedWorkerSelector.java index a9055ad5f51..5eb89bd170b 100644 --- a/modules/compute/src/main/java/org/apache/ignite/internal/compute/NextColocatedWorkerSelector.java +++ b/modules/compute/src/main/java/org/apache/ignite/internal/compute/NextColocatedWorkerSelector.java @@ -22,7 +22,7 @@ import org.apache.ignite.internal.hlc.HybridClock; import org.apache.ignite.internal.placementdriver.PlacementDriver; import org.apache.ignite.internal.placementdriver.ReplicaMeta; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.table.TableViewInternal; import org.apache.ignite.network.ClusterNode; import org.apache.ignite.network.TopologyService; @@ -93,9 +93,9 @@ private NextColocatedWorkerSelector( this.tuple = tuple; } - private CompletableFuture 
tryToFindPrimaryReplica(TablePartitionId tablePartitionId) { - return placementDriver.awaitPrimaryReplica( - tablePartitionId, + private CompletableFuture tryToFindPrimaryReplica(ZonePartitionId zonePartitionId) { + return placementDriver.awaitPrimaryReplicaForTable( + zonePartitionId, clock.now().addPhysicalTime(PRIMARY_REPLICA_ASK_CLOCK_ADDITION_MILLIS), AWAIT_FOR_PRIMARY_REPLICA_SECONDS, TimeUnit.SECONDS @@ -105,15 +105,15 @@ private CompletableFuture tryToFindPrimaryReplica(TablePartitionId @Override public CompletableFuture next() { - TablePartitionId tablePartitionId = tablePartitionId(); - return tryToFindPrimaryReplica(tablePartitionId); + ZonePartitionId zonePartitionId = zonePartitionId(); + return tryToFindPrimaryReplica(zonePartitionId); } - private TablePartitionId tablePartitionId() { + private ZonePartitionId zonePartitionId() { if (key != null && keyMapper != null) { - return new TablePartitionId(table.tableId(), table.partition(key, keyMapper)); + return new ZonePartitionId(table.internalTable().zoneId(), table.tableId(), table.partition(key, keyMapper)); } else { - return new TablePartitionId(table.tableId(), table.partition(tuple)); + return new ZonePartitionId(table.internalTable().zoneId(), table.tableId(), table.partition(tuple)); } } } diff --git a/modules/compute/src/test/java/org/apache/ignite/internal/compute/IgniteComputeImplTest.java b/modules/compute/src/test/java/org/apache/ignite/internal/compute/IgniteComputeImplTest.java index 7a078086e17..a8940d85e06 100644 --- a/modules/compute/src/test/java/org/apache/ignite/internal/compute/IgniteComputeImplTest.java +++ b/modules/compute/src/test/java/org/apache/ignite/internal/compute/IgniteComputeImplTest.java @@ -49,6 +49,7 @@ import org.apache.ignite.internal.placementdriver.PlacementDriver; import org.apache.ignite.internal.placementdriver.ReplicaMeta; import org.apache.ignite.internal.table.IgniteTablesInternal; +import org.apache.ignite.internal.table.InternalTable; import 
org.apache.ignite.internal.table.TableViewInternal; import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest; import org.apache.ignite.network.ClusterNode; @@ -92,6 +93,9 @@ class IgniteComputeImplTest extends BaseIgniteAbstractTest { @Mock private TableViewInternal table; + @Mock + private InternalTable internalTable; + private final ClusterNode localNode = new ClusterNodeImpl("local", "local", new NetworkAddress("local-host", 1)); private final ClusterNode remoteNode = new ClusterNodeImpl("remote", "remote", new NetworkAddress("remote-host", 1)); @@ -103,6 +107,8 @@ void setupMocks() { lenient().when(topologyService.localMember()).thenReturn(localNode); lenient().when(topologyService.getByConsistentId(localNode.name())).thenReturn(localNode); lenient().when(topologyService.getByConsistentId(remoteNode.name())).thenReturn(remoteNode); + lenient().when(table.internalTable()).thenReturn(internalTable); + lenient().when(table.tableId()).thenReturn(42); } @Test @@ -212,7 +218,7 @@ private void respondWhenAskForPrimaryReplica() { ReplicaMeta replicaMeta = mock(ReplicaMeta.class); doReturn("").when(replicaMeta).getLeaseholderId(); CompletableFuture toBeReturned = completedFuture(replicaMeta); - doReturn(toBeReturned).when(placementDriver).awaitPrimaryReplica(any(), any(), anyLong(), any()); + doReturn(toBeReturned).when(placementDriver).awaitPrimaryReplicaForTable(any(), any(), anyLong(), any()); doReturn(remoteNode).when(topologyService).getById(any()); } diff --git a/modules/core/src/main/java/org/apache/ignite/internal/replicator/ZonePartitionId.java b/modules/core/src/main/java/org/apache/ignite/internal/replicator/ZonePartitionId.java new file mode 100644 index 00000000000..d6c07698370 --- /dev/null +++ b/modules/core/src/main/java/org/apache/ignite/internal/replicator/ZonePartitionId.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.internal.replicator; + +import java.util.Objects; + +/** + * The class is used to identify a zone replication group id for a given partition. + */ +public class ZonePartitionId implements ReplicationGroupId { + private final int zoneId; + + private final int tableId; + + private final int partId; + + /** + * The constructor. + * + * @param zoneId Zone id. + * @param partId Partition id. + */ + public ZonePartitionId(int zoneId, int partId) { + this.zoneId = zoneId; + this.partId = partId; + this.tableId = 0; + } + + /** + * The constructor. + * + * @param zoneId Zone id. + * @param tableId Table id. + * @param partId Partition id. + */ + public ZonePartitionId(int zoneId, int tableId, int partId) { + assert tableId != 0 : "Use constructor with two parameters."; + + this.zoneId = zoneId; + this.tableId = tableId; + this.partId = partId; + } + + /** + * Get the zone id. + * + * @return Zone id. + */ + public int zoneId() { + return zoneId; + } + + /** + * Get the table id. + * + * @return Table id. + */ + public int tableId() { + return tableId; + } + + /** + * Get the partition id. + * + * @return Partition id. + */ + public int partitionId() { + return partId; + } + + /** + * Converts a string representation of zone partition id to the object. 
+ * + * @param str String representation. + * @return A zone partition id. + */ + public static ZonePartitionId fromString(String str) { + String[] parts = str.split("_part_"); + + return new ZonePartitionId(Integer.parseInt(parts[0]), Integer.parseInt(parts[1])); + } + + /** + * Returns this object if its table id is not set (equals 0); otherwise creates a new zone partition id with the same zone id and partition id but without the table id. + * + * @return Pure zone partition id. + */ + public ZonePartitionId purify() { + if (tableId == 0) { + return this; + } + + return new ZonePartitionId(zoneId, partId); + } + + @Override + public String toString() { + return zoneId + "_part_" + partId; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + ZonePartitionId that = (ZonePartitionId) o; + + return zoneId == that.zoneId && partId == that.partId && tableId == that.tableId; + } + + @Override + public int hashCode() { + return Objects.hash(zoneId, partId, tableId); + } +} diff --git a/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItDistributionZonesFiltersTest.java b/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItDistributionZonesFiltersTest.java index e676b39c857..9f336298be2 100644 --- a/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItDistributionZonesFiltersTest.java +++ b/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItDistributionZonesFiltersTest.java @@ -45,7 +45,7 @@ import org.apache.ignite.internal.lang.ByteArray; import org.apache.ignite.internal.metastorage.Entry; import org.apache.ignite.internal.metastorage.MetaStorageManager; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import
org.apache.ignite.internal.table.TableViewInternal; import org.apache.ignite.internal.table.distributed.TableManager; import org.apache.ignite.internal.testframework.IgniteTestUtils; @@ -130,11 +130,7 @@ void testFilteredDataNodesPropagatedToStable() throws Exception { MetaStorageManager metaStorageManager = (MetaStorageManager) IgniteTestUtils .getFieldValue(node, IgniteImpl.class, "metaStorageMgr"); - TableManager tableManager = (TableManager) IgniteTestUtils.getFieldValue(node, IgniteImpl.class, "distributedTblMgr"); - - TableViewInternal table = (TableViewInternal) tableManager.table(TABLE_NAME); - - TablePartitionId partId = new TablePartitionId(table.tableId(), 0); + ZonePartitionId partId = new ZonePartitionId(getZoneId(node), 0); assertValueInStorage( metaStorageManager, @@ -199,11 +195,7 @@ void testAlteringFiltersPropagatedDataNodesToStableImmediately() throws Exceptio MetaStorageManager metaStorageManager = (MetaStorageManager) IgniteTestUtils .getFieldValue(node0, IgniteImpl.class, "metaStorageMgr"); - TableManager tableManager = (TableManager) IgniteTestUtils.getFieldValue(node0, IgniteImpl.class, "distributedTblMgr"); - - TableViewInternal table = (TableViewInternal) tableManager.table(TABLE_NAME); - - TablePartitionId partId = new TablePartitionId(table.tableId(), 0); + ZonePartitionId partId = new ZonePartitionId(getZoneId(node0), 0); assertValueInStorage( metaStorageManager, @@ -254,11 +246,7 @@ void testEmptyDataNodesDoNotPropagatedToStableAfterAlteringFilter() throws Excep MetaStorageManager metaStorageManager = (MetaStorageManager) IgniteTestUtils .getFieldValue(node0, IgniteImpl.class, "metaStorageMgr"); - TableManager tableManager = (TableManager) IgniteTestUtils.getFieldValue(node0, IgniteImpl.class, "distributedTblMgr"); - - TableViewInternal table = (TableViewInternal) tableManager.table(TABLE_NAME); - - TablePartitionId partId = new TablePartitionId(table.tableId(), 0); + ZonePartitionId partId = new ZonePartitionId(getZoneId(node0), 0); 
assertValueInStorage( metaStorageManager, @@ -332,7 +320,7 @@ void testFilteredEmptyDataNodesDoNotTriggerRebalance() throws Exception { TableViewInternal table = (TableViewInternal) tableManager.table(TABLE_NAME); - TablePartitionId partId = new TablePartitionId(table.tableId(), 0); + ZonePartitionId partId = new ZonePartitionId(zoneId, 0); // Table was created after both nodes was up, so there wasn't any rebalance. assertPendingAssignmentsWereNeverExist(metaStorageManager, partId); @@ -372,11 +360,7 @@ void testFilteredEmptyDataNodesDoNotTriggerRebalanceOnReplicaUpdate() throws Exc node0.sql().execute(null, createTableSql()); - TableManager tableManager = (TableManager) IgniteTestUtils.getFieldValue(node0, IgniteImpl.class, "distributedTblMgr"); - - TableViewInternal table = (TableViewInternal) tableManager.table(TABLE_NAME); - - TablePartitionId partId = new TablePartitionId(table.tableId(), 0); + ZonePartitionId partId = new ZonePartitionId(zoneId, 0); // Table was created after both nodes was up, so there wasn't any rebalance. 
assertPendingAssignmentsWereNeverExist(metaStorageManager, partId); @@ -443,7 +427,7 @@ private static void waitDataNodeAndListenersAreHandled( private static void assertPendingAssignmentsWereNeverExist( MetaStorageManager metaStorageManager, - TablePartitionId partId + ZonePartitionId partId ) throws InterruptedException, ExecutionException { assertTrue(metaStorageManager.get(pendingPartAssignmentsKey(partId)).get().empty()); } diff --git a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngine.java b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngine.java index 39448ae24f5..29edaf223c4 100644 --- a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngine.java +++ b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngine.java @@ -20,23 +20,21 @@ import static java.util.concurrent.CompletableFuture.allOf; import static java.util.concurrent.CompletableFuture.completedFuture; import static org.apache.ignite.internal.catalog.events.CatalogEvent.ZONE_ALTER; -import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.DISTRIBUTION_ZONE_DATA_NODES_VALUE_PREFIX; import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.filterDataNodes; import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.findTablesByZoneId; import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.parseDataNodes; import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.zoneDataNodesKey; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceRaftGroupEventsListener.doStableKeySwitch; +import static 
org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.TABLES_COUNTER_PREFIX; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractPartitionNumber; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractZoneId; +import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractZoneIdDataNodes; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.raftConfigurationAppliedKey; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.tablesCounterPrefixKey; import static org.apache.ignite.internal.util.ByteUtils.fromBytes; import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture; -import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; @@ -56,14 +54,13 @@ import org.apache.ignite.internal.distributionzones.DistributionZoneManager; import org.apache.ignite.internal.distributionzones.Node; import org.apache.ignite.internal.distributionzones.utils.CatalogAlterZoneEventListener; -import org.apache.ignite.internal.lang.ByteArray; import org.apache.ignite.internal.logger.IgniteLogger; import org.apache.ignite.internal.logger.Loggers; import org.apache.ignite.internal.metastorage.Entry; import org.apache.ignite.internal.metastorage.MetaStorageManager; import org.apache.ignite.internal.metastorage.WatchEvent; import org.apache.ignite.internal.metastorage.WatchListener; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.util.ExceptionUtils; import org.apache.ignite.internal.util.IgniteSpinBusyLock; import org.apache.ignite.internal.util.IgniteUtils; @@ -202,7 +199,7 @@ public CompletableFuture onUpdate(WatchEvent evt) { 
return nullCompletedFuture(); } - int zoneId = extractZoneId(evt.entryEvent().newEntry().key(), DISTRIBUTION_ZONE_DATA_NODES_VALUE_PREFIX); + int zoneId = extractZoneIdDataNodes(evt.entryEvent().newEntry().key()); // It is safe to get the latest version of the catalog as we are in the metastore thread. int catalogVersion = catalogService.latestCatalogVersion(); @@ -224,13 +221,10 @@ public CompletableFuture onUpdate(WatchEvent evt) { return nullCompletedFuture(); } - List tableDescriptors = findTablesByZoneId(zoneId, catalogVersion, catalogService); - - return triggerPartitionsRebalanceForAllTables( + return triggerPartitionsRebalanceForZone( evt.entryEvent().newEntry().revision(), zoneDescriptor, - filteredDataNodes, - tableDescriptors + filteredDataNodes ); }); } @@ -263,7 +257,7 @@ public CompletableFuture onUpdate(WatchEvent event) { return nullCompletedFuture(); } - int zoneId = RebalanceUtil.extractZoneIdFromTablesCounter(event.entryEvent().newEntry().key()); + int zoneId = extractZoneId(event.entryEvent().newEntry().key(), TABLES_COUNTER_PREFIX); // TODO: https://issues.apache.org/jira/browse/IGNITE-21254 tables here must be the same as they were on rebalance start List tables = findTablesByZoneId(zoneId, catalogService.latestCatalogVersion(), catalogService); @@ -279,27 +273,20 @@ public CompletableFuture onUpdate(WatchEvent event) { ); try { - Map partitionTablesKeys = new HashMap<>(); - int partId = extractPartitionNumber(event.entryEvent().newEntry().key()); - for (CatalogTableDescriptor table : tables) { - TablePartitionId replicaGrpId = new TablePartitionId(table.id(), partId); - partitionTablesKeys.put(raftConfigurationAppliedKey(replicaGrpId), replicaGrpId); - } + ZonePartitionId replicaGrpId = new ZonePartitionId(zoneId, partId); - Map entriesMap = metaStorageManager.getAll(partitionTablesKeys.keySet()).get(); + Entry assignmentEntry = metaStorageManager.get(raftConfigurationAppliedKey(replicaGrpId)).get(); - entriesMap.forEach((key, stable) -> { - 
doStableKeySwitch( - Assignments.fromBytes(stable.value()).nodes(), - partitionTablesKeys.get(key), - event.revision(), - metaStorageManager, - catalogService, - distributionZoneManager - ); - }); + tables.forEach(tbl -> doStableKeySwitch( + Assignments.fromBytes(assignmentEntry.value()).nodes(), + replicaGrpId, + event.revision(), + metaStorageManager, + catalogService, + distributionZoneManager + )); } catch (Exception e) { LOG.error( @@ -332,24 +319,24 @@ private CompletableFuture onUpdateReplicas(AlterZoneEventParameters parame } static CompletableFuture> calculateAssignments( - TablePartitionId tablePartitionId, + ZonePartitionId zonePartitionId, CatalogService catalogService, DistributionZoneManager distributionZoneManager ) { int catalogVersion = catalogService.latestCatalogVersion(); - CatalogTableDescriptor tableDescriptor = catalogService.table(tablePartitionId.tableId(), catalogVersion); + CatalogZoneDescriptor zoneDescriptor = catalogService.zone(zonePartitionId.zoneId(), catalogVersion); - CatalogZoneDescriptor zoneDescriptor = catalogService.zone(tableDescriptor.zoneId(), catalogVersion); + int zoneId = zonePartitionId.zoneId(); return distributionZoneManager.dataNodes( zoneDescriptor.updateToken(), catalogVersion, - tableDescriptor.zoneId() + zoneId ).thenApply(dataNodes -> AffinityUtils.calculateAssignmentForPartition( dataNodes, - tablePartitionId.partitionId(), + zonePartitionId.partitionId(), zoneDescriptor.replicas() ) ); @@ -375,72 +362,61 @@ private CompletableFuture recalculateAssignmentsAndScheduleRebalance( return nullCompletedFuture(); } - List tableDescriptors = findTablesByZoneId(zoneDescriptor.id(), catalogVersion, catalogService); - - return triggerPartitionsRebalanceForAllTables( + return triggerPartitionsRebalanceForZone( causalityToken, zoneDescriptor, - dataNodes, - tableDescriptors + dataNodes ); }); } - private CompletableFuture triggerPartitionsRebalanceForAllTables( + private CompletableFuture triggerPartitionsRebalanceForZone( 
long revision, CatalogZoneDescriptor zoneDescriptor, - Set dataNodes, - List tableDescriptors + Set dataNodes ) { - List> tableFutures = new ArrayList<>(tableDescriptors.size()); - - for (CatalogTableDescriptor tableDescriptor : tableDescriptors) { - CompletableFuture[] partitionFutures = RebalanceUtil.triggerAllTablePartitionsRebalance( - tableDescriptor, - zoneDescriptor, - dataNodes, - revision, - metaStorageManager - ); - - // This set is used to deduplicate exceptions (if there is an exception from upstream, for instance, - // when reading from MetaStorage, it will be encountered by every partition future) to avoid noise - // in the logs. - Set unwrappedCauses = ConcurrentHashMap.newKeySet(); - - for (int partId = 0; partId < partitionFutures.length; partId++) { - int finalPartId = partId; - - partitionFutures[partId].exceptionally(e -> { - Throwable cause = ExceptionUtils.unwrapCause(e); - - if (unwrappedCauses.add(cause)) { - // The exception is specific to this partition. - LOG.error( - "Exception on updating assignments for [table={}, partition={}]", - e, - tableInfo(tableDescriptor), finalPartId - ); - } else { - // The exception is from upstream and not specific for this partition, so don't log the partition index. - LOG.error( - "Exception on updating assignments for [table={}]", - e, - tableInfo(tableDescriptor) - ); - } + CompletableFuture[] partitionFutures = RebalanceUtil.triggerZonePartitionsRebalance( + zoneDescriptor, + dataNodes, + revision, + metaStorageManager + ); - return null; - }); - } + // This set is used to deduplicate exceptions (if there is an exception from upstream, for instance, + // when reading from MetaStorage, it will be encountered by every partition future) to avoid noise + // in the logs. 
+ Set unwrappedCauses = ConcurrentHashMap.newKeySet(); + + for (int partId = 0; partId < partitionFutures.length; partId++) { + int finalPartId = partId; + + partitionFutures[partId].exceptionally(e -> { + Throwable cause = ExceptionUtils.unwrapCause(e); + + if (unwrappedCauses.add(cause)) { + // The exception is specific to this partition. + LOG.error( + "Exception on updating assignments for [zone={}, partition={}]", + e, + zoneInfo(zoneDescriptor), finalPartId + ); + } else { + // The exception is from upstream and not specific for this partition, so don't log the partition index. + LOG.error( + "Exception on updating assignments for [zone={}]", + e, + zoneInfo(zoneDescriptor) + ); + } - tableFutures.add(allOf(partitionFutures)); + return null; + }); } - return allOf(tableFutures.toArray(CompletableFuture[]::new)); + return allOf(partitionFutures); } - private static String tableInfo(CatalogTableDescriptor tableDescriptor) { - return tableDescriptor.id() + "/" + tableDescriptor.name(); + private static String zoneInfo(CatalogZoneDescriptor zoneDescriptor) { + return zoneDescriptor.id() + "/" + zoneDescriptor.name(); } } diff --git a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceRaftGroupEventsListener.java b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceRaftGroupEventsListener.java index dcab08ef394..aa8037195e8 100644 --- a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceRaftGroupEventsListener.java +++ b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceRaftGroupEventsListener.java @@ -67,13 +67,13 @@ import org.apache.ignite.internal.raft.RaftError; import org.apache.ignite.internal.raft.RaftGroupEventsListener; import org.apache.ignite.internal.raft.Status; -import org.apache.ignite.internal.replicator.TablePartitionId; +import 
org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.util.ByteUtils; import org.apache.ignite.internal.util.IgniteSpinBusyLock; /** - * Listener for the raft group events, which must provide correct error handling of rebalance process - * and start new rebalance after the current one finished. + * Listener for the raft group events, which must provide correct error handling of rebalance process and start new rebalance after the + * current one finished. */ public class RebalanceRaftGroupEventsListener implements RaftGroupEventsListener { /** Ignite logger. */ @@ -122,7 +122,9 @@ public class RebalanceRaftGroupEventsListener implements RaftGroupEventsListener private final MetaStorageManager metaStorageMgr; /** Unique table partition id. */ - private final TablePartitionId tablePartitionId; + private final ZonePartitionId zonePartitionId; + + private final int tableId; /** Busy lock of parent component for synchronous stop. */ private final IgniteSpinBusyLock busyLock; @@ -130,38 +132,35 @@ public class RebalanceRaftGroupEventsListener implements RaftGroupEventsListener /** Executor for scheduling rebalance retries. */ private final ScheduledExecutorService rebalanceScheduler; - /** Zone id. */ - private final int zoneId; - /** Performs reconfiguration of a Raft group of a partition. */ private final PartitionMover partitionMover; /** Attempts to retry the current rebalance in case of errors. */ - private final AtomicInteger rebalanceAttempts = new AtomicInteger(0); + private final AtomicInteger rebalanceAttempts = new AtomicInteger(0); /** * Constructs new listener. * * @param metaStorageMgr Meta storage manager. - * @param tablePartitionId Partition id. + * @param zonePartitionId Partition id. * @param busyLock Busy lock. * @param partitionMover Class that moves partition between nodes. * @param rebalanceScheduler Executor for scheduling rebalance retries. 
*/ public RebalanceRaftGroupEventsListener( MetaStorageManager metaStorageMgr, - TablePartitionId tablePartitionId, + ZonePartitionId zonePartitionId, IgniteSpinBusyLock busyLock, PartitionMover partitionMover, ScheduledExecutorService rebalanceScheduler, - int zoneId + int tableId ) { this.metaStorageMgr = metaStorageMgr; - this.tablePartitionId = tablePartitionId; + this.zonePartitionId = zonePartitionId; this.busyLock = busyLock; this.partitionMover = partitionMover; this.rebalanceScheduler = rebalanceScheduler; - this.zoneId = zoneId; + this.tableId = tableId; } /** {@inheritDoc} */ @@ -180,7 +179,7 @@ public void onLeaderElected(long term) { try { rebalanceAttempts.set(0); - byte[] pendingAssignmentsBytes = metaStorageMgr.get(pendingPartAssignmentsKey(tablePartitionId)).get().value(); + byte[] pendingAssignmentsBytes = metaStorageMgr.get(pendingPartAssignmentsKey(zonePartitionId)).get().value(); if (pendingAssignmentsBytes != null) { Set pendingAssignments = Assignments.fromBytes(pendingAssignmentsBytes).nodes(); @@ -198,7 +197,7 @@ public void onLeaderElected(long term) { LOG.info( "New leader elected. 
Going to apply new configuration [tablePartitionId={}, peers={}, learners={}]", - tablePartitionId, peers, learners + zonePartitionId, peers, learners ); PeersAndLearners peersAndLearners = PeersAndLearners.fromConsistentIds(peers, learners); @@ -207,7 +206,7 @@ public void onLeaderElected(long term) { } } catch (Exception e) { // TODO: IGNITE-14693 - LOG.warn("Unable to start rebalance [tablePartitionId, term={}]", e, tablePartitionId, term); + LOG.warn("Unable to start rebalance [tablePartitionId, term={}]", e, zonePartitionId, term); } finally { busyLock.leaveBusy(); } @@ -251,9 +250,9 @@ public void onNewPeersConfigurationApplied(PeersAndLearners configuration) { */ private void countDownPartitionsFromZone(Set stable) { try { - int partId = tablePartitionId.partitionId(); + int partId = zonePartitionId.partitionId(); - Entry counterEntry = metaStorageMgr.get(tablesCounterKey(zoneId, partId)).get(); + Entry counterEntry = metaStorageMgr.get(tablesCounterKey(zonePartitionId)).get(); assert counterEntry.value() != null; @@ -261,32 +260,33 @@ private void countDownPartitionsFromZone(Set stable) { assert !counter.isEmpty(); - if (!counter.contains(tablePartitionId.tableId())) { + if (!counter.contains(tableId)) { // Count down for this table has already been processed, just skip. // For example, this can happen when leader re-election happened during the rebalance process. 
return; } - Condition condition = value(tablesCounterKey(zoneId, partId)).eq(counterEntry.value()); + Condition condition = value(tablesCounterKey(zonePartitionId)).eq(counterEntry.value()); byte[] stableArray = Assignments.toBytes(stable); - counter.remove(tablePartitionId.tableId()); + counter.remove(tableId); if (counter.isEmpty()) { counter = Set.of(); } Update successCase = ops( - put(tablesCounterKey(zoneId, partId), toBytes(counter)), + put(tablesCounterKey(zonePartitionId), toBytes(counter)), // Todo: change to one key https://issues.apache.org/jira/browse/IGNITE-18991 - put(raftConfigurationAppliedKey(tablePartitionId), stableArray) + put(raftConfigurationAppliedKey(zonePartitionId), stableArray) ).yield(TABLES_COUNTER_DECREMENT_SUCCESS); Update failCase = ops().yield(PART_COUNTER_DECREMENT_FAIL); int res = metaStorageMgr.invoke(iif(condition, successCase, failCase)).get().getAsInt(); + int zoneId = zonePartitionId.zoneId(); if (res < 0) { LOG.info("Count down of zone's tables counter is failed. " + "Going to retry [zoneId={}, appliedPeers={}]", @@ -309,7 +309,7 @@ private void countDownPartitionsFromZone(Set stable) { rebalanceAttempts.set(0); } catch (InterruptedException | ExecutionException e) { // TODO: IGNITE-14693 - LOG.warn("Unable to count down partitions counter in metastore: " + tablePartitionId, e); + LOG.warn("Unable to count down partitions counter in metastore: " + zonePartitionId, e); } } @@ -325,7 +325,7 @@ public void onReconfigurationError(Status status, PeersAndLearners configuration if (status.equals(Status.LEADER_STEPPED_DOWN)) { // Leader stepped down, so we are expecting RebalanceRaftGroupEventsListener.onLeaderElected to be called on a new leader. 
- LOG.info("Leader stepped down during rebalance [partId={}]", tablePartitionId); + LOG.info("Leader stepped down during rebalance [partId={}]", zonePartitionId); return; } @@ -335,12 +335,12 @@ public void onReconfigurationError(Status status, PeersAndLearners configuration assert raftError == RaftError.ECATCHUP : "According to the JRaft protocol, " + RaftError.ECATCHUP + " is expected, got " + raftError; - LOG.debug("Error occurred during rebalance [partId={}]", tablePartitionId); + LOG.debug("Error occurred during rebalance [partId={}]", zonePartitionId); if (rebalanceAttempts.incrementAndGet() < REBALANCE_RETRY_THRESHOLD) { scheduleChangePeers(configuration, term); } else { - LOG.info("Number of retries for rebalance exceeded the threshold [partId={}, threshold={}]", tablePartitionId, + LOG.info("Number of retries for rebalance exceeded the threshold [partId={}, threshold={}]", zonePartitionId, REBALANCE_RETRY_THRESHOLD); // TODO: currently we just retry intent to change peers according to the rebalance infinitely, until new leader is elected, @@ -364,7 +364,7 @@ private void scheduleChangePeers(PeersAndLearners peersAndLearners, long term) { return; } - LOG.info("Going to retry rebalance [attemptNo={}, partId={}]", rebalanceAttempts.get(), tablePartitionId); + LOG.info("Going to retry rebalance [attemptNo={}, partId={}]", rebalanceAttempts.get(), zonePartitionId); try { partitionMover.movePartition(peersAndLearners, term).join(); @@ -379,19 +379,19 @@ private void scheduleChangePeers(PeersAndLearners peersAndLearners, long term) { */ static void doStableKeySwitch( Set stableFromRaft, - TablePartitionId tablePartitionId, + ZonePartitionId zonePartitionId, long revision, MetaStorageManager metaStorageMgr, CatalogService catalogService, DistributionZoneManager distributionZoneManager ) { try { - ByteArray pendingPartAssignmentsKey = pendingPartAssignmentsKey(tablePartitionId); - ByteArray stablePartAssignmentsKey = stablePartAssignmentsKey(tablePartitionId); - 
ByteArray plannedPartAssignmentsKey = plannedPartAssignmentsKey(tablePartitionId); - ByteArray switchReduceKey = switchReduceKey(tablePartitionId); - ByteArray switchAppendKey = switchAppendKey(tablePartitionId); - ByteArray stableChangeTriggerKey = stableChangeTriggerKey(tablePartitionId); + ByteArray pendingPartAssignmentsKey = pendingPartAssignmentsKey(zonePartitionId); + ByteArray stablePartAssignmentsKey = stablePartAssignmentsKey(zonePartitionId); + ByteArray plannedPartAssignmentsKey = plannedPartAssignmentsKey(zonePartitionId); + ByteArray switchReduceKey = switchReduceKey(zonePartitionId); + ByteArray switchAppendKey = switchAppendKey(zonePartitionId); + ByteArray stableChangeTriggerKey = stableChangeTriggerKey(zonePartitionId); // TODO: https://issues.apache.org/jira/browse/IGNITE-17592 Remove synchronous wait Map values = metaStorageMgr.getAll( @@ -405,7 +405,7 @@ static void doStableKeySwitch( ) ).get(); - Set calculatedAssignments = calculateAssignments(tablePartitionId, catalogService, distributionZoneManager).get(); + Set calculatedAssignments = calculateAssignments(zonePartitionId, catalogService, distributionZoneManager).get(); Entry stableEntry = values.get(stablePartAssignmentsKey); Entry pendingEntry = values.get(pendingPartAssignmentsKey); @@ -518,8 +518,8 @@ static void doStableKeySwitch( // TODO: https://issues.apache.org/jira/browse/IGNITE-17592 Remove synchronous wait int res = metaStorageMgr.invoke( iif(or( - notExists(stableChangeTriggerKey(tablePartitionId)), - value(stableChangeTriggerKey(tablePartitionId)).lt(ByteUtils.longToBytes(revision)) + notExists(stableChangeTriggerKey(zonePartitionId)), + value(stableChangeTriggerKey(zonePartitionId)).lt(ByteUtils.longToBytes(revision)) ), iif(retryPreconditions, successCase, failCase), ops().yield(OUTDATED_INVOKE_STATUS) @@ -531,20 +531,20 @@ static void doStableKeySwitch( case SWITCH_APPEND_FAIL: LOG.info("Rebalance keys changed while trying to update rebalance pending addition information. 
" + "Going to retry [tablePartitionID={}, appliedPeers={}]", - tablePartitionId, stableFromRaft + zonePartitionId, stableFromRaft ); break; case SWITCH_REDUCE_FAIL: LOG.info("Rebalance keys changed while trying to update rebalance pending reduce information. " + "Going to retry [tablePartitionID={}, appliedPeers={}]", - tablePartitionId, stableFromRaft + zonePartitionId, stableFromRaft ); break; case SCHEDULE_PENDING_REBALANCE_FAIL: case FINISH_REBALANCE_FAIL: LOG.info("Rebalance keys changed while trying to update rebalance information. " + "Going to retry [tablePartitionId={}, appliedPeers={}]", - tablePartitionId, stableFromRaft + zonePartitionId, stableFromRaft ); break; default: @@ -554,7 +554,7 @@ static void doStableKeySwitch( doStableKeySwitch( stableFromRaft, - tablePartitionId, + zonePartitionId, revision, metaStorageMgr, catalogService, @@ -568,29 +568,29 @@ static void doStableKeySwitch( case SWITCH_APPEND_SUCCESS: LOG.info("Rebalance finished. Going to schedule next rebalance with addition" + " [tablePartitionId={}, appliedPeers={}, plannedPeers={}]", - tablePartitionId, stableFromRaft, calculatedPendingAddition + zonePartitionId, stableFromRaft, calculatedPendingAddition ); break; case SWITCH_REDUCE_SUCCESS: LOG.info("Rebalance finished. Going to schedule next rebalance with reduction" + " [tablePartitionId={}, appliedPeers={}, plannedPeers={}]", - tablePartitionId, stableFromRaft, calculatedPendingReduction + zonePartitionId, stableFromRaft, calculatedPendingReduction ); break; case SCHEDULE_PENDING_REBALANCE_SUCCESS: LOG.info( "Rebalance finished. 
Going to schedule next rebalance [tablePartitionId={}, appliedPeers={}, plannedPeers={}]", - tablePartitionId, stableFromRaft, Assignments.fromBytes(plannedEntry.value()).nodes() + zonePartitionId, stableFromRaft, Assignments.fromBytes(plannedEntry.value()).nodes() ); break; case FINISH_REBALANCE_SUCCESS: - LOG.info("Rebalance finished [tablePartitionId={}, appliedPeers={}]", tablePartitionId, stableFromRaft); + LOG.info("Rebalance finished [tablePartitionId={}, appliedPeers={}]", zonePartitionId, stableFromRaft); break; case OUTDATED_INVOKE_STATUS: LOG.debug("Stable switch skipped because event is outdated " + "[tablePartitionId={}, stableChangeTriggerKey={}, revision={}]", - tablePartitionId, stableChangeTriggerValue, revision + zonePartitionId, stableChangeTriggerValue, revision ); break; @@ -601,7 +601,7 @@ static void doStableKeySwitch( } catch (InterruptedException | ExecutionException e) { // TODO: IGNITE-14693 - LOG.warn("Unable to commit partition configuration to metastore: " + tablePartitionId, e); + LOG.warn("Unable to commit partition configuration to metastore: " + zonePartitionId, e); } } diff --git a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtil.java b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtil.java index 6fdec030e47..49bd1bbb6f9 100644 --- a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtil.java +++ b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtil.java @@ -19,6 +19,7 @@ import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toSet; +import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.DISTRIBUTION_ZONE_DATA_NODES_VALUE_PREFIX; import static 
org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.UpdateStatus.ASSIGNMENT_NOT_UPDATED; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.UpdateStatus.OUTDATED_UPDATE_RECEIVED; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.UpdateStatus.PENDING_KEY_UPDATED; @@ -58,7 +59,7 @@ import org.apache.ignite.internal.metastorage.dsl.Condition; import org.apache.ignite.internal.metastorage.dsl.Iif; import org.apache.ignite.internal.metastorage.dsl.StatementResult; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.util.ByteUtils; import org.apache.ignite.internal.util.CollectionUtils; import org.jetbrains.annotations.Nullable; @@ -73,8 +74,9 @@ public class RebalanceUtil { /** * Status values for methods like - * {@link #updatePendingAssignmentsKeys(CatalogTableDescriptor, TablePartitionId, Collection, int, long, MetaStorageManager, int, Set)} - * or {@link #manualPartitionUpdate(TablePartitionId, Collection, Set, int, long, MetaStorageManager, Set)}. + * {@link #updatePendingAssignmentsKeys(CatalogZoneDescriptor, ZonePartitionId, Collection, int, long, MetaStorageManager, int, Set)} + * (CatalogTableDescriptor, ZonePartitionId, Collection, int, long, MetaStorageManager, int, Set)} or + * {@link #manualPartitionUpdate(ZonePartitionId, Collection, Set, int, long, MetaStorageManager, Set)}. */ public enum UpdateStatus { /** @@ -126,8 +128,8 @@ public static UpdateStatus valueOf(int ordinal) { /** * Update keys that related to rebalance algorithm in Meta Storage. Keys are specific for partition. * - * @param tableDescriptor Table descriptor. - * @param partId Unique identifier of a partition. + * @param zoneDescriptor Zone descriptor. + * @param zonePartitionId Unique aggregate identifier of a partition of a zone. * @param dataNodes Data nodes. 
* @param replicas Number of replicas for a table. * @param revision Revision of Meta Storage that is specific for the assignment update. @@ -137,8 +139,8 @@ public static UpdateStatus valueOf(int ordinal) { * @return Future representing result of updating keys in {@code metaStorageMgr} */ public static CompletableFuture updatePendingAssignmentsKeys( - CatalogTableDescriptor tableDescriptor, - TablePartitionId partId, + CatalogZoneDescriptor zoneDescriptor, + ZonePartitionId zonePartitionId, Collection dataNodes, int replicas, long revision, @@ -146,13 +148,13 @@ public static CompletableFuture updatePendingAssignmentsKeys( int partNum, Set tableCfgPartAssignments ) { - ByteArray partChangeTriggerKey = pendingChangeTriggerKey(partId); + ByteArray partChangeTriggerKey = pendingChangeTriggerKey(zonePartitionId); - ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(partId); + ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(zonePartitionId); - ByteArray partAssignmentsPlannedKey = plannedPartAssignmentsKey(partId); + ByteArray partAssignmentsPlannedKey = plannedPartAssignmentsKey(zonePartitionId); - ByteArray partAssignmentsStableKey = stablePartAssignmentsKey(partId); + ByteArray partAssignmentsStableKey = stablePartAssignmentsKey(zonePartitionId); Set partAssignments = AffinityUtils.calculateAssignmentForPartition(dataNodes, partNum, replicas); @@ -211,14 +213,14 @@ public static CompletableFuture updatePendingAssignmentsKeys( case PENDING_KEY_UPDATED: LOG.info( "Update metastore pending partitions key [key={}, partition={}, table={}/{}, newVal={}]", - partAssignmentsPendingKey.toString(), partNum, tableDescriptor.id(), tableDescriptor.name(), + partAssignmentsPendingKey.toString(), partNum, zoneDescriptor.id(), zoneDescriptor.name(), partAssignments); break; case PLANNED_KEY_UPDATED: LOG.info( "Update metastore planned partitions key [key={}, partition={}, table={}/{}, newVal={}]", - partAssignmentsPlannedKey, partNum, 
tableDescriptor.id(), tableDescriptor.name(), + partAssignmentsPlannedKey, partNum, zoneDescriptor.id(), zoneDescriptor.name(), partAssignments ); @@ -226,7 +228,7 @@ public static CompletableFuture updatePendingAssignmentsKeys( case PLANNED_KEY_REMOVED_EQUALS_PENDING: LOG.info( "Remove planned key because current pending key has the same value [key={}, partition={}, table={}/{}, val={}]", - partAssignmentsPlannedKey.toString(), partNum, tableDescriptor.id(), tableDescriptor.name(), + partAssignmentsPlannedKey.toString(), partNum, zoneDescriptor.id(), zoneDescriptor.name(), partAssignments ); @@ -235,7 +237,7 @@ public static CompletableFuture updatePendingAssignmentsKeys( LOG.info( "Remove planned key because pending is empty and calculated assignments are equal to current assignments " + "[key={}, partition={}, table={}/{}, val={}]", - partAssignmentsPlannedKey.toString(), partNum, tableDescriptor.id(), tableDescriptor.name(), + partAssignmentsPlannedKey.toString(), partNum, zoneDescriptor.id(), zoneDescriptor.name(), partAssignments ); @@ -243,7 +245,7 @@ public static CompletableFuture updatePendingAssignmentsKeys( case ASSIGNMENT_NOT_UPDATED: LOG.debug( "Assignments are not updated [key={}, partition={}, table={}/{}, val={}]", - partAssignmentsPlannedKey.toString(), partNum, tableDescriptor.id(), tableDescriptor.name(), + partAssignmentsPlannedKey.toString(), partNum, zoneDescriptor.id(), zoneDescriptor.name(), partAssignments ); @@ -251,7 +253,7 @@ public static CompletableFuture updatePendingAssignmentsKeys( case OUTDATED_UPDATE_RECEIVED: LOG.debug( "Received outdated rebalance trigger event [revision={}, partition={}, table={}/{}]", - revision, partNum, tableDescriptor.id(), tableDescriptor.name()); + revision, partNum, zoneDescriptor.id(), zoneDescriptor.name()); break; default: @@ -266,7 +268,6 @@ public static CompletableFuture updatePendingAssignmentsKeys( * provided data nodes, and, if the calculated assignments are different from the ones loaded from 
the * MetaStorages, writes them as pending assignments. * - * @param tableDescriptor Table descriptor. * @param zoneDescriptor Zone descriptor. * @param dataNodes Data nodes to use. * @param storageRevision MetaStorage revision corresponding to this request. @@ -274,16 +275,15 @@ public static CompletableFuture updatePendingAssignmentsKeys( * @return Array of futures, one per partition of the table; the futures complete when the described * rebalance triggering completes. */ - public static CompletableFuture[] triggerAllTablePartitionsRebalance( - CatalogTableDescriptor tableDescriptor, + public static CompletableFuture[] triggerZonePartitionsRebalance( CatalogZoneDescriptor zoneDescriptor, Set dataNodes, long storageRevision, MetaStorageManager metaStorageManager ) { - CompletableFuture> tableAssignmentsFut = tableAssignments( + CompletableFuture> zoneAssignmentsFut = zoneAssignments( metaStorageManager, - tableDescriptor.id(), + zoneDescriptor.id(), Set.of(), zoneDescriptor.partitions() ); @@ -291,22 +291,22 @@ public static CompletableFuture[] triggerAllTablePartitionsRebalance( CompletableFuture[] futures = new CompletableFuture[zoneDescriptor.partitions()]; for (int partId = 0; partId < zoneDescriptor.partitions(); partId++) { - TablePartitionId replicaGrpId = new TablePartitionId(tableDescriptor.id(), partId); + ZonePartitionId replicaGrpId = new ZonePartitionId(zoneDescriptor.id(), partId); int finalPartId = partId; - futures[partId] = tableAssignmentsFut.thenCompose(tableAssignments -> + futures[partId] = zoneAssignmentsFut.thenCompose(zoneAssignments -> // TODO https://issues.apache.org/jira/browse/IGNITE-19763 We should distinguish empty stable assignments on // TODO node recovery in case of interrupted table creation, and moving from empty assignments to non-empty. - tableAssignments.isEmpty() ? nullCompletedFuture() : updatePendingAssignmentsKeys( - tableDescriptor, + zoneAssignments.isEmpty() ? 
nullCompletedFuture() : updatePendingAssignmentsKeys( + zoneDescriptor, replicaGrpId, dataNodes, zoneDescriptor.replicas(), storageRevision, metaStorageManager, finalPartId, - tableAssignments.get(finalPartId).nodes() + zoneAssignments.get(finalPartId).nodes() )); } @@ -337,9 +337,9 @@ public static CompletableFuture[] forceAssignmentsUpdate( long revision, MetaStorageManager metaStorageManager ) { - CompletableFuture> tableAssignmentsFut = tableAssignments( + CompletableFuture> tableAssignmentsFut = zoneAssignments( metaStorageManager, - tableDescriptor.id(), + tableDescriptor.zoneId(), partitionIds, zoneDescriptor.partitions() ); @@ -353,7 +353,7 @@ public static CompletableFuture[] forceAssignmentsUpdate( CompletableFuture[] futures = new CompletableFuture[ids.length]; for (int i = 0; i < ids.length; i++) { - TablePartitionId replicaGrpId = new TablePartitionId(tableDescriptor.id(), ids[i]); + ZonePartitionId replicaGrpId = new ZonePartitionId(zoneDescriptor.id(), ids[i]); futures[i] = tableAssignmentsFut.thenCompose(tableAssignments -> tableAssignments.isEmpty() ? 
nullCompletedFuture() : manualPartitionUpdate( @@ -374,7 +374,7 @@ public static CompletableFuture[] forceAssignmentsUpdate( } private static CompletableFuture manualPartitionUpdate( - TablePartitionId partId, + ZonePartitionId zonePartitionId, Collection aliveDataNodes, Set aliveNodesConsistentIds, int replicas, @@ -395,7 +395,10 @@ private static CompletableFuture manualPartitionUpdate( return CompletableFuture.completedFuture(ASSIGNMENT_NOT_UPDATED.ordinal()); } - Set calcAssignments = AffinityUtils.calculateAssignmentForPartition(aliveDataNodes, partId.partitionId(), replicas); + Set calcAssignments = AffinityUtils.calculateAssignmentForPartition( + aliveDataNodes, + zonePartitionId.partitionId(), + replicas); for (Assignment calcAssignment : calcAssignments) { if (partAssignments.size() == replicas) { @@ -412,9 +415,9 @@ private static CompletableFuture manualPartitionUpdate( byte[] partAssignmentsBytes = Assignments.forced(partAssignments).toBytes(); byte[] revisionBytes = ByteUtils.longToBytes(revision); - ByteArray partChangeTriggerKey = pendingChangeTriggerKey(partId); - ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(partId); - ByteArray partAssignmentsPlannedKey = plannedPartAssignmentsKey(partId); + ByteArray partChangeTriggerKey = pendingChangeTriggerKey(zonePartitionId); + ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(zonePartitionId); + ByteArray partAssignmentsPlannedKey = plannedPartAssignmentsKey(zonePartitionId); Iif iif = iif( notExists(partChangeTriggerKey).or(value(partChangeTriggerKey).lt(revisionBytes)), @@ -446,7 +449,7 @@ private static CompletableFuture manualPartitionUpdate( public static final String ASSIGNMENTS_SWITCH_APPEND_PREFIX = "assignments.switch.append."; /** Key prefix for counter of rebalances of tables from a zone that are associated with the specified partition. 
*/ - private static final String TABLES_COUNTER_PREFIX = "tables.counter."; + public static final String TABLES_COUNTER_PREFIX = "tables.counter."; /** Key prefix for a raft configuration that was applied during rebalance of the specified partition form a table. */ private static final String RAFT_CONF_APPLIED_PREFIX = "assignments.raft.conf.applied."; @@ -454,78 +457,78 @@ private static CompletableFuture manualPartitionUpdate( /** * Key that is needed for skipping stale events of pending key change. * - * @param partId Unique identifier of a partition. + * @param zonePartitionId Unique aggregate identifier of a partition of a zone. * @return Key for a partition. * @see Rebalance documentation */ - public static ByteArray pendingChangeTriggerKey(TablePartitionId partId) { - return new ByteArray(partId + "pending.change.trigger"); + public static ByteArray pendingChangeTriggerKey(ZonePartitionId zonePartitionId) { + return new ByteArray(zonePartitionId + "pending.change.trigger"); } /** * Key that is needed for skipping stale events of stable key change. * - * @param partId Unique identifier of a partition. + * @param zonePartitionId Unique aggregate identifier of a partition of a zone. * @return Key for a partition. * @see Rebalance documentation */ - public static ByteArray stableChangeTriggerKey(TablePartitionId partId) { - return new ByteArray(partId + "stable.change.trigger"); + public static ByteArray stableChangeTriggerKey(ZonePartitionId zonePartitionId) { + return new ByteArray(zonePartitionId + "stable.change.trigger"); } /** * Key that is needed for the rebalance algorithm. * - * @param partId Unique identifier of a partition. + * @param zonePartitionId Unique aggregate identifier of a partition of a zone. * @return Key for a partition. 
* @see Rebalance documentation */ - public static ByteArray pendingPartAssignmentsKey(TablePartitionId partId) { - return new ByteArray(PENDING_ASSIGNMENTS_PREFIX + partId); + public static ByteArray pendingPartAssignmentsKey(ZonePartitionId zonePartitionId) { + return new ByteArray(PENDING_ASSIGNMENTS_PREFIX + zonePartitionId); } /** * Key that is needed for the rebalance algorithm. * - * @param partId Unique identifier of a partition. + * @param zonePartitionId Unique aggregate identifier of a partition of a zone. * @return Key for a partition. * @see Rebalance documentation */ - public static ByteArray plannedPartAssignmentsKey(TablePartitionId partId) { - return new ByteArray("assignments.planned." + partId); + public static ByteArray plannedPartAssignmentsKey(ZonePartitionId zonePartitionId) { + return new ByteArray("assignments.planned." + zonePartitionId); } /** * Key that is needed for the rebalance algorithm. * - * @param partId Unique identifier of a partition. + * @param zonePartitionId Unique aggregate identifier of a partition of a zone. * @return Key for a partition. * @see Rebalance documentation */ - public static ByteArray stablePartAssignmentsKey(TablePartitionId partId) { - return new ByteArray(STABLE_ASSIGNMENTS_PREFIX + partId); + public static ByteArray stablePartAssignmentsKey(ZonePartitionId zonePartitionId) { + return new ByteArray(STABLE_ASSIGNMENTS_PREFIX + zonePartitionId); } /** * Key that is needed for the rebalance algorithm. * - * @param partId Unique identifier of a partition. + * @param zonePartitionId Unique aggregate identifier of a partition of a zone. * @return Key for a partition. 
* @see Rebalance documentation */ - public static ByteArray switchReduceKey(TablePartitionId partId) { - return new ByteArray(ASSIGNMENTS_SWITCH_REDUCE_PREFIX + partId); + public static ByteArray switchReduceKey(ZonePartitionId zonePartitionId) { + return new ByteArray(ASSIGNMENTS_SWITCH_REDUCE_PREFIX + zonePartitionId); } /** * Key that is needed for the rebalance algorithm. * - * @param partId Unique identifier of a partition. + * @param zonePartitionId Unique aggregate identifier of a partition of a zone. * @return Key for a partition. * @see Rebalance documentation */ - public static ByteArray switchAppendKey(TablePartitionId partId) { - return new ByteArray(ASSIGNMENTS_SWITCH_APPEND_PREFIX + partId); + public static ByteArray switchAppendKey(ZonePartitionId zonePartitionId) { + return new ByteArray(ASSIGNMENTS_SWITCH_APPEND_PREFIX + zonePartitionId); } /** @@ -539,6 +542,16 @@ public static ByteArray tablesCounterKey(int zoneId, int partId) { return new ByteArray(TABLES_COUNTER_PREFIX + zoneId + "_part_" + partId); } + /** + * ByteArray key for a counter of rebalances of tables from a zone that are associated with the specified partition. + * + * @param zonePartitionId Unique aggregate identifier of a partition of a zone. + * @return Key for a partition. + */ + public static ByteArray tablesCounterKey(ZonePartitionId zonePartitionId) { + return tablesCounterKey(zonePartitionId.zoneId(), zonePartitionId.partitionId()); + } + /** * ByteArray prefix for counter of rebalances of tables from a zone that are associated with the specified partition. * @@ -554,7 +567,7 @@ public static ByteArray tablesCounterPrefixKey() { * @param partId Unique identifier of a partition. * @return Key for a applied raft configuration. 
*/ - public static ByteArray raftConfigurationAppliedKey(TablePartitionId partId) { + public static ByteArray raftConfigurationAppliedKey(ZonePartitionId partId) { return new ByteArray(RAFT_CONF_APPLIED_PREFIX + partId); } @@ -581,7 +594,7 @@ public static int extractTableId(byte[] key, String prefix) { public static int extractZoneId(byte[] key, String prefix) { String strKey = new String(key, StandardCharsets.UTF_8); - return Integer.parseInt(strKey.substring(prefix.length())); + return Integer.parseInt(strKey.substring(prefix.length(), strKey.indexOf("_part_"))); } /** @@ -590,10 +603,10 @@ public static int extractZoneId(byte[] key, String prefix) { * @param key Key. * @return Table id. */ - static int extractZoneIdFromTablesCounter(byte[] key) { + public static int extractZoneIdDataNodes(byte[] key) { String strKey = new String(key, StandardCharsets.UTF_8); - return Integer.parseInt(strKey.substring(TABLES_COUNTER_PREFIX.length(), strKey.indexOf("_part_"))); + return Integer.parseInt(strKey.substring(DISTRIBUTION_ZONE_DATA_NODES_VALUE_PREFIX.length())); } /** @@ -660,17 +673,17 @@ public static Set intersect(Set op1, Set op2) { * Returns partition assignments from meta storage. * * @param metaStorageManager Meta storage manager. - * @param tableId Table ID. + * @param zoneId Zone id. * @param partitionId Partition ID. * @return Future with partition assignments as a value. */ public static CompletableFuture> partitionAssignments( MetaStorageManager metaStorageManager, - int tableId, + int zoneId, int partitionId ) { return metaStorageManager - .get(stablePartAssignmentsKey(new TablePartitionId(tableId, partitionId))) + .get(stablePartAssignmentsKey(new ZonePartitionId(zoneId, partitionId))) .thenApply(e -> (e.value() == null) ? null : Assignments.fromBytes(e.value()).nodes()); } @@ -678,7 +691,7 @@ public static CompletableFuture> partitionAssignments( * Returns partition assignments from meta storage locally. 
* * @param metaStorageManager Meta storage manager. - * @param tableId Table id. + * @param zoneId Zone id. * @param partitionNumber Partition number. * @param revision Revision. * @return Returns partition assignments from meta storage locally or {@code null} if assignments is absent. @@ -686,11 +699,11 @@ public static CompletableFuture> partitionAssignments( @Nullable public static Set partitionAssignmentsGetLocally( MetaStorageManager metaStorageManager, - int tableId, + int zoneId, int partitionNumber, long revision ) { - Entry entry = metaStorageManager.getLocally(stablePartAssignmentsKey(new TablePartitionId(tableId, partitionNumber)), revision); + Entry entry = metaStorageManager.getLocally(stablePartAssignmentsKey(new ZonePartitionId(zoneId, partitionNumber)), revision); return (entry == null || entry.empty() || entry.tombstone()) ? null : Assignments.fromBytes(entry.value()).nodes(); } @@ -699,14 +712,14 @@ public static Set partitionAssignmentsGetLocally( * Returns table assignments for table partitions from meta storage. * * @param metaStorageManager Meta storage manager. - * @param tableId Table id. + * @param zoneId Zone id. * @param partitionIds IDs of partitions to get assignments for. If empty, get all partition assignments. * @param numberOfPartitions Number of partitions. Ignored if partition IDs are specified. * @return Future with table assignments as a value. 
*/ - static CompletableFuture> tableAssignments( + static CompletableFuture> zoneAssignments( MetaStorageManager metaStorageManager, - int tableId, + int zoneId, Set partitionIds, int numberOfPartitions ) { @@ -717,7 +730,7 @@ static CompletableFuture> tableAssignments( : partitionIds; for (Integer partId : ids) { - partitionKeysToPartitionNumber.put(stablePartAssignmentsKey(new TablePartitionId(tableId, partId)), partId); + partitionKeysToPartitionNumber.put(stablePartAssignmentsKey(new ZonePartitionId(zoneId, partId)), partId); } return metaStorageManager.getAll(partitionKeysToPartitionNumber.keySet()) @@ -740,7 +753,7 @@ static CompletableFuture> tableAssignments( assert numberOfMsPartitions == 0 || numberOfMsPartitions == entries.size() : "Invalid number of stable partition entries received from meta storage [received=" - + numberOfMsPartitions + ", numberOfPartitions=" + entries.size() + ", tableId=" + tableId + "]."; + + numberOfMsPartitions + ", numberOfPartitions=" + entries.size() + ", zoneId=" + zoneId + "]."; return numberOfMsPartitions == 0 ? 
Map.of() : result; }); @@ -763,7 +776,7 @@ public static List tableAssignmentsGetLocally( ) { return IntStream.range(0, numberOfPartitions) .mapToObj(p -> { - Entry e = metaStorageManager.getLocally(stablePartAssignmentsKey(new TablePartitionId(tableId, p)), revision); + Entry e = metaStorageManager.getLocally(stablePartAssignmentsKey(new ZonePartitionId(tableId, p)), revision); assert e != null && !e.empty() && !e.tombstone() : e; diff --git a/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/causalitydatanodes/DistributionZoneCausalityDataNodesTest.java b/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/causalitydatanodes/DistributionZoneCausalityDataNodesTest.java index c08a402b6d7..3cfdf7adc35 100644 --- a/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/causalitydatanodes/DistributionZoneCausalityDataNodesTest.java +++ b/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/causalitydatanodes/DistributionZoneCausalityDataNodesTest.java @@ -29,13 +29,12 @@ import static org.apache.ignite.internal.cluster.management.topology.LogicalTopologyImpl.LOGICAL_TOPOLOGY_KEY; import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.assertDataNodesFromManager; import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.assertValueInStorage; -import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.DISTRIBUTION_ZONE_DATA_NODES_VALUE_PREFIX; import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.zoneDataNodesKey; import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.zonesDataNodesPrefix; import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.zonesLogicalTopologyKey; import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.zonesLogicalTopologyPrefix; import 
static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.zonesLogicalTopologyVersionKey; -import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractZoneId; +import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractZoneIdDataNodes; import static org.apache.ignite.internal.testframework.IgniteTestUtils.assertThrowsWithCause; import static org.apache.ignite.internal.testframework.IgniteTestUtils.waitForCondition; import static org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willCompleteSuccessfully; @@ -1509,7 +1508,7 @@ public CompletableFuture onUpdate(WatchEvent evt) { if (startsWith(e.key(), zoneDataNodesKey().bytes())) { revision = e.revision(); - zoneId = extractZoneId(e.key(), DISTRIBUTION_ZONE_DATA_NODES_VALUE_PREFIX); + zoneId = extractZoneIdDataNodes(e.key()); byte[] dataNodesBytes = e.value(); diff --git a/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngineTest.java b/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngineTest.java index 7b7b57c330f..e2ab1487e8d 100644 --- a/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngineTest.java +++ b/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngineTest.java @@ -31,7 +31,6 @@ import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.zoneDataNodesKey; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.REBALANCE_SCHEDULER_POOL_SIZE; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.stablePartAssignmentsKey; -import static org.apache.ignite.internal.table.TableTestUtils.getTableIdStrict; import static 
org.apache.ignite.internal.testframework.IgniteTestUtils.waitForCondition; import static org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willCompleteSuccessfully; import static org.apache.ignite.internal.util.ByteUtils.toBytes; @@ -100,7 +99,7 @@ import org.apache.ignite.internal.raft.WriteCommand; import org.apache.ignite.internal.raft.service.CommandClosure; import org.apache.ignite.internal.raft.service.RaftGroupService; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.table.TableTestUtils; import org.apache.ignite.internal.testframework.IgniteAbstractTest; import org.apache.ignite.internal.thread.NamedThreadFactory; @@ -295,7 +294,7 @@ void dataNodesTriggersAssignmentsChanging() { checkAssignments(zoneNodes, RebalanceUtil::pendingPartAssignmentsKey); - verify(keyValueStorage, timeout(1000).times(8)).invoke(any(), any()); + verify(keyValueStorage, timeout(1000).times(2)).invoke(any(), any()); } @Test @@ -396,7 +395,7 @@ void staleDataNodesEvent() { checkAssignments(zoneNodes, RebalanceUtil::pendingPartAssignmentsKey); - TablePartitionId partId = new TablePartitionId(getTableId(TABLE_NAME), 0); + ZonePartitionId partId = new ZonePartitionId(zoneId, 0); assertNull(keyValueStorage.get(RebalanceUtil.plannedPartAssignmentsKey(partId).bytes()).value()); @@ -412,7 +411,7 @@ void replicasTriggersAssignmentsChangingOnNonDefaultZones() throws Exception { byte[] assignmentsBytes = Assignments.of(Assignment.forPeer("node0")).toBytes(); keyValueStorage.put( - stablePartAssignmentsKey(new TablePartitionId(getTableId(TABLE_NAME), 0)).bytes(), assignmentsBytes, + stablePartAssignmentsKey(new ZonePartitionId(getZoneId(ZONE_NAME_0), 0)).bytes(), assignmentsBytes, clock.now() ); @@ -443,7 +442,7 @@ void replicasTriggersAssignmentsChangingOnDefaultZone() throws Exception { byte[] assignmentsBytes = 
Assignments.of(Assignment.forPeer("node0")).toBytes(); keyValueStorage.put( - stablePartAssignmentsKey(new TablePartitionId(getTableId(TABLE_NAME), i)).bytes(), assignmentsBytes, + stablePartAssignmentsKey(new ZonePartitionId(getZoneId(ZONE_NAME_0), i)).bytes(), assignmentsBytes, clock.now() ); } @@ -479,18 +478,16 @@ private void createRebalanceEngine(MetaStorageManager metaStorageManager) { ); } - private void checkAssignments(Map> zoneNodes, Function assignmentFunction) { + private void checkAssignments(Map> zoneNodes, Function assignmentFunction) { int catalogVersion = catalogManager.latestCatalogVersion(); catalogManager.tables(catalogVersion).forEach(tableDescriptor -> { - int tableId = tableDescriptor.id(); - CatalogZoneDescriptor zoneDescriptor = catalogManager.zone(tableDescriptor.zoneId(), catalogVersion); assertNotNull(zoneDescriptor, "tableName=" + tableDescriptor.name() + ", zoneId=" + tableDescriptor.zoneId()); for (int j = 0; j < zoneDescriptor.partitions(); j++) { - TablePartitionId partId = new TablePartitionId(tableId, j); + ZonePartitionId partId = new ZonePartitionId(zoneDescriptor.id(), j); byte[] actualAssignmentsBytes = keyValueStorage.get(assignmentFunction.apply(partId).bytes()).value(); @@ -554,7 +551,6 @@ private void createTable(String zoneName, String tableName) { List.of("k1") ); - var tableId = getTableId(tableName); var zoneId = getZoneId(zoneName); CatalogZoneDescriptor zoneDescriptor = catalogManager.zone(zoneId, catalogManager.latestCatalogVersion()); @@ -564,7 +560,7 @@ private void createTable(String zoneName, String tableName) { AffinityUtils.calculateAssignments(initialDataNodes, zoneDescriptor.partitions(), zoneDescriptor.replicas()); for (int i = 0; i < initialAssignments.size(); i++) { - var stableAssignmentPartitionKey = stablePartAssignmentsKey(new TablePartitionId(tableId, i)).bytes(); + var stableAssignmentPartitionKey = stablePartAssignmentsKey(new ZonePartitionId(zoneId, i)).bytes(); 
keyValueStorage.put(stableAssignmentPartitionKey, Assignments.toBytes(initialAssignments.get(i)), clock.now()); } @@ -573,8 +569,4 @@ private void createTable(String zoneName, String tableName) { private int getZoneId(String zoneName) { return getZoneIdStrict(catalogManager, zoneName, clock.nowLong()); } - - private int getTableId(String tableName) { - return getTableIdStrict(catalogManager, tableName, clock.nowLong()); - } } diff --git a/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtilUpdateAssignmentsTest.java b/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtilUpdateAssignmentsTest.java index 184f3c6223e..8324e5d2192 100644 --- a/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtilUpdateAssignmentsTest.java +++ b/modules/distribution-zones/src/test/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtilUpdateAssignmentsTest.java @@ -20,7 +20,7 @@ import static java.util.stream.Collectors.toSet; import static org.apache.ignite.internal.affinity.AffinityUtils.calculateAssignmentForPartition; import static org.apache.ignite.internal.affinity.Assignments.toBytes; -import static org.apache.ignite.internal.catalog.CatalogService.DEFAULT_STORAGE_PROFILE; +import static org.apache.ignite.internal.catalog.commands.CatalogUtils.DEFAULT_FILTER; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; @@ -39,8 +39,7 @@ import java.util.stream.IntStream; import org.apache.ignite.internal.affinity.Assignment; import org.apache.ignite.internal.affinity.Assignments; -import org.apache.ignite.internal.catalog.descriptors.CatalogTableColumnDescriptor; -import org.apache.ignite.internal.catalog.descriptors.CatalogTableDescriptor; +import 
org.apache.ignite.internal.catalog.descriptors.CatalogZoneDescriptor; import org.apache.ignite.internal.hlc.HybridTimestamp; import org.apache.ignite.internal.lang.IgniteInternalException; import org.apache.ignite.internal.logger.IgniteLogger; @@ -59,9 +58,8 @@ import org.apache.ignite.internal.raft.WriteCommand; import org.apache.ignite.internal.raft.service.CommandClosure; import org.apache.ignite.internal.raft.service.RaftGroupService; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.testframework.IgniteAbstractTest; -import org.apache.ignite.sql.ColumnType; import org.jetbrains.annotations.Nullable; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -85,16 +83,16 @@ public class RebalanceUtilUpdateAssignmentsTest extends IgniteAbstractTest { private MetaStorageManager metaStorageManager; - private final CatalogTableDescriptor tableDescriptor = new CatalogTableDescriptor( + private final CatalogZoneDescriptor zoneDescriptor = new CatalogZoneDescriptor( 1, - -1, - -1, - "table1", + "TEST_ZONE", + 1, + 1, + 0, + 0, 0, - List.of(new CatalogTableColumnDescriptor("k1", ColumnType.INT32, false, 0, 0, 0, null)), - List.of("k1"), - null, - DEFAULT_STORAGE_PROFILE + DEFAULT_FILTER, + null ); private static final int partNum = 2; @@ -470,7 +468,7 @@ private void test( Set expectedPendingAssignments, Set expectedPlannedAssignments ) { - TablePartitionId tablePartitionId = new TablePartitionId(1, 1); + ZonePartitionId tablePartitionId = new ZonePartitionId(1, 1); if (currentStableAssignments != null) { keyValueStorage.put(RebalanceUtil.stablePartAssignmentsKey(tablePartitionId).bytes(), toBytes(currentStableAssignments), @@ -488,7 +486,7 @@ private void test( } RebalanceUtil.updatePendingAssignmentsKeys( - tableDescriptor, tablePartitionId, nodesForNewAssignments, + zoneDescriptor, tablePartitionId, nodesForNewAssignments, 
replicas, 1, metaStorageManager, partNum, tableCfgAssignments ); diff --git a/modules/index/src/main/java/org/apache/ignite/internal/index/ChangeIndexStatusTask.java b/modules/index/src/main/java/org/apache/ignite/internal/index/ChangeIndexStatusTask.java index 7503771c5bf..6dc367878cd 100644 --- a/modules/index/src/main/java/org/apache/ignite/internal/index/ChangeIndexStatusTask.java +++ b/modules/index/src/main/java/org/apache/ignite/internal/index/ChangeIndexStatusTask.java @@ -62,7 +62,7 @@ import org.apache.ignite.internal.placementdriver.PlacementDriver; import org.apache.ignite.internal.placementdriver.PrimaryReplicaAwaitTimeoutException; import org.apache.ignite.internal.placementdriver.ReplicaMeta; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.util.IgniteSpinBusyLock; /** @@ -235,9 +235,9 @@ private CompletableFuture ensureThatLocalNodeStillPrimaryReplica() { private CompletableFuture awaitPrimaryReplica() { return inBusyLocks(() -> { - TablePartitionId groupId = new TablePartitionId(indexDescriptor.tableId(), 0); + ZonePartitionId groupId = new ZonePartitionId(indexDescriptor.zoneId(), indexDescriptor.tableId(), 0); - return placementDriver.awaitPrimaryReplica(groupId, clockService.now(), AWAIT_PRIMARY_REPLICA_TIMEOUT_SEC, SECONDS) + return placementDriver.awaitPrimaryReplicaForTable(groupId, clockService.now(), AWAIT_PRIMARY_REPLICA_TIMEOUT_SEC, SECONDS) .handle((replicaMeta, throwable) -> { if (throwable != null) { Throwable cause = unwrapCause(throwable); diff --git a/modules/index/src/main/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskController.java b/modules/index/src/main/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskController.java index 4787cb31d6c..2cf246389ce 100644 --- a/modules/index/src/main/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskController.java +++ 
b/modules/index/src/main/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskController.java @@ -38,7 +38,7 @@ import org.apache.ignite.internal.placementdriver.ReplicaMeta; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.util.IgniteSpinBusyLock; /** @@ -147,7 +147,7 @@ private void onIndexRemoved(RemoveIndexEventParameters parameters) { private void onPrimaryReplicaElected(PrimaryReplicaEventParameters parameters) { inBusyLock(busyLock, () -> { - TablePartitionId primaryReplicaId = (TablePartitionId) parameters.groupId(); + ZonePartitionId primaryReplicaId = (ZonePartitionId) parameters.groupId(); if (primaryReplicaId.partitionId() != 0) { // We are only interested in the 0 partition. diff --git a/modules/index/src/main/java/org/apache/ignite/internal/index/IndexBuildController.java b/modules/index/src/main/java/org/apache/ignite/internal/index/IndexBuildController.java index e5ec7620265..e552f035540 100644 --- a/modules/index/src/main/java/org/apache/ignite/internal/index/IndexBuildController.java +++ b/modules/index/src/main/java/org/apache/ignite/internal/index/IndexBuildController.java @@ -28,7 +28,6 @@ import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture; import static org.apache.ignite.internal.util.IgniteUtils.inBusyLock; import static org.apache.ignite.internal.util.IgniteUtils.inBusyLockAsync; -import static org.apache.ignite.lang.ErrorGroups.Common.INTERNAL_ERR; import java.util.ArrayList; import java.util.Set; @@ -45,7 +44,8 @@ import org.apache.ignite.internal.close.ManuallyCloseable; import org.apache.ignite.internal.hlc.ClockService; import org.apache.ignite.internal.hlc.HybridTimestamp; -import 
org.apache.ignite.internal.lang.IgniteInternalException; +import org.apache.ignite.internal.logger.IgniteLogger; +import org.apache.ignite.internal.logger.Loggers; import org.apache.ignite.internal.network.ClusterService; import org.apache.ignite.internal.placementdriver.PlacementDriver; import org.apache.ignite.internal.placementdriver.PrimaryReplicaAwaitTimeoutException; @@ -53,13 +53,13 @@ import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.storage.MvPartitionStorage; import org.apache.ignite.internal.storage.engine.MvTableStorage; import org.apache.ignite.internal.storage.index.IndexStorage; import org.apache.ignite.internal.util.ExceptionUtils; import org.apache.ignite.internal.util.IgniteSpinBusyLock; import org.apache.ignite.network.ClusterNode; -import org.jetbrains.annotations.Nullable; /** * Component is responsible for starting and stopping the building of indexes on primary replicas. @@ -78,6 +78,8 @@ * node restart but after {@link ReplicaMeta#getExpirationTime()}.

*/ class IndexBuildController implements ManuallyCloseable { + private static final IgniteLogger LOG = Loggers.forClass(IndexBuildController.class); + private final IndexBuilder indexBuilder; private final IndexManager indexManager; @@ -94,7 +96,7 @@ class IndexBuildController implements ManuallyCloseable { private final AtomicBoolean closeGuard = new AtomicBoolean(); - private final Set primaryReplicaIds = ConcurrentHashMap.newKeySet(); + private final Set primaryReplicaIds = ConcurrentHashMap.newKeySet(); /** Constructor. */ IndexBuildController( @@ -146,16 +148,29 @@ private CompletableFuture onIndexBuilding(StartBuildingIndexEventParameters p var startBuildIndexFutures = new ArrayList>(); - for (TablePartitionId primaryReplicaId : primaryReplicaIds) { - if (primaryReplicaId.tableId() == indexDescriptor.tableId()) { - CompletableFuture startBuildIndexFuture = getMvTableStorageFuture(parameters.causalityToken(), primaryReplicaId) - .thenCompose(mvTableStorage -> awaitPrimaryReplica(primaryReplicaId, clockService.now()) - .thenAccept(replicaMeta -> tryScheduleBuildIndex( - primaryReplicaId, - indexDescriptor, - mvTableStorage, - replicaMeta - )) + for (ZonePartitionId zonePartitionId : primaryReplicaIds) { + + int tableId = zonePartitionId.tableId(); + + TablePartitionId tablePartId = new TablePartitionId(tableId, zonePartitionId.partitionId()); + + if (tableId == indexDescriptor.tableId()) { + CompletableFuture startBuildIndexFuture = getMvTableStorageFuture(parameters.causalityToken(), tablePartId) + .thenCompose(mvTableStorage -> { + if (mvTableStorage == null) { + LOG.info("The table has been removed, so the index build is skipped [tblId={}].", tableId); + + return nullCompletedFuture(); + } + + return awaitPrimaryReplica(zonePartitionId, clockService.now()) + .thenAccept(replicaMeta -> tryScheduleBuildIndex( + tablePartId, + indexDescriptor, + mvTableStorage, + replicaMeta + )); + } ); startBuildIndexFutures.add(startBuildIndexFuture); @@ -176,26 +191,37 @@ 
private CompletableFuture onIndexRemoved(RemoveIndexEventParameters parameter private CompletableFuture onPrimaryReplicaElected(PrimaryReplicaEventParameters parameters) { return inBusyLockAsync(busyLock, () -> { - TablePartitionId primaryReplicaId = (TablePartitionId) parameters.groupId(); + ZonePartitionId zonePartitionId = (ZonePartitionId) parameters.groupId(); - if (isLocalNode(clusterService, parameters.leaseholderId())) { - primaryReplicaIds.add(primaryReplicaId); + int tableId = zonePartitionId.tableId(); + TablePartitionId tablePartitionId = new TablePartitionId(tableId, zonePartitionId.partitionId()); + + if (isLocalNode(clusterService, parameters.leaseholderId())) { // It is safe to get the latest version of the catalog because the PRIMARY_REPLICA_ELECTED event is handled on the // metastore thread. int catalogVersion = catalogService.latestCatalogVersion(); - return getMvTableStorageFuture(parameters.causalityToken(), primaryReplicaId) - .thenCompose(mvTableStorage -> awaitPrimaryReplica(primaryReplicaId, parameters.startTime()) - .thenAccept(replicaMeta -> tryScheduleBuildIndexesForNewPrimaryReplica( - catalogVersion, - primaryReplicaId, - mvTableStorage, - replicaMeta - )) - ); + primaryReplicaIds.add(zonePartitionId); + + return getMvTableStorageFuture(parameters.causalityToken(), tablePartitionId).thenCompose(mvTableStorage -> { + if (mvTableStorage == null) { + LOG.info("The table has been removed, so the index build is skipped [tblId={}].", tableId); + + return nullCompletedFuture(); + } + + return inBusyLock(busyLock, () -> awaitPrimaryReplica(zonePartitionId, parameters.startTime())) + .thenAccept(replicaMeta -> inBusyLock(busyLock, () -> tryScheduleBuildIndexesForNewPrimaryReplica( + catalogVersion, + tablePartitionId, + mvTableStorage, + replicaMeta + )) + ); + }); } else { - stopBuildingIndexesIfPrimaryExpired(primaryReplicaId); + stopBuildingIndexesIfPrimaryExpired(tablePartitionId); return nullCompletedFuture(); } @@ -256,32 +282,19 @@ 
private void tryScheduleBuildIndex( * @param replicaId Replica ID. */ private void stopBuildingIndexesIfPrimaryExpired(TablePartitionId replicaId) { - if (primaryReplicaIds.remove(replicaId)) { + if (primaryReplicaIds.removeIf(z -> z.tableId() == replicaId.tableId() && z.partitionId() == replicaId.partitionId())) { // Primary replica is no longer current, we need to stop building indexes for it. indexBuilder.stopBuildingIndexes(replicaId.tableId(), replicaId.partitionId()); } } private CompletableFuture getMvTableStorageFuture(long causalityToken, TablePartitionId replicaId) { - return indexManager.getMvTableStorage(causalityToken, replicaId.tableId()) - .thenApply(mvTableStorage -> requireMvTableStorageNonNull(mvTableStorage, replicaId.tableId())); - } - - private static MvTableStorage requireMvTableStorageNonNull(@Nullable MvTableStorage mvTableStorage, int tableId) { - if (mvTableStorage == null) { - throw new IgniteInternalException( - INTERNAL_ERR, - "Table storage for the specified table cannot be null [tableId = {}]", - tableId - ); - } - - return mvTableStorage; + return indexManager.getMvTableStorage(causalityToken, replicaId.tableId()); } - private CompletableFuture awaitPrimaryReplica(TablePartitionId replicaId, HybridTimestamp timestamp) { + private CompletableFuture awaitPrimaryReplica(ZonePartitionId replicaId, HybridTimestamp timestamp) { return placementDriver - .awaitPrimaryReplica(replicaId, timestamp, AWAIT_PRIMARY_REPLICA_TIMEOUT_SEC, SECONDS) + .awaitPrimaryReplicaForTable(replicaId, timestamp, AWAIT_PRIMARY_REPLICA_TIMEOUT_SEC, SECONDS) .handle((replicaMeta, throwable) -> { if (throwable != null) { Throwable unwrapThrowable = ExceptionUtils.unwrapCause(throwable); diff --git a/modules/index/src/main/java/org/apache/ignite/internal/index/IndexManager.java b/modules/index/src/main/java/org/apache/ignite/internal/index/IndexManager.java index 5a4bfc01489..7be29600cab 100644 --- 
a/modules/index/src/main/java/org/apache/ignite/internal/index/IndexManager.java +++ b/modules/index/src/main/java/org/apache/ignite/internal/index/IndexManager.java @@ -60,7 +60,6 @@ import org.apache.ignite.internal.table.distributed.PartitionSet; import org.apache.ignite.internal.table.distributed.TableManager; import org.apache.ignite.internal.util.IgniteSpinBusyLock; -import org.jetbrains.annotations.Nullable; /** * An Ignite component that is responsible for handling index-related commands like CREATE or DROP @@ -172,8 +171,14 @@ public CompletableFuture stopAsync() { * @return Future with multi-version table storage, completes with {@code null} if the table does not exist according to the passed * parameters. */ - CompletableFuture<@Nullable MvTableStorage> getMvTableStorage(long causalityToken, int tableId) { - return tableManager.tableAsync(causalityToken, tableId).thenApply(table -> table == null ? null : table.internalTable().storage()); + CompletableFuture getMvTableStorage(long causalityToken, int tableId) { + return tableManager.tableAsync(causalityToken, tableId).thenApply(table -> { + if (table == null) { + return null; + } + + return table.internalTable().storage(); + }); } private CompletableFuture onIndexCreate(CreateIndexEventParameters parameters) { diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskControllerTest.java b/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskControllerTest.java index b119dcda9fd..6d3b7f0de33 100644 --- a/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskControllerTest.java +++ b/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskControllerTest.java @@ -49,7 +49,7 @@ import org.apache.ignite.internal.network.ClusterNodeImpl; import org.apache.ignite.internal.network.ClusterService; import org.apache.ignite.internal.placementdriver.ReplicaMeta; -import 
org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest; import org.apache.ignite.network.ClusterNode; import org.apache.ignite.network.NetworkAddress; @@ -190,17 +190,21 @@ private void setPrimaryReplicaAnotherNode() { } private void setPrimaryReplica(ClusterNode clusterNode) { - TablePartitionId groupId = new TablePartitionId(tableId(), 0); + ZonePartitionId zonePartId = new ZonePartitionId(zoneId(), tableId(), 0); - ReplicaMeta replicaMeta = newPrimaryReplicaMeta(clusterNode, groupId, HybridTimestamp.MIN_VALUE, HybridTimestamp.MAX_VALUE); + ReplicaMeta replicaMeta = newPrimaryReplicaMeta(clusterNode, zonePartId, HybridTimestamp.MIN_VALUE, HybridTimestamp.MAX_VALUE); - assertThat(placementDriver.setPrimaryReplicaMeta(0, groupId, completedFuture(replicaMeta)), willCompleteSuccessfully()); + assertThat(placementDriver.setPrimaryReplicaMeta(0, zonePartId, completedFuture(replicaMeta)), willCompleteSuccessfully()); } private int tableId() { return TestIndexManagementUtils.tableId(catalogManager, TABLE_NAME, clock); } + private int zoneId() { + return catalogManager.catalog(catalogManager.latestCatalogVersion()).table(tableId()).zoneId(); + } + private CatalogIndexDescriptor indexDescriptor() { return TestIndexManagementUtils.indexDescriptor(catalogManager, INDEX_NAME, clock); } diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskTest.java b/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskTest.java index 2031b554384..1b19fb41746 100644 --- a/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskTest.java +++ b/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskTest.java @@ -84,6 +84,7 @@ import org.apache.ignite.internal.placementdriver.PrimaryReplicaAwaitTimeoutException; import 
org.apache.ignite.internal.placementdriver.ReplicaMeta; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.testframework.IgniteAbstractTest; import org.apache.ignite.internal.util.IgniteSpinBusyLock; import org.apache.ignite.network.ClusterNode; @@ -145,7 +146,7 @@ void setUp() { createLocalNodeReplicaMeta(HybridTimestamp.MIN_VALUE, HybridTimestamp.MAX_VALUE) ); - when(placementDriver.awaitPrimaryReplica(any(), any(), anyLong(), any())).thenReturn(localNodeReplicaMetaFuture); + when(placementDriver.awaitPrimaryReplicaForTable(any(), any(), anyLong(), any())).thenReturn(localNodeReplicaMetaFuture); CompletableFuture logicalTopologySnapshotFuture = completedFuture( new LogicalTopologySnapshot(1, List.of(LOGICAL_LOCAL_NODE)) @@ -189,7 +190,7 @@ void testSimpleTaskExecution() { verify(executor, atLeast(3)).execute(any()); verify(clockWaiter, atLeast(2)).waitFor(any()); - verify(placementDriver).awaitPrimaryReplica(any(), any(), anyLong(), any()); + verify(placementDriver).awaitPrimaryReplicaForTable(any(), any(), anyLong(), any()); verify(logicalTopologyService).logicalTopologyOnLeader(); verify(logicalTopologyService).addEventListener(any()); verify(logicalTopologyService).removeEventListener(any()); @@ -204,7 +205,7 @@ void testTimeoutAndSuccessOnAwaitPrimaryReplica() { createLocalNodeReplicaMeta(HybridTimestamp.MIN_VALUE, HybridTimestamp.MAX_VALUE) ); - when(placementDriver.awaitPrimaryReplica(any(), any(), anyLong(), any())).thenReturn( + when(placementDriver.awaitPrimaryReplicaForTable(any(), any(), anyLong(), any())).thenReturn( awaitPrimaryReplicaFuture0, awaitPrimaryReplicaFuture1 ); @@ -212,7 +213,7 @@ void testTimeoutAndSuccessOnAwaitPrimaryReplica() { assertThat(task.start(), willCompleteSuccessfully()); assertEquals(BUILDING, actualIndexStatus()); - verify(placementDriver, times(2)).awaitPrimaryReplica(any(), any(), anyLong(), any()); + 
verify(placementDriver, times(2)).awaitPrimaryReplicaForTable(any(), any(), anyLong(), any()); } @Test @@ -223,7 +224,7 @@ void testTimeoutAndExpireOnAwaitPrimaryReplica() { createLocalNodeReplicaMeta(HybridTimestamp.MIN_VALUE, HybridTimestamp.MIN_VALUE.addPhysicalTime(1)) ); - when(placementDriver.awaitPrimaryReplica(any(), any(), anyLong(), any())).thenReturn( + when(placementDriver.awaitPrimaryReplicaForTable(any(), any(), anyLong(), any())).thenReturn( awaitPrimaryReplicaFuture0, awaitPrimaryReplicaFuture1 ); @@ -231,7 +232,7 @@ void testTimeoutAndExpireOnAwaitPrimaryReplica() { assertThat(task.start(), willThrow(IndexTaskStoppingException.class)); assertEquals(REGISTERED, actualIndexStatus()); - verify(placementDriver, times(2)).awaitPrimaryReplica(any(), any(), anyLong(), any()); + verify(placementDriver, times(2)).awaitPrimaryReplicaForTable(any(), any(), anyLong(), any()); } @Test @@ -240,7 +241,7 @@ void testTimeoutAndErrorOnAwaitPrimaryReplica() { CompletableFuture awaitPrimaryReplicaFuture1 = failedFuture(primaryReplicaAwaitException()); - when(placementDriver.awaitPrimaryReplica(any(), any(), anyLong(), any())).thenReturn( + when(placementDriver.awaitPrimaryReplicaForTable(any(), any(), anyLong(), any())).thenReturn( awaitPrimaryReplicaFuture0, awaitPrimaryReplicaFuture1 ); @@ -248,7 +249,7 @@ void testTimeoutAndErrorOnAwaitPrimaryReplica() { assertThat(task.start(), willThrow(PrimaryReplicaAwaitException.class)); assertEquals(REGISTERED, actualIndexStatus()); - verify(placementDriver, times(2)).awaitPrimaryReplica(any(), any(), anyLong(), any()); + verify(placementDriver, times(2)).awaitPrimaryReplicaForTable(any(), any(), anyLong(), any()); } @Test @@ -326,7 +327,7 @@ private CatalogIndexStatus actualIndexStatus() { } private ReplicaMeta createLocalNodeReplicaMeta(HybridTimestamp startTime, HybridTimestamp expirationTime) { - return newPrimaryReplicaMeta(LOCAL_NODE, new TablePartitionId(indexDescriptor.tableId(), 0), startTime, expirationTime); + 
return newPrimaryReplicaMeta(LOCAL_NODE, new ZonePartitionId(0, indexDescriptor.tableId(), 0), startTime, expirationTime); } private static ClusterService createClusterService() { diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/IndexBuildControllerTest.java b/modules/index/src/test/java/org/apache/ignite/internal/index/IndexBuildControllerTest.java index 6a23070f4b6..062845bcb7f 100644 --- a/modules/index/src/test/java/org/apache/ignite/internal/index/IndexBuildControllerTest.java +++ b/modules/index/src/test/java/org/apache/ignite/internal/index/IndexBuildControllerTest.java @@ -64,6 +64,7 @@ import org.apache.ignite.internal.placementdriver.ReplicaMeta; import org.apache.ignite.internal.placementdriver.leases.Lease; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.storage.MvPartitionStorage; import org.apache.ignite.internal.storage.engine.MvTableStorage; import org.apache.ignite.internal.storage.index.IndexStorage; @@ -377,12 +378,16 @@ private int tableId(String tableName) { return getTableIdStrict(catalogManager, tableName, clock.nowLong()); } + private int zoneId() { + return catalogManager.catalog(catalogManager.latestCatalogVersion()).table(tableId()).zoneId(); + } + private int indexId(String indexName) { return getIndexIdStrict(catalogManager, indexName, clock.nowLong()); } - private TablePartitionId replicaId(int partitionId) { - return new TablePartitionId(tableId(), partitionId); + private ZonePartitionId replicaId(int partitionId) { + return new ZonePartitionId(zoneId(), tableId(), partitionId); } private ReplicaMeta replicaMetaForOneSecond(String leaseholder, String leaseholderId, HybridTimestamp startTime) { diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/IndexManagementUtilsTest.java b/modules/index/src/test/java/org/apache/ignite/internal/index/IndexManagementUtilsTest.java index 
ad290ab97b5..0fc2792e615 100644 --- a/modules/index/src/test/java/org/apache/ignite/internal/index/IndexManagementUtilsTest.java +++ b/modules/index/src/test/java/org/apache/ignite/internal/index/IndexManagementUtilsTest.java @@ -51,7 +51,7 @@ import org.apache.ignite.internal.network.ClusterNodeImpl; import org.apache.ignite.internal.network.ClusterService; import org.apache.ignite.internal.placementdriver.ReplicaMeta; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest; import org.apache.ignite.internal.util.IgniteSpinBusyLock; import org.apache.ignite.network.ClusterNode; @@ -102,7 +102,7 @@ void testExtractIndexIdFromPartitionBuildIndexKey() { @Test void testIsPrimaryReplicaTrue() { - TablePartitionId replicaGroupId = new TablePartitionId(1, 0); + ZonePartitionId replicaGroupId = new ZonePartitionId(0, 1, 0); HybridTimestamp startTime = clock.now(); long dayInMillis = TimeUnit.DAYS.toMillis(1); @@ -114,7 +114,7 @@ void testIsPrimaryReplicaTrue() { @Test void testIsPrimaryReplicaFalse() { - TablePartitionId replicaGroupId = new TablePartitionId(1, 0); + ZonePartitionId replicaGroupId = new ZonePartitionId(0, 1, 0); ClusterNode otherNode = new ClusterNodeImpl(NODE_ID + "-other", NODE_NAME + "-other", mock(NetworkAddress.class)); diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/TestIndexManagementUtils.java b/modules/index/src/test/java/org/apache/ignite/internal/index/TestIndexManagementUtils.java index d8992a9c196..df17605a8c5 100644 --- a/modules/index/src/test/java/org/apache/ignite/internal/index/TestIndexManagementUtils.java +++ b/modules/index/src/test/java/org/apache/ignite/internal/index/TestIndexManagementUtils.java @@ -48,7 +48,7 @@ import org.apache.ignite.internal.network.ClusterNodeImpl; import org.apache.ignite.internal.placementdriver.ReplicaMeta; import 
org.apache.ignite.internal.placementdriver.leases.Lease; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.table.TableTestUtils; import org.apache.ignite.network.ClusterNode; import org.apache.ignite.network.NetworkAddress; @@ -129,7 +129,7 @@ static void assertMetastoreKeyPresent(MetaStorageManager metaStorageManager, Byt static ReplicaMeta newPrimaryReplicaMeta( ClusterNode clusterNode, - TablePartitionId replicaGroupId, + ZonePartitionId replicaGroupId, HybridTimestamp startTime, HybridTimestamp expirationTime ) { diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/TestPlacementDriver.java b/modules/index/src/test/java/org/apache/ignite/internal/index/TestPlacementDriver.java index fe207e75888..73ea06f9d2b 100644 --- a/modules/index/src/test/java/org/apache/ignite/internal/index/TestPlacementDriver.java +++ b/modules/index/src/test/java/org/apache/ignite/internal/index/TestPlacementDriver.java @@ -18,6 +18,7 @@ package org.apache.ignite.internal.index; import java.util.Map; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; @@ -28,7 +29,8 @@ import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; import org.apache.ignite.internal.replicator.ReplicationGroupId; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; +import org.apache.ignite.internal.util.CompletableFutures; /** Implementation for tests. 
*/ class TestPlacementDriver extends AbstractEventProducer implements PlacementDriver { @@ -40,6 +42,23 @@ public CompletableFuture awaitPrimaryReplica( HybridTimestamp timestamp, long timeout, TimeUnit unit + ) { + assert groupId instanceof ZonePartitionId : "Unexpected replication group type [type=" + groupId.getClass().getSimpleName() + ']'; + + return awaitPrimaryReplicaForTable( + groupId, + timestamp, + timeout, + unit + ); + } + + @Override + public CompletableFuture awaitPrimaryReplicaForTable( + ReplicationGroupId groupId, + HybridTimestamp timestamp, + long timeout, + TimeUnit unit ) { return primaryReplicaMetaFutureById.get(groupId); } @@ -56,7 +75,7 @@ public CompletableFuture previousPrimaryExpired(ReplicationGroupId grpId) CompletableFuture setPrimaryReplicaMeta( long causalityToken, - TablePartitionId replicaId, + ZonePartitionId replicaId, CompletableFuture replicaMetaFuture ) { primaryReplicaMetaFutureById.put(replicaId, replicaMetaFuture); @@ -72,4 +91,18 @@ CompletableFuture setPrimaryReplicaMeta( ) )); } + + @Override + public CompletableFuture addSubgroups( + ZonePartitionId zoneId, + Long enlistmentConsistencyToken, + Set subGrps + ) { + return CompletableFutures.nullCompletedFuture(); + } + + @Override + public ReplicaMeta getLeaseMeta(ReplicationGroupId grpId) { + return null; + } } diff --git a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriver.java b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriver.java index f06e28a8b3d..d4b45a00049 100644 --- a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriver.java +++ b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriver.java @@ -17,6 +17,7 @@ package org.apache.ignite.internal.placementdriver; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; 
import org.apache.ignite.internal.event.EventProducer; @@ -24,6 +25,7 @@ import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; import org.apache.ignite.internal.replicator.ReplicationGroupId; +import org.apache.ignite.internal.replicator.ZonePartitionId; /** * Service that provides an ability to await and retrieve primary replicas for replication groups. @@ -59,6 +61,28 @@ CompletableFuture awaitPrimaryReplica( TimeUnit unit ); + /** + * Temporary solution for awaiting {@link ReplicaMeta}. Waits for + * {@link ReplicaMeta} for {@link org.apache.ignite.internal.replicator.TablePartitionId} + * based on the {@link ZonePartitionId#tableId()}. + * + * @param groupId Replication group id. + * @param timestamp CLOCK_SKEW aware timestamp reference value. + * @param timeout How long to wait before completing exceptionally with a TimeoutException, in units of unit. + * @param unit A TimeUnit determining how to interpret the timeout parameter. + * @return Primary replica future. + * @throws PrimaryReplicaAwaitTimeoutException If primary replica await timed out. + * @throws PrimaryReplicaAwaitException If primary replica await failed with any other reason except timeout. + */ + // TODO: https://issues.apache.org/jira/browse/IGNITE-20362 + @Deprecated + CompletableFuture awaitPrimaryReplicaForTable( + ReplicationGroupId groupId, + HybridTimestamp timestamp, + long timeout, + TimeUnit unit + ); + /** * Same as {@link #awaitPrimaryReplica(ReplicationGroupId, HybridTimestamp, long, TimeUnit)} despite the fact that given method await * logic is bounded. It will wait for a primary replica for a reasonable period of time, and complete a future with null if a matching @@ -78,4 +102,24 @@ CompletableFuture awaitPrimaryReplica( * @return Future. */ CompletableFuture previousPrimaryExpired(ReplicationGroupId grpId); + + /** + * Gets a cached lease by a zone replication group. 
+ * + * @param grpId Replication group id. + * @return Lease or {@code null}. + */ + ReplicaMeta getLeaseMeta(ReplicationGroupId grpId); + + /** + * Tries to update the lease in order to include the new subgroup. The set of groups will be added to the set of lease subgroups + * ({@link ReplicaMeta#subgroups()}) for the specific lease determined by the zone id. + * TODO: IGNITE-20362 When replicas are started by zone, the method is removed. + * + * @param zoneId Zone id. + * @param enlistmentConsistencyToken Lease token. + * @param subGrps Table ids. + * @return Future to complete. + */ + CompletableFuture addSubgroups(ZonePartitionId zoneId, Long enlistmentConsistencyToken, Set subGrps); } diff --git a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/ReplicaMeta.java b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/ReplicaMeta.java index 01bbf819cda..40b8ec78ea8 100644 --- a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/ReplicaMeta.java +++ b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/ReplicaMeta.java @@ -18,7 +18,9 @@ package org.apache.ignite.internal.placementdriver; import java.io.Serializable; +import java.util.Set; import org.apache.ignite.internal.hlc.HybridTimestamp; +import org.apache.ignite.internal.replicator.ReplicationGroupId; import org.jetbrains.annotations.Nullable; /** Replica lease meta. */ @@ -34,4 +36,11 @@ public interface ReplicaMeta extends Serializable { /** Gets a lease expiration timestamp. */ HybridTimestamp getExpirationTime(); + + /** + * Gets partition replication groups. + * + * @return Set of replication sub groups. 
+ */ + Set subgroups(); } diff --git a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/message/LeaseGrantedMessageResponse.java b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/message/LeaseGrantedMessageResponse.java index ad30b64481f..f2b408f2490 100644 --- a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/message/LeaseGrantedMessageResponse.java +++ b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/message/LeaseGrantedMessageResponse.java @@ -17,7 +17,10 @@ package org.apache.ignite.internal.placementdriver.message; +import java.util.Set; +import org.apache.ignite.internal.network.annotations.Marshallable; import org.apache.ignite.internal.network.annotations.Transferable; +import org.apache.ignite.internal.replicator.ReplicationGroupId; import org.jetbrains.annotations.Nullable; /** @@ -29,4 +32,9 @@ public interface LeaseGrantedMessageResponse extends PlacementDriverMessage { @Nullable String redirectProposal(); + + /** List of applied groups. 
*/ + @Nullable + @Marshallable + Set appliedGroups(); } diff --git a/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestPlacementDriver.java b/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestPlacementDriver.java index 440cbbb3097..3bdda01db52 100644 --- a/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestPlacementDriver.java +++ b/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestPlacementDriver.java @@ -21,6 +21,7 @@ import static java.util.concurrent.CompletableFuture.failedFuture; import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; @@ -29,6 +30,7 @@ import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; import org.apache.ignite.internal.replicator.ReplicationGroupId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.network.ClusterNode; import org.jetbrains.annotations.TestOnly; @@ -68,6 +70,16 @@ public CompletableFuture awaitPrimaryReplica( return getReplicaMetaFuture(); } + @Override + public CompletableFuture awaitPrimaryReplicaForTable( + ReplicationGroupId groupId, + HybridTimestamp timestamp, + long timeout, + TimeUnit unit + ) { + return getReplicaMetaFuture(); + } + @Override public CompletableFuture getPrimaryReplica(ReplicationGroupId replicationGroupId, HybridTimestamp timestamp) { return getReplicaMetaFuture(); @@ -98,4 +110,18 @@ public Supplier getPrimaryReplicaSupplier() { public void setPrimaryReplicaSupplier(Supplier primaryReplicaSupplier) { this.primaryReplicaSupplier = primaryReplicaSupplier; } + + @Override + public 
CompletableFuture addSubgroups( + ZonePartitionId zoneId, + Long enlistmentConsistencyToken, + Set subGrps + ) { + return nullCompletedFuture(); + } + + @Override + public ReplicaMeta getLeaseMeta(ReplicationGroupId grpId) { + return null; + } } diff --git a/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestReplicaMetaImpl.java b/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestReplicaMetaImpl.java index 886a04e1bb9..04831a72ef9 100644 --- a/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestReplicaMetaImpl.java +++ b/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestReplicaMetaImpl.java @@ -21,7 +21,9 @@ import static org.apache.ignite.internal.hlc.HybridTimestamp.MIN_VALUE; import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.Set; import org.apache.ignite.internal.hlc.HybridTimestamp; +import org.apache.ignite.internal.replicator.ReplicationGroupId; import org.apache.ignite.network.ClusterNode; import org.jetbrains.annotations.Nullable; import org.jetbrains.annotations.TestOnly; @@ -129,4 +131,9 @@ public HybridTimestamp getStartTime() { public HybridTimestamp getExpirationTime() { return expirationTime; } + + @Override + public Set subgroups() { + return Set.of(); + } } diff --git a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ActiveActorTest.java b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ActiveActorTest.java index 5b3fae8a9ff..115f1cef1f1 100644 --- a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ActiveActorTest.java +++ b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ActiveActorTest.java @@ -47,6 +47,7 @@ import 
org.apache.ignite.internal.raft.PeersAndLearners; import org.apache.ignite.internal.raft.client.AbstractTopologyAwareGroupServiceTest; import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupServiceFactory; +import org.apache.ignite.internal.testframework.WithSystemProperty; import org.apache.ignite.internal.util.Cursor; import org.apache.ignite.raft.jraft.rpc.impl.RaftGroupEventsClientListener; import org.junit.jupiter.api.AfterEach; @@ -62,6 +63,7 @@ */ @ExtendWith(MockitoExtension.class) @MockitoSettings(strictness = Strictness.LENIENT) +@WithSystemProperty(key = "IGNITE_ALWAYS_FORCE", value = "false") public class ActiveActorTest extends AbstractTopologyAwareGroupServiceTest { private final Map placementDriverManagers = new HashMap<>(); @@ -132,7 +134,8 @@ protected void afterNodeStart( logicalTopologyService, mockRaftMgr, raftGroupServiceFactory, - new TestClockService(new HybridClockImpl()) + new TestClockService(new HybridClockImpl()), + grp -> ZONE_GROUP_ID ); assertThat(placementDriverManager.startAsync(), willCompleteSuccessfully()); diff --git a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/BasePlacementDriverTest.java b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/BasePlacementDriverTest.java index 30003ad3842..25910eb9e07 100644 --- a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/BasePlacementDriverTest.java +++ b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/BasePlacementDriverTest.java @@ -33,7 +33,7 @@ import org.apache.ignite.internal.placementdriver.leases.Lease; import org.apache.ignite.internal.placementdriver.leases.LeaseBatch; import org.apache.ignite.internal.replicator.ReplicationGroupId; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import 
org.apache.ignite.internal.testframework.IgniteAbstractTest; import org.jetbrains.annotations.Nullable; @@ -44,22 +44,22 @@ abstract class BasePlacementDriverTest extends IgniteAbstractTest { * * @return Replication group id. */ - protected TablePartitionId createTableAssignment(MetaStorageManager metastore, int tableId, List dataNodes) { + protected ZonePartitionId createZoneAssignment(MetaStorageManager metastore, int zoneId, List dataNodes) { List> assignments = AffinityUtils.calculateAssignments(dataNodes, 1, dataNodes.size()); Map partitionAssignments = new HashMap<>(assignments.size()); for (int i = 0; i < assignments.size(); i++) { partitionAssignments.put( - stablePartAssignmentsKey(new TablePartitionId(tableId, i)), + stablePartAssignmentsKey(new ZonePartitionId(zoneId, i)), Assignments.toBytes(assignments.get(i))); } metastore.putAll(partitionAssignments).join(); - var grpPart0 = new TablePartitionId(tableId, 0); + var grpPart0 = new ZonePartitionId(zoneId, 0); - log.info("Fake table created [id={}, repGrp={}]", tableId, grpPart0); + log.info("Fake zone created [id={}, repGrp={}]", zoneId, grpPart0); return grpPart0; } diff --git a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ItPrimaryReplicaChoiceTest.java b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ItPrimaryReplicaChoiceTest.java index c6523efc825..c8ca65ceceb 100644 --- a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ItPrimaryReplicaChoiceTest.java +++ b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ItPrimaryReplicaChoiceTest.java @@ -59,6 +59,7 @@ import org.apache.ignite.internal.table.NodeUtils; import org.apache.ignite.internal.table.TableViewInternal; import org.apache.ignite.internal.testframework.IgniteTestUtils; +import org.apache.ignite.internal.testframework.WithSystemProperty; import 
org.apache.ignite.internal.testframework.flow.TestFlowUtils; import org.apache.ignite.internal.tx.InternalTransaction; import org.apache.ignite.internal.tx.impl.ReadWriteTransactionImpl; @@ -78,6 +79,7 @@ * The test class checks invariant of a primary replica choice. */ @SuppressWarnings("resource") +@WithSystemProperty(key = "IGNITE_ALWAYS_FORCE", value = "false") public class ItPrimaryReplicaChoiceTest extends ClusterPerTestIntegrationTest { private static final int AWAIT_PRIMARY_REPLICA_TIMEOUT = 10; diff --git a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/MultiActorPlacementDriverTest.java b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/MultiActorPlacementDriverTest.java index fe784efb883..b6358c9edfa 100644 --- a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/MultiActorPlacementDriverTest.java +++ b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/MultiActorPlacementDriverTest.java @@ -35,6 +35,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; @@ -71,6 +72,7 @@ import org.apache.ignite.internal.raft.configuration.RaftConfiguration; import org.apache.ignite.internal.raft.service.RaftGroupService; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.util.IgniteUtils; import org.apache.ignite.network.NetworkAddress; import org.apache.ignite.raft.jraft.rpc.impl.RaftGroupEventsClientListener; @@ -85,7 +87,11 @@ */ @ExtendWith(ConfigurationExtension.class) public class MultiActorPlacementDriverTest extends BasePlacementDriverTest { - public static final int BASE_PORT = 1234; + private 
static final int BASE_PORT = 1234; + + private static final TablePartitionId GROUP_ID = new TablePartitionId(1, 0); + + private static final ZonePartitionId ZONE_GROUP_ID = new ZonePartitionId(1, 0); private static final PlacementDriverMessagesFactory PLACEMENT_DRIVER_MESSAGES_FACTORY = new PlacementDriverMessagesFactory(); @@ -110,7 +116,7 @@ public class MultiActorPlacementDriverTest extends BasePlacementDriverTest { /** This closure handles {@link LeaseGrantedMessage} to check the placement driver manager behavior. */ private IgniteTriFunction leaseGrantHandler; - private final AtomicInteger nextTableId = new AtomicInteger(1); + private final AtomicInteger nextZoneId = new AtomicInteger(1); @BeforeEach public void beforeTest(TestInfo testInfo) { @@ -172,6 +178,7 @@ private NetworkMessageHandler leaseGrantMessageHandler(ClusterService handlerSer if (resp == null) { resp = PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse() + .appliedGroups(Set.of(GROUP_ID)) .accepted(true) .build(); } @@ -277,7 +284,8 @@ private List startPlacementDriver( logicalTopologyService, raftManager, topologyAwareRaftGroupServiceFactory, - new TestClockService(nodeClock) + new TestClockService(nodeClock), + grp -> ZONE_GROUP_ID ); res.add(new Node(nodeName, clusterService, raftManager, metaStorageManager, placementDriverManager)); @@ -290,7 +298,7 @@ private List startPlacementDriver( @Test public void testLeaseCreate() throws Exception { - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createTableAssignment(); checkLeaseCreated(grpPart0, true); } @@ -303,11 +311,12 @@ public void testLeaseProlong() throws Exception { acceptedNodeRef.compareAndSet(null, to); return PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse() + .appliedGroups(Set.of(GROUP_ID)) .accepted(true) .build(); }; - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createTableAssignment(); Lease lease = checkLeaseCreated(grpPart0, 
true); Lease leaseRenew = waitForProlong(grpPart0, lease); @@ -323,11 +332,12 @@ public void prolongAfterActiveActorChanged() throws Exception { acceptedNodeRef.compareAndSet(null, to); return PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse() + .appliedGroups(Set.of(GROUP_ID)) .accepted(true) .build(); }; - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createTableAssignment(); Lease lease = checkLeaseCreated(grpPart0, true); @@ -375,12 +385,13 @@ public void testLeaseProlongAfterRedirect() throws Exception { log.info("Lease is accepted [leaseholder={}]", to); return PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse() + .appliedGroups(Set.of(GROUP_ID)) .accepted(true) .build(); } }; - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createTableAssignment(); Lease lease = checkLeaseCreated(grpPart0, true); @@ -399,11 +410,12 @@ public void testDeclineLeaseByLeaseholder() throws Exception { activeActorRef.set(from); return PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse() + .appliedGroups(Set.of(GROUP_ID)) .accepted(true) .build(); }; - TablePartitionId grpPart = createTableAssignment(); + ZonePartitionId grpPart = createTableAssignment(); Lease lease = checkLeaseCreated(grpPart, true); @@ -424,6 +436,7 @@ public void testDeclineLeaseByLeaseholder() throws Exception { .build(); } else { return PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse() + .appliedGroups(Set.of(GROUP_ID)) .accepted(true) .build(); } @@ -435,7 +448,7 @@ public void testDeclineLeaseByLeaseholder() throws Exception { service.messagingService().send( clusterServices.get(activeActorRef.get()).topologyService().localMember(), PLACEMENT_DRIVER_MESSAGES_FACTORY.stopLeaseProlongationMessage() - .groupId(grpPart) + .groupId(GROUP_ID) .redirectProposal(proposedLeaseholder) .build() ); @@ -455,7 +468,7 @@ public void testDeclineLeaseByLeaseholder() throws Exception { * @return Renewed 
lease. * @throws InterruptedException If the waiting is interrupted. */ - private Lease waitNewLeaseholder(TablePartitionId grpPart, Lease lease) throws InterruptedException { + private Lease waitNewLeaseholder(ZonePartitionId grpPart, Lease lease) throws InterruptedException { var leaseRenewRef = new AtomicReference(); assertTrue(waitForCondition(() -> { @@ -489,7 +502,7 @@ private Lease waitNewLeaseholder(TablePartitionId grpPart, Lease lease) throws I * @return Renewed lease. * @throws InterruptedException If the waiting is interrupted. */ - private Lease waitForProlong(TablePartitionId grpPart, Lease lease) throws InterruptedException { + private Lease waitForProlong(ZonePartitionId grpPart, Lease lease) throws InterruptedException { var leaseRenewRef = new AtomicReference(); assertTrue(waitForCondition(() -> { @@ -534,7 +547,7 @@ private Lease waitForProlong(TablePartitionId grpPart, Lease lease) throws Inter * @return A lease that is read from Meta storage. * @throws InterruptedException If the waiting is interrupted. */ - private Lease checkLeaseCreated(TablePartitionId grpPartId, boolean waitAccept) throws InterruptedException { + private Lease checkLeaseCreated(ZonePartitionId grpPartId, boolean waitAccept) throws InterruptedException { AtomicReference leaseRef = new AtomicReference<>(); assertTrue(waitForCondition(() -> { @@ -567,7 +580,7 @@ private Lease checkLeaseCreated(TablePartitionId grpPartId, boolean waitAccept) * * @return Replication group id. 
*/ - private TablePartitionId createTableAssignment() { - return createTableAssignment(metaStorageManager, nextTableId.get(), nodeNames); + private ZonePartitionId createTableAssignment() { + return createZoneAssignment(metaStorageManager, nextZoneId.get(), nodeNames); } } diff --git a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/PlacementDriverManagerTest.java b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/PlacementDriverManagerTest.java index 4171b9f9acd..c5e64514c83 100644 --- a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/PlacementDriverManagerTest.java +++ b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/PlacementDriverManagerTest.java @@ -87,6 +87,7 @@ import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupServiceFactory; import org.apache.ignite.internal.raft.configuration.RaftConfiguration; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.testframework.WithSystemProperty; import org.apache.ignite.network.ClusterNode; import org.apache.ignite.network.NetworkAddress; @@ -104,6 +105,10 @@ public class PlacementDriverManagerTest extends BasePlacementDriverTest { public static final int PORT = 1234; + protected static final TablePartitionId GROUP_ID = new TablePartitionId(1, 0); + + protected static final ZonePartitionId ZONE_GROUP_ID = new ZonePartitionId(1, 0); + private static final PlacementDriverMessagesFactory PLACEMENT_DRIVER_MESSAGES_FACTORY = new PlacementDriverMessagesFactory(); private String nodeName; @@ -210,7 +215,8 @@ private void startPlacementDriverManager() { logicalTopologyService, raftManager, topologyAwareRaftGroupServiceFactory, - new TestClockService(nodeClock) + new TestClockService(nodeClock), + grp -> ZONE_GROUP_ID ); 
assertThat( @@ -244,6 +250,7 @@ private NetworkMessageHandler leaseGrantMessageHandler(String handlerNode) { if (resp == null) { resp = PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse() + .appliedGroups(Set.of(GROUP_ID)) .accepted(true) .build(); } @@ -274,7 +281,7 @@ private void stopPlacementDriverManager() throws Exception { @Test public void testLeaseCreate() throws Exception { - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createZoneAssignment(); checkLeaseCreated(grpPart0, false); } @@ -282,7 +289,7 @@ public void testLeaseCreate() throws Exception { @Test @WithSystemProperty(key = "IGNITE_LONG_LEASE", value = "200") public void testLeaseRenew() throws Exception { - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createZoneAssignment(); checkLeaseCreated(grpPart0, false); @@ -305,7 +312,7 @@ public void testLeaseRenew() throws Exception { @Test @WithSystemProperty(key = "IGNITE_LONG_LEASE", value = "200") public void testLeaseholderUpdate() throws Exception { - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createZoneAssignment(); checkLeaseCreated(grpPart0, false); @@ -337,7 +344,7 @@ public void testLeaseholderUpdate() throws Exception { @Test public void testPrimaryReplicaEvents() throws Exception { - TablePartitionId grpPart0 = createTableAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName)); + ZonePartitionId grpPart0 = createZoneAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName)); Lease lease1 = checkLeaseCreated(grpPart0, true); @@ -366,7 +373,7 @@ public void testPrimaryReplicaEvents() throws Exception { assertTrue(waitForCondition(() -> { CompletableFuture fut = placementDriverManager.placementDriver() - .getPrimaryReplica(grpPart0, lease1.getExpirationTime()); + .getPrimaryReplica(GROUP_ID, lease1.getExpirationTime()); ReplicaMeta meta = fut.join(); @@ -388,7 +395,7 @@ public 
void testPrimaryReplicaEvents() throws Exception { assertTrue(waitForCondition(() -> { CompletableFuture fut = placementDriverManager.placementDriver() - .getPrimaryReplica(grpPart0, lease2.getExpirationTime()); + .getPrimaryReplica(GROUP_ID, lease2.getExpirationTime()); ReplicaMeta meta = fut.join(); @@ -456,12 +463,12 @@ private ClusterService startAnotherNode(String nodeName, int port) throws Except @Test public void testLeaseRemovedAfterExpirationAndAssignmetnsRemoval() throws Exception { - List groupIds = List.of( - createTableAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName)), - createTableAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName)) + List groupIds = List.of( + createZoneAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName)), + createZoneAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName)) ); - Map leaseExpirationMap = + Map leaseExpirationMap = groupIds.stream().collect(Collectors.toMap(id -> id, id -> new AtomicBoolean())); groupIds.forEach(groupId -> { @@ -494,7 +501,7 @@ public void testLeaseRemovedAfterExpirationAndAssignmetnsRemoval() throws Except @Test public void testLeaseAccepted() throws Exception { - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createZoneAssignment(); checkLeaseCreated(grpPart0, true); } @@ -504,10 +511,11 @@ public void testLeaseForceAccepted() throws Exception { leaseGrantHandler = (req, handler) -> PLACEMENT_DRIVER_MESSAGES_FACTORY .leaseGrantedMessageResponse() + .appliedGroups(Set.of(GROUP_ID)) .accepted(req.force()) .build(); - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createZoneAssignment(); checkLeaseCreated(grpPart0, true); } @@ -522,7 +530,7 @@ public void testExceptionOnAcceptance() throws Exception { throw new RuntimeException("test"); }; - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = 
createZoneAssignment(); checkLeaseCreated(grpPart0, false); @@ -543,25 +551,27 @@ public void testRedirectionAcceptance() throws Exception { return PLACEMENT_DRIVER_MESSAGES_FACTORY .leaseGrantedMessageResponse() + .appliedGroups(Set.of(GROUP_ID)) .accepted(false) .redirectProposal(redirect.get()) .build(); } else { return PLACEMENT_DRIVER_MESSAGES_FACTORY .leaseGrantedMessageResponse() + .appliedGroups(Set.of(GROUP_ID)) .accepted(redirect.get().equals(handler)) .build(); } }; - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createZoneAssignment(); checkLeaseCreated(grpPart0, true); } @Test public void testLeaseRestore() throws Exception { - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createZoneAssignment(); checkLeaseCreated(grpPart0, false); @@ -581,7 +591,7 @@ public void testLeaseMatchGrantMessage() throws Exception { return null; }; - TablePartitionId grpPart0 = createTableAssignment(); + ZonePartitionId grpPart0 = createZoneAssignment(); Lease lease = checkLeaseCreated(grpPart0, false); @@ -599,7 +609,7 @@ public void testLeaseMatchGrantMessage() throws Exception { * @return A lease that is read from Meta storage. * @throws InterruptedException If the waiting is interrupted. */ - private Lease checkLeaseCreated(TablePartitionId grpPartId, boolean waitAccept) throws InterruptedException { + private Lease checkLeaseCreated(ZonePartitionId grpPartId, boolean waitAccept) throws InterruptedException { AtomicReference leaseRef = new AtomicReference<>(); assertTrue(waitForCondition(() -> { @@ -632,8 +642,8 @@ private Lease checkLeaseCreated(TablePartitionId grpPartId, boolean waitAccept) * * @return Replication group id. 
*/ - private TablePartitionId createTableAssignment() { - return createTableAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName, anotherNodeName)); + private ZonePartitionId createZoneAssignment() { + return createZoneAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName, anotherNodeName)); } /** diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/AssignmentsTracker.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/AssignmentsTracker.java index 11e4da4de6f..b8e9edb2606 100644 --- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/AssignmentsTracker.java +++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/AssignmentsTracker.java @@ -38,7 +38,7 @@ import org.apache.ignite.internal.metastorage.WatchEvent; import org.apache.ignite.internal.metastorage.WatchListener; import org.apache.ignite.internal.replicator.ReplicationGroupId; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.util.Cursor; /** @@ -94,7 +94,7 @@ public void startTrack() { strKey = strKey.replace(STABLE_ASSIGNMENTS_PREFIX, ""); - TablePartitionId grpId = TablePartitionId.fromString(strKey); + ZonePartitionId grpId = ZonePartitionId.fromString(strKey); Set assignments = Assignments.fromBytes(entry.value()).nodes(); @@ -144,7 +144,7 @@ public CompletableFuture onUpdate(WatchEvent event) { for (EntryEvent evt : event.entryEvents()) { Entry entry = evt.newEntry(); - var replicationGrpId = TablePartitionId.fromString( + var replicationGrpId = ZonePartitionId.fromString( new String(entry.key(), StandardCharsets.UTF_8).replace(STABLE_ASSIGNMENTS_PREFIX, "")); if (entry.tombstone()) { diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/LeaseUpdater.java 
b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/LeaseUpdater.java index 3ed175c099c..be26e38123b 100644 --- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/LeaseUpdater.java +++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/LeaseUpdater.java @@ -26,6 +26,7 @@ import static org.apache.ignite.internal.placementdriver.PlacementDriverManager.PLACEMENTDRIVER_LEASES_KEY; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.List; @@ -34,6 +35,7 @@ import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Function; import org.apache.ignite.internal.affinity.Assignment; import org.apache.ignite.internal.cluster.management.topology.api.LogicalTopologyService; import org.apache.ignite.internal.hlc.ClockService; @@ -56,6 +58,8 @@ import org.apache.ignite.internal.placementdriver.negotiation.LeaseAgreement; import org.apache.ignite.internal.placementdriver.negotiation.LeaseNegotiator; import org.apache.ignite.internal.replicator.ReplicationGroupId; +import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.thread.IgniteThread; import org.apache.ignite.internal.tostring.IgniteToStringInclude; import org.apache.ignite.internal.tostring.S; @@ -80,6 +84,8 @@ public class LeaseUpdater { /** Lease holding interval. */ private static final long LEASE_INTERVAL = 10 * UPDATE_LEASE_MS; + private final boolean alwaysForce = IgniteSystemProperties.getBoolean("IGNITE_ALWAYS_FORCE", true); + /** The lock is available when the actor is changing state. */ private final IgniteSpinBusyLock stateChangingLock = new IgniteSpinBusyLock(); @@ -106,6 +112,8 @@ public class LeaseUpdater { /** Cluster clock. 
*/ private final ClockService clockService; + private final Function groupIdProvider; + /** Closure to update leases. */ private final Updater updater; @@ -133,13 +141,15 @@ public class LeaseUpdater { MetaStorageManager msManager, LogicalTopologyService topologyService, LeaseTracker leaseTracker, - ClockService clockService + ClockService clockService, + Function groupIdProvider ) { this.nodeName = nodeName; this.clusterService = clusterService; this.msManager = msManager; this.leaseTracker = leaseTracker; this.clockService = clockService; + this.groupIdProvider = groupIdProvider; this.longLeaseInterval = IgniteSystemProperties.getLong("IGNITE_LONG_LEASE", 120_000); this.assignmentsTracker = new AssignmentsTracker(msManager); @@ -368,7 +378,7 @@ private void updateLeaseBatchInternal() { agreement.checkValid(grpId, topologyTracker.currentTopologySnapshot(), assignments); if (agreement.isAccepted()) { - publishLease(grpId, lease, renewedLeases); + publishLease(grpId, lease, renewedLeases, agreement.applicableFor()); continue; } else if (agreement.isDeclined()) { @@ -438,6 +448,11 @@ private void updateLeaseBatchInternal() { ); } + if (Arrays.equals(leasesCurrent.leasesBytes(), renewedValue)) { + LOG.info("No leases to update found."); + return; + } + msManager.invoke( or(notExists(key), value(key).eq(leasesCurrent.leasesBytes())), put(key, renewedValue), @@ -457,7 +472,7 @@ private void updateLeaseBatchInternal() { for (Map.Entry entry : toBeNegotiated.entrySet()) { Lease lease = renewedLeases.get(entry.getKey()); - boolean force = entry.getValue(); + boolean force = alwaysForce || entry.getValue(); leaseNegotiator.negotiate(lease, force); } @@ -509,11 +524,18 @@ private void prolongLease(ReplicationGroupId grpId, Lease lease, Map renewedLeases) { + private void publishLease( + ReplicationGroupId grpId, + Lease lease, + Map renewedLeases, + Set subGrps + ) { var newTs = new HybridTimestamp(clockService.now().getPhysical() + LEASE_INTERVAL, 0); - Lease renewedLease = 
lease.acceptLease(newTs); + Lease renewedLease = lease.acceptLease(newTs, subGrps); renewedLeases.put(grpId, renewedLease); @@ -613,17 +635,23 @@ public void onReceived(NetworkMessage msg0, ClusterNode sender, @Nullable Long c private void processMessageInternal(String sender, PlacementDriverActorMessage msg) { ReplicationGroupId grpId = msg.groupId(); - Lease lease = leaseTracker.getLease(grpId); + assert grpId instanceof TablePartitionId : "Unexpected replication group type [grp=" + grpId + "]."; + + var tblPartId = (TablePartitionId) grpId; + + ReplicationGroupId grpId0 = groupIdProvider.apply(tblPartId); + + Lease lease = leaseTracker.getLease(grpId0); if (msg instanceof StopLeaseProlongationMessage) { if (lease.isProlongable() && sender.equals(lease.getLeaseholder())) { StopLeaseProlongationMessage stopLeaseProlongationMessage = (StopLeaseProlongationMessage) msg; - denyLease(grpId, lease, stopLeaseProlongationMessage.redirectProposal()).whenComplete((res, th) -> { + denyLease(grpId0, lease, stopLeaseProlongationMessage.redirectProposal()).whenComplete((res, th) -> { if (th != null) { - LOG.warn("Prolongation denial failed due to exception [groupId={}]", th, grpId); + LOG.warn("Prolongation denial failed due to exception [groupId={}]", th, grpId0); } else { - LOG.info("Stop lease prolongation message was handled [groupId={}, sender={}, deny={}]", grpId, sender, res); + LOG.info("Stop lease prolongation message was handled [groupId={}, sender={}, deny={}]", grpId0, sender, res); } }); } diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriverManager.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriverManager.java index 9edc6dbbfb9..f35b084c33a 100644 --- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriverManager.java +++ 
b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriverManager.java @@ -25,6 +25,7 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Consumer; +import java.util.function.Function; import java.util.function.Supplier; import org.apache.ignite.internal.cluster.management.topology.api.LogicalTopologyService; import org.apache.ignite.internal.hlc.ClockService; @@ -41,6 +42,8 @@ import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupService; import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupServiceFactory; import org.apache.ignite.internal.replicator.ReplicationGroupId; +import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.util.IgniteSpinBusyLock; import org.apache.ignite.network.ClusterNode; import org.jetbrains.annotations.TestOnly; @@ -110,7 +113,8 @@ public PlacementDriverManager( LogicalTopologyService logicalTopologyService, RaftManager raftManager, TopologyAwareRaftGroupServiceFactory topologyAwareRaftGroupServiceFactory, - ClockService clockService + ClockService clockService, + Function tablePartIdToZoneIdProvider ) { this.replicationGroupId = replicationGroupId; this.clusterService = clusterService; @@ -121,7 +125,13 @@ public PlacementDriverManager( this.raftClientFuture = new CompletableFuture<>(); - this.leaseTracker = new LeaseTracker(metastore, clusterService.topologyService(), clockService); + this.leaseTracker = new LeaseTracker( + nodeName, + metastore, + clusterService.topologyService(), + clockService, + tablePartIdToZoneIdProvider + ); this.leaseUpdater = new LeaseUpdater( nodeName, @@ -129,7 +139,8 @@ public PlacementDriverManager( metastore, logicalTopologyService, leaseTracker, - clockService + clockService, + tablePartIdToZoneIdProvider ); } diff --git 
a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/Lease.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/Lease.java index d613fa3e6d5..f26c2181067 100644 --- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/Lease.java +++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/Lease.java @@ -27,7 +27,9 @@ import static org.apache.ignite.internal.util.ByteUtils.toBytes; import java.nio.ByteBuffer; +import java.util.Collections; import java.util.Objects; +import java.util.Set; import org.apache.ignite.internal.hlc.HybridTimestamp; import org.apache.ignite.internal.placementdriver.ReplicaMeta; import org.apache.ignite.internal.replicator.ReplicationGroupId; @@ -36,8 +38,7 @@ import org.jetbrains.annotations.Nullable; /** - * A lease representation in memory. - * The real lease is stored in Meta storage. + * A lease representation in memory. The real lease is stored in Meta storage. */ public class Lease implements ReplicaMeta { private static final long serialVersionUID = 394641185393949608L; @@ -67,6 +68,9 @@ public class Lease implements ReplicaMeta { /** ID of replication group. */ private final ReplicationGroupId replicationGroupId; + /** Table partition replication groups. */ + private final Set subgroups; + /** * Creates a new lease. * @@ -83,7 +87,7 @@ public Lease( HybridTimestamp leaseExpirationTime, ReplicationGroupId replicationGroupId ) { - this(leaseholder, leaseholderId, startTime, leaseExpirationTime, false, false, null, replicationGroupId); + this(leaseholder, leaseholderId, startTime, leaseExpirationTime, false, false, null, replicationGroupId, Collections.emptySet()); } /** @@ -95,9 +99,10 @@ public Lease( * @param leaseExpirationTime Lease expiration timestamp. * @param prolong Lease is available to prolong. * @param accepted The flag is {@code true} when the holder accepted the lease. 
- * @param proposedCandidate The name of a node that is proposed to be a next leaseholder. This is not null in case when the lease - * is not prolongable. + * @param proposedCandidate The name of a node that is proposed to be a next leaseholder. This is not null in case when the + * lease is not prolongable. * @param replicationGroupId ID of replication group. + * @param subgroups Table partition replication groups. */ public Lease( @Nullable String leaseholder, @@ -107,7 +112,8 @@ public Lease( boolean prolong, boolean accepted, @Nullable String proposedCandidate, - ReplicationGroupId replicationGroupId + ReplicationGroupId replicationGroupId, + Set subgroups ) { assert (leaseholder == null) == (leaseholderId == null) : "leaseholder=" + leaseholder + ", leaseholderId=" + leaseholderId; @@ -121,6 +127,7 @@ public Lease( this.accepted = accepted; this.replicationGroupId = replicationGroupId; this.proposedCandidate = proposedCandidate; + this.subgroups = subgroups; } /** @@ -133,7 +140,17 @@ public Lease prolongLease(HybridTimestamp to) { assert accepted : "The lease should be accepted by leaseholder before prolongation: [lease=" + this + ", to=" + to + ']'; assert prolongable : "The lease should be available to prolong: [lease=" + this + ", to=" + to + ']'; - return new Lease(leaseholder, leaseholderId, startTime, to, true, true, null, replicationGroupId); + return new Lease( + leaseholder, + leaseholderId, + startTime, + to, + true, + true, + null, + replicationGroupId, + subgroups + ); } /** @@ -142,10 +159,10 @@ public Lease prolongLease(HybridTimestamp to) { * @param to The new lease expiration timestamp. * @return A accepted lease. 
*/ - public Lease acceptLease(HybridTimestamp to) { + public Lease acceptLease(HybridTimestamp to, Set parts) { assert !accepted : "The lease is already accepted: " + this; - return new Lease(leaseholder, leaseholderId, startTime, to, true, true, null, replicationGroupId); + return new Lease(leaseholder, leaseholderId, startTime, to, true, true, null, replicationGroupId, parts); } /** @@ -156,7 +173,17 @@ public Lease acceptLease(HybridTimestamp to) { public Lease denyLease(String proposedCandidate) { assert accepted : "The lease is not accepted: " + this; - return new Lease(leaseholder, leaseholderId, startTime, expirationTime, false, true, proposedCandidate, replicationGroupId); + return new Lease( + leaseholder, + leaseholderId, + startTime, + expirationTime, + false, + true, + proposedCandidate, + replicationGroupId, + subgroups + ); } @Override @@ -179,6 +206,11 @@ public HybridTimestamp getExpirationTime() { return expirationTime; } + @Override + public Set subgroups() { + return subgroups; + } + /** Returns {@code true} if the lease might be prolonged. 
*/ public boolean isProlongable() { return prolongable; @@ -210,11 +242,15 @@ public byte[] bytes() { byte[] leaseholderIdBytes = stringToBytes(leaseholderId); byte[] proposedCandidateBytes = stringToBytes(proposedCandidate); byte[] groupIdBytes = toBytes(replicationGroupId); + byte[] subgroupsBytes = toBytes(subgroups); int bufSize = 2 // accepted + prolongable + HYBRID_TIMESTAMP_SIZE * 2 // startTime + expirationTime - + bytesSizeForWrite(leaseholderBytes) + bytesSizeForWrite(leaseholderIdBytes) + bytesSizeForWrite(proposedCandidateBytes) - + bytesSizeForWrite(groupIdBytes); + + bytesSizeForWrite(leaseholderBytes) + + bytesSizeForWrite(leaseholderIdBytes) + + bytesSizeForWrite(proposedCandidateBytes) + + bytesSizeForWrite(groupIdBytes) + + bytesSizeForWrite(subgroupsBytes); ByteBuffer buf = ByteBuffer.allocate(bufSize).order(LITTLE_ENDIAN); @@ -228,6 +264,7 @@ public byte[] bytes() { putBytes(buf, leaseholderIdBytes); putBytes(buf, proposedCandidateBytes); putBytes(buf, groupIdBytes); + putBytes(buf, subgroupsBytes); return buf.array(); } @@ -252,8 +289,9 @@ public static Lease fromBytes(ByteBuffer buf) { String proposedCandidate = stringFromBytes(getBytes(buf)); ReplicationGroupId groupId = ByteUtils.fromBytes(getBytes(buf)); + Set parts = ByteUtils.fromBytes(getBytes(buf)); - return new Lease(leaseholder, leaseholderId, startTime, expirationTime, prolongable, accepted, proposedCandidate, groupId); + return new Lease(leaseholder, leaseholderId, startTime, expirationTime, prolongable, accepted, proposedCandidate, groupId, parts); } /** diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/LeaseTracker.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/LeaseTracker.java index b671009dc90..bc7010b1116 100644 --- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/LeaseTracker.java +++ 
b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/LeaseTracker.java @@ -23,6 +23,11 @@ import static java.util.concurrent.CompletableFuture.allOf; import static java.util.concurrent.CompletableFuture.completedFuture; import static org.apache.ignite.internal.hlc.HybridTimestamp.MIN_VALUE; +import static org.apache.ignite.internal.metastorage.dsl.Conditions.notExists; +import static org.apache.ignite.internal.metastorage.dsl.Conditions.or; +import static org.apache.ignite.internal.metastorage.dsl.Conditions.value; +import static org.apache.ignite.internal.metastorage.dsl.Operations.noop; +import static org.apache.ignite.internal.metastorage.dsl.Operations.put; import static org.apache.ignite.internal.placementdriver.PlacementDriverManager.PLACEMENTDRIVER_LEASES_KEY; import static org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent.PRIMARY_REPLICA_ELECTED; import static org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent.PRIMARY_REPLICA_EXPIRED; @@ -35,17 +40,21 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Function; import org.apache.ignite.internal.event.AbstractEventProducer; import org.apache.ignite.internal.hlc.ClockService; import org.apache.ignite.internal.hlc.HybridTimestamp; -import org.apache.ignite.internal.lang.IgniteStringFormatter; import org.apache.ignite.internal.logger.IgniteLogger; import org.apache.ignite.internal.logger.Loggers; import org.apache.ignite.internal.metastorage.Entry; @@ -60,7 +69,12 @@ import 
org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; import org.apache.ignite.internal.replicator.ReplicationGroupId; +import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; +import org.apache.ignite.internal.replicator.exception.PrimaryReplicaMissException; +import org.apache.ignite.internal.thread.NamedThreadFactory; import org.apache.ignite.internal.util.IgniteSpinBusyLock; +import org.apache.ignite.internal.util.IgniteUtils; import org.apache.ignite.internal.util.PendingIndependentComparableValuesTracker; import org.apache.ignite.network.ClusterNode; import org.apache.ignite.network.ClusterNodeResolver; @@ -99,18 +113,35 @@ public class LeaseTracker extends AbstractEventProducer tablePartIdToZoneIdProvider; + private final ClockService clockService; + /** Node name. */ + private final String nodeName; + + /** Repeated Meta storage lease subgroup updates will be handled in this thread pool. */ + private ExecutorService leaseUpdateRetryExecutor; + /** * Constructor. * + * @param nodeName Node name. * @param msManager Meta storage manager. * @param clockService Clock service. 
*/ - public LeaseTracker(MetaStorageManager msManager, ClusterNodeResolver clusterNodeResolver, ClockService clockService) { + public LeaseTracker( + String nodeName, + MetaStorageManager msManager, + ClusterNodeResolver clusterNodeResolver, + ClockService clockService, + Function tablePartIdToZoneIdProvider + ) { + this.nodeName = nodeName; this.msManager = msManager; this.clusterNodeResolver = clusterNodeResolver; this.clockService = clockService; + this.tablePartIdToZoneIdProvider = tablePartIdToZoneIdProvider; } /** @@ -123,9 +154,101 @@ public void startTrack(long recoveryRevision) { msManager.registerExactWatch(PLACEMENTDRIVER_LEASES_KEY, updateListener); loadLeasesBusyAsync(recoveryRevision); + + leaseUpdateRetryExecutor = Executors.newSingleThreadExecutor( + NamedThreadFactory.create(nodeName, "lease-update-retry-executor", LOG) + ); }); } + @Override + public CompletableFuture addSubgroups( + ZonePartitionId zoneId, + Long enlistmentConsistencyToken, + Set subGrps + ) { + // Read the leases once so the checks below and the conditional Meta storage update use the same snapshot. + Leases leasesCurrent = leases; + Map previousLeasesMap = leasesCurrent.leaseByGroupId(); + Lease previousLease = previousLeasesMap.get(zoneId); + + if (previousLease != null && previousLease.subgroups().containsAll(subGrps)) { + return nullCompletedFuture(); + } + + CompletableFuture resultFut = new CompletableFuture<>(); + + Map renewedLeases = new HashMap<>(previousLeasesMap); + + if (previousLease != null && enlistmentConsistencyToken.equals(previousLease.getStartTime().longValue())) { + HashSet subgroups = new HashSet<>(previousLease.subgroups()); + + subgroups.addAll(subGrps); + + renewedLeases.put(zoneId, new Lease( + previousLease.getLeaseholder(), + previousLease.getLeaseholderId(), + previousLease.getStartTime(), + previousLease.getExpirationTime(), + previousLease.isProlongable(), + previousLease.isAccepted(), + null, + previousLease.replicationGroupId(), + subgroups)); + } else { + resultFut.completeExceptionally(new PrimaryReplicaMissException( + nodeName, + null, +
"localNode.id()", + null, + null, + null, + null + )); + + return resultFut; + } + + byte[] renewedValue = new LeaseBatch(renewedLeases.values()).bytes(); + + msManager.invoke( + or(notExists(PLACEMENTDRIVER_LEASES_KEY), value(PLACEMENTDRIVER_LEASES_KEY).eq(leasesCurrent.leasesBytes())), + put(PLACEMENTDRIVER_LEASES_KEY, renewedValue), + noop() + ).whenCompleteAsync((invokeResult, throwable) -> { + if (throwable != null) { + resultFut.completeExceptionally(throwable); + + return; + } + + if (invokeResult) { + resultFut.complete(null); + } else { + try { + // Throttling. + Thread.sleep(200); + } catch (InterruptedException e) { + // Restore the interrupt flag and fail the result; throwing here would leave resultFut incomplete forever. + Thread.currentThread().interrupt(); + resultFut.completeExceptionally(e); + + return; + } + + addSubgroups(zoneId, enlistmentConsistencyToken, subGrps).whenComplete((unused, throwable1) -> { + if (throwable1 != null) { + resultFut.completeExceptionally(throwable1); + } else { + resultFut.complete(null); + } + }); + } + }, leaseUpdateRetryExecutor); + + return resultFut; + } + /** Stops the tracker. */ public void stopTrack() { if (!stopGuard.compareAndSet(false, true)) { @@ -137,6 +255,8 @@ public void stopTrack() { primaryReplicaWaiters.forEach((groupId, pendingTracker) -> pendingTracker.close()); primaryReplicaWaiters.clear(); + IgniteUtils.shutdownAndAwaitTermination(leaseUpdateRetryExecutor, 10, TimeUnit.SECONDS); + msManager.unregisterWatch(updateListener); } @@ -152,6 +272,8 @@ public CompletableFuture previousPrimaryExpired(ReplicationGroupId grpId) * @return A lease is associated with the group. */ public Lease getLease(ReplicationGroupId grpId) { + assert grpId instanceof ZonePartitionId : "Unexpected replication group type [grp=" + grpId + "]."; + Leases leases = this.leases; assert leases != null : "Leases not initialized, probably the local placement driver actor hasn't started lease tracking."; @@ -161,6 +283,13 @@ public Lease getLease(ReplicationGroupId grpId) { return lease == null ?
emptyLease(grpId) : lease; } + @Override + public ReplicaMeta getLeaseMeta(ReplicationGroupId grpId) { + Lease lease = getLease(grpId); + + return lease.isAccepted() ? lease : null; + } + /** Returns collection of leases, ordered by replication group. */ public Leases leasesCurrent() { return leases; @@ -172,7 +301,7 @@ private class UpdateListener implements WatchListener { public CompletableFuture onUpdate(WatchEvent event) { return inBusyLockAsync(busyLock, () -> { List> fireEventFutures = new ArrayList<>(); - List expiredLeases = new ArrayList<>(); + HashMap expiredLeases = new HashMap<>(); for (EntryEvent entry : event.entryEvents()) { Entry msEntry = entry.newEntry(); @@ -189,35 +318,45 @@ public CompletableFuture onUpdate(WatchEvent event) { leasesMap.put(grpId, lease); + Lease previousLease = previousLeasesMap.get(grpId); + if (lease.isAccepted()) { primaryReplicaWaiters .computeIfAbsent(grpId, groupId -> new PendingIndependentComparableValuesTracker<>(MIN_VALUE)) .update(lease.getExpirationTime(), lease); - if (needFireEventReplicaBecomePrimary(previousLeasesMap.get(grpId), lease)) { - fireEventFutures.add(fireEventReplicaBecomePrimary(event.revision(), lease)); + for (ReplicationGroupId groupToNotify : needFireEventReplicaBecomePrimary(previousLease, lease)) { + fireEventFutures.add(fireEventReplicaBecomePrimary(groupToNotify, event.revision(), lease)); } } - if (needToFireEventReplicaExpired(grpId, lease)) { - expiredLeases.add(leases.leaseByGroupId().get(grpId)); + if (previousLease != null && previousLease.isAccepted()) { + for (ReplicationGroupId groupToNotify : needFireEventReplicaExpired(previousLease, lease)) { + expiredLeases.put(groupToNotify, previousLease); + } } } - for (ReplicationGroupId grpId : leases.leaseByGroupId().keySet()) { + for (Map.Entry replicaLease : previousLeasesMap.entrySet()) { + ReplicationGroupId grpId = replicaLease.getKey(); + if (!leasesMap.containsKey(grpId)) { tryRemoveTracker(grpId); - if 
(needToFireEventReplicaExpired(grpId, null)) { - expiredLeases.add(leases.leaseByGroupId().get(grpId)); + Lease previousLease = previousLeasesMap.get(grpId); + + if (previousLease.isAccepted()) { + for (ReplicationGroupId groupToNotify : needFireEventReplicaExpired(previousLease, null)) { + expiredLeases.put(groupToNotify, previousLease); + } } } } leases = new Leases(unmodifiableMap(leasesMap), leasesBytes); - for (Lease expiredLease : expiredLeases) { - firePrimaryReplicaExpiredEvent(event.revision(), expiredLease); + for (Map.Entry expiredLease : expiredLeases.entrySet()) { + fireEventPrimaryReplicaExpired(expiredLease.getKey(), event.revision(), expiredLease.getValue()); } } @@ -260,25 +399,56 @@ public CompletableFuture awaitPrimaryReplica( long timeout, TimeUnit unit ) { + assert groupId instanceof TablePartitionId : "Unexpected replication group type [grp=" + groupId + "]."; + + var tblPartId = (TablePartitionId) groupId; + + ReplicationGroupId groupId0 = tablePartIdToZoneIdProvider.apply(tblPartId); + + return awaitPrimaryReplicaForTable( + groupId0, + timestamp, + timeout, + unit + ); + } + + @Override + public CompletableFuture awaitPrimaryReplicaForTable( + ReplicationGroupId groupId, + HybridTimestamp timestamp, + long timeout, + TimeUnit unit + ) { + assert groupId instanceof ZonePartitionId : "Unexpected replication group type [grp=" + groupId + "]."; + + var zonePartId = ((ZonePartitionId) groupId).purify(); + CompletableFuture future = new CompletableFuture<>(); - awaitPrimaryReplica(groupId, timestamp, future); + awaitPrimaryReplica(zonePartId, timestamp, future); return future .orTimeout(timeout, unit) .exceptionally(e -> { if (e instanceof TimeoutException) { - throw new PrimaryReplicaAwaitTimeoutException(groupId, timestamp, leases.leaseByGroupId().get(groupId), e); + throw new PrimaryReplicaAwaitTimeoutException(zonePartId, timestamp, leases.leaseByGroupId().get(zonePartId), e); } - throw new PrimaryReplicaAwaitException(groupId, timestamp, e); 
+ throw new PrimaryReplicaAwaitException(zonePartId, timestamp, e); }); } @Override - public CompletableFuture getPrimaryReplica(ReplicationGroupId replicationGroupId, HybridTimestamp timestamp) { + public CompletableFuture getPrimaryReplica(ReplicationGroupId groupId, HybridTimestamp timestamp) { + assert groupId instanceof TablePartitionId : "Unexpected replication group type [grp=" + groupId + "]."; + + var tblPartId = (TablePartitionId) groupId; + + ReplicationGroupId groupId0 = tablePartIdToZoneIdProvider.apply(tblPartId); + return inBusyLockAsync(busyLock, () -> { - Lease lease = getLease(replicationGroupId); + Lease lease = getLease(groupId0); if (lease.isAccepted() && clockService.after(lease.getExpirationTime(), timestamp)) { return completedFuture(lease); @@ -288,7 +458,7 @@ public CompletableFuture getPrimaryReplica(ReplicationGroupId repli .clusterTime() .waitFor(timestamp.addPhysicalTime(clockService.maxClockSkewMillis())) .thenApply(ignored -> inBusyLock(busyLock, () -> { - Lease lease0 = getLease(replicationGroupId); + Lease lease0 = getLease(groupId0); if (lease0.isAccepted() && clockService.after(lease0.getExpirationTime(), timestamp)) { return lease0; @@ -352,60 +522,79 @@ private void loadLeasesBusyAsync(long recoveryRevision) { /** * Fires the primary replica expire event if it needs. * - * @param grpId Group id, used for the cases when the {@code lease} parameter is null. Should be always not null. - * @param lease Lease to check on expiration. - * @return Whether the event is needed. + * @param previousLease Lease to check on expiration. + * @param newLease A new lease. + * @return Collection of replication group ids, which are needed to be notified. 
*/ - private boolean needToFireEventReplicaExpired(ReplicationGroupId grpId, @Nullable Lease lease) { - assert lease == null || lease.replicationGroupId().equals(grpId) - : IgniteStringFormatter.format("Group id mismatch [groupId={}, lease={}]", grpId, lease); + private Set needFireEventReplicaExpired(Lease previousLease, @Nullable Lease newLease) { + assert previousLease.isAccepted() : previousLease; - Lease currentLease = leases.leaseByGroupId().get(grpId); + if (newLease == null || !newLease.isAccepted() || !newLease.getStartTime().equals(previousLease.getStartTime())) { + return previousLease.subgroups(); + } - if (currentLease != null && currentLease.isAccepted()) { - boolean sameLease = lease != null && currentLease.getStartTime().equals(lease.getStartTime()); + Set needToBeNotified = new HashSet<>(previousLease.subgroups()); - if (!sameLease) { - return true; - } - } + needToBeNotified.removeAll(newLease.subgroups()); - return false; + return needToBeNotified; } /** * Fires the primary replica expire event. * + * @param groupId Replication group id. * @param causalityToken Causality token. * @param expiredLease Expired lease. 
*/ - private void firePrimaryReplicaExpiredEvent(long causalityToken, Lease expiredLease) { - ReplicationGroupId grpId = expiredLease.replicationGroupId(); + private void fireEventPrimaryReplicaExpired(ReplicationGroupId groupId, long causalityToken, Lease expiredLease) { + TablePartitionId tablePartitionId = (TablePartitionId) groupId; + + ZonePartitionId zonePartitionId = (ZonePartitionId) expiredLease.replicationGroupId(); - CompletableFuture prev = expirationFutureByGroup.put(grpId, fireEvent( + CompletableFuture fut = fireEvent( PRIMARY_REPLICA_EXPIRED, new PrimaryReplicaEventParameters( causalityToken, - grpId, + new ZonePartitionId(zonePartitionId.zoneId(), tablePartitionId.tableId(), zonePartitionId.partitionId()), expiredLease.getLeaseholderId(), expiredLease.getLeaseholder(), expiredLease.getStartTime() ) - )); + ); + + CompletableFuture prev = expirationFutureByGroup.put( + groupId, + fut + ); - assert prev == null || prev.isDone() : "Previous lease expiration process has not completed yet [grpId=" + grpId + ']'; + assert prev == null || prev.isDone() : + "Previous lease expiration process has not completed yet [grpId=" + expiredLease.replicationGroupId() + + ", subGrpId=" + groupId + ']'; } - private CompletableFuture fireEventReplicaBecomePrimary(long causalityToken, Lease lease) { + /** + * Fires the replica become primary event. + * + * @param groupId Replication group id. + * @param causalityToken Causality token. + * @param lease A new lease. + * @return Future to notification complete. 
+ */ + private CompletableFuture fireEventReplicaBecomePrimary(ReplicationGroupId groupId, long causalityToken, Lease lease) { String leaseholderId = lease.getLeaseholderId(); + ZonePartitionId zonePartitionId = (ZonePartitionId) lease.replicationGroupId(); + + TablePartitionId tablePartitionId = (TablePartitionId) groupId; + assert leaseholderId != null : lease; return fireEvent( PRIMARY_REPLICA_ELECTED, new PrimaryReplicaEventParameters( causalityToken, - lease.replicationGroupId(), + new ZonePartitionId(zonePartitionId.zoneId(), tablePartitionId.tableId(), zonePartitionId.partitionId()), leaseholderId, lease.getLeaseholder(), lease.getStartTime() @@ -418,11 +607,19 @@ private CompletableFuture fireEventReplicaBecomePrimary(long causalityToke * * @param previousLease Previous group lease, {@code null} if absent. * @param newLease New group lease. - * @return {@code true} if there is no previous lease for the group or the new lease is not prolongation. + * @return Collection of replication group ids, which are needed to be notified. 
*/ - private static boolean needFireEventReplicaBecomePrimary(@Nullable Lease previousLease, Lease newLease) { + private static Set needFireEventReplicaBecomePrimary(@Nullable Lease previousLease, Lease newLease) { assert newLease.isAccepted() : newLease; - return previousLease == null || !previousLease.isAccepted() || !previousLease.getStartTime().equals(newLease.getStartTime()); + if (previousLease == null || !previousLease.isAccepted() || !previousLease.getStartTime().equals(newLease.getStartTime())) { + return newLease.subgroups(); + } + + Set needToBeNotified = new HashSet<>(newLease.subgroups()); + + needToBeNotified.removeAll(previousLease.subgroups()); + + return needToBeNotified; } } diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseAgreement.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseAgreement.java index fa1fce72e9e..faabba6012d 100644 --- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseAgreement.java +++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseAgreement.java @@ -22,6 +22,7 @@ import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture; import static org.apache.ignite.internal.util.IgniteUtils.findAny; +import java.util.Collections; import java.util.Set; import java.util.concurrent.CompletableFuture; import org.apache.ignite.internal.affinity.Assignment; @@ -116,6 +117,19 @@ public String getRedirectTo() { return resp != null ? resp.redirectProposal() : null; } + /** + * The lease was considered by the set of replication subgroups. + * + * @return A set of applied groups. + */ + public Set applicableFor() { + assert ready() : "The method should be invoked only after the agreement is ready"; + + LeaseGrantedMessageResponse resp = responseFut.join(); + + return resp != null ? 
resp.appliedGroups() : Collections.emptySet(); + } + /** * Returns true if the agreement is negotiated, false otherwise. * diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseNegotiator.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseNegotiator.java index ca3b719888f..8bd9a76e955 100644 --- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseNegotiator.java +++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseNegotiator.java @@ -90,6 +90,8 @@ public void negotiate(Lease lease, boolean force) { LeaseGrantedMessageResponse response = (LeaseGrantedMessageResponse) msg; + assert !response.accepted() || response.appliedGroups() != null : response; + fut.complete(response); } else { if (!(unwrapCause(throwable) instanceof NodeStoppingException)) { diff --git a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseNegotiationTest.java b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseNegotiationTest.java index 82b217f184b..80535fcb221 100644 --- a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseNegotiationTest.java +++ b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseNegotiationTest.java @@ -59,6 +59,7 @@ import org.apache.ignite.internal.placementdriver.message.LeaseGrantedMessageResponse; import org.apache.ignite.internal.placementdriver.message.PlacementDriverMessagesFactory; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest; import org.apache.ignite.network.NetworkAddress; import org.apache.ignite.network.TopologyService; @@ -72,7 +73,7 @@ public class 
LeaseNegotiationTest extends BaseIgniteAbstractTest { private static final PlacementDriverMessagesFactory MSG_FACTORY = new PlacementDriverMessagesFactory(); - private static final TablePartitionId GROUP_ID = new TablePartitionId(0, 0); + private static final ZonePartitionId ZONE_PARTITION_ID = new ZonePartitionId(0, 0); private static final String NODE_0_NAME = "node0"; private static final LogicalNode CLUSTER_NODE_0 = new LogicalNode(randomUUID().toString(), NODE_0_NAME, mock(NetworkAddress.class)); @@ -144,9 +145,11 @@ private LeaseUpdater createLeaseUpdater() { when(pdClusterService.topologyService()).thenAnswer(inv -> pdTopologyService); LeaseTracker leaseTracker = new LeaseTracker( + NODE_0_NAME, metaStorageManager, pdClusterService.topologyService(), - new TestClockService(new HybridClockImpl()) + new TestClockService(new HybridClockImpl()), + grpId -> ZONE_PARTITION_ID ); leaseTracker.startTrack(0L); @@ -157,12 +160,16 @@ private LeaseUpdater createLeaseUpdater() { metaStorageManager, pdLogicalTopologyService, leaseTracker, - new TestClockService(new HybridClockImpl()) + new TestClockService(new HybridClockImpl()), + grpId -> ZONE_PARTITION_ID ); } private static LeaseGrantedMessageResponse createLeaseGrantedMessageResponse(boolean accept) { - return MSG_FACTORY.leaseGrantedMessageResponse().accepted(accept).build(); + return MSG_FACTORY.leaseGrantedMessageResponse() + .appliedGroups(Set.of(new TablePartitionId(42, 0))) + .accepted(accept) + .build(); } @Test @@ -180,11 +187,11 @@ public void testAssignmentChangeOnNegotiation() throws InterruptedException { return createLeaseGrantedMessageResponse(true); }; - metaStorageManager.put(stablePartAssignmentsKey(GROUP_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME)))); + metaStorageManager.put(stablePartAssignmentsKey(ZONE_PARTITION_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME)))); assertThat(lgmReceived, willCompleteSuccessfully()); - metaStorageManager.put(stablePartAssignmentsKey(GROUP_ID), 
Assignments.toBytes(Set.of(forPeer(NODE_1_NAME)))); + metaStorageManager.put(stablePartAssignmentsKey(ZONE_PARTITION_ID), Assignments.toBytes(Set.of(forPeer(NODE_1_NAME)))); waitForAcceptedLease(); @@ -207,7 +214,7 @@ public void testAssignmentChangeOnNegotiationAndReplicaRejectsLease() throws Int return createLeaseGrantedMessageResponse(true); }; - metaStorageManager.put(stablePartAssignmentsKey(GROUP_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME)))); + metaStorageManager.put(stablePartAssignmentsKey(ZONE_PARTITION_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME)))); assertThat(lgmReceived, willCompleteSuccessfully()); @@ -231,7 +238,10 @@ public void testAssignmentChangeOnNegotiationNodeLeftTopology() throws Interrupt return createLeaseGrantedMessageResponse(true); }; - metaStorageManager.put(stablePartAssignmentsKey(GROUP_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME), forPeer(NODE_1_NAME)))); + metaStorageManager.put( + stablePartAssignmentsKey(ZONE_PARTITION_ID), + Assignments.toBytes(Set.of(forPeer(NODE_0_NAME), forPeer(NODE_1_NAME))) + ); assertThat(lgmReceived, willCompleteSuccessfully()); @@ -258,7 +268,7 @@ public void testNetworkExceptionOnNegotiation() throws InterruptedException { return createLeaseGrantedMessageResponse(true); }; - metaStorageManager.put(stablePartAssignmentsKey(GROUP_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME)))); + metaStorageManager.put(stablePartAssignmentsKey(ZONE_PARTITION_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME)))); assertThat(lgmReceived, willCompleteSuccessfully()); diff --git a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseTrackerTest.java b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseTrackerTest.java index 826bd4d228e..835607cf8fe 100644 --- a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseTrackerTest.java +++ 
b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseTrackerTest.java @@ -29,6 +29,7 @@ import static org.mockito.Mockito.when; import java.util.List; +import java.util.Set; import java.util.concurrent.atomic.AtomicReference; import org.apache.ignite.internal.hlc.HybridClockImpl; import org.apache.ignite.internal.hlc.HybridTimestamp; @@ -45,6 +46,7 @@ import org.apache.ignite.internal.placementdriver.leases.LeaseBatch; import org.apache.ignite.internal.placementdriver.leases.LeaseTracker; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest; import org.apache.ignite.network.ClusterNodeResolver; import org.junit.jupiter.api.Test; @@ -71,9 +73,11 @@ public void testLeaseCleanup() { when(msManager.getLocally(any(), anyLong())).thenAnswer(invocation -> emptyEntry); LeaseTracker leaseTracker = new LeaseTracker( + "testNode", msManager, mock(ClusterNodeResolver.class), - new TestClockService(new HybridClockImpl()) + new TestClockService(new HybridClockImpl()), + tablePartitionId -> new ZonePartitionId(123, tablePartitionId.partitionId()) ); leaseTracker.startTrack(0L); @@ -83,8 +87,11 @@ public void testLeaseCleanup() { return falseCompletedFuture(); }); - TablePartitionId partId0 = new TablePartitionId(0, 0); - TablePartitionId partId1 = new TablePartitionId(0, 1); + ZonePartitionId partId0 = new ZonePartitionId(123, 0); + ZonePartitionId partId1 = new ZonePartitionId(123, 1); + + TablePartitionId tablePartitionId = new TablePartitionId(1, 1); + ZonePartitionId partId1FromEvent = new ZonePartitionId(123, 1, 1); HybridTimestamp startTime = new HybridTimestamp(1, 0); HybridTimestamp expirationTime = new HybridTimestamp(1000, 0); @@ -94,7 +101,7 @@ public void testLeaseCleanup() { Lease lease0 = new Lease(leaseholder0, leaseholder0 + "_id", startTime, expirationTime, partId0); Lease lease1 = new 
Lease(leaseholder1, leaseholder1 + "_id", startTime, expirationTime, partId1) - .acceptLease(new HybridTimestamp(2000, 0)); + .acceptLease(new HybridTimestamp(2000, 0), Set.of(tablePartitionId)); // In entry0, there are leases for partition ids partId0 and partId1. In entry1, there is only partId0, so partId1 is expired. Entry entry0 = new EntryImpl(PLACEMENTDRIVER_LEASES_KEY.bytes(), new LeaseBatch(List.of(lease0, lease1)).bytes(), 0, 0); @@ -106,7 +113,7 @@ public void testLeaseCleanup() { // Check that the absence of accepted lease triggers the event. listenerRef.get().onUpdate(new WatchEvent(new EntryEvent(emptyEntry, entry1))); assertNotNull(parametersRef.get()); - assertEquals(partId1, parametersRef.get().groupId()); + assertEquals(partId1FromEvent, parametersRef.get().groupId()); // Check that the absence of not accepted lease doesn't trigger the event. parametersRef.set(null); diff --git a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseUpdaterTest.java b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseUpdaterTest.java index 7caff8bb83d..f94afdfcb86 100644 --- a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseUpdaterTest.java +++ b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseUpdaterTest.java @@ -66,7 +66,7 @@ import org.apache.ignite.internal.placementdriver.leases.LeaseTracker; import org.apache.ignite.internal.placementdriver.leases.Leases; import org.apache.ignite.internal.replicator.ReplicationGroupId; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest; import org.apache.ignite.internal.testframework.IgniteTestUtils; import org.apache.ignite.internal.util.Cursor; @@ -107,7 +107,7 @@ public class LeaseUpdaterTest extends BaseIgniteAbstractTest { 
@BeforeEach void setUp() { Entry entry = new EntryImpl( - stablePartAssignmentsKey(new TablePartitionId(1, 0)).bytes(), + stablePartAssignmentsKey(new ZonePartitionId(1, 0)).bytes(), Assignments.of(Assignment.forPeer(node.name())).toBytes(), 1, 0 @@ -143,7 +143,8 @@ void setUp() { metaStorageManager, topologyService, leaseTracker, - new TestClockService(new HybridClockImpl()) + new TestClockService(new HybridClockImpl()), + grp -> new ZonePartitionId(grp.tableId(), grp.partitionId()) ); leaseUpdater.init(); diff --git a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/PlacementDriverTest.java b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/PlacementDriverTest.java index c177c5c7ef3..70e41a025bd 100644 --- a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/PlacementDriverTest.java +++ b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/PlacementDriverTest.java @@ -44,7 +44,9 @@ import static org.mockito.Mockito.mock; import java.util.List; +import java.util.Map; import java.util.Objects; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -63,6 +65,7 @@ import org.apache.ignite.internal.placementdriver.leases.LeaseTracker; import org.apache.ignite.internal.replicator.ReplicationGroupId; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest; import org.apache.ignite.internal.util.PendingComparableValuesTracker; import org.apache.ignite.network.ClusterNode; @@ -83,6 +86,10 @@ public class PlacementDriverTest extends BaseIgniteAbstractTest { private static final TablePartitionId GROUP_1 = new TablePartitionId(1000, 0); + private static final ZonePartitionId ZONE_GROUP_1 = new 
ZonePartitionId(2000, 0); + + private static final Map tableIdToZoneIdMapper = Map.of(GROUP_1, ZONE_GROUP_1); + private static final String LEASEHOLDER_1 = "leaseholder1"; private static final String LEASEHOLDER_ID_1 = "leaseholder1_id"; @@ -97,7 +104,8 @@ public class PlacementDriverTest extends BaseIgniteAbstractTest { false, true, null, - GROUP_1 + ZONE_GROUP_1, + Set.of(GROUP_1) ); private static final Lease LEASE_FROM_1_TO_15_000 = new Lease( @@ -108,7 +116,8 @@ public class PlacementDriverTest extends BaseIgniteAbstractTest { false, true, null, - GROUP_1 + ZONE_GROUP_1, + Set.of(GROUP_1) ); private static final Lease LEASE_FROM_15_000_TO_30_000 = new Lease( @@ -119,7 +128,8 @@ public class PlacementDriverTest extends BaseIgniteAbstractTest { false, true, null, - GROUP_1 + ZONE_GROUP_1, + Set.of(GROUP_1) ); private static final int AWAIT_PERIOD_FOR_LOCAL_NODE_TO_BE_NOTIFIED_ABOUT_LEASE_UPDATES = 1_000; @@ -194,7 +204,10 @@ public void testAwaitPrimaryReplicaInInterval() throws Exception { publishLease(LEASE_FROM_1_TO_5_000); // Await local node to be notified about new primary replica. - assertTrue(waitForCondition(() -> placementDriver.getLease(GROUP_1).equals(LEASE_FROM_1_TO_5_000), 1_000)); + assertTrue(waitForCondition( + () -> placementDriver.getLease(ZONE_GROUP_1).equals(LEASE_FROM_1_TO_5_000), + 1_000) + ); // Assert that primary await future isn't completed yet because corresponding await time 10 is greater than lease expiration time 5. assertFalse(primaryReplicaFuture.isDone()); @@ -232,7 +245,10 @@ public void testAwaitPrimaryReplicaBeforeInterval() throws Exception { publishLease(LEASE_FROM_1_TO_5_000); // Await local node to be notified about new primary replica. 
- assertTrue(waitForCondition(() -> placementDriver.getLease(GROUP_1).equals(LEASE_FROM_1_TO_5_000), 1_000)); + assertTrue(waitForCondition( + () -> placementDriver.getLease(ZONE_GROUP_1).equals(LEASE_FROM_1_TO_5_000), + 1_000 + )); // Assert that primary await future isn't completed yet because corresponding await time 10 is greater than lease expiration time 5. assertFalse(primaryReplicaFuture.isDone()); @@ -262,8 +278,10 @@ public void testAwaitPrimaryReplicaBeforeIntervalAfterPublishing() throws Except publishLease(LEASE_FROM_1_TO_15_000); // Await local node to be notified about new primary replica. - assertTrue(waitForCondition(() -> placementDriver.getLease(GROUP_1).equals(LEASE_FROM_1_TO_15_000), - AWAIT_PERIOD_FOR_LOCAL_NODE_TO_BE_NOTIFIED_ABOUT_LEASE_UPDATES)); + assertTrue(waitForCondition( + () -> placementDriver.getLease(ZONE_GROUP_1).equals(LEASE_FROM_1_TO_15_000), + AWAIT_PERIOD_FOR_LOCAL_NODE_TO_BE_NOTIFIED_ABOUT_LEASE_UPDATES + )); // Await primary replica for time 10. CompletableFuture primaryReplicaFuture = placementDriver.awaitPrimaryReplica(GROUP_1, AWAIT_TIME_10_000, @@ -311,7 +329,8 @@ private void testAwaitCurrentPrimaryIsOffline( false, true, null, - GROUP_1 + ZONE_GROUP_1, + Set.of(GROUP_1) ); publishLease(firstLease); @@ -333,7 +352,8 @@ private void testAwaitCurrentPrimaryIsOffline( false, true, null, - GROUP_1 + ZONE_GROUP_1, + Set.of(GROUP_1) ); if (newLeaseholderIsOnline) { @@ -617,7 +637,7 @@ void testListenNeighborGroupReplicaBecomePrimaryEvent() { publishLease(lease); - TablePartitionId groupId = (TablePartitionId) lease.replicationGroupId(); + ZonePartitionId groupId = (ZonePartitionId) lease.replicationGroupId(); CompletableFuture eventParametersFuture = listenSpecificGroupReplicaBecomePrimaryEvent(groupId); @@ -629,7 +649,8 @@ void testListenNeighborGroupReplicaBecomePrimaryEvent() { false, true, null, - new TablePartitionId(groupId.tableId() + 1, groupId.partitionId() + 1) + new ZonePartitionId(groupId.zoneId() + 1, 
groupId.partitionId() + 1), + Set.of(new TablePartitionId(groupId.zoneId() + 1, groupId.partitionId() + 1)) ); publishLeases(lease, neighborGroupLease); @@ -683,21 +704,27 @@ private static void checkReplicaBecomePrimaryEventParameters( Lease expLease, PrimaryReplicaEventParameters parameters ) { - assertThat(parameters.groupId(), equalTo(expLease.replicationGroupId())); + assertThat(parameters.groupId().toString(), equalTo(expLease.replicationGroupId().toString())); assertThat(parameters.leaseholderId(), equalTo(expLease.getLeaseholderId())); } private LeaseTracker createPlacementDriver() { - return new LeaseTracker(metastore, new ClusterNodeResolver() { - @Override - public @Nullable ClusterNode getByConsistentId(String consistentId) { - return leaseholder; - } - - @Override - public @Nullable ClusterNode getById(String id) { - return leaseholder; - } - }, clockService); + return new LeaseTracker( + LEASEHOLDER_ID_1, + metastore, + new ClusterNodeResolver() { + @Override + public @Nullable ClusterNode getByConsistentId(String consistentId) { + return leaseholder; + } + + @Override + public @Nullable ClusterNode getById(String id) { + return leaseholder; + } + }, + clockService, + tableIdToZoneIdMapper::get + ); } } diff --git a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/leases/LeaseSerializationTest.java b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/leases/LeaseSerializationTest.java index 43f0bee08b7..0182913c249 100644 --- a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/leases/LeaseSerializationTest.java +++ b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/leases/LeaseSerializationTest.java @@ -22,6 +22,7 @@ import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Set; import org.apache.ignite.internal.hlc.HybridTimestamp; import org.apache.ignite.internal.replicator.ReplicationGroupId; import 
org.apache.ignite.internal.replicator.TablePartitionId; @@ -94,7 +95,8 @@ private static Lease newLease( prolong, accepted, proposedCandidate, - replicationGroupId + replicationGroupId, + Set.of(replicationGroupId) ); } diff --git a/modules/replicator/src/integrationTest/java/org/apache/ignite/internal/replicator/ItPlacementDriverReplicaSideTest.java b/modules/replicator/src/integrationTest/java/org/apache/ignite/internal/replicator/ItPlacementDriverReplicaSideTest.java index 28d9bc8aee0..c24fa5021a5 100644 --- a/modules/replicator/src/integrationTest/java/org/apache/ignite/internal/replicator/ItPlacementDriverReplicaSideTest.java +++ b/modules/replicator/src/integrationTest/java/org/apache/ignite/internal/replicator/ItPlacementDriverReplicaSideTest.java @@ -108,7 +108,7 @@ public class ItPlacementDriverReplicaSideTest extends IgniteAbstractTest { private static final int BASE_PORT = 1234; - private static final TestReplicationGroupId GROUP_ID = new TestReplicationGroupId("group_1"); + private static final TablePartitionId GROUP_ID = new TablePartitionId(1, 0); private static final ReplicaMessagesFactory REPLICA_MESSAGES_FACTORY = new ReplicaMessagesFactory(); @@ -488,6 +488,7 @@ private CompletableFuture createReplicationGroup( try { return replicaManager.startReplica( groupId, + new ZonePartitionId(0, 0), (request, senderId) -> { log.info("Handle request [type={}]", request.getClass().getSimpleName()); diff --git a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/Replica.java b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/Replica.java index 380e5c7b227..299ce368f23 100644 --- a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/Replica.java +++ b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/Replica.java @@ -23,8 +23,9 @@ import static org.apache.ignite.internal.util.ExceptionUtils.unwrapCause; import static 
org.apache.ignite.internal.util.IgniteUtils.retryOperationUntilSuccess; +import java.util.Set; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executor; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicReference; @@ -33,7 +34,6 @@ import org.apache.ignite.internal.lang.IgniteStringFormatter; import org.apache.ignite.internal.logger.IgniteLogger; import org.apache.ignite.internal.logger.Loggers; -import org.apache.ignite.internal.network.NetworkMessage; import org.apache.ignite.internal.placementdriver.PlacementDriver; import org.apache.ignite.internal.placementdriver.message.LeaseGrantedMessage; import org.apache.ignite.internal.placementdriver.message.LeaseGrantedMessageResponse; @@ -43,8 +43,11 @@ import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupService; import org.apache.ignite.internal.replicator.listener.ReplicaListener; import org.apache.ignite.internal.replicator.message.PrimaryReplicaChangeCommand; +import org.apache.ignite.internal.replicator.message.PrimaryReplicaRequest; import org.apache.ignite.internal.replicator.message.ReplicaMessagesFactory; import org.apache.ignite.internal.replicator.message.ReplicaRequest; +import org.apache.ignite.internal.replicator.message.WaitReplicaStateMessage; +import org.apache.ignite.internal.util.FastTimestamps; import org.apache.ignite.internal.util.PendingComparableValuesTracker; import org.apache.ignite.network.ClusterNode; @@ -63,6 +66,9 @@ public class Replica { /** Replica group identity, this id is the same as the considered partition's id. */ private final ReplicationGroupId replicaGrpId; + /** Zone partition id. */ + private final ZonePartitionId zonePartitionId; + /** Replica listener. */ private final ReplicaListener listener; @@ -88,16 +94,19 @@ public class Replica { /** External executor. 
*/ // TODO: IGNITE-20063 Maybe get rid of it - private final ExecutorService executor; + private final Executor executor; private final PlacementDriver placementDriver; private final ClockService clockService; + private final CompletableFuture waitForActualStateFuture = new CompletableFuture<>(); + /** * The constructor of a replica server. * * @param replicaGrpId Replication group id. + * @param zonePartitionId Zone partition id. * @param listener Replica listener. * @param storageIndexTracker Storage index tracker. * @param raftClient Topology aware Raft client. @@ -108,15 +117,17 @@ public class Replica { */ public Replica( ReplicationGroupId replicaGrpId, + ZonePartitionId zonePartitionId, ReplicaListener listener, PendingComparableValuesTracker storageIndexTracker, TopologyAwareRaftGroupService raftClient, ClusterNode localNode, - ExecutorService executor, + Executor executor, PlacementDriver placementDriver, ClockService clockService ) { this.replicaGrpId = replicaGrpId; + this.zonePartitionId = zonePartitionId; this.listener = listener; this.storageIndexTracker = storageIndexTracker; this.raftClient = raftClient; @@ -141,6 +152,41 @@ public CompletableFuture processRequest(ReplicaRequest request, S request.groupId(), replicaGrpId); + if (request instanceof PrimaryReplicaRequest) { + var targetPrimaryReq = (PrimaryReplicaRequest) request; + + if (request instanceof WaitReplicaStateMessage) { + if (!waitForActualStateFuture.isDone()) { + return processWaitReplicaStateMessage((WaitReplicaStateMessage) request) + .thenComposeAsync( + v -> sendPrimaryReplicaChangeToReplicationGroup(targetPrimaryReq.enlistmentConsistencyToken()), + executor + ) + .thenComposeAsync( + unused -> completedFuture(new ReplicaResult(null, null)), + executor + ); + } else { + return completedFuture(new ReplicaResult(null, null)); + } + } + + if (!waitForActualStateFuture.isDone()) { + return placementDriver.addSubgroups( + zonePartitionId, + targetPrimaryReq.enlistmentConsistencyToken(), 
+ Set.of(replicaGrpId) + ) + // TODO: https://issues.apache.org/jira/browse/IGNITE-22122 + .thenComposeAsync(unused -> waitForActualState(FastTimestamps.coarseCurrentTimeMillis() + 10_000), executor) + .thenComposeAsync( + v -> sendPrimaryReplicaChangeToReplicationGroup(targetPrimaryReq.enlistmentConsistencyToken()), + executor + ) + .thenComposeAsync(unused -> listener.invoke(request, senderId), executor); + } + } + return listener.invoke(request, senderId); } @@ -171,7 +217,7 @@ private CompletableFuture leaderFuture() { * @param msg Message to process. * @return Future that contains a result. */ - public CompletableFuture processPlacementDriverMessage(PlacementDriverReplicaMessage msg) { + public CompletableFuture processPlacementDriverMessage(PlacementDriverReplicaMessage msg) { if (msg instanceof LeaseGrantedMessage) { return processLeaseGrantedMessage((LeaseGrantedMessage) msg) .handle((v, e) -> { @@ -203,7 +249,7 @@ public CompletableFuture processPlacementDriverMessage private CompletableFuture processLeaseGrantedMessage(LeaseGrantedMessage msg) { LOG.info("Received LeaseGrantedMessage for replica belonging to group=" + groupId() + ", force=" + msg.force()); - return placementDriver.previousPrimaryExpired(groupId()).thenCompose(unused -> leaderFuture().thenCompose(leader -> { + return placementDriver.previousPrimaryExpired(msg.groupId()).thenCompose(unused -> leaderFuture().thenCompose(leader -> { HybridTimestamp leaseExpirationTime = this.leaseExpirationTime; if (leaseExpirationTime != null) { @@ -240,6 +286,18 @@ private CompletableFuture processLeaseGrantedMessag })); } + /** + * Process {@link WaitReplicaStateMessage}. + * + * @param msg Message to process. + * @return Future that contains a result. 
+ */ + private CompletableFuture processWaitReplicaStateMessage(WaitReplicaStateMessage msg) { + LOG.info("WaitReplicaStateMessage was received [groupId = {}]", groupId()); + + return waitForActualState(FastTimestamps.coarseCurrentTimeMillis() + TimeUnit.SECONDS.toMillis(msg.timeout())); + } + private CompletableFuture sendPrimaryReplicaChangeToReplicationGroup(long leaseStartTime) { PrimaryReplicaChangeCommand cmd = REPLICA_MESSAGES_FACTORY.primaryReplicaChangeCommand() .leaseStartTime(leaseStartTime) @@ -257,6 +315,7 @@ private CompletableFuture acceptLease( this.leaseExpirationTime = leaseExpirationTime; LeaseGrantedMessageResponse resp = PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse() + .appliedGroups(Set.of(replicaGrpId)) .accepted(true) .build(); @@ -292,7 +351,8 @@ private CompletableFuture waitForActualState(long expirationTime) { return retryOperationUntilSuccess(raftClient::readIndex, e -> currentTimeMillis() > expirationTime, executor) .orTimeout(timeout, TimeUnit.MILLISECONDS) - .thenCompose(storageIndexTracker::waitFor); + .thenCompose(idx -> storageIndexTracker.waitFor(idx)) + .thenRun(() -> waitForActualStateFuture.complete(null)); } /** diff --git a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaAwareLeaseTracker.java b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaAwareLeaseTracker.java new file mode 100644 index 00000000000..acb366fc44f --- /dev/null +++ b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaAwareLeaseTracker.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.internal.replicator; + +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; +import org.apache.ignite.internal.event.AbstractEventProducer; +import org.apache.ignite.internal.event.EventListener; +import org.apache.ignite.internal.hlc.HybridTimestamp; +import org.apache.ignite.internal.placementdriver.PlacementDriver; +import org.apache.ignite.internal.placementdriver.ReplicaMeta; +import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent; +import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; +import org.apache.ignite.internal.replicator.message.ReplicaMessagesFactory; +import org.apache.ignite.internal.replicator.message.WaitReplicaStateMessage; +import org.apache.ignite.network.ClusterNodeResolver; + +/** + * Implementation of {@link PlacementDriver} that is aware of {@link ReplicaService}. + * It delegates calls to the original {@link PlacementDriver} and after that sends {@link WaitReplicaStateMessage} + * which calls {@link org.apache.ignite.internal.replicator.Replica#waitForActualState(long)}. + */ +// TODO https://issues.apache.org/jira/browse/IGNITE-20362 +@Deprecated +public class ReplicaAwareLeaseTracker extends AbstractEventProducer implements + PlacementDriver { + /** Replicator network message factory.
*/ + private static final ReplicaMessagesFactory REPLICA_MESSAGES_FACTORY = new ReplicaMessagesFactory(); + + private final PlacementDriver delegate; + private final ReplicaService replicaService; + + /** Resolver that resolves a node consistent ID to cluster node. */ + private final ClusterNodeResolver clusterNodeResolver; + + + /** + * Constructor. + * + * @param delegate Delegate Placement Driver. + * @param replicaService Replica Service. + * @param clusterNodeResolver Cluster node resolver. + */ + public ReplicaAwareLeaseTracker(PlacementDriver delegate, ReplicaService replicaService, ClusterNodeResolver clusterNodeResolver) { + this.delegate = delegate; + this.replicaService = replicaService; + this.clusterNodeResolver = clusterNodeResolver; + } + + @Override + public void listen(PrimaryReplicaEvent evt, EventListener listener) { + delegate.listen(evt, listener); + } + + @Override + public void removeListener(PrimaryReplicaEvent evt, EventListener listener) { + delegate.removeListener(evt, listener); + } + + @Override + public CompletableFuture awaitPrimaryReplica(ReplicationGroupId groupId, HybridTimestamp timestamp, long timeout, + TimeUnit unit) { + return delegate.awaitPrimaryReplica(groupId, timestamp, timeout, unit); + } + + @Override + public CompletableFuture awaitPrimaryReplicaForTable( + ReplicationGroupId groupId, + HybridTimestamp timestamp, + long timeout, + TimeUnit unit + ) { + ZonePartitionId zonePartitionId = (ZonePartitionId) groupId; + + assert zonePartitionId.tableId() != 0 : "Table id should be defined."; + + ZonePartitionId pureZonePartId = zonePartitionId.purify(); + + return delegate.awaitPrimaryReplicaForTable(pureZonePartId, timestamp, timeout, unit); + } + + @Override + public CompletableFuture getPrimaryReplica(ReplicationGroupId replicationGroupId, HybridTimestamp timestamp) { + return delegate.getPrimaryReplica(replicationGroupId, timestamp); + } + + @Override + public CompletableFuture previousPrimaryExpired(ReplicationGroupId 
grpId) { + return delegate.previousPrimaryExpired(grpId); + } + + @Override + public ReplicaMeta getLeaseMeta(ReplicationGroupId grpId) { + return delegate.getLeaseMeta(grpId); + } + + @Override + public CompletableFuture addSubgroups(ZonePartitionId zoneId, Long enlistmentConsistencyToken, Set subGrps) { + return delegate.addSubgroups(zoneId, enlistmentConsistencyToken, subGrps); + } +} diff --git a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java index 1b7c8ef725b..936cabebb57 100644 --- a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java +++ b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java @@ -25,12 +25,16 @@ import static org.apache.ignite.internal.thread.ThreadOperation.STORAGE_READ; import static org.apache.ignite.internal.thread.ThreadOperation.STORAGE_WRITE; import static org.apache.ignite.internal.thread.ThreadOperation.TX_STATE_STORAGE_ACCESS; +import static org.apache.ignite.internal.util.CompletableFutures.allOf; import static org.apache.ignite.internal.util.CompletableFutures.isCompletedSuccessfully; import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture; import static org.apache.ignite.internal.util.ExceptionUtils.unwrapCause; import static org.apache.ignite.internal.util.IgniteUtils.shutdownAndAwaitTermination; import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.CompletableFuture; @@ -38,11 +42,8 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executor; -import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import java.util.concurrent.LinkedBlockingQueue; 
import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; @@ -63,6 +64,9 @@ import org.apache.ignite.internal.network.NetworkMessage; import org.apache.ignite.internal.network.NetworkMessageHandler; import org.apache.ignite.internal.placementdriver.PlacementDriver; +import org.apache.ignite.internal.placementdriver.ReplicaMeta; +import org.apache.ignite.internal.placementdriver.message.LeaseGrantedMessage; +import org.apache.ignite.internal.placementdriver.message.LeaseGrantedMessageResponse; import org.apache.ignite.internal.placementdriver.message.PlacementDriverMessageGroup; import org.apache.ignite.internal.placementdriver.message.PlacementDriverMessagesFactory; import org.apache.ignite.internal.placementdriver.message.PlacementDriverReplicaMessage; @@ -80,6 +84,7 @@ import org.apache.ignite.internal.replicator.message.ReplicaRequest; import org.apache.ignite.internal.replicator.message.ReplicaSafeTimeSyncRequest; import org.apache.ignite.internal.replicator.message.TimestampAware; +import org.apache.ignite.internal.replicator.message.WaitReplicaStateMessage; import org.apache.ignite.internal.thread.ExecutorChooser; import org.apache.ignite.internal.thread.NamedThreadFactory; import org.apache.ignite.internal.thread.PublicApiThreading; @@ -137,9 +142,13 @@ public class ReplicaManager extends AbstractEventProducer> zonePartIdToTablePartId = new ConcurrentHashMap<>(); + /** Scheduled executor for idle safe time sync. */ private final ScheduledExecutorService scheduledIdleSafeTimeSyncExecutor; + private final ScheduledExecutorService scheduledTableLeaseUpdateExecutor; + private final Executor requestsExecutor; private final FailureProcessor failureProcessor; @@ -147,10 +156,6 @@ public class ReplicaManager extends AbstractEventProducer> messageGroupsToHandle; - /** Executor. 
*/ - // TODO: IGNITE-20063 Maybe get rid of it - private final ExecutorService executor; - private String localNodeId; /** @@ -226,15 +231,9 @@ public ReplicaManager( NamedThreadFactory.create(nodeName, "scheduled-idle-safe-time-sync-thread", LOG) ); - int threadCount = Runtime.getRuntime().availableProcessors(); - - executor = new ThreadPoolExecutor( - threadCount, - threadCount, - 30, - TimeUnit.SECONDS, - new LinkedBlockingQueue<>(), - NamedThreadFactory.create(nodeName, "replica", LOG) + scheduledTableLeaseUpdateExecutor = Executors.newScheduledThreadPool( + 1, + NamedThreadFactory.create(nodeName, "scheduled-table-lease-update-thread", LOG) ); } @@ -414,7 +413,7 @@ private void onPlacementDriverMessageReceived(NetworkMessage msg0, ClusterNode s assert correlationId != null; - var msg = (PlacementDriverReplicaMessage) msg0; + var msg = (LeaseGrantedMessage) msg0; if (!busyLock.enterBusy()) { if (LOG.isInfoEnabled()) { @@ -425,17 +424,37 @@ private void onPlacementDriverMessageReceived(NetworkMessage msg0, ClusterNode s } try { - CompletableFuture replicaFut = replicas.computeIfAbsent(msg.groupId(), k -> new CompletableFuture<>()); - - replicaFut - .thenCompose(replica -> replica.processPlacementDriverMessage(msg)) - .whenComplete((response, ex) -> { - if (ex == null) { - clusterNetSvc.messagingService().respond(senderConsistentId, response, correlationId); - } else if (!(unwrapCause(ex) instanceof NodeStoppingException)) { - LOG.error("Failed to process placement driver message [msg={}].", ex, msg); - } - }); + Set replicationGroupIds = zonePartIdToTablePartId.getOrDefault((ZonePartitionId) msg.groupId(), Set.of()); + + CompletableFuture[] futures = new CompletableFuture[replicationGroupIds.size()]; + + int i = 0; + + for (ReplicationGroupId grpId : replicationGroupIds) { + CompletableFuture replicaFut = replicas.computeIfAbsent(grpId, k -> new CompletableFuture<>()); + futures[i++] = replicaFut.thenCompose(replica -> 
replica.processPlacementDriverMessage(msg)); + } + + allOf(futures).whenComplete((responses, ex) -> { + if (ex == null) { + boolean accepted = responses.stream().allMatch(LeaseGrantedMessageResponse::accepted); + + assert !msg.force() || accepted : "We do not give a replica possibility to decline a forced request."; + + String redirect = accepted ? null : + responses.stream().filter(leaseGranResp -> !leaseGranResp.accepted()).findAny().get().redirectProposal(); + + LeaseGrantedMessageResponse response = PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse() + .appliedGroups(replicationGroupIds == null ? Collections.emptySet() : replicationGroupIds) + .redirectProposal(redirect) + .accepted(accepted) + .build(); + + clusterNetSvc.messagingService().respond(senderConsistentId, response, correlationId); + } else if (!(unwrapCause(ex) instanceof NodeStoppingException)) { + LOG.error("Failed to process placement driver message [msg={}].", ex, msg); + } + }); } finally { busyLock.leaveBusy(); } @@ -478,6 +497,7 @@ private void stopLeaseProlongation(ReplicationGroupId groupId, @Nullable String */ public CompletableFuture startReplica( ReplicationGroupId replicaGrpId, + ZonePartitionId zonePartitionId, ReplicaListener listener, TopologyAwareRaftGroupService raftClient, PendingComparableValuesTracker storageIndexTracker @@ -487,7 +507,7 @@ public CompletableFuture startReplica( } try { - return startReplicaInternal(replicaGrpId, listener, raftClient, storageIndexTracker); + return startReplicaInternal(replicaGrpId, zonePartitionId, listener, raftClient, storageIndexTracker); } finally { busyLock.leaveBusy(); } @@ -503,6 +523,7 @@ public CompletableFuture startReplica( */ private CompletableFuture startReplicaInternal( ReplicationGroupId replicaGrpId, + ZonePartitionId zonePartitionId, ReplicaListener listener, TopologyAwareRaftGroupService raftClient, PendingComparableValuesTracker storageIndexTracker @@ -513,16 +534,27 @@ private CompletableFuture 
startReplicaInternal( Replica newReplica = new Replica( replicaGrpId, + zonePartitionId, listener, storageIndexTracker, raftClient, localNode, - executor, + requestsExecutor, placementDriver, clockService ); CompletableFuture replicaFuture = replicas.compute(replicaGrpId, (k, existingReplicaFuture) -> { + zonePartIdToTablePartId.compute(zonePartitionId, (key, tablePartIds) -> { + if (tablePartIds == null) { + tablePartIds = new HashSet<>(); + } + + tablePartIds.add(replicaGrpId); + + return tablePartIds; + }); + if (existingReplicaFuture == null || existingReplicaFuture.isDone()) { assert existingReplicaFuture == null || isCompletedSuccessfully(existingReplicaFuture); LOG.info("Replica is started [replicationGroupId={}].", replicaGrpId); @@ -612,6 +644,10 @@ private CompletableFuture stopReplicaInternal(ReplicationGroupId replic }); } + zonePartIdToTablePartId.forEach((zonePartId, tblPartIds) -> { + tblPartIds.remove(replicaGrpId); + }); + return null; }); } finally { @@ -639,6 +675,18 @@ public CompletableFuture startAsync() { TimeUnit.MILLISECONDS ); + scheduledTableLeaseUpdateExecutor.scheduleAtFixedRate(() -> { + if (!busyLock.enterBusy()) { + return; + } + + try { + updateTableGroupsInternal(); + } finally { + busyLock.leaveBusy(); + } + }, 0, 1, TimeUnit.SECONDS); + cmgMgr.metaStorageNodes().whenComplete((nodes, e) -> { if (e != null) { msNodes.completeExceptionally(e); @@ -652,6 +700,59 @@ public CompletableFuture startAsync() { return nullCompletedFuture(); } + /** + * Updates list of replication groups for each distributed zone. 
+ */ + private void updateTableGroupsInternal() { + for (Entry> entry : zonePartIdToTablePartId.entrySet()) { + ZonePartitionId repGrp = entry.getKey(); + + ReplicaMeta meta = placementDriver.getLeaseMeta(repGrp); + + if (meta != null) { + HashSet diff = new HashSet<>(entry.getValue()); + diff.removeAll(meta.subgroups()); + + if (meta.getLeaseholderId().equals(localNodeId) && !diff.isEmpty()) { + LOG.info("New subgroups are found for existing lease [repGrp={}, subGroups={}].", repGrp, diff); + + try { + placementDriver.addSubgroups(repGrp, meta.getStartTime().longValue(), diff) + .thenComposeAsync(unused -> { + ArrayList> requestToReplicas = new ArrayList<>(); + + for (ReplicationGroupId partId : diff) { + WaitReplicaStateMessage req = REPLICA_MESSAGES_FACTORY.waitReplicaStateMessage() + .enlistmentConsistencyToken(meta.getStartTime().longValue()) + .groupId(partId) + // TODO: https://issues.apache.org/jira/browse/IGNITE-22122 + .timeout(10) + .build(); + + CompletableFuture replicaFut = replicas.get(repGrp); + + if (replicaFut != null) { + requestToReplicas.add(replicaFut.thenCompose( + replica -> replica.processRequest(req, localNodeId))); + } + } + + return allOf(requestToReplicas.toArray(CompletableFuture[]::new)); + }, scheduledTableLeaseUpdateExecutor) + .get(500, TimeUnit.MILLISECONDS); + } catch (Exception ex) { + LOG.error( + "Failed to add new subgroups to the replication group [repGrp={}, subGroups={}].", + ex, + repGrp, + diff + ); + } + } + } + } + } + /** {@inheritDoc} */ @Override public CompletableFuture stopAsync() { @@ -662,7 +763,7 @@ public CompletableFuture stopAsync() { busyLock.block(); shutdownAndAwaitTermination(scheduledIdleSafeTimeSyncExecutor, 10, TimeUnit.SECONDS); - shutdownAndAwaitTermination(executor, 10, TimeUnit.SECONDS); + shutdownAndAwaitTermination(scheduledTableLeaseUpdateExecutor, 10, TimeUnit.SECONDS); assert replicas.values().stream().noneMatch(CompletableFuture::isDone) : "There are replicas alive [replicas=" diff --git 
a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/ReplicaMessageGroup.java b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/ReplicaMessageGroup.java index b8407d992fc..d45aa87c8bc 100644 --- a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/ReplicaMessageGroup.java +++ b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/ReplicaMessageGroup.java @@ -54,4 +54,7 @@ public interface ReplicaMessageGroup { /** Message type for {@link PrimaryReplicaChangeCommand}. */ short PRIMARY_REPLICA_CHANGE_COMMAND = 41; + + /** Message type for {@link WaitReplicaStateMessage}. */ + short WAIT_REPLICA_STATE = 42; } diff --git a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/WaitReplicaStateMessage.java b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/WaitReplicaStateMessage.java new file mode 100644 index 00000000000..d25e59c204d --- /dev/null +++ b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/WaitReplicaStateMessage.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.internal.replicator.message; + +import org.apache.ignite.internal.network.annotations.Transferable; + +/** + * Wait for replica state being up to date with a leader. + */ +@Transferable(ReplicaMessageGroup.WAIT_REPLICA_STATE) +public interface WaitReplicaStateMessage extends PrimaryReplicaRequest { + long timeout(); +} diff --git a/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/PlacementDriverReplicaSideTest.java b/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/PlacementDriverReplicaSideTest.java index 676eb5ec362..3c4d3ef837c 100644 --- a/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/PlacementDriverReplicaSideTest.java +++ b/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/PlacementDriverReplicaSideTest.java @@ -117,6 +117,7 @@ private Replica startReplica() { return new Replica( GRP_ID, + new ZonePartitionId(1, 0), mock(ReplicaListener.class), storageIndexTracker, raftClient, @@ -172,7 +173,7 @@ private CompletableFuture sendLeaseGranted( .force(force) .build(); - return replica.processPlacementDriverMessage(msg).thenApply(LeaseGrantedMessageResponse.class::cast); + return replica.processPlacementDriverMessage(msg); } private HybridTimestamp hts(long physical) { diff --git a/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/ReplicaManagerTest.java b/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/ReplicaManagerTest.java index 9963fabfa60..d2a4729e815 100644 --- a/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/ReplicaManagerTest.java +++ b/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/ReplicaManagerTest.java @@ -153,9 +153,11 @@ void testReplicaEvents( replicaManager.listen(BEFORE_REPLICA_STOPPED, removeReplicaListener); var groupId = new TablePartitionId(0, 0); + var zonePartId = new ZonePartitionId(0, 0); CompletableFuture startReplicaFuture = 
replicaManager.startReplica( groupId, + zonePartId, replicaListener, raftGroupService, new PendingComparableValuesTracker<>(0L) diff --git a/modules/replicator/src/testFixtures/java/org/apache/ignite/internal/raft/client/AbstractTopologyAwareGroupServiceTest.java b/modules/replicator/src/testFixtures/java/org/apache/ignite/internal/raft/client/AbstractTopologyAwareGroupServiceTest.java index 60ebeafa68c..f6f17602ac3 100644 --- a/modules/replicator/src/testFixtures/java/org/apache/ignite/internal/raft/client/AbstractTopologyAwareGroupServiceTest.java +++ b/modules/replicator/src/testFixtures/java/org/apache/ignite/internal/raft/client/AbstractTopologyAwareGroupServiceTest.java @@ -55,7 +55,8 @@ import org.apache.ignite.internal.raft.server.RaftGroupOptions; import org.apache.ignite.internal.raft.server.impl.JraftServerImpl; import org.apache.ignite.internal.raft.util.ThreadLocalOptimizedMarshaller; -import org.apache.ignite.internal.replicator.TestReplicationGroupId; +import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.testframework.IgniteAbstractTest; import org.apache.ignite.internal.thread.NamedThreadFactory; import org.apache.ignite.internal.topology.LogicalTopologyServiceTestImpl; @@ -88,7 +89,9 @@ public abstract class AbstractTopologyAwareGroupServiceTest extends IgniteAbstra /** Wait timeout, in milliseconds. */ protected static final int WAIT_TIMEOUT_MILLIS = 10_000; - protected static final TestReplicationGroupId GROUP_ID = new TestReplicationGroupId("group_1"); + protected static final TablePartitionId GROUP_ID = new TablePartitionId(1, 1); + + protected static final ZonePartitionId ZONE_GROUP_ID = new ZonePartitionId(1, 1); /** RPC executor. 
*/ protected final ScheduledExecutorService executor = new ScheduledThreadPoolExecutor( diff --git a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteNodeRestartTest.java b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteNodeRestartTest.java index cdb511f090c..5d63268482e 100644 --- a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteNodeRestartTest.java +++ b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteNodeRestartTest.java @@ -26,6 +26,7 @@ import static org.apache.ignite.internal.TestWrappers.unwrapTableViewInternal; import static org.apache.ignite.internal.catalog.CatalogService.DEFAULT_STORAGE_PROFILE; import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.alterZone; +import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.getZoneIdStrict; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.REBALANCE_SCHEDULER_POOL_SIZE; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.STABLE_ASSIGNMENTS_PREFIX; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.stablePartAssignmentsKey; @@ -117,6 +118,7 @@ import org.apache.ignite.internal.hlc.ClockServiceImpl; import org.apache.ignite.internal.hlc.ClockWaiter; import org.apache.ignite.internal.hlc.HybridClockImpl; +import org.apache.ignite.internal.hlc.TestClockService; import org.apache.ignite.internal.index.IndexManager; import org.apache.ignite.internal.lang.ByteArray; import org.apache.ignite.internal.lang.IgniteInternalException; @@ -152,9 +154,11 @@ import org.apache.ignite.internal.raft.configuration.RaftConfiguration; import org.apache.ignite.internal.raft.server.impl.JraftServerImpl; import org.apache.ignite.internal.raft.storage.impl.LocalLogStorageFactory; +import 
org.apache.ignite.internal.replicator.ReplicaAwareLeaseTracker; import org.apache.ignite.internal.replicator.ReplicaManager; import org.apache.ignite.internal.replicator.ReplicaService; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.replicator.configuration.ReplicationConfiguration; import org.apache.ignite.internal.schema.SchemaManager; import org.apache.ignite.internal.schema.configuration.GcConfiguration; @@ -232,6 +236,8 @@ public class ItIgniteNodeRestartTest extends BaseIgniteRestartTest { /** Test table name. */ private static final String TABLE_NAME_2 = "Table2"; + protected static final ZonePartitionId ZONE_GROUP_ID = new ZonePartitionId(1, 1); + @InjectConfiguration("mock: " + RAFT_CFG) private static RaftConfiguration raftConfiguration; @@ -443,10 +449,19 @@ public CompletableFuture invoke(Condition condition, Collection TestIgnitionManager.DEFAULT_DELAY_DURATION_MS; + + var catalogManager = new CatalogManagerImpl( + new UpdateLogImpl(metaStorageMgr), + new TestClockService(hybridClock, clockWaiter), + delayDurationMsSupplier, + partitionIdleSafeTimePropagationPeriodMsSupplier + ); + + ConfigurationRegistry clusterConfigRegistry = clusterCfgMgr.configurationRegistry(); + SchemaSynchronizationConfiguration schemaSyncConfiguration = clusterConfigRegistry.getConfiguration( SchemaSynchronizationConfiguration.KEY ); @@ -465,7 +480,8 @@ public CompletableFuture invoke(Condition condition, Collection ZONE_GROUP_ID ); ReplicaManager replicaMgr = new ReplicaManager( @@ -544,15 +560,6 @@ public CompletableFuture invoke(Condition condition, Collection TestIgnitionManager.DEFAULT_DELAY_DURATION_MS; - - var catalogManager = new CatalogManagerImpl( - new UpdateLogImpl(metaStorageMgr), - clockService, - delayDurationMsSupplier, - partitionIdleSafeTimePropagationPeriodMsSupplier - ); - SchemaManager schemaManager = new SchemaManager(registry, catalogManager); 
var dataNodesMock = dataNodesMockByNode.get(idx); @@ -645,7 +652,7 @@ public CompletableFuture> dataNodes(long causalityToken, int catalog new SystemViewManagerImpl(name, catalogManager), failureProcessor, partitionIdleSafeTimePropagationPeriodMsSupplier, - placementDriverManager.placementDriver(), + new ReplicaAwareLeaseTracker(placementDriverManager.placementDriver(), replicaService, clusterSvc.topologyService()), clusterConfigRegistry.getConfiguration(SqlDistributedConfiguration.KEY), nodeCfgMgr.configurationRegistry().getConfiguration(SqlLocalConfiguration.KEY), transactionInflights @@ -1419,7 +1426,11 @@ public void testCorrectPartitionRecoveryOnSnapshot() throws InterruptedException inhibitor.startInhibit(); - alterZone(nodes.get(0).catalogManager(), String.format("ZONE_%s", TABLE_NAME.toUpperCase()), 1); + String zoneName = String.format("ZONE_%s", TABLE_NAME.toUpperCase()); + + alterZone(nodes.get(0).catalogManager(), zoneName, 1); + + int zoneId = getZoneIdStrict(nodes.get(0).catalogManager(), zoneName, nodes.get(0).clock().nowLong()); stopNode(restartedNodeIndex); @@ -1437,9 +1448,9 @@ public void testCorrectPartitionRecoveryOnSnapshot() throws InterruptedException .collect(toSet()), Set.of()); for (int p = 0; p < partitions; p++) { - TablePartitionId tablePartitionId = new TablePartitionId(table.tableId(), p); + ZonePartitionId zonePartitionId = new ZonePartitionId(zoneId, p); - Entry e = restartedNode.metaStorageManager().getLocally(stablePartAssignmentsKey(tablePartitionId), recoveryRevision); + Entry e = restartedNode.metaStorageManager().getLocally(stablePartAssignmentsKey(zonePartitionId), recoveryRevision); Set assignment = Assignments.fromBytes(e.value()).nodes(); @@ -1447,7 +1458,7 @@ public void testCorrectPartitionRecoveryOnSnapshot() throws InterruptedException Peer peer = configuration.peer(restartedNode.name()); - boolean isStarted = restartedNode.raftManager().isStarted(new RaftNodeId(tablePartitionId, peer)); + boolean isStarted = 
restartedNode.raftManager().isStarted(new RaftNodeId(new TablePartitionId(table.tableId(), p), peer)); assertEquals(shouldBe, isStarted); } @@ -1500,7 +1511,10 @@ public void createTableCallOnMultipleNodesTest(boolean populateStableAssignments ); } - var partId = new TablePartitionId(TABLE_ID, 0); + // Assume that the zone id will always be 7 for the test table. There is an assertion below to check this is true. + int zoneId = 7; + + var partId = new ZonePartitionId(zoneId, 0); // Populate the stable assignments before calling table create, if needed. if (populateStableAssignmentsBeforeTableCreation) { @@ -1525,7 +1539,7 @@ public void createTableCallOnMultipleNodesTest(boolean populateStableAssignments sql.execute(null, "CREATE TABLE " + TABLE_NAME + "(id INT PRIMARY KEY, name VARCHAR) WITH PRIMARY_ZONE='" + zoneName + "';"); - assertEquals(TABLE_ID, tableId(node, TABLE_NAME)); + assertEquals(zoneId, zoneId(node, zoneName)); node.metaStorageManager().put(new ByteArray(testPrefix.getBytes(StandardCharsets.UTF_8)), new byte[0]); @@ -1603,7 +1617,10 @@ public void tableRecoveryOnMultipleRestartingNodes(int nodeThatWrittenAssignment String tableName = "TEST"; String zoneName = "ZONE_TEST"; - var assignmentsKey = stablePartAssignmentsKey(new TablePartitionId(TABLE_ID, 0)); + // Assume that the zone id will always be 7 for the test table. There is an assertion below to check this is true. 
+ int zoneId = 7; + + var assignmentsKey = stablePartAssignmentsKey(new ZonePartitionId(zoneId, 0)); var metaStorageInterceptorFut = new CompletableFuture<>(); var metaStorageInterceptorInnerFut = new CompletableFuture<>(); @@ -1679,7 +1696,7 @@ public void tableRecoveryOnMultipleRestartingNodes(int nodeThatWrittenAssignment nodeInhibitor0.stopInhibit(); waitForValueInLocalMs(node0.metaStorageManager(), assignmentsKey); - assertEquals(TABLE_ID, tableId(node0, tableName)); + assertEquals(zoneId, zoneId(node0, zoneName)); Set expectedAssignments = dataNodesMockByNode.get(nodeThatWrittenAssignments).get().join() .stream().map(Assignment::forPeer).collect(toSet()); @@ -1709,7 +1726,9 @@ public void testSequentialAsyncTableCreationThenAlterZoneThenRestartOnMsSnapshot nodeInhibitor0.startInhibit(); nodeInhibitor1.startInhibit(); - var assignmentsKey = stablePartAssignmentsKey(new TablePartitionId(TABLE_ID, 0)); + int zoneId = zoneId(node0, zoneName); + + var assignmentsKey = stablePartAssignmentsKey(new ZonePartitionId(zoneId, 0)); var tableFut = createTableInCatalog(node0.catalogManager(), tableName, zoneName); @@ -1744,7 +1763,7 @@ public void testSequentialAsyncTableCreationThenAlterZoneThenRestartOnMsSnapshot assertThat(tableFut, willCompleteSuccessfully()); assertThat(alterZoneFut, willCompleteSuccessfully()); - assertEquals(TABLE_ID, tableId(node0, tableName)); + assertEquals(zoneId, zoneId(node0, zoneName)); waitForValueInLocalMs(node0.metaStorageManager(), assignmentsKey); @@ -1828,8 +1847,10 @@ private Set getAssignmentsFromMetaStorage(MetaStorageManager metaSto : Assignments.fromBytes(e.value()).nodes(); } - private int tableId(Ignite node, String tableName) { - return (unwrapTableImpl(node.tables().table(tableName))).tableId(); + private int zoneId(IgniteImpl node, String zoneName) { + int zoneId = getZoneIdStrict(node.catalogManager(), zoneName.toUpperCase(), node.clock().nowLong()); + + return zoneId; } /** diff --git 
a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/table/ItDurableFinishTest.java b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/table/ItDurableFinishTest.java index a36a3886854..57063c7fb2c 100644 --- a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/table/ItDurableFinishTest.java +++ b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/table/ItDurableFinishTest.java @@ -50,6 +50,7 @@ import org.apache.ignite.internal.replicator.TablePartitionId; import org.apache.ignite.internal.replicator.configuration.ReplicationConfiguration; import org.apache.ignite.internal.testframework.IgniteTestUtils; +import org.apache.ignite.internal.testframework.WithSystemProperty; import org.apache.ignite.internal.tx.InternalTransaction; import org.apache.ignite.internal.tx.MismatchingTransactionOutcomeException; import org.apache.ignite.internal.tx.TxMeta; @@ -70,6 +71,7 @@ /** * Test resending the finish request from the coordinator when the previous attempts failed for any reason. 
*/ +@WithSystemProperty(key = "IGNITE_ALWAYS_FORCE", value = "false") public class ItDurableFinishTest extends ClusterPerTestIntegrationTest { private static final int AWAIT_PRIMARY_REPLICA_TIMEOUT = 10; diff --git a/modules/runner/src/main/java/org/apache/ignite/internal/app/IgniteImpl.java b/modules/runner/src/main/java/org/apache/ignite/internal/app/IgniteImpl.java index c90b014454a..f17f670cdfa 100644 --- a/modules/runner/src/main/java/org/apache/ignite/internal/app/IgniteImpl.java +++ b/modules/runner/src/main/java/org/apache/ignite/internal/app/IgniteImpl.java @@ -64,6 +64,7 @@ import org.apache.ignite.internal.catalog.CatalogManager; import org.apache.ignite.internal.catalog.CatalogManagerImpl; import org.apache.ignite.internal.catalog.configuration.SchemaSynchronizationConfiguration; +import org.apache.ignite.internal.catalog.descriptors.CatalogTableDescriptor; import org.apache.ignite.internal.catalog.sql.IgniteCatalogSqlImpl; import org.apache.ignite.internal.catalog.storage.UpdateLogImpl; import org.apache.ignite.internal.cluster.management.ClusterInitializer; @@ -157,8 +158,10 @@ import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupServiceFactory; import org.apache.ignite.internal.raft.configuration.RaftConfiguration; import org.apache.ignite.internal.raft.storage.impl.VolatileLogStorageFactoryCreator; +import org.apache.ignite.internal.replicator.ReplicaAwareLeaseTracker; import org.apache.ignite.internal.replicator.ReplicaManager; import org.apache.ignite.internal.replicator.ReplicaService; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.replicator.configuration.ReplicationConfiguration; import org.apache.ignite.internal.rest.RestComponent; import org.apache.ignite.internal.rest.RestFactory; @@ -592,6 +595,19 @@ public class IgniteImpl implements Ignite { Consumer>> registry = c -> metaStorageMgr.registerRevisionUpdateListener(c::apply); + ReplicationConfiguration replicationConfig = 
clusterConfigRegistry.getConfiguration(ReplicationConfiguration.KEY); + + LongSupplier partitionIdleSafeTimePropagationPeriodMsSupplier = partitionIdleSafeTimePropagationPeriodMsSupplier(replicationConfig); + + LongSupplier delayDurationMsSupplier = delayDurationMsSupplier(schemaSyncConfig); + + CatalogManagerImpl catalogManager = new CatalogManagerImpl( + new UpdateLogImpl(metaStorageMgr), + clockService, + delayDurationMsSupplier, + partitionIdleSafeTimePropagationPeriodMsSupplier + ); + placementDriverMgr = new PlacementDriverManager( name, metaStorageMgr, @@ -601,10 +617,18 @@ public class IgniteImpl implements Ignite { logicalTopologyService, raftMgr, topologyAwareRaftGroupServiceFactory, - clockService - ); + clockService, + tablePartId -> { + CatalogTableDescriptor tbl = catalogManager.table(tablePartId.tableId(), catalogManager.latestCatalogVersion()); - ReplicationConfiguration replicationConfig = clusterConfigRegistry.getConfiguration(ReplicationConfiguration.KEY); + int zoneId = tbl == null ? 
2 : tbl.zoneId(); + + return new ZonePartitionId( + zoneId, + tablePartId.partitionId() + ); + } + ); ReplicaService replicaSvc = new ReplicaService( messagingServiceReturningToStorageOperationsPool, @@ -613,8 +637,6 @@ public class IgniteImpl implements Ignite { replicationConfig ); - LongSupplier partitionIdleSafeTimePropagationPeriodMsSupplier = partitionIdleSafeTimePropagationPeriodMsSupplier(replicationConfig); - replicaMgr = new ReplicaManager( name, clusterSvc, @@ -658,16 +680,6 @@ public class IgniteImpl implements Ignite { volatileLogStorageFactoryCreator = new VolatileLogStorageFactoryCreator(name, workDir.resolve("volatile-log-spillout")); outgoingSnapshotsManager = new OutgoingSnapshotsManager(name, clusterSvc.messagingService()); - - LongSupplier delayDurationMsSupplier = delayDurationMsSupplier(schemaSyncConfig); - - CatalogManagerImpl catalogManager = new CatalogManagerImpl( - new UpdateLogImpl(metaStorageMgr), - clockService, - delayDurationMsSupplier, - partitionIdleSafeTimePropagationPeriodMsSupplier - ); - systemViewManager = new SystemViewManagerImpl(name, catalogManager); nodeAttributesCollector.register(systemViewManager); logicalTopology.addEventListener(systemViewManager); @@ -800,13 +812,16 @@ public class IgniteImpl implements Ignite { lowWatermark ); + ReplicaAwareLeaseTracker replicaAwarePlacementDriver = new ReplicaAwareLeaseTracker(placementDriverMgr.placementDriver(), + replicaSvc, clusterSvc.topologyService()); + indexBuildingManager = new IndexBuildingManager( name, replicaSvc, catalogManager, metaStorageMgr, indexManager, - placementDriverMgr.placementDriver(), + replicaAwarePlacementDriver, clusterSvc, logicalTopologyService, clockService @@ -827,7 +842,7 @@ public class IgniteImpl implements Ignite { systemViewManager, failureProcessor, partitionIdleSafeTimePropagationPeriodMsSupplier, - placementDriverMgr.placementDriver(), + replicaAwarePlacementDriver, clusterConfigRegistry.getConfiguration(SqlDistributedConfiguration.KEY), 
nodeConfigRegistry.getConfiguration(SqlLocalConfiguration.KEY), transactionInflights @@ -859,7 +874,7 @@ public class IgniteImpl implements Ignite { ); compute = new IgniteComputeImpl( - placementDriverMgr.placementDriver(), + replicaAwarePlacementDriver, clusterSvc.topologyService(), distributedTblMgr, computeComponent, diff --git a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/SqlQueryProcessor.java b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/SqlQueryProcessor.java index 160bd8fab54..7d9725ad07e 100644 --- a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/SqlQueryProcessor.java +++ b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/SqlQueryProcessor.java @@ -74,6 +74,7 @@ import org.apache.ignite.internal.replicator.ReplicaService; import org.apache.ignite.internal.replicator.ReplicationGroupId; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.schema.SchemaManager; import org.apache.ignite.internal.sql.api.ResultSetMetadataImpl; import org.apache.ignite.internal.sql.configuration.distributed.SqlDistributedConfiguration; @@ -408,8 +409,10 @@ private CompletableFuture> primaryReplicas(Ignite int partitionId = partId; ReplicationGroupId partGroupId = new TablePartitionId(table.id(), partitionId); - CompletableFuture f = placementDriver.awaitPrimaryReplica( - partGroupId, + ZonePartitionId zonePartitionId = new ZonePartitionId(table.zoneId(), table.id(), partId); + + CompletableFuture f = placementDriver.awaitPrimaryReplicaForTable( + zonePartitionId, clockNow, AWAIT_PRIMARY_REPLICA_TIMEOUT, SECONDS diff --git a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImpl.java b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImpl.java index 7c28ac249e7..ae70d0777a4 
100644 --- a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImpl.java +++ b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImpl.java @@ -43,7 +43,7 @@ import org.apache.ignite.internal.cluster.management.topology.api.LogicalTopologySnapshot; import org.apache.ignite.internal.lang.IgniteInternalException; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.sql.engine.exec.mapping.MappingServiceImpl.LogicalTopologyHolder.TopologySnapshot; import org.apache.ignite.internal.sql.engine.prepare.Fragment; import org.apache.ignite.internal.sql.engine.prepare.MultiStepPlan; @@ -114,9 +114,9 @@ public CompletableFuture> map(MultiStepPlan multiStepPlan, /** Called when the primary replica has expired. */ public CompletableFuture onPrimaryReplicaExpired(PrimaryReplicaEventParameters parameters) { assert parameters != null; - assert parameters.groupId() instanceof TablePartitionId; + assert parameters.groupId() instanceof ZonePartitionId; - int tabId = ((TablePartitionId) parameters.groupId()).tableId(); + int tabId = ((ZonePartitionId) parameters.groupId()).tableId(); // TODO https://issues.apache.org/jira/browse/IGNITE-21201 Move complex computations to a different thread. 
mappingsCache.removeIfValue(value -> value.tableIds.contains(tabId)); diff --git a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTable.java b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTable.java index a7974f3c29f..c827145e736 100644 --- a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTable.java +++ b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTable.java @@ -72,4 +72,11 @@ public interface IgniteTable extends IgniteDataSource { * @return Number of partitions. */ int partitions(); + + /** + * Returns the zone id of this table. + * + * @return Zone id. + */ + int zoneId(); } diff --git a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTableImpl.java b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTableImpl.java index 170f23861d6..38dada9742b 100644 --- a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTableImpl.java +++ b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTableImpl.java @@ -48,6 +48,8 @@ public class IgniteTableImpl extends AbstractIgniteDataSource implements IgniteT private final int partitions; + private final int zoneId; + private final Lazy colocationColumnTypes; /** Constructor. 
*/ @@ -59,13 +61,15 @@ public IgniteTableImpl( ImmutableIntList keyColumns, Statistic statistic, Map indexMap, - int partitions + int partitions, + int zoneId ) { super(name, id, version, desc, statistic); this.keyColumns = keyColumns; this.indexMap = indexMap; this.partitions = partitions; + this.zoneId = zoneId; this.columnsToInsert = deriveColumnsToInsert(desc); colocationColumnTypes = new Lazy<>(this::evaluateTypes); @@ -155,6 +159,11 @@ public int partitions() { return partitions; } + @Override + public int zoneId() { + return zoneId; + } + @Override public ImmutableIntList keyColumns() { return keyColumns; diff --git a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/SqlSchemaManagerImpl.java b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/SqlSchemaManagerImpl.java index 2ab61fbe201..5176a22fb4c 100644 --- a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/SqlSchemaManagerImpl.java +++ b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/SqlSchemaManagerImpl.java @@ -384,6 +384,7 @@ private static IgniteTable createTable( ) { int tableId = catalogTableDescriptor.id(); String tableName = catalogTableDescriptor.name(); + int zoneId = catalogTableDescriptor.zoneId(); // TODO IGNITE-19558: The table is not available at planning stage. // Let's fix table statistics keeping in mind IGNITE-19558 issue. 
@@ -402,7 +403,8 @@ private static IgniteTable createTable( primaryIndex.collation().getKeys(), statistic, indexes, - parititions + parititions, + zoneId ); } } diff --git a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/ExecutableTableRegistrySelfTest.java b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/ExecutableTableRegistrySelfTest.java index b2bc6ca5de9..672169a7c7a 100644 --- a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/ExecutableTableRegistrySelfTest.java +++ b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/ExecutableTableRegistrySelfTest.java @@ -158,7 +158,7 @@ CompletableFuture getTable(int tableId) { when(descriptor.iterator()).thenReturn(Collections.emptyIterator()); IgniteTable sqlTable = new IgniteTableImpl( - table.name(), tableId, tableVersion, descriptor, ImmutableIntList.of(0), new TestStatistic(1_000.0), Map.of(), 1 + table.name(), tableId, tableVersion, descriptor, ImmutableIntList.of(0), new TestStatistic(1_000.0), Map.of(), 1, 123 ); when(sqlSchemaManager.table(schemaVersion, tableId)).thenReturn(sqlTable); diff --git a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImplTest.java b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImplTest.java index 4979f140572..43f416b9f02 100644 --- a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImplTest.java +++ b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImplTest.java @@ -40,12 +40,12 @@ import java.util.stream.Collectors; import org.apache.ignite.internal.catalog.Catalog; import org.apache.ignite.internal.catalog.CatalogService; -import org.apache.ignite.internal.catalog.descriptors.CatalogObjectDescriptor; +import 
org.apache.ignite.internal.catalog.descriptors.CatalogTableDescriptor; import org.apache.ignite.internal.cluster.management.topology.api.LogicalNode; import org.apache.ignite.internal.cluster.management.topology.api.LogicalTopologySnapshot; import org.apache.ignite.internal.hlc.HybridTimestamp; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.sql.engine.framework.TestBuilders; import org.apache.ignite.internal.sql.engine.framework.TestCluster; import org.apache.ignite.internal.sql.engine.prepare.MultiStepPlan; @@ -235,15 +235,14 @@ public void testCacheInvalidationOnPrimaryExpiration() { CatalogService catalogService = cluster.catalogManager(); Catalog catalog = catalogService.catalog(catalogService.latestCatalogVersion()); - Optional tblId = catalog.tables().stream() + Optional tblDesc = catalog.tables().stream() .filter(desc -> name.equals(desc.name())) - .findFirst() - .map(CatalogObjectDescriptor::id); + .findFirst(); - assertTrue(tblId.isPresent()); + assertTrue(tblDesc.isPresent()); return new PrimaryReplicaEventParameters( - 0, new TablePartitionId(tblId.get(), 0), "ignored", "ignored", HybridTimestamp.MIN_VALUE); + 0, new ZonePartitionId(tblDesc.get().zoneId(), tblDesc.get().id(), 0), "ignored", "ignored", HybridTimestamp.MIN_VALUE); }; // Initialize mapping service. 
diff --git a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/rel/TableScanNodeExecutionTest.java b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/rel/TableScanNodeExecutionTest.java index 5c12103b02a..59cae1afa8d 100644 --- a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/rel/TableScanNodeExecutionTest.java +++ b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/rel/TableScanNodeExecutionTest.java @@ -242,6 +242,7 @@ private static class TestInternalTableImpl extends InternalTableImpl { super( "test", 1, + 123, PART_CNT, new SingleClusterNodeResolver(mock(ClusterNode.class)), txManager, diff --git a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/framework/TestBuilders.java b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/framework/TestBuilders.java index 4194a187f19..de0160615c7 100644 --- a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/framework/TestBuilders.java +++ b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/framework/TestBuilders.java @@ -807,6 +807,7 @@ private static class TableBuilderImpl implements TableBuilder { private int size = 100_000; private Integer tableId; private int partitions = CatalogUtils.DEFAULT_PARTITION_COUNT; + private int zoneId = 123; /** {@inheritDoc} */ @Override @@ -929,7 +930,8 @@ public IgniteTable build() { findPrimaryKey(tableDescriptor, indexes.values()), new TestStatistic(size), indexes, - partitions + partitions, + zoneId ); } } diff --git a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/prepare/TypeCoercionTest.java b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/prepare/TypeCoercionTest.java index ce236bdf17b..3649e650d7a 100644 --- a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/prepare/TypeCoercionTest.java +++ 
b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/prepare/TypeCoercionTest.java @@ -664,6 +664,11 @@ public int partitions() { return 1; } + @Override + public int zoneId() { + return 123; + } + @Override public String name() { return name; diff --git a/modules/storage-api/src/test/java/org/apache/ignite/internal/storage/index/StorageIndexDescriptorTest.java b/modules/storage-api/src/test/java/org/apache/ignite/internal/storage/index/StorageIndexDescriptorTest.java index ca2913f8faf..cc7b31a16b5 100644 --- a/modules/storage-api/src/test/java/org/apache/ignite/internal/storage/index/StorageIndexDescriptorTest.java +++ b/modules/storage-api/src/test/java/org/apache/ignite/internal/storage/index/StorageIndexDescriptorTest.java @@ -76,6 +76,7 @@ private static CatalogHashIndexDescriptor createHashIndexDescriptor(int indexId, false, AVAILABLE, 1, + 0, List.of(COLUMN_NAME) ); } @@ -88,6 +89,7 @@ private static CatalogSortedIndexDescriptor createSortedIndexDescriptor(int inde false, AVAILABLE, 1, + 0, List.of(new CatalogIndexColumnDescriptor(COLUMN_NAME, ASC_NULLS_FIRST)) ); } diff --git a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/AbstractMvTableStorageTest.java b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/AbstractMvTableStorageTest.java index 7b661792838..10e1d0bb45f 100644 --- a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/AbstractMvTableStorageTest.java +++ b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/AbstractMvTableStorageTest.java @@ -366,6 +366,7 @@ public void testDestroySortedIndexIndependence() { false, AVAILABLE, catalogService.latestCatalogVersion(), + 0, List.of(new CatalogIndexColumnDescriptor("STRKEY", ASC_NULLS_LAST)) ); @@ -376,6 +377,7 @@ public void testDestroySortedIndexIndependence() { false, AVAILABLE, catalogService.latestCatalogVersion(), + 0, List.of(new CatalogIndexColumnDescriptor("STRKEY", 
ASC_NULLS_LAST)) ); @@ -419,6 +421,7 @@ public void testDestroyHashIndexIndependence() { true, AVAILABLE, catalogService.latestCatalogVersion(), + 0, List.of("STRKEY") ); @@ -429,6 +432,7 @@ public void testDestroyHashIndexIndependence() { true, AVAILABLE, catalogService.latestCatalogVersion(), + 0, List.of("STRKEY") ); @@ -1056,6 +1060,7 @@ private static void createTestTableAndIndexes(CatalogService catalogService) { false, AVAILABLE, catalogService.latestCatalogVersion(), + 0, List.of(new CatalogIndexColumnDescriptor("STRKEY", ASC_NULLS_LAST)) ); @@ -1066,6 +1071,7 @@ private static void createTestTableAndIndexes(CatalogService catalogService) { true, AVAILABLE, catalogService.latestCatalogVersion(), + 0, List.of("STRKEY") ); @@ -1076,6 +1082,7 @@ private static void createTestTableAndIndexes(CatalogService catalogService) { true, AVAILABLE, catalogService.latestCatalogVersion(), + 0, List.of(pkColumnName) ); diff --git a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractHashIndexStorageTest.java b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractHashIndexStorageTest.java index 7a0b604629c..3f140d41730 100644 --- a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractHashIndexStorageTest.java +++ b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractHashIndexStorageTest.java @@ -74,6 +74,7 @@ CatalogHashIndexDescriptor createCatalogIndexDescriptor(int tableId, int indexId false, AVAILABLE, catalogService.latestCatalogVersion(), + 0, Stream.of(columnTypes).map(AbstractIndexStorageTest::columnName).collect(toList()) ); diff --git a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractSortedIndexStorageTest.java b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractSortedIndexStorageTest.java index 99b9fe07055..df1855b6d1c 100644 
--- a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractSortedIndexStorageTest.java +++ b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractSortedIndexStorageTest.java @@ -188,6 +188,7 @@ private CatalogSortedIndexDescriptor createCatalogIndexDescriptor( false, AVAILABLE, catalogService.latestCatalogVersion(), + 0, List.of(columns) ); diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ItTxDistributedTestThreeNodesThreeReplicas.java b/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ItTxDistributedTestThreeNodesThreeReplicas.java index 333ae059a90..4cd561bde70 100644 --- a/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ItTxDistributedTestThreeNodesThreeReplicas.java +++ b/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ItTxDistributedTestThreeNodesThreeReplicas.java @@ -76,6 +76,9 @@ public void testPrimaryReplicaDirectUpdateForExplicitTxn() throws InterruptedExc JraftServerImpl server = (JraftServerImpl) txTestCluster.raftServers.get(leader.consistentId()).server(); var groupId = new TablePartitionId(accounts.tableId(), 0); + // TODO: IGNITE-20362 It needs to be done before the message blocking to update lease subgroups. + accounts.recordView().insert(null, makeValue(1, 500.0)); + // BLock replication messages to both replicas. 
server.blockMessages(new RaftNodeId(groupId, leader), (msg, peerId) -> { if (msg instanceof RpcRequests.AppendEntriesRequest) { diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ReplicaUnavailableTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ReplicaUnavailableTest.java index f5f4fede066..d9843bae18f 100644 --- a/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ReplicaUnavailableTest.java +++ b/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ReplicaUnavailableTest.java @@ -27,6 +27,7 @@ import static org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willSucceedFast; import static org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willSucceedIn; import static org.apache.ignite.internal.util.CompletableFutures.emptySetCompletedFuture; +import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture; import static org.apache.ignite.internal.util.ExceptionUtils.unwrapCause; import static org.apache.ignite.lang.ErrorGroups.Replicator.REPLICA_TIMEOUT_ERR; import static org.apache.ignite.raft.jraft.test.TestUtils.getLocalAddress; @@ -35,6 +36,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -62,6 +64,7 @@ import org.apache.ignite.internal.replicator.ReplicaResult; import org.apache.ignite.internal.replicator.ReplicaService; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.replicator.configuration.ReplicationConfiguration; import org.apache.ignite.internal.replicator.exception.ReplicaStoppingException; import 
org.apache.ignite.internal.replicator.exception.ReplicationException; @@ -180,6 +183,7 @@ public void testWithReplicaStartedAfterRequestSending() throws Exception { ClusterNode clusterNode = clusterService.topologyService().localMember(); TablePartitionId tablePartitionId = new TablePartitionId(1, 1); + ZonePartitionId zonePartitionId = new ZonePartitionId(1, 1); ReadWriteSingleRowReplicaRequest request = getRequest(tablePartitionId); @@ -188,12 +192,17 @@ public void testWithReplicaStartedAfterRequestSending() throws Exception { try { log.info("Replica msg " + message.getClass().getSimpleName()); + var mockRaftClient = mock(TopologyAwareRaftGroupService.class); + when(mockRaftClient.readIndex()).thenReturn(completedFuture(-1L)); + when(mockRaftClient.run(any())).thenReturn(nullCompletedFuture()); + replicaManager.startReplica( tablePartitionId, + zonePartitionId, (request0, senderId) -> completedFuture(new ReplicaResult(replicaMessageFactory.replicaResponse() .result(5) .build(), null)), - mock(TopologyAwareRaftGroupService.class), + mockRaftClient, new PendingComparableValuesTracker<>(0L) ); } catch (NodeStoppingException e) { @@ -295,16 +304,22 @@ public void testWithNotReadyReplica() { ClusterNode clusterNode = clusterService.topologyService().localMember(); TablePartitionId tablePartitionId = new TablePartitionId(1, 1); + ZonePartitionId zonePartitionId = new ZonePartitionId(1, 1); clusterService.messagingService().addMessageHandler(ReplicaMessageGroup.class, (message, sender, correlationId) -> { runAsync(() -> { try { log.info("Replica msg " + message.getClass().getSimpleName()); + var mockRaftClient = mock(TopologyAwareRaftGroupService.class); + when(mockRaftClient.readIndex()).thenReturn(completedFuture(-1L)); + when(mockRaftClient.run(any())).thenReturn(nullCompletedFuture()); + replicaManager.startReplica( tablePartitionId, + zonePartitionId, (request, senderId) -> new CompletableFuture<>(), - mock(TopologyAwareRaftGroupService.class), + mockRaftClient, 
new PendingComparableValuesTracker<>(0L) ); } catch (NodeStoppingException e) { diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java index 341658cdf6a..bb2ec1a0036 100644 --- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java +++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java @@ -374,8 +374,7 @@ private void waitForScale(IgniteImpl node, int targetDataNodesCount) throws Inte assertTrue(IgniteTestUtils.waitForCondition(() -> { long causalityToken = node.metaStorageManager().appliedRevision(); - long msSafeTime = node.metaStorageManager().timestampByRevision(causalityToken).longValue(); - int catalogVersion = node.catalogManager().activeCatalogVersion(msSafeTime); + int catalogVersion = node.catalogManager().latestCatalogVersion(); CompletableFuture> dataNodes = dzManager.dataNodes(causalityToken, catalogVersion, zoneId); diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/raftsnapshot/ItTableRaftSnapshotsTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/raftsnapshot/ItTableRaftSnapshotsTest.java index f95ac9cc6b9..8ace6f51702 100644 --- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/raftsnapshot/ItTableRaftSnapshotsTest.java +++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/raftsnapshot/ItTableRaftSnapshotsTest.java @@ -71,6 +71,7 @@ import org.apache.ignite.internal.table.distributed.schema.PartitionCommandsMarshallerImpl; import org.apache.ignite.internal.test.WatchListenerInhibitor; import org.apache.ignite.internal.testframework.IgniteTestUtils; +import org.apache.ignite.internal.testframework.WithSystemProperty; import 
org.apache.ignite.internal.testframework.WorkDirectory; import org.apache.ignite.internal.testframework.log4j2.LogInspector; import org.apache.ignite.internal.testframework.log4j2.LogInspector.Handler; @@ -472,6 +473,7 @@ void entriesKeepAppendedDuringSnapshotInstallation() throws Exception { * (and can install a RAFT snapshot on the ex-leader). */ @Test + @WithSystemProperty(key = "IGNITE_ALWAYS_FORCE", value = "false") void nodeCanInstallSnapshotsAfterSnapshotInstalledToIt() throws Exception { feedNode2WithSnapshotOfOneRow(); diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceDistributedTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceDistributedTest.java index 0c742a98016..b2459f87e23 100644 --- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceDistributedTest.java +++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceDistributedTest.java @@ -25,6 +25,7 @@ import static org.apache.ignite.internal.TestDefaultProfilesNames.DEFAULT_TEST_PROFILE_NAME; import static org.apache.ignite.internal.catalog.CatalogService.DEFAULT_SCHEMA_NAME; import static org.apache.ignite.internal.catalog.CatalogService.DEFAULT_STORAGE_PROFILE; +import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.getZoneIdStrict; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.REBALANCE_SCHEDULER_POOL_SIZE; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.STABLE_ASSIGNMENTS_PREFIX; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractPartitionNumber; @@ -161,6 +162,7 @@ import org.apache.ignite.internal.replicator.ReplicaManager; import org.apache.ignite.internal.replicator.ReplicaService; import org.apache.ignite.internal.replicator.TablePartitionId; +import 
org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.replicator.configuration.ReplicationConfiguration; import org.apache.ignite.internal.rest.configuration.RestConfiguration; import org.apache.ignite.internal.schema.SchemaManager; @@ -675,7 +677,7 @@ void testRaftClientsUpdatesAfterRebalance() throws Exception { // Write the new assignments to metastore as a pending assignments. { - TablePartitionId partId = new TablePartitionId(getTableId(node, TABLE_NAME), 0); + ZonePartitionId partId = new ZonePartitionId(getZoneIdStrict(node.catalogManager, ZONE_NAME, node.hybridClock.nowLong()), 0); ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(partId); @@ -733,7 +735,7 @@ void testClientsAreUpdatedAfterPendingRebalanceHandled() throws Exception { Set newAssignment = Set.of(Assignment.forPeer(newNodeNameForAssignment)); // Write the new assignments to metastore as a pending assignments. - TablePartitionId partId = new TablePartitionId(getTableId(node, TABLE_NAME), 0); + ZonePartitionId partId = new ZonePartitionId(getZoneIdStrict(node.catalogManager, ZONE_NAME, node.hybridClock.nowLong()), 0); ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(partId); @@ -800,7 +802,7 @@ private void directUpdateMetastoreRebalanceAssignmentKeys() throws Exception { Node node0 = getNode(0); - TablePartitionId partId = new TablePartitionId(getTableId(node0, TABLE_NAME), 0); + ZonePartitionId partId = new ZonePartitionId(getZoneIdStrict(node0.catalogManager, ZONE_NAME, node0.hybridClock.nowLong()), 0); ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(partId); ByteArray partAssignmentsPlannedKey = plannedPartAssignmentsKey(partId); @@ -818,7 +820,7 @@ private void verifyThatRaftNodesAndReplicasWereStartedOnlyOnce() throws Exceptio verify(getNode(i).raftManager, timeout(AWAIT_TIMEOUT_MILLIS).times(1)) .startRaftGroupNodeWithoutService(any(), any(), any(), any(), any(RaftGroupOptions.class)); 
verify(getNode(i).replicaManager, timeout(AWAIT_TIMEOUT_MILLIS).times(1)) - .startReplica(any(), any(), any(), any()); + .startReplica(any(), any(), any(), any(), any()); } } @@ -878,40 +880,40 @@ private static Set getPartitionClusterNodes(Node node, int partNum) } private static Set getPartitionClusterNodes(Node node, String tableName, int partNum) { - return Optional.ofNullable(getTableId(node, tableName)) - .map(tableId -> partitionAssignments(node.metaStorageManager, tableId, partNum).join()) + return Optional.ofNullable(getTableZoneId(node, tableName)) + .map(zoneId -> partitionAssignments(node.metaStorageManager, zoneId, partNum) + .thenApply(a -> a == null ? Set.of() : a).join() + ) .orElse(Set.of()); } private static Set getPartitionPendingClusterNodes(Node node, int partNum) { - return Optional.ofNullable(getTableId(node, TABLE_NAME)) - .map(tableId -> partitionPendingAssignments(node.metaStorageManager, tableId, partNum).join()) - .orElse(Set.of()); + return partitionPendingAssignments(node.metaStorageManager, getZoneId(node, ZONE_NAME), partNum) + .thenApply(a -> a == null ? Set.of() : a).join(); } private static Set getPartitionPlannedClusterNodes(Node node, int partNum) { - return Optional.ofNullable(getTableId(node, TABLE_NAME)) - .map(tableId -> partitionPlannedAssignments(node.metaStorageManager, tableId, partNum).join()) - .orElse(Set.of()); + return partitionPlannedAssignments(node.metaStorageManager, getZoneId(node, ZONE_NAME), partNum) + .thenApply(a -> a == null ? Set.of() : a).join(); } private static CompletableFuture> partitionPendingAssignments( MetaStorageManager metaStorageManager, - int tableId, + int zoneId, int partitionNumber ) { return metaStorageManager - .get(pendingPartAssignmentsKey(new TablePartitionId(tableId, partitionNumber))) + .get(pendingPartAssignmentsKey(new ZonePartitionId(zoneId, partitionNumber))) .thenApply(e -> (e.value() == null) ? 
null : Assignments.fromBytes(e.value()).nodes()); } private static CompletableFuture> partitionPlannedAssignments( MetaStorageManager metaStorageManager, - int tableId, + int zoneId, int partitionNumber ) { return metaStorageManager - .get(plannedPartAssignmentsKey(new TablePartitionId(tableId, partitionNumber))) + .get(plannedPartAssignmentsKey(new ZonePartitionId(zoneId, partitionNumber))) .thenApply(e -> (e.value() == null) ? null : Assignments.fromBytes(e.value()).nodes()); } @@ -1562,6 +1564,12 @@ private static void createTable(Node node, String zoneName, String tableName) { return TableTestUtils.getTableId(node.catalogManager, tableName, node.hybridClock.nowLong()); } + private static @Nullable Integer getTableZoneId(Node node, String tableName) { + CatalogTableDescriptor tblDesc = TableTestUtils.getTable(node.catalogManager, tableName, node.hybridClock.nowLong()); + + return tblDesc == null ? null : tblDesc.zoneId(); + } + private Node getNode(int nodeIndex) { return nodes.get(nodeIndex); } @@ -1577,4 +1585,8 @@ private void checkPartitionNodes(String tableName, int partitionId, int expNodeC assertEquals(expNodeCount, getPartitionClusterNodes(node, tableName, partitionId).size(), node.name); } } + + private static int getZoneId(Node node, String zoneName) { + return getZoneIdStrict(node.catalogManager, zoneName, node.hybridClock.nowLong()); + } } diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTest.java index 97a81caf0ff..64c414be8c5 100644 --- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTest.java +++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTest.java @@ -220,8 +220,11 @@ private void waitForStableAssignmentsInMetastore(Set expectedNodes, int Set[] lastAssignmentsHolderForLog = new Set[1]; 
assertTrue(waitForCondition(() -> { + int zoneId = cluster.aliveNode().catalogManager().table(table, cluster.aliveNode().catalogManager().latestCatalogVersion()) + .zoneId(); + Set assignments = - await(partitionAssignments(cluster.aliveNode().metaStorageManager(), table, 0)) + await(partitionAssignments(cluster.aliveNode().metaStorageManager(), zoneId, 0)) .stream() .map(Assignment::consistentId) .collect(Collectors.toSet()); @@ -236,8 +239,11 @@ private void waitForStableAssignmentsInMetastore(int expectedNodesNumber, int ta Set[] lastAssignmentsHolderForLog = new Set[1]; assertTrue(waitForCondition(() -> { + int zoneId = cluster.aliveNode().catalogManager().table(table, cluster.aliveNode().catalogManager().latestCatalogVersion()) + .zoneId(); + Set assignments = - await(partitionAssignments(cluster.aliveNode().metaStorageManager(), table, 0)) + await(partitionAssignments(cluster.aliveNode().metaStorageManager(), zoneId, 0)) .stream() .map(Assignment::consistentId) .collect(Collectors.toSet()); diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTriggersRecoveryTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTriggersRecoveryTest.java index 24357590de0..6d977a93381 100644 --- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTriggersRecoveryTest.java +++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTriggersRecoveryTest.java @@ -19,8 +19,9 @@ import static org.apache.ignite.internal.TestWrappers.unwrapTableManager; import static org.apache.ignite.internal.catalog.CatalogService.DEFAULT_STORAGE_PROFILE; +import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.getZoneId; +import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.getZoneIdStrict; import static 
org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.pendingPartAssignmentsKey; -import static org.apache.ignite.internal.table.TableTestUtils.getTableId; import static org.apache.ignite.internal.testframework.IgniteTestUtils.bypassingThreadAssertions; import static org.apache.ignite.internal.testframework.IgniteTestUtils.waitForCondition; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -38,7 +39,7 @@ import org.apache.ignite.internal.app.IgniteImpl; import org.apache.ignite.internal.hlc.HybridClockImpl; import org.apache.ignite.internal.metastorage.MetaStorageManager; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.storage.MvPartitionStorage; import org.apache.ignite.internal.table.distributed.TableManager; import org.apache.ignite.internal.test.WatchListenerInhibitor; @@ -115,10 +116,10 @@ void testRebalanceTriggersRecoveryAfterFilterUpdate() throws InterruptedExceptio 10_000)); // Remove the pending keys in a barbarian way. So, the rebalance can be triggered only by the recovery logic now. - Integer tableId = getTableId(node(0).catalogManager(), "TEST", new HybridClockImpl().nowLong()); + int zoneId = getZoneIdStrict(node(0).catalogManager(), "TEST_ZONE", node(0).clock().nowLong()); node(0) .metaStorageManager() - .remove(pendingPartAssignmentsKey(new TablePartitionId(tableId, 0))).join(); + .remove(pendingPartAssignmentsKey(new ZonePartitionId(zoneId, 0))).join(); restartNode(1); restartNode(2); @@ -160,10 +161,10 @@ void testRebalanceTriggersRecoveryAfterReplicasUpdate() throws InterruptedExcept 10_000)); // Remove the pending keys in a barbarian way. So, the rebalance can be triggered only by the recovery logic now. 
- Integer tableId = getTableId(node(0).catalogManager(), "TEST", new HybridClockImpl().nowLong()); + int zoneId = getZoneIdStrict(node(0).catalogManager(), "TEST_ZONE", node(0).clock().nowLong()); node(0) .metaStorageManager() - .remove(pendingPartAssignmentsKey(new TablePartitionId(tableId, 0))).join(); + .remove(pendingPartAssignmentsKey(new ZonePartitionId(zoneId, 0))).join(); restartNode(1); restartNode(2); @@ -202,22 +203,22 @@ void testRebalanceTriggersRecoveryWhenUpdatesWereProcessedByAnotherNodesAlready( (() -> getPartitionPendingClusterNodes(node(0), 0).equals(Set.of())), 10_000)); - TablePartitionId tablePartitionId = - new TablePartitionId( - getTableId(node(0).catalogManager(), - "TEST", + ZonePartitionId zonePartitionId = + new ZonePartitionId( + getZoneIdStrict(node(0).catalogManager(), + "TEST_ZONE", new HybridClockImpl().nowLong()), 0 ); long pendingsKeysRevisionBeforeRecovery = node(0).metaStorageManager() - .get(pendingPartAssignmentsKey(tablePartitionId)) + .get(pendingPartAssignmentsKey(zonePartitionId)) .get(10, TimeUnit.SECONDS).revision(); startNode(3, GLOBAL_NODE_BOOTSTRAP_CFG_TEMPLATE); long pendingsKeysRevisionAfterRecovery = node(0).metaStorageManager() - .get(pendingPartAssignmentsKey(tablePartitionId)) + .get(pendingPartAssignmentsKey(zonePartitionId)) .get(10, TimeUnit.SECONDS).revision(); // Check that recovered node doesn't produce new rebalances for already processed triggers. 
@@ -225,18 +226,18 @@ void testRebalanceTriggersRecoveryWhenUpdatesWereProcessedByAnotherNodesAlready( } private static Set getPartitionPendingClusterNodes(IgniteImpl node, int partNum) { - return Optional.ofNullable(getTableId(node.catalogManager(), "TEST", new HybridClockImpl().nowLong())) - .map(tableId -> partitionPendingAssignments(node.metaStorageManager(), tableId, partNum).join()) + return Optional.ofNullable(getZoneId(node.catalogManager(), "TEST_ZONE", new HybridClockImpl().nowLong())) + .map(zoneId -> partitionPendingAssignments(node.metaStorageManager(), zoneId, partNum).join()) .orElse(Set.of()); } private static CompletableFuture> partitionPendingAssignments( MetaStorageManager metaStorageManager, - int tableId, + int zoneId, int partitionNumber ) { return metaStorageManager - .get(pendingPartAssignmentsKey(new TablePartitionId(tableId, partitionNumber))) + .get(pendingPartAssignmentsKey(new ZonePartitionId(zoneId, partitionNumber))) .thenApply(e -> (e.value() == null) ? null : Assignments.fromBytes(e.value()).nodes()); } diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItColocationTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItColocationTest.java index 36e61a1d872..de78916c02c 100644 --- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItColocationTest.java +++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItColocationTest.java @@ -291,6 +291,7 @@ public CompletableFuture finish( intTable = new InternalTableImpl( "PUBLIC.TEST", tblId, + 123, PARTS, new SingleClusterNodeResolver(clusterNode), txManager, diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTransactionPrimaryChangeTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTransactionPrimaryChangeTest.java index 08b79a07acb..17eb7fa7476 100644 --- 
a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTransactionPrimaryChangeTest.java +++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTransactionPrimaryChangeTest.java @@ -35,6 +35,7 @@ import org.apache.ignite.internal.app.IgniteImpl; import org.apache.ignite.internal.replicator.TablePartitionId; import org.apache.ignite.internal.table.distributed.command.UpdateCommand; +import org.apache.ignite.internal.testframework.WithSystemProperty; import org.apache.ignite.internal.tx.impl.ReadWriteTransactionImpl; import org.apache.ignite.raft.jraft.rpc.WriteActionRequest; import org.apache.ignite.table.RecordView; @@ -105,6 +106,7 @@ protected String getNodeBootstrapConfigTemplate() { } @Test + @WithSystemProperty(key = "IGNITE_ALWAYS_FORCE", value = "false") public void testFullTxConsistency() throws InterruptedException { TableImpl tbl = unwrapTableImpl(node(0).tables().table(TABLE_NAME)); diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTxResourcesVacuumTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTxResourcesVacuumTest.java index ccb91a867b8..f9b14accd35 100644 --- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTxResourcesVacuumTest.java +++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTxResourcesVacuumTest.java @@ -34,6 +34,7 @@ import static org.apache.ignite.internal.tx.test.ItTransactionTestUtils.tableId; import static org.apache.ignite.internal.tx.test.ItTransactionTestUtils.txId; import static org.apache.ignite.internal.tx.test.ItTransactionTestUtils.waitAndGetPrimaryReplica; +import static org.apache.ignite.internal.tx.test.ItTransactionTestUtils.zoneId; import static org.apache.ignite.internal.util.IgniteUtils.shutdownAndAwaitTermination; import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -57,6 +58,7 @@ 
import org.apache.ignite.internal.app.IgniteImpl; import org.apache.ignite.internal.placementdriver.ReplicaMeta; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.testframework.SystemPropertiesExtension; import org.apache.ignite.internal.testframework.WithSystemProperty; import org.apache.ignite.internal.thread.IgniteThreadFactory; @@ -205,7 +207,7 @@ public void testVacuum() throws InterruptedException { int partId = partitionIdForTuple(node, TABLE_NAME, tuple, tx); - Set nodes = partitionAssignment(node, new TablePartitionId(tableId(node, TABLE_NAME), partId)); + Set nodes = partitionAssignment(node, new ZonePartitionId(zoneId(node, TABLE_NAME), partId)); view.upsert(tx, tuple); view.upsert(parallelTx1, tupleForParallelTx); @@ -296,7 +298,7 @@ public void testAbandonedTxnsAreNotVacuumizedUntilRecovered() throws Interrupted int partId = partitionIdForTuple(anyNode(), TABLE_NAME, tuple, null); - TablePartitionId groupId = new TablePartitionId(tableId(anyNode(), TABLE_NAME), partId); + ZonePartitionId groupId = new ZonePartitionId(zoneId(anyNode(), TABLE_NAME), partId); Set txNodes = partitionAssignment(anyNode(), groupId); @@ -380,7 +382,7 @@ public void testVacuumWithCleanupDelay() throws InterruptedException { ReplicaMeta replicaMeta = waitAndGetPrimaryReplica(node, commitPartGrpId); IgniteImpl commitPartitionLeaseholder = findNode(n -> n.id().equals(replicaMeta.getLeaseholderId())); - Set commitPartNodes = partitionAssignment(node, new TablePartitionId(tableId(node, TABLE_NAME), commitPartId)); + Set commitPartNodes = partitionAssignment(node, new ZonePartitionId(zoneId(node, TABLE_NAME), commitPartId)); log.info("Test: Commit partition [leaseholder={}, hostingNodes={}].", commitPartitionLeaseholder.name(), commitPartNodes); @@ -484,7 +486,7 @@ public void testCommitPartitionPrimaryChangesBeforeVacuum() throws InterruptedEx ReplicaMeta replicaMeta = 
waitAndGetPrimaryReplica(node, commitPartGrpId); IgniteImpl commitPartitionLeaseholder = findNode(n -> n.id().equals(replicaMeta.getLeaseholderId())); - Set commitPartNodes = partitionAssignment(node, new TablePartitionId(tableId(node, TABLE_NAME), commitPartId)); + Set commitPartNodes = partitionAssignment(node, new ZonePartitionId(zoneId(node, TABLE_NAME), commitPartId)); log.info("Test: Commit partition [leaseholder={}, hostingNodes={}].", commitPartitionLeaseholder.name(), commitPartNodes); @@ -568,7 +570,7 @@ public void testVacuumPersistentStateAfterCleanupDelayAndVolatileStateVacuum() t ReplicaMeta replicaMeta = waitAndGetPrimaryReplica(node, commitPartGrpId); IgniteImpl commitPartitionLeaseholder = findNode(n -> n.id().equals(replicaMeta.getLeaseholderId())); - Set commitPartNodes = partitionAssignment(node, new TablePartitionId(tableId(node, TABLE_NAME), commitPartId)); + Set commitPartNodes = partitionAssignment(node, new ZonePartitionId(zoneId(node, TABLE_NAME), commitPartId)); log.info("Test: Commit partition [leaseholder={}, hostingNodes={}].", commitPartitionLeaseholder.name(), commitPartNodes); @@ -655,7 +657,7 @@ public void testRecoveryAfterPersistentStateVacuumized() throws InterruptedExcep int commitPartId = partitionIdForTuple(commitPartitionLeaseholder, TABLE_NAME, tuple0, null); Set commitPartitionNodes = partitionAssignment(commitPartitionLeaseholder, - new TablePartitionId(tableId(commitPartitionLeaseholder, TABLE_NAME), commitPartId)); + new ZonePartitionId(zoneId(commitPartitionLeaseholder, TABLE_NAME), commitPartId)); // Choose some node that doesn't host the partition as a tx coordinator. 
IgniteImpl coord0 = findNode(n -> !commitPartitionNodes.contains(n.name())); diff --git a/modules/table/src/main/java/org/apache/ignite/internal/table/InternalTable.java b/modules/table/src/main/java/org/apache/ignite/internal/table/InternalTable.java index a59ebba8fcd..8c0fc97e920 100644 --- a/modules/table/src/main/java/org/apache/ignite/internal/table/InternalTable.java +++ b/modules/table/src/main/java/org/apache/ignite/internal/table/InternalTable.java @@ -83,6 +83,13 @@ public interface InternalTable extends ManuallyCloseable { */ int partitionId(BinaryRowEx row); + /** + * Returns zone id in which the table is presented. + * + * @return Zone id. + */ + int zoneId(); + /** * Asynchronously gets a row with same key columns values as given one from the table. * diff --git a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/PartitionReplicatorNodeRecovery.java b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/PartitionReplicatorNodeRecovery.java index fc34504bd51..af06e638e9c 100644 --- a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/PartitionReplicatorNodeRecovery.java +++ b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/PartitionReplicatorNodeRecovery.java @@ -40,7 +40,7 @@ import org.apache.ignite.internal.network.MessagingService; import org.apache.ignite.internal.raft.Peer; import org.apache.ignite.internal.raft.PeersAndLearners; -import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.storage.MvPartitionStorage; import org.apache.ignite.internal.storage.RowId; import org.apache.ignite.internal.storage.engine.MvTableStorage; @@ -127,13 +127,13 @@ private void addMessageHandler() { /** * Returns a future that completes with a decision: should we start the corresponding group locally or not. * - * @param tablePartitionId ID of the table partition. 
+ * @param zonePartitionId ID of the zone partition. * @param internalTable Table we are working with. * @param newConfiguration New configuration that is going to be applied if we'll start the group. * @param localMemberAssignment Assignment of this node in this group. */ CompletableFuture shouldStartGroup( - TablePartitionId tablePartitionId, + ZonePartitionId zonePartitionId, InternalTable internalTable, PeersAndLearners newConfiguration, Assignment localMemberAssignment @@ -141,7 +141,7 @@ CompletableFuture shouldStartGroup( // If Raft is running in in-memory mode or the PDS has been cleared, we need to remove the current node // from the Raft group in order to avoid the double vote problem. if (mightNeedGroupRecovery(internalTable)) { - return performGroupRecovery(tablePartitionId, newConfiguration, localMemberAssignment); + return performGroupRecovery(zonePartitionId, newConfiguration, localMemberAssignment, internalTable.tableId()); } return trueCompletedFuture(); @@ -154,12 +154,12 @@ private static boolean mightNeedGroupRecovery(InternalTable internalTable) { } private CompletableFuture performGroupRecovery( - TablePartitionId tablePartitionId, + ZonePartitionId zonePartitionId, PeersAndLearners newConfiguration, - Assignment localMemberAssignment + Assignment localMemberAssignment, + int tableId ) { - int tableId = tablePartitionId.tableId(); - int partId = tablePartitionId.partitionId(); + int partId = zonePartitionId.partitionId(); // No majority and not a full partition restart - need to 'remove, then add' nodes // with current partition. 
@@ -174,7 +174,7 @@ private CompletableFuture performGroupRecovery( boolean majorityAvailable = dataNodesCount >= (newConfiguration.peers().size() / 2) + 1; if (majorityAvailable) { - RebalanceUtilEx.startPeerRemoval(tablePartitionId, localMemberAssignment, metaStorageManager); + RebalanceUtilEx.startPeerRemoval(zonePartitionId, localMemberAssignment, metaStorageManager); return false; } else { diff --git a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/TableManager.java b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/TableManager.java index 273d92b2720..e58cacfcadb 100644 --- a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/TableManager.java +++ b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/TableManager.java @@ -35,7 +35,7 @@ import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.PENDING_ASSIGNMENTS_PREFIX; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.STABLE_ASSIGNMENTS_PREFIX; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractPartitionNumber; -import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractTableId; +import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractZoneId; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.intersect; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.partitionAssignmentsGetLocally; import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.pendingPartAssignmentsKey; @@ -154,9 +154,11 @@ import org.apache.ignite.internal.raft.service.RaftGroupListener; import org.apache.ignite.internal.raft.service.RaftGroupService; import org.apache.ignite.internal.raft.storage.impl.LogStorageFactoryCreator; +import org.apache.ignite.internal.replicator.ReplicaAwareLeaseTracker; import 
org.apache.ignite.internal.replicator.ReplicaManager; import org.apache.ignite.internal.replicator.ReplicaService; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.schema.SchemaManager; import org.apache.ignite.internal.schema.SchemaRegistry; import org.apache.ignite.internal.schema.configuration.GcConfiguration; @@ -510,7 +512,10 @@ public TableManager( this.nodeName = nodeName; this.executorInclinedSchemaSyncService = new ExecutorInclinedSchemaSyncService(schemaSyncService, partitionOperationsExecutor); - this.executorInclinedPlacementDriver = new ExecutorInclinedPlacementDriver(placementDriver, partitionOperationsExecutor); + this.executorInclinedPlacementDriver = new ExecutorInclinedPlacementDriver( + new ReplicaAwareLeaseTracker(placementDriver, replicaSvc, topologyService), + partitionOperationsExecutor + ); TxMessageSender txMessageSender = new TxMessageSender( messagingService, @@ -697,12 +702,12 @@ private CompletableFuture onTableCreate(CreateTableEventParameters para * Writes the set of assignments to meta storage. If there are some assignments already, gets them from meta storage. Returns * the list of assignments that really are in meta storage. * - * @param tableId Table id. + * @param zoneId Zone id. * @param assignmentsFuture Assignments future, to get the assignments that should be written. * @return Real list of assignments. 
*/ - public CompletableFuture> writeTableAssignmentsToMetastore( - int tableId, + public CompletableFuture> writeZoneAssignmentsToMetastore( + int zoneId, CompletableFuture> assignmentsFuture ) { return assignmentsFuture.thenCompose(newAssignments -> { @@ -711,7 +716,7 @@ public CompletableFuture> writeTableAssignmentsToMetastore( List partitionAssignments = new ArrayList<>(newAssignments.size()); for (int i = 0; i < newAssignments.size(); i++) { - ByteArray stableAssignmentsKey = stablePartAssignmentsKey(new TablePartitionId(tableId, i)); + ByteArray stableAssignmentsKey = stablePartAssignmentsKey(new ZonePartitionId(zoneId, i)); byte[] anAssignment = newAssignments.get(i).toBytes(); Operation op = put(stableAssignmentsKey, anAssignment); partitionAssignments.add(op); @@ -738,15 +743,15 @@ public CompletableFuture> writeTableAssignmentsToMetastore( if (invokeResult) { LOG.info( "Assignments calculated from data nodes are successfully written to meta storage" - + " [tableId={}, assignments={}].", - tableId, + + " [zoneId={}, assignments={}].", + zoneId, Assignments.assignmentListToString(newAssignments) ); return completedFuture(newAssignments); } else { Set partKeys = IntStream.range(0, newAssignments.size()) - .mapToObj(p -> stablePartAssignmentsKey(new TablePartitionId(tableId, p))) + .mapToObj(p -> stablePartAssignmentsKey(new ZonePartitionId(zoneId, p))) .collect(toSet()); CompletableFuture> resFuture = metaStorageMgr.getAll(partKeys); @@ -755,7 +760,7 @@ public CompletableFuture> writeTableAssignmentsToMetastore( List realAssignments = new ArrayList<>(); for (int p = 0; p < newAssignments.size(); p++) { - var partId = new TablePartitionId(tableId, p); + var partId = new ZonePartitionId(zoneId, p); Entry assignmentsEntry = metaStorageAssignments.get(stablePartAssignmentsKey(partId)); assert assignmentsEntry != null && !assignmentsEntry.empty() && !assignmentsEntry.tombstone() @@ -767,8 +772,8 @@ public CompletableFuture> writeTableAssignmentsToMetastore( } 
LOG.info( - "Assignments picked up from meta storage [tableId={}, assignments={}].", - tableId, + "Assignments picked up from meta storage [zoneId={}, assignments={}].", + zoneId, Assignments.assignmentListToString(realAssignments) ); @@ -778,7 +783,7 @@ public CompletableFuture> writeTableAssignmentsToMetastore( }) .handle((realAssignments, e) -> { if (e != null) { - LOG.error("Couldn't get assignments from metastore for table [tableId={}].", e, tableId); + LOG.error("Couldn't get assignments from metastore for table [zoneId={}].", e, zoneId); throw ExceptionUtils.sneakyThrow(e); } @@ -852,6 +857,8 @@ private CompletableFuture startLocalPartitionsAndClients( for (int i = 0; i < partitions; i++) { int partId = i; + LOG.info("Start partition " + new TablePartitionId(tableId, i)); + CompletableFuture future = startPartitionAndStartClient( table, partId, @@ -932,10 +939,13 @@ private CompletableFuture startPartitionAndStartClient( CompletableFuture startGroupFut; + // TODO: revisit for in-memory + ZonePartitionId zonePartitionId = new ZonePartitionId(zoneId, partId); + if (localMemberAssignment != null) { CompletableFuture shouldStartGroupFut = isRecovery ? 
partitionReplicatorNodeRecovery.shouldStartGroup( - replicaGrpId, + zonePartitionId, internalTbl, newConfiguration, localMemberAssignment @@ -957,7 +967,7 @@ private CompletableFuture startPartitionAndStartClient( try { startPartitionRaftGroupNode( - replicaGrpId, + zonePartitionId, raftNodeId, newConfiguration, safeTimeTracker, @@ -965,8 +975,7 @@ private CompletableFuture startPartitionAndStartClient( table, partitionStorages.getTxStateStorage(), partitionDataStorage, - partitionUpdateHandlers, - zoneId + partitionUpdateHandlers ); return true; @@ -1010,6 +1019,7 @@ private CompletableFuture startPartitionAndStartClient( try { startReplicaWithNewListener( replicaGrpId, + zonePartitionId, table, safeTimeTracker, storageIndexTracker, @@ -1037,6 +1047,7 @@ private CompletableFuture startPartitionAndStartClient( private void startReplicaWithNewListener( TablePartitionId replicaGrpId, + ZonePartitionId zonePartitionId, TableImpl table, PendingComparableValuesTracker safeTimeTracker, PendingComparableValuesTracker storageIndexTracker, @@ -1046,7 +1057,7 @@ private void startReplicaWithNewListener( TopologyAwareRaftGroupService raftGroupService ) throws NodeStoppingException { PartitionReplicaListener listener = createReplicaListener( - replicaGrpId, + replicaGrpId.partitionId(), table, safeTimeTracker, mvPartitionStorage, @@ -1057,6 +1068,7 @@ private void startReplicaWithNewListener( replicaMgr.startReplica( replicaGrpId, + zonePartitionId, listener, raftGroupService, storageIndexTracker @@ -1064,7 +1076,7 @@ private void startReplicaWithNewListener( } private PartitionReplicaListener createReplicaListener( - TablePartitionId tablePartitionId, + int partId, TableImpl table, PendingComparableValuesTracker safeTimeTracker, MvPartitionStorage mvPartitionStorage, @@ -1072,8 +1084,7 @@ private PartitionReplicaListener createReplicaListener( PartitionUpdateHandlers partitionUpdateHandlers, RaftGroupService raftClient ) { - int tableId = tablePartitionId.tableId(); - int 
partId = tablePartitionId.partitionId(); + int tableId = table.tableId(); return new PartitionReplicaListener( mvPartitionStorage, @@ -1265,7 +1276,7 @@ private CompletableFuture createTableLocally( boolean onNodeRecovery ) { return inBusyLockAsync(busyLock, () -> { - int tableId = tableDescriptor.id(); + int zoneId = tableDescriptor.zoneId(); // Retrieve descriptor during synchronous call, before the previous catalog version could be concurrently compacted. CatalogZoneDescriptor zoneDescriptor = getZoneDescriptor(tableDescriptor, catalogVersion); @@ -1278,7 +1289,7 @@ private CompletableFuture createTableLocally( ); CompletableFuture> assignmentsFutureAfterInvoke = - writeTableAssignmentsToMetastore(tableId, assignmentsFuture); + writeZoneAssignmentsToMetastore(zoneId, assignmentsFuture); return createTableLocally( causalityToken, @@ -1327,6 +1338,7 @@ private CompletableFuture createTableLocally( InternalTableImpl internalTable = new InternalTableImpl( tableName, tableId, + zoneDescriptor.id(), partitions, topologyService, txManager, @@ -1416,12 +1428,12 @@ private CompletableFuture> getOrCreateAssignments( long causalityToken, int catalogVersion ) { - int tableId = tableDescriptor.id(); + int zoneId = tableDescriptor.zoneId(); CompletableFuture> assignmentsFuture; - if (partitionAssignmentsGetLocally(metaStorageMgr, tableId, 0, causalityToken) != null) { + if (partitionAssignmentsGetLocally(metaStorageMgr, zoneId, 0, causalityToken) != null) { assignmentsFuture = completedFuture( - tableAssignmentsGetLocally(metaStorageMgr, tableId, zoneDescriptor.partitions(), causalityToken)); + tableAssignmentsGetLocally(metaStorageMgr, zoneId, zoneDescriptor.partitions(), causalityToken)); } else { assignmentsFuture = distributionZoneManager.dataNodes(causalityToken, catalogVersion, zoneDescriptor.id()) .thenApply(dataNodes -> AffinityUtils.calculateAssignments( @@ -1431,9 +1443,9 @@ private CompletableFuture> getOrCreateAssignments( 
).stream().map(Assignments::of).collect(toList())); assignmentsFuture.thenAccept(assignmentsList -> LOG.info( - "Assignments calculated from data nodes [table={}, tableId={}, assignments={}, revision={}]", + "Assignments calculated from data nodes [table={}, zoneId={}, assignments={}, revision={}]", tableDescriptor.name(), - tableId, + zoneId, Assignments.assignmentListToString(assignmentsList), causalityToken )); @@ -1489,6 +1501,7 @@ protected TxStateTableStorage createTxStateTableStorage(CatalogTableDescriptor t */ private CompletableFuture destroyTableLocally(int tableId) { TableImpl table = startedTables.remove(tableId); + localPartsByTableId.remove(tableId); assert table != null : tableId; @@ -1496,12 +1509,6 @@ private CompletableFuture destroyTableLocally(int tableId) { InternalTable internalTable = table.internalTable(); int partitions = internalTable.partitions(); - // TODO https://issues.apache.org/jira/browse/IGNITE-18991 Move assigment manipulations to Distribution zones. - Set assignmentKeys = IntStream.range(0, partitions) - .mapToObj(p -> stablePartAssignmentsKey(new TablePartitionId(tableId, p))) - .collect(toSet()); - metaStorageMgr.removeAll(assignmentKeys); - CompletableFuture[] stopReplicaFutures = new CompletableFuture[partitions]; // TODO https://issues.apache.org/jira/browse/IGNITE-19170 Partitions should be stopped on the assignments change @@ -1796,15 +1803,23 @@ private CompletableFuture handleChangePendingAssignmentEvent( } int partId = extractPartitionNumber(pendingAssignmentsEntry.key()); - int tblId = extractTableId(pendingAssignmentsEntry.key(), PENDING_ASSIGNMENTS_PREFIX); + int zoneId = extractZoneId(pendingAssignmentsEntry.key(), PENDING_ASSIGNMENTS_PREFIX); - var replicaGrpId = new TablePartitionId(tblId, partId); + var zonePartitionId = new ZonePartitionId(zoneId, partId); // Stable assignments from the meta store, which revision is bounded by the current pending event. 
- Entry stableAssignmentsEntry = metaStorageMgr.getLocally(stablePartAssignmentsKey(replicaGrpId), revision); + Entry stableAssignmentsEntry = metaStorageMgr.getLocally(stablePartAssignmentsKey(zonePartitionId), revision); Assignments pendingAssignments = Assignments.fromBytes(pendingAssignmentsEntry.value()); + HybridTimestamp msSafeTime = metaStorageMgr.timestampByRevision(revision); + + int catalogVersion = catalogService.activeCatalogVersion(msSafeTime.longValue()); + + Set tablesInZone = findTablesByZoneId(zoneId, catalogVersion, catalogService).stream() + .map(CatalogObjectDescriptor::id) + .collect(toSet()); + return tablesVv.get(revision) .thenApply(ignore -> { if (!busyLock.enterBusy()) { @@ -1812,42 +1827,47 @@ private CompletableFuture handleChangePendingAssignmentEvent( } try { - TableImpl table = tables.get(tblId); - - // Table can be null only recovery, because we use a revision from the future. See comment inside - // performRebalanceOnRecovery. - if (table == null) { - if (LOG.isInfoEnabled()) { - LOG.info("Skipping Pending Assignments update, because table {} does not exist", tblId); - } - - return CompletableFutures.nullCompletedFuture(); - } - - if (LOG.isInfoEnabled()) { - var stringKey = new String(pendingAssignmentsEntry.key(), UTF_8); - - LOG.info("Received update on pending assignments. Check if new raft group should be started" - + " [key={}, partition={}, table={}, localMemberAddress={}, pendingAssignments={}]", - stringKey, partId, table.name(), localNode().address(), pendingAssignments); - } - Set stableAssignments = stableAssignmentsEntry.value() == null ? 
emptySet() : Assignments.fromBytes(stableAssignmentsEntry.value()).nodes(); - return setTablesPartitionCountersForRebalance(replicaGrpId, revision, pendingAssignments.force()) - .thenCompose(r -> - handleChangePendingAssignmentEvent( - replicaGrpId, - table, - pendingAssignments, - stableAssignments, - revision, - isRecovery - ) - ) - .thenCompose(v -> changePeersOnRebalance(table, replicaGrpId, pendingAssignments.nodes(), revision)); + return setTablesPartitionCountersForRebalance(zonePartitionId, revision, pendingAssignments.force()) + .thenCompose(r -> { + List> tableFutures = new ArrayList<>(tables.size()); + + for (int tableId : tablesInZone) { + TableImpl table = tables.get(tableId); + + if (LOG.isInfoEnabled()) { + var stringKey = new String(pendingAssignmentsEntry.key(), UTF_8); + + LOG.info("Received update on pending assignments. Check if new raft group should be started" + + " [key={}, partition={}, table={}, " + + "localMemberAddress={}, pendingAssignments={}]", + stringKey, partId, table.name(), localNode().address(), pendingAssignments); + } + + tableFutures.add( + handleChangePendingAssignmentEventForTable( + zonePartitionId, + table, + pendingAssignments, + stableAssignments, + revision, + isRecovery + ).thenCompose( + v -> changePeersOnRebalance( + table, + zonePartitionId, + pendingAssignments.nodes(), + revision + ) + ) + ); + } + + return allOf(tableFutures.toArray(CompletableFuture[]::new)); + }); } finally { busyLock.leaveBusy(); } @@ -1855,8 +1875,8 @@ private CompletableFuture handleChangePendingAssignmentEvent( .thenCompose(identity()); } - private CompletableFuture handleChangePendingAssignmentEvent( - TablePartitionId replicaGrpId, + private CompletableFuture handleChangePendingAssignmentEventForTable( + ZonePartitionId zonePartitionId, TableImpl tbl, Assignments pendingAssignments, Set stableAssignments, @@ -1864,7 +1884,10 @@ private CompletableFuture handleChangePendingAssignmentEvent( boolean isRecovery ) { ClusterNode localMember = 
localNode(); - RaftNodeId raftNodeId = new RaftNodeId(replicaGrpId, new Peer(localNode().name())); + RaftNodeId raftNodeId = new RaftNodeId( + new TablePartitionId(tbl.tableId(), zonePartitionId.partitionId()), + new Peer(localNode().name()) + ); boolean pendingAssignmentsAreForced = pendingAssignments.force(); Set pendingAssignmentsNodes = pendingAssignments.nodes(); @@ -1876,9 +1899,7 @@ private CompletableFuture handleChangePendingAssignmentEvent( CompletableFuture localServicesStartFuture; - int tableId = tbl.tableId(); - - int zoneId = getTableDescriptor(tableId, catalogService.latestCatalogVersion()).zoneId(); + int zoneId = zonePartitionId.zoneId(); // This is a set of assignments for nodes that are not the part of stable assignments, i.e. unstable part of the distribution. // For regular pending assignments we use (old) stable set, so that none of new nodes would be able to propose itself as a leader. @@ -1897,7 +1918,7 @@ private CompletableFuture handleChangePendingAssignmentEvent( Assignments nonStableNodeAssignmentsFinal = nonStableNodeAssignments; - int partitionId = replicaGrpId.partitionId(); + int partitionId = zonePartitionId.partitionId(); if (shouldStartLocalGroupNode) { PartitionSet singlePartitionIdSet = PartitionSet.of(partitionId); @@ -1921,7 +1942,7 @@ private CompletableFuture handleChangePendingAssignmentEvent( return startPartitionAndStartClient( tbl, - replicaGrpId.partitionId(), + zonePartitionId.partitionId(), pendingAssignments, nonStableNodeAssignmentsFinal, zoneId, @@ -1945,21 +1966,17 @@ private CompletableFuture handleChangePendingAssignmentEvent( tbl.internalTable() .tableRaftService() - .partitionRaftGroupService(partitionId) + .partitionRaftGroupService(zonePartitionId.partitionId()) .updateConfiguration(configurationFromAssignments(cfg)); }, ioExecutor); } - private CompletableFuture setTablesPartitionCountersForRebalance(TablePartitionId replicaGrpId, long revision, boolean force) { + private CompletableFuture 
setTablesPartitionCountersForRebalance(ZonePartitionId zonePartitionId, long revision, boolean force) { int catalogVersion = catalogService.latestCatalogVersion(); - int tableId = replicaGrpId.tableId(); + int zoneId = zonePartitionId.zoneId(); - CatalogZoneDescriptor zoneDescriptor = getZoneDescriptor(getTableDescriptor(tableId, catalogVersion), catalogVersion); - - int zoneId = zoneDescriptor.id(); - - int partId = replicaGrpId.partitionId(); + int partId = zonePartitionId.partitionId(); SimpleCondition revisionMatches = revision(tablesCounterKey(zoneId, partId)).lt(revision); SimpleCondition counterIsEmpty = value(tablesCounterKey(zoneId, partId)).eq(toBytes(Set.of())); @@ -2003,7 +2020,7 @@ private CompletableFuture setTablesPartitionCountersForRebalance(TablePart private CompletableFuture changePeersOnRebalance( TableImpl table, - TablePartitionId replicaGrpId, + ZonePartitionId replicaGrpId, Set pendingAssignments, long revision ) { @@ -2055,7 +2072,7 @@ private CompletableFuture changePeersOnRebalance( } private void startPartitionRaftGroupNode( - TablePartitionId replicaGrpId, + ZonePartitionId zonePartitionId, RaftNodeId raftNodeId, PeersAndLearners stableConfiguration, PendingComparableValuesTracker safeTimeTracker, @@ -2063,15 +2080,14 @@ private void startPartitionRaftGroupNode( TableImpl table, TxStateStorage txStatePartitionStorage, PartitionDataStorage partitionDataStorage, - PartitionUpdateHandlers partitionUpdateHandlers, - int zoneId + PartitionUpdateHandlers partitionUpdateHandlers ) throws NodeStoppingException { InternalTable internalTable = table.internalTable(); RaftGroupOptions groupOptions = groupOptionsForPartition( internalTable.storage(), internalTable.txStateStorage(), - partitionKey(internalTable, replicaGrpId.partitionId()), + partitionKey(internalTable, zonePartitionId.partitionId()), partitionUpdateHandlers ); @@ -2089,11 +2105,11 @@ private void startPartitionRaftGroupNode( RaftGroupEventsListener raftGrpEvtsLsnr = new 
RebalanceRaftGroupEventsListener( metaStorageMgr, - replicaGrpId, + zonePartitionId, busyLock, - createPartitionMover(internalTable, replicaGrpId.partitionId()), + createPartitionMover(internalTable, zonePartitionId.partitionId()), rebalanceScheduler, - zoneId + table.tableId() ); // TODO: use RaftManager interface, see https://issues.apache.org/jira/browse/IGNITE-18273 @@ -2142,22 +2158,20 @@ public CompletableFuture onUpdate(WatchEvent evt) { byte[] key = evt.entryEvent().newEntry().key(); int partitionId = extractPartitionNumber(key); - int tableId = extractTableId(key, ASSIGNMENTS_SWITCH_REDUCE_PREFIX); + int zoneId = extractZoneId(key, ASSIGNMENTS_SWITCH_REDUCE_PREFIX); - TablePartitionId replicaGrpId = new TablePartitionId(tableId, partitionId); + ZonePartitionId replicaGrpId = new ZonePartitionId(zoneId, partitionId); // It is safe to get the latest version of the catalog as we are in the metastore thread. int catalogVersion = catalogService.latestCatalogVersion(); return tablesById(evt.revision()) .thenCompose(tables -> inBusyLockAsync(busyLock, () -> { - CatalogTableDescriptor tableDescriptor = getTableDescriptor(tableId, catalogVersion); - - CatalogZoneDescriptor zoneDescriptor = getZoneDescriptor(tableDescriptor, catalogVersion); + CatalogZoneDescriptor zoneDescriptor = catalogService.zone(zoneId, catalogVersion); long causalityToken = zoneDescriptor.updateToken(); - return distributionZoneManager.dataNodes(causalityToken, catalogVersion, tableDescriptor.zoneId()) + return distributionZoneManager.dataNodes(causalityToken, catalogVersion, zoneId) .thenCompose(dataNodes -> RebalanceUtilEx.handleReduceChanged( metaStorageMgr, dataNodes, @@ -2288,16 +2302,16 @@ protected CompletableFuture handleChangeStableAssignmentEvent( boolean isRecovery ) { int partitionId = extractPartitionNumber(stableAssignmentsWatchEvent.key()); - int tableId = extractTableId(stableAssignmentsWatchEvent.key(), STABLE_ASSIGNMENTS_PREFIX); + int zoneId = 
extractZoneId(stableAssignmentsWatchEvent.key(), STABLE_ASSIGNMENTS_PREFIX); - TablePartitionId tablePartitionId = new TablePartitionId(tableId, partitionId); + ZonePartitionId zonePartitionId = new ZonePartitionId(zoneId, partitionId); Set stableAssignments = stableAssignmentsWatchEvent.value() == null ? emptySet() : Assignments.fromBytes(stableAssignmentsWatchEvent.value()).nodes(); return supplyAsync(() -> { - Entry pendingAssignmentsEntry = metaStorageMgr.getLocally(pendingPartAssignmentsKey(tablePartitionId), revision); + Entry pendingAssignmentsEntry = metaStorageMgr.getLocally(pendingPartAssignmentsKey(zonePartitionId), revision); byte[] pendingAssignmentsFromMetaStorage = pendingAssignmentsEntry.value(); @@ -2305,13 +2319,27 @@ protected CompletableFuture handleChangeStableAssignmentEvent( ? Assignments.EMPTY : Assignments.fromBytes(pendingAssignmentsFromMetaStorage); - return stopAndDestroyPartitionAndUpdateClients( - tablePartitionId, - stableAssignments, - pendingAssignments, - isRecovery, - revision - ); + HybridTimestamp msSafeTime = metaStorageMgr.timestampByRevision(revision); + + int catalogVersion = catalogService.activeCatalogVersion(msSafeTime.longValue()); + + Set tablesInZone = findTablesByZoneId(zoneId, catalogVersion, catalogService).stream() + .map(CatalogObjectDescriptor::id) + .collect(toSet()); + + List> tableFutures = new ArrayList<>(tablesInZone.size()); + + for (Integer tableId : tablesInZone) { + tableFutures.add(stopAndDestroyPartitionAndUpdateClients( + new TablePartitionId(tableId, partitionId), + stableAssignments, + pendingAssignments, + isRecovery, + revision + )); + } + + return allOf(tableFutures.toArray(CompletableFuture[]::new)); }, ioExecutor).thenCompose(identity()); } @@ -2363,7 +2391,7 @@ private CompletableFuture stopAndDestroyPartition(TablePartitionId tablePa TableImpl table = tables.get(tablePartitionId.tableId()); return stopPartition(tablePartitionId, table) - .thenComposeAsync(v -> 
destroyPartitionStorages(tablePartitionId, table), ioExecutor); + .thenComposeAsync(v -> destroyPartitionStorages(tablePartitionId.partitionId(), table), ioExecutor); }); } @@ -2400,7 +2428,7 @@ private CompletableFuture stopPartition(TablePartitionId tablePartitionId, }); } - private CompletableFuture destroyPartitionStorages(TablePartitionId tablePartitionId, TableImpl table) { + private CompletableFuture destroyPartitionStorages(int partitionId, TableImpl table) { // TODO: IGNITE-18703 Destroy raft log and meta if (table == null) { return nullCompletedFuture(); @@ -2408,8 +2436,6 @@ private CompletableFuture destroyPartitionStorages(TablePartitionId tableP InternalTable internalTable = table.internalTable(); - int partitionId = tablePartitionId.partitionId(); - List> destroyFutures = new ArrayList<>(); if (internalTable.storage().getMvPartition(partitionId) != null) { @@ -2482,14 +2508,6 @@ private static PartitionUpdateHandlers createPartitionUpdateHandlers( return findTableImplByName(tables.values(), name); } - private CatalogTableDescriptor getTableDescriptor(int tableId, int catalogVersion) { - CatalogTableDescriptor tableDescriptor = catalogService.table(tableId, catalogVersion); - - assert tableDescriptor != null : "tableId=" + tableId + ", catalogVersion=" + catalogVersion; - - return tableDescriptor; - } - private CatalogZoneDescriptor getZoneDescriptor(CatalogTableDescriptor tableDescriptor, int catalogVersion) { CatalogZoneDescriptor zoneDescriptor = catalogService.zone(tableDescriptor.zoneId(), catalogVersion); diff --git a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/storage/InternalTableImpl.java b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/storage/InternalTableImpl.java index 4987f375f18..3eb85da2a24 100644 --- a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/storage/InternalTableImpl.java +++ 
b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/storage/InternalTableImpl.java @@ -75,6 +75,7 @@ import org.apache.ignite.internal.replicator.ReplicaService; import org.apache.ignite.internal.replicator.ReplicationGroupId; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.replicator.exception.PrimaryReplicaMissException; import org.apache.ignite.internal.replicator.exception.ReplicationException; import org.apache.ignite.internal.replicator.message.ReplicaRequest; @@ -142,6 +143,9 @@ public class InternalTableImpl implements InternalTable { /** Table identifier. */ private final int tableId; + /** Zone identifier where table is presented. */ + private final int zoneId; + /** Resolver that resolves a node consistent ID to cluster node. */ private final ClusterNodeResolver clusterNodeResolver; @@ -194,6 +198,7 @@ public class InternalTableImpl implements InternalTable { * * @param tableName Table name. * @param tableId Table id. + * @param zoneId Zone id. * @param partitions Partitions. * @param clusterNodeResolver Cluster node resolver. * @param txManager Transaction manager. 
@@ -210,6 +215,7 @@ public class InternalTableImpl implements InternalTable { public InternalTableImpl( String tableName, int tableId, + int zoneId, int partitions, ClusterNodeResolver clusterNodeResolver, TxManager txManager, @@ -227,6 +233,7 @@ public InternalTableImpl( ) { this.tableName = tableName; this.tableId = tableId; + this.zoneId = zoneId; this.partitions = partitions; this.clusterNodeResolver = clusterNodeResolver; this.txManager = txManager; @@ -262,6 +269,12 @@ public int tableId() { return tableId; } + /** {@inheritDoc} */ + @Override + public int zoneId() { + return zoneId; + } + /** {@inheritDoc} */ @Override public String name() { @@ -734,8 +747,8 @@ private CompletableFuture evaluateReadOnlyPrimaryNode( TablePartitionId tablePartitionId = new TablePartitionId(tableId, partId); - CompletableFuture primaryReplicaFuture = placementDriver.awaitPrimaryReplica( - tablePartitionId, + CompletableFuture primaryReplicaFuture = placementDriver.awaitPrimaryReplicaForTable( + new ZonePartitionId(zoneId, tableId, partId), tx.startTimestamp(), AWAIT_PRIMARY_REPLICA_TIMEOUT, SECONDS @@ -785,8 +798,8 @@ private CompletableFuture evaluateReadOnlyPrimaryNode( TablePartitionId tablePartitionId = new TablePartitionId(tableId, partId); - CompletableFuture primaryReplicaFuture = placementDriver.awaitPrimaryReplica( - tablePartitionId, + CompletableFuture primaryReplicaFuture = placementDriver.awaitPrimaryReplicaForTable( + new ZonePartitionId(zoneId, tableId, partId), tx.startTimestamp(), AWAIT_PRIMARY_REPLICA_TIMEOUT, SECONDS @@ -1869,8 +1882,8 @@ protected CompletableFuture> enlist(int partId, HybridTimestamp now = clock.now(); - CompletableFuture primaryReplicaFuture = placementDriver.awaitPrimaryReplica( - tablePartitionId, + CompletableFuture primaryReplicaFuture = placementDriver.awaitPrimaryReplicaForTable( + new ZonePartitionId(zoneId, tableId, partId), now, AWAIT_PRIMARY_REPLICA_TIMEOUT, SECONDS @@ -2135,9 +2148,9 @@ public void close() { * @return Cluster 
node to evalute read-only request. */ protected CompletableFuture evaluateReadOnlyRecipientNode(int partId) { - TablePartitionId tablePartitionId = new TablePartitionId(tableId, partId); + ZonePartitionId zonePartitionId = new ZonePartitionId(zoneId, tableId, partId); - return placementDriver.awaitPrimaryReplica(tablePartitionId, clock.now(), AWAIT_PRIMARY_REPLICA_TIMEOUT, SECONDS) + return placementDriver.awaitPrimaryReplicaForTable(zonePartitionId, clock.now(), AWAIT_PRIMARY_REPLICA_TIMEOUT, SECONDS) .handle((res, e) -> { if (e != null) { throw withCause(TransactionException::new, REPLICA_UNAVAILABLE_ERR, e); diff --git a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/wrappers/DelegatingPlacementDriver.java b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/wrappers/DelegatingPlacementDriver.java index 4ad473891e1..170bac946a7 100644 --- a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/wrappers/DelegatingPlacementDriver.java +++ b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/wrappers/DelegatingPlacementDriver.java @@ -17,6 +17,7 @@ package org.apache.ignite.internal.table.distributed.wrappers; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; import org.apache.ignite.internal.event.EventListener; @@ -26,6 +27,7 @@ import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent; import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters; import org.apache.ignite.internal.replicator.ReplicationGroupId; +import org.apache.ignite.internal.replicator.ZonePartitionId; /** * A base for a {@link PlacementDriver} that delegates some of its methods to another {@link PlacementDriver}. 
@@ -53,6 +55,16 @@ public CompletableFuture awaitPrimaryReplica(ReplicationGroupId gro return delegate.awaitPrimaryReplica(groupId, timestamp, timeout, unit); } + @Override + public CompletableFuture awaitPrimaryReplicaForTable( + ReplicationGroupId groupId, + HybridTimestamp timestamp, + long timeout, + TimeUnit unit + ) { + return delegate.awaitPrimaryReplicaForTable(groupId, timestamp, timeout, unit); + } + @Override public CompletableFuture getPrimaryReplica(ReplicationGroupId replicationGroupId, HybridTimestamp timestamp) { return delegate.getPrimaryReplica(replicationGroupId, timestamp); @@ -62,4 +74,18 @@ public CompletableFuture getPrimaryReplica(ReplicationGroupId repli public CompletableFuture previousPrimaryExpired(ReplicationGroupId grpId) { return delegate.previousPrimaryExpired(grpId); } + + @Override + public ReplicaMeta getLeaseMeta(ReplicationGroupId grpId) { + return delegate.getLeaseMeta(grpId); + } + + @Override + public CompletableFuture addSubgroups( + ZonePartitionId zoneId, + Long enlistmentConsistencyToken, + Set subGrps + ) { + return delegate.addSubgroups(zoneId, enlistmentConsistencyToken, subGrps); + } } diff --git a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/wrappers/ExecutorInclinedPlacementDriver.java b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/wrappers/ExecutorInclinedPlacementDriver.java index ee6a4c39810..a6cc82cae00 100644 --- a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/wrappers/ExecutorInclinedPlacementDriver.java +++ b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/wrappers/ExecutorInclinedPlacementDriver.java @@ -19,6 +19,7 @@ import static java.util.function.Function.identity; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.Executor; import java.util.concurrent.TimeUnit; @@ -26,6 +27,7 @@ import org.apache.ignite.internal.placementdriver.PlacementDriver; import 
org.apache.ignite.internal.placementdriver.ReplicaMeta; import org.apache.ignite.internal.replicator.ReplicationGroupId; +import org.apache.ignite.internal.replicator.ZonePartitionId; /** * Decorates a {@link PlacementDriver} to make sure that completion stages depending on the returned futures are always completed @@ -49,6 +51,16 @@ public CompletableFuture awaitPrimaryReplica(ReplicationGroupId gro return decorateFuture(super.awaitPrimaryReplica(groupId, timestamp, timeout, unit)); } + @Override + public CompletableFuture awaitPrimaryReplicaForTable( + ReplicationGroupId groupId, + HybridTimestamp timestamp, + long timeout, + TimeUnit unit + ) { + return decorateFuture(super.awaitPrimaryReplicaForTable(groupId, timestamp, timeout, unit)); + } + private CompletableFuture decorateFuture(CompletableFuture future) { if (future.isDone()) { return future; @@ -66,4 +78,13 @@ public CompletableFuture getPrimaryReplica(ReplicationGroupId repli public CompletableFuture previousPrimaryExpired(ReplicationGroupId grpId) { return decorateFuture(super.previousPrimaryExpired(grpId)); } + + @Override + public CompletableFuture addSubgroups( + ZonePartitionId zoneId, + Long enlistmentConsistencyToken, + Set subGrps + ) { + return decorateFuture(super.addSubgroups(zoneId, enlistmentConsistencyToken, subGrps)); + } } diff --git a/modules/table/src/main/java/org/apache/ignite/internal/utils/RebalanceUtilEx.java b/modules/table/src/main/java/org/apache/ignite/internal/utils/RebalanceUtilEx.java index 451e98c4c35..0f5071c77be 100644 --- a/modules/table/src/main/java/org/apache/ignite/internal/utils/RebalanceUtilEx.java +++ b/modules/table/src/main/java/org/apache/ignite/internal/utils/RebalanceUtilEx.java @@ -44,7 +44,7 @@ import org.apache.ignite.internal.metastorage.WatchEvent; import org.apache.ignite.internal.metastorage.dsl.Iif; import org.apache.ignite.internal.metastorage.dsl.Operations; -import org.apache.ignite.internal.replicator.TablePartitionId; +import 
org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.util.ByteUtils; /** @@ -62,7 +62,7 @@ public class RebalanceUtilEx { * @return Completable future that signifies the completion of this operation. */ public static CompletableFuture startPeerRemoval( - TablePartitionId partId, + ZonePartitionId partId, Assignment peerAssignment, MetaStorageManager metaStorageMgr ) { @@ -114,7 +114,7 @@ public static CompletableFuture startPeerRemoval( * @return Completable future that signifies the completion of this operation. */ public static CompletableFuture handleReduceChanged(MetaStorageManager metaStorageMgr, Collection dataNodes, - int replicas, TablePartitionId partId, WatchEvent event) { + int replicas, ZonePartitionId partId, WatchEvent event) { Entry entry = event.entryEvent().newEntry(); byte[] eventData = entry.value(); diff --git a/modules/table/src/test/java/org/apache/ignite/internal/table/distributed/TableManagerTest.java b/modules/table/src/test/java/org/apache/ignite/internal/table/distributed/TableManagerTest.java index ac04b091cdc..226a36f6e22 100644 --- a/modules/table/src/test/java/org/apache/ignite/internal/table/distributed/TableManagerTest.java +++ b/modules/table/src/test/java/org/apache/ignite/internal/table/distributed/TableManagerTest.java @@ -356,8 +356,8 @@ public void testCreateTable() throws Exception { */ @Test public void testWriteTableAssignmentsToMetastoreExceptionally() throws Exception { - TableViewInternal table = mockManagersAndCreateTable(DYNAMIC_TABLE_NAME, tblManagerFut); - int tableId = table.tableId(); + mockManagersAndCreateTable(DYNAMIC_TABLE_NAME, tblManagerFut); + int zoneId = 2; TableManager tableManager = tblManagerFut.join(); List assignmentsList = List.of(Assignments.of(Assignment.forPeer(node.id()))); @@ -366,7 +366,7 @@ public void testWriteTableAssignmentsToMetastoreExceptionally() throws Exception var outerExceptionMsg = "Outer future is interrupted"; 
assignmentsFuture.completeExceptionally(new TimeoutException(outerExceptionMsg)); CompletableFuture> writtenAssignmentsFuture = tableManager - .writeTableAssignmentsToMetastore(tableId, assignmentsFuture); + .writeZoneAssignmentsToMetastore(zoneId, assignmentsFuture); assertTrue(writtenAssignmentsFuture.isCompletedExceptionally()); assertThrowsWithCause(writtenAssignmentsFuture::get, TimeoutException.class, outerExceptionMsg); @@ -376,7 +376,7 @@ public void testWriteTableAssignmentsToMetastoreExceptionally() throws Exception var innerExceptionMsg = "Inner future is interrupted"; invokeTimeoutFuture.completeExceptionally(new TimeoutException(innerExceptionMsg)); when(msm.invoke(any(), any(List.class), any(List.class))).thenReturn(invokeTimeoutFuture); - writtenAssignmentsFuture = tableManager.writeTableAssignmentsToMetastore(tableId, assignmentsFuture); + writtenAssignmentsFuture = tableManager.writeZoneAssignmentsToMetastore(zoneId, assignmentsFuture); assertTrue(writtenAssignmentsFuture.isCompletedExceptionally()); assertThrowsWithCause(writtenAssignmentsFuture::get, TimeoutException.class, innerExceptionMsg); } diff --git a/modules/table/src/test/java/org/apache/ignite/internal/table/distributed/storage/InternalTableImplTest.java b/modules/table/src/test/java/org/apache/ignite/internal/table/distributed/storage/InternalTableImplTest.java index b07518f94d0..c5ec359f1f4 100644 --- a/modules/table/src/test/java/org/apache/ignite/internal/table/distributed/storage/InternalTableImplTest.java +++ b/modules/table/src/test/java/org/apache/ignite/internal/table/distributed/storage/InternalTableImplTest.java @@ -62,6 +62,7 @@ void testUpdatePartitionTrackers() { InternalTableImpl internalTable = new InternalTableImpl( "test", 1, + 123, 1, new SingleClusterNodeResolver(mock(ClusterNode.class)), mock(TxManager.class), @@ -112,6 +113,7 @@ void testRowBatchByPartitionId() { InternalTableImpl internalTable = new InternalTableImpl( "test", 1, + 123, 3, new 
SingleClusterNodeResolver(mock(ClusterNode.class)), mock(TxManager.class), diff --git a/modules/table/src/testFixtures/java/org/apache/ignite/distributed/ItTxTestCluster.java b/modules/table/src/testFixtures/java/org/apache/ignite/distributed/ItTxTestCluster.java index a7b3fe44abe..921be60d688 100644 --- a/modules/table/src/testFixtures/java/org/apache/ignite/distributed/ItTxTestCluster.java +++ b/modules/table/src/testFixtures/java/org/apache/ignite/distributed/ItTxTestCluster.java @@ -102,6 +102,7 @@ import org.apache.ignite.internal.replicator.ReplicaManager; import org.apache.ignite.internal.replicator.ReplicaService; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.replicator.configuration.ReplicationConfiguration; import org.apache.ignite.internal.schema.BinaryRowConverter; import org.apache.ignite.internal.schema.ColumnsExtractor; @@ -288,7 +289,7 @@ public class ItTxTestCluster { private CatalogService catalogService; - private final AtomicInteger globalCatalogId = new AtomicInteger(); + private final AtomicInteger globalCatalogId = new AtomicInteger(1); protected final TestLowWatermark lowWatermark = new TestLowWatermark(); @@ -689,6 +690,7 @@ public TableViewInternal startTable(String tableName, SchemaDescriptor schemaDes replicaManagers.get(assignment).startReplica( new TablePartitionId(tableId, partId), + new ZonePartitionId(globalCatalogId.incrementAndGet(), partId), listener, raftSvc, storageIndexTracker @@ -743,6 +745,7 @@ public TableViewInternal startTable(String tableName, SchemaDescriptor schemaDes new InternalTableImpl( tableName, tableId, + 123, 1, nodeResolver, clientTxManager, diff --git a/modules/table/src/testFixtures/java/org/apache/ignite/internal/table/impl/DummyInternalTableImpl.java b/modules/table/src/testFixtures/java/org/apache/ignite/internal/table/impl/DummyInternalTableImpl.java index a3a5fecf9d9..5c7533d1d29 100644 --- 
a/modules/table/src/testFixtures/java/org/apache/ignite/internal/table/impl/DummyInternalTableImpl.java +++ b/modules/table/src/testFixtures/java/org/apache/ignite/internal/table/impl/DummyInternalTableImpl.java @@ -237,6 +237,7 @@ public DummyInternalTableImpl( super( "test", nextTableId.getAndIncrement(), + 123, 1, new SingleClusterNodeResolver(LOCAL_NODE), txManager(replicaSvc, placementDriver, txConfiguration, resourcesRegistry), diff --git a/modules/transactions/src/main/java/org/apache/ignite/internal/tx/impl/TxManagerImpl.java b/modules/transactions/src/main/java/org/apache/ignite/internal/tx/impl/TxManagerImpl.java index a8c8414f28e..7597a8fc618 100644 --- a/modules/transactions/src/main/java/org/apache/ignite/internal/tx/impl/TxManagerImpl.java +++ b/modules/transactions/src/main/java/org/apache/ignite/internal/tx/impl/TxManagerImpl.java @@ -82,6 +82,7 @@ import org.apache.ignite.internal.replicator.ReplicaService; import org.apache.ignite.internal.replicator.ReplicationGroupId; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.replicator.exception.PrimaryReplicaMissException; import org.apache.ignite.internal.replicator.exception.ReplicationException; import org.apache.ignite.internal.replicator.exception.ReplicationTimeoutException; @@ -352,11 +353,13 @@ private CompletableFuture primaryReplicaEventListener( Consumer action ) { return inBusyLock(busyLock, () -> { - if (!(eventParameters.groupId() instanceof TablePartitionId)) { + if (!(eventParameters.groupId() instanceof ZonePartitionId)) { return falseCompletedFuture(); } - TablePartitionId groupId = (TablePartitionId) eventParameters.groupId(); + ZonePartitionId zonePartitionId = (ZonePartitionId) eventParameters.groupId(); + + TablePartitionId groupId = new TablePartitionId(zonePartitionId.tableId(), zonePartitionId.partitionId()); action.accept(groupId); diff --git 
a/modules/transactions/src/testFixtures/java/org/apache/ignite/internal/tx/test/ItTransactionTestUtils.java b/modules/transactions/src/testFixtures/java/org/apache/ignite/internal/tx/test/ItTransactionTestUtils.java index 12aa397ae64..e8a4d93e3d7 100644 --- a/modules/transactions/src/testFixtures/java/org/apache/ignite/internal/tx/test/ItTransactionTestUtils.java +++ b/modules/transactions/src/testFixtures/java/org/apache/ignite/internal/tx/test/ItTransactionTestUtils.java @@ -39,6 +39,7 @@ import org.apache.ignite.internal.placementdriver.ReplicaMeta; import org.apache.ignite.internal.replicator.ReplicationGroupId; import org.apache.ignite.internal.replicator.TablePartitionId; +import org.apache.ignite.internal.replicator.ZonePartitionId; import org.apache.ignite.internal.schema.BinaryRowEx; import org.apache.ignite.internal.table.RecordBinaryViewImpl; import org.apache.ignite.internal.table.TableImpl; @@ -61,7 +62,7 @@ public class ItTransactionTestUtils { * @param grpId Group id. * @return Node names. 
*/ - public static Set partitionAssignment(IgniteImpl node, TablePartitionId grpId) { + public static Set partitionAssignment(IgniteImpl node, ZonePartitionId grpId) { MetaStorageManager metaStorageManager = node.metaStorageManager(); ByteArray stableAssignmentKey = stablePartAssignmentsKey(grpId); @@ -121,6 +122,7 @@ public static Tuple findTupleToBeHostedOnNode( boolean primary ) { Tuple t = initialTuple; + int zoneId = zoneId(node, tableName); int tableId = tableId(node, tableName); int maxAttempts = 100; @@ -128,10 +130,11 @@ public static Tuple findTupleToBeHostedOnNode( while (maxAttempts >= 0) { int partId = partitionIdForTuple(node, tableName, t, tx); - TablePartitionId grpId = new TablePartitionId(tableId, partId); + ZonePartitionId grpId = new ZonePartitionId(zoneId, partId); + TablePartitionId tblGrpId = new TablePartitionId(tableId, partId); if (primary) { - ReplicaMeta replicaMeta = waitAndGetPrimaryReplica(node, grpId); + ReplicaMeta replicaMeta = waitAndGetPrimaryReplica(node, tblGrpId); if (node.id().equals(replicaMeta.getLeaseholderId())) { return t; @@ -174,6 +177,17 @@ public static int tableId(IgniteImpl node, String tableName) { return table(node, tableName).tableId(); } + /** + * Returns the zone id. + * + * @param node Any node in the cluster. + * @param tableName Table name. + * @return Zone id. + */ + public static int zoneId(IgniteImpl node, String tableName) { + return table(node, tableName).internalTable().zoneId(); + } + /** * Transaction id. *