diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/AbstractCreateIndexCommand.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/AbstractCreateIndexCommand.java
index 5b5b401af8a..9187abb521d 100644
--- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/AbstractCreateIndexCommand.java
+++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/AbstractCreateIndexCommand.java
@@ -60,7 +60,7 @@ public abstract class AbstractCreateIndexCommand extends AbstractIndexCommand {
this.columns = copyOrNull(columns);
}
- protected abstract CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int creationCatalogVersion);
+ protected abstract CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int zoneId, int creationCatalogVersion);
@Override
public List get(Catalog catalog) {
@@ -84,7 +84,9 @@ public List get(Catalog catalog) {
}
return List.of(
- new NewIndexEntry(createDescriptor(catalog.objectIdGenState(), table.id(), catalog.version() + 1), schemaName),
+ new NewIndexEntry(
+ createDescriptor(catalog.objectIdGenState(), table.id(), table.zoneId(), catalog.version() + 1), schemaName
+ ),
new ObjectIdGenUpdateEntry(1)
);
}
diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateHashIndexCommand.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateHashIndexCommand.java
index 45662b39503..44dc6e2fd68 100644
--- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateHashIndexCommand.java
+++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateHashIndexCommand.java
@@ -48,9 +48,9 @@ private CreateHashIndexCommand(String schemaName, String indexName, String table
}
@Override
- protected CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int creationCatalogVersion) {
+ protected CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int zoneId, int creationCatalogVersion) {
return new CatalogHashIndexDescriptor(
- indexId, indexName, tableId, unique, creationCatalogVersion, columns
+ indexId, indexName, tableId, unique, creationCatalogVersion, zoneId, columns
);
}
diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateSortedIndexCommand.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateSortedIndexCommand.java
index 34c9eb9e9a0..b1cc4dbb65a 100644
--- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateSortedIndexCommand.java
+++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateSortedIndexCommand.java
@@ -61,7 +61,7 @@ private CreateSortedIndexCommand(String schemaName, String indexName, String tab
}
@Override
- protected CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int creationCatalogVersion) {
+ protected CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int zoneId, int creationCatalogVersion) {
var indexColumnDescriptors = new ArrayList(columns.size());
for (int i = 0; i < columns.size(); i++) {
@@ -71,7 +71,7 @@ protected CatalogIndexDescriptor createDescriptor(int indexId, int tableId, int
}
return new CatalogSortedIndexDescriptor(
- indexId, indexName, tableId, unique, creationCatalogVersion, indexColumnDescriptors
+ indexId, indexName, tableId, unique, creationCatalogVersion, zoneId, indexColumnDescriptors
);
}
diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateTableCommand.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateTableCommand.java
index 565cdce0bfc..f043c2d8a5a 100644
--- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateTableCommand.java
+++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/commands/CreateTableCommand.java
@@ -147,7 +147,7 @@ public List get(Catalog catalog) {
ensureNoTableIndexOrSysViewExistsWithGivenName(schema, indexName);
int txWaitCatalogVersion = catalog.version() + 1;
- CatalogIndexDescriptor pkIndex = createIndexDescriptor(txWaitCatalogVersion, indexName, pkIndexId, tableId);
+ CatalogIndexDescriptor pkIndex = createIndexDescriptor(txWaitCatalogVersion, indexName, pkIndexId, tableId, zone.id());
return List.of(
new NewTableEntry(table, schemaName),
@@ -200,7 +200,13 @@ private void validate() {
}
}
- private CatalogIndexDescriptor createIndexDescriptor(int txWaitCatalogVersion, String indexName, int pkIndexId, int tableId) {
+ private CatalogIndexDescriptor createIndexDescriptor(
+ int txWaitCatalogVersion,
+ String indexName,
+ int pkIndexId,
+ int tableId,
+ int zoneId
+ ) {
CatalogIndexDescriptor pkIndex;
if (primaryKey instanceof TableSortedPrimaryKey) {
@@ -221,6 +227,7 @@ private CatalogIndexDescriptor createIndexDescriptor(int txWaitCatalogVersion, S
true,
AVAILABLE,
txWaitCatalogVersion,
+ zoneId,
indexColumns
);
} else if (primaryKey instanceof TableHashPrimaryKey) {
@@ -232,6 +239,7 @@ private CatalogIndexDescriptor createIndexDescriptor(int txWaitCatalogVersion, S
true,
AVAILABLE,
txWaitCatalogVersion,
+ zoneId,
hashPrimaryKey.columns()
);
} else {
diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptor.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptor.java
index e251a6016fb..a25ab5b88e1 100644
--- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptor.java
+++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptor.java
@@ -45,11 +45,20 @@ public class CatalogHashIndexDescriptor extends CatalogIndexDescriptor {
* @param unique Unique flag.
* @param txWaitCatalogVersion Catalog version used in special index status updates to wait for RW transactions, started before
* this version, to finish.
+     * @param zoneId ID of the distribution zone that the index's table belongs to.
* @param columns A list of indexed columns. Must not contains duplicates.
* @throws IllegalArgumentException If columns list contains duplicates.
*/
- public CatalogHashIndexDescriptor(int id, String name, int tableId, boolean unique, int txWaitCatalogVersion, List columns) {
- this(id, name, tableId, unique, CatalogIndexStatus.REGISTERED, txWaitCatalogVersion, columns, INITIAL_CAUSALITY_TOKEN);
+ public CatalogHashIndexDescriptor(
+ int id,
+ String name,
+ int tableId,
+ boolean unique,
+ int txWaitCatalogVersion,
+ int zoneId,
+ List columns
+ ) {
+ this(id, name, tableId, unique, CatalogIndexStatus.REGISTERED, txWaitCatalogVersion, zoneId, columns, INITIAL_CAUSALITY_TOKEN);
}
/**
@@ -72,9 +81,10 @@ public CatalogHashIndexDescriptor(
boolean unique,
CatalogIndexStatus status,
int txWaitCatalogVersion,
+ int zoneId,
List columns
) {
- this(id, name, tableId, unique, status, txWaitCatalogVersion, columns, INITIAL_CAUSALITY_TOKEN);
+ this(id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, columns, INITIAL_CAUSALITY_TOKEN);
}
/**
@@ -98,10 +108,11 @@ private CatalogHashIndexDescriptor(
boolean unique,
CatalogIndexStatus status,
int txWaitCatalogVersion,
+ int zoneId,
List columns,
long causalityToken
) {
- super(CatalogIndexDescriptorType.HASH, id, name, tableId, unique, status, txWaitCatalogVersion, causalityToken);
+ super(CatalogIndexDescriptorType.HASH, id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, causalityToken);
this.columns = List.copyOf(Objects.requireNonNull(columns, "columns"));
}
@@ -126,9 +137,10 @@ public CatalogHashIndexDescriptor readFrom(IgniteDataInput input) throws IOExcep
boolean unique = input.readBoolean();
CatalogIndexStatus status = CatalogIndexStatus.forId(input.readByte());
int txWaitCatalogVersion = input.readInt();
+ int zoneId = input.readInt();
List columns = readStringCollection(input, ArrayList::new);
- return new CatalogHashIndexDescriptor(id, name, tableId, unique, status, txWaitCatalogVersion, columns, updateToken);
+ return new CatalogHashIndexDescriptor(id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, columns, updateToken);
}
@Override
@@ -140,6 +152,7 @@ public void writeTo(CatalogHashIndexDescriptor descriptor, IgniteDataOutput outp
output.writeBoolean(descriptor.unique());
output.writeByte(descriptor.status().id());
output.writeInt(descriptor.txWaitCatalogVersion());
+ output.writeInt(descriptor.zoneId());
writeStringCollection(descriptor.columns(), output);
}
}
diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogIndexDescriptor.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogIndexDescriptor.java
index 93a1affae6f..9a011be3706 100644
--- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogIndexDescriptor.java
+++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogIndexDescriptor.java
@@ -39,14 +39,18 @@ public abstract class CatalogIndexDescriptor extends CatalogObjectDescriptor {
/** Index descriptor type. */
private final CatalogIndexDescriptorType indexType;
+    /** ID of the distribution zone that the index's table belongs to. */
+ private final int zoneId;
+
CatalogIndexDescriptor(CatalogIndexDescriptorType indexType, int id, String name, int tableId, boolean unique,
- CatalogIndexStatus status, int txWaitCatalogVersion, long causalityToken) {
+ CatalogIndexStatus status, int txWaitCatalogVersion, int zoneId, long causalityToken) {
super(id, Type.INDEX, name, causalityToken);
this.indexType = indexType;
this.tableId = tableId;
this.unique = unique;
this.status = Objects.requireNonNull(status, "status");
this.txWaitCatalogVersion = txWaitCatalogVersion;
+ this.zoneId = zoneId;
}
/** Gets table ID. */
@@ -72,6 +76,11 @@ public int txWaitCatalogVersion() {
return txWaitCatalogVersion;
}
+    /** Returns the ID of the distribution zone that the index's table belongs to. */
+ public int zoneId() {
+ return zoneId;
+ }
+
/** Returns catalog index descriptor type. */
public CatalogIndexDescriptorType indexType() {
return indexType;
diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptor.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptor.java
index 6697c162596..e06907511bd 100644
--- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptor.java
+++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptor.java
@@ -46,6 +46,7 @@ public class CatalogSortedIndexDescriptor extends CatalogIndexDescriptor {
* @param unique Unique flag.
* @param txWaitCatalogVersion Catalog version used in special index status updates to wait for RW transactions, started before
* this version, to finish.
+     * @param zoneId ID of the distribution zone that the index's table belongs to.
* @param columns A list of columns descriptors.
* @throws IllegalArgumentException If columns list contains duplicates or columns size doesn't match the collations size.
*/
@@ -55,9 +56,10 @@ public CatalogSortedIndexDescriptor(
int tableId,
boolean unique,
int txWaitCatalogVersion,
+ int zoneId,
List columns
) {
- this(id, name, tableId, unique, REGISTERED, txWaitCatalogVersion, columns);
+ this(id, name, tableId, unique, REGISTERED, txWaitCatalogVersion, zoneId, columns);
}
/**
@@ -80,9 +82,10 @@ public CatalogSortedIndexDescriptor(
boolean unique,
CatalogIndexStatus status,
int txWaitCatalogVersion,
+ int zoneId,
List columns
) {
- this(id, name, tableId, unique, status, txWaitCatalogVersion, columns, INITIAL_CAUSALITY_TOKEN);
+ this(id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, columns, INITIAL_CAUSALITY_TOKEN);
}
/**
@@ -106,10 +109,11 @@ private CatalogSortedIndexDescriptor(
boolean unique,
CatalogIndexStatus status,
int txWaitCatalogVersion,
+ int zoneId,
List columns,
long causalityToken
) {
- super(CatalogIndexDescriptorType.SORTED, id, name, tableId, unique, status, txWaitCatalogVersion, causalityToken);
+ super(CatalogIndexDescriptorType.SORTED, id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, causalityToken);
this.columns = Objects.requireNonNull(columns, "columns");
}
@@ -134,9 +138,10 @@ public CatalogSortedIndexDescriptor readFrom(IgniteDataInput input) throws IOExc
boolean unique = input.readBoolean();
CatalogIndexStatus status = CatalogIndexStatus.forId(input.readByte());
int txWaitCatalogVersion = input.readInt();
+ int zoneId = input.readInt();
List columns = readList(CatalogIndexColumnDescriptor.SERIALIZER, input);
- return new CatalogSortedIndexDescriptor(id, name, tableId, unique, status, txWaitCatalogVersion, columns, updateToken);
+ return new CatalogSortedIndexDescriptor(id, name, tableId, unique, status, txWaitCatalogVersion, zoneId, columns, updateToken);
}
@Override
@@ -148,6 +153,7 @@ public void writeTo(CatalogSortedIndexDescriptor descriptor, IgniteDataOutput ou
output.writeBoolean(descriptor.unique());
output.writeByte(descriptor.status().id());
output.writeInt(descriptor.txWaitCatalogVersion());
+ output.writeInt(descriptor.zoneId());
writeList(descriptor.columns(), CatalogIndexColumnDescriptor.SERIALIZER, output);
}
}
diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/AbstractChangeIndexStatusEntry.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/AbstractChangeIndexStatusEntry.java
index 6c6baac45fa..884440dc21c 100644
--- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/AbstractChangeIndexStatusEntry.java
+++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/AbstractChangeIndexStatusEntry.java
@@ -111,6 +111,7 @@ private static CatalogIndexDescriptor updateHashIndexStatus(
index.unique(),
newStatus,
txWaitCatalogVersion,
+ index.zoneId(),
index.columns()
);
}
@@ -125,6 +126,7 @@ private static CatalogIndexDescriptor updateSortedIndexStatus(
index.unique(),
newStatus,
txWaitCatalogVersion,
+ index.zoneId(),
index.columns()
);
}
diff --git a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/RenameIndexEntry.java b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/RenameIndexEntry.java
index 1b2278f21da..f1ec0f2400a 100644
--- a/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/RenameIndexEntry.java
+++ b/modules/catalog/src/main/java/org/apache/ignite/internal/catalog/storage/RenameIndexEntry.java
@@ -101,6 +101,7 @@ private CatalogIndexDescriptor changeHashIndexName(CatalogHashIndexDescriptor in
index.unique(),
index.status(),
index.txWaitCatalogVersion(),
+ index.zoneId(),
index.columns()
);
}
@@ -113,6 +114,7 @@ private CatalogIndexDescriptor changeSortedIndexName(CatalogSortedIndexDescripto
index.unique(),
index.status(),
index.txWaitCatalogVersion(),
+ index.zoneId(),
index.columns()
);
}
diff --git a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/AbstractChangeIndexStatusCommandValidationTest.java b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/AbstractChangeIndexStatusCommandValidationTest.java
index c40cf4d5409..7db0c9132e0 100644
--- a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/AbstractChangeIndexStatusCommandValidationTest.java
+++ b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/AbstractChangeIndexStatusCommandValidationTest.java
@@ -88,6 +88,7 @@ void exceptionIsThrownIfIndexHasInvalidPreviousStatus(CatalogIndexStatus invalid
false,
invalidPreviousIndexStatus,
version,
+ 0,
List.of(columnName)
)
},
diff --git a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/CatalogUtilsTest.java b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/CatalogUtilsTest.java
index a1e24c4bb84..210b1e2bedc 100644
--- a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/CatalogUtilsTest.java
+++ b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/commands/CatalogUtilsTest.java
@@ -367,6 +367,7 @@ void testReplaceIndex() {
fooIndex.unique(),
fooIndex.status(),
fooIndex.txWaitCatalogVersion(),
+ fooIndex.zoneId(),
fooIndex.columns()
);
diff --git a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptorTest.java b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptorTest.java
index d6153169ec2..b8b565975dc 100644
--- a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptorTest.java
+++ b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogHashIndexDescriptorTest.java
@@ -27,7 +27,7 @@
class CatalogHashIndexDescriptorTest {
@Test
void toStringContainsTypeAndFields() {
- var descriptor = new CatalogHashIndexDescriptor(1, "index1", 2, false, 3, List.of("col"));
+ var descriptor = new CatalogHashIndexDescriptor(1, "index1", 2, false, 3, 0, List.of("col"));
String toString = descriptor.toString();
@@ -36,5 +36,6 @@ void toStringContainsTypeAndFields() {
assertThat(toString, containsString("name=index1"));
assertThat(toString, containsString("tableId=2"));
assertThat(toString, containsString("status=REGISTERED"));
+ assertThat(toString, containsString("zoneId=0"));
}
}
diff --git a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptorTest.java b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptorTest.java
index 979ff659ed6..c461d6a009d 100644
--- a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptorTest.java
+++ b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/descriptors/CatalogSortedIndexDescriptorTest.java
@@ -27,7 +27,7 @@
class CatalogSortedIndexDescriptorTest {
@Test
void toStringContainsTypeAndFields() {
- var descriptor = new CatalogSortedIndexDescriptor(1, "index1", 2, false, 3, List.of());
+ var descriptor = new CatalogSortedIndexDescriptor(1, "index1", 2, false, 3, 0, List.of());
String toString = descriptor.toString();
@@ -36,5 +36,6 @@ void toStringContainsTypeAndFields() {
assertThat(toString, containsString("name=index1"));
assertThat(toString, containsString("tableId=2"));
assertThat(toString, containsString("status=REGISTERED"));
+ assertThat(toString, containsString("zoneId=0"));
}
}
diff --git a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/storage/CatalogEntrySerializationTest.java b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/storage/CatalogEntrySerializationTest.java
index ac0e23b8edc..c346fd4acd3 100644
--- a/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/storage/CatalogEntrySerializationTest.java
+++ b/modules/catalog/src/test/java/org/apache/ignite/internal/catalog/storage/CatalogEntrySerializationTest.java
@@ -452,12 +452,12 @@ private static CatalogSortedIndexDescriptor newSortedIndexDescriptor(String name
CatalogIndexColumnDescriptor idxCol4 = new CatalogIndexColumnDescriptor("C4", CatalogColumnCollation.ASC_NULLS_LAST);
return new CatalogSortedIndexDescriptor(
- 1, name, 12, false, CatalogIndexStatus.AVAILABLE, 1, List.of(idxCol1, idxCol2, idxCol3, idxCol4));
+ 1, name, 12, false, CatalogIndexStatus.AVAILABLE, 1, 0, List.of(idxCol1, idxCol2, idxCol3, idxCol4));
}
private static CatalogHashIndexDescriptor newHashIndexDescriptor(String name) {
return new CatalogHashIndexDescriptor(
- 1, name, 12, true, CatalogIndexStatus.REGISTERED, 1, List.of("C1", "C2"));
+ 1, name, 12, true, CatalogIndexStatus.REGISTERED, 1, 0, List.of("C1", "C2"));
}
private static CatalogTableDescriptor newTableDescriptor(String name, List columns) {
diff --git a/modules/client-handler/src/main/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTracker.java b/modules/client-handler/src/main/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTracker.java
index ee040d01591..514b496a4a2 100644
--- a/modules/client-handler/src/main/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTracker.java
+++ b/modules/client-handler/src/main/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTracker.java
@@ -45,6 +45,7 @@
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent;
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.table.LongPriorityQueue;
import org.apache.ignite.internal.table.distributed.schema.SchemaSyncService;
import org.apache.ignite.internal.util.ExceptionUtils;
@@ -291,11 +292,13 @@ void stop() {
private void onPrimaryReplicaChanged(PrimaryReplicaEventParameters primaryReplicaEvent) {
inBusyLock(busyLock, () -> {
- if (!(primaryReplicaEvent.groupId() instanceof TablePartitionId)) {
+ if (!(primaryReplicaEvent.groupId() instanceof ZonePartitionId)) {
return;
}
- TablePartitionId tablePartitionId = (TablePartitionId) primaryReplicaEvent.groupId();
+ ZonePartitionId zonePartitionId = (ZonePartitionId) primaryReplicaEvent.groupId();
+
+ TablePartitionId tablePartitionId = new TablePartitionId(zonePartitionId.tableId(), zonePartitionId.partitionId());
updatePrimaryReplica(tablePartitionId, primaryReplicaEvent.startTime(), primaryReplicaEvent.leaseholder());
});
diff --git a/modules/client-handler/src/test/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTrackerTest.java b/modules/client-handler/src/test/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTrackerTest.java
index 01e1f6b0b2a..01a643ed823 100644
--- a/modules/client-handler/src/test/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTrackerTest.java
+++ b/modules/client-handler/src/test/java/org/apache/ignite/client/handler/ClientPrimaryReplicaTrackerTest.java
@@ -43,6 +43,8 @@ class ClientPrimaryReplicaTrackerTest extends BaseIgniteAbstractTest {
private static final int TABLE_ID = 123;
+ private static final int ZONE_ID = 1234;
+
private ClientPrimaryReplicaTracker tracker;
private FakePlacementDriver driver;
@@ -52,7 +54,7 @@ class ClientPrimaryReplicaTrackerTest extends BaseIgniteAbstractTest {
@BeforeEach
public void setUp() throws Exception {
driver = new FakePlacementDriver(PARTITIONS);
- driver.setReplicas(List.of("s1", "s2"), TABLE_ID, 1);
+ driver.setReplicas(List.of("s1", "s2"), TABLE_ID, ZONE_ID, 1);
InternalTable internalTable = mock(InternalTable.class);
when(internalTable.partitions()).thenReturn(PARTITIONS);
@@ -90,7 +92,7 @@ public void testUpdateByEvent() {
tracker.start();
assertEquals(1, tracker.maxStartTime());
- driver.updateReplica("s3", TABLE_ID, 0, 2);
+ driver.updateReplica("s3", TABLE_ID, ZONE_ID, 0, 2);
assertEquals(2, tracker.maxStartTime());
@@ -102,11 +104,11 @@ public void testUpdateByEvent() {
@Test
public void testNullReplicas() {
- driver.updateReplica(null, TABLE_ID, 0, 2);
+ driver.updateReplica(null, TABLE_ID, ZONE_ID, 0, 2);
tracker.start();
assertEquals(1, tracker.maxStartTime());
- driver.updateReplica(null, TABLE_ID, 1, 2);
+ driver.updateReplica(null, TABLE_ID, ZONE_ID, 1, 2);
assertEquals(2, tracker.maxStartTime());
@@ -136,10 +138,10 @@ public void testOldEventsAreIgnoredByLeaseStartTime() {
tracker.start();
tracker.primaryReplicasAsync(TABLE_ID, null).join(); // Start tracking the table.
- driver.updateReplica("update-1", TABLE_ID, 0, 10);
- driver.updateReplica("old-update-2", TABLE_ID, 0, 5);
- driver.updateReplica("update-3", TABLE_ID, 0, 15);
- driver.updateReplica("old-update-4", TABLE_ID, 0, 14);
+ driver.updateReplica("update-1", TABLE_ID, ZONE_ID, 0, 10);
+ driver.updateReplica("old-update-2", TABLE_ID, ZONE_ID, 0, 5);
+ driver.updateReplica("update-3", TABLE_ID, ZONE_ID, 0, 15);
+ driver.updateReplica("old-update-4", TABLE_ID, ZONE_ID, 0, 14);
assertEquals(15, tracker.maxStartTime());
diff --git a/modules/client-handler/src/testFixtures/java/org/apache/ignite/client/handler/FakePlacementDriver.java b/modules/client-handler/src/testFixtures/java/org/apache/ignite/client/handler/FakePlacementDriver.java
index 5853de37f18..7d6adf45e1a 100644
--- a/modules/client-handler/src/testFixtures/java/org/apache/ignite/client/handler/FakePlacementDriver.java
+++ b/modules/client-handler/src/testFixtures/java/org/apache/ignite/client/handler/FakePlacementDriver.java
@@ -22,16 +22,19 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import org.apache.ignite.internal.event.AbstractEventProducer;
import org.apache.ignite.internal.hlc.HybridTimestamp;
+import org.apache.ignite.internal.lang.IgniteInternalException;
import org.apache.ignite.internal.placementdriver.PlacementDriver;
import org.apache.ignite.internal.placementdriver.ReplicaMeta;
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent;
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
/**
* Fake placement driver.
@@ -56,21 +59,21 @@ public void returnError(boolean returnError) {
/**
* Sets all primary replicas.
*/
- public void setReplicas(List replicas, int tableId, long leaseStartTime) {
+ public void setReplicas(List replicas, int tableId, int zoneId, long leaseStartTime) {
assert replicas.size() == partitions;
for (int partition = 0; partition < replicas.size(); partition++) {
String replica = replicas.get(partition);
- updateReplica(replica, tableId, partition, leaseStartTime);
+ updateReplica(replica, tableId, zoneId, partition, leaseStartTime);
}
}
/**
* Sets primary replica for the given partition.
*/
- public void updateReplica(String replica, int tableId, int partition, long leaseStartTime) {
+ public void updateReplica(String replica, int tableId, int zoneId, int partition, long leaseStartTime) {
primaryReplicas.set(partition, getReplicaMeta(replica, leaseStartTime));
- TablePartitionId groupId = new TablePartitionId(tableId, partition);
+ ZonePartitionId groupId = new ZonePartitionId(zoneId, tableId, partition);
PrimaryReplicaEventParameters params = new PrimaryReplicaEventParameters(
0,
@@ -93,6 +96,16 @@ public CompletableFuture awaitPrimaryReplica(ReplicationGroupId gro
: CompletableFuture.completedFuture(primaryReplicas.get(id.partitionId()));
}
+ @Override
+ public CompletableFuture awaitPrimaryReplicaForTable(
+ ReplicationGroupId groupId,
+ HybridTimestamp timestamp,
+ long timeout,
+ TimeUnit unit
+ ) {
+ throw new IgniteInternalException("Not implemented yet.");
+ }
+
@Override
public CompletableFuture getPrimaryReplica(ReplicationGroupId replicationGroupId, HybridTimestamp timestamp) {
return awaitPrimaryReplica(replicationGroupId, timestamp, 0, TimeUnit.MILLISECONDS);
@@ -125,6 +138,25 @@ public HybridTimestamp getStartTime() {
public HybridTimestamp getExpirationTime() {
return HybridTimestamp.MAX_VALUE;
}
+
+ @Override
+ public Set subgroups() {
+ return Set.of();
+ }
};
}
+
+ @Override
+ public CompletableFuture addSubgroups(
+ ZonePartitionId zoneId,
+ Long enlistmentConsistencyToken,
+ Set subGrps
+ ) {
+ return nullCompletedFuture();
+ }
+
+ @Override
+ public ReplicaMeta getLeaseMeta(ReplicationGroupId grpId) {
+ return null;
+ }
}
diff --git a/modules/client/src/test/java/org/apache/ignite/client/PartitionAwarenessTest.java b/modules/client/src/test/java/org/apache/ignite/client/PartitionAwarenessTest.java
index 8ac8a715615..2bb4e8a29c1 100644
--- a/modules/client/src/test/java/org/apache/ignite/client/PartitionAwarenessTest.java
+++ b/modules/client/src/test/java/org/apache/ignite/client/PartitionAwarenessTest.java
@@ -86,6 +86,8 @@ public class PartitionAwarenessTest extends AbstractClientTest {
private static final AtomicInteger nextTableId = new AtomicInteger(101);
+ private static final int zoneId = 1234;
+
/**
* Before all.
*/
@@ -651,7 +653,7 @@ private static void initPrimaryReplicas(FakePlacementDriver placementDriver, @Nu
replicas = defaultReplicas();
}
- placementDriver.setReplicas(replicas, nextTableId.get() - 1, leaseStartTime);
+ placementDriver.setReplicas(replicas, nextTableId.get() - 1, zoneId, leaseStartTime);
}
private static List defaultReplicas() {
diff --git a/modules/client/src/test/java/org/apache/ignite/client/fakes/FakeInternalTable.java b/modules/client/src/test/java/org/apache/ignite/client/fakes/FakeInternalTable.java
index 8016953e97f..0811a1f0603 100644
--- a/modules/client/src/test/java/org/apache/ignite/client/fakes/FakeInternalTable.java
+++ b/modules/client/src/test/java/org/apache/ignite/client/fakes/FakeInternalTable.java
@@ -116,6 +116,11 @@ public int partitionId(BinaryRowEx row) {
return 0;
}
+ @Override
+ public int zoneId() {
+ return 123;
+ }
+
@Override
public CompletableFuture get(BinaryRowEx keyRow, @Nullable InternalTransaction tx) {
return completedFuture(getImpl(keyRow.tupleSlice(), keyRow));
diff --git a/modules/compute/src/main/java/org/apache/ignite/internal/compute/IgniteComputeImpl.java b/modules/compute/src/main/java/org/apache/ignite/internal/compute/IgniteComputeImpl.java
index 0bd0d64e36f..5f7beff0d4c 100644
--- a/modules/compute/src/main/java/org/apache/ignite/internal/compute/IgniteComputeImpl.java
+++ b/modules/compute/src/main/java/org/apache/ignite/internal/compute/IgniteComputeImpl.java
@@ -50,7 +50,7 @@
import org.apache.ignite.compute.task.ComputeJobRunner;
import org.apache.ignite.internal.hlc.HybridClock;
import org.apache.ignite.internal.placementdriver.PlacementDriver;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.table.IgniteTablesInternal;
import org.apache.ignite.internal.table.TableViewInternal;
import org.apache.ignite.internal.util.CompletableFutures;
@@ -320,9 +320,9 @@ private CompletableFuture primaryReplicaForPartitionByMappedKey
}
private CompletableFuture primaryReplicaForPartition(TableViewInternal table, int partitionIndex) {
- TablePartitionId tablePartitionId = new TablePartitionId(table.tableId(), partitionIndex);
+ ZonePartitionId zonePartitionId = new ZonePartitionId(table.internalTable().zoneId(), table.tableId(), partitionIndex);
- return placementDriver.awaitPrimaryReplica(tablePartitionId, clock.now(), 30, TimeUnit.SECONDS)
+ return placementDriver.awaitPrimaryReplicaForTable(zonePartitionId, clock.now(), 30, TimeUnit.SECONDS)
.thenApply(replicaMeta -> {
if (replicaMeta != null && replicaMeta.getLeaseholderId() != null) {
return topologyService.getById(replicaMeta.getLeaseholderId());
diff --git a/modules/compute/src/main/java/org/apache/ignite/internal/compute/NextColocatedWorkerSelector.java b/modules/compute/src/main/java/org/apache/ignite/internal/compute/NextColocatedWorkerSelector.java
index a9055ad5f51..5eb89bd170b 100644
--- a/modules/compute/src/main/java/org/apache/ignite/internal/compute/NextColocatedWorkerSelector.java
+++ b/modules/compute/src/main/java/org/apache/ignite/internal/compute/NextColocatedWorkerSelector.java
@@ -22,7 +22,7 @@
import org.apache.ignite.internal.hlc.HybridClock;
import org.apache.ignite.internal.placementdriver.PlacementDriver;
import org.apache.ignite.internal.placementdriver.ReplicaMeta;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.table.TableViewInternal;
import org.apache.ignite.network.ClusterNode;
import org.apache.ignite.network.TopologyService;
@@ -93,9 +93,9 @@ private NextColocatedWorkerSelector(
this.tuple = tuple;
}
- private CompletableFuture tryToFindPrimaryReplica(TablePartitionId tablePartitionId) {
- return placementDriver.awaitPrimaryReplica(
- tablePartitionId,
+ private CompletableFuture tryToFindPrimaryReplica(ZonePartitionId zonePartitionId) {
+ return placementDriver.awaitPrimaryReplicaForTable(
+ zonePartitionId,
clock.now().addPhysicalTime(PRIMARY_REPLICA_ASK_CLOCK_ADDITION_MILLIS),
AWAIT_FOR_PRIMARY_REPLICA_SECONDS,
TimeUnit.SECONDS
@@ -105,15 +105,15 @@ private CompletableFuture tryToFindPrimaryReplica(TablePartitionId
@Override
public CompletableFuture next() {
- TablePartitionId tablePartitionId = tablePartitionId();
- return tryToFindPrimaryReplica(tablePartitionId);
+ ZonePartitionId zonePartitionId = zonePartitionId();
+ return tryToFindPrimaryReplica(zonePartitionId);
}
- private TablePartitionId tablePartitionId() {
+ private ZonePartitionId zonePartitionId() {
if (key != null && keyMapper != null) {
- return new TablePartitionId(table.tableId(), table.partition(key, keyMapper));
+ return new ZonePartitionId(table.internalTable().zoneId(), table.tableId(), table.partition(key, keyMapper));
} else {
- return new TablePartitionId(table.tableId(), table.partition(tuple));
+ return new ZonePartitionId(table.internalTable().zoneId(), table.tableId(), table.partition(tuple));
}
}
}
diff --git a/modules/compute/src/test/java/org/apache/ignite/internal/compute/IgniteComputeImplTest.java b/modules/compute/src/test/java/org/apache/ignite/internal/compute/IgniteComputeImplTest.java
index 7a078086e17..a8940d85e06 100644
--- a/modules/compute/src/test/java/org/apache/ignite/internal/compute/IgniteComputeImplTest.java
+++ b/modules/compute/src/test/java/org/apache/ignite/internal/compute/IgniteComputeImplTest.java
@@ -49,6 +49,7 @@
import org.apache.ignite.internal.placementdriver.PlacementDriver;
import org.apache.ignite.internal.placementdriver.ReplicaMeta;
import org.apache.ignite.internal.table.IgniteTablesInternal;
+import org.apache.ignite.internal.table.InternalTable;
import org.apache.ignite.internal.table.TableViewInternal;
import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest;
import org.apache.ignite.network.ClusterNode;
@@ -92,6 +93,9 @@ class IgniteComputeImplTest extends BaseIgniteAbstractTest {
@Mock
private TableViewInternal table;
+ @Mock
+ private InternalTable internalTable;
+
private final ClusterNode localNode = new ClusterNodeImpl("local", "local", new NetworkAddress("local-host", 1));
private final ClusterNode remoteNode = new ClusterNodeImpl("remote", "remote", new NetworkAddress("remote-host", 1));
@@ -103,6 +107,8 @@ void setupMocks() {
lenient().when(topologyService.localMember()).thenReturn(localNode);
lenient().when(topologyService.getByConsistentId(localNode.name())).thenReturn(localNode);
lenient().when(topologyService.getByConsistentId(remoteNode.name())).thenReturn(remoteNode);
+ lenient().when(table.internalTable()).thenReturn(internalTable);
+ lenient().when(table.tableId()).thenReturn(42);
}
@Test
@@ -212,7 +218,7 @@ private void respondWhenAskForPrimaryReplica() {
ReplicaMeta replicaMeta = mock(ReplicaMeta.class);
doReturn("").when(replicaMeta).getLeaseholderId();
CompletableFuture toBeReturned = completedFuture(replicaMeta);
- doReturn(toBeReturned).when(placementDriver).awaitPrimaryReplica(any(), any(), anyLong(), any());
+ doReturn(toBeReturned).when(placementDriver).awaitPrimaryReplicaForTable(any(), any(), anyLong(), any());
doReturn(remoteNode).when(topologyService).getById(any());
}
diff --git a/modules/core/src/main/java/org/apache/ignite/internal/replicator/ZonePartitionId.java b/modules/core/src/main/java/org/apache/ignite/internal/replicator/ZonePartitionId.java
new file mode 100644
index 00000000000..d6c07698370
--- /dev/null
+++ b/modules/core/src/main/java/org/apache/ignite/internal/replicator/ZonePartitionId.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.internal.replicator;
+
+import java.util.Objects;
+
+/**
+ * The class is used to identify a zone replication group id for a given partition.
+ */
+public class ZonePartitionId implements ReplicationGroupId {
+ private final int zoneId;
+
+ private final int tableId;
+
+ private final int partId;
+
+ /**
+ * The constructor.
+ *
+ * @param zoneId Zone id.
+ * @param partId Partition id.
+ */
+ public ZonePartitionId(int zoneId, int partId) {
+ this.zoneId = zoneId;
+ this.partId = partId;
+ this.tableId = 0;
+ }
+
+ /**
+ * The constructor.
+ *
+ * @param zoneId Zone id.
+ * @param tableId Table id.
+ * @param partId Partition id.
+ */
+ public ZonePartitionId(int zoneId, int tableId, int partId) {
+ assert tableId != 0 : "Use constructor with two parameters.";
+
+ this.zoneId = zoneId;
+ this.tableId = tableId;
+ this.partId = partId;
+ }
+
+ /**
+ * Get the zone id.
+ *
+ * @return Zone id.
+ */
+ public int zoneId() {
+ return zoneId;
+ }
+
+ /**
+ * Get the table id.
+ *
+ * @return Table id.
+ */
+ public int tableId() {
+ return tableId;
+ }
+
+ /**
+ * Get the partition id.
+ *
+ * @return Partition id.
+ */
+ public int partitionId() {
+ return partId;
+ }
+
+ /**
+ * Converts a string representation of zone partition id to the object.
+ *
+ * @param str String representation.
+ * @return A zone partition id.
+ */
+ public static ZonePartitionId fromString(String str) {
+ String[] parts = str.split("_part_");
+
+ return new ZonePartitionId(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]));
+ }
+
+ /**
+ * Returns a copy of this id without the table id, or this instance as-is if the table id is already unset.
+ *
+ * @return Pure zone partition id.
+ */
+ public ZonePartitionId purify() {
+ if (tableId == 0) {
+ return this;
+ }
+
+ return new ZonePartitionId(zoneId, partId);
+ }
+
+ @Override
+ public String toString() {
+ return zoneId + "_part_" + partId;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ ZonePartitionId that = (ZonePartitionId) o;
+
+ return zoneId == that.zoneId && partId == that.partId && tableId == that.tableId;
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(zoneId, partId, tableId);
+ }
+}
diff --git a/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItDistributionZonesFiltersTest.java b/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItDistributionZonesFiltersTest.java
index e676b39c857..9f336298be2 100644
--- a/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItDistributionZonesFiltersTest.java
+++ b/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItDistributionZonesFiltersTest.java
@@ -45,7 +45,7 @@
import org.apache.ignite.internal.lang.ByteArray;
import org.apache.ignite.internal.metastorage.Entry;
import org.apache.ignite.internal.metastorage.MetaStorageManager;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.table.TableViewInternal;
import org.apache.ignite.internal.table.distributed.TableManager;
import org.apache.ignite.internal.testframework.IgniteTestUtils;
@@ -130,11 +130,7 @@ void testFilteredDataNodesPropagatedToStable() throws Exception {
MetaStorageManager metaStorageManager = (MetaStorageManager) IgniteTestUtils
.getFieldValue(node, IgniteImpl.class, "metaStorageMgr");
- TableManager tableManager = (TableManager) IgniteTestUtils.getFieldValue(node, IgniteImpl.class, "distributedTblMgr");
-
- TableViewInternal table = (TableViewInternal) tableManager.table(TABLE_NAME);
-
- TablePartitionId partId = new TablePartitionId(table.tableId(), 0);
+ ZonePartitionId partId = new ZonePartitionId(getZoneId(node), 0);
assertValueInStorage(
metaStorageManager,
@@ -199,11 +195,7 @@ void testAlteringFiltersPropagatedDataNodesToStableImmediately() throws Exceptio
MetaStorageManager metaStorageManager = (MetaStorageManager) IgniteTestUtils
.getFieldValue(node0, IgniteImpl.class, "metaStorageMgr");
- TableManager tableManager = (TableManager) IgniteTestUtils.getFieldValue(node0, IgniteImpl.class, "distributedTblMgr");
-
- TableViewInternal table = (TableViewInternal) tableManager.table(TABLE_NAME);
-
- TablePartitionId partId = new TablePartitionId(table.tableId(), 0);
+ ZonePartitionId partId = new ZonePartitionId(getZoneId(node0), 0);
assertValueInStorage(
metaStorageManager,
@@ -254,11 +246,7 @@ void testEmptyDataNodesDoNotPropagatedToStableAfterAlteringFilter() throws Excep
MetaStorageManager metaStorageManager = (MetaStorageManager) IgniteTestUtils
.getFieldValue(node0, IgniteImpl.class, "metaStorageMgr");
- TableManager tableManager = (TableManager) IgniteTestUtils.getFieldValue(node0, IgniteImpl.class, "distributedTblMgr");
-
- TableViewInternal table = (TableViewInternal) tableManager.table(TABLE_NAME);
-
- TablePartitionId partId = new TablePartitionId(table.tableId(), 0);
+ ZonePartitionId partId = new ZonePartitionId(getZoneId(node0), 0);
assertValueInStorage(
metaStorageManager,
@@ -332,7 +320,7 @@ void testFilteredEmptyDataNodesDoNotTriggerRebalance() throws Exception {
TableViewInternal table = (TableViewInternal) tableManager.table(TABLE_NAME);
- TablePartitionId partId = new TablePartitionId(table.tableId(), 0);
+ ZonePartitionId partId = new ZonePartitionId(zoneId, 0);
// Table was created after both nodes was up, so there wasn't any rebalance.
assertPendingAssignmentsWereNeverExist(metaStorageManager, partId);
@@ -372,11 +360,7 @@ void testFilteredEmptyDataNodesDoNotTriggerRebalanceOnReplicaUpdate() throws Exc
node0.sql().execute(null, createTableSql());
- TableManager tableManager = (TableManager) IgniteTestUtils.getFieldValue(node0, IgniteImpl.class, "distributedTblMgr");
-
- TableViewInternal table = (TableViewInternal) tableManager.table(TABLE_NAME);
-
- TablePartitionId partId = new TablePartitionId(table.tableId(), 0);
+ ZonePartitionId partId = new ZonePartitionId(zoneId, 0);
// Table was created after both nodes was up, so there wasn't any rebalance.
assertPendingAssignmentsWereNeverExist(metaStorageManager, partId);
@@ -443,7 +427,7 @@ private static void waitDataNodeAndListenersAreHandled(
private static void assertPendingAssignmentsWereNeverExist(
MetaStorageManager metaStorageManager,
- TablePartitionId partId
+ ZonePartitionId partId
) throws InterruptedException, ExecutionException {
assertTrue(metaStorageManager.get(pendingPartAssignmentsKey(partId)).get().empty());
}
diff --git a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngine.java b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngine.java
index 39448ae24f5..29edaf223c4 100644
--- a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngine.java
+++ b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/DistributionZoneRebalanceEngine.java
@@ -20,23 +20,21 @@
import static java.util.concurrent.CompletableFuture.allOf;
import static java.util.concurrent.CompletableFuture.completedFuture;
import static org.apache.ignite.internal.catalog.events.CatalogEvent.ZONE_ALTER;
-import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.DISTRIBUTION_ZONE_DATA_NODES_VALUE_PREFIX;
import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.filterDataNodes;
import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.findTablesByZoneId;
import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.parseDataNodes;
import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.zoneDataNodesKey;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceRaftGroupEventsListener.doStableKeySwitch;
+import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.TABLES_COUNTER_PREFIX;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractPartitionNumber;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractZoneId;
+import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractZoneIdDataNodes;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.raftConfigurationAppliedKey;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.tablesCounterPrefixKey;
import static org.apache.ignite.internal.util.ByteUtils.fromBytes;
import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture;
-import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
@@ -56,14 +54,13 @@
import org.apache.ignite.internal.distributionzones.DistributionZoneManager;
import org.apache.ignite.internal.distributionzones.Node;
import org.apache.ignite.internal.distributionzones.utils.CatalogAlterZoneEventListener;
-import org.apache.ignite.internal.lang.ByteArray;
import org.apache.ignite.internal.logger.IgniteLogger;
import org.apache.ignite.internal.logger.Loggers;
import org.apache.ignite.internal.metastorage.Entry;
import org.apache.ignite.internal.metastorage.MetaStorageManager;
import org.apache.ignite.internal.metastorage.WatchEvent;
import org.apache.ignite.internal.metastorage.WatchListener;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.util.ExceptionUtils;
import org.apache.ignite.internal.util.IgniteSpinBusyLock;
import org.apache.ignite.internal.util.IgniteUtils;
@@ -202,7 +199,7 @@ public CompletableFuture onUpdate(WatchEvent evt) {
return nullCompletedFuture();
}
- int zoneId = extractZoneId(evt.entryEvent().newEntry().key(), DISTRIBUTION_ZONE_DATA_NODES_VALUE_PREFIX);
+ int zoneId = extractZoneIdDataNodes(evt.entryEvent().newEntry().key());
// It is safe to get the latest version of the catalog as we are in the metastore thread.
int catalogVersion = catalogService.latestCatalogVersion();
@@ -224,13 +221,10 @@ public CompletableFuture onUpdate(WatchEvent evt) {
return nullCompletedFuture();
}
- List tableDescriptors = findTablesByZoneId(zoneId, catalogVersion, catalogService);
-
- return triggerPartitionsRebalanceForAllTables(
+ return triggerPartitionsRebalanceForZone(
evt.entryEvent().newEntry().revision(),
zoneDescriptor,
- filteredDataNodes,
- tableDescriptors
+ filteredDataNodes
);
});
}
@@ -263,7 +257,7 @@ public CompletableFuture onUpdate(WatchEvent event) {
return nullCompletedFuture();
}
- int zoneId = RebalanceUtil.extractZoneIdFromTablesCounter(event.entryEvent().newEntry().key());
+ int zoneId = extractZoneId(event.entryEvent().newEntry().key(), TABLES_COUNTER_PREFIX);
// TODO: https://issues.apache.org/jira/browse/IGNITE-21254 tables here must be the same as they were on rebalance start
List tables = findTablesByZoneId(zoneId, catalogService.latestCatalogVersion(), catalogService);
@@ -279,27 +273,20 @@ public CompletableFuture onUpdate(WatchEvent event) {
);
try {
- Map partitionTablesKeys = new HashMap<>();
-
int partId = extractPartitionNumber(event.entryEvent().newEntry().key());
- for (CatalogTableDescriptor table : tables) {
- TablePartitionId replicaGrpId = new TablePartitionId(table.id(), partId);
- partitionTablesKeys.put(raftConfigurationAppliedKey(replicaGrpId), replicaGrpId);
- }
+ ZonePartitionId replicaGrpId = new ZonePartitionId(zoneId, partId);
- Map entriesMap = metaStorageManager.getAll(partitionTablesKeys.keySet()).get();
+ Entry assignmentEntry = metaStorageManager.get(raftConfigurationAppliedKey(replicaGrpId)).get();
- entriesMap.forEach((key, stable) -> {
- doStableKeySwitch(
- Assignments.fromBytes(stable.value()).nodes(),
- partitionTablesKeys.get(key),
- event.revision(),
- metaStorageManager,
- catalogService,
- distributionZoneManager
- );
- });
+ tables.forEach(tbl -> doStableKeySwitch(
+ Assignments.fromBytes(assignmentEntry.value()).nodes(),
+ replicaGrpId,
+ event.revision(),
+ metaStorageManager,
+ catalogService,
+ distributionZoneManager
+ ));
} catch (Exception e) {
LOG.error(
@@ -332,24 +319,24 @@ private CompletableFuture onUpdateReplicas(AlterZoneEventParameters parame
}
static CompletableFuture> calculateAssignments(
- TablePartitionId tablePartitionId,
+ ZonePartitionId zonePartitionId,
CatalogService catalogService,
DistributionZoneManager distributionZoneManager
) {
int catalogVersion = catalogService.latestCatalogVersion();
- CatalogTableDescriptor tableDescriptor = catalogService.table(tablePartitionId.tableId(), catalogVersion);
+ CatalogZoneDescriptor zoneDescriptor = catalogService.zone(zonePartitionId.zoneId(), catalogVersion);
- CatalogZoneDescriptor zoneDescriptor = catalogService.zone(tableDescriptor.zoneId(), catalogVersion);
+ int zoneId = zonePartitionId.zoneId();
return distributionZoneManager.dataNodes(
zoneDescriptor.updateToken(),
catalogVersion,
- tableDescriptor.zoneId()
+ zoneId
).thenApply(dataNodes ->
AffinityUtils.calculateAssignmentForPartition(
dataNodes,
- tablePartitionId.partitionId(),
+ zonePartitionId.partitionId(),
zoneDescriptor.replicas()
)
);
@@ -375,72 +362,61 @@ private CompletableFuture recalculateAssignmentsAndScheduleRebalance(
return nullCompletedFuture();
}
- List tableDescriptors = findTablesByZoneId(zoneDescriptor.id(), catalogVersion, catalogService);
-
- return triggerPartitionsRebalanceForAllTables(
+ return triggerPartitionsRebalanceForZone(
causalityToken,
zoneDescriptor,
- dataNodes,
- tableDescriptors
+ dataNodes
);
});
}
- private CompletableFuture triggerPartitionsRebalanceForAllTables(
+ private CompletableFuture triggerPartitionsRebalanceForZone(
long revision,
CatalogZoneDescriptor zoneDescriptor,
- Set dataNodes,
- List tableDescriptors
+ Set dataNodes
) {
- List> tableFutures = new ArrayList<>(tableDescriptors.size());
-
- for (CatalogTableDescriptor tableDescriptor : tableDescriptors) {
- CompletableFuture>[] partitionFutures = RebalanceUtil.triggerAllTablePartitionsRebalance(
- tableDescriptor,
- zoneDescriptor,
- dataNodes,
- revision,
- metaStorageManager
- );
-
- // This set is used to deduplicate exceptions (if there is an exception from upstream, for instance,
- // when reading from MetaStorage, it will be encountered by every partition future) to avoid noise
- // in the logs.
- Set unwrappedCauses = ConcurrentHashMap.newKeySet();
-
- for (int partId = 0; partId < partitionFutures.length; partId++) {
- int finalPartId = partId;
-
- partitionFutures[partId].exceptionally(e -> {
- Throwable cause = ExceptionUtils.unwrapCause(e);
-
- if (unwrappedCauses.add(cause)) {
- // The exception is specific to this partition.
- LOG.error(
- "Exception on updating assignments for [table={}, partition={}]",
- e,
- tableInfo(tableDescriptor), finalPartId
- );
- } else {
- // The exception is from upstream and not specific for this partition, so don't log the partition index.
- LOG.error(
- "Exception on updating assignments for [table={}]",
- e,
- tableInfo(tableDescriptor)
- );
- }
+ CompletableFuture>[] partitionFutures = RebalanceUtil.triggerZonePartitionsRebalance(
+ zoneDescriptor,
+ dataNodes,
+ revision,
+ metaStorageManager
+ );
- return null;
- });
- }
+ // This set is used to deduplicate exceptions (if there is an exception from upstream, for instance,
+ // when reading from MetaStorage, it will be encountered by every partition future) to avoid noise
+ // in the logs.
+ Set unwrappedCauses = ConcurrentHashMap.newKeySet();
+
+ for (int partId = 0; partId < partitionFutures.length; partId++) {
+ int finalPartId = partId;
+
+ partitionFutures[partId].exceptionally(e -> {
+ Throwable cause = ExceptionUtils.unwrapCause(e);
+
+ if (unwrappedCauses.add(cause)) {
+ // The exception is specific to this partition.
+ LOG.error(
+ "Exception on updating assignments for [zone={}, partition={}]",
+ e,
+ zoneInfo(zoneDescriptor), finalPartId
+ );
+ } else {
+ // The exception is from upstream and not specific for this partition, so don't log the partition index.
+ LOG.error(
+ "Exception on updating assignments for [zone={}]",
+ e,
+ zoneInfo(zoneDescriptor)
+ );
+ }
- tableFutures.add(allOf(partitionFutures));
+ return null;
+ });
}
- return allOf(tableFutures.toArray(CompletableFuture[]::new));
+ return allOf(partitionFutures);
}
- private static String tableInfo(CatalogTableDescriptor tableDescriptor) {
- return tableDescriptor.id() + "/" + tableDescriptor.name();
+ private static String zoneInfo(CatalogZoneDescriptor zoneDescriptor) {
+ return zoneDescriptor.id() + "/" + zoneDescriptor.name();
}
}
diff --git a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceRaftGroupEventsListener.java b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceRaftGroupEventsListener.java
index dcab08ef394..aa8037195e8 100644
--- a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceRaftGroupEventsListener.java
+++ b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceRaftGroupEventsListener.java
@@ -67,13 +67,13 @@
import org.apache.ignite.internal.raft.RaftError;
import org.apache.ignite.internal.raft.RaftGroupEventsListener;
import org.apache.ignite.internal.raft.Status;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.util.ByteUtils;
import org.apache.ignite.internal.util.IgniteSpinBusyLock;
/**
- * Listener for the raft group events, which must provide correct error handling of rebalance process
- * and start new rebalance after the current one finished.
+ * Listener for the raft group events, which must provide correct error handling of rebalance process and start new rebalance after the
+ * current one finished.
*/
public class RebalanceRaftGroupEventsListener implements RaftGroupEventsListener {
/** Ignite logger. */
@@ -122,7 +122,9 @@ public class RebalanceRaftGroupEventsListener implements RaftGroupEventsListener
private final MetaStorageManager metaStorageMgr;
/** Unique table partition id. */
- private final TablePartitionId tablePartitionId;
+ private final ZonePartitionId zonePartitionId;
+
+ private final int tableId;
/** Busy lock of parent component for synchronous stop. */
private final IgniteSpinBusyLock busyLock;
@@ -130,38 +132,35 @@ public class RebalanceRaftGroupEventsListener implements RaftGroupEventsListener
/** Executor for scheduling rebalance retries. */
private final ScheduledExecutorService rebalanceScheduler;
- /** Zone id. */
- private final int zoneId;
-
/** Performs reconfiguration of a Raft group of a partition. */
private final PartitionMover partitionMover;
/** Attempts to retry the current rebalance in case of errors. */
- private final AtomicInteger rebalanceAttempts = new AtomicInteger(0);
+ private final AtomicInteger rebalanceAttempts = new AtomicInteger(0);
/**
* Constructs new listener.
*
* @param metaStorageMgr Meta storage manager.
- * @param tablePartitionId Partition id.
+ * @param zonePartitionId Zone partition id.
* @param busyLock Busy lock.
* @param partitionMover Class that moves partition between nodes.
* @param rebalanceScheduler Executor for scheduling rebalance retries.
*/
public RebalanceRaftGroupEventsListener(
MetaStorageManager metaStorageMgr,
- TablePartitionId tablePartitionId,
+ ZonePartitionId zonePartitionId,
IgniteSpinBusyLock busyLock,
PartitionMover partitionMover,
ScheduledExecutorService rebalanceScheduler,
- int zoneId
+ int tableId
) {
this.metaStorageMgr = metaStorageMgr;
- this.tablePartitionId = tablePartitionId;
+ this.zonePartitionId = zonePartitionId;
this.busyLock = busyLock;
this.partitionMover = partitionMover;
this.rebalanceScheduler = rebalanceScheduler;
- this.zoneId = zoneId;
+ this.tableId = tableId;
}
/** {@inheritDoc} */
@@ -180,7 +179,7 @@ public void onLeaderElected(long term) {
try {
rebalanceAttempts.set(0);
- byte[] pendingAssignmentsBytes = metaStorageMgr.get(pendingPartAssignmentsKey(tablePartitionId)).get().value();
+ byte[] pendingAssignmentsBytes = metaStorageMgr.get(pendingPartAssignmentsKey(zonePartitionId)).get().value();
if (pendingAssignmentsBytes != null) {
Set pendingAssignments = Assignments.fromBytes(pendingAssignmentsBytes).nodes();
@@ -198,7 +197,7 @@ public void onLeaderElected(long term) {
LOG.info(
"New leader elected. Going to apply new configuration [tablePartitionId={}, peers={}, learners={}]",
- tablePartitionId, peers, learners
+ zonePartitionId, peers, learners
);
PeersAndLearners peersAndLearners = PeersAndLearners.fromConsistentIds(peers, learners);
@@ -207,7 +206,7 @@ public void onLeaderElected(long term) {
}
} catch (Exception e) {
// TODO: IGNITE-14693
- LOG.warn("Unable to start rebalance [tablePartitionId, term={}]", e, tablePartitionId, term);
+ LOG.warn("Unable to start rebalance [tablePartitionId, term={}]", e, zonePartitionId, term);
} finally {
busyLock.leaveBusy();
}
@@ -251,9 +250,9 @@ public void onNewPeersConfigurationApplied(PeersAndLearners configuration) {
*/
private void countDownPartitionsFromZone(Set stable) {
try {
- int partId = tablePartitionId.partitionId();
+ int partId = zonePartitionId.partitionId();
- Entry counterEntry = metaStorageMgr.get(tablesCounterKey(zoneId, partId)).get();
+ Entry counterEntry = metaStorageMgr.get(tablesCounterKey(zonePartitionId)).get();
assert counterEntry.value() != null;
@@ -261,32 +260,33 @@ private void countDownPartitionsFromZone(Set stable) {
assert !counter.isEmpty();
- if (!counter.contains(tablePartitionId.tableId())) {
+ if (!counter.contains(tableId)) {
// Count down for this table has already been processed, just skip.
// For example, this can happen when leader re-election happened during the rebalance process.
return;
}
- Condition condition = value(tablesCounterKey(zoneId, partId)).eq(counterEntry.value());
+ Condition condition = value(tablesCounterKey(zonePartitionId)).eq(counterEntry.value());
byte[] stableArray = Assignments.toBytes(stable);
- counter.remove(tablePartitionId.tableId());
+ counter.remove(tableId);
if (counter.isEmpty()) {
counter = Set.of();
}
Update successCase = ops(
- put(tablesCounterKey(zoneId, partId), toBytes(counter)),
+ put(tablesCounterKey(zonePartitionId), toBytes(counter)),
// Todo: change to one key https://issues.apache.org/jira/browse/IGNITE-18991
- put(raftConfigurationAppliedKey(tablePartitionId), stableArray)
+ put(raftConfigurationAppliedKey(zonePartitionId), stableArray)
).yield(TABLES_COUNTER_DECREMENT_SUCCESS);
Update failCase = ops().yield(PART_COUNTER_DECREMENT_FAIL);
int res = metaStorageMgr.invoke(iif(condition, successCase, failCase)).get().getAsInt();
+ int zoneId = zonePartitionId.zoneId();
if (res < 0) {
LOG.info("Count down of zone's tables counter is failed. "
+ "Going to retry [zoneId={}, appliedPeers={}]",
@@ -309,7 +309,7 @@ private void countDownPartitionsFromZone(Set stable) {
rebalanceAttempts.set(0);
} catch (InterruptedException | ExecutionException e) {
// TODO: IGNITE-14693
- LOG.warn("Unable to count down partitions counter in metastore: " + tablePartitionId, e);
+ LOG.warn("Unable to count down partitions counter in metastore: " + zonePartitionId, e);
}
}
@@ -325,7 +325,7 @@ public void onReconfigurationError(Status status, PeersAndLearners configuration
if (status.equals(Status.LEADER_STEPPED_DOWN)) {
// Leader stepped down, so we are expecting RebalanceRaftGroupEventsListener.onLeaderElected to be called on a new leader.
- LOG.info("Leader stepped down during rebalance [partId={}]", tablePartitionId);
+ LOG.info("Leader stepped down during rebalance [partId={}]", zonePartitionId);
return;
}
@@ -335,12 +335,12 @@ public void onReconfigurationError(Status status, PeersAndLearners configuration
assert raftError == RaftError.ECATCHUP : "According to the JRaft protocol, " + RaftError.ECATCHUP
+ " is expected, got " + raftError;
- LOG.debug("Error occurred during rebalance [partId={}]", tablePartitionId);
+ LOG.debug("Error occurred during rebalance [partId={}]", zonePartitionId);
if (rebalanceAttempts.incrementAndGet() < REBALANCE_RETRY_THRESHOLD) {
scheduleChangePeers(configuration, term);
} else {
- LOG.info("Number of retries for rebalance exceeded the threshold [partId={}, threshold={}]", tablePartitionId,
+ LOG.info("Number of retries for rebalance exceeded the threshold [partId={}, threshold={}]", zonePartitionId,
REBALANCE_RETRY_THRESHOLD);
// TODO: currently we just retry intent to change peers according to the rebalance infinitely, until new leader is elected,
@@ -364,7 +364,7 @@ private void scheduleChangePeers(PeersAndLearners peersAndLearners, long term) {
return;
}
- LOG.info("Going to retry rebalance [attemptNo={}, partId={}]", rebalanceAttempts.get(), tablePartitionId);
+ LOG.info("Going to retry rebalance [attemptNo={}, partId={}]", rebalanceAttempts.get(), zonePartitionId);
try {
partitionMover.movePartition(peersAndLearners, term).join();
@@ -379,19 +379,19 @@ private void scheduleChangePeers(PeersAndLearners peersAndLearners, long term) {
*/
static void doStableKeySwitch(
Set stableFromRaft,
- TablePartitionId tablePartitionId,
+ ZonePartitionId zonePartitionId,
long revision,
MetaStorageManager metaStorageMgr,
CatalogService catalogService,
DistributionZoneManager distributionZoneManager
) {
try {
- ByteArray pendingPartAssignmentsKey = pendingPartAssignmentsKey(tablePartitionId);
- ByteArray stablePartAssignmentsKey = stablePartAssignmentsKey(tablePartitionId);
- ByteArray plannedPartAssignmentsKey = plannedPartAssignmentsKey(tablePartitionId);
- ByteArray switchReduceKey = switchReduceKey(tablePartitionId);
- ByteArray switchAppendKey = switchAppendKey(tablePartitionId);
- ByteArray stableChangeTriggerKey = stableChangeTriggerKey(tablePartitionId);
+ ByteArray pendingPartAssignmentsKey = pendingPartAssignmentsKey(zonePartitionId);
+ ByteArray stablePartAssignmentsKey = stablePartAssignmentsKey(zonePartitionId);
+ ByteArray plannedPartAssignmentsKey = plannedPartAssignmentsKey(zonePartitionId);
+ ByteArray switchReduceKey = switchReduceKey(zonePartitionId);
+ ByteArray switchAppendKey = switchAppendKey(zonePartitionId);
+ ByteArray stableChangeTriggerKey = stableChangeTriggerKey(zonePartitionId);
// TODO: https://issues.apache.org/jira/browse/IGNITE-17592 Remove synchronous wait
Map values = metaStorageMgr.getAll(
@@ -405,7 +405,7 @@ static void doStableKeySwitch(
)
).get();
- Set calculatedAssignments = calculateAssignments(tablePartitionId, catalogService, distributionZoneManager).get();
+ Set calculatedAssignments = calculateAssignments(zonePartitionId, catalogService, distributionZoneManager).get();
Entry stableEntry = values.get(stablePartAssignmentsKey);
Entry pendingEntry = values.get(pendingPartAssignmentsKey);
@@ -518,8 +518,8 @@ static void doStableKeySwitch(
// TODO: https://issues.apache.org/jira/browse/IGNITE-17592 Remove synchronous wait
int res = metaStorageMgr.invoke(
iif(or(
- notExists(stableChangeTriggerKey(tablePartitionId)),
- value(stableChangeTriggerKey(tablePartitionId)).lt(ByteUtils.longToBytes(revision))
+ notExists(stableChangeTriggerKey(zonePartitionId)),
+ value(stableChangeTriggerKey(zonePartitionId)).lt(ByteUtils.longToBytes(revision))
),
iif(retryPreconditions, successCase, failCase),
ops().yield(OUTDATED_INVOKE_STATUS)
@@ -531,20 +531,20 @@ static void doStableKeySwitch(
case SWITCH_APPEND_FAIL:
LOG.info("Rebalance keys changed while trying to update rebalance pending addition information. "
+ "Going to retry [tablePartitionID={}, appliedPeers={}]",
- tablePartitionId, stableFromRaft
+ zonePartitionId, stableFromRaft
);
break;
case SWITCH_REDUCE_FAIL:
LOG.info("Rebalance keys changed while trying to update rebalance pending reduce information. "
+ "Going to retry [tablePartitionID={}, appliedPeers={}]",
- tablePartitionId, stableFromRaft
+ zonePartitionId, stableFromRaft
);
break;
case SCHEDULE_PENDING_REBALANCE_FAIL:
case FINISH_REBALANCE_FAIL:
LOG.info("Rebalance keys changed while trying to update rebalance information. "
+ "Going to retry [tablePartitionId={}, appliedPeers={}]",
- tablePartitionId, stableFromRaft
+ zonePartitionId, stableFromRaft
);
break;
default:
@@ -554,7 +554,7 @@ static void doStableKeySwitch(
doStableKeySwitch(
stableFromRaft,
- tablePartitionId,
+ zonePartitionId,
revision,
metaStorageMgr,
catalogService,
@@ -568,29 +568,29 @@ static void doStableKeySwitch(
case SWITCH_APPEND_SUCCESS:
LOG.info("Rebalance finished. Going to schedule next rebalance with addition"
+ " [tablePartitionId={}, appliedPeers={}, plannedPeers={}]",
- tablePartitionId, stableFromRaft, calculatedPendingAddition
+ zonePartitionId, stableFromRaft, calculatedPendingAddition
);
break;
case SWITCH_REDUCE_SUCCESS:
LOG.info("Rebalance finished. Going to schedule next rebalance with reduction"
+ " [tablePartitionId={}, appliedPeers={}, plannedPeers={}]",
- tablePartitionId, stableFromRaft, calculatedPendingReduction
+ zonePartitionId, stableFromRaft, calculatedPendingReduction
);
break;
case SCHEDULE_PENDING_REBALANCE_SUCCESS:
LOG.info(
"Rebalance finished. Going to schedule next rebalance [tablePartitionId={}, appliedPeers={}, plannedPeers={}]",
- tablePartitionId, stableFromRaft, Assignments.fromBytes(plannedEntry.value()).nodes()
+ zonePartitionId, stableFromRaft, Assignments.fromBytes(plannedEntry.value()).nodes()
);
break;
case FINISH_REBALANCE_SUCCESS:
- LOG.info("Rebalance finished [tablePartitionId={}, appliedPeers={}]", tablePartitionId, stableFromRaft);
+ LOG.info("Rebalance finished [tablePartitionId={}, appliedPeers={}]", zonePartitionId, stableFromRaft);
break;
case OUTDATED_INVOKE_STATUS:
LOG.debug("Stable switch skipped because event is outdated "
+ "[tablePartitionId={}, stableChangeTriggerKey={}, revision={}]",
- tablePartitionId, stableChangeTriggerValue, revision
+ zonePartitionId, stableChangeTriggerValue, revision
);
break;
@@ -601,7 +601,7 @@ static void doStableKeySwitch(
} catch (InterruptedException | ExecutionException e) {
// TODO: IGNITE-14693
- LOG.warn("Unable to commit partition configuration to metastore: " + tablePartitionId, e);
+ LOG.warn("Unable to commit partition configuration to metastore: " + zonePartitionId, e);
}
}
diff --git a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtil.java b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtil.java
index 6fdec030e47..49bd1bbb6f9 100644
--- a/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtil.java
+++ b/modules/distribution-zones/src/main/java/org/apache/ignite/internal/distributionzones/rebalance/RebalanceUtil.java
@@ -19,6 +19,7 @@
import static java.util.stream.Collectors.toList;
import static java.util.stream.Collectors.toSet;
+import static org.apache.ignite.internal.distributionzones.DistributionZonesUtil.DISTRIBUTION_ZONE_DATA_NODES_VALUE_PREFIX;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.UpdateStatus.ASSIGNMENT_NOT_UPDATED;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.UpdateStatus.OUTDATED_UPDATE_RECEIVED;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.UpdateStatus.PENDING_KEY_UPDATED;
@@ -58,7 +59,7 @@
import org.apache.ignite.internal.metastorage.dsl.Condition;
import org.apache.ignite.internal.metastorage.dsl.Iif;
import org.apache.ignite.internal.metastorage.dsl.StatementResult;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.util.ByteUtils;
import org.apache.ignite.internal.util.CollectionUtils;
import org.jetbrains.annotations.Nullable;
@@ -73,8 +74,9 @@ public class RebalanceUtil {
/**
* Status values for methods like
- * {@link #updatePendingAssignmentsKeys(CatalogTableDescriptor, TablePartitionId, Collection, int, long, MetaStorageManager, int, Set)}
- * or {@link #manualPartitionUpdate(TablePartitionId, Collection, Set, int, long, MetaStorageManager, Set)}.
+ * {@link #updatePendingAssignmentsKeys(CatalogZoneDescriptor, ZonePartitionId, Collection, int, long, MetaStorageManager, int, Set)}
+ * or
+ * {@link #manualPartitionUpdate(ZonePartitionId, Collection, Set, int, long, MetaStorageManager, Set)}.
*/
public enum UpdateStatus {
/**
@@ -126,8 +128,8 @@ public static UpdateStatus valueOf(int ordinal) {
/**
* Update keys that related to rebalance algorithm in Meta Storage. Keys are specific for partition.
*
- * @param tableDescriptor Table descriptor.
- * @param partId Unique identifier of a partition.
+ * @param zoneDescriptor Zone descriptor.
+ * @param zonePartitionId Unique aggregate identifier of a partition of a zone.
* @param dataNodes Data nodes.
* @param replicas Number of replicas for a table.
* @param revision Revision of Meta Storage that is specific for the assignment update.
@@ -137,8 +139,8 @@ public static UpdateStatus valueOf(int ordinal) {
* @return Future representing result of updating keys in {@code metaStorageMgr}
*/
public static CompletableFuture updatePendingAssignmentsKeys(
- CatalogTableDescriptor tableDescriptor,
- TablePartitionId partId,
+ CatalogZoneDescriptor zoneDescriptor,
+ ZonePartitionId zonePartitionId,
Collection dataNodes,
int replicas,
long revision,
@@ -146,13 +148,13 @@ public static CompletableFuture updatePendingAssignmentsKeys(
int partNum,
Set tableCfgPartAssignments
) {
- ByteArray partChangeTriggerKey = pendingChangeTriggerKey(partId);
+ ByteArray partChangeTriggerKey = pendingChangeTriggerKey(zonePartitionId);
- ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(partId);
+ ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(zonePartitionId);
- ByteArray partAssignmentsPlannedKey = plannedPartAssignmentsKey(partId);
+ ByteArray partAssignmentsPlannedKey = plannedPartAssignmentsKey(zonePartitionId);
- ByteArray partAssignmentsStableKey = stablePartAssignmentsKey(partId);
+ ByteArray partAssignmentsStableKey = stablePartAssignmentsKey(zonePartitionId);
Set partAssignments = AffinityUtils.calculateAssignmentForPartition(dataNodes, partNum, replicas);
@@ -211,14 +213,14 @@ public static CompletableFuture updatePendingAssignmentsKeys(
case PENDING_KEY_UPDATED:
LOG.info(
"Update metastore pending partitions key [key={}, partition={}, table={}/{}, newVal={}]",
- partAssignmentsPendingKey.toString(), partNum, tableDescriptor.id(), tableDescriptor.name(),
+ partAssignmentsPendingKey.toString(), partNum, zoneDescriptor.id(), zoneDescriptor.name(),
partAssignments);
break;
case PLANNED_KEY_UPDATED:
LOG.info(
"Update metastore planned partitions key [key={}, partition={}, table={}/{}, newVal={}]",
- partAssignmentsPlannedKey, partNum, tableDescriptor.id(), tableDescriptor.name(),
+ partAssignmentsPlannedKey, partNum, zoneDescriptor.id(), zoneDescriptor.name(),
partAssignments
);
@@ -226,7 +228,7 @@ public static CompletableFuture updatePendingAssignmentsKeys(
case PLANNED_KEY_REMOVED_EQUALS_PENDING:
LOG.info(
"Remove planned key because current pending key has the same value [key={}, partition={}, table={}/{}, val={}]",
- partAssignmentsPlannedKey.toString(), partNum, tableDescriptor.id(), tableDescriptor.name(),
+ partAssignmentsPlannedKey.toString(), partNum, zoneDescriptor.id(), zoneDescriptor.name(),
partAssignments
);
@@ -235,7 +237,7 @@ public static CompletableFuture updatePendingAssignmentsKeys(
LOG.info(
"Remove planned key because pending is empty and calculated assignments are equal to current assignments "
+ "[key={}, partition={}, table={}/{}, val={}]",
- partAssignmentsPlannedKey.toString(), partNum, tableDescriptor.id(), tableDescriptor.name(),
+ partAssignmentsPlannedKey.toString(), partNum, zoneDescriptor.id(), zoneDescriptor.name(),
partAssignments
);
@@ -243,7 +245,7 @@ public static CompletableFuture updatePendingAssignmentsKeys(
case ASSIGNMENT_NOT_UPDATED:
LOG.debug(
"Assignments are not updated [key={}, partition={}, table={}/{}, val={}]",
- partAssignmentsPlannedKey.toString(), partNum, tableDescriptor.id(), tableDescriptor.name(),
+ partAssignmentsPlannedKey.toString(), partNum, zoneDescriptor.id(), zoneDescriptor.name(),
partAssignments
);
@@ -251,7 +253,7 @@ public static CompletableFuture updatePendingAssignmentsKeys(
case OUTDATED_UPDATE_RECEIVED:
LOG.debug(
"Received outdated rebalance trigger event [revision={}, partition={}, table={}/{}]",
- revision, partNum, tableDescriptor.id(), tableDescriptor.name());
+ revision, partNum, zoneDescriptor.id(), zoneDescriptor.name());
break;
default:
@@ -266,7 +268,6 @@ public static CompletableFuture updatePendingAssignmentsKeys(
* provided data nodes, and, if the calculated assignments are different from the ones loaded from the
* MetaStorages, writes them as pending assignments.
*
- * @param tableDescriptor Table descriptor.
* @param zoneDescriptor Zone descriptor.
* @param dataNodes Data nodes to use.
* @param storageRevision MetaStorage revision corresponding to this request.
@@ -274,16 +275,15 @@ public static CompletableFuture updatePendingAssignmentsKeys(
* @return Array of futures, one per partition of the table; the futures complete when the described
* rebalance triggering completes.
*/
- public static CompletableFuture>[] triggerAllTablePartitionsRebalance(
- CatalogTableDescriptor tableDescriptor,
+ public static CompletableFuture>[] triggerZonePartitionsRebalance(
CatalogZoneDescriptor zoneDescriptor,
Set dataNodes,
long storageRevision,
MetaStorageManager metaStorageManager
) {
- CompletableFuture
*/
class IndexBuildController implements ManuallyCloseable {
+ private static final IgniteLogger LOG = Loggers.forClass(IndexBuildController.class);
+
private final IndexBuilder indexBuilder;
private final IndexManager indexManager;
@@ -94,7 +96,7 @@ class IndexBuildController implements ManuallyCloseable {
private final AtomicBoolean closeGuard = new AtomicBoolean();
- private final Set primaryReplicaIds = ConcurrentHashMap.newKeySet();
+ private final Set primaryReplicaIds = ConcurrentHashMap.newKeySet();
/** Constructor. */
IndexBuildController(
@@ -146,16 +148,29 @@ private CompletableFuture> onIndexBuilding(StartBuildingIndexEventParameters p
var startBuildIndexFutures = new ArrayList>();
- for (TablePartitionId primaryReplicaId : primaryReplicaIds) {
- if (primaryReplicaId.tableId() == indexDescriptor.tableId()) {
- CompletableFuture> startBuildIndexFuture = getMvTableStorageFuture(parameters.causalityToken(), primaryReplicaId)
- .thenCompose(mvTableStorage -> awaitPrimaryReplica(primaryReplicaId, clockService.now())
- .thenAccept(replicaMeta -> tryScheduleBuildIndex(
- primaryReplicaId,
- indexDescriptor,
- mvTableStorage,
- replicaMeta
- ))
+ for (ZonePartitionId zonePartitionId : primaryReplicaIds) {
+
+ int tableId = zonePartitionId.tableId();
+
+ TablePartitionId tablePartId = new TablePartitionId(tableId, zonePartitionId.partitionId());
+
+ if (tableId == indexDescriptor.tableId()) {
+ CompletableFuture> startBuildIndexFuture = getMvTableStorageFuture(parameters.causalityToken(), tablePartId)
+ .thenCompose(mvTableStorage -> {
+ if (mvTableStorage == null) {
+ LOG.info("The table has been removed, so the index build is skipped [tblId={}].", tableId);
+
+ return nullCompletedFuture();
+ }
+
+ return awaitPrimaryReplica(zonePartitionId, clockService.now())
+ .thenAccept(replicaMeta -> tryScheduleBuildIndex(
+ tablePartId,
+ indexDescriptor,
+ mvTableStorage,
+ replicaMeta
+ ));
+ }
);
startBuildIndexFutures.add(startBuildIndexFuture);
@@ -176,26 +191,37 @@ private CompletableFuture> onIndexRemoved(RemoveIndexEventParameters parameter
private CompletableFuture> onPrimaryReplicaElected(PrimaryReplicaEventParameters parameters) {
return inBusyLockAsync(busyLock, () -> {
- TablePartitionId primaryReplicaId = (TablePartitionId) parameters.groupId();
+ ZonePartitionId zonePartitionId = (ZonePartitionId) parameters.groupId();
- if (isLocalNode(clusterService, parameters.leaseholderId())) {
- primaryReplicaIds.add(primaryReplicaId);
+ int tableId = zonePartitionId.tableId();
+ TablePartitionId tablePartitionId = new TablePartitionId(tableId, zonePartitionId.partitionId());
+
+ if (isLocalNode(clusterService, parameters.leaseholderId())) {
// It is safe to get the latest version of the catalog because the PRIMARY_REPLICA_ELECTED event is handled on the
// metastore thread.
int catalogVersion = catalogService.latestCatalogVersion();
- return getMvTableStorageFuture(parameters.causalityToken(), primaryReplicaId)
- .thenCompose(mvTableStorage -> awaitPrimaryReplica(primaryReplicaId, parameters.startTime())
- .thenAccept(replicaMeta -> tryScheduleBuildIndexesForNewPrimaryReplica(
- catalogVersion,
- primaryReplicaId,
- mvTableStorage,
- replicaMeta
- ))
- );
+ primaryReplicaIds.add(zonePartitionId);
+
+ return getMvTableStorageFuture(parameters.causalityToken(), tablePartitionId).thenCompose(mvTableStorage -> {
+ if (mvTableStorage == null) {
+ LOG.info("The table has been removed, so the index build is skipped [tblId={}].", tableId);
+
+ return nullCompletedFuture();
+ }
+
+ return inBusyLock(busyLock, () -> awaitPrimaryReplica(zonePartitionId, parameters.startTime()))
+ .thenAccept(replicaMeta -> inBusyLock(busyLock, () -> tryScheduleBuildIndexesForNewPrimaryReplica(
+ catalogVersion,
+ tablePartitionId,
+ mvTableStorage,
+ replicaMeta
+ ))
+ );
+ });
} else {
- stopBuildingIndexesIfPrimaryExpired(primaryReplicaId);
+ stopBuildingIndexesIfPrimaryExpired(tablePartitionId);
return nullCompletedFuture();
}
@@ -256,32 +282,19 @@ private void tryScheduleBuildIndex(
* @param replicaId Replica ID.
*/
private void stopBuildingIndexesIfPrimaryExpired(TablePartitionId replicaId) {
- if (primaryReplicaIds.remove(replicaId)) {
+ if (primaryReplicaIds.removeIf(z -> z.tableId() == replicaId.tableId() && z.partitionId() == replicaId.partitionId())) {
// Primary replica is no longer current, we need to stop building indexes for it.
indexBuilder.stopBuildingIndexes(replicaId.tableId(), replicaId.partitionId());
}
}
private CompletableFuture getMvTableStorageFuture(long causalityToken, TablePartitionId replicaId) {
- return indexManager.getMvTableStorage(causalityToken, replicaId.tableId())
- .thenApply(mvTableStorage -> requireMvTableStorageNonNull(mvTableStorage, replicaId.tableId()));
- }
-
- private static MvTableStorage requireMvTableStorageNonNull(@Nullable MvTableStorage mvTableStorage, int tableId) {
- if (mvTableStorage == null) {
- throw new IgniteInternalException(
- INTERNAL_ERR,
- "Table storage for the specified table cannot be null [tableId = {}]",
- tableId
- );
- }
-
- return mvTableStorage;
+ return indexManager.getMvTableStorage(causalityToken, replicaId.tableId());
}
- private CompletableFuture awaitPrimaryReplica(TablePartitionId replicaId, HybridTimestamp timestamp) {
+ private CompletableFuture awaitPrimaryReplica(ZonePartitionId replicaId, HybridTimestamp timestamp) {
return placementDriver
- .awaitPrimaryReplica(replicaId, timestamp, AWAIT_PRIMARY_REPLICA_TIMEOUT_SEC, SECONDS)
+ .awaitPrimaryReplicaForTable(replicaId, timestamp, AWAIT_PRIMARY_REPLICA_TIMEOUT_SEC, SECONDS)
.handle((replicaMeta, throwable) -> {
if (throwable != null) {
Throwable unwrapThrowable = ExceptionUtils.unwrapCause(throwable);
diff --git a/modules/index/src/main/java/org/apache/ignite/internal/index/IndexManager.java b/modules/index/src/main/java/org/apache/ignite/internal/index/IndexManager.java
index 5a4bfc01489..7be29600cab 100644
--- a/modules/index/src/main/java/org/apache/ignite/internal/index/IndexManager.java
+++ b/modules/index/src/main/java/org/apache/ignite/internal/index/IndexManager.java
@@ -60,7 +60,6 @@
import org.apache.ignite.internal.table.distributed.PartitionSet;
import org.apache.ignite.internal.table.distributed.TableManager;
import org.apache.ignite.internal.util.IgniteSpinBusyLock;
-import org.jetbrains.annotations.Nullable;
/**
* An Ignite component that is responsible for handling index-related commands like CREATE or DROP
@@ -172,8 +171,14 @@ public CompletableFuture stopAsync() {
* @return Future with multi-version table storage, completes with {@code null} if the table does not exist according to the passed
* parameters.
*/
- CompletableFuture<@Nullable MvTableStorage> getMvTableStorage(long causalityToken, int tableId) {
- return tableManager.tableAsync(causalityToken, tableId).thenApply(table -> table == null ? null : table.internalTable().storage());
+ CompletableFuture getMvTableStorage(long causalityToken, int tableId) {
+ return tableManager.tableAsync(causalityToken, tableId).thenApply(table -> {
+ if (table == null) {
+ return null;
+ }
+
+ return table.internalTable().storage();
+ });
}
private CompletableFuture onIndexCreate(CreateIndexEventParameters parameters) {
diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskControllerTest.java b/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskControllerTest.java
index b119dcda9fd..6d3b7f0de33 100644
--- a/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskControllerTest.java
+++ b/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskControllerTest.java
@@ -49,7 +49,7 @@
import org.apache.ignite.internal.network.ClusterNodeImpl;
import org.apache.ignite.internal.network.ClusterService;
import org.apache.ignite.internal.placementdriver.ReplicaMeta;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest;
import org.apache.ignite.network.ClusterNode;
import org.apache.ignite.network.NetworkAddress;
@@ -190,17 +190,21 @@ private void setPrimaryReplicaAnotherNode() {
}
private void setPrimaryReplica(ClusterNode clusterNode) {
- TablePartitionId groupId = new TablePartitionId(tableId(), 0);
+ ZonePartitionId zonePartId = new ZonePartitionId(zoneId(), tableId(), 0);
- ReplicaMeta replicaMeta = newPrimaryReplicaMeta(clusterNode, groupId, HybridTimestamp.MIN_VALUE, HybridTimestamp.MAX_VALUE);
+ ReplicaMeta replicaMeta = newPrimaryReplicaMeta(clusterNode, zonePartId, HybridTimestamp.MIN_VALUE, HybridTimestamp.MAX_VALUE);
- assertThat(placementDriver.setPrimaryReplicaMeta(0, groupId, completedFuture(replicaMeta)), willCompleteSuccessfully());
+ assertThat(placementDriver.setPrimaryReplicaMeta(0, zonePartId, completedFuture(replicaMeta)), willCompleteSuccessfully());
}
private int tableId() {
return TestIndexManagementUtils.tableId(catalogManager, TABLE_NAME, clock);
}
+ private int zoneId() {
+ return catalogManager.catalog(catalogManager.latestCatalogVersion()).table(tableId()).zoneId();
+ }
+
private CatalogIndexDescriptor indexDescriptor() {
return TestIndexManagementUtils.indexDescriptor(catalogManager, INDEX_NAME, clock);
}
diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskTest.java b/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskTest.java
index 2031b554384..1b19fb41746 100644
--- a/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskTest.java
+++ b/modules/index/src/test/java/org/apache/ignite/internal/index/ChangeIndexStatusTaskTest.java
@@ -84,6 +84,7 @@
import org.apache.ignite.internal.placementdriver.PrimaryReplicaAwaitTimeoutException;
import org.apache.ignite.internal.placementdriver.ReplicaMeta;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.testframework.IgniteAbstractTest;
import org.apache.ignite.internal.util.IgniteSpinBusyLock;
import org.apache.ignite.network.ClusterNode;
@@ -145,7 +146,7 @@ void setUp() {
createLocalNodeReplicaMeta(HybridTimestamp.MIN_VALUE, HybridTimestamp.MAX_VALUE)
);
- when(placementDriver.awaitPrimaryReplica(any(), any(), anyLong(), any())).thenReturn(localNodeReplicaMetaFuture);
+ when(placementDriver.awaitPrimaryReplicaForTable(any(), any(), anyLong(), any())).thenReturn(localNodeReplicaMetaFuture);
CompletableFuture logicalTopologySnapshotFuture = completedFuture(
new LogicalTopologySnapshot(1, List.of(LOGICAL_LOCAL_NODE))
@@ -189,7 +190,7 @@ void testSimpleTaskExecution() {
verify(executor, atLeast(3)).execute(any());
verify(clockWaiter, atLeast(2)).waitFor(any());
- verify(placementDriver).awaitPrimaryReplica(any(), any(), anyLong(), any());
+ verify(placementDriver).awaitPrimaryReplicaForTable(any(), any(), anyLong(), any());
verify(logicalTopologyService).logicalTopologyOnLeader();
verify(logicalTopologyService).addEventListener(any());
verify(logicalTopologyService).removeEventListener(any());
@@ -204,7 +205,7 @@ void testTimeoutAndSuccessOnAwaitPrimaryReplica() {
createLocalNodeReplicaMeta(HybridTimestamp.MIN_VALUE, HybridTimestamp.MAX_VALUE)
);
- when(placementDriver.awaitPrimaryReplica(any(), any(), anyLong(), any())).thenReturn(
+ when(placementDriver.awaitPrimaryReplicaForTable(any(), any(), anyLong(), any())).thenReturn(
awaitPrimaryReplicaFuture0,
awaitPrimaryReplicaFuture1
);
@@ -212,7 +213,7 @@ void testTimeoutAndSuccessOnAwaitPrimaryReplica() {
assertThat(task.start(), willCompleteSuccessfully());
assertEquals(BUILDING, actualIndexStatus());
- verify(placementDriver, times(2)).awaitPrimaryReplica(any(), any(), anyLong(), any());
+ verify(placementDriver, times(2)).awaitPrimaryReplicaForTable(any(), any(), anyLong(), any());
}
@Test
@@ -223,7 +224,7 @@ void testTimeoutAndExpireOnAwaitPrimaryReplica() {
createLocalNodeReplicaMeta(HybridTimestamp.MIN_VALUE, HybridTimestamp.MIN_VALUE.addPhysicalTime(1))
);
- when(placementDriver.awaitPrimaryReplica(any(), any(), anyLong(), any())).thenReturn(
+ when(placementDriver.awaitPrimaryReplicaForTable(any(), any(), anyLong(), any())).thenReturn(
awaitPrimaryReplicaFuture0,
awaitPrimaryReplicaFuture1
);
@@ -231,7 +232,7 @@ void testTimeoutAndExpireOnAwaitPrimaryReplica() {
assertThat(task.start(), willThrow(IndexTaskStoppingException.class));
assertEquals(REGISTERED, actualIndexStatus());
- verify(placementDriver, times(2)).awaitPrimaryReplica(any(), any(), anyLong(), any());
+ verify(placementDriver, times(2)).awaitPrimaryReplicaForTable(any(), any(), anyLong(), any());
}
@Test
@@ -240,7 +241,7 @@ void testTimeoutAndErrorOnAwaitPrimaryReplica() {
CompletableFuture awaitPrimaryReplicaFuture1 = failedFuture(primaryReplicaAwaitException());
- when(placementDriver.awaitPrimaryReplica(any(), any(), anyLong(), any())).thenReturn(
+ when(placementDriver.awaitPrimaryReplicaForTable(any(), any(), anyLong(), any())).thenReturn(
awaitPrimaryReplicaFuture0,
awaitPrimaryReplicaFuture1
);
@@ -248,7 +249,7 @@ void testTimeoutAndErrorOnAwaitPrimaryReplica() {
assertThat(task.start(), willThrow(PrimaryReplicaAwaitException.class));
assertEquals(REGISTERED, actualIndexStatus());
- verify(placementDriver, times(2)).awaitPrimaryReplica(any(), any(), anyLong(), any());
+ verify(placementDriver, times(2)).awaitPrimaryReplicaForTable(any(), any(), anyLong(), any());
}
@Test
@@ -326,7 +327,7 @@ private CatalogIndexStatus actualIndexStatus() {
}
private ReplicaMeta createLocalNodeReplicaMeta(HybridTimestamp startTime, HybridTimestamp expirationTime) {
- return newPrimaryReplicaMeta(LOCAL_NODE, new TablePartitionId(indexDescriptor.tableId(), 0), startTime, expirationTime);
+ return newPrimaryReplicaMeta(LOCAL_NODE, new ZonePartitionId(0, indexDescriptor.tableId(), 0), startTime, expirationTime);
}
private static ClusterService createClusterService() {
diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/IndexBuildControllerTest.java b/modules/index/src/test/java/org/apache/ignite/internal/index/IndexBuildControllerTest.java
index 6a23070f4b6..062845bcb7f 100644
--- a/modules/index/src/test/java/org/apache/ignite/internal/index/IndexBuildControllerTest.java
+++ b/modules/index/src/test/java/org/apache/ignite/internal/index/IndexBuildControllerTest.java
@@ -64,6 +64,7 @@
import org.apache.ignite.internal.placementdriver.ReplicaMeta;
import org.apache.ignite.internal.placementdriver.leases.Lease;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.storage.MvPartitionStorage;
import org.apache.ignite.internal.storage.engine.MvTableStorage;
import org.apache.ignite.internal.storage.index.IndexStorage;
@@ -377,12 +378,16 @@ private int tableId(String tableName) {
return getTableIdStrict(catalogManager, tableName, clock.nowLong());
}
+ private int zoneId() {
+ return catalogManager.catalog(catalogManager.latestCatalogVersion()).table(tableId()).zoneId();
+ }
+
private int indexId(String indexName) {
return getIndexIdStrict(catalogManager, indexName, clock.nowLong());
}
- private TablePartitionId replicaId(int partitionId) {
- return new TablePartitionId(tableId(), partitionId);
+ private ZonePartitionId replicaId(int partitionId) {
+ return new ZonePartitionId(zoneId(), tableId(), partitionId);
}
private ReplicaMeta replicaMetaForOneSecond(String leaseholder, String leaseholderId, HybridTimestamp startTime) {
diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/IndexManagementUtilsTest.java b/modules/index/src/test/java/org/apache/ignite/internal/index/IndexManagementUtilsTest.java
index ad290ab97b5..0fc2792e615 100644
--- a/modules/index/src/test/java/org/apache/ignite/internal/index/IndexManagementUtilsTest.java
+++ b/modules/index/src/test/java/org/apache/ignite/internal/index/IndexManagementUtilsTest.java
@@ -51,7 +51,7 @@
import org.apache.ignite.internal.network.ClusterNodeImpl;
import org.apache.ignite.internal.network.ClusterService;
import org.apache.ignite.internal.placementdriver.ReplicaMeta;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest;
import org.apache.ignite.internal.util.IgniteSpinBusyLock;
import org.apache.ignite.network.ClusterNode;
@@ -102,7 +102,7 @@ void testExtractIndexIdFromPartitionBuildIndexKey() {
@Test
void testIsPrimaryReplicaTrue() {
- TablePartitionId replicaGroupId = new TablePartitionId(1, 0);
+ ZonePartitionId replicaGroupId = new ZonePartitionId(0, 1, 0);
HybridTimestamp startTime = clock.now();
long dayInMillis = TimeUnit.DAYS.toMillis(1);
@@ -114,7 +114,7 @@ void testIsPrimaryReplicaTrue() {
@Test
void testIsPrimaryReplicaFalse() {
- TablePartitionId replicaGroupId = new TablePartitionId(1, 0);
+ ZonePartitionId replicaGroupId = new ZonePartitionId(0, 1, 0);
ClusterNode otherNode = new ClusterNodeImpl(NODE_ID + "-other", NODE_NAME + "-other", mock(NetworkAddress.class));
diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/TestIndexManagementUtils.java b/modules/index/src/test/java/org/apache/ignite/internal/index/TestIndexManagementUtils.java
index d8992a9c196..df17605a8c5 100644
--- a/modules/index/src/test/java/org/apache/ignite/internal/index/TestIndexManagementUtils.java
+++ b/modules/index/src/test/java/org/apache/ignite/internal/index/TestIndexManagementUtils.java
@@ -48,7 +48,7 @@
import org.apache.ignite.internal.network.ClusterNodeImpl;
import org.apache.ignite.internal.placementdriver.ReplicaMeta;
import org.apache.ignite.internal.placementdriver.leases.Lease;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.table.TableTestUtils;
import org.apache.ignite.network.ClusterNode;
import org.apache.ignite.network.NetworkAddress;
@@ -129,7 +129,7 @@ static void assertMetastoreKeyPresent(MetaStorageManager metaStorageManager, Byt
static ReplicaMeta newPrimaryReplicaMeta(
ClusterNode clusterNode,
- TablePartitionId replicaGroupId,
+ ZonePartitionId replicaGroupId,
HybridTimestamp startTime,
HybridTimestamp expirationTime
) {
diff --git a/modules/index/src/test/java/org/apache/ignite/internal/index/TestPlacementDriver.java b/modules/index/src/test/java/org/apache/ignite/internal/index/TestPlacementDriver.java
index fe207e75888..73ea06f9d2b 100644
--- a/modules/index/src/test/java/org/apache/ignite/internal/index/TestPlacementDriver.java
+++ b/modules/index/src/test/java/org/apache/ignite/internal/index/TestPlacementDriver.java
@@ -18,6 +18,7 @@
package org.apache.ignite.internal.index;
import java.util.Map;
+import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
@@ -28,7 +29,8 @@
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent;
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
+import org.apache.ignite.internal.util.CompletableFutures;
/** Implementation for tests. */
class TestPlacementDriver extends AbstractEventProducer implements PlacementDriver {
@@ -40,6 +42,23 @@ public CompletableFuture awaitPrimaryReplica(
HybridTimestamp timestamp,
long timeout,
TimeUnit unit
+ ) {
+ assert groupId instanceof ZonePartitionId : "Unexpected replication group type [type=" + groupId.getClass().getSimpleName() + ']';
+
+ return awaitPrimaryReplicaForTable(
+ groupId,
+ timestamp,
+ timeout,
+ unit
+ );
+ }
+
+ @Override
+ public CompletableFuture awaitPrimaryReplicaForTable(
+ ReplicationGroupId groupId,
+ HybridTimestamp timestamp,
+ long timeout,
+ TimeUnit unit
) {
return primaryReplicaMetaFutureById.get(groupId);
}
@@ -56,7 +75,7 @@ public CompletableFuture previousPrimaryExpired(ReplicationGroupId grpId)
CompletableFuture setPrimaryReplicaMeta(
long causalityToken,
- TablePartitionId replicaId,
+ ZonePartitionId replicaId,
CompletableFuture replicaMetaFuture
) {
primaryReplicaMetaFutureById.put(replicaId, replicaMetaFuture);
@@ -72,4 +91,18 @@ CompletableFuture setPrimaryReplicaMeta(
)
));
}
+
+ @Override
+ public CompletableFuture addSubgroups(
+ ZonePartitionId zoneId,
+ Long enlistmentConsistencyToken,
+ Set subGrps
+ ) {
+ return CompletableFutures.nullCompletedFuture();
+ }
+
+ @Override
+ public ReplicaMeta getLeaseMeta(ReplicationGroupId grpId) {
+ return null;
+ }
}
diff --git a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriver.java b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriver.java
index f06e28a8b3d..d4b45a00049 100644
--- a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriver.java
+++ b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriver.java
@@ -17,6 +17,7 @@
package org.apache.ignite.internal.placementdriver;
+import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import org.apache.ignite.internal.event.EventProducer;
@@ -24,6 +25,7 @@
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent;
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
/**
* Service that provides an ability to await and retrieve primary replicas for replication groups.
@@ -59,6 +61,28 @@ CompletableFuture awaitPrimaryReplica(
TimeUnit unit
);
+ /**
+ * Temporary solution for awaiting a {@link ReplicaMeta}: waits for the
+ * {@link ReplicaMeta} of the {@link org.apache.ignite.internal.replicator.TablePartitionId}
+ * derived from the {@link ZonePartitionId#tableId()}.
+ *
+ * @param groupId Replication group id.
+ * @param timestamp CLOCK_SKEW aware timestamp reference value.
+ * @param timeout How long to wait before completing exceptionally with a TimeoutException, in units of unit.
+ * @param unit A TimeUnit determining how to interpret the timeout parameter.
+ * @return Primary replica future.
+ * @throws PrimaryReplicaAwaitTimeoutException If primary replica await timed out.
+ * @throws PrimaryReplicaAwaitException If primary replica await failed with any other reason except timeout.
+ */
+ // TODO: https://issues.apache.org/jira/browse/IGNITE-20362
+ @Deprecated
+ CompletableFuture awaitPrimaryReplicaForTable(
+ ReplicationGroupId groupId,
+ HybridTimestamp timestamp,
+ long timeout,
+ TimeUnit unit
+ );
+
/**
* Same as {@link #awaitPrimaryReplica(ReplicationGroupId, HybridTimestamp, long, TimeUnit)} despite the fact that given method await
* logic is bounded. It will wait for a primary replica for a reasonable period of time, and complete a future with null if a matching
@@ -78,4 +102,24 @@ CompletableFuture awaitPrimaryReplica(
* @return Future.
*/
CompletableFuture previousPrimaryExpired(ReplicationGroupId grpId);
+
+ /**
+ * Gets a cached lease for a zone replication group.
+ *
+ * @param grpId Replication group id.
+ * @return Lease meta, or {@code null} if no lease is cached for the group.
+ */
+ ReplicaMeta getLeaseMeta(ReplicationGroupId grpId);
+
+ /**
+ * Tries to update the lease to include the new subgroups. The given set of groups is added to the lease's set of subgroups
+ * ({@link ReplicaMeta#subgroups()}) for the specific lease determined by the zone id.
+ * TODO: IGNITE-20362 Remove this method once replicas are started per zone.
+ *
+ * @param zoneId Zone id.
+ * @param enlistmentConsistencyToken Lease token.
+ * @param subGrps Table ids.
+ * @return Future that completes when the lease update attempt has finished.
+ */
+ CompletableFuture addSubgroups(ZonePartitionId zoneId, Long enlistmentConsistencyToken, Set subGrps);
}
diff --git a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/ReplicaMeta.java b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/ReplicaMeta.java
index 01bbf819cda..40b8ec78ea8 100644
--- a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/ReplicaMeta.java
+++ b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/ReplicaMeta.java
@@ -18,7 +18,9 @@
package org.apache.ignite.internal.placementdriver;
import java.io.Serializable;
+import java.util.Set;
import org.apache.ignite.internal.hlc.HybridTimestamp;
+import org.apache.ignite.internal.replicator.ReplicationGroupId;
import org.jetbrains.annotations.Nullable;
/** Replica lease meta. */
@@ -34,4 +36,11 @@ public interface ReplicaMeta extends Serializable {
/** Gets a lease expiration timestamp. */
HybridTimestamp getExpirationTime();
+
+ /**
+ * Gets the partition replication groups.
+ *
+ * @return Set of replication subgroups.
+ */
+ Set subgroups();
}
diff --git a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/message/LeaseGrantedMessageResponse.java b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/message/LeaseGrantedMessageResponse.java
index ad30b64481f..f2b408f2490 100644
--- a/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/message/LeaseGrantedMessageResponse.java
+++ b/modules/placement-driver-api/src/main/java/org/apache/ignite/internal/placementdriver/message/LeaseGrantedMessageResponse.java
@@ -17,7 +17,10 @@
package org.apache.ignite.internal.placementdriver.message;
+import java.util.Set;
+import org.apache.ignite.internal.network.annotations.Marshallable;
import org.apache.ignite.internal.network.annotations.Transferable;
+import org.apache.ignite.internal.replicator.ReplicationGroupId;
import org.jetbrains.annotations.Nullable;
/**
@@ -29,4 +32,9 @@ public interface LeaseGrantedMessageResponse extends PlacementDriverMessage {
@Nullable
String redirectProposal();
+
+ /** Set of applied groups. */
+ @Nullable
+ @Marshallable
+ Set appliedGroups();
}
diff --git a/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestPlacementDriver.java b/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestPlacementDriver.java
index 440cbbb3097..3bdda01db52 100644
--- a/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestPlacementDriver.java
+++ b/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestPlacementDriver.java
@@ -21,6 +21,7 @@
import static java.util.concurrent.CompletableFuture.failedFuture;
import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture;
+import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
@@ -29,6 +30,7 @@
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent;
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.network.ClusterNode;
import org.jetbrains.annotations.TestOnly;
@@ -68,6 +70,16 @@ public CompletableFuture awaitPrimaryReplica(
return getReplicaMetaFuture();
}
+ @Override
+ public CompletableFuture awaitPrimaryReplicaForTable(
+ ReplicationGroupId groupId,
+ HybridTimestamp timestamp,
+ long timeout,
+ TimeUnit unit
+ ) {
+ return getReplicaMetaFuture();
+ }
+
@Override
public CompletableFuture getPrimaryReplica(ReplicationGroupId replicationGroupId, HybridTimestamp timestamp) {
return getReplicaMetaFuture();
@@ -98,4 +110,18 @@ public Supplier extends ReplicaMeta> getPrimaryReplicaSupplier() {
public void setPrimaryReplicaSupplier(Supplier extends ReplicaMeta> primaryReplicaSupplier) {
this.primaryReplicaSupplier = primaryReplicaSupplier;
}
+
+ @Override
+ public CompletableFuture addSubgroups(
+ ZonePartitionId zoneId,
+ Long enlistmentConsistencyToken,
+ Set subGrps
+ ) {
+ return nullCompletedFuture();
+ }
+
+ @Override
+ public ReplicaMeta getLeaseMeta(ReplicationGroupId grpId) {
+ return null;
+ }
}
diff --git a/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestReplicaMetaImpl.java b/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestReplicaMetaImpl.java
index 886a04e1bb9..04831a72ef9 100644
--- a/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestReplicaMetaImpl.java
+++ b/modules/placement-driver-api/src/testFixtures/java/org/apache/ignite/internal/placementdriver/TestReplicaMetaImpl.java
@@ -21,7 +21,9 @@
import static org.apache.ignite.internal.hlc.HybridTimestamp.MIN_VALUE;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import java.util.Set;
import org.apache.ignite.internal.hlc.HybridTimestamp;
+import org.apache.ignite.internal.replicator.ReplicationGroupId;
import org.apache.ignite.network.ClusterNode;
import org.jetbrains.annotations.Nullable;
import org.jetbrains.annotations.TestOnly;
@@ -129,4 +131,9 @@ public HybridTimestamp getStartTime() {
public HybridTimestamp getExpirationTime() {
return expirationTime;
}
+
+ @Override
+ public Set subgroups() {
+ return Set.of();
+ }
}
diff --git a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ActiveActorTest.java b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ActiveActorTest.java
index 5b3fae8a9ff..115f1cef1f1 100644
--- a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ActiveActorTest.java
+++ b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ActiveActorTest.java
@@ -47,6 +47,7 @@
import org.apache.ignite.internal.raft.PeersAndLearners;
import org.apache.ignite.internal.raft.client.AbstractTopologyAwareGroupServiceTest;
import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupServiceFactory;
+import org.apache.ignite.internal.testframework.WithSystemProperty;
import org.apache.ignite.internal.util.Cursor;
import org.apache.ignite.raft.jraft.rpc.impl.RaftGroupEventsClientListener;
import org.junit.jupiter.api.AfterEach;
@@ -62,6 +63,7 @@
*/
@ExtendWith(MockitoExtension.class)
@MockitoSettings(strictness = Strictness.LENIENT)
+@WithSystemProperty(key = "IGNITE_ALWAYS_FORCE", value = "false")
public class ActiveActorTest extends AbstractTopologyAwareGroupServiceTest {
private final Map placementDriverManagers = new HashMap<>();
@@ -132,7 +134,8 @@ protected void afterNodeStart(
logicalTopologyService,
mockRaftMgr,
raftGroupServiceFactory,
- new TestClockService(new HybridClockImpl())
+ new TestClockService(new HybridClockImpl()),
+ grp -> ZONE_GROUP_ID
);
assertThat(placementDriverManager.startAsync(), willCompleteSuccessfully());
diff --git a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/BasePlacementDriverTest.java b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/BasePlacementDriverTest.java
index 30003ad3842..25910eb9e07 100644
--- a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/BasePlacementDriverTest.java
+++ b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/BasePlacementDriverTest.java
@@ -33,7 +33,7 @@
import org.apache.ignite.internal.placementdriver.leases.Lease;
import org.apache.ignite.internal.placementdriver.leases.LeaseBatch;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.testframework.IgniteAbstractTest;
import org.jetbrains.annotations.Nullable;
@@ -44,22 +44,22 @@ abstract class BasePlacementDriverTest extends IgniteAbstractTest {
*
* @return Replication group id.
*/
- protected TablePartitionId createTableAssignment(MetaStorageManager metastore, int tableId, List dataNodes) {
+ protected ZonePartitionId createZoneAssignment(MetaStorageManager metastore, int zoneId, List dataNodes) {
List> assignments = AffinityUtils.calculateAssignments(dataNodes, 1, dataNodes.size());
Map partitionAssignments = new HashMap<>(assignments.size());
for (int i = 0; i < assignments.size(); i++) {
partitionAssignments.put(
- stablePartAssignmentsKey(new TablePartitionId(tableId, i)),
+ stablePartAssignmentsKey(new ZonePartitionId(zoneId, i)),
Assignments.toBytes(assignments.get(i)));
}
metastore.putAll(partitionAssignments).join();
- var grpPart0 = new TablePartitionId(tableId, 0);
+ var grpPart0 = new ZonePartitionId(zoneId, 0);
- log.info("Fake table created [id={}, repGrp={}]", tableId, grpPart0);
+ log.info("Fake zone created [id={}, repGrp={}]", zoneId, grpPart0);
return grpPart0;
}
diff --git a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ItPrimaryReplicaChoiceTest.java b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ItPrimaryReplicaChoiceTest.java
index c6523efc825..c8ca65ceceb 100644
--- a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ItPrimaryReplicaChoiceTest.java
+++ b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/ItPrimaryReplicaChoiceTest.java
@@ -59,6 +59,7 @@
import org.apache.ignite.internal.table.NodeUtils;
import org.apache.ignite.internal.table.TableViewInternal;
import org.apache.ignite.internal.testframework.IgniteTestUtils;
+import org.apache.ignite.internal.testframework.WithSystemProperty;
import org.apache.ignite.internal.testframework.flow.TestFlowUtils;
import org.apache.ignite.internal.tx.InternalTransaction;
import org.apache.ignite.internal.tx.impl.ReadWriteTransactionImpl;
@@ -78,6 +79,7 @@
* The test class checks invariant of a primary replica choice.
*/
@SuppressWarnings("resource")
+@WithSystemProperty(key = "IGNITE_ALWAYS_FORCE", value = "false")
public class ItPrimaryReplicaChoiceTest extends ClusterPerTestIntegrationTest {
private static final int AWAIT_PRIMARY_REPLICA_TIMEOUT = 10;
diff --git a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/MultiActorPlacementDriverTest.java b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/MultiActorPlacementDriverTest.java
index fe784efb883..b6358c9edfa 100644
--- a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/MultiActorPlacementDriverTest.java
+++ b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/MultiActorPlacementDriverTest.java
@@ -35,6 +35,7 @@
import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
@@ -71,6 +72,7 @@
import org.apache.ignite.internal.raft.configuration.RaftConfiguration;
import org.apache.ignite.internal.raft.service.RaftGroupService;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.util.IgniteUtils;
import org.apache.ignite.network.NetworkAddress;
import org.apache.ignite.raft.jraft.rpc.impl.RaftGroupEventsClientListener;
@@ -85,7 +87,11 @@
*/
@ExtendWith(ConfigurationExtension.class)
public class MultiActorPlacementDriverTest extends BasePlacementDriverTest {
- public static final int BASE_PORT = 1234;
+ private static final int BASE_PORT = 1234;
+
+ private static final TablePartitionId GROUP_ID = new TablePartitionId(1, 0);
+
+ private static final ZonePartitionId ZONE_GROUP_ID = new ZonePartitionId(1, 0);
private static final PlacementDriverMessagesFactory PLACEMENT_DRIVER_MESSAGES_FACTORY = new PlacementDriverMessagesFactory();
@@ -110,7 +116,7 @@ public class MultiActorPlacementDriverTest extends BasePlacementDriverTest {
/** This closure handles {@link LeaseGrantedMessage} to check the placement driver manager behavior. */
private IgniteTriFunction leaseGrantHandler;
- private final AtomicInteger nextTableId = new AtomicInteger(1);
+ private final AtomicInteger nextZoneId = new AtomicInteger(1);
@BeforeEach
public void beforeTest(TestInfo testInfo) {
@@ -172,6 +178,7 @@ private NetworkMessageHandler leaseGrantMessageHandler(ClusterService handlerSer
if (resp == null) {
resp = PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(GROUP_ID))
.accepted(true)
.build();
}
@@ -277,7 +284,8 @@ private List extends AutoCloseable> startPlacementDriver(
logicalTopologyService,
raftManager,
topologyAwareRaftGroupServiceFactory,
- new TestClockService(nodeClock)
+ new TestClockService(nodeClock),
+ grp -> ZONE_GROUP_ID
);
res.add(new Node(nodeName, clusterService, raftManager, metaStorageManager, placementDriverManager));
@@ -290,7 +298,7 @@ private List extends AutoCloseable> startPlacementDriver(
@Test
public void testLeaseCreate() throws Exception {
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createTableAssignment();
checkLeaseCreated(grpPart0, true);
}
@@ -303,11 +311,12 @@ public void testLeaseProlong() throws Exception {
acceptedNodeRef.compareAndSet(null, to);
return PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(GROUP_ID))
.accepted(true)
.build();
};
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createTableAssignment();
Lease lease = checkLeaseCreated(grpPart0, true);
Lease leaseRenew = waitForProlong(grpPart0, lease);
@@ -323,11 +332,12 @@ public void prolongAfterActiveActorChanged() throws Exception {
acceptedNodeRef.compareAndSet(null, to);
return PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(GROUP_ID))
.accepted(true)
.build();
};
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createTableAssignment();
Lease lease = checkLeaseCreated(grpPart0, true);
@@ -375,12 +385,13 @@ public void testLeaseProlongAfterRedirect() throws Exception {
log.info("Lease is accepted [leaseholder={}]", to);
return PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(GROUP_ID))
.accepted(true)
.build();
}
};
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createTableAssignment();
Lease lease = checkLeaseCreated(grpPart0, true);
@@ -399,11 +410,12 @@ public void testDeclineLeaseByLeaseholder() throws Exception {
activeActorRef.set(from);
return PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(GROUP_ID))
.accepted(true)
.build();
};
- TablePartitionId grpPart = createTableAssignment();
+ ZonePartitionId grpPart = createTableAssignment();
Lease lease = checkLeaseCreated(grpPart, true);
@@ -424,6 +436,7 @@ public void testDeclineLeaseByLeaseholder() throws Exception {
.build();
} else {
return PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(GROUP_ID))
.accepted(true)
.build();
}
@@ -435,7 +448,7 @@ public void testDeclineLeaseByLeaseholder() throws Exception {
service.messagingService().send(
clusterServices.get(activeActorRef.get()).topologyService().localMember(),
PLACEMENT_DRIVER_MESSAGES_FACTORY.stopLeaseProlongationMessage()
- .groupId(grpPart)
+ .groupId(GROUP_ID)
.redirectProposal(proposedLeaseholder)
.build()
);
@@ -455,7 +468,7 @@ public void testDeclineLeaseByLeaseholder() throws Exception {
* @return Renewed lease.
* @throws InterruptedException If the waiting is interrupted.
*/
- private Lease waitNewLeaseholder(TablePartitionId grpPart, Lease lease) throws InterruptedException {
+ private Lease waitNewLeaseholder(ZonePartitionId grpPart, Lease lease) throws InterruptedException {
var leaseRenewRef = new AtomicReference();
assertTrue(waitForCondition(() -> {
@@ -489,7 +502,7 @@ private Lease waitNewLeaseholder(TablePartitionId grpPart, Lease lease) throws I
* @return Renewed lease.
* @throws InterruptedException If the waiting is interrupted.
*/
- private Lease waitForProlong(TablePartitionId grpPart, Lease lease) throws InterruptedException {
+ private Lease waitForProlong(ZonePartitionId grpPart, Lease lease) throws InterruptedException {
var leaseRenewRef = new AtomicReference();
assertTrue(waitForCondition(() -> {
@@ -534,7 +547,7 @@ private Lease waitForProlong(TablePartitionId grpPart, Lease lease) throws Inter
* @return A lease that is read from Meta storage.
* @throws InterruptedException If the waiting is interrupted.
*/
- private Lease checkLeaseCreated(TablePartitionId grpPartId, boolean waitAccept) throws InterruptedException {
+ private Lease checkLeaseCreated(ZonePartitionId grpPartId, boolean waitAccept) throws InterruptedException {
AtomicReference leaseRef = new AtomicReference<>();
assertTrue(waitForCondition(() -> {
@@ -567,7 +580,7 @@ private Lease checkLeaseCreated(TablePartitionId grpPartId, boolean waitAccept)
*
* @return Replication group id.
*/
- private TablePartitionId createTableAssignment() {
- return createTableAssignment(metaStorageManager, nextTableId.get(), nodeNames);
+ private ZonePartitionId createTableAssignment() {
+ return createZoneAssignment(metaStorageManager, nextZoneId.get(), nodeNames);
}
}
diff --git a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/PlacementDriverManagerTest.java b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/PlacementDriverManagerTest.java
index 4171b9f9acd..c5e64514c83 100644
--- a/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/PlacementDriverManagerTest.java
+++ b/modules/placement-driver/src/integrationTest/java/org/apache/ignite/internal/placementdriver/PlacementDriverManagerTest.java
@@ -87,6 +87,7 @@
import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupServiceFactory;
import org.apache.ignite.internal.raft.configuration.RaftConfiguration;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.testframework.WithSystemProperty;
import org.apache.ignite.network.ClusterNode;
import org.apache.ignite.network.NetworkAddress;
@@ -104,6 +105,10 @@
public class PlacementDriverManagerTest extends BasePlacementDriverTest {
public static final int PORT = 1234;
+ protected static final TablePartitionId GROUP_ID = new TablePartitionId(1, 0);
+
+ protected static final ZonePartitionId ZONE_GROUP_ID = new ZonePartitionId(1, 0);
+
private static final PlacementDriverMessagesFactory PLACEMENT_DRIVER_MESSAGES_FACTORY = new PlacementDriverMessagesFactory();
private String nodeName;
@@ -210,7 +215,8 @@ private void startPlacementDriverManager() {
logicalTopologyService,
raftManager,
topologyAwareRaftGroupServiceFactory,
- new TestClockService(nodeClock)
+ new TestClockService(nodeClock),
+ grp -> ZONE_GROUP_ID
);
assertThat(
@@ -244,6 +250,7 @@ private NetworkMessageHandler leaseGrantMessageHandler(String handlerNode) {
if (resp == null) {
resp = PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(GROUP_ID))
.accepted(true)
.build();
}
@@ -274,7 +281,7 @@ private void stopPlacementDriverManager() throws Exception {
@Test
public void testLeaseCreate() throws Exception {
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createZoneAssignment();
checkLeaseCreated(grpPart0, false);
}
@@ -282,7 +289,7 @@ public void testLeaseCreate() throws Exception {
@Test
@WithSystemProperty(key = "IGNITE_LONG_LEASE", value = "200")
public void testLeaseRenew() throws Exception {
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createZoneAssignment();
checkLeaseCreated(grpPart0, false);
@@ -305,7 +312,7 @@ public void testLeaseRenew() throws Exception {
@Test
@WithSystemProperty(key = "IGNITE_LONG_LEASE", value = "200")
public void testLeaseholderUpdate() throws Exception {
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createZoneAssignment();
checkLeaseCreated(grpPart0, false);
@@ -337,7 +344,7 @@ public void testLeaseholderUpdate() throws Exception {
@Test
public void testPrimaryReplicaEvents() throws Exception {
- TablePartitionId grpPart0 = createTableAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName));
+ ZonePartitionId grpPart0 = createZoneAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName));
Lease lease1 = checkLeaseCreated(grpPart0, true);
@@ -366,7 +373,7 @@ public void testPrimaryReplicaEvents() throws Exception {
assertTrue(waitForCondition(() -> {
CompletableFuture fut = placementDriverManager.placementDriver()
- .getPrimaryReplica(grpPart0, lease1.getExpirationTime());
+ .getPrimaryReplica(GROUP_ID, lease1.getExpirationTime());
ReplicaMeta meta = fut.join();
@@ -388,7 +395,7 @@ public void testPrimaryReplicaEvents() throws Exception {
assertTrue(waitForCondition(() -> {
CompletableFuture fut = placementDriverManager.placementDriver()
- .getPrimaryReplica(grpPart0, lease2.getExpirationTime());
+ .getPrimaryReplica(GROUP_ID, lease2.getExpirationTime());
ReplicaMeta meta = fut.join();
@@ -456,12 +463,12 @@ private ClusterService startAnotherNode(String nodeName, int port) throws Except
@Test
public void testLeaseRemovedAfterExpirationAndAssignmetnsRemoval() throws Exception {
- List groupIds = List.of(
- createTableAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName)),
- createTableAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName))
+ List groupIds = List.of(
+ createZoneAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName)),
+ createZoneAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName))
);
- Map leaseExpirationMap =
+ Map leaseExpirationMap =
groupIds.stream().collect(Collectors.toMap(id -> id, id -> new AtomicBoolean()));
groupIds.forEach(groupId -> {
@@ -494,7 +501,7 @@ public void testLeaseRemovedAfterExpirationAndAssignmetnsRemoval() throws Except
@Test
public void testLeaseAccepted() throws Exception {
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createZoneAssignment();
checkLeaseCreated(grpPart0, true);
}
@@ -504,10 +511,11 @@ public void testLeaseForceAccepted() throws Exception {
leaseGrantHandler = (req, handler) ->
PLACEMENT_DRIVER_MESSAGES_FACTORY
.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(GROUP_ID))
.accepted(req.force())
.build();
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createZoneAssignment();
checkLeaseCreated(grpPart0, true);
}
@@ -522,7 +530,7 @@ public void testExceptionOnAcceptance() throws Exception {
throw new RuntimeException("test");
};
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createZoneAssignment();
checkLeaseCreated(grpPart0, false);
@@ -543,25 +551,27 @@ public void testRedirectionAcceptance() throws Exception {
return PLACEMENT_DRIVER_MESSAGES_FACTORY
.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(GROUP_ID))
.accepted(false)
.redirectProposal(redirect.get())
.build();
} else {
return PLACEMENT_DRIVER_MESSAGES_FACTORY
.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(GROUP_ID))
.accepted(redirect.get().equals(handler))
.build();
}
};
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createZoneAssignment();
checkLeaseCreated(grpPart0, true);
}
@Test
public void testLeaseRestore() throws Exception {
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createZoneAssignment();
checkLeaseCreated(grpPart0, false);
@@ -581,7 +591,7 @@ public void testLeaseMatchGrantMessage() throws Exception {
return null;
};
- TablePartitionId grpPart0 = createTableAssignment();
+ ZonePartitionId grpPart0 = createZoneAssignment();
Lease lease = checkLeaseCreated(grpPart0, false);
@@ -599,7 +609,7 @@ public void testLeaseMatchGrantMessage() throws Exception {
* @return A lease that is read from Meta storage.
* @throws InterruptedException If the waiting is interrupted.
*/
- private Lease checkLeaseCreated(TablePartitionId grpPartId, boolean waitAccept) throws InterruptedException {
+ private Lease checkLeaseCreated(ZonePartitionId grpPartId, boolean waitAccept) throws InterruptedException {
AtomicReference leaseRef = new AtomicReference<>();
assertTrue(waitForCondition(() -> {
@@ -632,8 +642,8 @@ private Lease checkLeaseCreated(TablePartitionId grpPartId, boolean waitAccept)
*
* @return Replication group id.
*/
- private TablePartitionId createTableAssignment() {
- return createTableAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName, anotherNodeName));
+ private ZonePartitionId createZoneAssignment() {
+ return createZoneAssignment(metaStorageManager, nextTableId.incrementAndGet(), List.of(nodeName, anotherNodeName));
}
/**
diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/AssignmentsTracker.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/AssignmentsTracker.java
index 11e4da4de6f..b8e9edb2606 100644
--- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/AssignmentsTracker.java
+++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/AssignmentsTracker.java
@@ -38,7 +38,7 @@
import org.apache.ignite.internal.metastorage.WatchEvent;
import org.apache.ignite.internal.metastorage.WatchListener;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.util.Cursor;
/**
@@ -94,7 +94,7 @@ public void startTrack() {
strKey = strKey.replace(STABLE_ASSIGNMENTS_PREFIX, "");
- TablePartitionId grpId = TablePartitionId.fromString(strKey);
+ ZonePartitionId grpId = ZonePartitionId.fromString(strKey);
Set assignments = Assignments.fromBytes(entry.value()).nodes();
@@ -144,7 +144,7 @@ public CompletableFuture onUpdate(WatchEvent event) {
for (EntryEvent evt : event.entryEvents()) {
Entry entry = evt.newEntry();
- var replicationGrpId = TablePartitionId.fromString(
+ var replicationGrpId = ZonePartitionId.fromString(
new String(entry.key(), StandardCharsets.UTF_8).replace(STABLE_ASSIGNMENTS_PREFIX, ""));
if (entry.tombstone()) {
diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/LeaseUpdater.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/LeaseUpdater.java
index 3ed175c099c..be26e38123b 100644
--- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/LeaseUpdater.java
+++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/LeaseUpdater.java
@@ -26,6 +26,7 @@
import static org.apache.ignite.internal.placementdriver.PlacementDriverManager.PLACEMENTDRIVER_LEASES_KEY;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
@@ -34,6 +35,7 @@
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.Function;
import org.apache.ignite.internal.affinity.Assignment;
import org.apache.ignite.internal.cluster.management.topology.api.LogicalTopologyService;
import org.apache.ignite.internal.hlc.ClockService;
@@ -56,6 +58,8 @@
import org.apache.ignite.internal.placementdriver.negotiation.LeaseAgreement;
import org.apache.ignite.internal.placementdriver.negotiation.LeaseNegotiator;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
+import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.thread.IgniteThread;
import org.apache.ignite.internal.tostring.IgniteToStringInclude;
import org.apache.ignite.internal.tostring.S;
@@ -80,6 +84,8 @@ public class LeaseUpdater {
/** Lease holding interval. */
private static final long LEASE_INTERVAL = 10 * UPDATE_LEASE_MS;
+ private final boolean alwaysForce = IgniteSystemProperties.getBoolean("IGNITE_ALWAYS_FORCE", true);
+
/** The lock is available when the actor is changing state. */
private final IgniteSpinBusyLock stateChangingLock = new IgniteSpinBusyLock();
@@ -106,6 +112,8 @@ public class LeaseUpdater {
/** Cluster clock. */
private final ClockService clockService;
+ private final Function groupIdProvider;
+
/** Closure to update leases. */
private final Updater updater;
@@ -133,13 +141,15 @@ public class LeaseUpdater {
MetaStorageManager msManager,
LogicalTopologyService topologyService,
LeaseTracker leaseTracker,
- ClockService clockService
+ ClockService clockService,
+ Function groupIdProvider
) {
this.nodeName = nodeName;
this.clusterService = clusterService;
this.msManager = msManager;
this.leaseTracker = leaseTracker;
this.clockService = clockService;
+ this.groupIdProvider = groupIdProvider;
this.longLeaseInterval = IgniteSystemProperties.getLong("IGNITE_LONG_LEASE", 120_000);
this.assignmentsTracker = new AssignmentsTracker(msManager);
@@ -368,7 +378,7 @@ private void updateLeaseBatchInternal() {
agreement.checkValid(grpId, topologyTracker.currentTopologySnapshot(), assignments);
if (agreement.isAccepted()) {
- publishLease(grpId, lease, renewedLeases);
+ publishLease(grpId, lease, renewedLeases, agreement.applicableFor());
continue;
} else if (agreement.isDeclined()) {
@@ -438,6 +448,11 @@ private void updateLeaseBatchInternal() {
);
}
+ if (Arrays.equals(leasesCurrent.leasesBytes(), renewedValue)) {
+ LOG.info("No leases to update found.");
+ return;
+ }
+
msManager.invoke(
or(notExists(key), value(key).eq(leasesCurrent.leasesBytes())),
put(key, renewedValue),
@@ -457,7 +472,7 @@ private void updateLeaseBatchInternal() {
for (Map.Entry entry : toBeNegotiated.entrySet()) {
Lease lease = renewedLeases.get(entry.getKey());
- boolean force = entry.getValue();
+ boolean force = alwaysForce || entry.getValue();
leaseNegotiator.negotiate(lease, force);
}
@@ -509,11 +524,18 @@ private void prolongLease(ReplicationGroupId grpId, Lease lease, Map renewedLeases) {
+ private void publishLease(
+ ReplicationGroupId grpId,
+ Lease lease,
+ Map renewedLeases,
+ Set subGrps
+ ) {
var newTs = new HybridTimestamp(clockService.now().getPhysical() + LEASE_INTERVAL, 0);
- Lease renewedLease = lease.acceptLease(newTs);
+ Lease renewedLease = lease.acceptLease(newTs, subGrps);
renewedLeases.put(grpId, renewedLease);
@@ -613,17 +635,23 @@ public void onReceived(NetworkMessage msg0, ClusterNode sender, @Nullable Long c
private void processMessageInternal(String sender, PlacementDriverActorMessage msg) {
ReplicationGroupId grpId = msg.groupId();
- Lease lease = leaseTracker.getLease(grpId);
+ assert grpId instanceof TablePartitionId : "Unexpected replication group type [grp=" + grpId + "].";
+
+ var tblPartId = (TablePartitionId) grpId;
+
+ ReplicationGroupId grpId0 = groupIdProvider.apply(tblPartId);
+
+ Lease lease = leaseTracker.getLease(grpId0);
if (msg instanceof StopLeaseProlongationMessage) {
if (lease.isProlongable() && sender.equals(lease.getLeaseholder())) {
StopLeaseProlongationMessage stopLeaseProlongationMessage = (StopLeaseProlongationMessage) msg;
- denyLease(grpId, lease, stopLeaseProlongationMessage.redirectProposal()).whenComplete((res, th) -> {
+ denyLease(grpId0, lease, stopLeaseProlongationMessage.redirectProposal()).whenComplete((res, th) -> {
if (th != null) {
- LOG.warn("Prolongation denial failed due to exception [groupId={}]", th, grpId);
+ LOG.warn("Prolongation denial failed due to exception [groupId={}]", th, grpId0);
} else {
- LOG.info("Stop lease prolongation message was handled [groupId={}, sender={}, deny={}]", grpId, sender, res);
+ LOG.info("Stop lease prolongation message was handled [groupId={}, sender={}, deny={}]", grpId0, sender, res);
}
});
}
diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriverManager.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriverManager.java
index 9edc6dbbfb9..f35b084c33a 100644
--- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriverManager.java
+++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/PlacementDriverManager.java
@@ -25,6 +25,7 @@
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;
+import java.util.function.Function;
import java.util.function.Supplier;
import org.apache.ignite.internal.cluster.management.topology.api.LogicalTopologyService;
import org.apache.ignite.internal.hlc.ClockService;
@@ -41,6 +42,8 @@
import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupService;
import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupServiceFactory;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
+import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.util.IgniteSpinBusyLock;
import org.apache.ignite.network.ClusterNode;
import org.jetbrains.annotations.TestOnly;
@@ -110,7 +113,8 @@ public PlacementDriverManager(
LogicalTopologyService logicalTopologyService,
RaftManager raftManager,
TopologyAwareRaftGroupServiceFactory topologyAwareRaftGroupServiceFactory,
- ClockService clockService
+ ClockService clockService,
+ Function tablePartIdToZoneIdProvider
) {
this.replicationGroupId = replicationGroupId;
this.clusterService = clusterService;
@@ -121,7 +125,13 @@ public PlacementDriverManager(
this.raftClientFuture = new CompletableFuture<>();
- this.leaseTracker = new LeaseTracker(metastore, clusterService.topologyService(), clockService);
+ this.leaseTracker = new LeaseTracker(
+ nodeName,
+ metastore,
+ clusterService.topologyService(),
+ clockService,
+ tablePartIdToZoneIdProvider
+ );
this.leaseUpdater = new LeaseUpdater(
nodeName,
@@ -129,7 +139,8 @@ public PlacementDriverManager(
metastore,
logicalTopologyService,
leaseTracker,
- clockService
+ clockService,
+ tablePartIdToZoneIdProvider
);
}
diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/Lease.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/Lease.java
index d613fa3e6d5..f26c2181067 100644
--- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/Lease.java
+++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/Lease.java
@@ -27,7 +27,9 @@
import static org.apache.ignite.internal.util.ByteUtils.toBytes;
import java.nio.ByteBuffer;
+import java.util.Collections;
import java.util.Objects;
+import java.util.Set;
import org.apache.ignite.internal.hlc.HybridTimestamp;
import org.apache.ignite.internal.placementdriver.ReplicaMeta;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
@@ -36,8 +38,7 @@
import org.jetbrains.annotations.Nullable;
/**
- * A lease representation in memory.
- * The real lease is stored in Meta storage.
+ * A lease representation in memory. The real lease is stored in Meta storage.
*/
public class Lease implements ReplicaMeta {
private static final long serialVersionUID = 394641185393949608L;
@@ -67,6 +68,9 @@ public class Lease implements ReplicaMeta {
/** ID of replication group. */
private final ReplicationGroupId replicationGroupId;
+ /** Table partition replication groups. */
+ private final Set subgroups;
+
/**
* Creates a new lease.
*
@@ -83,7 +87,7 @@ public Lease(
HybridTimestamp leaseExpirationTime,
ReplicationGroupId replicationGroupId
) {
- this(leaseholder, leaseholderId, startTime, leaseExpirationTime, false, false, null, replicationGroupId);
+ this(leaseholder, leaseholderId, startTime, leaseExpirationTime, false, false, null, replicationGroupId, Collections.emptySet());
}
/**
@@ -95,9 +99,10 @@ public Lease(
* @param leaseExpirationTime Lease expiration timestamp.
* @param prolong Lease is available to prolong.
* @param accepted The flag is {@code true} when the holder accepted the lease.
- * @param proposedCandidate The name of a node that is proposed to be a next leaseholder. This is not null in case when the lease
- * is not prolongable.
+ * @param proposedCandidate The name of a node that is proposed to be a next leaseholder. This is not null in case when the
+ * lease is not prolongable.
* @param replicationGroupId ID of replication group.
+ * @param subgroups Table partition replication groups.
*/
public Lease(
@Nullable String leaseholder,
@@ -107,7 +112,8 @@ public Lease(
boolean prolong,
boolean accepted,
@Nullable String proposedCandidate,
- ReplicationGroupId replicationGroupId
+ ReplicationGroupId replicationGroupId,
+ Set subgroups
) {
assert (leaseholder == null) == (leaseholderId == null) : "leaseholder=" + leaseholder + ", leaseholderId=" + leaseholderId;
@@ -121,6 +127,7 @@ public Lease(
this.accepted = accepted;
this.replicationGroupId = replicationGroupId;
this.proposedCandidate = proposedCandidate;
+ this.subgroups = subgroups;
}
/**
@@ -133,7 +140,17 @@ public Lease prolongLease(HybridTimestamp to) {
assert accepted : "The lease should be accepted by leaseholder before prolongation: [lease=" + this + ", to=" + to + ']';
assert prolongable : "The lease should be available to prolong: [lease=" + this + ", to=" + to + ']';
- return new Lease(leaseholder, leaseholderId, startTime, to, true, true, null, replicationGroupId);
+ return new Lease(
+ leaseholder,
+ leaseholderId,
+ startTime,
+ to,
+ true,
+ true,
+ null,
+ replicationGroupId,
+ subgroups
+ );
}
/**
@@ -142,10 +159,10 @@ public Lease prolongLease(HybridTimestamp to) {
* @param to The new lease expiration timestamp.
 * @return An accepted lease.
*/
- public Lease acceptLease(HybridTimestamp to) {
+ public Lease acceptLease(HybridTimestamp to, Set parts) {
assert !accepted : "The lease is already accepted: " + this;
- return new Lease(leaseholder, leaseholderId, startTime, to, true, true, null, replicationGroupId);
+ return new Lease(leaseholder, leaseholderId, startTime, to, true, true, null, replicationGroupId, parts);
}
/**
@@ -156,7 +173,17 @@ public Lease acceptLease(HybridTimestamp to) {
public Lease denyLease(String proposedCandidate) {
assert accepted : "The lease is not accepted: " + this;
- return new Lease(leaseholder, leaseholderId, startTime, expirationTime, false, true, proposedCandidate, replicationGroupId);
+ return new Lease(
+ leaseholder,
+ leaseholderId,
+ startTime,
+ expirationTime,
+ false,
+ true,
+ proposedCandidate,
+ replicationGroupId,
+ subgroups
+ );
}
@Override
@@ -179,6 +206,11 @@ public HybridTimestamp getExpirationTime() {
return expirationTime;
}
+ @Override
+ public Set subgroups() {
+ return subgroups;
+ }
+
/** Returns {@code true} if the lease might be prolonged. */
public boolean isProlongable() {
return prolongable;
@@ -210,11 +242,15 @@ public byte[] bytes() {
byte[] leaseholderIdBytes = stringToBytes(leaseholderId);
byte[] proposedCandidateBytes = stringToBytes(proposedCandidate);
byte[] groupIdBytes = toBytes(replicationGroupId);
+ byte[] subgroupsBytes = toBytes(subgroups);
int bufSize = 2 // accepted + prolongable
+ HYBRID_TIMESTAMP_SIZE * 2 // startTime + expirationTime
- + bytesSizeForWrite(leaseholderBytes) + bytesSizeForWrite(leaseholderIdBytes) + bytesSizeForWrite(proposedCandidateBytes)
- + bytesSizeForWrite(groupIdBytes);
+ + bytesSizeForWrite(leaseholderBytes)
+ + bytesSizeForWrite(leaseholderIdBytes)
+ + bytesSizeForWrite(proposedCandidateBytes)
+ + bytesSizeForWrite(groupIdBytes)
+ + bytesSizeForWrite(subgroupsBytes);
ByteBuffer buf = ByteBuffer.allocate(bufSize).order(LITTLE_ENDIAN);
@@ -228,6 +264,7 @@ public byte[] bytes() {
putBytes(buf, leaseholderIdBytes);
putBytes(buf, proposedCandidateBytes);
putBytes(buf, groupIdBytes);
+ putBytes(buf, subgroupsBytes);
return buf.array();
}
@@ -252,8 +289,9 @@ public static Lease fromBytes(ByteBuffer buf) {
String proposedCandidate = stringFromBytes(getBytes(buf));
ReplicationGroupId groupId = ByteUtils.fromBytes(getBytes(buf));
+ Set parts = ByteUtils.fromBytes(getBytes(buf));
- return new Lease(leaseholder, leaseholderId, startTime, expirationTime, prolongable, accepted, proposedCandidate, groupId);
+ return new Lease(leaseholder, leaseholderId, startTime, expirationTime, prolongable, accepted, proposedCandidate, groupId, parts);
}
/**
diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/LeaseTracker.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/LeaseTracker.java
index b671009dc90..bc7010b1116 100644
--- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/LeaseTracker.java
+++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/leases/LeaseTracker.java
@@ -23,6 +23,11 @@
import static java.util.concurrent.CompletableFuture.allOf;
import static java.util.concurrent.CompletableFuture.completedFuture;
import static org.apache.ignite.internal.hlc.HybridTimestamp.MIN_VALUE;
+import static org.apache.ignite.internal.metastorage.dsl.Conditions.notExists;
+import static org.apache.ignite.internal.metastorage.dsl.Conditions.or;
+import static org.apache.ignite.internal.metastorage.dsl.Conditions.value;
+import static org.apache.ignite.internal.metastorage.dsl.Operations.noop;
+import static org.apache.ignite.internal.metastorage.dsl.Operations.put;
import static org.apache.ignite.internal.placementdriver.PlacementDriverManager.PLACEMENTDRIVER_LEASES_KEY;
import static org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent.PRIMARY_REPLICA_ELECTED;
import static org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent.PRIMARY_REPLICA_EXPIRED;
@@ -35,17 +40,21 @@
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.Function;
import org.apache.ignite.internal.event.AbstractEventProducer;
import org.apache.ignite.internal.hlc.ClockService;
import org.apache.ignite.internal.hlc.HybridTimestamp;
-import org.apache.ignite.internal.lang.IgniteStringFormatter;
import org.apache.ignite.internal.logger.IgniteLogger;
import org.apache.ignite.internal.logger.Loggers;
import org.apache.ignite.internal.metastorage.Entry;
@@ -60,7 +69,12 @@
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent;
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
+import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
+import org.apache.ignite.internal.replicator.exception.PrimaryReplicaMissException;
+import org.apache.ignite.internal.thread.NamedThreadFactory;
import org.apache.ignite.internal.util.IgniteSpinBusyLock;
+import org.apache.ignite.internal.util.IgniteUtils;
import org.apache.ignite.internal.util.PendingIndependentComparableValuesTracker;
import org.apache.ignite.network.ClusterNode;
import org.apache.ignite.network.ClusterNodeResolver;
@@ -99,18 +113,35 @@ public class LeaseTracker extends AbstractEventProducer tablePartIdToZoneIdProvider;
+
private final ClockService clockService;
+ /** Node name. */
+ private final String nodeName;
+
+ /** Repeated Meta storage lease subgroup updates will be handled in this thread pool. */
+ private ExecutorService leaseUpdateRetryExecutor;
+
/**
* Constructor.
*
+ * @param nodeName Node name.
* @param msManager Meta storage manager.
* @param clockService Clock service.
*/
- public LeaseTracker(MetaStorageManager msManager, ClusterNodeResolver clusterNodeResolver, ClockService clockService) {
+ public LeaseTracker(
+ String nodeName,
+ MetaStorageManager msManager,
+ ClusterNodeResolver clusterNodeResolver,
+ ClockService clockService,
+ Function tablePartIdToZoneIdProvider
+ ) {
+ this.nodeName = nodeName;
this.msManager = msManager;
this.clusterNodeResolver = clusterNodeResolver;
this.clockService = clockService;
+ this.tablePartIdToZoneIdProvider = tablePartIdToZoneIdProvider;
}
/**
@@ -123,9 +154,96 @@ public void startTrack(long recoveryRevision) {
msManager.registerExactWatch(PLACEMENTDRIVER_LEASES_KEY, updateListener);
loadLeasesBusyAsync(recoveryRevision);
+
+ leaseUpdateRetryExecutor = Executors.newSingleThreadExecutor(
+ NamedThreadFactory.create(nodeName, "lease-update-retry-executor", LOG)
+ );
});
}
+ @Override
+ public CompletableFuture addSubgroups(
+ ZonePartitionId zoneId,
+ Long enlistmentConsistencyToken,
+ Set subGrps
+ ) {
+ if (leases.leaseByGroupId().get(zoneId).subgroups().containsAll(subGrps)) {
+ return nullCompletedFuture();
+ }
+
+ CompletableFuture resultFut = new CompletableFuture<>();
+
+ Leases leasesCurrent = leases;
+ Map previousLeasesMap = leasesCurrent.leaseByGroupId();
+ Map renewedLeases = new HashMap<>(previousLeasesMap);
+
+ Lease previousLease = previousLeasesMap.get(zoneId);
+
+ if (previousLease != null && enlistmentConsistencyToken.equals(previousLease.getStartTime().longValue())) {
+ HashSet subgroups = new HashSet<>(previousLease.subgroups());
+
+ subgroups.addAll(subGrps);
+
+ renewedLeases.put(zoneId, new Lease(
+ previousLease.getLeaseholder(),
+ previousLease.getLeaseholderId(),
+ previousLease.getStartTime(),
+ previousLease.getExpirationTime(),
+ previousLease.isProlongable(),
+ previousLease.isAccepted(),
+ null,
+ previousLease.replicationGroupId(),
+ subgroups));
+ } else {
+ resultFut.completeExceptionally(new PrimaryReplicaMissException(
+ nodeName,
+ null,
+ "localNode.id()",
+ null,
+ null,
+ null,
+ null
+ ));
+
+ return resultFut;
+ }
+
+ byte[] renewedValue = new LeaseBatch(renewedLeases.values()).bytes();
+
+ msManager.invoke(
+ or(notExists(PLACEMENTDRIVER_LEASES_KEY), value(PLACEMENTDRIVER_LEASES_KEY).eq(leasesCurrent.leasesBytes())),
+ put(PLACEMENTDRIVER_LEASES_KEY, renewedValue),
+ noop()
+ ).whenCompleteAsync((invokeResult, throwable) -> {
+ if (throwable != null) {
+ resultFut.completeExceptionally(throwable);
+
+ return;
+ }
+
+ if (invokeResult) {
+ resultFut.complete(null);
+ } else {
+ try {
+ // Throttling.
+ Thread.sleep(200);
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+
+ addSubgroups(zoneId, enlistmentConsistencyToken, subGrps).whenComplete((unused, throwable1) -> {
+ if (throwable1 != null) {
+ resultFut.completeExceptionally(throwable1);
+ }
+
+ resultFut.complete(null);
+ });
+ }
+ }, leaseUpdateRetryExecutor);
+
+ return resultFut;
+ }
+
/** Stops the tracker. */
public void stopTrack() {
if (!stopGuard.compareAndSet(false, true)) {
@@ -137,6 +255,8 @@ public void stopTrack() {
primaryReplicaWaiters.forEach((groupId, pendingTracker) -> pendingTracker.close());
primaryReplicaWaiters.clear();
+ IgniteUtils.shutdownAndAwaitTermination(leaseUpdateRetryExecutor, 10, TimeUnit.SECONDS);
+
msManager.unregisterWatch(updateListener);
}
@@ -152,6 +272,8 @@ public CompletableFuture previousPrimaryExpired(ReplicationGroupId grpId)
* @return A lease is associated with the group.
*/
public Lease getLease(ReplicationGroupId grpId) {
+ assert grpId instanceof ZonePartitionId : "Unexpected replication group type [grp=" + grpId + "].";
+
Leases leases = this.leases;
assert leases != null : "Leases not initialized, probably the local placement driver actor hasn't started lease tracking.";
@@ -161,6 +283,13 @@ public Lease getLease(ReplicationGroupId grpId) {
return lease == null ? emptyLease(grpId) : lease;
}
+ @Override
+ public ReplicaMeta getLeaseMeta(ReplicationGroupId grpId) {
+ Lease lease = getLease(grpId);
+
+ return lease.isAccepted() ? lease : null;
+ }
+
/** Returns collection of leases, ordered by replication group. */
public Leases leasesCurrent() {
return leases;
@@ -172,7 +301,7 @@ private class UpdateListener implements WatchListener {
public CompletableFuture onUpdate(WatchEvent event) {
return inBusyLockAsync(busyLock, () -> {
List> fireEventFutures = new ArrayList<>();
- List expiredLeases = new ArrayList<>();
+ HashMap expiredLeases = new HashMap<>();
for (EntryEvent entry : event.entryEvents()) {
Entry msEntry = entry.newEntry();
@@ -189,35 +318,45 @@ public CompletableFuture onUpdate(WatchEvent event) {
leasesMap.put(grpId, lease);
+ Lease previousLease = previousLeasesMap.get(grpId);
+
if (lease.isAccepted()) {
primaryReplicaWaiters
.computeIfAbsent(grpId, groupId -> new PendingIndependentComparableValuesTracker<>(MIN_VALUE))
.update(lease.getExpirationTime(), lease);
- if (needFireEventReplicaBecomePrimary(previousLeasesMap.get(grpId), lease)) {
- fireEventFutures.add(fireEventReplicaBecomePrimary(event.revision(), lease));
+ for (ReplicationGroupId groupToNotify : needFireEventReplicaBecomePrimary(previousLease, lease)) {
+ fireEventFutures.add(fireEventReplicaBecomePrimary(groupToNotify, event.revision(), lease));
}
}
- if (needToFireEventReplicaExpired(grpId, lease)) {
- expiredLeases.add(leases.leaseByGroupId().get(grpId));
+ if (previousLease != null && previousLease.isAccepted()) {
+ for (ReplicationGroupId groupToNotify : needFireEventReplicaExpired(previousLease, lease)) {
+ expiredLeases.put(groupToNotify, previousLease);
+ }
}
}
- for (ReplicationGroupId grpId : leases.leaseByGroupId().keySet()) {
+ for (Map.Entry replicaLease : previousLeasesMap.entrySet()) {
+ ReplicationGroupId grpId = replicaLease.getKey();
+
if (!leasesMap.containsKey(grpId)) {
tryRemoveTracker(grpId);
- if (needToFireEventReplicaExpired(grpId, null)) {
- expiredLeases.add(leases.leaseByGroupId().get(grpId));
+ Lease previousLease = previousLeasesMap.get(grpId);
+
+ if (previousLease.isAccepted()) {
+ for (ReplicationGroupId groupToNotify : needFireEventReplicaExpired(previousLease, null)) {
+ expiredLeases.put(groupToNotify, previousLease);
+ }
}
}
}
leases = new Leases(unmodifiableMap(leasesMap), leasesBytes);
- for (Lease expiredLease : expiredLeases) {
- firePrimaryReplicaExpiredEvent(event.revision(), expiredLease);
+ for (Map.Entry expiredLease : expiredLeases.entrySet()) {
+ fireEventPrimaryReplicaExpired(expiredLease.getKey(), event.revision(), expiredLease.getValue());
}
}
@@ -260,25 +399,56 @@ public CompletableFuture awaitPrimaryReplica(
long timeout,
TimeUnit unit
) {
+ assert groupId instanceof TablePartitionId : "Unexpected replication group type [grp=" + groupId + "].";
+
+ var tblPartId = (TablePartitionId) groupId;
+
+ ReplicationGroupId groupId0 = tablePartIdToZoneIdProvider.apply(tblPartId);
+
+ return awaitPrimaryReplicaForTable(
+ groupId0,
+ timestamp,
+ timeout,
+ unit
+ );
+ }
+
+ @Override
+ public CompletableFuture awaitPrimaryReplicaForTable(
+ ReplicationGroupId groupId,
+ HybridTimestamp timestamp,
+ long timeout,
+ TimeUnit unit
+ ) {
+ assert groupId instanceof ZonePartitionId : "Unexpected replication group type [grp=" + groupId + "].";
+
+ var zonePartId = ((ZonePartitionId) groupId).purify();
+
CompletableFuture future = new CompletableFuture<>();
- awaitPrimaryReplica(groupId, timestamp, future);
+ awaitPrimaryReplica(zonePartId, timestamp, future);
return future
.orTimeout(timeout, unit)
.exceptionally(e -> {
if (e instanceof TimeoutException) {
- throw new PrimaryReplicaAwaitTimeoutException(groupId, timestamp, leases.leaseByGroupId().get(groupId), e);
+ throw new PrimaryReplicaAwaitTimeoutException(zonePartId, timestamp, leases.leaseByGroupId().get(zonePartId), e);
}
- throw new PrimaryReplicaAwaitException(groupId, timestamp, e);
+ throw new PrimaryReplicaAwaitException(zonePartId, timestamp, e);
});
}
@Override
- public CompletableFuture getPrimaryReplica(ReplicationGroupId replicationGroupId, HybridTimestamp timestamp) {
+ public CompletableFuture getPrimaryReplica(ReplicationGroupId groupId, HybridTimestamp timestamp) {
+ assert groupId instanceof TablePartitionId : "Unexpected replication group type [grp=" + groupId + "].";
+
+ var tblPartId = (TablePartitionId) groupId;
+
+ ReplicationGroupId groupId0 = tablePartIdToZoneIdProvider.apply(tblPartId);
+
return inBusyLockAsync(busyLock, () -> {
- Lease lease = getLease(replicationGroupId);
+ Lease lease = getLease(groupId0);
if (lease.isAccepted() && clockService.after(lease.getExpirationTime(), timestamp)) {
return completedFuture(lease);
@@ -288,7 +458,7 @@ public CompletableFuture getPrimaryReplica(ReplicationGroupId repli
.clusterTime()
.waitFor(timestamp.addPhysicalTime(clockService.maxClockSkewMillis()))
.thenApply(ignored -> inBusyLock(busyLock, () -> {
- Lease lease0 = getLease(replicationGroupId);
+ Lease lease0 = getLease(groupId0);
if (lease0.isAccepted() && clockService.after(lease0.getExpirationTime(), timestamp)) {
return lease0;
@@ -352,60 +522,79 @@ private void loadLeasesBusyAsync(long recoveryRevision) {
/**
 * Determines whether the primary replica expired event needs to be fired, and for which replication groups.
*
- * @param grpId Group id, used for the cases when the {@code lease} parameter is null. Should be always not null.
- * @param lease Lease to check on expiration.
- * @return Whether the event is needed.
+ * @param previousLease Lease to check on expiration.
+ * @param newLease A new lease.
+ * @return Collection of replication group ids, which are needed to be notified.
*/
- private boolean needToFireEventReplicaExpired(ReplicationGroupId grpId, @Nullable Lease lease) {
- assert lease == null || lease.replicationGroupId().equals(grpId)
- : IgniteStringFormatter.format("Group id mismatch [groupId={}, lease={}]", grpId, lease);
+ private Set needFireEventReplicaExpired(Lease previousLease, @Nullable Lease newLease) {
+ assert previousLease.isAccepted() : previousLease;
- Lease currentLease = leases.leaseByGroupId().get(grpId);
+ if (newLease == null || !newLease.isAccepted() || !newLease.getStartTime().equals(previousLease.getStartTime())) {
+ return previousLease.subgroups();
+ }
- if (currentLease != null && currentLease.isAccepted()) {
- boolean sameLease = lease != null && currentLease.getStartTime().equals(lease.getStartTime());
+ Set needToBeNotified = new HashSet<>(previousLease.subgroups());
- if (!sameLease) {
- return true;
- }
- }
+ needToBeNotified.removeAll(newLease.subgroups());
- return false;
+ return needToBeNotified;
}
/**
* Fires the primary replica expire event.
*
+ * @param groupId Replication group id.
* @param causalityToken Causality token.
* @param expiredLease Expired lease.
*/
- private void firePrimaryReplicaExpiredEvent(long causalityToken, Lease expiredLease) {
- ReplicationGroupId grpId = expiredLease.replicationGroupId();
+ private void fireEventPrimaryReplicaExpired(ReplicationGroupId groupId, long causalityToken, Lease expiredLease) {
+ TablePartitionId tablePartitionId = (TablePartitionId) groupId;
+
+ ZonePartitionId zonePartitionId = (ZonePartitionId) expiredLease.replicationGroupId();
- CompletableFuture prev = expirationFutureByGroup.put(grpId, fireEvent(
+ CompletableFuture fut = fireEvent(
PRIMARY_REPLICA_EXPIRED,
new PrimaryReplicaEventParameters(
causalityToken,
- grpId,
+ new ZonePartitionId(zonePartitionId.zoneId(), tablePartitionId.tableId(), zonePartitionId.partitionId()),
expiredLease.getLeaseholderId(),
expiredLease.getLeaseholder(),
expiredLease.getStartTime()
)
- ));
+ );
+
+ CompletableFuture prev = expirationFutureByGroup.put(
+ groupId,
+ fut
+ );
- assert prev == null || prev.isDone() : "Previous lease expiration process has not completed yet [grpId=" + grpId + ']';
+ assert prev == null || prev.isDone() :
+ "Previous lease expiration process has not completed yet [grpId=" + expiredLease.replicationGroupId()
+ + ", subGrpId=" + groupId + ']';
}
- private CompletableFuture fireEventReplicaBecomePrimary(long causalityToken, Lease lease) {
+ /**
+ * Fires the replica become primary event.
+ *
+ * @param groupId Replication group id.
+ * @param causalityToken Causality token.
+ * @param lease A new lease.
+ * @return Future to notification complete.
+ */
+ private CompletableFuture fireEventReplicaBecomePrimary(ReplicationGroupId groupId, long causalityToken, Lease lease) {
String leaseholderId = lease.getLeaseholderId();
+ ZonePartitionId zonePartitionId = (ZonePartitionId) lease.replicationGroupId();
+
+ TablePartitionId tablePartitionId = (TablePartitionId) groupId;
+
assert leaseholderId != null : lease;
return fireEvent(
PRIMARY_REPLICA_ELECTED,
new PrimaryReplicaEventParameters(
causalityToken,
- lease.replicationGroupId(),
+ new ZonePartitionId(zonePartitionId.zoneId(), tablePartitionId.tableId(), zonePartitionId.partitionId()),
leaseholderId,
lease.getLeaseholder(),
lease.getStartTime()
@@ -418,11 +607,19 @@ private CompletableFuture fireEventReplicaBecomePrimary(long causalityToke
*
* @param previousLease Previous group lease, {@code null} if absent.
* @param newLease New group lease.
- * @return {@code true} if there is no previous lease for the group or the new lease is not prolongation.
+ * @return Collection of replication group ids, which are needed to be notified.
*/
- private static boolean needFireEventReplicaBecomePrimary(@Nullable Lease previousLease, Lease newLease) {
+ private static Set needFireEventReplicaBecomePrimary(@Nullable Lease previousLease, Lease newLease) {
assert newLease.isAccepted() : newLease;
- return previousLease == null || !previousLease.isAccepted() || !previousLease.getStartTime().equals(newLease.getStartTime());
+ if (previousLease == null || !previousLease.isAccepted() || !previousLease.getStartTime().equals(newLease.getStartTime())) {
+ return newLease.subgroups();
+ }
+
+ Set needToBeNotified = new HashSet<>(newLease.subgroups());
+
+ needToBeNotified.removeAll(previousLease.subgroups());
+
+ return needToBeNotified;
}
}
diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseAgreement.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseAgreement.java
index fa1fce72e9e..faabba6012d 100644
--- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseAgreement.java
+++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseAgreement.java
@@ -22,6 +22,7 @@
import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture;
import static org.apache.ignite.internal.util.IgniteUtils.findAny;
+import java.util.Collections;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import org.apache.ignite.internal.affinity.Assignment;
@@ -116,6 +117,19 @@ public String getRedirectTo() {
return resp != null ? resp.redirectProposal() : null;
}
+ /**
+ * Returns the set of replication subgroups for which the lease has been applied.
+ *
+ * @return A set of applied groups.
+ */
+ public Set applicableFor() {
+ assert ready() : "The method should be invoked only after the agreement is ready";
+
+ LeaseGrantedMessageResponse resp = responseFut.join();
+
+ return resp != null ? resp.appliedGroups() : Collections.emptySet();
+ }
+
/**
* Returns true if the agreement is negotiated, false otherwise.
*
diff --git a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseNegotiator.java b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseNegotiator.java
index ca3b719888f..8bd9a76e955 100644
--- a/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseNegotiator.java
+++ b/modules/placement-driver/src/main/java/org/apache/ignite/internal/placementdriver/negotiation/LeaseNegotiator.java
@@ -90,6 +90,8 @@ public void negotiate(Lease lease, boolean force) {
LeaseGrantedMessageResponse response = (LeaseGrantedMessageResponse) msg;
+ assert !response.accepted() || response.appliedGroups() != null : response;
+
fut.complete(response);
} else {
if (!(unwrapCause(throwable) instanceof NodeStoppingException)) {
diff --git a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseNegotiationTest.java b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseNegotiationTest.java
index 82b217f184b..80535fcb221 100644
--- a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseNegotiationTest.java
+++ b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseNegotiationTest.java
@@ -59,6 +59,7 @@
import org.apache.ignite.internal.placementdriver.message.LeaseGrantedMessageResponse;
import org.apache.ignite.internal.placementdriver.message.PlacementDriverMessagesFactory;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest;
import org.apache.ignite.network.NetworkAddress;
import org.apache.ignite.network.TopologyService;
@@ -72,7 +73,7 @@
public class LeaseNegotiationTest extends BaseIgniteAbstractTest {
private static final PlacementDriverMessagesFactory MSG_FACTORY = new PlacementDriverMessagesFactory();
- private static final TablePartitionId GROUP_ID = new TablePartitionId(0, 0);
+ private static final ZonePartitionId ZONE_PARTITION_ID = new ZonePartitionId(0, 0);
private static final String NODE_0_NAME = "node0";
private static final LogicalNode CLUSTER_NODE_0 = new LogicalNode(randomUUID().toString(), NODE_0_NAME, mock(NetworkAddress.class));
@@ -144,9 +145,11 @@ private LeaseUpdater createLeaseUpdater() {
when(pdClusterService.topologyService()).thenAnswer(inv -> pdTopologyService);
LeaseTracker leaseTracker = new LeaseTracker(
+ NODE_0_NAME,
metaStorageManager,
pdClusterService.topologyService(),
- new TestClockService(new HybridClockImpl())
+ new TestClockService(new HybridClockImpl()),
+ grpId -> ZONE_PARTITION_ID
);
leaseTracker.startTrack(0L);
@@ -157,12 +160,16 @@ private LeaseUpdater createLeaseUpdater() {
metaStorageManager,
pdLogicalTopologyService,
leaseTracker,
- new TestClockService(new HybridClockImpl())
+ new TestClockService(new HybridClockImpl()),
+ grpId -> ZONE_PARTITION_ID
);
}
private static LeaseGrantedMessageResponse createLeaseGrantedMessageResponse(boolean accept) {
- return MSG_FACTORY.leaseGrantedMessageResponse().accepted(accept).build();
+ return MSG_FACTORY.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(new TablePartitionId(42, 0)))
+ .accepted(accept)
+ .build();
}
@Test
@@ -180,11 +187,11 @@ public void testAssignmentChangeOnNegotiation() throws InterruptedException {
return createLeaseGrantedMessageResponse(true);
};
- metaStorageManager.put(stablePartAssignmentsKey(GROUP_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME))));
+ metaStorageManager.put(stablePartAssignmentsKey(ZONE_PARTITION_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME))));
assertThat(lgmReceived, willCompleteSuccessfully());
- metaStorageManager.put(stablePartAssignmentsKey(GROUP_ID), Assignments.toBytes(Set.of(forPeer(NODE_1_NAME))));
+ metaStorageManager.put(stablePartAssignmentsKey(ZONE_PARTITION_ID), Assignments.toBytes(Set.of(forPeer(NODE_1_NAME))));
waitForAcceptedLease();
@@ -207,7 +214,7 @@ public void testAssignmentChangeOnNegotiationAndReplicaRejectsLease() throws Int
return createLeaseGrantedMessageResponse(true);
};
- metaStorageManager.put(stablePartAssignmentsKey(GROUP_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME))));
+ metaStorageManager.put(stablePartAssignmentsKey(ZONE_PARTITION_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME))));
assertThat(lgmReceived, willCompleteSuccessfully());
@@ -231,7 +238,10 @@ public void testAssignmentChangeOnNegotiationNodeLeftTopology() throws Interrupt
return createLeaseGrantedMessageResponse(true);
};
- metaStorageManager.put(stablePartAssignmentsKey(GROUP_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME), forPeer(NODE_1_NAME))));
+ metaStorageManager.put(
+ stablePartAssignmentsKey(ZONE_PARTITION_ID),
+ Assignments.toBytes(Set.of(forPeer(NODE_0_NAME), forPeer(NODE_1_NAME)))
+ );
assertThat(lgmReceived, willCompleteSuccessfully());
@@ -258,7 +268,7 @@ public void testNetworkExceptionOnNegotiation() throws InterruptedException {
return createLeaseGrantedMessageResponse(true);
};
- metaStorageManager.put(stablePartAssignmentsKey(GROUP_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME))));
+ metaStorageManager.put(stablePartAssignmentsKey(ZONE_PARTITION_ID), Assignments.toBytes(Set.of(forPeer(NODE_0_NAME))));
assertThat(lgmReceived, willCompleteSuccessfully());
diff --git a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseTrackerTest.java b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseTrackerTest.java
index 826bd4d228e..835607cf8fe 100644
--- a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseTrackerTest.java
+++ b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseTrackerTest.java
@@ -29,6 +29,7 @@
import static org.mockito.Mockito.when;
import java.util.List;
+import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.ignite.internal.hlc.HybridClockImpl;
import org.apache.ignite.internal.hlc.HybridTimestamp;
@@ -45,6 +46,7 @@
import org.apache.ignite.internal.placementdriver.leases.LeaseBatch;
import org.apache.ignite.internal.placementdriver.leases.LeaseTracker;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest;
import org.apache.ignite.network.ClusterNodeResolver;
import org.junit.jupiter.api.Test;
@@ -71,9 +73,11 @@ public void testLeaseCleanup() {
when(msManager.getLocally(any(), anyLong())).thenAnswer(invocation -> emptyEntry);
LeaseTracker leaseTracker = new LeaseTracker(
+ "testNode",
msManager,
mock(ClusterNodeResolver.class),
- new TestClockService(new HybridClockImpl())
+ new TestClockService(new HybridClockImpl()),
+ tablePartitionId -> new ZonePartitionId(123, tablePartitionId.partitionId())
);
leaseTracker.startTrack(0L);
@@ -83,8 +87,11 @@ public void testLeaseCleanup() {
return falseCompletedFuture();
});
- TablePartitionId partId0 = new TablePartitionId(0, 0);
- TablePartitionId partId1 = new TablePartitionId(0, 1);
+ ZonePartitionId partId0 = new ZonePartitionId(123, 0);
+ ZonePartitionId partId1 = new ZonePartitionId(123, 1);
+
+ TablePartitionId tablePartitionId = new TablePartitionId(1, 1);
+ ZonePartitionId partId1FromEvent = new ZonePartitionId(123, 1, 1);
HybridTimestamp startTime = new HybridTimestamp(1, 0);
HybridTimestamp expirationTime = new HybridTimestamp(1000, 0);
@@ -94,7 +101,7 @@ public void testLeaseCleanup() {
Lease lease0 = new Lease(leaseholder0, leaseholder0 + "_id", startTime, expirationTime, partId0);
Lease lease1 = new Lease(leaseholder1, leaseholder1 + "_id", startTime, expirationTime, partId1)
- .acceptLease(new HybridTimestamp(2000, 0));
+ .acceptLease(new HybridTimestamp(2000, 0), Set.of(tablePartitionId));
// In entry0, there are leases for partition ids partId0 and partId1. In entry1, there is only partId0, so partId1 is expired.
Entry entry0 = new EntryImpl(PLACEMENTDRIVER_LEASES_KEY.bytes(), new LeaseBatch(List.of(lease0, lease1)).bytes(), 0, 0);
@@ -106,7 +113,7 @@ public void testLeaseCleanup() {
// Check that the absence of accepted lease triggers the event.
listenerRef.get().onUpdate(new WatchEvent(new EntryEvent(emptyEntry, entry1)));
assertNotNull(parametersRef.get());
- assertEquals(partId1, parametersRef.get().groupId());
+ assertEquals(partId1FromEvent, parametersRef.get().groupId());
// Check that the absence of not accepted lease doesn't trigger the event.
parametersRef.set(null);
diff --git a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseUpdaterTest.java b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseUpdaterTest.java
index 7caff8bb83d..f94afdfcb86 100644
--- a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseUpdaterTest.java
+++ b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/LeaseUpdaterTest.java
@@ -66,7 +66,7 @@
import org.apache.ignite.internal.placementdriver.leases.LeaseTracker;
import org.apache.ignite.internal.placementdriver.leases.Leases;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest;
import org.apache.ignite.internal.testframework.IgniteTestUtils;
import org.apache.ignite.internal.util.Cursor;
@@ -107,7 +107,7 @@ public class LeaseUpdaterTest extends BaseIgniteAbstractTest {
@BeforeEach
void setUp() {
Entry entry = new EntryImpl(
- stablePartAssignmentsKey(new TablePartitionId(1, 0)).bytes(),
+ stablePartAssignmentsKey(new ZonePartitionId(1, 0)).bytes(),
Assignments.of(Assignment.forPeer(node.name())).toBytes(),
1,
0
@@ -143,7 +143,8 @@ void setUp() {
metaStorageManager,
topologyService,
leaseTracker,
- new TestClockService(new HybridClockImpl())
+ new TestClockService(new HybridClockImpl()),
+ grp -> new ZonePartitionId(grp.tableId(), grp.partitionId())
);
leaseUpdater.init();
diff --git a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/PlacementDriverTest.java b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/PlacementDriverTest.java
index c177c5c7ef3..70e41a025bd 100644
--- a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/PlacementDriverTest.java
+++ b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/PlacementDriverTest.java
@@ -44,7 +44,9 @@
import static org.mockito.Mockito.mock;
import java.util.List;
+import java.util.Map;
import java.util.Objects;
+import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
@@ -63,6 +65,7 @@
import org.apache.ignite.internal.placementdriver.leases.LeaseTracker;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest;
import org.apache.ignite.internal.util.PendingComparableValuesTracker;
import org.apache.ignite.network.ClusterNode;
@@ -83,6 +86,10 @@ public class PlacementDriverTest extends BaseIgniteAbstractTest {
private static final TablePartitionId GROUP_1 = new TablePartitionId(1000, 0);
+ private static final ZonePartitionId ZONE_GROUP_1 = new ZonePartitionId(2000, 0);
+
+ private static final Map tableIdToZoneIdMapper = Map.of(GROUP_1, ZONE_GROUP_1);
+
private static final String LEASEHOLDER_1 = "leaseholder1";
private static final String LEASEHOLDER_ID_1 = "leaseholder1_id";
@@ -97,7 +104,8 @@ public class PlacementDriverTest extends BaseIgniteAbstractTest {
false,
true,
null,
- GROUP_1
+ ZONE_GROUP_1,
+ Set.of(GROUP_1)
);
private static final Lease LEASE_FROM_1_TO_15_000 = new Lease(
@@ -108,7 +116,8 @@ public class PlacementDriverTest extends BaseIgniteAbstractTest {
false,
true,
null,
- GROUP_1
+ ZONE_GROUP_1,
+ Set.of(GROUP_1)
);
private static final Lease LEASE_FROM_15_000_TO_30_000 = new Lease(
@@ -119,7 +128,8 @@ public class PlacementDriverTest extends BaseIgniteAbstractTest {
false,
true,
null,
- GROUP_1
+ ZONE_GROUP_1,
+ Set.of(GROUP_1)
);
private static final int AWAIT_PERIOD_FOR_LOCAL_NODE_TO_BE_NOTIFIED_ABOUT_LEASE_UPDATES = 1_000;
@@ -194,7 +204,10 @@ public void testAwaitPrimaryReplicaInInterval() throws Exception {
publishLease(LEASE_FROM_1_TO_5_000);
// Await local node to be notified about new primary replica.
- assertTrue(waitForCondition(() -> placementDriver.getLease(GROUP_1).equals(LEASE_FROM_1_TO_5_000), 1_000));
+ assertTrue(waitForCondition(
+ () -> placementDriver.getLease(ZONE_GROUP_1).equals(LEASE_FROM_1_TO_5_000),
+ 1_000)
+ );
// Assert that primary await future isn't completed yet because corresponding await time 10 is greater than lease expiration time 5.
assertFalse(primaryReplicaFuture.isDone());
@@ -232,7 +245,10 @@ public void testAwaitPrimaryReplicaBeforeInterval() throws Exception {
publishLease(LEASE_FROM_1_TO_5_000);
// Await local node to be notified about new primary replica.
- assertTrue(waitForCondition(() -> placementDriver.getLease(GROUP_1).equals(LEASE_FROM_1_TO_5_000), 1_000));
+ assertTrue(waitForCondition(
+ () -> placementDriver.getLease(ZONE_GROUP_1).equals(LEASE_FROM_1_TO_5_000),
+ 1_000
+ ));
// Assert that primary await future isn't completed yet because corresponding await time 10 is greater than lease expiration time 5.
assertFalse(primaryReplicaFuture.isDone());
@@ -262,8 +278,10 @@ public void testAwaitPrimaryReplicaBeforeIntervalAfterPublishing() throws Except
publishLease(LEASE_FROM_1_TO_15_000);
// Await local node to be notified about new primary replica.
- assertTrue(waitForCondition(() -> placementDriver.getLease(GROUP_1).equals(LEASE_FROM_1_TO_15_000),
- AWAIT_PERIOD_FOR_LOCAL_NODE_TO_BE_NOTIFIED_ABOUT_LEASE_UPDATES));
+ assertTrue(waitForCondition(
+ () -> placementDriver.getLease(ZONE_GROUP_1).equals(LEASE_FROM_1_TO_15_000),
+ AWAIT_PERIOD_FOR_LOCAL_NODE_TO_BE_NOTIFIED_ABOUT_LEASE_UPDATES
+ ));
// Await primary replica for time 10.
CompletableFuture primaryReplicaFuture = placementDriver.awaitPrimaryReplica(GROUP_1, AWAIT_TIME_10_000,
@@ -311,7 +329,8 @@ private void testAwaitCurrentPrimaryIsOffline(
false,
true,
null,
- GROUP_1
+ ZONE_GROUP_1,
+ Set.of(GROUP_1)
);
publishLease(firstLease);
@@ -333,7 +352,8 @@ private void testAwaitCurrentPrimaryIsOffline(
false,
true,
null,
- GROUP_1
+ ZONE_GROUP_1,
+ Set.of(GROUP_1)
);
if (newLeaseholderIsOnline) {
@@ -617,7 +637,7 @@ void testListenNeighborGroupReplicaBecomePrimaryEvent() {
publishLease(lease);
- TablePartitionId groupId = (TablePartitionId) lease.replicationGroupId();
+ ZonePartitionId groupId = (ZonePartitionId) lease.replicationGroupId();
CompletableFuture eventParametersFuture = listenSpecificGroupReplicaBecomePrimaryEvent(groupId);
@@ -629,7 +649,8 @@ void testListenNeighborGroupReplicaBecomePrimaryEvent() {
false,
true,
null,
- new TablePartitionId(groupId.tableId() + 1, groupId.partitionId() + 1)
+ new ZonePartitionId(groupId.zoneId() + 1, groupId.partitionId() + 1),
+ Set.of(new TablePartitionId(groupId.zoneId() + 1, groupId.partitionId() + 1))
);
publishLeases(lease, neighborGroupLease);
@@ -683,21 +704,27 @@ private static void checkReplicaBecomePrimaryEventParameters(
Lease expLease,
PrimaryReplicaEventParameters parameters
) {
- assertThat(parameters.groupId(), equalTo(expLease.replicationGroupId()));
+ assertThat(parameters.groupId().toString(), equalTo(expLease.replicationGroupId().toString()));
assertThat(parameters.leaseholderId(), equalTo(expLease.getLeaseholderId()));
}
private LeaseTracker createPlacementDriver() {
- return new LeaseTracker(metastore, new ClusterNodeResolver() {
- @Override
- public @Nullable ClusterNode getByConsistentId(String consistentId) {
- return leaseholder;
- }
-
- @Override
- public @Nullable ClusterNode getById(String id) {
- return leaseholder;
- }
- }, clockService);
+ return new LeaseTracker(
+ LEASEHOLDER_ID_1,
+ metastore,
+ new ClusterNodeResolver() {
+ @Override
+ public @Nullable ClusterNode getByConsistentId(String consistentId) {
+ return leaseholder;
+ }
+
+ @Override
+ public @Nullable ClusterNode getById(String id) {
+ return leaseholder;
+ }
+ },
+ clockService,
+ tableIdToZoneIdMapper::get
+ );
}
}
diff --git a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/leases/LeaseSerializationTest.java b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/leases/LeaseSerializationTest.java
index 43f0bee08b7..0182913c249 100644
--- a/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/leases/LeaseSerializationTest.java
+++ b/modules/placement-driver/src/test/java/org/apache/ignite/internal/placementdriver/leases/LeaseSerializationTest.java
@@ -22,6 +22,7 @@
import java.nio.ByteBuffer;
import java.util.ArrayList;
+import java.util.Set;
import org.apache.ignite.internal.hlc.HybridTimestamp;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
import org.apache.ignite.internal.replicator.TablePartitionId;
@@ -94,7 +95,8 @@ private static Lease newLease(
prolong,
accepted,
proposedCandidate,
- replicationGroupId
+ replicationGroupId,
+ Set.of(replicationGroupId)
);
}
diff --git a/modules/replicator/src/integrationTest/java/org/apache/ignite/internal/replicator/ItPlacementDriverReplicaSideTest.java b/modules/replicator/src/integrationTest/java/org/apache/ignite/internal/replicator/ItPlacementDriverReplicaSideTest.java
index 28d9bc8aee0..c24fa5021a5 100644
--- a/modules/replicator/src/integrationTest/java/org/apache/ignite/internal/replicator/ItPlacementDriverReplicaSideTest.java
+++ b/modules/replicator/src/integrationTest/java/org/apache/ignite/internal/replicator/ItPlacementDriverReplicaSideTest.java
@@ -108,7 +108,7 @@
public class ItPlacementDriverReplicaSideTest extends IgniteAbstractTest {
private static final int BASE_PORT = 1234;
- private static final TestReplicationGroupId GROUP_ID = new TestReplicationGroupId("group_1");
+ private static final TablePartitionId GROUP_ID = new TablePartitionId(1, 0);
private static final ReplicaMessagesFactory REPLICA_MESSAGES_FACTORY = new ReplicaMessagesFactory();
@@ -488,6 +488,7 @@ private CompletableFuture createReplicationGroup(
try {
return replicaManager.startReplica(
groupId,
+ new ZonePartitionId(0, 0),
(request, senderId) -> {
log.info("Handle request [type={}]", request.getClass().getSimpleName());
diff --git a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/Replica.java b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/Replica.java
index 380e5c7b227..299ce368f23 100644
--- a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/Replica.java
+++ b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/Replica.java
@@ -23,8 +23,9 @@
import static org.apache.ignite.internal.util.ExceptionUtils.unwrapCause;
import static org.apache.ignite.internal.util.IgniteUtils.retryOperationUntilSuccess;
+import java.util.Set;
import java.util.concurrent.CompletableFuture;
-import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
@@ -33,7 +34,6 @@
import org.apache.ignite.internal.lang.IgniteStringFormatter;
import org.apache.ignite.internal.logger.IgniteLogger;
import org.apache.ignite.internal.logger.Loggers;
-import org.apache.ignite.internal.network.NetworkMessage;
import org.apache.ignite.internal.placementdriver.PlacementDriver;
import org.apache.ignite.internal.placementdriver.message.LeaseGrantedMessage;
import org.apache.ignite.internal.placementdriver.message.LeaseGrantedMessageResponse;
@@ -43,8 +43,11 @@
import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupService;
import org.apache.ignite.internal.replicator.listener.ReplicaListener;
import org.apache.ignite.internal.replicator.message.PrimaryReplicaChangeCommand;
+import org.apache.ignite.internal.replicator.message.PrimaryReplicaRequest;
import org.apache.ignite.internal.replicator.message.ReplicaMessagesFactory;
import org.apache.ignite.internal.replicator.message.ReplicaRequest;
+import org.apache.ignite.internal.replicator.message.WaitReplicaStateMessage;
+import org.apache.ignite.internal.util.FastTimestamps;
import org.apache.ignite.internal.util.PendingComparableValuesTracker;
import org.apache.ignite.network.ClusterNode;
@@ -63,6 +66,9 @@ public class Replica {
/** Replica group identity, this id is the same as the considered partition's id. */
private final ReplicationGroupId replicaGrpId;
+ /** Zone partition id. */
+ private final ZonePartitionId zonePartitionId;
+
/** Replica listener. */
private final ReplicaListener listener;
@@ -88,16 +94,19 @@ public class Replica {
/** External executor. */
// TODO: IGNITE-20063 Maybe get rid of it
- private final ExecutorService executor;
+ private final Executor executor;
private final PlacementDriver placementDriver;
private final ClockService clockService;
+ private final CompletableFuture waitForActualStateFuture = new CompletableFuture<>();
+
/**
* The constructor of a replica server.
*
* @param replicaGrpId Replication group id.
+ * @param zonePartitionId Zone partition id.
* @param listener Replica listener.
* @param storageIndexTracker Storage index tracker.
* @param raftClient Topology aware Raft client.
@@ -108,15 +117,17 @@ public class Replica {
*/
public Replica(
ReplicationGroupId replicaGrpId,
+ ZonePartitionId zonePartitionId,
ReplicaListener listener,
PendingComparableValuesTracker storageIndexTracker,
TopologyAwareRaftGroupService raftClient,
ClusterNode localNode,
- ExecutorService executor,
+ Executor executor,
PlacementDriver placementDriver,
ClockService clockService
) {
this.replicaGrpId = replicaGrpId;
+ this.zonePartitionId = zonePartitionId;
this.listener = listener;
this.storageIndexTracker = storageIndexTracker;
this.raftClient = raftClient;
@@ -141,6 +152,41 @@ public CompletableFuture processRequest(ReplicaRequest request, S
request.groupId(),
replicaGrpId);
+ if (request instanceof PrimaryReplicaRequest) {
+ var targetPrimaryReq = (PrimaryReplicaRequest) request;
+
+ if (request instanceof WaitReplicaStateMessage) {
+ if (!waitForActualStateFuture.isDone()) {
+ return processWaitReplicaStateMessage((WaitReplicaStateMessage) request)
+ .thenComposeAsync(
+ v -> sendPrimaryReplicaChangeToReplicationGroup(targetPrimaryReq.enlistmentConsistencyToken()),
+ executor
+ )
+ .thenComposeAsync(
+ unused -> completedFuture(new ReplicaResult(null, null)),
+ executor
+ );
+ } else {
+ return completedFuture(new ReplicaResult(null, null));
+ }
+ }
+
+ if (!waitForActualStateFuture.isDone()) {
+ return placementDriver.addSubgroups(
+ zonePartitionId,
+ targetPrimaryReq.enlistmentConsistencyToken(),
+ Set.of(replicaGrpId)
+ )
+ // TODO: https://issues.apache.org/jira/browse/IGNITE-22122
+ .thenComposeAsync(unused -> waitForActualState(FastTimestamps.coarseCurrentTimeMillis() + 10_000), executor)
+ .thenComposeAsync(
+ v -> sendPrimaryReplicaChangeToReplicationGroup(targetPrimaryReq.enlistmentConsistencyToken()),
+ executor
+ )
+ .thenComposeAsync(unused -> listener.invoke(request, senderId), executor);
+ }
+ }
+
return listener.invoke(request, senderId);
}
@@ -171,7 +217,7 @@ private CompletableFuture leaderFuture() {
* @param msg Message to process.
* @return Future that contains a result.
*/
- public CompletableFuture extends NetworkMessage> processPlacementDriverMessage(PlacementDriverReplicaMessage msg) {
+ public CompletableFuture processPlacementDriverMessage(PlacementDriverReplicaMessage msg) {
if (msg instanceof LeaseGrantedMessage) {
return processLeaseGrantedMessage((LeaseGrantedMessage) msg)
.handle((v, e) -> {
@@ -203,7 +249,7 @@ public CompletableFuture extends NetworkMessage> processPlacementDriverMessage
private CompletableFuture processLeaseGrantedMessage(LeaseGrantedMessage msg) {
LOG.info("Received LeaseGrantedMessage for replica belonging to group=" + groupId() + ", force=" + msg.force());
- return placementDriver.previousPrimaryExpired(groupId()).thenCompose(unused -> leaderFuture().thenCompose(leader -> {
+ return placementDriver.previousPrimaryExpired(msg.groupId()).thenCompose(unused -> leaderFuture().thenCompose(leader -> {
HybridTimestamp leaseExpirationTime = this.leaseExpirationTime;
if (leaseExpirationTime != null) {
@@ -240,6 +286,18 @@ private CompletableFuture processLeaseGrantedMessag
}));
}
+ /**
+ * Process {@link WaitReplicaStateMessage}.
+ *
+ * @param msg Message to process.
+ * @return Future that contains a result.
+ */
+ private CompletableFuture processWaitReplicaStateMessage(WaitReplicaStateMessage msg) {
+ LOG.info("WaitReplicaStateMessage was received [groupId = {}]", groupId());
+
+ return waitForActualState(FastTimestamps.coarseCurrentTimeMillis() + TimeUnit.SECONDS.toMillis(msg.timeout()));
+ }
+
private CompletableFuture sendPrimaryReplicaChangeToReplicationGroup(long leaseStartTime) {
PrimaryReplicaChangeCommand cmd = REPLICA_MESSAGES_FACTORY.primaryReplicaChangeCommand()
.leaseStartTime(leaseStartTime)
@@ -257,6 +315,7 @@ private CompletableFuture acceptLease(
this.leaseExpirationTime = leaseExpirationTime;
LeaseGrantedMessageResponse resp = PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse()
+ .appliedGroups(Set.of(replicaGrpId))
.accepted(true)
.build();
@@ -292,7 +351,8 @@ private CompletableFuture waitForActualState(long expirationTime) {
return retryOperationUntilSuccess(raftClient::readIndex, e -> currentTimeMillis() > expirationTime, executor)
.orTimeout(timeout, TimeUnit.MILLISECONDS)
- .thenCompose(storageIndexTracker::waitFor);
+ .thenCompose(idx -> storageIndexTracker.waitFor(idx))
+ .thenRun(() -> waitForActualStateFuture.complete(null));
}
/**
diff --git a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaAwareLeaseTracker.java b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaAwareLeaseTracker.java
new file mode 100644
index 00000000000..acb366fc44f
--- /dev/null
+++ b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaAwareLeaseTracker.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.internal.replicator;
+
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.TimeUnit;
+import org.apache.ignite.internal.event.AbstractEventProducer;
+import org.apache.ignite.internal.event.EventListener;
+import org.apache.ignite.internal.hlc.HybridTimestamp;
+import org.apache.ignite.internal.placementdriver.PlacementDriver;
+import org.apache.ignite.internal.placementdriver.ReplicaMeta;
+import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEvent;
+import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters;
+import org.apache.ignite.internal.replicator.message.ReplicaMessagesFactory;
+import org.apache.ignite.internal.replicator.message.WaitReplicaStateMessage;
+import org.apache.ignite.network.ClusterNodeResolver;
+
+/**
+ * Implementation of {@link PlacementDriver} that is aware of {@link ReplicaService}.
+ * It delegates calls to the original {@link PlacementDriver} and after that sends {@link WaitReplicaStateMessage}
+ * which calls {@link org.apache.ignite.internal.replicator.Replica#waitForActualState(long)}.
+ */
+// TODO https://issues.apache.org/jira/browse/IGNITE-20362
+@Deprecated
+public class ReplicaAwareLeaseTracker extends AbstractEventProducer implements
+ PlacementDriver {
+ /** Replicator network message factory. */
+ private static final ReplicaMessagesFactory REPLICA_MESSAGES_FACTORY = new ReplicaMessagesFactory();
+
+ private final PlacementDriver delegate;
+ private final ReplicaService replicaService;
+
+ /** Resolver that resolves a node consistent ID to cluster node. */
+ private final ClusterNodeResolver clusterNodeResolver;
+
+
+ /**
+ * Constructor.
+ *
+ * @param delegate Delegate Placement Driver.
+ * @param replicaService Replica Service.
+ * @param clusterNodeResolver Cluster node resolver.
+ */
+ public ReplicaAwareLeaseTracker(PlacementDriver delegate, ReplicaService replicaService, ClusterNodeResolver clusterNodeResolver) {
+ this.delegate = delegate;
+ this.replicaService = replicaService;
+ this.clusterNodeResolver = clusterNodeResolver;
+ }
+
+ @Override
+ public void listen(PrimaryReplicaEvent evt, EventListener extends PrimaryReplicaEventParameters> listener) {
+ delegate.listen(evt, listener);
+ }
+
+ @Override
+ public void removeListener(PrimaryReplicaEvent evt, EventListener extends PrimaryReplicaEventParameters> listener) {
+ delegate.removeListener(evt, listener);
+ }
+
+ @Override
+ public CompletableFuture awaitPrimaryReplica(ReplicationGroupId groupId, HybridTimestamp timestamp, long timeout,
+ TimeUnit unit) {
+ return delegate.awaitPrimaryReplica(groupId, timestamp, timeout, unit);
+ }
+
+ @Override
+ public CompletableFuture awaitPrimaryReplicaForTable(
+ ReplicationGroupId groupId,
+ HybridTimestamp timestamp,
+ long timeout,
+ TimeUnit unit
+ ) {
+ ZonePartitionId zonePartitionId = (ZonePartitionId) groupId;
+
+ assert zonePartitionId.tableId() != 0 : "Table id should be defined.";
+
+ ZonePartitionId pureZonePartId = zonePartitionId.purify();
+
+ return delegate.awaitPrimaryReplicaForTable(pureZonePartId, timestamp, timeout, unit);
+ }
+
+ @Override
+ public CompletableFuture getPrimaryReplica(ReplicationGroupId replicationGroupId, HybridTimestamp timestamp) {
+ return delegate.getPrimaryReplica(replicationGroupId, timestamp);
+ }
+
+ @Override
+ public CompletableFuture previousPrimaryExpired(ReplicationGroupId grpId) {
+ return delegate.previousPrimaryExpired(grpId);
+ }
+
+ @Override
+ public ReplicaMeta getLeaseMeta(ReplicationGroupId grpId) {
+ return delegate.getLeaseMeta(grpId);
+ }
+
+ @Override
+ public CompletableFuture addSubgroups(ZonePartitionId zoneId, Long enlistmentConsistencyToken, Set subGrps) {
+ return delegate.addSubgroups(zoneId, enlistmentConsistencyToken, subGrps);
+ }
+}
diff --git a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java
index 1b7c8ef725b..936cabebb57 100644
--- a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java
+++ b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java
@@ -25,12 +25,16 @@
import static org.apache.ignite.internal.thread.ThreadOperation.STORAGE_READ;
import static org.apache.ignite.internal.thread.ThreadOperation.STORAGE_WRITE;
import static org.apache.ignite.internal.thread.ThreadOperation.TX_STATE_STORAGE_ACCESS;
+import static org.apache.ignite.internal.util.CompletableFutures.allOf;
import static org.apache.ignite.internal.util.CompletableFutures.isCompletedSuccessfully;
import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture;
import static org.apache.ignite.internal.util.ExceptionUtils.unwrapCause;
import static org.apache.ignite.internal.util.IgniteUtils.shutdownAndAwaitTermination;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
@@ -38,11 +42,8 @@
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
-import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
-import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
@@ -63,6 +64,9 @@
import org.apache.ignite.internal.network.NetworkMessage;
import org.apache.ignite.internal.network.NetworkMessageHandler;
import org.apache.ignite.internal.placementdriver.PlacementDriver;
+import org.apache.ignite.internal.placementdriver.ReplicaMeta;
+import org.apache.ignite.internal.placementdriver.message.LeaseGrantedMessage;
+import org.apache.ignite.internal.placementdriver.message.LeaseGrantedMessageResponse;
import org.apache.ignite.internal.placementdriver.message.PlacementDriverMessageGroup;
import org.apache.ignite.internal.placementdriver.message.PlacementDriverMessagesFactory;
import org.apache.ignite.internal.placementdriver.message.PlacementDriverReplicaMessage;
@@ -80,6 +84,7 @@
import org.apache.ignite.internal.replicator.message.ReplicaRequest;
import org.apache.ignite.internal.replicator.message.ReplicaSafeTimeSyncRequest;
import org.apache.ignite.internal.replicator.message.TimestampAware;
+import org.apache.ignite.internal.replicator.message.WaitReplicaStateMessage;
import org.apache.ignite.internal.thread.ExecutorChooser;
import org.apache.ignite.internal.thread.NamedThreadFactory;
import org.apache.ignite.internal.thread.PublicApiThreading;
@@ -137,9 +142,13 @@ public class ReplicaManager extends AbstractEventProducer> zonePartIdToTablePartId = new ConcurrentHashMap<>();
+
/** Scheduled executor for idle safe time sync. */
private final ScheduledExecutorService scheduledIdleSafeTimeSyncExecutor;
+ private final ScheduledExecutorService scheduledTableLeaseUpdateExecutor;
+
private final Executor requestsExecutor;
private final FailureProcessor failureProcessor;
@@ -147,10 +156,6 @@ public class ReplicaManager extends AbstractEventProducer> messageGroupsToHandle;
- /** Executor. */
- // TODO: IGNITE-20063 Maybe get rid of it
- private final ExecutorService executor;
-
private String localNodeId;
/**
@@ -226,15 +231,9 @@ public ReplicaManager(
NamedThreadFactory.create(nodeName, "scheduled-idle-safe-time-sync-thread", LOG)
);
- int threadCount = Runtime.getRuntime().availableProcessors();
-
- executor = new ThreadPoolExecutor(
- threadCount,
- threadCount,
- 30,
- TimeUnit.SECONDS,
- new LinkedBlockingQueue<>(),
- NamedThreadFactory.create(nodeName, "replica", LOG)
+ scheduledTableLeaseUpdateExecutor = Executors.newScheduledThreadPool(
+ 1,
+ NamedThreadFactory.create(nodeName, "scheduled-table-lease-update-thread", LOG)
);
}
@@ -414,7 +413,7 @@ private void onPlacementDriverMessageReceived(NetworkMessage msg0, ClusterNode s
assert correlationId != null;
- var msg = (PlacementDriverReplicaMessage) msg0;
+ var msg = (LeaseGrantedMessage) msg0;
if (!busyLock.enterBusy()) {
if (LOG.isInfoEnabled()) {
@@ -425,17 +424,37 @@ private void onPlacementDriverMessageReceived(NetworkMessage msg0, ClusterNode s
}
try {
- CompletableFuture replicaFut = replicas.computeIfAbsent(msg.groupId(), k -> new CompletableFuture<>());
-
- replicaFut
- .thenCompose(replica -> replica.processPlacementDriverMessage(msg))
- .whenComplete((response, ex) -> {
- if (ex == null) {
- clusterNetSvc.messagingService().respond(senderConsistentId, response, correlationId);
- } else if (!(unwrapCause(ex) instanceof NodeStoppingException)) {
- LOG.error("Failed to process placement driver message [msg={}].", ex, msg);
- }
- });
+ Set<ReplicationGroupId> replicationGroupIds = zonePartIdToTablePartId.getOrDefault((ZonePartitionId) msg.groupId(), Set.of());
+
+ CompletableFuture[] futures = new CompletableFuture[replicationGroupIds.size()];
+
+ int i = 0;
+
+ for (ReplicationGroupId grpId : replicationGroupIds) {
+ CompletableFuture replicaFut = replicas.computeIfAbsent(grpId, k -> new CompletableFuture<>());
+ futures[i++] = replicaFut.thenCompose(replica -> replica.processPlacementDriverMessage(msg));
+ }
+
+ allOf(futures).whenComplete((responses, ex) -> {
+ if (ex == null) {
+ boolean accepted = responses.stream().allMatch(LeaseGrantedMessageResponse::accepted);
+
+ assert !msg.force() || accepted : "We do not give a replica possibility to decline a forced request.";
+
+ String redirect = accepted ? null :
+ responses.stream().filter(leaseGranResp -> !leaseGranResp.accepted()).findAny().get().redirectProposal();
+
+ LeaseGrantedMessageResponse response = PLACEMENT_DRIVER_MESSAGES_FACTORY.leaseGrantedMessageResponse()
+ .appliedGroups(replicationGroupIds == null ? Collections.emptySet() : replicationGroupIds)
+ .redirectProposal(redirect)
+ .accepted(accepted)
+ .build();
+
+ clusterNetSvc.messagingService().respond(senderConsistentId, response, correlationId);
+ } else if (!(unwrapCause(ex) instanceof NodeStoppingException)) {
+ LOG.error("Failed to process placement driver message [msg={}].", ex, msg);
+ }
+ });
} finally {
busyLock.leaveBusy();
}
@@ -478,6 +497,7 @@ private void stopLeaseProlongation(ReplicationGroupId groupId, @Nullable String
*/
public CompletableFuture startReplica(
ReplicationGroupId replicaGrpId,
+ ZonePartitionId zonePartitionId,
ReplicaListener listener,
TopologyAwareRaftGroupService raftClient,
PendingComparableValuesTracker storageIndexTracker
@@ -487,7 +507,7 @@ public CompletableFuture startReplica(
}
try {
- return startReplicaInternal(replicaGrpId, listener, raftClient, storageIndexTracker);
+ return startReplicaInternal(replicaGrpId, zonePartitionId, listener, raftClient, storageIndexTracker);
} finally {
busyLock.leaveBusy();
}
@@ -503,6 +523,7 @@ public CompletableFuture startReplica(
*/
private CompletableFuture startReplicaInternal(
ReplicationGroupId replicaGrpId,
+ ZonePartitionId zonePartitionId,
ReplicaListener listener,
TopologyAwareRaftGroupService raftClient,
PendingComparableValuesTracker storageIndexTracker
@@ -513,16 +534,27 @@ private CompletableFuture startReplicaInternal(
Replica newReplica = new Replica(
replicaGrpId,
+ zonePartitionId,
listener,
storageIndexTracker,
raftClient,
localNode,
- executor,
+ requestsExecutor,
placementDriver,
clockService
);
CompletableFuture replicaFuture = replicas.compute(replicaGrpId, (k, existingReplicaFuture) -> {
+ zonePartIdToTablePartId.compute(zonePartitionId, (key, tablePartIds) -> {
+ if (tablePartIds == null) {
+ tablePartIds = new HashSet<>();
+ }
+
+ tablePartIds.add(replicaGrpId);
+
+ return tablePartIds;
+ });
+
if (existingReplicaFuture == null || existingReplicaFuture.isDone()) {
assert existingReplicaFuture == null || isCompletedSuccessfully(existingReplicaFuture);
LOG.info("Replica is started [replicationGroupId={}].", replicaGrpId);
@@ -612,6 +644,10 @@ private CompletableFuture stopReplicaInternal(ReplicationGroupId replic
});
}
+ zonePartIdToTablePartId.forEach((zonePartId, tblPartIds) -> {
+ tblPartIds.remove(replicaGrpId);
+ });
+
return null;
});
} finally {
@@ -639,6 +675,18 @@ public CompletableFuture startAsync() {
TimeUnit.MILLISECONDS
);
+ scheduledTableLeaseUpdateExecutor.scheduleAtFixedRate(() -> {
+ if (!busyLock.enterBusy()) {
+ return;
+ }
+
+ try {
+ updateTableGroupsInternal();
+ } finally {
+ busyLock.leaveBusy();
+ }
+ }, 0, 1, TimeUnit.SECONDS);
+
cmgMgr.metaStorageNodes().whenComplete((nodes, e) -> {
if (e != null) {
msNodes.completeExceptionally(e);
@@ -652,6 +700,59 @@ public CompletableFuture startAsync() {
return nullCompletedFuture();
}
+ /**
+ * Updates the list of replication groups for each distribution zone.
+ */
+ private void updateTableGroupsInternal() {
+ for (Entry<ZonePartitionId, Set<ReplicationGroupId>> entry : zonePartIdToTablePartId.entrySet()) {
+ ZonePartitionId repGrp = entry.getKey();
+
+ ReplicaMeta meta = placementDriver.getLeaseMeta(repGrp);
+
+ if (meta != null) {
+ HashSet<ReplicationGroupId> diff = new HashSet<>(entry.getValue());
+ diff.removeAll(meta.subgroups());
+
+ if (meta.getLeaseholderId().equals(localNodeId) && !diff.isEmpty()) {
+ LOG.info("New subgroups are found for existing lease [repGrp={}, subGroups={}].", repGrp, diff);
+
+ try {
+ placementDriver.addSubgroups(repGrp, meta.getStartTime().longValue(), diff)
+ .thenComposeAsync(unused -> {
+ ArrayList<CompletableFuture<?>> requestToReplicas = new ArrayList<>();
+
+ for (ReplicationGroupId partId : diff) {
+ WaitReplicaStateMessage req = REPLICA_MESSAGES_FACTORY.waitReplicaStateMessage()
+ .enlistmentConsistencyToken(meta.getStartTime().longValue())
+ .groupId(partId)
+ // TODO: https://issues.apache.org/jira/browse/IGNITE-22122
+ .timeout(10)
+ .build();
+
+ CompletableFuture<Replica> replicaFut = replicas.get(repGrp);
+
+ if (replicaFut != null) {
+ requestToReplicas.add(replicaFut.thenCompose(
+ replica -> replica.processRequest(req, localNodeId)));
+ }
+ }
+
+ return allOf(requestToReplicas.toArray(CompletableFuture[]::new));
+ }, scheduledTableLeaseUpdateExecutor)
+ .get(500, TimeUnit.MILLISECONDS);
+ } catch (Exception ex) {
+ LOG.error(
+ "Failed to add new subgroups to the replication group [repGrp={}, subGroups={}].",
+ ex,
+ repGrp,
+ diff
+ );
+ }
+ }
+ }
+ }
+ }
+
/** {@inheritDoc} */
@Override
public CompletableFuture stopAsync() {
@@ -662,7 +763,7 @@ public CompletableFuture stopAsync() {
busyLock.block();
shutdownAndAwaitTermination(scheduledIdleSafeTimeSyncExecutor, 10, TimeUnit.SECONDS);
- shutdownAndAwaitTermination(executor, 10, TimeUnit.SECONDS);
+ shutdownAndAwaitTermination(scheduledTableLeaseUpdateExecutor, 10, TimeUnit.SECONDS);
assert replicas.values().stream().noneMatch(CompletableFuture::isDone)
: "There are replicas alive [replicas="
diff --git a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/ReplicaMessageGroup.java b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/ReplicaMessageGroup.java
index b8407d992fc..d45aa87c8bc 100644
--- a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/ReplicaMessageGroup.java
+++ b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/ReplicaMessageGroup.java
@@ -54,4 +54,7 @@ public interface ReplicaMessageGroup {
/** Message type for {@link PrimaryReplicaChangeCommand}. */
short PRIMARY_REPLICA_CHANGE_COMMAND = 41;
+
+ /** Message type for {@link WaitReplicaStateMessage}. */
+ short WAIT_REPLICA_STATE = 42;
}
diff --git a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/WaitReplicaStateMessage.java b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/WaitReplicaStateMessage.java
new file mode 100644
index 00000000000..d25e59c204d
--- /dev/null
+++ b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/message/WaitReplicaStateMessage.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.internal.replicator.message;
+
+import org.apache.ignite.internal.network.annotations.Transferable;
+
+/**
+ * Waits for the replica state to become up to date with the leader.
+ */
+@Transferable(ReplicaMessageGroup.WAIT_REPLICA_STATE)
+public interface WaitReplicaStateMessage extends PrimaryReplicaRequest {
+ long timeout();
+}
diff --git a/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/PlacementDriverReplicaSideTest.java b/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/PlacementDriverReplicaSideTest.java
index 676eb5ec362..3c4d3ef837c 100644
--- a/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/PlacementDriverReplicaSideTest.java
+++ b/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/PlacementDriverReplicaSideTest.java
@@ -117,6 +117,7 @@ private Replica startReplica() {
return new Replica(
GRP_ID,
+ new ZonePartitionId(1, 0),
mock(ReplicaListener.class),
storageIndexTracker,
raftClient,
@@ -172,7 +173,7 @@ private CompletableFuture sendLeaseGranted(
.force(force)
.build();
- return replica.processPlacementDriverMessage(msg).thenApply(LeaseGrantedMessageResponse.class::cast);
+ return replica.processPlacementDriverMessage(msg);
}
private HybridTimestamp hts(long physical) {
diff --git a/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/ReplicaManagerTest.java b/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/ReplicaManagerTest.java
index 9963fabfa60..d2a4729e815 100644
--- a/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/ReplicaManagerTest.java
+++ b/modules/replicator/src/test/java/org/apache/ignite/internal/replicator/ReplicaManagerTest.java
@@ -153,9 +153,11 @@ void testReplicaEvents(
replicaManager.listen(BEFORE_REPLICA_STOPPED, removeReplicaListener);
var groupId = new TablePartitionId(0, 0);
+ var zonePartId = new ZonePartitionId(0, 0);
CompletableFuture startReplicaFuture = replicaManager.startReplica(
groupId,
+ zonePartId,
replicaListener,
raftGroupService,
new PendingComparableValuesTracker<>(0L)
diff --git a/modules/replicator/src/testFixtures/java/org/apache/ignite/internal/raft/client/AbstractTopologyAwareGroupServiceTest.java b/modules/replicator/src/testFixtures/java/org/apache/ignite/internal/raft/client/AbstractTopologyAwareGroupServiceTest.java
index 60ebeafa68c..f6f17602ac3 100644
--- a/modules/replicator/src/testFixtures/java/org/apache/ignite/internal/raft/client/AbstractTopologyAwareGroupServiceTest.java
+++ b/modules/replicator/src/testFixtures/java/org/apache/ignite/internal/raft/client/AbstractTopologyAwareGroupServiceTest.java
@@ -55,7 +55,8 @@
import org.apache.ignite.internal.raft.server.RaftGroupOptions;
import org.apache.ignite.internal.raft.server.impl.JraftServerImpl;
import org.apache.ignite.internal.raft.util.ThreadLocalOptimizedMarshaller;
-import org.apache.ignite.internal.replicator.TestReplicationGroupId;
+import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.testframework.IgniteAbstractTest;
import org.apache.ignite.internal.thread.NamedThreadFactory;
import org.apache.ignite.internal.topology.LogicalTopologyServiceTestImpl;
@@ -88,7 +89,9 @@ public abstract class AbstractTopologyAwareGroupServiceTest extends IgniteAbstra
/** Wait timeout, in milliseconds. */
protected static final int WAIT_TIMEOUT_MILLIS = 10_000;
- protected static final TestReplicationGroupId GROUP_ID = new TestReplicationGroupId("group_1");
+ protected static final TablePartitionId GROUP_ID = new TablePartitionId(1, 1);
+
+ protected static final ZonePartitionId ZONE_GROUP_ID = new ZonePartitionId(1, 1);
/** RPC executor. */
protected final ScheduledExecutorService executor = new ScheduledThreadPoolExecutor(
diff --git a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteNodeRestartTest.java b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteNodeRestartTest.java
index cdb511f090c..5d63268482e 100644
--- a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteNodeRestartTest.java
+++ b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteNodeRestartTest.java
@@ -26,6 +26,7 @@
import static org.apache.ignite.internal.TestWrappers.unwrapTableViewInternal;
import static org.apache.ignite.internal.catalog.CatalogService.DEFAULT_STORAGE_PROFILE;
import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.alterZone;
+import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.getZoneIdStrict;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.REBALANCE_SCHEDULER_POOL_SIZE;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.STABLE_ASSIGNMENTS_PREFIX;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.stablePartAssignmentsKey;
@@ -117,6 +118,7 @@
import org.apache.ignite.internal.hlc.ClockServiceImpl;
import org.apache.ignite.internal.hlc.ClockWaiter;
import org.apache.ignite.internal.hlc.HybridClockImpl;
+import org.apache.ignite.internal.hlc.TestClockService;
import org.apache.ignite.internal.index.IndexManager;
import org.apache.ignite.internal.lang.ByteArray;
import org.apache.ignite.internal.lang.IgniteInternalException;
@@ -152,9 +154,11 @@
import org.apache.ignite.internal.raft.configuration.RaftConfiguration;
import org.apache.ignite.internal.raft.server.impl.JraftServerImpl;
import org.apache.ignite.internal.raft.storage.impl.LocalLogStorageFactory;
+import org.apache.ignite.internal.replicator.ReplicaAwareLeaseTracker;
import org.apache.ignite.internal.replicator.ReplicaManager;
import org.apache.ignite.internal.replicator.ReplicaService;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.replicator.configuration.ReplicationConfiguration;
import org.apache.ignite.internal.schema.SchemaManager;
import org.apache.ignite.internal.schema.configuration.GcConfiguration;
@@ -232,6 +236,8 @@ public class ItIgniteNodeRestartTest extends BaseIgniteRestartTest {
/** Test table name. */
private static final String TABLE_NAME_2 = "Table2";
+ protected static final ZonePartitionId ZONE_GROUP_ID = new ZonePartitionId(1, 1);
+
@InjectConfiguration("mock: " + RAFT_CFG)
private static RaftConfiguration raftConfiguration;
@@ -443,10 +449,19 @@ public CompletableFuture invoke(Condition condition, Collection TestIgnitionManager.DEFAULT_DELAY_DURATION_MS;
+
+ var catalogManager = new CatalogManagerImpl(
+ new UpdateLogImpl(metaStorageMgr),
+ new TestClockService(hybridClock, clockWaiter),
+ delayDurationMsSupplier,
+ partitionIdleSafeTimePropagationPeriodMsSupplier
+ );
+
+ ConfigurationRegistry clusterConfigRegistry = clusterCfgMgr.configurationRegistry();
+
SchemaSynchronizationConfiguration schemaSyncConfiguration = clusterConfigRegistry.getConfiguration(
SchemaSynchronizationConfiguration.KEY
);
@@ -465,7 +480,8 @@ public CompletableFuture invoke(Condition condition, Collection ZONE_GROUP_ID
);
ReplicaManager replicaMgr = new ReplicaManager(
@@ -544,15 +560,6 @@ public CompletableFuture invoke(Condition condition, Collection TestIgnitionManager.DEFAULT_DELAY_DURATION_MS;
-
- var catalogManager = new CatalogManagerImpl(
- new UpdateLogImpl(metaStorageMgr),
- clockService,
- delayDurationMsSupplier,
- partitionIdleSafeTimePropagationPeriodMsSupplier
- );
-
SchemaManager schemaManager = new SchemaManager(registry, catalogManager);
var dataNodesMock = dataNodesMockByNode.get(idx);
@@ -645,7 +652,7 @@ public CompletableFuture> dataNodes(long causalityToken, int catalog
new SystemViewManagerImpl(name, catalogManager),
failureProcessor,
partitionIdleSafeTimePropagationPeriodMsSupplier,
- placementDriverManager.placementDriver(),
+ new ReplicaAwareLeaseTracker(placementDriverManager.placementDriver(), replicaService, clusterSvc.topologyService()),
clusterConfigRegistry.getConfiguration(SqlDistributedConfiguration.KEY),
nodeCfgMgr.configurationRegistry().getConfiguration(SqlLocalConfiguration.KEY),
transactionInflights
@@ -1419,7 +1426,11 @@ public void testCorrectPartitionRecoveryOnSnapshot() throws InterruptedException
inhibitor.startInhibit();
- alterZone(nodes.get(0).catalogManager(), String.format("ZONE_%s", TABLE_NAME.toUpperCase()), 1);
+ String zoneName = String.format("ZONE_%s", TABLE_NAME.toUpperCase());
+
+ alterZone(nodes.get(0).catalogManager(), zoneName, 1);
+
+ int zoneId = getZoneIdStrict(nodes.get(0).catalogManager(), zoneName, nodes.get(0).clock().nowLong());
stopNode(restartedNodeIndex);
@@ -1437,9 +1448,9 @@ public void testCorrectPartitionRecoveryOnSnapshot() throws InterruptedException
.collect(toSet()), Set.of());
for (int p = 0; p < partitions; p++) {
- TablePartitionId tablePartitionId = new TablePartitionId(table.tableId(), p);
+ ZonePartitionId zonePartitionId = new ZonePartitionId(zoneId, p);
- Entry e = restartedNode.metaStorageManager().getLocally(stablePartAssignmentsKey(tablePartitionId), recoveryRevision);
+ Entry e = restartedNode.metaStorageManager().getLocally(stablePartAssignmentsKey(zonePartitionId), recoveryRevision);
Set assignment = Assignments.fromBytes(e.value()).nodes();
@@ -1447,7 +1458,7 @@ public void testCorrectPartitionRecoveryOnSnapshot() throws InterruptedException
Peer peer = configuration.peer(restartedNode.name());
- boolean isStarted = restartedNode.raftManager().isStarted(new RaftNodeId(tablePartitionId, peer));
+ boolean isStarted = restartedNode.raftManager().isStarted(new RaftNodeId(new TablePartitionId(table.tableId(), p), peer));
assertEquals(shouldBe, isStarted);
}
@@ -1500,7 +1511,10 @@ public void createTableCallOnMultipleNodesTest(boolean populateStableAssignments
);
}
- var partId = new TablePartitionId(TABLE_ID, 0);
+ // Assume that the zone id will always be 7 for the test table. There is an assertion below to check this is true.
+ int zoneId = 7;
+
+ var partId = new ZonePartitionId(zoneId, 0);
// Populate the stable assignments before calling table create, if needed.
if (populateStableAssignmentsBeforeTableCreation) {
@@ -1525,7 +1539,7 @@ public void createTableCallOnMultipleNodesTest(boolean populateStableAssignments
sql.execute(null, "CREATE TABLE " + TABLE_NAME
+ "(id INT PRIMARY KEY, name VARCHAR) WITH PRIMARY_ZONE='" + zoneName + "';");
- assertEquals(TABLE_ID, tableId(node, TABLE_NAME));
+ assertEquals(zoneId, zoneId(node, zoneName));
node.metaStorageManager().put(new ByteArray(testPrefix.getBytes(StandardCharsets.UTF_8)), new byte[0]);
@@ -1603,7 +1617,10 @@ public void tableRecoveryOnMultipleRestartingNodes(int nodeThatWrittenAssignment
String tableName = "TEST";
String zoneName = "ZONE_TEST";
- var assignmentsKey = stablePartAssignmentsKey(new TablePartitionId(TABLE_ID, 0));
+ // Assume that the zone id will always be 7 for the test table. There is an assertion below to check this is true.
+ int zoneId = 7;
+
+ var assignmentsKey = stablePartAssignmentsKey(new ZonePartitionId(zoneId, 0));
var metaStorageInterceptorFut = new CompletableFuture<>();
var metaStorageInterceptorInnerFut = new CompletableFuture<>();
@@ -1679,7 +1696,7 @@ public void tableRecoveryOnMultipleRestartingNodes(int nodeThatWrittenAssignment
nodeInhibitor0.stopInhibit();
waitForValueInLocalMs(node0.metaStorageManager(), assignmentsKey);
- assertEquals(TABLE_ID, tableId(node0, tableName));
+ assertEquals(zoneId, zoneId(node0, zoneName));
Set expectedAssignments = dataNodesMockByNode.get(nodeThatWrittenAssignments).get().join()
.stream().map(Assignment::forPeer).collect(toSet());
@@ -1709,7 +1726,9 @@ public void testSequentialAsyncTableCreationThenAlterZoneThenRestartOnMsSnapshot
nodeInhibitor0.startInhibit();
nodeInhibitor1.startInhibit();
- var assignmentsKey = stablePartAssignmentsKey(new TablePartitionId(TABLE_ID, 0));
+ int zoneId = zoneId(node0, zoneName);
+
+ var assignmentsKey = stablePartAssignmentsKey(new ZonePartitionId(zoneId, 0));
var tableFut = createTableInCatalog(node0.catalogManager(), tableName, zoneName);
@@ -1744,7 +1763,7 @@ public void testSequentialAsyncTableCreationThenAlterZoneThenRestartOnMsSnapshot
assertThat(tableFut, willCompleteSuccessfully());
assertThat(alterZoneFut, willCompleteSuccessfully());
- assertEquals(TABLE_ID, tableId(node0, tableName));
+ assertEquals(zoneId, zoneId(node0, zoneName));
waitForValueInLocalMs(node0.metaStorageManager(), assignmentsKey);
@@ -1828,8 +1847,10 @@ private Set getAssignmentsFromMetaStorage(MetaStorageManager metaSto
: Assignments.fromBytes(e.value()).nodes();
}
- private int tableId(Ignite node, String tableName) {
- return (unwrapTableImpl(node.tables().table(tableName))).tableId();
+ private int zoneId(IgniteImpl node, String zoneName) {
+ int zoneId = getZoneIdStrict(node.catalogManager(), zoneName.toUpperCase(), node.clock().nowLong());
+
+ return zoneId;
}
/**
diff --git a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/table/ItDurableFinishTest.java b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/table/ItDurableFinishTest.java
index a36a3886854..57063c7fb2c 100644
--- a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/table/ItDurableFinishTest.java
+++ b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/table/ItDurableFinishTest.java
@@ -50,6 +50,7 @@
import org.apache.ignite.internal.replicator.TablePartitionId;
import org.apache.ignite.internal.replicator.configuration.ReplicationConfiguration;
import org.apache.ignite.internal.testframework.IgniteTestUtils;
+import org.apache.ignite.internal.testframework.WithSystemProperty;
import org.apache.ignite.internal.tx.InternalTransaction;
import org.apache.ignite.internal.tx.MismatchingTransactionOutcomeException;
import org.apache.ignite.internal.tx.TxMeta;
@@ -70,6 +71,7 @@
/**
* Test resending the finish request from the coordinator when the previous attempts failed for any reason.
*/
+@WithSystemProperty(key = "IGNITE_ALWAYS_FORCE", value = "false")
public class ItDurableFinishTest extends ClusterPerTestIntegrationTest {
private static final int AWAIT_PRIMARY_REPLICA_TIMEOUT = 10;
diff --git a/modules/runner/src/main/java/org/apache/ignite/internal/app/IgniteImpl.java b/modules/runner/src/main/java/org/apache/ignite/internal/app/IgniteImpl.java
index c90b014454a..f17f670cdfa 100644
--- a/modules/runner/src/main/java/org/apache/ignite/internal/app/IgniteImpl.java
+++ b/modules/runner/src/main/java/org/apache/ignite/internal/app/IgniteImpl.java
@@ -64,6 +64,7 @@
import org.apache.ignite.internal.catalog.CatalogManager;
import org.apache.ignite.internal.catalog.CatalogManagerImpl;
import org.apache.ignite.internal.catalog.configuration.SchemaSynchronizationConfiguration;
+import org.apache.ignite.internal.catalog.descriptors.CatalogTableDescriptor;
import org.apache.ignite.internal.catalog.sql.IgniteCatalogSqlImpl;
import org.apache.ignite.internal.catalog.storage.UpdateLogImpl;
import org.apache.ignite.internal.cluster.management.ClusterInitializer;
@@ -157,8 +158,10 @@
import org.apache.ignite.internal.raft.client.TopologyAwareRaftGroupServiceFactory;
import org.apache.ignite.internal.raft.configuration.RaftConfiguration;
import org.apache.ignite.internal.raft.storage.impl.VolatileLogStorageFactoryCreator;
+import org.apache.ignite.internal.replicator.ReplicaAwareLeaseTracker;
import org.apache.ignite.internal.replicator.ReplicaManager;
import org.apache.ignite.internal.replicator.ReplicaService;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.replicator.configuration.ReplicationConfiguration;
import org.apache.ignite.internal.rest.RestComponent;
import org.apache.ignite.internal.rest.RestFactory;
@@ -592,6 +595,19 @@ public class IgniteImpl implements Ignite {
Consumer>> registry = c -> metaStorageMgr.registerRevisionUpdateListener(c::apply);
+ ReplicationConfiguration replicationConfig = clusterConfigRegistry.getConfiguration(ReplicationConfiguration.KEY);
+
+ LongSupplier partitionIdleSafeTimePropagationPeriodMsSupplier = partitionIdleSafeTimePropagationPeriodMsSupplier(replicationConfig);
+
+ LongSupplier delayDurationMsSupplier = delayDurationMsSupplier(schemaSyncConfig);
+
+ CatalogManagerImpl catalogManager = new CatalogManagerImpl(
+ new UpdateLogImpl(metaStorageMgr),
+ clockService,
+ delayDurationMsSupplier,
+ partitionIdleSafeTimePropagationPeriodMsSupplier
+ );
+
placementDriverMgr = new PlacementDriverManager(
name,
metaStorageMgr,
@@ -601,10 +617,18 @@ public class IgniteImpl implements Ignite {
logicalTopologyService,
raftMgr,
topologyAwareRaftGroupServiceFactory,
- clockService
- );
+ clockService,
+ tablePartId -> {
+ CatalogTableDescriptor tbl = catalogManager.table(tablePartId.tableId(), catalogManager.latestCatalogVersion());
- ReplicationConfiguration replicationConfig = clusterConfigRegistry.getConfiguration(ReplicationConfiguration.KEY);
+ int zoneId = tbl == null ? 2 : tbl.zoneId();
+
+ return new ZonePartitionId(
+ zoneId,
+ tablePartId.partitionId()
+ );
+ }
+ );
ReplicaService replicaSvc = new ReplicaService(
messagingServiceReturningToStorageOperationsPool,
@@ -613,8 +637,6 @@ public class IgniteImpl implements Ignite {
replicationConfig
);
- LongSupplier partitionIdleSafeTimePropagationPeriodMsSupplier = partitionIdleSafeTimePropagationPeriodMsSupplier(replicationConfig);
-
replicaMgr = new ReplicaManager(
name,
clusterSvc,
@@ -658,16 +680,6 @@ public class IgniteImpl implements Ignite {
volatileLogStorageFactoryCreator = new VolatileLogStorageFactoryCreator(name, workDir.resolve("volatile-log-spillout"));
outgoingSnapshotsManager = new OutgoingSnapshotsManager(name, clusterSvc.messagingService());
-
- LongSupplier delayDurationMsSupplier = delayDurationMsSupplier(schemaSyncConfig);
-
- CatalogManagerImpl catalogManager = new CatalogManagerImpl(
- new UpdateLogImpl(metaStorageMgr),
- clockService,
- delayDurationMsSupplier,
- partitionIdleSafeTimePropagationPeriodMsSupplier
- );
-
systemViewManager = new SystemViewManagerImpl(name, catalogManager);
nodeAttributesCollector.register(systemViewManager);
logicalTopology.addEventListener(systemViewManager);
@@ -800,13 +812,16 @@ public class IgniteImpl implements Ignite {
lowWatermark
);
+ ReplicaAwareLeaseTracker replicaAwarePlacementDriver = new ReplicaAwareLeaseTracker(placementDriverMgr.placementDriver(),
+ replicaSvc, clusterSvc.topologyService());
+
indexBuildingManager = new IndexBuildingManager(
name,
replicaSvc,
catalogManager,
metaStorageMgr,
indexManager,
- placementDriverMgr.placementDriver(),
+ replicaAwarePlacementDriver,
clusterSvc,
logicalTopologyService,
clockService
@@ -827,7 +842,7 @@ public class IgniteImpl implements Ignite {
systemViewManager,
failureProcessor,
partitionIdleSafeTimePropagationPeriodMsSupplier,
- placementDriverMgr.placementDriver(),
+ replicaAwarePlacementDriver,
clusterConfigRegistry.getConfiguration(SqlDistributedConfiguration.KEY),
nodeConfigRegistry.getConfiguration(SqlLocalConfiguration.KEY),
transactionInflights
@@ -859,7 +874,7 @@ public class IgniteImpl implements Ignite {
);
compute = new IgniteComputeImpl(
- placementDriverMgr.placementDriver(),
+ replicaAwarePlacementDriver,
clusterSvc.topologyService(),
distributedTblMgr,
computeComponent,
diff --git a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/SqlQueryProcessor.java b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/SqlQueryProcessor.java
index 160bd8fab54..7d9725ad07e 100644
--- a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/SqlQueryProcessor.java
+++ b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/SqlQueryProcessor.java
@@ -74,6 +74,7 @@
import org.apache.ignite.internal.replicator.ReplicaService;
import org.apache.ignite.internal.replicator.ReplicationGroupId;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.schema.SchemaManager;
import org.apache.ignite.internal.sql.api.ResultSetMetadataImpl;
import org.apache.ignite.internal.sql.configuration.distributed.SqlDistributedConfiguration;
@@ -408,8 +409,10 @@ private CompletableFuture> primaryReplicas(Ignite
int partitionId = partId;
ReplicationGroupId partGroupId = new TablePartitionId(table.id(), partitionId);
- CompletableFuture f = placementDriver.awaitPrimaryReplica(
- partGroupId,
+ ZonePartitionId zonePartitionId = new ZonePartitionId(table.zoneId(), table.id(), partId);
+
+ CompletableFuture f = placementDriver.awaitPrimaryReplicaForTable(
+ zonePartitionId,
clockNow,
AWAIT_PRIMARY_REPLICA_TIMEOUT,
SECONDS
diff --git a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImpl.java b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImpl.java
index 7c28ac249e7..ae70d0777a4 100644
--- a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImpl.java
+++ b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImpl.java
@@ -43,7 +43,7 @@
import org.apache.ignite.internal.cluster.management.topology.api.LogicalTopologySnapshot;
import org.apache.ignite.internal.lang.IgniteInternalException;
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.sql.engine.exec.mapping.MappingServiceImpl.LogicalTopologyHolder.TopologySnapshot;
import org.apache.ignite.internal.sql.engine.prepare.Fragment;
import org.apache.ignite.internal.sql.engine.prepare.MultiStepPlan;
@@ -114,9 +114,9 @@ public CompletableFuture> map(MultiStepPlan multiStepPlan,
/** Called when the primary replica has expired. */
public CompletableFuture onPrimaryReplicaExpired(PrimaryReplicaEventParameters parameters) {
assert parameters != null;
- assert parameters.groupId() instanceof TablePartitionId;
+ assert parameters.groupId() instanceof ZonePartitionId;
- int tabId = ((TablePartitionId) parameters.groupId()).tableId();
+ int tabId = ((ZonePartitionId) parameters.groupId()).tableId();
// TODO https://issues.apache.org/jira/browse/IGNITE-21201 Move complex computations to a different thread.
mappingsCache.removeIfValue(value -> value.tableIds.contains(tabId));
diff --git a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTable.java b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTable.java
index a7974f3c29f..c827145e736 100644
--- a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTable.java
+++ b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTable.java
@@ -72,4 +72,11 @@ public interface IgniteTable extends IgniteDataSource {
* @return Number of partitions.
*/
int partitions();
+
+ /**
+ * Returns the zone id of this table.
+ *
+ * @return Zone id.
+ */
+ int zoneId();
}
diff --git a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTableImpl.java b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTableImpl.java
index 170f23861d6..38dada9742b 100644
--- a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTableImpl.java
+++ b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/IgniteTableImpl.java
@@ -48,6 +48,8 @@ public class IgniteTableImpl extends AbstractIgniteDataSource implements IgniteT
private final int partitions;
+ private final int zoneId;
+
private final Lazy colocationColumnTypes;
/** Constructor. */
@@ -59,13 +61,15 @@ public IgniteTableImpl(
ImmutableIntList keyColumns,
Statistic statistic,
Map indexMap,
- int partitions
+ int partitions,
+ int zoneId
) {
super(name, id, version, desc, statistic);
this.keyColumns = keyColumns;
this.indexMap = indexMap;
this.partitions = partitions;
+ this.zoneId = zoneId;
this.columnsToInsert = deriveColumnsToInsert(desc);
colocationColumnTypes = new Lazy<>(this::evaluateTypes);
@@ -155,6 +159,11 @@ public int partitions() {
return partitions;
}
+ @Override
+ public int zoneId() {
+ return zoneId;
+ }
+
@Override
public ImmutableIntList keyColumns() {
return keyColumns;
diff --git a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/SqlSchemaManagerImpl.java b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/SqlSchemaManagerImpl.java
index 2ab61fbe201..5176a22fb4c 100644
--- a/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/SqlSchemaManagerImpl.java
+++ b/modules/sql-engine/src/main/java/org/apache/ignite/internal/sql/engine/schema/SqlSchemaManagerImpl.java
@@ -384,6 +384,7 @@ private static IgniteTable createTable(
) {
int tableId = catalogTableDescriptor.id();
String tableName = catalogTableDescriptor.name();
+ int zoneId = catalogTableDescriptor.zoneId();
// TODO IGNITE-19558: The table is not available at planning stage.
// Let's fix table statistics keeping in mind IGNITE-19558 issue.
@@ -402,7 +403,8 @@ private static IgniteTable createTable(
primaryIndex.collation().getKeys(),
statistic,
indexes,
- parititions
+ parititions,
+ zoneId
);
}
}
diff --git a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/ExecutableTableRegistrySelfTest.java b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/ExecutableTableRegistrySelfTest.java
index b2bc6ca5de9..672169a7c7a 100644
--- a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/ExecutableTableRegistrySelfTest.java
+++ b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/ExecutableTableRegistrySelfTest.java
@@ -158,7 +158,7 @@ CompletableFuture getTable(int tableId) {
when(descriptor.iterator()).thenReturn(Collections.emptyIterator());
IgniteTable sqlTable = new IgniteTableImpl(
- table.name(), tableId, tableVersion, descriptor, ImmutableIntList.of(0), new TestStatistic(1_000.0), Map.of(), 1
+ table.name(), tableId, tableVersion, descriptor, ImmutableIntList.of(0), new TestStatistic(1_000.0), Map.of(), 1, 123
);
when(sqlSchemaManager.table(schemaVersion, tableId)).thenReturn(sqlTable);
diff --git a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImplTest.java b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImplTest.java
index 4979f140572..43f416b9f02 100644
--- a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImplTest.java
+++ b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/mapping/MappingServiceImplTest.java
@@ -40,12 +40,12 @@
import java.util.stream.Collectors;
import org.apache.ignite.internal.catalog.Catalog;
import org.apache.ignite.internal.catalog.CatalogService;
-import org.apache.ignite.internal.catalog.descriptors.CatalogObjectDescriptor;
+import org.apache.ignite.internal.catalog.descriptors.CatalogTableDescriptor;
import org.apache.ignite.internal.cluster.management.topology.api.LogicalNode;
import org.apache.ignite.internal.cluster.management.topology.api.LogicalTopologySnapshot;
import org.apache.ignite.internal.hlc.HybridTimestamp;
import org.apache.ignite.internal.placementdriver.event.PrimaryReplicaEventParameters;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.sql.engine.framework.TestBuilders;
import org.apache.ignite.internal.sql.engine.framework.TestCluster;
import org.apache.ignite.internal.sql.engine.prepare.MultiStepPlan;
@@ -235,15 +235,14 @@ public void testCacheInvalidationOnPrimaryExpiration() {
CatalogService catalogService = cluster.catalogManager();
Catalog catalog = catalogService.catalog(catalogService.latestCatalogVersion());
- Optional tblId = catalog.tables().stream()
+ Optional tblDesc = catalog.tables().stream()
.filter(desc -> name.equals(desc.name()))
- .findFirst()
- .map(CatalogObjectDescriptor::id);
+ .findFirst();
- assertTrue(tblId.isPresent());
+ assertTrue(tblDesc.isPresent());
return new PrimaryReplicaEventParameters(
- 0, new TablePartitionId(tblId.get(), 0), "ignored", "ignored", HybridTimestamp.MIN_VALUE);
+ 0, new ZonePartitionId(tblDesc.get().zoneId(), tblDesc.get().id(), 0), "ignored", "ignored", HybridTimestamp.MIN_VALUE);
};
// Initialize mapping service.
diff --git a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/rel/TableScanNodeExecutionTest.java b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/rel/TableScanNodeExecutionTest.java
index 5c12103b02a..59cae1afa8d 100644
--- a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/rel/TableScanNodeExecutionTest.java
+++ b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/exec/rel/TableScanNodeExecutionTest.java
@@ -242,6 +242,7 @@ private static class TestInternalTableImpl extends InternalTableImpl {
super(
"test",
1,
+ 123,
PART_CNT,
new SingleClusterNodeResolver(mock(ClusterNode.class)),
txManager,
diff --git a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/framework/TestBuilders.java b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/framework/TestBuilders.java
index 4194a187f19..de0160615c7 100644
--- a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/framework/TestBuilders.java
+++ b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/framework/TestBuilders.java
@@ -807,6 +807,7 @@ private static class TableBuilderImpl implements TableBuilder {
private int size = 100_000;
private Integer tableId;
private int partitions = CatalogUtils.DEFAULT_PARTITION_COUNT;
+ private int zoneId = 123;
/** {@inheritDoc} */
@Override
@@ -929,7 +930,8 @@ public IgniteTable build() {
findPrimaryKey(tableDescriptor, indexes.values()),
new TestStatistic(size),
indexes,
- partitions
+ partitions,
+ zoneId
);
}
}
diff --git a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/prepare/TypeCoercionTest.java b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/prepare/TypeCoercionTest.java
index ce236bdf17b..3649e650d7a 100644
--- a/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/prepare/TypeCoercionTest.java
+++ b/modules/sql-engine/src/test/java/org/apache/ignite/internal/sql/engine/prepare/TypeCoercionTest.java
@@ -664,6 +664,11 @@ public int partitions() {
return 1;
}
+ @Override
+ public int zoneId() {
+ return 123;
+ }
+
@Override
public String name() {
return name;
diff --git a/modules/storage-api/src/test/java/org/apache/ignite/internal/storage/index/StorageIndexDescriptorTest.java b/modules/storage-api/src/test/java/org/apache/ignite/internal/storage/index/StorageIndexDescriptorTest.java
index ca2913f8faf..cc7b31a16b5 100644
--- a/modules/storage-api/src/test/java/org/apache/ignite/internal/storage/index/StorageIndexDescriptorTest.java
+++ b/modules/storage-api/src/test/java/org/apache/ignite/internal/storage/index/StorageIndexDescriptorTest.java
@@ -76,6 +76,7 @@ private static CatalogHashIndexDescriptor createHashIndexDescriptor(int indexId,
false,
AVAILABLE,
1,
+ 0,
List.of(COLUMN_NAME)
);
}
@@ -88,6 +89,7 @@ private static CatalogSortedIndexDescriptor createSortedIndexDescriptor(int inde
false,
AVAILABLE,
1,
+ 0,
List.of(new CatalogIndexColumnDescriptor(COLUMN_NAME, ASC_NULLS_FIRST))
);
}
diff --git a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/AbstractMvTableStorageTest.java b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/AbstractMvTableStorageTest.java
index 7b661792838..10e1d0bb45f 100644
--- a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/AbstractMvTableStorageTest.java
+++ b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/AbstractMvTableStorageTest.java
@@ -366,6 +366,7 @@ public void testDestroySortedIndexIndependence() {
false,
AVAILABLE,
catalogService.latestCatalogVersion(),
+ 0,
List.of(new CatalogIndexColumnDescriptor("STRKEY", ASC_NULLS_LAST))
);
@@ -376,6 +377,7 @@ public void testDestroySortedIndexIndependence() {
false,
AVAILABLE,
catalogService.latestCatalogVersion(),
+ 0,
List.of(new CatalogIndexColumnDescriptor("STRKEY", ASC_NULLS_LAST))
);
@@ -419,6 +421,7 @@ public void testDestroyHashIndexIndependence() {
true,
AVAILABLE,
catalogService.latestCatalogVersion(),
+ 0,
List.of("STRKEY")
);
@@ -429,6 +432,7 @@ public void testDestroyHashIndexIndependence() {
true,
AVAILABLE,
catalogService.latestCatalogVersion(),
+ 0,
List.of("STRKEY")
);
@@ -1056,6 +1060,7 @@ private static void createTestTableAndIndexes(CatalogService catalogService) {
false,
AVAILABLE,
catalogService.latestCatalogVersion(),
+ 0,
List.of(new CatalogIndexColumnDescriptor("STRKEY", ASC_NULLS_LAST))
);
@@ -1066,6 +1071,7 @@ private static void createTestTableAndIndexes(CatalogService catalogService) {
true,
AVAILABLE,
catalogService.latestCatalogVersion(),
+ 0,
List.of("STRKEY")
);
@@ -1076,6 +1082,7 @@ private static void createTestTableAndIndexes(CatalogService catalogService) {
true,
AVAILABLE,
catalogService.latestCatalogVersion(),
+ 0,
List.of(pkColumnName)
);
diff --git a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractHashIndexStorageTest.java b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractHashIndexStorageTest.java
index 7a0b604629c..3f140d41730 100644
--- a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractHashIndexStorageTest.java
+++ b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractHashIndexStorageTest.java
@@ -74,6 +74,7 @@ CatalogHashIndexDescriptor createCatalogIndexDescriptor(int tableId, int indexId
false,
AVAILABLE,
catalogService.latestCatalogVersion(),
+ 0,
Stream.of(columnTypes).map(AbstractIndexStorageTest::columnName).collect(toList())
);
diff --git a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractSortedIndexStorageTest.java b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractSortedIndexStorageTest.java
index 99b9fe07055..df1855b6d1c 100644
--- a/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractSortedIndexStorageTest.java
+++ b/modules/storage-api/src/testFixtures/java/org/apache/ignite/internal/storage/index/AbstractSortedIndexStorageTest.java
@@ -188,6 +188,7 @@ private CatalogSortedIndexDescriptor createCatalogIndexDescriptor(
false,
AVAILABLE,
catalogService.latestCatalogVersion(),
+ 0,
List.of(columns)
);
diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ItTxDistributedTestThreeNodesThreeReplicas.java b/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ItTxDistributedTestThreeNodesThreeReplicas.java
index 333ae059a90..4cd561bde70 100644
--- a/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ItTxDistributedTestThreeNodesThreeReplicas.java
+++ b/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ItTxDistributedTestThreeNodesThreeReplicas.java
@@ -76,6 +76,9 @@ public void testPrimaryReplicaDirectUpdateForExplicitTxn() throws InterruptedExc
JraftServerImpl server = (JraftServerImpl) txTestCluster.raftServers.get(leader.consistentId()).server();
var groupId = new TablePartitionId(accounts.tableId(), 0);
+ // TODO: IGNITE-20362 This needs to be done before the message blocking in order to update lease subgroups.
+ accounts.recordView().insert(null, makeValue(1, 500.0));
+
// BLock replication messages to both replicas.
server.blockMessages(new RaftNodeId(groupId, leader), (msg, peerId) -> {
if (msg instanceof RpcRequests.AppendEntriesRequest) {
diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ReplicaUnavailableTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ReplicaUnavailableTest.java
index f5f4fede066..d9843bae18f 100644
--- a/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ReplicaUnavailableTest.java
+++ b/modules/table/src/integrationTest/java/org/apache/ignite/distributed/ReplicaUnavailableTest.java
@@ -27,6 +27,7 @@
import static org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willSucceedFast;
import static org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willSucceedIn;
import static org.apache.ignite.internal.util.CompletableFutures.emptySetCompletedFuture;
+import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture;
import static org.apache.ignite.internal.util.ExceptionUtils.unwrapCause;
import static org.apache.ignite.lang.ErrorGroups.Replicator.REPLICA_TIMEOUT_ERR;
import static org.apache.ignite.raft.jraft.test.TestUtils.getLocalAddress;
@@ -35,6 +36,7 @@
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
@@ -62,6 +64,7 @@
import org.apache.ignite.internal.replicator.ReplicaResult;
import org.apache.ignite.internal.replicator.ReplicaService;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.replicator.configuration.ReplicationConfiguration;
import org.apache.ignite.internal.replicator.exception.ReplicaStoppingException;
import org.apache.ignite.internal.replicator.exception.ReplicationException;
@@ -180,6 +183,7 @@ public void testWithReplicaStartedAfterRequestSending() throws Exception {
ClusterNode clusterNode = clusterService.topologyService().localMember();
TablePartitionId tablePartitionId = new TablePartitionId(1, 1);
+ ZonePartitionId zonePartitionId = new ZonePartitionId(1, 1);
ReadWriteSingleRowReplicaRequest request = getRequest(tablePartitionId);
@@ -188,12 +192,17 @@ public void testWithReplicaStartedAfterRequestSending() throws Exception {
try {
log.info("Replica msg " + message.getClass().getSimpleName());
+ var mockRaftClient = mock(TopologyAwareRaftGroupService.class);
+ when(mockRaftClient.readIndex()).thenReturn(completedFuture(-1L));
+ when(mockRaftClient.run(any())).thenReturn(nullCompletedFuture());
+
replicaManager.startReplica(
tablePartitionId,
+ zonePartitionId,
(request0, senderId) -> completedFuture(new ReplicaResult(replicaMessageFactory.replicaResponse()
.result(5)
.build(), null)),
- mock(TopologyAwareRaftGroupService.class),
+ mockRaftClient,
new PendingComparableValuesTracker<>(0L)
);
} catch (NodeStoppingException e) {
@@ -295,16 +304,22 @@ public void testWithNotReadyReplica() {
ClusterNode clusterNode = clusterService.topologyService().localMember();
TablePartitionId tablePartitionId = new TablePartitionId(1, 1);
+ ZonePartitionId zonePartitionId = new ZonePartitionId(1, 1);
clusterService.messagingService().addMessageHandler(ReplicaMessageGroup.class, (message, sender, correlationId) -> {
runAsync(() -> {
try {
log.info("Replica msg " + message.getClass().getSimpleName());
+ var mockRaftClient = mock(TopologyAwareRaftGroupService.class);
+ when(mockRaftClient.readIndex()).thenReturn(completedFuture(-1L));
+ when(mockRaftClient.run(any())).thenReturn(nullCompletedFuture());
+
replicaManager.startReplica(
tablePartitionId,
+ zonePartitionId,
(request, senderId) -> new CompletableFuture<>(),
- mock(TopologyAwareRaftGroupService.class),
+ mockRaftClient,
new PendingComparableValuesTracker<>(0L)
);
} catch (NodeStoppingException e) {
diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java
index 341658cdf6a..bb2ec1a0036 100644
--- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java
+++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/disaster/ItDisasterRecoveryReconfigurationTest.java
@@ -374,8 +374,7 @@ private void waitForScale(IgniteImpl node, int targetDataNodesCount) throws Inte
assertTrue(IgniteTestUtils.waitForCondition(() -> {
long causalityToken = node.metaStorageManager().appliedRevision();
- long msSafeTime = node.metaStorageManager().timestampByRevision(causalityToken).longValue();
- int catalogVersion = node.catalogManager().activeCatalogVersion(msSafeTime);
+ int catalogVersion = node.catalogManager().latestCatalogVersion();
CompletableFuture> dataNodes = dzManager.dataNodes(causalityToken, catalogVersion, zoneId);
diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/raftsnapshot/ItTableRaftSnapshotsTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/raftsnapshot/ItTableRaftSnapshotsTest.java
index f95ac9cc6b9..8ace6f51702 100644
--- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/raftsnapshot/ItTableRaftSnapshotsTest.java
+++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/raftsnapshot/ItTableRaftSnapshotsTest.java
@@ -71,6 +71,7 @@
import org.apache.ignite.internal.table.distributed.schema.PartitionCommandsMarshallerImpl;
import org.apache.ignite.internal.test.WatchListenerInhibitor;
import org.apache.ignite.internal.testframework.IgniteTestUtils;
+import org.apache.ignite.internal.testframework.WithSystemProperty;
import org.apache.ignite.internal.testframework.WorkDirectory;
import org.apache.ignite.internal.testframework.log4j2.LogInspector;
import org.apache.ignite.internal.testframework.log4j2.LogInspector.Handler;
@@ -472,6 +473,7 @@ void entriesKeepAppendedDuringSnapshotInstallation() throws Exception {
* (and can install a RAFT snapshot on the ex-leader).
*/
@Test
+ @WithSystemProperty(key = "IGNITE_ALWAYS_FORCE", value = "false")
void nodeCanInstallSnapshotsAfterSnapshotInstalledToIt() throws Exception {
feedNode2WithSnapshotOfOneRow();
diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceDistributedTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceDistributedTest.java
index 0c742a98016..b2459f87e23 100644
--- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceDistributedTest.java
+++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceDistributedTest.java
@@ -25,6 +25,7 @@
import static org.apache.ignite.internal.TestDefaultProfilesNames.DEFAULT_TEST_PROFILE_NAME;
import static org.apache.ignite.internal.catalog.CatalogService.DEFAULT_SCHEMA_NAME;
import static org.apache.ignite.internal.catalog.CatalogService.DEFAULT_STORAGE_PROFILE;
+import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.getZoneIdStrict;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.REBALANCE_SCHEDULER_POOL_SIZE;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.STABLE_ASSIGNMENTS_PREFIX;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractPartitionNumber;
@@ -161,6 +162,7 @@
import org.apache.ignite.internal.replicator.ReplicaManager;
import org.apache.ignite.internal.replicator.ReplicaService;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.replicator.configuration.ReplicationConfiguration;
import org.apache.ignite.internal.rest.configuration.RestConfiguration;
import org.apache.ignite.internal.schema.SchemaManager;
@@ -675,7 +677,7 @@ void testRaftClientsUpdatesAfterRebalance() throws Exception {
// Write the new assignments to metastore as a pending assignments.
{
- TablePartitionId partId = new TablePartitionId(getTableId(node, TABLE_NAME), 0);
+ ZonePartitionId partId = new ZonePartitionId(getZoneIdStrict(node.catalogManager, ZONE_NAME, node.hybridClock.nowLong()), 0);
ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(partId);
@@ -733,7 +735,7 @@ void testClientsAreUpdatedAfterPendingRebalanceHandled() throws Exception {
Set newAssignment = Set.of(Assignment.forPeer(newNodeNameForAssignment));
// Write the new assignments to metastore as a pending assignments.
- TablePartitionId partId = new TablePartitionId(getTableId(node, TABLE_NAME), 0);
+ ZonePartitionId partId = new ZonePartitionId(getZoneIdStrict(node.catalogManager, ZONE_NAME, node.hybridClock.nowLong()), 0);
ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(partId);
@@ -800,7 +802,7 @@ private void directUpdateMetastoreRebalanceAssignmentKeys() throws Exception {
Node node0 = getNode(0);
- TablePartitionId partId = new TablePartitionId(getTableId(node0, TABLE_NAME), 0);
+ ZonePartitionId partId = new ZonePartitionId(getZoneIdStrict(node0.catalogManager, ZONE_NAME, node0.hybridClock.nowLong()), 0);
ByteArray partAssignmentsPendingKey = pendingPartAssignmentsKey(partId);
ByteArray partAssignmentsPlannedKey = plannedPartAssignmentsKey(partId);
@@ -818,7 +820,7 @@ private void verifyThatRaftNodesAndReplicasWereStartedOnlyOnce() throws Exceptio
verify(getNode(i).raftManager, timeout(AWAIT_TIMEOUT_MILLIS).times(1))
.startRaftGroupNodeWithoutService(any(), any(), any(), any(), any(RaftGroupOptions.class));
verify(getNode(i).replicaManager, timeout(AWAIT_TIMEOUT_MILLIS).times(1))
- .startReplica(any(), any(), any(), any());
+ .startReplica(any(), any(), any(), any(), any());
}
}
@@ -878,40 +880,40 @@ private static Set getPartitionClusterNodes(Node node, int partNum)
}
private static Set getPartitionClusterNodes(Node node, String tableName, int partNum) {
- return Optional.ofNullable(getTableId(node, tableName))
- .map(tableId -> partitionAssignments(node.metaStorageManager, tableId, partNum).join())
+ return Optional.ofNullable(getTableZoneId(node, tableName))
+ .map(zoneId -> partitionAssignments(node.metaStorageManager, zoneId, partNum)
+ .thenApply(a -> a == null ? Set.of() : a).join()
+ )
.orElse(Set.of());
}
private static Set getPartitionPendingClusterNodes(Node node, int partNum) {
- return Optional.ofNullable(getTableId(node, TABLE_NAME))
- .map(tableId -> partitionPendingAssignments(node.metaStorageManager, tableId, partNum).join())
- .orElse(Set.of());
+ return partitionPendingAssignments(node.metaStorageManager, getZoneId(node, ZONE_NAME), partNum)
+ .thenApply(a -> a == null ? Set.of() : a).join();
}
private static Set getPartitionPlannedClusterNodes(Node node, int partNum) {
- return Optional.ofNullable(getTableId(node, TABLE_NAME))
- .map(tableId -> partitionPlannedAssignments(node.metaStorageManager, tableId, partNum).join())
- .orElse(Set.of());
+ return partitionPlannedAssignments(node.metaStorageManager, getZoneId(node, ZONE_NAME), partNum)
+ .thenApply(a -> a == null ? Set.of() : a).join();
}
private static CompletableFuture> partitionPendingAssignments(
MetaStorageManager metaStorageManager,
- int tableId,
+ int zoneId,
int partitionNumber
) {
return metaStorageManager
- .get(pendingPartAssignmentsKey(new TablePartitionId(tableId, partitionNumber)))
+ .get(pendingPartAssignmentsKey(new ZonePartitionId(zoneId, partitionNumber)))
.thenApply(e -> (e.value() == null) ? null : Assignments.fromBytes(e.value()).nodes());
}
private static CompletableFuture> partitionPlannedAssignments(
MetaStorageManager metaStorageManager,
- int tableId,
+ int zoneId,
int partitionNumber
) {
return metaStorageManager
- .get(plannedPartAssignmentsKey(new TablePartitionId(tableId, partitionNumber)))
+ .get(plannedPartAssignmentsKey(new ZonePartitionId(zoneId, partitionNumber)))
.thenApply(e -> (e.value() == null) ? null : Assignments.fromBytes(e.value()).nodes());
}
@@ -1562,6 +1564,12 @@ private static void createTable(Node node, String zoneName, String tableName) {
return TableTestUtils.getTableId(node.catalogManager, tableName, node.hybridClock.nowLong());
}
+ private static @Nullable Integer getTableZoneId(Node node, String tableName) {
+ CatalogTableDescriptor tblDesc = TableTestUtils.getTable(node.catalogManager, tableName, node.hybridClock.nowLong());
+
+ return tblDesc == null ? null : tblDesc.zoneId();
+ }
+
private Node getNode(int nodeIndex) {
return nodes.get(nodeIndex);
}
@@ -1577,4 +1585,8 @@ private void checkPartitionNodes(String tableName, int partitionId, int expNodeC
assertEquals(expNodeCount, getPartitionClusterNodes(node, tableName, partitionId).size(), node.name);
}
}
+
+ private static int getZoneId(Node node, String zoneName) {
+ return getZoneIdStrict(node.catalogManager, zoneName, node.hybridClock.nowLong());
+ }
}
diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTest.java
index 97a81caf0ff..64c414be8c5 100644
--- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTest.java
+++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTest.java
@@ -220,8 +220,11 @@ private void waitForStableAssignmentsInMetastore(Set expectedNodes, int
Set[] lastAssignmentsHolderForLog = new Set[1];
assertTrue(waitForCondition(() -> {
+ int zoneId = cluster.aliveNode().catalogManager().table(table, cluster.aliveNode().catalogManager().latestCatalogVersion())
+ .zoneId();
+
Set assignments =
- await(partitionAssignments(cluster.aliveNode().metaStorageManager(), table, 0))
+ await(partitionAssignments(cluster.aliveNode().metaStorageManager(), zoneId, 0))
.stream()
.map(Assignment::consistentId)
.collect(Collectors.toSet());
@@ -236,8 +239,11 @@ private void waitForStableAssignmentsInMetastore(int expectedNodesNumber, int ta
Set[] lastAssignmentsHolderForLog = new Set[1];
assertTrue(waitForCondition(() -> {
+ int zoneId = cluster.aliveNode().catalogManager().table(table, cluster.aliveNode().catalogManager().latestCatalogVersion())
+ .zoneId();
+
Set assignments =
- await(partitionAssignments(cluster.aliveNode().metaStorageManager(), table, 0))
+ await(partitionAssignments(cluster.aliveNode().metaStorageManager(), zoneId, 0))
.stream()
.map(Assignment::consistentId)
.collect(Collectors.toSet());
diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTriggersRecoveryTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTriggersRecoveryTest.java
index 24357590de0..6d977a93381 100644
--- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTriggersRecoveryTest.java
+++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceTriggersRecoveryTest.java
@@ -19,8 +19,9 @@
import static org.apache.ignite.internal.TestWrappers.unwrapTableManager;
import static org.apache.ignite.internal.catalog.CatalogService.DEFAULT_STORAGE_PROFILE;
+import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.getZoneId;
+import static org.apache.ignite.internal.distributionzones.DistributionZonesTestUtil.getZoneIdStrict;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.pendingPartAssignmentsKey;
-import static org.apache.ignite.internal.table.TableTestUtils.getTableId;
import static org.apache.ignite.internal.testframework.IgniteTestUtils.bypassingThreadAssertions;
import static org.apache.ignite.internal.testframework.IgniteTestUtils.waitForCondition;
import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -38,7 +39,7 @@
import org.apache.ignite.internal.app.IgniteImpl;
import org.apache.ignite.internal.hlc.HybridClockImpl;
import org.apache.ignite.internal.metastorage.MetaStorageManager;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.storage.MvPartitionStorage;
import org.apache.ignite.internal.table.distributed.TableManager;
import org.apache.ignite.internal.test.WatchListenerInhibitor;
@@ -115,10 +116,10 @@ void testRebalanceTriggersRecoveryAfterFilterUpdate() throws InterruptedExceptio
10_000));
// Remove the pending keys in a barbarian way. So, the rebalance can be triggered only by the recovery logic now.
- Integer tableId = getTableId(node(0).catalogManager(), "TEST", new HybridClockImpl().nowLong());
+ int zoneId = getZoneIdStrict(node(0).catalogManager(), "TEST_ZONE", node(0).clock().nowLong());
node(0)
.metaStorageManager()
- .remove(pendingPartAssignmentsKey(new TablePartitionId(tableId, 0))).join();
+ .remove(pendingPartAssignmentsKey(new ZonePartitionId(zoneId, 0))).join();
restartNode(1);
restartNode(2);
@@ -160,10 +161,10 @@ void testRebalanceTriggersRecoveryAfterReplicasUpdate() throws InterruptedExcept
10_000));
// Remove the pending keys in a barbarian way. So, the rebalance can be triggered only by the recovery logic now.
- Integer tableId = getTableId(node(0).catalogManager(), "TEST", new HybridClockImpl().nowLong());
+ int zoneId = getZoneIdStrict(node(0).catalogManager(), "TEST_ZONE", node(0).clock().nowLong());
node(0)
.metaStorageManager()
- .remove(pendingPartAssignmentsKey(new TablePartitionId(tableId, 0))).join();
+ .remove(pendingPartAssignmentsKey(new ZonePartitionId(zoneId, 0))).join();
restartNode(1);
restartNode(2);
@@ -202,22 +203,22 @@ void testRebalanceTriggersRecoveryWhenUpdatesWereProcessedByAnotherNodesAlready(
(() -> getPartitionPendingClusterNodes(node(0), 0).equals(Set.of())),
10_000));
- TablePartitionId tablePartitionId =
- new TablePartitionId(
- getTableId(node(0).catalogManager(),
- "TEST",
+ ZonePartitionId zonePartitionId =
+ new ZonePartitionId(
+ getZoneIdStrict(node(0).catalogManager(),
+ "TEST_ZONE",
new HybridClockImpl().nowLong()),
0
);
long pendingsKeysRevisionBeforeRecovery = node(0).metaStorageManager()
- .get(pendingPartAssignmentsKey(tablePartitionId))
+ .get(pendingPartAssignmentsKey(zonePartitionId))
.get(10, TimeUnit.SECONDS).revision();
startNode(3, GLOBAL_NODE_BOOTSTRAP_CFG_TEMPLATE);
long pendingsKeysRevisionAfterRecovery = node(0).metaStorageManager()
- .get(pendingPartAssignmentsKey(tablePartitionId))
+ .get(pendingPartAssignmentsKey(zonePartitionId))
.get(10, TimeUnit.SECONDS).revision();
// Check that recovered node doesn't produce new rebalances for already processed triggers.
@@ -225,18 +226,18 @@ void testRebalanceTriggersRecoveryWhenUpdatesWereProcessedByAnotherNodesAlready(
}
private static Set getPartitionPendingClusterNodes(IgniteImpl node, int partNum) {
- return Optional.ofNullable(getTableId(node.catalogManager(), "TEST", new HybridClockImpl().nowLong()))
- .map(tableId -> partitionPendingAssignments(node.metaStorageManager(), tableId, partNum).join())
+ return Optional.ofNullable(getZoneId(node.catalogManager(), "TEST_ZONE", new HybridClockImpl().nowLong()))
+ .map(zoneId -> partitionPendingAssignments(node.metaStorageManager(), zoneId, partNum).join())
.orElse(Set.of());
}
private static CompletableFuture> partitionPendingAssignments(
MetaStorageManager metaStorageManager,
- int tableId,
+ int zoneId,
int partitionNumber
) {
return metaStorageManager
- .get(pendingPartAssignmentsKey(new TablePartitionId(tableId, partitionNumber)))
+ .get(pendingPartAssignmentsKey(new ZonePartitionId(zoneId, partitionNumber)))
.thenApply(e -> (e.value() == null) ? null : Assignments.fromBytes(e.value()).nodes());
}
diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItColocationTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItColocationTest.java
index 36e61a1d872..de78916c02c 100644
--- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItColocationTest.java
+++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItColocationTest.java
@@ -291,6 +291,7 @@ public CompletableFuture finish(
intTable = new InternalTableImpl(
"PUBLIC.TEST",
tblId,
+ 123,
PARTS,
new SingleClusterNodeResolver(clusterNode),
txManager,
diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTransactionPrimaryChangeTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTransactionPrimaryChangeTest.java
index 08b79a07acb..17eb7fa7476 100644
--- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTransactionPrimaryChangeTest.java
+++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTransactionPrimaryChangeTest.java
@@ -35,6 +35,7 @@
import org.apache.ignite.internal.app.IgniteImpl;
import org.apache.ignite.internal.replicator.TablePartitionId;
import org.apache.ignite.internal.table.distributed.command.UpdateCommand;
+import org.apache.ignite.internal.testframework.WithSystemProperty;
import org.apache.ignite.internal.tx.impl.ReadWriteTransactionImpl;
import org.apache.ignite.raft.jraft.rpc.WriteActionRequest;
import org.apache.ignite.table.RecordView;
@@ -105,6 +106,7 @@ protected String getNodeBootstrapConfigTemplate() {
}
@Test
+ @WithSystemProperty(key = "IGNITE_ALWAYS_FORCE", value = "false")
public void testFullTxConsistency() throws InterruptedException {
TableImpl tbl = unwrapTableImpl(node(0).tables().table(TABLE_NAME));
diff --git a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTxResourcesVacuumTest.java b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTxResourcesVacuumTest.java
index ccb91a867b8..f9b14accd35 100644
--- a/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTxResourcesVacuumTest.java
+++ b/modules/table/src/integrationTest/java/org/apache/ignite/internal/table/ItTxResourcesVacuumTest.java
@@ -34,6 +34,7 @@
import static org.apache.ignite.internal.tx.test.ItTransactionTestUtils.tableId;
import static org.apache.ignite.internal.tx.test.ItTransactionTestUtils.txId;
import static org.apache.ignite.internal.tx.test.ItTransactionTestUtils.waitAndGetPrimaryReplica;
+import static org.apache.ignite.internal.tx.test.ItTransactionTestUtils.zoneId;
import static org.apache.ignite.internal.util.IgniteUtils.shutdownAndAwaitTermination;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -57,6 +58,7 @@
import org.apache.ignite.internal.app.IgniteImpl;
import org.apache.ignite.internal.placementdriver.ReplicaMeta;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.testframework.SystemPropertiesExtension;
import org.apache.ignite.internal.testframework.WithSystemProperty;
import org.apache.ignite.internal.thread.IgniteThreadFactory;
@@ -205,7 +207,7 @@ public void testVacuum() throws InterruptedException {
int partId = partitionIdForTuple(node, TABLE_NAME, tuple, tx);
- Set nodes = partitionAssignment(node, new TablePartitionId(tableId(node, TABLE_NAME), partId));
+ Set nodes = partitionAssignment(node, new ZonePartitionId(zoneId(node, TABLE_NAME), partId));
view.upsert(tx, tuple);
view.upsert(parallelTx1, tupleForParallelTx);
@@ -296,7 +298,7 @@ public void testAbandonedTxnsAreNotVacuumizedUntilRecovered() throws Interrupted
int partId = partitionIdForTuple(anyNode(), TABLE_NAME, tuple, null);
- TablePartitionId groupId = new TablePartitionId(tableId(anyNode(), TABLE_NAME), partId);
+ ZonePartitionId groupId = new ZonePartitionId(zoneId(anyNode(), TABLE_NAME), partId);
Set txNodes = partitionAssignment(anyNode(), groupId);
@@ -380,7 +382,7 @@ public void testVacuumWithCleanupDelay() throws InterruptedException {
ReplicaMeta replicaMeta = waitAndGetPrimaryReplica(node, commitPartGrpId);
IgniteImpl commitPartitionLeaseholder = findNode(n -> n.id().equals(replicaMeta.getLeaseholderId()));
- Set commitPartNodes = partitionAssignment(node, new TablePartitionId(tableId(node, TABLE_NAME), commitPartId));
+ Set commitPartNodes = partitionAssignment(node, new ZonePartitionId(zoneId(node, TABLE_NAME), commitPartId));
log.info("Test: Commit partition [leaseholder={}, hostingNodes={}].", commitPartitionLeaseholder.name(), commitPartNodes);
@@ -484,7 +486,7 @@ public void testCommitPartitionPrimaryChangesBeforeVacuum() throws InterruptedEx
ReplicaMeta replicaMeta = waitAndGetPrimaryReplica(node, commitPartGrpId);
IgniteImpl commitPartitionLeaseholder = findNode(n -> n.id().equals(replicaMeta.getLeaseholderId()));
- Set commitPartNodes = partitionAssignment(node, new TablePartitionId(tableId(node, TABLE_NAME), commitPartId));
+ Set commitPartNodes = partitionAssignment(node, new ZonePartitionId(zoneId(node, TABLE_NAME), commitPartId));
log.info("Test: Commit partition [leaseholder={}, hostingNodes={}].", commitPartitionLeaseholder.name(), commitPartNodes);
@@ -568,7 +570,7 @@ public void testVacuumPersistentStateAfterCleanupDelayAndVolatileStateVacuum() t
ReplicaMeta replicaMeta = waitAndGetPrimaryReplica(node, commitPartGrpId);
IgniteImpl commitPartitionLeaseholder = findNode(n -> n.id().equals(replicaMeta.getLeaseholderId()));
- Set commitPartNodes = partitionAssignment(node, new TablePartitionId(tableId(node, TABLE_NAME), commitPartId));
+ Set commitPartNodes = partitionAssignment(node, new ZonePartitionId(zoneId(node, TABLE_NAME), commitPartId));
log.info("Test: Commit partition [leaseholder={}, hostingNodes={}].", commitPartitionLeaseholder.name(), commitPartNodes);
@@ -655,7 +657,7 @@ public void testRecoveryAfterPersistentStateVacuumized() throws InterruptedExcep
int commitPartId = partitionIdForTuple(commitPartitionLeaseholder, TABLE_NAME, tuple0, null);
Set commitPartitionNodes = partitionAssignment(commitPartitionLeaseholder,
- new TablePartitionId(tableId(commitPartitionLeaseholder, TABLE_NAME), commitPartId));
+ new ZonePartitionId(zoneId(commitPartitionLeaseholder, TABLE_NAME), commitPartId));
// Choose some node that doesn't host the partition as a tx coordinator.
IgniteImpl coord0 = findNode(n -> !commitPartitionNodes.contains(n.name()));
diff --git a/modules/table/src/main/java/org/apache/ignite/internal/table/InternalTable.java b/modules/table/src/main/java/org/apache/ignite/internal/table/InternalTable.java
index a59ebba8fcd..8c0fc97e920 100644
--- a/modules/table/src/main/java/org/apache/ignite/internal/table/InternalTable.java
+++ b/modules/table/src/main/java/org/apache/ignite/internal/table/InternalTable.java
@@ -83,6 +83,13 @@ public interface InternalTable extends ManuallyCloseable {
*/
int partitionId(BinaryRowEx row);
+ /**
+ * Returns the ID of the distribution zone to which this table belongs.
+ *
+ * @return Zone id.
+ */
+ int zoneId();
+
/**
* Asynchronously gets a row with same key columns values as given one from the table.
*
diff --git a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/PartitionReplicatorNodeRecovery.java b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/PartitionReplicatorNodeRecovery.java
index fc34504bd51..af06e638e9c 100644
--- a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/PartitionReplicatorNodeRecovery.java
+++ b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/PartitionReplicatorNodeRecovery.java
@@ -40,7 +40,7 @@
import org.apache.ignite.internal.network.MessagingService;
import org.apache.ignite.internal.raft.Peer;
import org.apache.ignite.internal.raft.PeersAndLearners;
-import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.storage.MvPartitionStorage;
import org.apache.ignite.internal.storage.RowId;
import org.apache.ignite.internal.storage.engine.MvTableStorage;
@@ -127,13 +127,13 @@ private void addMessageHandler() {
/**
* Returns a future that completes with a decision: should we start the corresponding group locally or not.
*
- * @param tablePartitionId ID of the table partition.
+ * @param zonePartitionId ID of the zone partition.
* @param internalTable Table we are working with.
* @param newConfiguration New configuration that is going to be applied if we'll start the group.
* @param localMemberAssignment Assignment of this node in this group.
*/
CompletableFuture shouldStartGroup(
- TablePartitionId tablePartitionId,
+ ZonePartitionId zonePartitionId,
InternalTable internalTable,
PeersAndLearners newConfiguration,
Assignment localMemberAssignment
@@ -141,7 +141,7 @@ CompletableFuture shouldStartGroup(
// If Raft is running in in-memory mode or the PDS has been cleared, we need to remove the current node
// from the Raft group in order to avoid the double vote problem.
if (mightNeedGroupRecovery(internalTable)) {
- return performGroupRecovery(tablePartitionId, newConfiguration, localMemberAssignment);
+ return performGroupRecovery(zonePartitionId, newConfiguration, localMemberAssignment, internalTable.tableId());
}
return trueCompletedFuture();
@@ -154,12 +154,12 @@ private static boolean mightNeedGroupRecovery(InternalTable internalTable) {
}
private CompletableFuture performGroupRecovery(
- TablePartitionId tablePartitionId,
+ ZonePartitionId zonePartitionId,
PeersAndLearners newConfiguration,
- Assignment localMemberAssignment
+ Assignment localMemberAssignment,
+ int tableId
) {
- int tableId = tablePartitionId.tableId();
- int partId = tablePartitionId.partitionId();
+ int partId = zonePartitionId.partitionId();
// No majority and not a full partition restart - need to 'remove, then add' nodes
// with current partition.
@@ -174,7 +174,7 @@ private CompletableFuture performGroupRecovery(
boolean majorityAvailable = dataNodesCount >= (newConfiguration.peers().size() / 2) + 1;
if (majorityAvailable) {
- RebalanceUtilEx.startPeerRemoval(tablePartitionId, localMemberAssignment, metaStorageManager);
+ RebalanceUtilEx.startPeerRemoval(zonePartitionId, localMemberAssignment, metaStorageManager);
return false;
} else {
diff --git a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/TableManager.java b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/TableManager.java
index 273d92b2720..e58cacfcadb 100644
--- a/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/TableManager.java
+++ b/modules/table/src/main/java/org/apache/ignite/internal/table/distributed/TableManager.java
@@ -35,7 +35,7 @@
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.PENDING_ASSIGNMENTS_PREFIX;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.STABLE_ASSIGNMENTS_PREFIX;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractPartitionNumber;
-import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractTableId;
+import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.extractZoneId;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.intersect;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.partitionAssignmentsGetLocally;
import static org.apache.ignite.internal.distributionzones.rebalance.RebalanceUtil.pendingPartAssignmentsKey;
@@ -154,9 +154,11 @@
import org.apache.ignite.internal.raft.service.RaftGroupListener;
import org.apache.ignite.internal.raft.service.RaftGroupService;
import org.apache.ignite.internal.raft.storage.impl.LogStorageFactoryCreator;
+import org.apache.ignite.internal.replicator.ReplicaAwareLeaseTracker;
import org.apache.ignite.internal.replicator.ReplicaManager;
import org.apache.ignite.internal.replicator.ReplicaService;
import org.apache.ignite.internal.replicator.TablePartitionId;
+import org.apache.ignite.internal.replicator.ZonePartitionId;
import org.apache.ignite.internal.schema.SchemaManager;
import org.apache.ignite.internal.schema.SchemaRegistry;
import org.apache.ignite.internal.schema.configuration.GcConfiguration;
@@ -510,7 +512,10 @@ public TableManager(
this.nodeName = nodeName;
this.executorInclinedSchemaSyncService = new ExecutorInclinedSchemaSyncService(schemaSyncService, partitionOperationsExecutor);
- this.executorInclinedPlacementDriver = new ExecutorInclinedPlacementDriver(placementDriver, partitionOperationsExecutor);
+ this.executorInclinedPlacementDriver = new ExecutorInclinedPlacementDriver(
+ new ReplicaAwareLeaseTracker(placementDriver, replicaSvc, topologyService),
+ partitionOperationsExecutor
+ );
TxMessageSender txMessageSender = new TxMessageSender(
messagingService,
@@ -697,12 +702,12 @@ private CompletableFuture onTableCreate(CreateTableEventParameters para
* Writes the set of assignments to meta storage. If there are some assignments already, gets them from meta storage. Returns
* the list of assignments that really are in meta storage.
*
- * @param tableId Table id.
+ * @param zoneId Zone id.
* @param assignmentsFuture Assignments future, to get the assignments that should be written.
* @return Real list of assignments.
*/
- public CompletableFuture> writeTableAssignmentsToMetastore(
- int tableId,
+ public CompletableFuture> writeZoneAssignmentsToMetastore(
+ int zoneId,
CompletableFuture> assignmentsFuture
) {
return assignmentsFuture.thenCompose(newAssignments -> {
@@ -711,7 +716,7 @@ public CompletableFuture> writeTableAssignmentsToMetastore(
List partitionAssignments = new ArrayList<>(newAssignments.size());
for (int i = 0; i < newAssignments.size(); i++) {
- ByteArray stableAssignmentsKey = stablePartAssignmentsKey(new TablePartitionId(tableId, i));
+ ByteArray stableAssignmentsKey = stablePartAssignmentsKey(new ZonePartitionId(zoneId, i));
byte[] anAssignment = newAssignments.get(i).toBytes();
Operation op = put(stableAssignmentsKey, anAssignment);
partitionAssignments.add(op);
@@ -738,15 +743,15 @@ public CompletableFuture> writeTableAssignmentsToMetastore(
if (invokeResult) {
LOG.info(
"Assignments calculated from data nodes are successfully written to meta storage"
- + " [tableId={}, assignments={}].",
- tableId,
+ + " [zoneId={}, assignments={}].",
+ zoneId,
Assignments.assignmentListToString(newAssignments)
);
return completedFuture(newAssignments);
} else {
Set partKeys = IntStream.range(0, newAssignments.size())
- .mapToObj(p -> stablePartAssignmentsKey(new TablePartitionId(tableId, p)))
+ .mapToObj(p -> stablePartAssignmentsKey(new ZonePartitionId(zoneId, p)))
.collect(toSet());
CompletableFuture> resFuture = metaStorageMgr.getAll(partKeys);
@@ -755,7 +760,7 @@ public CompletableFuture> writeTableAssignmentsToMetastore(
List realAssignments = new ArrayList<>();
for (int p = 0; p < newAssignments.size(); p++) {
- var partId = new TablePartitionId(tableId, p);
+ var partId = new ZonePartitionId(zoneId, p);
Entry assignmentsEntry = metaStorageAssignments.get(stablePartAssignmentsKey(partId));
assert assignmentsEntry != null && !assignmentsEntry.empty() && !assignmentsEntry.tombstone()
@@ -767,8 +772,8 @@ public CompletableFuture> writeTableAssignmentsToMetastore(
}
LOG.info(
- "Assignments picked up from meta storage [tableId={}, assignments={}].",
- tableId,
+ "Assignments picked up from meta storage [zoneId={}, assignments={}].",
+ zoneId,
Assignments.assignmentListToString(realAssignments)
);
@@ -778,7 +783,7 @@ public CompletableFuture> writeTableAssignmentsToMetastore(
})
.handle((realAssignments, e) -> {
if (e != null) {
- LOG.error("Couldn't get assignments from metastore for table [tableId={}].", e, tableId);
+ LOG.error("Couldn't get assignments from metastore for table [zoneId={}].", e, zoneId);
throw ExceptionUtils.sneakyThrow(e);
}
@@ -852,6 +857,8 @@ private CompletableFuture startLocalPartitionsAndClients(
for (int i = 0; i < partitions; i++) {
int partId = i;
+ LOG.info("Start partition " + new TablePartitionId(tableId, i));
+
CompletableFuture> future = startPartitionAndStartClient(
table,
partId,
@@ -932,10 +939,13 @@ private CompletableFuture startPartitionAndStartClient(
CompletableFuture startGroupFut;
+ // TODO: revisit this for the in-memory (volatile storage) case — group recovery/start logic may differ there.
+ ZonePartitionId zonePartitionId = new ZonePartitionId(zoneId, partId);
+
if (localMemberAssignment != null) {
CompletableFuture shouldStartGroupFut = isRecovery
? partitionReplicatorNodeRecovery.shouldStartGroup(
- replicaGrpId,
+ zonePartitionId,
internalTbl,
newConfiguration,
localMemberAssignment
@@ -957,7 +967,7 @@ private CompletableFuture startPartitionAndStartClient(
try {
startPartitionRaftGroupNode(
- replicaGrpId,
+ zonePartitionId,
raftNodeId,
newConfiguration,
safeTimeTracker,
@@ -965,8 +975,7 @@ private CompletableFuture startPartitionAndStartClient(
table,
partitionStorages.getTxStateStorage(),
partitionDataStorage,
- partitionUpdateHandlers,
- zoneId
+ partitionUpdateHandlers
);
return true;
@@ -1010,6 +1019,7 @@ private CompletableFuture startPartitionAndStartClient(
try {
startReplicaWithNewListener(
replicaGrpId,
+ zonePartitionId,
table,
safeTimeTracker,
storageIndexTracker,
@@ -1037,6 +1047,7 @@ private CompletableFuture startPartitionAndStartClient(
private void startReplicaWithNewListener(
TablePartitionId replicaGrpId,
+ ZonePartitionId zonePartitionId,
TableImpl table,
PendingComparableValuesTracker safeTimeTracker,
PendingComparableValuesTracker storageIndexTracker,
@@ -1046,7 +1057,7 @@ private void startReplicaWithNewListener(
TopologyAwareRaftGroupService raftGroupService
) throws NodeStoppingException {
PartitionReplicaListener listener = createReplicaListener(
- replicaGrpId,
+ replicaGrpId.partitionId(),
table,
safeTimeTracker,
mvPartitionStorage,
@@ -1057,6 +1068,7 @@ private void startReplicaWithNewListener(
replicaMgr.startReplica(
replicaGrpId,
+ zonePartitionId,
listener,
raftGroupService,
storageIndexTracker
@@ -1064,7 +1076,7 @@ private void startReplicaWithNewListener(
}
private PartitionReplicaListener createReplicaListener(
- TablePartitionId tablePartitionId,
+ int partId,
TableImpl table,
PendingComparableValuesTracker safeTimeTracker,
MvPartitionStorage mvPartitionStorage,
@@ -1072,8 +1084,7 @@ private PartitionReplicaListener createReplicaListener(
PartitionUpdateHandlers partitionUpdateHandlers,
RaftGroupService raftClient
) {
- int tableId = tablePartitionId.tableId();
- int partId = tablePartitionId.partitionId();
+ int tableId = table.tableId();
return new PartitionReplicaListener(
mvPartitionStorage,
@@ -1265,7 +1276,7 @@ private CompletableFuture> createTableLocally(
boolean onNodeRecovery
) {
return inBusyLockAsync(busyLock, () -> {
- int tableId = tableDescriptor.id();
+ int zoneId = tableDescriptor.zoneId();
// Retrieve descriptor during synchronous call, before the previous catalog version could be concurrently compacted.
CatalogZoneDescriptor zoneDescriptor = getZoneDescriptor(tableDescriptor, catalogVersion);
@@ -1278,7 +1289,7 @@ private CompletableFuture> createTableLocally(
);
CompletableFuture> assignmentsFutureAfterInvoke =
- writeTableAssignmentsToMetastore(tableId, assignmentsFuture);
+ writeZoneAssignmentsToMetastore(zoneId, assignmentsFuture);
return createTableLocally(
causalityToken,
@@ -1327,6 +1338,7 @@ private CompletableFuture createTableLocally(
InternalTableImpl internalTable = new InternalTableImpl(
tableName,
tableId,
+ zoneDescriptor.id(),
partitions,
topologyService,
txManager,
@@ -1416,12 +1428,12 @@ private CompletableFuture> getOrCreateAssignments(
long causalityToken,
int catalogVersion
) {
- int tableId = tableDescriptor.id();
+ int zoneId = tableDescriptor.zoneId();
CompletableFuture> assignmentsFuture;
- if (partitionAssignmentsGetLocally(metaStorageMgr, tableId, 0, causalityToken) != null) {
+ if (partitionAssignmentsGetLocally(metaStorageMgr, zoneId, 0, causalityToken) != null) {
assignmentsFuture = completedFuture(
- tableAssignmentsGetLocally(metaStorageMgr, tableId, zoneDescriptor.partitions(), causalityToken));
+ tableAssignmentsGetLocally(metaStorageMgr, zoneId, zoneDescriptor.partitions(), causalityToken));
} else {
assignmentsFuture = distributionZoneManager.dataNodes(causalityToken, catalogVersion, zoneDescriptor.id())
.thenApply(dataNodes -> AffinityUtils.calculateAssignments(
@@ -1431,9 +1443,9 @@ private CompletableFuture> getOrCreateAssignments(
).stream().map(Assignments::of).collect(toList()));
assignmentsFuture.thenAccept(assignmentsList -> LOG.info(
- "Assignments calculated from data nodes [table={}, tableId={}, assignments={}, revision={}]",
+ "Assignments calculated from data nodes [table={}, zoneId={}, assignments={}, revision={}]",
tableDescriptor.name(),
- tableId,
+ zoneId,
Assignments.assignmentListToString(assignmentsList),
causalityToken
));
@@ -1489,6 +1501,7 @@ protected TxStateTableStorage createTxStateTableStorage(CatalogTableDescriptor t
*/
private CompletableFuture destroyTableLocally(int tableId) {
TableImpl table = startedTables.remove(tableId);
+
localPartsByTableId.remove(tableId);
assert table != null : tableId;
@@ -1496,12 +1509,6 @@ private CompletableFuture destroyTableLocally(int tableId) {
InternalTable internalTable = table.internalTable();
int partitions = internalTable.partitions();
- // TODO https://issues.apache.org/jira/browse/IGNITE-18991 Move assigment manipulations to Distribution zones.
- Set assignmentKeys = IntStream.range(0, partitions)
- .mapToObj(p -> stablePartAssignmentsKey(new TablePartitionId(tableId, p)))
- .collect(toSet());
- metaStorageMgr.removeAll(assignmentKeys);
-
CompletableFuture>[] stopReplicaFutures = new CompletableFuture>[partitions];
// TODO https://issues.apache.org/jira/browse/IGNITE-19170 Partitions should be stopped on the assignments change
@@ -1796,15 +1803,23 @@ private CompletableFuture handleChangePendingAssignmentEvent(
}
int partId = extractPartitionNumber(pendingAssignmentsEntry.key());
- int tblId = extractTableId(pendingAssignmentsEntry.key(), PENDING_ASSIGNMENTS_PREFIX);
+ int zoneId = extractZoneId(pendingAssignmentsEntry.key(), PENDING_ASSIGNMENTS_PREFIX);
- var replicaGrpId = new TablePartitionId(tblId, partId);
+ var zonePartitionId = new ZonePartitionId(zoneId, partId);
// Stable assignments from the meta store, which revision is bounded by the current pending event.
- Entry stableAssignmentsEntry = metaStorageMgr.getLocally(stablePartAssignmentsKey(replicaGrpId), revision);
+ Entry stableAssignmentsEntry = metaStorageMgr.getLocally(stablePartAssignmentsKey(zonePartitionId), revision);
Assignments pendingAssignments = Assignments.fromBytes(pendingAssignmentsEntry.value());
+ HybridTimestamp msSafeTime = metaStorageMgr.timestampByRevision(revision);
+
+ int catalogVersion = catalogService.activeCatalogVersion(msSafeTime.longValue());
+
+ Set tablesInZone = findTablesByZoneId(zoneId, catalogVersion, catalogService).stream()
+ .map(CatalogObjectDescriptor::id)
+ .collect(toSet());
+
return tablesVv.get(revision)
.thenApply(ignore -> {
if (!busyLock.enterBusy()) {
@@ -1812,42 +1827,47 @@ private CompletableFuture handleChangePendingAssignmentEvent(
}
try {
- TableImpl table = tables.get(tblId);
-
- // Table can be null only recovery, because we use a revision from the future. See comment inside
- // performRebalanceOnRecovery.
- if (table == null) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Skipping Pending Assignments update, because table {} does not exist", tblId);
- }
-
- return CompletableFutures.nullCompletedFuture();
- }
-
- if (LOG.isInfoEnabled()) {
- var stringKey = new String(pendingAssignmentsEntry.key(), UTF_8);
-
- LOG.info("Received update on pending assignments. Check if new raft group should be started"
- + " [key={}, partition={}, table={}, localMemberAddress={}, pendingAssignments={}]",
- stringKey, partId, table.name(), localNode().address(), pendingAssignments);
- }
-
Set stableAssignments = stableAssignmentsEntry.value() == null
? emptySet()
: Assignments.fromBytes(stableAssignmentsEntry.value()).nodes();
- return setTablesPartitionCountersForRebalance(replicaGrpId, revision, pendingAssignments.force())
- .thenCompose(r ->
- handleChangePendingAssignmentEvent(
- replicaGrpId,
- table,
- pendingAssignments,
- stableAssignments,
- revision,
- isRecovery
- )
- )
- .thenCompose(v -> changePeersOnRebalance(table, replicaGrpId, pendingAssignments.nodes(), revision));
+ return setTablesPartitionCountersForRebalance(zonePartitionId, revision, pendingAssignments.force())
+ .thenCompose(r -> {
+ List> tableFutures = new ArrayList<>(tables.size());
+
+ for (int tableId : tablesInZone) {
+ TableImpl table = tables.get(tableId);
+
+ if (LOG.isInfoEnabled()) {
+ var stringKey = new String(pendingAssignmentsEntry.key(), UTF_8);
+
+ LOG.info("Received update on pending assignments. Check if new raft group should be started"
+ + " [key={}, partition={}, table={}, "
+ + "localMemberAddress={}, pendingAssignments={}]",
+ stringKey, partId, table.name(), localNode().address(), pendingAssignments);
+ }
+
+ tableFutures.add(
+ handleChangePendingAssignmentEventForTable(
+ zonePartitionId,
+ table,
+ pendingAssignments,
+ stableAssignments,
+ revision,
+ isRecovery
+ ).thenCompose(
+ v -> changePeersOnRebalance(
+ table,
+ zonePartitionId,
+ pendingAssignments.nodes(),
+ revision
+ )
+ )
+ );
+ }
+
+ return allOf(tableFutures.toArray(CompletableFuture[]::new));
+ });
} finally {
busyLock.leaveBusy();
}
@@ -1855,8 +1875,8 @@ private CompletableFuture handleChangePendingAssignmentEvent(
.thenCompose(identity());
}
- private CompletableFuture handleChangePendingAssignmentEvent(
- TablePartitionId replicaGrpId,
+ private CompletableFuture handleChangePendingAssignmentEventForTable(
+ ZonePartitionId zonePartitionId,
TableImpl tbl,
Assignments pendingAssignments,
Set stableAssignments,
@@ -1864,7 +1884,10 @@ private CompletableFuture handleChangePendingAssignmentEvent(
boolean isRecovery
) {
ClusterNode localMember = localNode();
- RaftNodeId raftNodeId = new RaftNodeId(replicaGrpId, new Peer(localNode().name()));
+ RaftNodeId raftNodeId = new RaftNodeId(
+ new TablePartitionId(tbl.tableId(), zonePartitionId.partitionId()),
+ new Peer(localNode().name())
+ );
boolean pendingAssignmentsAreForced = pendingAssignments.force();
Set pendingAssignmentsNodes = pendingAssignments.nodes();
@@ -1876,9 +1899,7 @@ private CompletableFuture handleChangePendingAssignmentEvent(
CompletableFuture localServicesStartFuture;
- int tableId = tbl.tableId();
-
- int zoneId = getTableDescriptor(tableId, catalogService.latestCatalogVersion()).zoneId();
+ int zoneId = zonePartitionId.zoneId();
// This is a set of assignments for nodes that are not the part of stable assignments, i.e. unstable part of the distribution.
// For regular pending assignments we use (old) stable set, so that none of new nodes would be able to propose itself as a leader.
@@ -1897,7 +1918,7 @@ private CompletableFuture handleChangePendingAssignmentEvent(
Assignments nonStableNodeAssignmentsFinal = nonStableNodeAssignments;
- int partitionId = replicaGrpId.partitionId();
+ int partitionId = zonePartitionId.partitionId();
if (shouldStartLocalGroupNode) {
PartitionSet singlePartitionIdSet = PartitionSet.of(partitionId);
@@ -1921,7 +1942,7 @@ private CompletableFuture handleChangePendingAssignmentEvent(
return startPartitionAndStartClient(
tbl,
- replicaGrpId.partitionId(),
+ zonePartitionId.partitionId(),
pendingAssignments,
nonStableNodeAssignmentsFinal,
zoneId,
@@ -1945,21 +1966,17 @@ private CompletableFuture handleChangePendingAssignmentEvent(
tbl.internalTable()
.tableRaftService()
- .partitionRaftGroupService(partitionId)
+ .partitionRaftGroupService(zonePartitionId.partitionId())
.updateConfiguration(configurationFromAssignments(cfg));
}, ioExecutor);
}
- private CompletableFuture setTablesPartitionCountersForRebalance(TablePartitionId replicaGrpId, long revision, boolean force) {
+ private CompletableFuture setTablesPartitionCountersForRebalance(ZonePartitionId zonePartitionId, long revision, boolean force) {
int catalogVersion = catalogService.latestCatalogVersion();
- int tableId = replicaGrpId.tableId();
+ int zoneId = zonePartitionId.zoneId();
- CatalogZoneDescriptor zoneDescriptor = getZoneDescriptor(getTableDescriptor(tableId, catalogVersion), catalogVersion);
-
- int zoneId = zoneDescriptor.id();
-
- int partId = replicaGrpId.partitionId();
+ int partId = zonePartitionId.partitionId();
SimpleCondition revisionMatches = revision(tablesCounterKey(zoneId, partId)).lt(revision);
SimpleCondition counterIsEmpty = value(tablesCounterKey(zoneId, partId)).eq(toBytes(Set.of()));
@@ -2003,7 +2020,7 @@ private CompletableFuture setTablesPartitionCountersForRebalance(TablePart
private CompletableFuture changePeersOnRebalance(
TableImpl table,
- TablePartitionId replicaGrpId,
+ ZonePartitionId replicaGrpId,
Set pendingAssignments,
long revision
) {
@@ -2055,7 +2072,7 @@ private CompletableFuture changePeersOnRebalance(
}
private void startPartitionRaftGroupNode(
- TablePartitionId replicaGrpId,
+ ZonePartitionId zonePartitionId,
RaftNodeId raftNodeId,
PeersAndLearners stableConfiguration,
PendingComparableValuesTracker