Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Flink: Reverting the default custom partitioner for bucket column #8848

Merged
merged 4 commits into from Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -525,13 +525,7 @@ private DataStream<RowData> distributeDataStream(
+ "and table is unpartitioned");
return input;
} else {
if (BucketPartitionerUtil.hasOneBucketField(partitionSpec)) {
return input.partitionCustom(
new BucketPartitioner(partitionSpec),
new BucketPartitionKeySelector(partitionSpec, iSchema, flinkRowType));
} else {
return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType));
}
return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType));
}
} else {
if (partitionSpec.isUnpartitioned()) {
Expand Down
Expand Up @@ -114,13 +114,19 @@ private void appendRowsToTable(List<RowData> allRows) throws Exception {
new BoundedTestSource<>(
allRows.stream().map(converter::toExternal).toArray(Row[]::new)),
ROW_TYPE_INFO)
.map(converter::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE));
.map(converter::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE))
.partitionCustom(
new BucketPartitioner(table.spec()),
new BucketPartitionKeySelector(
table.spec(),
table.schema(),
FlinkSink.toFlinkRowType(table.schema(), SimpleDataUtil.FLINK_SCHEMA)));

FlinkSink.forRowData(dataStream)
.table(table)
.tableLoader(tableLoader)
.writeParallelism(parallelism)
.distributionMode(DistributionMode.HASH)
.distributionMode(DistributionMode.NONE)
.append();

env.execute("Test Iceberg DataStream");
Expand Down Expand Up @@ -157,26 +163,6 @@ public void testSendRecordsToAllBucketsEvenly(TableSchemaType tableSchemaType) t
}
}

/**
* Verifies the BucketPartitioner is not used when the PartitionSpec has more than 1 bucket, and
* that it should fallback to input.keyBy
*/
@ParameterizedTest
@EnumSource(value = TableSchemaType.class, names = "TWO_BUCKETS")
public void testMultipleBucketsFallback(TableSchemaType tableSchemaType) throws Exception {
setupEnvironment(tableSchemaType);
List<RowData> rows = generateTestDataRows();

appendRowsToTable(rows);
TableTestStats stats = extractPartitionResults(tableSchemaType);

Assertions.assertThat(stats.totalRowCount).isEqualTo(rows.size());
for (int i = 0, j = numBuckets; i < numBuckets; i++, j++) {
// Only 1 file per bucket will be created when falling back to input.keyBy(...)
Assertions.assertThat((int) stats.numFilesPerBucket.get(i)).isEqualTo(1);
}
}

/**
* Generating 16 rows to be sent uniformly to all writers (round-robin across 8 writers -> 4
* buckets)
Expand Down
Expand Up @@ -525,13 +525,7 @@ private DataStream<RowData> distributeDataStream(
+ "and table is unpartitioned");
return input;
} else {
if (BucketPartitionerUtil.hasOneBucketField(partitionSpec)) {
return input.partitionCustom(
new BucketPartitioner(partitionSpec),
new BucketPartitionKeySelector(partitionSpec, iSchema, flinkRowType));
} else {
return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType));
}
return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType));
}
} else {
if (partitionSpec.isUnpartitioned()) {
Expand Down
Expand Up @@ -114,13 +114,19 @@ private void appendRowsToTable(List<RowData> allRows) throws Exception {
new BoundedTestSource<>(
allRows.stream().map(converter::toExternal).toArray(Row[]::new)),
ROW_TYPE_INFO)
.map(converter::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE));
.map(converter::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE))
.partitionCustom(
new BucketPartitioner(table.spec()),
new BucketPartitionKeySelector(
table.spec(),
table.schema(),
FlinkSink.toFlinkRowType(table.schema(), SimpleDataUtil.FLINK_SCHEMA)));

FlinkSink.forRowData(dataStream)
.table(table)
.tableLoader(tableLoader)
.writeParallelism(parallelism)
.distributionMode(DistributionMode.HASH)
.distributionMode(DistributionMode.NONE)
.append();

env.execute("Test Iceberg DataStream");
Expand Down Expand Up @@ -157,26 +163,6 @@ public void testSendRecordsToAllBucketsEvenly(TableSchemaType tableSchemaType) t
}
}

/**
* Verifies the BucketPartitioner is not used when the PartitionSpec has more than 1 bucket, and
* that it should fallback to input.keyBy
*/
@ParameterizedTest
@EnumSource(value = TableSchemaType.class, names = "TWO_BUCKETS")
public void testMultipleBucketsFallback(TableSchemaType tableSchemaType) throws Exception {
setupEnvironment(tableSchemaType);
List<RowData> rows = generateTestDataRows();

appendRowsToTable(rows);
TableTestStats stats = extractPartitionResults(tableSchemaType);

Assertions.assertThat(stats.totalRowCount).isEqualTo(rows.size());
for (int i = 0, j = numBuckets; i < numBuckets; i++, j++) {
// Only 1 file per bucket will be created when falling back to input.keyBy(...)
Assertions.assertThat((int) stats.numFilesPerBucket.get(i)).isEqualTo(1);
}
}

/**
* Generating 16 rows to be sent uniformly to all writers (round-robin across 8 writers -> 4
* buckets)
Expand Down
Expand Up @@ -525,13 +525,7 @@ private DataStream<RowData> distributeDataStream(
+ "and table is unpartitioned");
return input;
} else {
if (BucketPartitionerUtil.hasOneBucketField(partitionSpec)) {
return input.partitionCustom(
new BucketPartitioner(partitionSpec),
new BucketPartitionKeySelector(partitionSpec, iSchema, flinkRowType));
} else {
return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType));
}
return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType));
}
} else {
if (partitionSpec.isUnpartitioned()) {
Expand Down
Expand Up @@ -114,13 +114,19 @@ private void appendRowsToTable(List<RowData> allRows) throws Exception {
new BoundedTestSource<>(
allRows.stream().map(converter::toExternal).toArray(Row[]::new)),
ROW_TYPE_INFO)
.map(converter::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE));
.map(converter::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE))
.partitionCustom(
new BucketPartitioner(table.spec()),
new BucketPartitionKeySelector(
table.spec(),
table.schema(),
FlinkSink.toFlinkRowType(table.schema(), SimpleDataUtil.FLINK_SCHEMA)));

FlinkSink.forRowData(dataStream)
.table(table)
.tableLoader(tableLoader)
.writeParallelism(parallelism)
.distributionMode(DistributionMode.HASH)
.distributionMode(DistributionMode.NONE)
.append();

env.execute("Test Iceberg DataStream");
Expand Down Expand Up @@ -157,26 +163,6 @@ public void testSendRecordsToAllBucketsEvenly(TableSchemaType tableSchemaType) t
}
}

/**
* Verifies the BucketPartitioner is not used when the PartitionSpec has more than 1 bucket, and
* that it should fallback to input.keyBy
*/
@ParameterizedTest
@EnumSource(value = TableSchemaType.class, names = "TWO_BUCKETS")
public void testMultipleBucketsFallback(TableSchemaType tableSchemaType) throws Exception {
setupEnvironment(tableSchemaType);
List<RowData> rows = generateTestDataRows();

appendRowsToTable(rows);
TableTestStats stats = extractPartitionResults(tableSchemaType);

Assertions.assertThat(stats.totalRowCount).isEqualTo(rows.size());
for (int i = 0, j = numBuckets; i < numBuckets; i++, j++) {
// Only 1 file per bucket will be created when falling back to input.keyBy(...)
Assertions.assertThat((int) stats.numFilesPerBucket.get(i)).isEqualTo(1);
}
}

/**
* Generating 16 rows to be sent uniformly to all writers (round-robin across 8 writers -> 4
* buckets)
Expand Down