[HUDI-6863] Revert auto-tuning of dedup parallelism (#9722)
Before this PR, the auto-tuning logic for dedup parallelism dictated the write parallelism, so the user-configured `hoodie.upsert.shuffle.parallelism` was ignored. This commit reverts #6802 to fix the issue.
yihua committed Sep 16, 2023
1 parent ead5171 commit ea8f925
Showing 2 changed files with 5 additions and 8 deletions.
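For illustration of the issue described in the commit message, here is a minimal, self-contained sketch (hypothetical values; the class and variable names below are not part of this commit). With the clamp in place, any configured parallelism above the input partition count was silently reduced; after the revert the configured value is handed to reduceByKey unchanged.

// Hypothetical illustration of the clamp removed by this commit. Hudi performed
// this inside deduplicateRecords(), not in a standalone class like this one.
public class DedupParallelismSketch {
  public static void main(String[] args) {
    int configuredParallelism = 200; // e.g. hoodie.upsert.shuffle.parallelism = 200
    int numInputPartitions = 8;      // e.g. records.getNumPartitions() for a small input

    // Before the revert: reduce parallelism was clamped to the input partition
    // count, so the configured 200 effectively became 8.
    int autoTuned = Math.max(1, Math.min(numInputPartitions, configuredParallelism));

    // After the revert: the configured value is passed to reduceByKey as-is.
    int reverted = configuredParallelism;

    System.out.println("auto-tuned reduce parallelism: " + autoTuned); // prints 8
    System.out.println("reverted reduce parallelism:   " + reverted);  // prints 200
  }
}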
@@ -60,9 +60,6 @@ public HoodieData<HoodieRecord<T>> deduplicateRecords(
       HoodieData<HoodieRecord<T>> records, HoodieIndex<?, ?> index, int parallelism, String schemaStr, TypedProperties props, HoodieRecordMerger merger) {
     boolean isIndexingGlobal = index.isGlobal();
     final SerializableSchema schema = new SerializableSchema(schemaStr);
-    // Auto-tunes the parallelism for reduce transformation based on the number of data partitions
-    // in engine-specific representation
-    int reduceParallelism = Math.max(1, Math.min(records.getNumPartitions(), parallelism));
     return records.mapToPair(record -> {
       HoodieKey hoodieKey = record.getKey();
       // If index used is global, then records are expected to differ in their partitionPath
@@ -74,14 +71,14 @@ public HoodieData<HoodieRecord<T>> deduplicateRecords(
     }).reduceByKey((rec1, rec2) -> {
       HoodieRecord<T> reducedRecord;
       try {
-        reducedRecord = merger.merge(rec1, schema.get(), rec2, schema.get(), props).get().getLeft();
+        reducedRecord = merger.merge(rec1, schema.get(), rec2, schema.get(), props).get().getLeft();
       } catch (IOException e) {
         throw new HoodieException(String.format("Error to merge two records, %s, %s", rec1, rec2), e);
       }
       boolean choosePrev = rec1.getData().equals(reducedRecord.getData());
       HoodieKey reducedKey = choosePrev ? rec1.getKey() : rec2.getKey();
       HoodieOperation operation = choosePrev ? rec1.getOperation() : rec2.getOperation();
       return reducedRecord.newInstance(reducedKey, operation);
-    }, reduceParallelism).map(Pair::getRight);
+    }, parallelism).map(Pair::getRight);
   }
 }
@@ -479,12 +479,12 @@ private void testDeduplication(
     // Global dedup should be done based on recordKey only
     HoodieIndex index = mock(HoodieIndex.class);
     when(index.isGlobal()).thenReturn(true);
-    int dedupParallelism = records.getNumPartitions() + 100;
+    int dedupParallelism = records.getNumPartitions() + 2;
     HoodieData<HoodieRecord<RawTripTestPayload>> dedupedRecsRdd =
         (HoodieData<HoodieRecord<RawTripTestPayload>>) HoodieWriteHelper.newInstance()
             .deduplicateRecords(records, index, dedupParallelism, writeConfig.getSchema(), writeConfig.getProps(), HoodiePreCombineAvroRecordMerger.INSTANCE);
     List<HoodieRecord<RawTripTestPayload>> dedupedRecs = dedupedRecsRdd.collectAsList();
-    assertEquals(records.getNumPartitions(), dedupedRecsRdd.getNumPartitions());
+    assertEquals(dedupParallelism, dedupedRecsRdd.getNumPartitions());
     assertEquals(1, dedupedRecs.size());
     assertEquals(dedupedRecs.get(0).getPartitionPath(), recordThree.getPartitionPath());
     assertNodupesWithinPartition(dedupedRecs);
@@ -496,7 +496,7 @@ private void testDeduplication(
         (HoodieData<HoodieRecord<RawTripTestPayload>>) HoodieWriteHelper.newInstance()
             .deduplicateRecords(records, index, dedupParallelism, writeConfig.getSchema(), writeConfig.getProps(), HoodiePreCombineAvroRecordMerger.INSTANCE);
     dedupedRecs = dedupedRecsRdd.collectAsList();
-    assertEquals(records.getNumPartitions(), dedupedRecsRdd.getNumPartitions());
+    assertEquals(dedupParallelism, dedupedRecsRdd.getNumPartitions());
     assertEquals(2, dedupedRecs.size());
     assertNodupesWithinPartition(dedupedRecs);

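As a usage note, here is a hedged sketch of a Spark datasource upsert that sets the parallelism in question (table name, path, and values are illustrative; the option keys are standard Hudi write configs). Per the commit message, with this revert in place the configured value is what the dedup step receives as its `parallelism` and hands to reduceByKey.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

public class UpsertParallelismExample {
  // Illustrative upsert write with an explicit upsert shuffle parallelism;
  // the dataset and base path are assumed to exist.
  public static void upsert(Dataset<Row> df, String basePath) {
    df.write()
        .format("hudi")
        .option("hoodie.table.name", "trips")                // hypothetical table name
        .option("hoodie.datasource.write.operation", "upsert")
        .option("hoodie.upsert.shuffle.parallelism", "200")  // honored again after this revert
        .mode(SaveMode.Append)
        .save(basePath);
  }
}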
