Skip to content

Commit 9f21d4c

Browse files
[Bug] [Connector-V2] Fix hive source connector parallelism not work (#2823)
* [Bug][Connector-V2] Hive Source Connector parallelism not work * [Bug][Connector-V2] Fix code style * [Bug][Connector-V2] Fix assigned splits
1 parent 32c1c81 commit 9f21d4c

File tree

2 files changed

+36
-34
lines changed

2 files changed

+36
-34
lines changed

seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/BaseFileSourceReader.java

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
import java.util.Set;
3232

3333
public class BaseFileSourceReader implements SourceReader<SeaTunnelRow, FileSourceSplit> {
34-
private static final long THREAD_WAIT_TIME = 500L;
3534
private final ReadStrategy readStrategy;
3635
private final HadoopConf hadoopConf;
3736
private final SourceReader.Context context;
@@ -56,10 +55,6 @@ public void close() throws IOException {
5655

5756
@Override
5857
public void pollNext(Collector<SeaTunnelRow> output) throws Exception {
59-
if (sourceSplits.isEmpty()) {
60-
Thread.sleep(THREAD_WAIT_TIME);
61-
return;
62-
}
6358
sourceSplits.forEach(source -> {
6459
try {
6560
readStrategy.read(source.splitId(), output);

seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/split/FileSourceSplitEnumerator.java

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,18 @@
1818
package org.apache.seatunnel.connectors.seatunnel.file.source.split;
1919

2020
import org.apache.seatunnel.api.source.SourceSplitEnumerator;
21-
import org.apache.seatunnel.common.config.Common;
2221
import org.apache.seatunnel.connectors.seatunnel.file.source.state.FileSourceState;
2322

23+
import lombok.extern.slf4j.Slf4j;
24+
2425
import java.io.IOException;
2526
import java.util.ArrayList;
26-
import java.util.Collection;
27-
import java.util.Collections;
28-
import java.util.HashMap;
2927
import java.util.HashSet;
3028
import java.util.List;
31-
import java.util.Map;
3229
import java.util.Set;
30+
import java.util.stream.Collectors;
3331

32+
@Slf4j
3433
public class FileSourceSplitEnumerator implements SourceSplitEnumerator<FileSourceSplit, FileSourceState> {
3534
private final Context<FileSourceSplit> context;
3635
private Set<FileSourceSplit> pendingSplit;
@@ -40,6 +39,7 @@ public class FileSourceSplitEnumerator implements SourceSplitEnumerator<FileSour
4039
public FileSourceSplitEnumerator(SourceSplitEnumerator.Context<FileSourceSplit> context, List<String> filePaths) {
4140
this.context = context;
4241
this.filePaths = filePaths;
42+
this.assignedSplit = new HashSet<>();
4343
}
4444

4545
public FileSourceSplitEnumerator(SourceSplitEnumerator.Context<FileSourceSplit> context, List<String> filePaths,
@@ -50,51 +50,59 @@ public FileSourceSplitEnumerator(SourceSplitEnumerator.Context<FileSourceSplit>
5050

5151
@Override
5252
public void open() {
53-
this.assignedSplit = new HashSet<>();
5453
this.pendingSplit = new HashSet<>();
5554
}
5655

5756
@Override
5857
public void run() {
59-
pendingSplit = getHiveFileSplit();
60-
assignSplit(context.registeredReaders());
58+
// do nothing
6159
}
6260

63-
private Set<FileSourceSplit> getHiveFileSplit() {
64-
Set<FileSourceSplit> hiveSourceSplits = new HashSet<>();
65-
filePaths.forEach(k -> hiveSourceSplits.add(new FileSourceSplit(k)));
66-
return hiveSourceSplits;
67-
61+
private Set<FileSourceSplit> getFileSplit() {
62+
Set<FileSourceSplit> fileSourceSplits = new HashSet<>();
63+
filePaths.forEach(k -> fileSourceSplits.add(new FileSourceSplit(k)));
64+
return fileSourceSplits;
6865
}
6966

7067
@Override
7168
public void close() throws IOException {
72-
69+
// do nothing
7370
}
7471

7572
@Override
7673
public void addSplitsBack(List<FileSourceSplit> splits, int subtaskId) {
7774
if (!splits.isEmpty()) {
7875
pendingSplit.addAll(splits);
79-
assignSplit(Collections.singletonList(subtaskId));
76+
assignSplit(subtaskId);
8077
}
8178
}
8279

83-
private void assignSplit(Collection<Integer> taskIDList) {
84-
Map<Integer, List<FileSourceSplit>> readySplit = new HashMap<>(Common.COLLECTION_SIZE);
85-
for (int taskID : taskIDList) {
86-
readySplit.computeIfAbsent(taskID, id -> new ArrayList<>());
80+
private void assignSplit(int taskId) {
81+
ArrayList<FileSourceSplit> currentTaskSplits = new ArrayList<>();
82+
if (context.currentParallelism() == 1) {
83+
// if parallelism == 1, we should assign all the splits to reader
84+
currentTaskSplits.addAll(pendingSplit);
85+
} else {
86+
// if parallelism > 1, according to hashCode of split's id to determine whether to allocate the current task
87+
for (FileSourceSplit fileSourceSplit : pendingSplit) {
88+
int splitOwner = getSplitOwner(fileSourceSplit.splitId(), context.currentParallelism());
89+
if (splitOwner == taskId) {
90+
currentTaskSplits.add(fileSourceSplit);
91+
}
92+
}
8793
}
88-
89-
pendingSplit.forEach(s -> readySplit.get(getSplitOwner(s.splitId(), taskIDList.size()))
90-
.add(s));
91-
readySplit.forEach(context::assignSplit);
92-
assignedSplit.addAll(pendingSplit);
93-
pendingSplit.clear();
94+
// assign splits
95+
context.assignSplit(taskId, currentTaskSplits);
96+
// save the state of assigned splits
97+
assignedSplit.addAll(currentTaskSplits);
98+
// remove the assigned splits from pending splits
99+
currentTaskSplits.forEach(split -> pendingSplit.remove(split));
100+
log.info("SubTask {} is assigned to [{}]", taskId, currentTaskSplits.stream().map(FileSourceSplit::splitId).collect(Collectors.joining(",")));
101+
context.signalNoMoreSplits(taskId);
94102
}
95103

96104
private static int getSplitOwner(String tp, int numReaders) {
97-
return tp.hashCode() % numReaders;
105+
return Math.abs(tp.hashCode()) % numReaders;
98106
}
99107

100108
@Override
@@ -104,9 +112,8 @@ public int currentUnassignedSplitSize() {
104112

105113
@Override
106114
public void registerReader(int subtaskId) {
107-
if (!pendingSplit.isEmpty()) {
108-
assignSplit(Collections.singletonList(subtaskId));
109-
}
115+
pendingSplit = getFileSplit();
116+
assignSplit(subtaskId);
110117
}
111118

112119
@Override

0 commit comments

Comments
 (0)