From d8568c4f9d43b9cd0b9cb47e6f8a9ddd8a57ec1c Mon Sep 17 00:00:00 2001 From: Zhu Zhu Date: Sun, 19 Apr 2020 08:58:53 +0800 Subject: [PATCH] [FLINK-17014][runtime] Implement PipelinedRegionSchedulingStrategy --- .../PipelinedRegionSchedulingStrategy.java | 162 ++++++++++++++++++ .../strategy/SchedulingStrategyUtils.java | 12 ++ ...PipelinedRegionSchedulingStrategyTest.java | 162 ++++++++++++++++++ .../TestingSchedulingResultPartition.java | 4 + 4 files changed, 340 insertions(+) create mode 100644 flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/strategy/PipelinedRegionSchedulingStrategy.java create mode 100644 flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/strategy/PipelinedRegionSchedulingStrategyTest.java diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/strategy/PipelinedRegionSchedulingStrategy.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/strategy/PipelinedRegionSchedulingStrategy.java new file mode 100644 index 00000000000000..6f0400ae98279e --- /dev/null +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/strategy/PipelinedRegionSchedulingStrategy.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.scheduler.strategy; + +import org.apache.flink.runtime.execution.ExecutionState; +import org.apache.flink.runtime.io.network.partition.ResultPartitionType; +import org.apache.flink.runtime.jobgraph.IntermediateDataSetID; +import org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID; +import org.apache.flink.runtime.scheduler.DeploymentOption; +import org.apache.flink.runtime.scheduler.ExecutionVertexDeploymentOption; +import org.apache.flink.runtime.scheduler.SchedulerOperations; +import org.apache.flink.util.IterableUtils; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.flink.util.Preconditions.checkNotNull; +import static org.apache.flink.util.Preconditions.checkState; + +/** + * {@link SchedulingStrategy} instance which schedules tasks in granularity of pipelined regions. + */ +public class PipelinedRegionSchedulingStrategy implements SchedulingStrategy { + + private final SchedulerOperations schedulerOperations; + + private final SchedulingTopology schedulingTopology; + + private final DeploymentOption deploymentOption = new DeploymentOption(false); + + /** Result partitions are correlated if they have the same result id. 
 */ + private final Map<IntermediateDataSetID, Set<SchedulingResultPartition>> correlatedResultPartitions = new HashMap<>(); + + private final Map<IntermediateResultPartitionID, Set<SchedulingPipelinedRegion>> partitionConsumerRegions = new HashMap<>(); + + public PipelinedRegionSchedulingStrategy( + final SchedulerOperations schedulerOperations, + final SchedulingTopology schedulingTopology) { + + this.schedulerOperations = checkNotNull(schedulerOperations); + this.schedulingTopology = checkNotNull(schedulingTopology); + + init(); + } + + private void init() { + for (SchedulingPipelinedRegion region : schedulingTopology.getAllPipelinedRegions()) { + for (SchedulingResultPartition partition : region.getConsumedResults()) { + checkState(partition.getResultType() == ResultPartitionType.BLOCKING); + + partitionConsumerRegions.computeIfAbsent(partition.getId(), pid -> new HashSet<>()).add(region); + correlatedResultPartitions.computeIfAbsent(partition.getResultId(), rid -> new HashSet<>()).add(partition); + } + } + } + + @Override + public void startScheduling() { + final Set<SchedulingPipelinedRegion> sourceRegions = IterableUtils + .toStream(schedulingTopology.getAllPipelinedRegions()) + .filter(region -> !region.getConsumedResults().iterator().hasNext()) + .collect(Collectors.toSet()); + maybeScheduleRegions(sourceRegions); + } + + @Override + public void restartTasks(final Set<ExecutionVertexID> verticesToRestart) { + final Set<SchedulingPipelinedRegion> regionsToRestart = verticesToRestart.stream() + .map(schedulingTopology::getPipelinedRegionOfVertex) + .collect(Collectors.toSet()); + maybeScheduleRegions(regionsToRestart); + } + + @Override + public void onExecutionStateChange(final ExecutionVertexID executionVertexId, final ExecutionState executionState) { + if (executionState == ExecutionState.FINISHED) { + final Set<SchedulingResultPartition> finishedPartitions = IterableUtils + .toStream(schedulingTopology.getVertex(executionVertexId).getProducedResults()) + .filter(partition -> partitionConsumerRegions.containsKey(partition.getId())) + .filter(partition -> partition.getState() == ResultPartitionState.CONSUMABLE) + .flatMap(partition -> 
correlatedResultPartitions.get(partition.getResultId()).stream()) + .collect(Collectors.toSet()); + + final Set<SchedulingPipelinedRegion> consumerRegions = finishedPartitions.stream() + .flatMap(partition -> partitionConsumerRegions.get(partition.getId()).stream()) + .collect(Collectors.toSet()); + maybeScheduleRegions(consumerRegions); + } + } + + @Override + public void onPartitionConsumable(final IntermediateResultPartitionID resultPartitionId) { + } + + private void maybeScheduleRegions(final Set<SchedulingPipelinedRegion> regions) { + final List<SchedulingPipelinedRegion> regionsSorted = + SchedulingStrategyUtils.sortPipelinedRegionsInTopologicalOrder(schedulingTopology, regions); + for (SchedulingPipelinedRegion region : regionsSorted) { + maybeScheduleRegion(region); + } + } + + private void maybeScheduleRegion(final SchedulingPipelinedRegion region) { + if (!areRegionInputsAllConsumable(region)) { + return; + } + + checkState(areRegionVerticesAllInCreatedState(region), "BUG: trying to schedule a region which is not in CREATED state"); + + final Set<ExecutionVertexID> verticesToSchedule = IterableUtils.toStream(region.getVertices()) + .map(SchedulingExecutionVertex::getId) + .collect(Collectors.toSet()); + final List<ExecutionVertexDeploymentOption> vertexDeploymentOptions = + SchedulingStrategyUtils.createExecutionVertexDeploymentOptionsInTopologicalOrder( + schedulingTopology, + verticesToSchedule, + id -> deploymentOption); + schedulerOperations.allocateSlotsAndDeploy(vertexDeploymentOptions); + } + + private boolean areRegionInputsAllConsumable(final SchedulingPipelinedRegion region) { + return IterableUtils.toStream(region.getConsumedResults()) + .allMatch(partition -> partition.getState() == ResultPartitionState.CONSUMABLE); + } + + private boolean areRegionVerticesAllInCreatedState(final SchedulingPipelinedRegion region) { + return IterableUtils.toStream(region.getVertices()) + .allMatch(vertex -> vertex.getState() == ExecutionState.CREATED); + } + + /** + * The factory for creating {@link PipelinedRegionSchedulingStrategy}. 
+ */ + public static class Factory implements SchedulingStrategyFactory { + @Override + public SchedulingStrategy createInstance( + final SchedulerOperations schedulerOperations, + final SchedulingTopology schedulingTopology) { + return new PipelinedRegionSchedulingStrategy(schedulerOperations, schedulingTopology); + } + } +} diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/strategy/SchedulingStrategyUtils.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/strategy/SchedulingStrategyUtils.java index d24cbd0a5f167e..fb41db9f9b7513 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/strategy/SchedulingStrategyUtils.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/strategy/SchedulingStrategyUtils.java @@ -60,4 +60,16 @@ static List<ExecutionVertexDeploymentOption> createExecutionVertexDeploymentOpti deploymentOptionRetriever.apply(executionVertexID))) .collect(Collectors.toList()); } + + static List<SchedulingPipelinedRegion> sortPipelinedRegionsInTopologicalOrder( + final SchedulingTopology topology, + final Set<SchedulingPipelinedRegion> regions) { + + return IterableUtils.toStream(topology.getVertices()) + .map(SchedulingExecutionVertex::getId) + .map(topology::getPipelinedRegionOfVertex) + .filter(regions::contains) + .distinct() + .collect(Collectors.toList()); + } } diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/strategy/PipelinedRegionSchedulingStrategyTest.java b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/strategy/PipelinedRegionSchedulingStrategyTest.java new file mode 100644 index 00000000000000..e8ed38fb903263 --- /dev/null +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/strategy/PipelinedRegionSchedulingStrategyTest.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.scheduler.strategy; + +import org.apache.flink.runtime.execution.ExecutionState; +import org.apache.flink.runtime.io.network.partition.ResultPartitionType; +import org.apache.flink.runtime.scheduler.ExecutionVertexDeploymentOption; +import org.apache.flink.util.TestLogger; + +import org.junit.Before; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.lessThanOrEqualTo; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThat; + +/** + * Unit tests for {@link PipelinedRegionSchedulingStrategy}. 
+ */ +public class PipelinedRegionSchedulingStrategyTest extends TestLogger { + + private TestingSchedulerOperations testingSchedulerOperation; + + private int parallelism = 2; + + private TestingSchedulingTopology testingSchedulingTopology; + + private List<TestingSchedulingExecutionVertex> source; + + private List<TestingSchedulingExecutionVertex> map; + + private List<TestingSchedulingExecutionVertex> sink; + + @Before + public void setUp() { + testingSchedulerOperation = new TestingSchedulerOperations(); + + buildTopology(); + } + + private void buildTopology() { + testingSchedulingTopology = new TestingSchedulingTopology(); + + source = testingSchedulingTopology.addExecutionVertices().withParallelism(parallelism).finish(); + map = testingSchedulingTopology.addExecutionVertices().withParallelism(parallelism).finish(); + sink = testingSchedulingTopology.addExecutionVertices().withParallelism(parallelism).finish(); + + testingSchedulingTopology.connectPointwise(source, map) + .withResultPartitionState(ResultPartitionState.CREATED) + .withResultPartitionType(ResultPartitionType.PIPELINED_BOUNDED) + .finish(); + testingSchedulingTopology.connectAllToAll(map, sink) + .withResultPartitionState(ResultPartitionState.CREATED) + .withResultPartitionType(ResultPartitionType.BLOCKING) + .finish(); + + testingSchedulingTopology.generatePipelinedRegions(); + } + + @Test + public void testStartScheduling() { + startScheduling(testingSchedulingTopology); + + final List<List<TestingSchedulingExecutionVertex>> expectedScheduledVertices = new ArrayList<>(); + expectedScheduledVertices.add(Arrays.asList(source.get(0), map.get(0))); + expectedScheduledVertices.add(Arrays.asList(source.get(1), map.get(1))); + assertLatestScheduledVerticesAreEqualTo(expectedScheduledVertices); + } + + @Test + public void testRestartTasks() { + final PipelinedRegionSchedulingStrategy schedulingStrategy = startScheduling(testingSchedulingTopology); + + final Set<ExecutionVertexID> verticesToRestart = Stream.of(source, map, sink) + .flatMap(List::stream) + .map(TestingSchedulingExecutionVertex::getId) + .collect(Collectors.toSet()); + + 
schedulingStrategy.restartTasks(verticesToRestart); + + final List<List<TestingSchedulingExecutionVertex>> expectedScheduledVertices = new ArrayList<>(); + expectedScheduledVertices.add(Arrays.asList(source.get(0), map.get(0))); + expectedScheduledVertices.add(Arrays.asList(source.get(1), map.get(1))); + assertLatestScheduledVerticesAreEqualTo(expectedScheduledVertices); + } + + @Test + public void testNotifyingBlockingResultPartitionProducerFinished() { + final PipelinedRegionSchedulingStrategy schedulingStrategy = startScheduling(testingSchedulingTopology); + + final TestingSchedulingExecutionVertex map1 = map.get(0); + map1.getProducedResults().iterator().next().setState(ResultPartitionState.CONSUMABLE); + schedulingStrategy.onExecutionStateChange(map1.getId(), ExecutionState.FINISHED); + + // sinks' inputs are not all consumable yet so they are not scheduled + assertThat(testingSchedulerOperation.getScheduledVertices(), hasSize(2)); + + final TestingSchedulingExecutionVertex map2 = map.get(1); + map2.getProducedResults().iterator().next().setState(ResultPartitionState.CONSUMABLE); + schedulingStrategy.onExecutionStateChange(map2.getId(), ExecutionState.FINISHED); + + assertThat(testingSchedulerOperation.getScheduledVertices(), hasSize(4)); + + final List<List<TestingSchedulingExecutionVertex>> expectedScheduledVertices = new ArrayList<>(); + expectedScheduledVertices.add(Arrays.asList(sink.get(0))); + expectedScheduledVertices.add(Arrays.asList(sink.get(1))); + assertLatestScheduledVerticesAreEqualTo(expectedScheduledVertices); + } + + private PipelinedRegionSchedulingStrategy startScheduling(TestingSchedulingTopology testingSchedulingTopology) { + final PipelinedRegionSchedulingStrategy schedulingStrategy = new PipelinedRegionSchedulingStrategy( + testingSchedulerOperation, + testingSchedulingTopology); + schedulingStrategy.startScheduling(); + return schedulingStrategy; + } + + private void assertLatestScheduledVerticesAreEqualTo(final List<List<TestingSchedulingExecutionVertex>> expected) { + final List<List<ExecutionVertexDeploymentOption>> deploymentOptions = testingSchedulerOperation.getScheduledVertices(); 
+ final int expectedScheduledBulks = expected.size(); + assertThat(expectedScheduledBulks, lessThanOrEqualTo(deploymentOptions.size())); + for (int i = 0; i < expectedScheduledBulks; i++) { + assertEquals( + idsFromVertices(expected.get(expectedScheduledBulks - i - 1)), + idsFromDeploymentOptions(deploymentOptions.get(deploymentOptions.size() - i - 1))); + } + } + + private static List<ExecutionVertexID> idsFromVertices(final List<TestingSchedulingExecutionVertex> vertices) { + return vertices.stream().map(TestingSchedulingExecutionVertex::getId).collect(Collectors.toList()); + } + + private static List<ExecutionVertexID> idsFromDeploymentOptions( + final List<ExecutionVertexDeploymentOption> deploymentOptions) { + + return deploymentOptions.stream().map(ExecutionVertexDeploymentOption::getExecutionVertexId).collect(Collectors.toList()); + } +} diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/strategy/TestingSchedulingResultPartition.java b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/strategy/TestingSchedulingResultPartition.java index 55fabcf2fab797..8691a9b5fc78c6 100644 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/strategy/TestingSchedulingResultPartition.java +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/strategy/TestingSchedulingResultPartition.java @@ -90,6 +90,10 @@ void setProducer(TestingSchedulingExecutionVertex producer) { this.producer = checkNotNull(producer); } + void setState(ResultPartitionState state) { + this.state = state; + } + /** * Builder for {@link TestingSchedulingResultPartition}. */