apache · zhuqi-lucas · Apr 15, 2021
diff --git a/...urcemanager/scheduler/placement/ResourceUsageWithPartialShuffleMultiNodeLookupPolicy.java b/...urcemanager/scheduler/placement/ResourceUsageWithPartialShuffleMultiNodeLookupPolicy.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.scheduler.placement;
+
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
+
+import java.util.Comparator;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentSkipListSet;
+
+/**
+ * <p>
+ * This class has the following functionality:
+ *
+ * <p>
+ * ResourceUsageWithPartialShuffleMultiNodeLookupPolicy
+ * holds sorted nodes list based on the
+ * resource usage of nodes at given time.
+ * Also, in order to prevent hot accessing of a node with multi-thread
+ * scheduling, nodes are shuffled within windows of SHUFFLE_INTERVAL (10).
+ * See YARN-10738 for details.
+ * </p>
+ */
+public class ResourceUsageWithPartialShuffleMultiNodeLookupPolicy
+    <N extends SchedulerNode> implements MultiNodeLookupPolicy<N> {
+
+  /** Sorted, unmodifiable node snapshot per partition name. */
+  private final Map<String, Set<N>> nodesPerPartition =
+      new ConcurrentHashMap<>();
+  /** Orders nodes by allocated resource, breaking ties by node id. */
+  private final Comparator<N> comparator;
+  /** Number of consecutive sorted nodes shuffled together as one window. */
+  private static final int SHUFFLE_INTERVAL = 10;
+
+  public ResourceUsageWithPartialShuffleMultiNodeLookupPolicy() {
+    this.comparator = (o1, o2) -> {
+      int allocatedDiff = o1.getAllocatedResource()
+          .compareTo(o2.getAllocatedResource());
+      return allocatedDiff != 0 ? allocatedDiff
+          : o1.getNodeID().compareTo(o2.getNodeID());
+    };
+  }
+
+  /**
+   * Returns an iterator over a copy of the partition's sorted nodes in which
+   * every window of SHUFFLE_INTERVAL consecutive nodes, including the final
+   * shorter window, is shuffled so concurrent scheduling threads do not all
+   * start from the same node.
+   *
+   * @param nodes unused; the snapshot stored for the partition is iterated
+   * @param partition partition name whose nodes are requested
+   * @return iterator over the partially shuffled copy
+   */
+  @Override
+  public Iterator<N> getPreferredNodeIterator(Collection<N> nodes,
+      String partition) {
+    List<N> list = new ArrayList<N>(getNodesPerPartition(partition));
+    int size = list.size();
+    for (int start = 0; start < size; start += SHUFFLE_INTERVAL) {
+      int end = Math.min(start + SHUFFLE_INTERVAL, size);
+      Collections.shuffle(list.subList(start, end));
+    }
+    return list.iterator();
+  }
+
+  /** Re-sorts the given nodes and replaces the partition's snapshot. */
+  @Override
+  public void addAndRefreshNodesSet(Collection<N> nodes,
+      String partition) {
+    Set<N> nodeList = new ConcurrentSkipListSet<N>(comparator);
+    nodeList.addAll(nodes);
+    nodesPerPartition.put(partition, Collections.unmodifiableSet(nodeList));
+  }
+
+  /** Returns the partition's sorted nodes, or an empty set if unknown. */
+  @Override
+  public Set<N> getNodesPerPartition(String partition) {
+    return nodesPerPartition.getOrDefault(partition, Collections.emptySet());
+  }
+
+  public Map<String, Set<N>> getNodesPerPartition() {
+    return nodesPerPartition;
+  }
+
+  public Comparator<N> getComparator() {
+    return comparator;
+  }
+}
diff --git a/...adoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerMultiNodes.java b/...adoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerMultiNodes.java
@@ -60,7 +60,10 @@ public class TestCapacitySchedulerMultiNodes extends CapacitySchedulerTestBase {
   private CapacitySchedulerConfiguration conf;
   private static final String POLICY_CLASS_NAME =
       "org.apache.hadoop.yarn.server.resourcemanager.scheduler.placement.ResourceUsageMultiNodeLookupPolicy";
-
+  private static final String POLICY_CLASS_NAME_WITH_SHUFFLE =
+      "org.apache.hadoop.yarn.server.resourcemanager." +
+          "scheduler.placement." +
+          "ResourceUsageWithPartialShuffleMultiNodeLookupPolicy";
   @Before
   public void setUp() {
     CapacitySchedulerConfiguration config =
@@ -105,6 +108,40 @@ public void testMultiNodeSorterForScheduling() throws Exception {
     rm.stop();
   }
 
+  /** Verifies the shuffled iterator yields every sorted node exactly once. */
+  @Test
+  public void testResourceUsageWithPartialShuffleMultiNodeLookupPolicy()
+      throws Exception {
+    String policyName = CapacitySchedulerConfiguration
+        .MULTI_NODE_SORTING_POLICY_NAME + ".resource-based" + ".class";
+    conf.set(policyName, POLICY_CLASS_NAME_WITH_SHUFFLE);
+    MockRM rm = new MockRM(conf);
+    rm.start();
+    for (int i = 0; i < 1000; ++i) {
+      rm.registerNode("127.0.0.1:" + i, 10 * GB);
+    }
+    ResourceScheduler scheduler = rm.getRMContext().getScheduler();
+    waitforNMRegistered(scheduler, 1000, 5);
+    MultiNodeSortingManager<SchedulerNode> mns = rm.getRMContext()
+        .getMultiNodeSortingManager();
+    MultiNodeSorter<SchedulerNode> sorter = mns
+        .getMultiNodePolicy(POLICY_CLASS_NAME_WITH_SHUFFLE);
+    sorter.reSortClusterNodes();
+    Set<SchedulerNode> nodes = sorter.getMultiNodeLookupPolicy()
+        .getNodesPerPartition("");
+    Assert.assertEquals(1000, nodes.size());
+    Iterator<SchedulerNode> list = sorter.getMultiNodeLookupPolicy().
+        getPreferredNodeIterator(null, "");
+    Set<SchedulerNode> seen = new java.util.HashSet<>();
+    while (list.hasNext()) {
+      Assert.assertTrue(seen.add(list.next()));
+    }
+    // Shuffling may reorder nodes but must neither drop nor invent any.
+    Assert.assertEquals(1000, seen.size());
+    Assert.assertTrue(nodes.containsAll(seen));
+    rm.stop();
+  }
+
   @Test
   public void testMultiNodeSorterForSchedulingWithOrdering() throws Exception {
     MockRM rm = new MockRM(conf);