New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-22537][core] Aggregation of map output statistics on driver faces single point bottleneck #19763
[SPARK-22537][core] Aggregation of map output statistics on driver faces single point bottleneck #19763
Changes from 8 commits
5dd0487
819774f
da02825
2735b30
8501970
4dafb19
3419dfa
da147d7
8a1719d
055d44c
72c3d97
c9c26ce
0f87dd6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,11 +23,14 @@ import java.util.zip.{GZIPInputStream, GZIPOutputStream} | |
|
||
import scala.collection.JavaConverters._ | ||
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Map} | ||
import scala.concurrent.{ExecutionContext, Future} | ||
import scala.concurrent.duration.Duration | ||
import scala.reflect.ClassTag | ||
import scala.util.control.NonFatal | ||
|
||
import org.apache.spark.broadcast.{Broadcast, BroadcastManager} | ||
import org.apache.spark.internal.Logging | ||
import org.apache.spark.internal.config._ | ||
import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEndpointRef, RpcEnv} | ||
import org.apache.spark.scheduler.MapStatus | ||
import org.apache.spark.shuffle.MetadataFetchFailedException | ||
|
@@ -472,16 +475,45 @@ private[spark] class MapOutputTrackerMaster( | |
shuffleStatuses.get(shuffleId).map(_.findMissingPartitions()) | ||
} | ||
|
||
/**
 * Partition the index range [0, num) into `divisor` contiguous slices whose
 * sizes differ by at most one element; the larger slices come last.
 */
def equallyDivide(num: Int, divisor: Int): Iterator[Seq[Int]] = {
  assert(divisor > 0, "Divisor should be positive")
  val quotient = num / divisor
  val remainder = num % divisor
  // The first (divisor - remainder) slices hold `quotient` indices each;
  // the remaining `remainder` slices hold one extra index apiece.
  val (front, back) = (0 until num).splitAt((divisor - remainder) * quotient)
  if (quotient == 0) {
    // Fewer indices than slices: every non-empty slice is a singleton.
    back.grouped(quotient + 1)
  } else {
    front.grouped(quotient) ++ back.grouped(quotient + 1)
  }
}
|
||
/**
 * Return statistics about all of the outputs for a given shuffle.
 *
 * The per-reduce-partition sizes are summed over all map statuses. When the
 * amount of work (number of map statuses * number of reduce partitions) is at
 * most SHUFFLE_MAP_OUTPUT_STATISTICS_PARALLEL_AGGREGATION_THRESHOLD, the
 * aggregation runs serially on the caller's thread; otherwise the reduce
 * partitions are split across a temporary thread pool so the driver does not
 * become a single-point bottleneck for very large shuffles.
 */
def getStatistics(dep: ShuffleDependency[_, _, _]): MapOutputStatistics = {
  shuffleStatuses(dep.shuffleId).withMapStatuses { statuses =>
    val totalSizes = new Array[Long](dep.partitioner.numPartitions)
    val parallelAggThreshold = conf.get(
      SHUFFLE_MAP_OUTPUT_STATISTICS_PARALLEL_AGGREGATION_THRESHOLD)
    // Use Long arithmetic: statuses.length * totalSizes.length can overflow Int
    // for large shuffles, which would silently disable the parallel path.
    if (statuses.length.toLong * totalSizes.length <= parallelAggThreshold) {
      for (s <- statuses) {
        for (i <- 0 until totalSizes.length) {
          totalSizes(i) += s.getSizeForBlock(i)
        }
      }
    } else {
      val parallelism = conf.get(SHUFFLE_MAP_OUTPUT_STATISTICS_CORES)
      val threadPool =
        ThreadUtils.newDaemonFixedThreadPool(parallelism, "map-output-statistics")
      try {
        val executionContext = ExecutionContext.fromExecutor(threadPool)
        // Each future owns a disjoint set of reduce ids, so the concurrent
        // writes to totalSizes never touch the same slot.
        val mapStatusSubmitTasks = equallyDivide(totalSizes.length, parallelism).map {
          reduceIds => Future {
            for (s <- statuses; i <- reduceIds) {
              totalSizes(i) += s.getSizeForBlock(i)
            }
          } (executionContext)
        }
        ThreadUtils.awaitResult(Future.sequence(mapStatusSubmitTasks), Duration.Inf)
      } finally {
        // Without this the daemon pool leaks its threads on every call.
        threadPool.shutdown()
      }
    }
    new MapOutputStatistics(dep.shuffleId, totalSizes)
  }
}
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -485,4 +485,20 @@ package object config { | |
"array in the sorter.") | ||
.intConf | ||
.createWithDefault(Integer.MAX_VALUE) | ||
|
||
// Work threshold (mappers * shuffle partitions) above which map output
// statistics aggregation switches from serial to multi-threaded; see
// MapOutputTrackerMaster.getStatistics.
private[spark] val SHUFFLE_MAP_OUTPUT_STATISTICS_PARALLEL_AGGREGATION_THRESHOLD =
  ConfigBuilder("spark.shuffle.mapOutputStatistics.parallelAggregationThreshold")
    .internal()
    .doc("Multi-thread is used when the number of mappers * shuffle partitions exceeds this " +
      "threshold.")
    .intConf
    .createWithDefault(10000000)
|
||
// Size of the temporary thread pool used by the parallel aggregation path in
// MapOutputTrackerMaster.getStatistics.
private[spark] val SHUFFLE_MAP_OUTPUT_STATISTICS_CORES =
  ConfigBuilder("spark.shuffle.mapOutputStatistics.cores")
    .internal()
    .doc("The cores will be used during map output statistics parallel aggregation.")
    .intConf
    .createWithDefault(8)
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you add some comment to describe the algorithm? I'd expect something like:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure : )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
my proposal