From bd103b78adebf933d3e33d4628c2394a845dbaa2 Mon Sep 17 00:00:00 2001
From: shijinkui
Date: Mon, 29 Sep 2014 13:34:02 +0800
Subject: [PATCH 1/6] code style format

---
 .../scala/org/apache/spark/Aggregator.scala   | 14 ++++------
 .../spark/examples/SkewedGroupByTest.scala    |  8 +++---
 .../org/apache/spark/examples/SparkLR.scala   | 28 +++++++++++--------
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/Aggregator.scala b/core/src/main/scala/org/apache/spark/Aggregator.scala
index 79c9c451d273d..6f20adf66e479 100644
--- a/core/src/main/scala/org/apache/spark/Aggregator.scala
+++ b/core/src/main/scala/org/apache/spark/Aggregator.scala
@@ -40,10 +40,9 @@ case class Aggregator[K, V, C] (
   def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]): Iterator[(K, C)] =
     combineValuesByKey(iter, null)
 
-  def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]],
-      context: TaskContext): Iterator[(K, C)] = {
+  def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]], context: TaskContext): Iterator[(K, C)] = {
     if (!externalSorting) {
-      val combiners = new AppendOnlyMap[K,C]
+      val combiners = new AppendOnlyMap[K, C]
       var kv: Product2[K, V] = null
       val update = (hadValue: Boolean, oldValue: C) => {
         if (hadValue) mergeValue(oldValue, kv._2) else createCombiner(kv._2)
@@ -67,14 +66,11 @@ case class Aggregator[K, V, C] (
   }
 
   @deprecated("use combineCombinersByKey with TaskContext argument", "0.9.0")
-  def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]]) : Iterator[(K, C)] =
-    combineCombinersByKey(iter, null)
+  def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]]): Iterator[(K, C)] = combineCombinersByKey(iter, null)
 
-  def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]], context: TaskContext)
-      : Iterator[(K, C)] =
-  {
+  def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]], context: TaskContext): Iterator[(K, C)] = {
     if (!externalSorting) {
-      val combiners = new AppendOnlyMap[K,C]
+      val combiners = new AppendOnlyMap[K, C]
       var kc: Product2[K, C] = null
       val update = (hadValue: Boolean, oldValue: C) => {
         if (hadValue) mergeCombiners(oldValue, kc._2) else kc._2
diff --git a/examples/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala b/examples/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala
index 017d4e1e5ce13..f156b6cc69f3b 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala
@@ -28,10 +28,10 @@ import org.apache.spark.SparkContext._
 object SkewedGroupByTest {
   def main(args: Array[String]) {
     val sparkConf = new SparkConf().setAppName("GroupBy Test")
-    var numMappers = if (args.length > 0) args(0).toInt else 2
+    val numMappers = if (args.length > 0) args(0).toInt else 2
     var numKVPairs = if (args.length > 1) args(1).toInt else 1000
-    var valSize = if (args.length > 2) args(2).toInt else 1000
-    var numReducers = if (args.length > 3) args(3).toInt else numMappers
+    val valSize = if (args.length > 2) args(2).toInt else 1000
+    val numReducers = if (args.length > 3) args(3).toInt else numMappers
 
     val sc = new SparkContext(sparkConf)
 
@@ -41,7 +41,7 @@ object SkewedGroupByTest {
       // map output sizes lineraly increase from the 1st to the last
       numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt
 
-      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
+      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
ranGen.nextBytes(byteArr) diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala b/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala index fc23308fc4adf..01dbde686eff9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala @@ -19,12 +19,11 @@ package org.apache.spark.examples import java.util.Random -import scala.math.exp - -import breeze.linalg.{Vector, DenseVector} - +import breeze.linalg.{DenseVector, Vector} import org.apache.spark._ +import scala.math.exp + /** * Logistic regression based classification. * Usage: SparkLR [slices] @@ -33,9 +32,12 @@ import org.apache.spark._ * please refer to org.apache.spark.mllib.classification.LogisticRegression */ object SparkLR { - val N = 10000 // Number of data points - val D = 10 // Numer of dimensions - val R = 0.7 // Scaling factor + val N = 10000 + // Number of data points + val D = 10 + // Numer of dimensions + val R = 0.7 + // Scaling factor val ITERATIONS = 5 val rand = new Random(42) @@ -43,8 +45,10 @@ object SparkLR { def generateData = { def generatePoint(i: Int) = { - val y = if(i % 2 == 0) -1 else 1 - val x = DenseVector.fill(D){rand.nextGaussian + y * R} + val y = if (i % 2 == 0) -1 else 1 + val x = DenseVector.fill(D) { + rand.nextGaussian + y * R + } DataPoint(x, y) } Array.tabulate(N)(generatePoint) @@ -68,13 +72,15 @@ object SparkLR { val points = sc.parallelize(generateData, numSlices).cache() // Initialize w to a random value - var w = DenseVector.fill(D){2 * rand.nextDouble - 1} + var w = DenseVector.fill(D) { + 2 * rand.nextDouble - 1 + } println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => - p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y + p.x * (1 / (1 + exp(-p.y * w.dot(p.x))) - 1) * p.y }.reduce(_ + _) w -= gradient } From 0c9d1754f9c8bf36236f915868eb6f62863fea25 Mon Sep 17 00:00:00 2001 From: shijinkui Date: Fri, 3 Oct 2014 22:21:44 +0800 Subject: [PATCH 2/6] code format --- .../org/apache/spark/ContextCleaner.scala | 6 +- .../scala/org/apache/spark/SparkContext.scala | 279 +++++++++--------- .../spark/api/java/JavaSparkContext.scala | 2 +- .../spark/broadcast/BroadcastManager.scala | 2 +- .../org/apache/spark/rdd/FlatMappedRDD.scala | 4 +- .../spark/rdd/ParallelCollectionRDD.scala | 3 +- .../main/scala/org/apache/spark/rdd/RDD.scala | 2 +- .../apache/spark/scheduler/DAGScheduler.scala | 100 +++---- .../spark/scheduler/SchedulerBackend.scala | 9 +- .../apache/spark/scheduler/SplitInfo.scala | 6 +- .../org/apache/spark/scheduler/Stage.scala | 6 +- .../spark/scheduler/TaskSchedulerImpl.scala | 16 +- .../CoarseGrainedSchedulerBackend.scala | 23 +- .../mesos/CoarseMesosSchedulerBackend.scala | 2 +- .../cluster/mesos/MesosSchedulerBackend.scala | 2 +- .../apache/spark/storage/BlockManager.scala | 10 +- .../spark/storage/BlockManagerMaster.scala | 2 +- .../apache/spark/ui/UIWorkloadGenerator.scala | 18 +- .../apache/spark/util/ClosureCleaner.scala | 30 +- .../scala/org/apache/spark/util/Utils.scala | 49 ++- .../apache/spark/SparkContextInfoSuite.scala | 2 +- .../apache/spark/examples/GroupByTest.scala | 17 +- .../spark/streaming/dstream/DStream.scala | 15 +- 23 files changed, 296 insertions(+), 309 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ContextCleaner.scala b/core/src/main/scala/org/apache/spark/ContextCleaner.scala index ede1e23f4fcc5..b52aa21229727 100644 --- 
a/core/src/main/scala/org/apache/spark/ContextCleaner.scala +++ b/core/src/main/scala/org/apache/spark/ContextCleaner.scala @@ -75,7 +75,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { * longer in scope. */ private val blockOnCleanupTasks = sc.conf.getBoolean( - "spark.cleaner.referenceTracking.blocking", true) + "spark.cleaner.referenceTracking.blocking", defaultValue = true) /** * Whether the cleaning thread will block on shuffle cleanup tasks. @@ -88,7 +88,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { * resolved. */ private val blockOnShuffleCleanupTasks = sc.conf.getBoolean( - "spark.cleaner.referenceTracking.blocking.shuffle", false) + "spark.cleaner.referenceTracking.blocking.shuffle", defaultValue = false) @volatile private var stopped = false @@ -182,7 +182,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { def doCleanupBroadcast(broadcastId: Long, blocking: Boolean) { try { logDebug("Cleaning broadcast " + broadcastId) - broadcastManager.unbroadcast(broadcastId, true, blocking) + broadcastManager.unbroadcast(broadcastId, removeFromDriver = true, blocking = blocking) listeners.foreach(_.broadcastCleaned(broadcastId)) logInfo("Cleaned broadcast " + broadcastId) } catch { diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 396cdd1247e07..c12a22f4583f3 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -17,27 +17,20 @@ package org.apache.spark -import scala.language.implicitConversions - import java.io._ import java.net.URI +import java.util.UUID.randomUUID import java.util.concurrent.atomic.AtomicInteger import java.util.{Properties, UUID} -import java.util.UUID.randomUUID -import scala.collection.{Map, Set} -import scala.collection.JavaConversions._ -import scala.collection.generic.Growable -import scala.collection.mutable.HashMap -import scala.reflect.{ClassTag, classTag} + +import akka.actor.Props import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, SequenceFileInputFormat, TextInputFormat} -import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} +import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.mesos.MesosNativeLibrary -import akka.actor.Props - import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil} @@ -45,13 +38,19 @@ import org.apache.spark.input.WholeTextFileInputFormat import org.apache.spark.partial.{ApproximateEvaluator, PartialResult} import org.apache.spark.rdd._ import org.apache.spark.scheduler._ -import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, SparkDeploySchedulerBackend, SimrSchedulerBackend} import org.apache.spark.scheduler.cluster.mesos.{CoarseMesosSchedulerBackend, MesosSchedulerBackend} +import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, SimrSchedulerBackend, SparkDeploySchedulerBackend} import 
org.apache.spark.scheduler.local.LocalBackend import org.apache.spark.storage._ import org.apache.spark.ui.SparkUI import org.apache.spark.util.{CallSite, ClosureCleaner, MetadataCleaner, MetadataCleanerType, TimeStampedWeakValueHashMap, Utils} +import scala.collection.JavaConversions._ +import scala.collection.generic.Growable +import scala.collection.{Map, Set, mutable} +import scala.language.implicitConversions +import scala.reflect.{ClassTag, classTag} + /** * Main entry point for Spark functionality. A SparkContext represents the connection to a Spark * cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster. @@ -108,13 +107,12 @@ class SparkContext(config: SparkConf) extends Logging { * @param environment Environment variables to set on worker nodes. */ def this( - master: String, - appName: String, - sparkHome: String = null, - jars: Seq[String] = Nil, - environment: Map[String, String] = Map(), - preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map()) = - { + master: String, + appName: String, + sparkHome: String = null, + jars: Seq[String] = Nil, + environment: Map[String, String] = Map(), + preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map()) = { this(SparkContext.updatedConf(new SparkConf(), master, appName, sparkHome, jars, environment)) this.preferredNodeLocationData = preferredNodeLocationData } @@ -170,7 +168,7 @@ class SparkContext(config: SparkConf) extends Logging { throw new SparkException("An application name must be set in your configuration") } - if (conf.getBoolean("spark.logConf", false)) { + if (conf.getBoolean("spark.logConf", defaultValue = false)) { logInfo("Spark configuration:\n" + conf.toDebugString) } @@ -187,7 +185,7 @@ class SparkContext(config: SparkConf) extends Logging { val master = conf.get("spark.master") val appName = conf.get("spark.app.name") - private[spark] val isEventLogEnabled = conf.getBoolean("spark.eventLog.enabled", false) + private[spark] val isEventLogEnabled = conf.getBoolean("spark.eventLog.enabled", defaultValue = false) private[spark] val eventLogDir: Option[String] = { if (isEventLogEnabled) { Some(conf.get("spark.eventLog.dir", EventLoggingListener.DEFAULT_LOG_DIR).stripSuffix("/")) @@ -198,10 +196,10 @@ class SparkContext(config: SparkConf) extends Logging { // Generate the random name for a temp folder in Tachyon // Add a timestamp as the suffix here to make it more safe - val tachyonFolderName = "spark-" + randomUUID.toString() + val tachyonFolderName = "spark-" + randomUUID.toString conf.set("spark.tachyonStore.folderName", tachyonFolderName) - val isLocal = (master == "local" || master.startsWith("local[")) + val isLocal = master == "local" || master.startsWith("local[") if (master == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true") @@ -221,8 +219,8 @@ class SparkContext(config: SparkConf) extends Logging { SparkEnv.set(env) // Used to store a URL for each static file/jar together with the file's local timestamp - private[spark] val addedFiles = HashMap[String, Long]() - private[spark] val addedJars = HashMap[String, Long]() + private[spark] val addedFiles = mutable.HashMap[String, Long]() + private[spark] val addedJars = mutable.HashMap[String, Long]() // Keeps track of all persisted RDDs private[spark] val persistentRdds = new TimeStampedWeakValueHashMap[Int, RDD[_]] @@ -231,7 +229,7 @@ class SparkContext(config: SparkConf) extends Logging { // Initialize the Spark UI, registering all associated listeners private[spark] val ui: Option[SparkUI] = - if 
(conf.getBoolean("spark.ui.enabled", true)) { + if (conf.getBoolean("spark.ui.enabled", defaultValue = true)) { Some(new SparkUI(this)) } else { // For tests, do not enable the UI @@ -266,12 +264,12 @@ class SparkContext(config: SparkConf) extends Logging { .getOrElse(512) // Environment variables to pass to our executors. - private[spark] val executorEnvs = HashMap[String, String]() + private[spark] val executorEnvs = mutable.HashMap[String, String]() // Convert java options to env vars as a work around // since we can't set env vars directly in sbt. for { (envKey, propKey) <- Seq(("SPARK_TESTING", "spark.testing")) - value <- Option(System.getenv(envKey)).orElse(Option(System.getProperty(propKey)))} { + value <- Option(System.getenv(envKey)).orElse(Option(System.getProperty(propKey)))} { executorEnvs(envKey) = value } Option(System.getenv("SPARK_PREPEND_CLASSES")).foreach { v => @@ -330,7 +328,7 @@ class SparkContext(config: SparkConf) extends Logging { listenerBus.start() private[spark] val cleaner: Option[ContextCleaner] = { - if (conf.getBoolean("spark.cleaner.referenceTracking", true)) { + if (conf.getBoolean("spark.cleaner.referenceTracking", defaultValue = true)) { Some(new ContextCleaner(this)) } else { None @@ -379,7 +377,7 @@ class SparkContext(config: SparkConf) extends Logging { * [[org.apache.spark.SparkContext.setLocalProperty]]. */ def getLocalProperty(key: String): String = - Option(localProperties.get).map(_.getProperty(key)).getOrElse(null) + Option(localProperties.get).map(_.getProperty(key)).orNull /** Set a human readable description of the current job. */ @deprecated("use setJobGroup", "0.8.1") @@ -444,20 +442,20 @@ class SparkContext(config: SparkConf) extends Logging { // Methods for creating RDDs /** Distribute a local Scala collection to form an RDD. - * - * @note Parallelize acts lazily. If `seq` is a mutable collection and is - * altered after the call to parallelize and before the first action on the - * RDD, the resultant RDD will reflect the modified collection. Pass a copy of - * the argument to avoid this. - */ + * + * @note Parallelize acts lazily. If `seq` is a mutable collection and is + * altered after the call to parallelize and before the first action on the + * RDD, the resultant RDD will reflect the modified collection. Pass a copy of + * the argument to avoid this. + */ def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = { new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]()) } /** Distribute a local Scala collection to form an RDD. - * - * This method is identical to `parallelize`. - */ + * + * This method is identical to `parallelize`. + */ def makeRDD[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = { parallelize(seq, numSlices) } @@ -537,12 +535,12 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def hadoopRDD[K, V]( - conf: JobConf, - inputFormatClass: Class[_ <: InputFormat[K, V]], - keyClass: Class[K], - valueClass: Class[V], - minPartitions: Int = defaultMinPartitions - ): RDD[(K, V)] = { + conf: JobConf, + inputFormatClass: Class[_ <: InputFormat[K, V]], + keyClass: Class[K], + valueClass: Class[V], + minPartitions: Int = defaultMinPartitions + ): RDD[(K, V)] = { // Add necessary security credentials to the JobConf before broadcasting it. 
SparkHadoopUtil.get.addCredentials(conf) new HadoopRDD(this, conf, inputFormatClass, keyClass, valueClass, minPartitions) @@ -556,12 +554,12 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. * */ def hadoopFile[K, V]( - path: String, - inputFormatClass: Class[_ <: InputFormat[K, V]], - keyClass: Class[K], - valueClass: Class[V], - minPartitions: Int = defaultMinPartitions - ): RDD[(K, V)] = { + path: String, + inputFormatClass: Class[_ <: InputFormat[K, V]], + keyClass: Class[K], + valueClass: Class[V], + minPartitions: Int = defaultMinPartitions + ): RDD[(K, V)] = { // A Hadoop configuration can be about 10 KB, which is pretty big, so broadcast it. val confBroadcast = broadcast(new SerializableWritable(hadoopConfiguration)) val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path) @@ -589,8 +587,8 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def hadoopFile[K, V, F <: InputFormat[K, V]] - (path: String, minPartitions: Int) - (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { + (path: String, minPartitions: Int) + (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { hadoopFile(path, fm.runtimeClass.asInstanceOf[Class[F]], km.runtimeClass.asInstanceOf[Class[K]], @@ -612,13 +610,13 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def hadoopFile[K, V, F <: InputFormat[K, V]](path: String) - (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = + (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = hadoopFile[K, V, F](path, defaultMinPartitions) /** Get an RDD for a Hadoop file with an arbitrary new API InputFormat. */ def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]] - (path: String) - (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { + (path: String) + (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { newAPIHadoopFile( path, fm.runtimeClass.asInstanceOf[Class[F]], @@ -636,11 +634,11 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]]( - path: String, - fClass: Class[F], - kClass: Class[K], - vClass: Class[V], - conf: Configuration = hadoopConfiguration): RDD[(K, V)] = { + path: String, + fClass: Class[F], + kClass: Class[K], + vClass: Class[V], + conf: Configuration = hadoopConfiguration): RDD[(K, V)] = { val job = new NewHadoopJob(conf) NewFileInputFormat.addInputPath(job, new Path(path)) val updatedConf = job.getConfiguration @@ -657,10 +655,10 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def newAPIHadoopRDD[K, V, F <: NewInputFormat[K, V]]( - conf: Configuration = hadoopConfiguration, - fClass: Class[F], - kClass: Class[K], - vClass: Class[V]): RDD[(K, V)] = { + conf: Configuration = hadoopConfiguration, + fClass: Class[F], + kClass: Class[K], + vClass: Class[V]): RDD[(K, V)] = { new NewHadoopRDD(this, fClass, kClass, vClass, conf) } @@ -672,10 +670,10 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. 
*/ def sequenceFile[K, V](path: String, - keyClass: Class[K], - valueClass: Class[V], - minPartitions: Int - ): RDD[(K, V)] = { + keyClass: Class[K], + valueClass: Class[V], + minPartitions: Int + ): RDD[(K, V)] = { val inputFormatClass = classOf[SequenceFileInputFormat[K, V]] hadoopFile(path, inputFormatClass, keyClass, valueClass, minPartitions) } @@ -687,8 +685,7 @@ class SparkContext(config: SparkConf) extends Logging { * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. * */ - def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V] - ): RDD[(K, V)] = + def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V]): RDD[(K, V)] = sequenceFile(path, keyClass, valueClass, defaultMinPartitions) /** @@ -712,17 +709,17 @@ class SparkContext(config: SparkConf) extends Logging { * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. */ - def sequenceFile[K, V] - (path: String, minPartitions: Int = defaultMinPartitions) - (implicit km: ClassTag[K], vm: ClassTag[V], - kcf: () => WritableConverter[K], vcf: () => WritableConverter[V]) - : RDD[(K, V)] = { + def sequenceFile[K, V] + (path: String, minPartitions: Int = defaultMinPartitions) + (implicit km: ClassTag[K], vm: ClassTag[V], + kcf: () => WritableConverter[K], vcf: () => WritableConverter[V]) + : RDD[(K, V)] = { val kc = kcf() val vc = vcf() val format = classOf[SequenceFileInputFormat[Writable, Writable]] val writables = hadoopFile(path, format, - kc.writableClass(km).asInstanceOf[Class[Writable]], - vc.writableClass(vm).asInstanceOf[Class[Writable]], minPartitions) + kc.writableClass(km).asInstanceOf[Class[Writable]], + vc.writableClass(vm).asInstanceOf[Class[Writable]], minPartitions) writables.map { case (k, v) => (kc.convert(k), vc.convert(v)) } } @@ -735,15 +732,15 @@ class SparkContext(config: SparkConf) extends Logging { * objects. */ def objectFile[T: ClassTag]( - path: String, - minPartitions: Int = defaultMinPartitions - ): RDD[T] = { + path: String, + minPartitions: Int = defaultMinPartitions + ): RDD[T] = { sequenceFile(path, classOf[NullWritable], classOf[BytesWritable], minPartitions) .flatMap(x => Utils.deserialize[Array[T]](x._2.getBytes, Utils.getContextOrSparkClassLoader)) } protected[spark] def checkpointFile[T: ClassTag]( - path: String + path: String ): RDD[T] = { new CheckpointRDD[T](this, path) } @@ -802,7 +799,7 @@ class SparkContext(config: SparkConf) extends Logging { * standard mutable collections. So you can use this with mutable Map, Set, etc. */ def accumulableCollection[R <% Growable[T] with TraversableOnce[T] with Serializable: ClassTag, T] - (initialValue: R): Accumulable[R, T] = { + (initialValue: R): Accumulable[R, T] = { val param = new GrowableAccumulableParam[R,T] new Accumulable(initialValue, param) } @@ -971,11 +968,11 @@ class SparkContext(config: SparkConf) extends Logging { case null | "file" => // yarn-standalone is deprecated, but still supported if (SparkHadoopUtil.get.isYarnMode() && - (master == "yarn-standalone" || master == "yarn-cluster")) { + (master == "yarn-standalone" || master == "yarn-cluster")) { // In order for this to work in yarn-cluster mode the user must specify the // --addJars option to the client to upload the file into the distributed cache // of the AM to make it show up in the current working directory. 
- val fileName = new Path(uri.getPath).getName() + val fileName = new Path(uri.getPath).getName try { env.httpFileServer.addJar(new File(fileName)) } catch { @@ -1045,7 +1042,7 @@ class SparkContext(config: SparkConf) extends Logging { * or the spark.home Java property, or the SPARK_HOME environment variable * (in that order of preference). If neither of these is set, return None. */ - private[spark] def getSparkHome(): Option[String] = { + private[spark] def getSparkHome: Option[String] = { conf.getOption("spark.home").orElse(Option(System.getenv("SPARK_HOME"))) } @@ -1079,7 +1076,7 @@ class SparkContext(config: SparkConf) extends Logging { * Capture the current user callsite and return a formatted version for printing. If the user * has overridden the call site using `setCallSite()`, this will return the user's version. */ - private[spark] def getCallSite(): CallSite = { + private[spark] def getCallSite: CallSite = { Option(getLocalProperty(CallSite.SHORT_FORM)).map { case shortCallSite => val longCallSite = Option(getLocalProperty(CallSite.LONG_FORM)).getOrElse("") CallSite(shortCallSite, longCallSite) @@ -1093,11 +1090,11 @@ class SparkContext(config: SparkConf) extends Logging { * shipping it out to the cluster, for short actions like first(). */ def runJob[T, U: ClassTag]( - rdd: RDD[T], - func: (TaskContext, Iterator[T]) => U, - partitions: Seq[Int], - allowLocal: Boolean, - resultHandler: (Int, U) => Unit) { + rdd: RDD[T], + func: (TaskContext, Iterator[T]) => U, + partitions: Seq[Int], + allowLocal: Boolean, + resultHandler: (Int, U) => Unit) { if (dagScheduler == null) { throw new SparkException("SparkContext has been shutdown") } @@ -1115,11 +1112,11 @@ class SparkContext(config: SparkConf) extends Logging { * than shipping it out to the cluster, for short actions like first(). */ def runJob[T, U: ClassTag]( - rdd: RDD[T], - func: (TaskContext, Iterator[T]) => U, - partitions: Seq[Int], - allowLocal: Boolean - ): Array[U] = { + rdd: RDD[T], + func: (TaskContext, Iterator[T]) => U, + partitions: Seq[Int], + allowLocal: Boolean + ): Array[U] = { val results = new Array[U](partitions.size) runJob[T, U](rdd, func, partitions, allowLocal, (index, res) => results(index) = res) results @@ -1130,11 +1127,11 @@ class SparkContext(config: SparkConf) extends Logging { * `Iterator[T] => U` instead of `(TaskContext, Iterator[T]) => U`. */ def runJob[T, U: ClassTag]( - rdd: RDD[T], - func: Iterator[T] => U, - partitions: Seq[Int], - allowLocal: Boolean - ): Array[U] = { + rdd: RDD[T], + func: Iterator[T] => U, + partitions: Seq[Int], + allowLocal: Boolean + ): Array[U] = { runJob(rdd, (context: TaskContext, iter: Iterator[T]) => func(iter), partitions, allowLocal) } @@ -1142,14 +1139,14 @@ class SparkContext(config: SparkConf) extends Logging { * Run a job on all partitions in an RDD and return the results in an array. */ def runJob[T, U: ClassTag](rdd: RDD[T], func: (TaskContext, Iterator[T]) => U): Array[U] = { - runJob(rdd, func, 0 until rdd.partitions.size, false) + runJob(rdd, func, 0 until rdd.partitions.size, allowLocal = false) } /** * Run a job on all partitions in an RDD and return the results in an array. 
*/ def runJob[T, U: ClassTag](rdd: RDD[T], func: Iterator[T] => U): Array[U] = { - runJob(rdd, func, 0 until rdd.partitions.size, false) + runJob(rdd, func, 0 until rdd.partitions.size, allowLocal = false) } /** @@ -1158,21 +1155,19 @@ class SparkContext(config: SparkConf) extends Logging { def runJob[T, U: ClassTag]( rdd: RDD[T], processPartition: (TaskContext, Iterator[T]) => U, - resultHandler: (Int, U) => Unit) - { - runJob[T, U](rdd, processPartition, 0 until rdd.partitions.size, false, resultHandler) + resultHandler: (Int, U) => Unit) { + runJob[T, U](rdd, processPartition, 0 until rdd.partitions.size, allowLocal = false, resultHandler) } /** * Run a job on all partitions in an RDD and pass the results to a handler function. */ def runJob[T, U: ClassTag]( - rdd: RDD[T], - processPartition: Iterator[T] => U, - resultHandler: (Int, U) => Unit) - { + rdd: RDD[T], + processPartition: Iterator[T] => U, + resultHandler: (Int, U) => Unit) { val processFunc = (context: TaskContext, iter: Iterator[T]) => processPartition(iter) - runJob[T, U](rdd, processFunc, 0 until rdd.partitions.size, false, resultHandler) + runJob[T, U](rdd, processFunc, 0 until rdd.partitions.size, allowLocal = false, resultHandler) } /** @@ -1181,10 +1176,10 @@ class SparkContext(config: SparkConf) extends Logging { */ @DeveloperApi def runApproximateJob[T, U, R]( - rdd: RDD[T], - func: (TaskContext, Iterator[T]) => U, - evaluator: ApproximateEvaluator[U, R], - timeout: Long): PartialResult[R] = { + rdd: RDD[T], + func: (TaskContext, Iterator[T]) => U, + evaluator: ApproximateEvaluator[U, R], + timeout: Long): PartialResult[R] = { val callSite = getCallSite logInfo("Starting job: " + callSite.shortForm) val start = System.nanoTime @@ -1201,12 +1196,11 @@ class SparkContext(config: SparkConf) extends Logging { */ @Experimental def submitJob[T, U, R]( - rdd: RDD[T], - processPartition: Iterator[T] => U, - partitions: Seq[Int], - resultHandler: (Int, U) => Unit, - resultFunc: => R): SimpleFutureAction[R] = - { + rdd: RDD[T], + processPartition: Iterator[T] => U, + partitions: Seq[Int], + resultHandler: (Int, U) => Unit, + resultFunc: => R): SimpleFutureAction[R] = { val cleanF = clean(processPartition) val callSite = getCallSite val waiter = dagScheduler.submitJob( @@ -1276,7 +1270,7 @@ class SparkContext(config: SparkConf) extends Logging { def getCheckpointDir = checkpointDir /** Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD). */ - def defaultParallelism: Int = taskScheduler.defaultParallelism + def defaultParallelism: Int = taskScheduler.defaultParallelism() /** Default min number of partitions for Hadoop RDDs when not given by user */ @deprecated("use defaultMinPartitions", "1.0.0") @@ -1287,12 +1281,12 @@ class SparkContext(config: SparkConf) extends Logging { private val nextShuffleId = new AtomicInteger(0) - private[spark] def newShuffleId(): Int = nextShuffleId.getAndIncrement() + private[spark] def newShuffleId(): Int = nextShuffleId.getAndIncrement private val nextRddId = new AtomicInteger(0) /** Register a new RDD, returning its RDD ID */ - private[spark] def newRddId(): Int = nextRddId.getAndIncrement() + private[spark] def newRddId(): Int = nextRddId.getAndIncrement /** Post the application start event */ private def postApplicationStart() { @@ -1363,19 +1357,17 @@ object SparkContext extends Logging { // TODO: Add AccumulatorParams for other types, e.g. 
lists and strings implicit def rddToPairRDDFunctions[K, V](rdd: RDD[(K, V)]) - (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K] = null) = { + (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K] = null) = { new PairRDDFunctions(rdd) } implicit def rddToAsyncRDDActions[T: ClassTag](rdd: RDD[T]) = new AsyncRDDActions(rdd) implicit def rddToSequenceFileRDDFunctions[K <% Writable: ClassTag, V <% Writable: ClassTag]( - rdd: RDD[(K, V)]) = - new SequenceFileRDDFunctions(rdd) + rdd: RDD[(K, V)]) = new SequenceFileRDDFunctions(rdd) implicit def rddToOrderedRDDFunctions[K : Ordering : ClassTag, V: ClassTag]( - rdd: RDD[(K, V)]) = - new OrderedRDDFunctions[K, V, (K, V)](rdd) + rdd: RDD[(K, V)]) = new OrderedRDDFunctions[K, V, (K, V)](rdd) implicit def doubleRDDToDoubleRDDFunctions(rdd: RDD[Double]) = new DoubleRDDFunctions(rdd) @@ -1399,16 +1391,16 @@ object SparkContext extends Logging { implicit def stringToText(s: String) = new Text(s) private implicit def arrayToArrayWritable[T <% Writable: ClassTag](arr: Traversable[T]) - : ArrayWritable = { + : ArrayWritable = { def anyToWritable[U <% Writable](u: U): Writable = u new ArrayWritable(classTag[T].runtimeClass.asInstanceOf[Class[Writable]], - arr.map(x => anyToWritable(x)).toArray) + arr.map(x => anyToWritable(x)).toArray) } // Helper objects for converting common types to Writable private def simpleWritableConverter[T, W <: Writable: ClassTag](convert: W => T) - : WritableConverter[T] = { + : WritableConverter[T] = { val wClass = classTag[W].runtimeClass.asInstanceOf[Class[W]] new WritableConverter[T](_ => wClass, x => convert(x.asInstanceOf[W])) } @@ -1472,12 +1464,12 @@ object SparkContext extends Logging { * like SparkConf would. */ private[spark] def updatedConf( - conf: SparkConf, - master: String, - appName: String, - sparkHome: String = null, - jars: Seq[String] = Nil, - environment: Map[String, String] = Map()): SparkConf = + conf: SparkConf, + master: String, + appName: String, + sparkHome: String = null, + jars: Seq[String] = Nil, + environment: Map[String, String] = Map()): SparkConf = { val res = conf.clone() res.setMaster(master) @@ -1622,9 +1614,9 @@ object SparkContext extends Logging { case mesosUrl @ MESOS_REGEX(_) => MesosNativeLibrary.load() val scheduler = new TaskSchedulerImpl(sc) - val coarseGrained = sc.conf.getBoolean("spark.mesos.coarse", false) + val coarseGrained = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = false) val url = mesosUrl.stripPrefix("mesos://") // strip scheme from raw Mesos URLs - val backend = if (coarseGrained) { + val backend = if (coarseGrained) { new CoarseMesosSchedulerBackend(scheduler, sc, url) } else { new MesosSchedulerBackend(scheduler, sc, url) @@ -1652,6 +1644,5 @@ object SparkContext extends Logging { * support converting subclasses of Writable to themselves (writableWritableConverter). 
*/ private[spark] class WritableConverter[T]( - val writableClass: ClassTag[T] => Class[_ <: Writable], - val convert: Writable => T) - extends Serializable + val writableClass: ClassTag[T] => Class[_ <: Writable], + val convert: Writable => T) extends Serializable diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 791d853a015a1..ab65389b736a4 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -544,7 +544,7 @@ class JavaSparkContext(val sc: SparkContext) * or the spark.home Java property, or the SPARK_HOME environment variable * (in that order of preference). If neither of these is set, return None. */ - def getSparkHome(): Optional[String] = JavaUtils.optionToOptional(sc.getSparkHome()) + def getSparkHome: Optional[String] = JavaUtils.optionToOptional(sc.getSparkHome) /** * Add a file to be downloaded with this Spark job on every node. diff --git a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala index 8f8a0b11f9f2e..46f31f20d63c0 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala @@ -59,7 +59,7 @@ private[spark] class BroadcastManager( private val nextBroadcastId = new AtomicLong(0) def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean) = { - broadcastFactory.newBroadcast[T](value_, isLocal, nextBroadcastId.getAndIncrement()) + broadcastFactory.newBroadcast[T](value_, isLocal, nextBroadcastId.getAndIncrement) } def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean) { diff --git a/core/src/main/scala/org/apache/spark/rdd/FlatMappedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/FlatMappedRDD.scala index d8f87d4e3690e..3f64faaf9893f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/FlatMappedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/FlatMappedRDD.scala @@ -17,10 +17,10 @@ package org.apache.spark.rdd -import scala.reflect.ClassTag - import org.apache.spark.{Partition, TaskContext} +import scala.reflect.ClassTag + private[spark] class FlatMappedRDD[U: ClassTag, T: ClassTag]( prev: RDD[T], diff --git a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala index 66c71bf7e8bb5..1325d6f30025e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala @@ -133,8 +133,7 @@ private object ParallelCollectionRDD { } else { 1 } - slice(new Range( - r.start, r.end + sign, r.step).asInstanceOf[Seq[T]], numSlices) + slice(new Range(r.start, r.end + sign, r.step).asInstanceOf[Seq[T]], numSlices) } case r: Range => { positions(r.length, numSlices).map({ diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 2aba40d152e3e..52977d2b03334 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1239,7 +1239,7 @@ abstract class RDD[T: ClassTag]( private var storageLevel: StorageLevel = StorageLevel.NONE /** User code that created this RDD (e.g. `textFile`, `parallelize`). 
*/ - @transient private[spark] val creationSite = sc.getCallSite() + @transient private[spark] val creationSite = sc.getCallSite private[spark] def getCreationSite: String = Option(creationSite).map(_.shortForm).getOrElse("") diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 788eb1ff4e455..98dcf61115e7f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -21,7 +21,8 @@ import java.io.NotSerializableException import java.util.Properties import java.util.concurrent.atomic.AtomicInteger -import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Map, Stack} +import scala.collection.mutable +import scala.collection.mutable.{ArrayBuffer, HashSet} import scala.concurrent.Await import scala.concurrent.duration._ import scala.language.postfixOps @@ -85,24 +86,24 @@ class DAGScheduler( private[scheduler] def numTotalJobs: Int = nextJobId.get() private val nextStageId = new AtomicInteger(0) - private[scheduler] val jobIdToStageIds = new HashMap[Int, HashSet[Int]] - private[scheduler] val stageIdToStage = new HashMap[Int, Stage] - private[scheduler] val shuffleToMapStage = new HashMap[Int, Stage] - private[scheduler] val jobIdToActiveJob = new HashMap[Int, ActiveJob] + private[scheduler] val jobIdToStageIds = new mutable.HashMap[Int, mutable.HashSet[Int]] + private[scheduler] val stageIdToStage = new mutable.HashMap[Int, Stage] + private[scheduler] val shuffleToMapStage = new mutable.HashMap[Int, Stage] + private[scheduler] val jobIdToActiveJob = new mutable.HashMap[Int, ActiveJob] // Stages we need to run whose parents aren't done - private[scheduler] val waitingStages = new HashSet[Stage] + private[scheduler] val waitingStages = new mutable.HashSet[Stage] // Stages we are running right now - private[scheduler] val runningStages = new HashSet[Stage] + private[scheduler] val runningStages = new mutable.HashSet[Stage] // Stages that must be resubmitted due to fetch failures - private[scheduler] val failedStages = new HashSet[Stage] + private[scheduler] val failedStages = new mutable.HashSet[Stage] - private[scheduler] val activeJobs = new HashSet[ActiveJob] + private[scheduler] val activeJobs = new mutable.HashSet[ActiveJob] // Contains the locations that each RDD's partitions are cached on - private val cacheLocs = new HashMap[Int, Array[Seq[TaskLocation]]] + private val cacheLocs = new mutable.HashMap[Int, Array[Seq[TaskLocation]]] // For tracking failed nodes, we use the MapOutputTracker's epoch number, which is sent with // every task. When we detect a node failing, we note the current epoch number and failed @@ -110,7 +111,7 @@ class DAGScheduler( // // TODO: Garbage collect information about failure epochs when we know there are no more // stray messages to detect. - private val failedEpoch = new HashMap[String, Long] + private val failedEpoch = new mutable.HashMap[String, Long] private val dagSchedulerActorSupervisor = env.actorSystem.actorOf(Props(new DAGSchedulerActorSupervisor(this))) @@ -122,7 +123,7 @@ class DAGScheduler( private[scheduler] var eventProcessActor: ActorRef = _ /** If enabled, we may run certain actions like take() and first() locally. 
*/ - private val localExecutionEnabled = sc.getConf.getBoolean("spark.localExecution.enabled", false) + private val localExecutionEnabled = sc.getConf.getBoolean("spark.localExecution.enabled", defaultValue = false) private def initializeEventProcessActor() { // blocking the thread until supervisor is started, which ensures eventProcessActor is @@ -151,7 +152,7 @@ class DAGScheduler( task: Task[_], reason: TaskEndReason, result: Any, - accumUpdates: Map[Long, Any], + accumUpdates: mutable.Map[Long, Any], taskInfo: TaskInfo, taskMetrics: TaskMetrics) { eventProcessActor ! CompletionEvent(task, reason, result, accumUpdates, taskInfo, taskMetrics) @@ -234,15 +235,14 @@ class DAGScheduler( * directly. */ private def newStage( - rdd: RDD[_], - numTasks: Int, - shuffleDep: Option[ShuffleDependency[_, _, _]], - jobId: Int, - callSite: CallSite) - : Stage = - { + rdd: RDD[_], + numTasks: Int, + shuffleDep: Option[ShuffleDependency[_, _, _]], + jobId: Int, + callSite: CallSite) + : Stage = { val parentStages = getParentStages(rdd, jobId) - val id = nextStageId.getAndIncrement() + val id = nextStageId.getAndIncrement val stage = new Stage(id, rdd, numTasks, shuffleDep, parentStages, jobId, callSite) stageIdToStage(id) = stage updateJobIdStageIdMaps(jobId, stage) @@ -256,13 +256,12 @@ class DAGScheduler( * recovered from the MapOutputTracker */ private def newOrUsedStage( - rdd: RDD[_], - numTasks: Int, - shuffleDep: ShuffleDependency[_, _, _], - jobId: Int, - callSite: CallSite) - : Stage = - { + rdd: RDD[_], + numTasks: Int, + shuffleDep: ShuffleDependency[_, _, _], + jobId: Int, + callSite: CallSite) + : Stage = { val stage = newStage(rdd, numTasks, Some(shuffleDep), jobId, callSite) if (mapOutputTracker.containsShuffle(shuffleDep.shuffleId)) { val serLocs = mapOutputTracker.getSerializedMapOutputStatuses(shuffleDep.shuffleId) @@ -285,11 +284,11 @@ class DAGScheduler( * provided jobId if they haven't already been created with a lower jobId. 
*/ private def getParentStages(rdd: RDD[_], jobId: Int): List[Stage] = { - val parents = new HashSet[Stage] - val visited = new HashSet[RDD[_]] + val parents = new mutable.HashSet[Stage] + val visited = new mutable.HashSet[RDD[_]] // We are manually maintaining a stack here to prevent StackOverflowError // caused by recursively visiting - val waitingForVisit = new Stack[RDD[_]] + val waitingForVisit = new mutable.Stack[RDD[_]] def visit(r: RDD[_]) { if (!visited(r)) { visited += r @@ -326,12 +325,12 @@ class DAGScheduler( } // Find ancestor shuffle dependencies that are not registered in shuffleToMapStage yet - private def getAncestorShuffleDependencies(rdd: RDD[_]): Stack[ShuffleDependency[_, _, _]] = { - val parents = new Stack[ShuffleDependency[_, _, _]] - val visited = new HashSet[RDD[_]] + private def getAncestorShuffleDependencies(rdd: RDD[_]): mutable.Stack[ShuffleDependency[_, _, _]] = { + val parents = new mutable.Stack[ShuffleDependency[_, _, _]] + val visited = new mutable.HashSet[RDD[_]] // We are manually maintaining a stack here to prevent StackOverflowError // caused by recursively visiting - val waitingForVisit = new Stack[RDD[_]] + val waitingForVisit = new mutable.Stack[RDD[_]] def visit(r: RDD[_]) { if (!visited(r)) { visited += r @@ -358,11 +357,11 @@ class DAGScheduler( } private def getMissingParentStages(stage: Stage): List[Stage] = { - val missing = new HashSet[Stage] - val visited = new HashSet[RDD[_]] + val missing = new mutable.HashSet[Stage] + val visited = new mutable.HashSet[RDD[_]] // We are manually maintaining a stack here to prevent StackOverflowError // caused by recursively visiting - val waitingForVisit = new Stack[RDD[_]] + val waitingForVisit = new mutable.Stack[RDD[_]] def visit(rdd: RDD[_]) { if (!visited(rdd)) { visited += rdd @@ -397,7 +396,7 @@ class DAGScheduler( if (stages.nonEmpty) { val s = stages.head s.jobIds += jobId - jobIdToStageIds.getOrElseUpdate(jobId, new HashSet[Int]()) += s.id + jobIdToStageIds.getOrElseUpdate(jobId, new mutable.HashSet[Int]()) += s.id val parents: List[Stage] = getParentStages(s.rdd, jobId) val parentsWithoutThisJobId = parents.filter { ! _.jobIds.contains(jobId) } updateJobIdStageIdMapsList(parentsWithoutThisJobId ++ stages.tail) @@ -485,7 +484,7 @@ class DAGScheduler( "Total number of partitions: " + maxPartitions) } - val jobId = nextJobId.getAndIncrement() + val jobId = nextJobId.getAndIncrement if (partitions.size == 0) { return new JobWaiter[U](this, jobId, 0, resultHandler) } @@ -533,7 +532,7 @@ class DAGScheduler( val listener = new ApproximateActionListener(rdd, func, evaluator, timeout) val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _] val partitions = (0 until rdd.partitions.size).toArray - val jobId = nextJobId.getAndIncrement() + val jobId = nextJobId.getAndIncrement eventProcessActor ! 
JobSubmitted( jobId, rdd, func2, partitions, allowLocal = false, callSite, listener, properties) listener.awaitResult() // Will throw an exception if the job fails @@ -1241,12 +1240,12 @@ class DAGScheduler( if (stage == target) { return true } - val visitedRdds = new HashSet[RDD[_]] - val visitedStages = new HashSet[Stage] + val visitedRdds = new mutable.HashSet[RDD[_]] + val visitedStages = new mutable.HashSet[Stage] // We are manually maintaining a stack here to prevent StackOverflowError // caused by recursively visiting - val waitingForVisit = new Stack[RDD[_]] - def visit(rdd: RDD[_]) { + val waitingForVisit = new mutable.Stack[RDD[_]] + def visit(rdd: RDD[_]) { if (!visitedRdds(rdd)) { visitedRdds += rdd for (dep <- rdd.dependencies) { @@ -1278,16 +1277,15 @@ class DAGScheduler( */ private[spark] def getPreferredLocs(rdd: RDD[_], partition: Int): Seq[TaskLocation] = synchronized { - getPreferredLocsInternal(rdd, partition, new HashSet) + getPreferredLocsInternal(rdd, partition, new mutable.HashSet) } /** Recursive implementation for getPreferredLocs. */ private def getPreferredLocsInternal( - rdd: RDD[_], - partition: Int, - visited: HashSet[(RDD[_],Int)]) - : Seq[TaskLocation] = - { + rdd: RDD[_], + partition: Int, + visited: mutable.HashSet[(RDD[_],Int)]) + : Seq[TaskLocation] = { // If the partition has already been visited, no need to re-visit. // This avoids exponential path exploration. SPARK-695 if (!visited.add((rdd,partition))) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala index 992c477493d8e..64802d8074704 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala @@ -26,13 +26,16 @@ private[spark] trait SchedulerBackend { private val appId = "spark-application-" + System.currentTimeMillis def start(): Unit + def stop(): Unit + def reviveOffers(): Unit + def defaultParallelism(): Int - def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = - throw new UnsupportedOperationException - def isReady(): Boolean = true + def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = throw new UnsupportedOperationException + + def isReady: Boolean = true /** * Get an application ID associated with the job. diff --git a/core/src/main/scala/org/apache/spark/scheduler/SplitInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/SplitInfo.scala index 1ce83485f024b..6a54252753a63 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SplitInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SplitInfo.scala @@ -17,10 +17,10 @@ package org.apache.spark.scheduler -import collection.mutable.ArrayBuffer - import org.apache.spark.annotation.DeveloperApi +import scala.collection.mutable.ArrayBuffer + // information about a specific split instance : handles both split instances. // So that we do not need to worry about the differences. @DeveloperApi @@ -30,7 +30,7 @@ class SplitInfo( val path: String, val length: Long, val underlyingSplit: Any) { - override def toString(): String = { + override def toString: String = { "SplitInfo " + super.toString + " .. 
inputFormatClazz " + inputFormatClazz + ", hostLocation : " + hostLocation + ", path : " + path + ", length : " + length + ", underlyingSplit " + underlyingSplit diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala index 071568cdfb429..31c54769594d5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala @@ -17,7 +17,7 @@ package org.apache.spark.scheduler -import scala.collection.mutable.HashSet +import scala.collection.mutable import org.apache.spark._ import org.apache.spark.rdd.RDD @@ -63,11 +63,11 @@ private[spark] class Stage( var numAvailableOutputs = 0 /** Set of jobs that this stage belongs to. */ - val jobIds = new HashSet[Int] + val jobIds = new mutable.HashSet[Int] /** For stages that are the final (consists of only ResultTasks), link to the ActiveJob. */ var resultOfJob: Option[ActiveJob] = None - var pendingTasks = new HashSet[Task[_]] + var pendingTasks = new mutable.HashSet[Task[_]] private var nextAttemptId = 0 diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 6d697e3d003f6..60bd23bf15db0 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -21,6 +21,7 @@ import java.nio.ByteBuffer import java.util.{TimerTask, Timer} import java.util.concurrent.atomic.AtomicLong +import scala.collection.mutable import scala.concurrent.duration._ import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap @@ -34,7 +35,6 @@ import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import org.apache.spark.util.Utils import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockManagerId -import akka.actor.Props /** * Schedules tasks for multiple types of clusters by acting through a SchedulerBackend. 
@@ -91,9 +91,9 @@ private[spark] class TaskSchedulerImpl( // in turn is used to decide when we can attain data locality on a given host protected val executorsByHost = new HashMap[String, HashSet[String]] - protected val hostsByRack = new HashMap[String, HashSet[String]] + protected val hostsByRack = new mutable.HashMap[String, mutable.HashSet[String]] - protected val executorIdToHost = new HashMap[String, String] + protected val executorIdToHost = new mutable.HashMap[String, String] // Listener object to pass upcalls into var dagScheduler: DAGScheduler = null @@ -135,12 +135,12 @@ private[spark] class TaskSchedulerImpl( schedulableBuilder.buildPools() } - def newTaskId(): Long = nextTaskId.getAndIncrement() + def newTaskId(): Long = nextTaskId.getAndIncrement override def start() { backend.start() - if (!isLocal && conf.getBoolean("spark.speculation", false)) { + if (!isLocal && conf.getBoolean("spark.speculation", defaultValue = false)) { logInfo("Starting speculative execution thread") import sc.env.actorSystem.dispatcher sc.env.actorSystem.scheduler.schedule(SPECULATION_INTERVAL milliseconds, @@ -441,7 +441,7 @@ private[spark] class TaskSchedulerImpl( private def removeExecutor(executorId: String) { activeExecutorIds -= executorId val host = executorIdToHost(executorId) - val execs = executorsByHost.getOrElse(host, new HashSet) + val execs = executorsByHost.getOrElse(host, new mutable.HashSet) execs -= executorId if (execs.isEmpty) { executorsByHost -= host @@ -507,7 +507,7 @@ private[spark] object TaskSchedulerImpl { * For example, given , , , returns * [o1, o5, o4, 02, o6, o3] */ - def prioritizeContainers[K, T] (map: HashMap[K, ArrayBuffer[T]]): List[T] = { + def prioritizeContainers[K, T] (map: mutable.HashMap[K, ArrayBuffer[T]]): List[T] = { val _keyList = new ArrayBuffer[K](map.size) _keyList ++= map.keys @@ -523,7 +523,7 @@ private[spark] object TaskSchedulerImpl { while (found) { found = false for (key <- keyList) { - val containerList: ArrayBuffer[T] = map.get(key).getOrElse(null) + val containerList: ArrayBuffer[T] = map.getOrElse(key, null) assert(containerList != null) // Get the index'th entry for this host - if present if (index < containerList.size){ diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 59aed6b72fe42..e11d02315ce6f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -19,7 +19,8 @@ package org.apache.spark.scheduler.cluster import java.util.concurrent.atomic.AtomicInteger -import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import scala.concurrent.Await import scala.concurrent.duration._ @@ -63,8 +64,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: A class DriverActor(sparkProperties: Seq[(String, String)]) extends Actor with ActorLogReceive { override protected def log = CoarseGrainedSchedulerBackend.this.log - private val addressToExecutorId = new HashMap[Address, String] - private val executorDataMap = new HashMap[String, ExecutorData] + private val addressToExecutorId = new mutable.HashMap[Address, String] + private val executorDataMap = new mutable.HashMap[String, ExecutorData] override def preStart() { // Listen for remote client 
disconnection events, since they don't go through Akka's watch() @@ -84,10 +85,10 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: A } else { logInfo("Registered executor: " + sender + " with ID " + executorId) sender ! RegisteredExecutor - executorDataMap.put(executorId, new ExecutorData(sender, sender.path.address, + executorDataMap.put(executorId, new ExecutorData(sender(), sender().path.address, Utils.parseHostPort(hostPort)._1, cores, cores)) - addressToExecutorId(sender.path.address) = executorId + addressToExecutorId(sender().path.address) = executorId totalCoreCount.addAndGet(cores) totalRegisteredExecutors.addAndGet(1) makeOffers() @@ -107,8 +108,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: A } } - case ReviveOffers => - makeOffers() + case ReviveOffers => makeOffers() case KillTask(taskId, executorId, interruptThread) => executorDataMap(executorId).executorActor ! KillTask(taskId, executorId, interruptThread) @@ -195,7 +195,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: A } var driverActor: ActorRef = null - val taskIdsOnSlave = new HashMap[String, HashSet[String]] + val taskIdsOnSlave = new mutable.HashMap[String, mutable.HashSet[String]] override def start() { val properties = new ArrayBuffer[(String, String)] @@ -260,8 +260,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: A def sufficientResourcesRegistered(): Boolean = true - override def isReady(): Boolean = { - if (sufficientResourcesRegistered) { + override def isReady: Boolean = { + if (sufficientResourcesRegistered()) { logInfo("SchedulerBackend is ready for scheduling beginning after " + s"reached minRegisteredResourcesRatio: $minRegisteredRatio") return true @@ -280,8 +280,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: A System.setProperty("spark.ui.proxyBase", proxyBase) } - val hasFilter = (filterName != null && filterName.nonEmpty && - filterParams != null && filterParams.nonEmpty) + val hasFilter = filterName != null && filterName.nonEmpty && filterParams != null && filterParams.nonEmpty if (hasFilter) { logInfo(s"Add WebUI Filter. 
$filterName, $filterParams, $proxyBase") conf.set("spark.ui.filters", filterName) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index 90828578cd88f..36d36ce8955ce 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ -110,7 +110,7 @@ private[spark] class CoarseMesosSchedulerBackend( def createCommand(offer: Offer, numCores: Int): CommandInfo = { val executorSparkHome = conf.getOption("spark.mesos.executor.home") - .orElse(sc.getSparkHome()) + .orElse(sc.getSparkHome) .getOrElse { throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!") } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index e0f2fd622f54c..44164c24185be 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -89,7 +89,7 @@ private[spark] class MesosSchedulerBackend( def createExecutorInfo(execId: String): ExecutorInfo = { val executorSparkHome = sc.conf.getOption("spark.mesos.executor.home") - .orElse(sc.getSparkHome()) // Fall back to driver Spark home for backward compatibility + .orElse(sc.getSparkHome) // Fall back to driver Spark home for backward compatibility .getOrElse { throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!") } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 3f5d06e1aeee7..90b49102e3d1e 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -91,13 +91,13 @@ private[spark] class BlockManager( executorId, blockTransferService.hostName, blockTransferService.port) // Whether to compress broadcast variables that are stored - private val compressBroadcast = conf.getBoolean("spark.broadcast.compress", true) + private val compressBroadcast = conf.getBoolean("spark.broadcast.compress", defaultValue = true) // Whether to compress shuffle output that are stored - private val compressShuffle = conf.getBoolean("spark.shuffle.compress", true) + private val compressShuffle = conf.getBoolean("spark.shuffle.compress", defaultValue = true) // Whether to compress RDD partitions that are stored serialized - private val compressRdds = conf.getBoolean("spark.rdd.compress", false) + private val compressRdds = conf.getBoolean("spark.rdd.compress", defaultValue = false) // Whether to compress shuffle output temporarily spilled to disk - private val compressShuffleSpill = conf.getBoolean("spark.shuffle.spill.compress", true) + private val compressShuffleSpill = conf.getBoolean("spark.shuffle.spill.compress", defaultValue = true) private val slaveActor = actorSystem.actorOf( Props(new BlockManagerSlaveActor(this, mapOutputTracker)), @@ -577,7 +577,7 @@ private[spark] class BlockManager( bufferSize: Int, writeMetrics: ShuffleWriteMetrics): BlockObjectWriter = { val compressStream: OutputStream => OutputStream = wrapForCompression(blockId, _) - val syncWrites = 
conf.getBoolean("spark.shuffle.sync", false) + val syncWrites = conf.getBoolean("spark.shuffle.sync", defaultValue = false) new DiskBlockObjectWriter(blockId, file, serializer, bufferSize, compressStream, syncWrites, writeMetrics) } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala index d08e1419e3e41..e05ca3c80ec67 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala @@ -80,7 +80,7 @@ class BlockManagerMaster( * those blocks that are reported to block manager master. */ def contains(blockId: BlockId) = { - !getLocations(blockId).isEmpty + getLocations(blockId).nonEmpty } /** Get ids of other nodes in the cluster from the driver */ diff --git a/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala b/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala index 18d2b5075aa08..bf8b7922ecee1 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala @@ -17,11 +17,11 @@ package org.apache.spark.ui -import scala.util.Random - -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ import org.apache.spark.scheduler.SchedulingMode +import org.apache.spark.{SparkConf, SparkContext} + +import scala.util.Random /** * Continuously generates jobs that expose various features of the WebUI (internal testing tool). @@ -49,7 +49,7 @@ private[spark] object UIWorkloadGenerator { val sc = new SparkContext(conf) def setProperties(s: String) = { - if(schedulingMode == SchedulingMode.FAIR) { + if (schedulingMode == SchedulingMode.FAIR) { sc.setLocalProperty("spark.scheduler.pool", s) } sc.setLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION, s) @@ -64,8 +64,8 @@ private[spark] object UIWorkloadGenerator { ("Single Shuffle", baseData.map(x => (x % 10, x)).reduceByKey(_ + _).count), ("Entirely failed phase", baseData.map(x => throw new Exception).count), ("Partially failed phase", { - baseData.map{x => - val probFailure = (4.0 / NUM_PARTITIONS) + baseData.map { x => + val probFailure = 4.0 / NUM_PARTITIONS if (nextFloat() < probFailure) { throw new Exception("This is a task failure") } @@ -73,8 +73,8 @@ private[spark] object UIWorkloadGenerator { }.count }), ("Partially failed phase (longer tasks)", { - baseData.map{x => - val probFailure = (4.0 / NUM_PARTITIONS) + baseData.map { x => + val probFailure = 4.0 / NUM_PARTITIONS if (nextFloat() < probFailure) { Thread.sleep(100) throw new Exception("This is a task failure") @@ -98,7 +98,7 @@ private[spark] object UIWorkloadGenerator { println("Job Failed: " + desc) } } - }.start + }.start() Thread.sleep(INTER_JOB_WAIT_MS) } } diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala index e3f52f6ff1e63..d4fe30a7c9ac7 100644 --- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala +++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala @@ -19,14 +19,12 @@ package org.apache.spark.util import java.io.{ByteArrayInputStream, ByteArrayOutputStream} -import scala.collection.mutable.Map -import scala.collection.mutable.Set - -import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.{ClassReader, ClassVisitor, MethodVisitor, Type} import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.Opcodes._ - 
+import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.{ClassReader, ClassVisitor, MethodVisitor, Type} import org.apache.spark.{Logging, SparkEnv, SparkException} +import scala.collection.mutable + private[spark] object ClosureCleaner extends Logging { // Get an ASM class reader for a given class from the JAR that loaded it private def getClassReader(cls: Class[_]): ClassReader = { @@ -37,7 +35,7 @@ private[spark] object ClosureCleaner extends Logging { if (resourceStream == null) return new ClassReader(resourceStream) val baos = new ByteArrayOutputStream(128) - Utils.copyStream(resourceStream, baos, true) + Utils.copyStream(resourceStream, baos, closeStreams = true) new ClassReader(new ByteArrayInputStream(baos.toByteArray)) } @@ -78,19 +76,19 @@ private[spark] object ClosureCleaner extends Logging { } private def getInnerClasses(obj: AnyRef): List[Class[_]] = { - val seen = Set[Class[_]](obj.getClass) + val seen = mutable.Set[Class[_]](obj.getClass) var stack = List[Class[_]](obj.getClass) - while (!stack.isEmpty) { + while (stack.nonEmpty) { val cr = getClassReader(stack.head) stack = stack.tail - val set = Set[Class[_]]() + val set = mutable.Set[Class[_]]() cr.accept(new InnerClosureFinder(set), 0) for (cls <- set -- seen) { seen += cls stack = cls :: stack } } - return (seen - obj.getClass).toList + (seen - obj.getClass).toList } private def createNullValue(cls: Class[_]): AnyRef = { @@ -107,12 +105,12 @@ private[spark] object ClosureCleaner extends Logging { val innerClasses = getInnerClasses(func) val outerObjects = getOuterObjects(func) - val accessedFields = Map[Class[_], Set[String]]() + val accessedFields = mutable.Map[Class[_], mutable.Set[String]]() getClassReader(func.getClass).accept(new ReturnStatementFinder(), 0) for (cls <- outerClasses) - accessedFields(cls) = Set[String]() + accessedFields(cls) = mutable.Set[String]() for (cls <- func.getClass :: innerClasses) getClassReader(cls).accept(new FieldAccessFinder(accessedFields), 0) // logInfo("accessedFields: " + accessedFields) @@ -177,10 +175,10 @@ private[spark] object ClosureCleaner extends Logging { if (outer != null) { params(0) = outer // First param is always outer object } - return cons.newInstance(params: _*).asInstanceOf[AnyRef] + cons.newInstance(params: _*).asInstanceOf[AnyRef] } else { // Use reflection to instantiate object without calling constructor - val rf = sun.reflect.ReflectionFactory.getReflectionFactory() + val rf = sun.reflect.ReflectionFactory.getReflectionFactory val parentCtor = classOf[java.lang.Object].getDeclaredConstructor() val newCtor = rf.newConstructorForSerialization(cls, parentCtor) val obj = newCtor.newInstance().asInstanceOf[AnyRef] @@ -214,7 +212,7 @@ class ReturnStatementFinder extends ClassVisitor(ASM4) { } private[spark] -class FieldAccessFinder(output: Map[Class[_], Set[String]]) extends ClassVisitor(ASM4) { +class FieldAccessFinder(output: mutable.Map[Class[_], mutable.Set[String]]) extends ClassVisitor(ASM4) { override def visitMethod(access: Int, name: String, desc: String, sig: String, exceptions: Array[String]): MethodVisitor = { new MethodVisitor(ASM4) { @@ -240,7 +238,7 @@ class FieldAccessFinder(output: Map[Class[_], Set[String]]) extends ClassVisitor } } -private[spark] class InnerClosureFinder(output: Set[Class[_]]) extends ClassVisitor(ASM4) { +private[spark] class InnerClosureFinder(output: mutable.Set[Class[_]]) extends ClassVisitor(ASM4) { var myName: String = null override def visit(version: Int, access: Int, name: String, sig: String, diff --git 
a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 3d307b3c16d3e..5cb7d4e175827 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -170,7 +170,7 @@ private[spark] object Utils extends Logging { // Register the path to be deleted via shutdown hook def registerShutdownDeleteDir(file: File) { - val absolutePath = file.getAbsolutePath() + val absolutePath = file.getAbsolutePath shutdownDeletePaths.synchronized { shutdownDeletePaths += absolutePath } @@ -178,7 +178,7 @@ private[spark] object Utils extends Logging { // Register the tachyon path to be deleted via shutdown hook def registerShutdownDeleteDir(tachyonfile: TachyonFile) { - val absolutePath = tachyonfile.getPath() + val absolutePath = tachyonfile.getPath shutdownDeleteTachyonPaths.synchronized { shutdownDeleteTachyonPaths += absolutePath } @@ -186,7 +186,7 @@ private[spark] object Utils extends Logging { // Is the path already registered to be deleted via a shutdown hook ? def hasShutdownDeleteDir(file: File): Boolean = { - val absolutePath = file.getAbsolutePath() + val absolutePath = file.getAbsolutePath shutdownDeletePaths.synchronized { shutdownDeletePaths.contains(absolutePath) } @@ -194,7 +194,7 @@ private[spark] object Utils extends Logging { // Is the path already registered to be deleted via a shutdown hook ? def hasShutdownDeleteTachyonDir(file: TachyonFile): Boolean = { - val absolutePath = file.getPath() + val absolutePath = file.getPath shutdownDeletePaths.synchronized { shutdownDeletePaths.contains(absolutePath) } @@ -204,7 +204,7 @@ private[spark] object Utils extends Logging { // else false. This is to ensure that two shutdown hooks do not try to delete each others // paths - resulting in IOException and incomplete cleanup. def hasRootAsShutdownDeleteDir(file: File): Boolean = { - val absolutePath = file.getAbsolutePath() + val absolutePath = file.getAbsolutePath val retval = shutdownDeletePaths.synchronized { shutdownDeletePaths.exists { path => !absolutePath.equals(path) && absolutePath.startsWith(path) @@ -220,7 +220,7 @@ private[spark] object Utils extends Logging { // else false. This is to ensure that two shutdown hooks do not try to delete each others // paths - resulting in Exception and incomplete cleanup. def hasRootAsShutdownDeleteDir(file: TachyonFile): Boolean = { - val absolutePath = file.getPath() + val absolutePath = file.getPath val retval = shutdownDeleteTachyonPaths.synchronized { shutdownDeleteTachyonPaths.exists { path => !absolutePath.equals(path) && absolutePath.startsWith(path) @@ -265,15 +265,14 @@ private[spark] object Utils extends Logging { /** Copy all data from an InputStream to an OutputStream */ def copyStream(in: InputStream, - out: OutputStream, - closeStreams: Boolean = false): Long = - { + out: OutputStream, + closeStreams: Boolean = false): Long = { var count = 0L try { if (in.isInstanceOf[FileInputStream] && out.isInstanceOf[FileOutputStream]) { // When both streams are File stream, use transferTo to improve copy performance. - val inChannel = in.asInstanceOf[FileInputStream].getChannel() - val outChannel = out.asInstanceOf[FileOutputStream].getChannel() + val inChannel = in.asInstanceOf[FileInputStream].getChannel + val outChannel = out.asInstanceOf[FileOutputStream].getChannel val size = inChannel.size() // In case transferTo method transferred less data than we have required. 
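Note on the copyStream hunk above: when both ends are file streams it obtains the underlying FileChannels and uses transferTo, and the trailing comment exists because a single transferTo call may move fewer bytes than requested. A minimal standalone sketch of that pattern (not part of the patch; object, method and path names are illustrative):

    import java.io.{FileInputStream, FileOutputStream}

    object TransferToSketch {
      // Copy one file to another with FileChannel.transferTo, looping because a
      // single call may transfer fewer bytes than asked for. Returns bytes copied.
      def copyFile(srcPath: String, dstPath: String): Long = {
        val in = new FileInputStream(srcPath)
        val out = new FileOutputStream(dstPath)
        try {
          val inChannel = in.getChannel
          val outChannel = out.getChannel
          val size = inChannel.size()
          var position = 0L
          while (position < size) {
            // transferTo returns the number of bytes actually moved, which may
            // be less than (size - position), so keep advancing until done.
            position += inChannel.transferTo(position, size - position, outChannel)
          }
          position
        } finally {
          in.close()
          out.close()
        }
      }
    }

On most platforms transferTo can hand the copy to the operating system instead of moving bytes through user space, which is the performance benefit the surrounding comment refers to.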
@@ -315,8 +314,8 @@ private[spark] object Utils extends Logging { val userCred = securityMgr.getSecretKey() if (userCred == null) throw new Exception("Secret key is null with authentication on") val userInfo = securityMgr.getHttpUser() + ":" + userCred - new URI(uri.getScheme(), userInfo, uri.getHost(), uri.getPort(), uri.getPath(), - uri.getQuery(), uri.getFragment()) + new URI(uri.getScheme, userInfo, uri.getHost, uri.getPort, uri.getPath, + uri.getQuery, uri.getFragment) } /** @@ -342,7 +341,7 @@ private[spark] object Utils extends Logging { if (securityMgr.isAuthenticationEnabled()) { logDebug("fetchFile with security enabled") val newuri = constructURIForAuthentication(uri, securityMgr) - uc = newuri.toURL().openConnection() + uc = newuri.toURL.openConnection() uc.setAllowUserInteraction(false) } else { logDebug("fetchFile not using security") @@ -353,7 +352,7 @@ private[spark] object Utils extends Logging { uc.setConnectTimeout(timeout) uc.setReadTimeout(timeout) uc.connect() - val in = uc.getInputStream() + val in = uc.getInputStream val out = new FileOutputStream(tempFile) Utils.copyStream(in, out, closeStreams = true) if (targetFile.exists && !Files.equal(tempFile, targetFile)) { @@ -666,7 +665,7 @@ private[spark] object Utils extends Logging { */ def deleteRecursively(file: File) { if (file != null) { - if (file.isDirectory() && !isSymlink(file)) { + if (file.isDirectory && !isSymlink(file)) { for (child <- listFilesSafely(file)) { deleteRecursively(child) } @@ -684,7 +683,7 @@ private[spark] object Utils extends Logging { * Delete a file or directory and its contents recursively. */ def deleteRecursively(dir: TachyonFile, client: TachyonFS) { - if (!client.delete(dir.getPath(), true)) { + if (!client.delete(dir.getPath, true)) { throw new IOException("Failed to delete the tachyon dir: " + dir) } } @@ -695,13 +694,13 @@ private[spark] object Utils extends Logging { def isSymlink(file: File): Boolean = { if (file == null) throw new NullPointerException("File must not be null") if (isWindows) return false - val fileInCanonicalDir = if (file.getParent() == null) { + val fileInCanonicalDir = if (file.getParent == null) { file } else { - new File(file.getParentFile().getCanonicalFile(), file.getName()) + new File(file.getParentFile.getCanonicalFile, file.getName) } - !fileInCanonicalDir.getCanonicalFile().equals(fileInCanonicalDir.getAbsoluteFile()) + !fileInCanonicalDir.getCanonicalFile.equals(fileInCanonicalDir.getAbsoluteFile) } /** @@ -888,7 +887,7 @@ private[spark] object Utils extends Logging { * @param skipClass Function that is used to exclude non-user-code classes. */ def getCallSite(skipClass: String => Boolean = coreExclusionFunction): CallSite = { - val trace = Thread.currentThread.getStackTrace().filterNot { ste: StackTraceElement => + val trace = Thread.currentThread.getStackTrace.filterNot { ste: StackTraceElement => // When running under some profilers, the current stack trace might contain some bogus // frames. This is intended to ensure that we don't crash in these situations by // ignoring any frames that we can't examine. @@ -1117,7 +1116,7 @@ private[spark] object Utils extends Logging { } /** Returns a copy of the system properties that is thread-safe to iterator over. 
*/ - def getSystemProperties(): Map[String, String] = { + def getSystemProperties: Map[String, String] = { System.getProperties.clone().asInstanceOf[java.util.Properties].toMap[String, String] } @@ -1165,10 +1164,10 @@ private[spark] object Utils extends Logging { * @param dst relative path for the destination */ def symlink(src: File, dst: File) { - if (!src.isAbsolute()) { + if (!src.isAbsolute) { throw new IOException("Source must be absolute") } - if (dst.isAbsolute()) { + if (dst.isAbsolute) { throw new IOException("Destination must be relative") } var cmdSuffix = "" @@ -1181,7 +1180,7 @@ private[spark] object Utils extends Logging { "ln -sf " } import scala.sys.process._ - (linkCmd + src.getAbsolutePath() + " " + dst.getPath() + cmdSuffix) lines_! + (linkCmd + src.getAbsolutePath + " " + dst.getPath + cmdSuffix) lines_! ProcessLogger(line => logInfo(line)) } diff --git a/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala index e6ab538d77bcc..c92ebaa0d5aa6 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala @@ -78,7 +78,7 @@ package object testPackage extends Assertions { def runCallSiteTest(sc: SparkContext) { val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) val rddCreationSite = rdd.getCreationSite - val curCallSite = sc.getCallSite().shortForm // note: 2 lines after definition of "rdd" + val curCallSite = sc.getCallSite.shortForm // note: 2 lines after definition of "rdd" val rddCreationLine = rddCreationSite match { case CALL_SITE_REGEX(func, file, line) => { diff --git a/examples/src/main/scala/org/apache/spark/examples/GroupByTest.scala b/examples/src/main/scala/org/apache/spark/examples/GroupByTest.scala index 15f6678648b29..36db774f2ba25 100644 --- a/examples/src/main/scala/org/apache/spark/examples/GroupByTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/GroupByTest.scala @@ -19,25 +19,25 @@ package org.apache.spark.examples import java.util.Random -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ +import org.apache.spark.{SparkConf, SparkContext} /** - * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] - */ + * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] + */ object GroupByTest { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("GroupBy Test") - var numMappers = if (args.length > 0) args(0).toInt else 2 - var numKVPairs = if (args.length > 1) args(1).toInt else 1000 - var valSize = if (args.length > 2) args(2).toInt else 1000 - var numReducers = if (args.length > 3) args(3).toInt else numMappers + val numMappers = if (args.length > 0) args(0).toInt else 2 + val numKVPairs = if (args.length > 1) args(1).toInt else 1000 + val valSize = if (args.length > 2) args(2).toInt else 1000 + val numReducers = if (args.length > 3) args(3).toInt else numMappers val sc = new SparkContext(sparkConf) val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random - var arr1 = new Array[(Int, Array[Byte])](numKVPairs) + val arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) @@ -45,6 +45,7 @@ object GroupByTest { } arr1 }.cache() + // Enforce that everything has been calculated and in cache pairs1.count() diff --git 
a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index 65f7ccd318684..fa5762699a680 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -20,18 +20,17 @@ package org.apache.spark.streaming.dstream import java.io.{IOException, ObjectInputStream, ObjectOutputStream} -import scala.deprecated -import scala.collection.mutable.HashMap -import scala.reflect.ClassTag -import scala.util.matching.Regex - -import org.apache.spark.{Logging, SparkException} import org.apache.spark.rdd.{BlockRDD, RDD} import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming._ import org.apache.spark.streaming.StreamingContext._ +import org.apache.spark.streaming._ import org.apache.spark.streaming.scheduler.Job import org.apache.spark.util.{CallSite, MetadataCleaner} +import org.apache.spark.{Logging, SparkException} + +import scala.collection.mutable.HashMap +import scala.reflect.ClassTag +import scala.util.matching.Regex /** * A Discretized Stream (DStream), the basic abstraction in Spark Streaming, is a continuous @@ -291,7 +290,7 @@ abstract class DStream[T: ClassTag] ( // Note that this `getOrCompute` may get called from another DStream which may have // set its own call site. So we store its call site in a temporary variable, // set this DStream's creation site, generate RDDs and then restore the previous call site. - val prevCallSite = ssc.sparkContext.getCallSite() + val prevCallSite = ssc.sparkContext.getCallSite ssc.sparkContext.setCallSite(creationSite) val rddOption = compute(time) ssc.sparkContext.setCallSite(prevCallSite) From 237bacc3d1c911e6475da4ad08dcd4d0031883ec Mon Sep 17 00:00:00 2001 From: shijinkui Date: Wed, 8 Oct 2014 11:05:01 +0800 Subject: [PATCH 3/6] resolve conflict --- core/src/main/scala/org/apache/spark/Aggregator.scala | 2 +- .../scala/org/apache/spark/scheduler/DAGScheduler.scala | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Aggregator.scala b/core/src/main/scala/org/apache/spark/Aggregator.scala index 6f20adf66e479..133a89df9b83e 100644 --- a/core/src/main/scala/org/apache/spark/Aggregator.scala +++ b/core/src/main/scala/org/apache/spark/Aggregator.scala @@ -34,7 +34,7 @@ case class Aggregator[K, V, C] ( mergeValue: (C, V) => C, mergeCombiners: (C, C) => C) { - private val externalSorting = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true) + private val externalSorting = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", defaultValue = true) @deprecated("use combineValuesByKey with TaskContext argument", "0.9.0") def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]): Iterator[(K, C)] = diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 98dcf61115e7f..fa52a4211ea29 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -22,7 +22,7 @@ import java.util.Properties import java.util.concurrent.atomic.AtomicInteger import scala.collection.mutable -import scala.collection.mutable.{ArrayBuffer, HashSet} +import scala.collection.mutable.{ArrayBuffer} import scala.concurrent.Await import scala.concurrent.duration._ import scala.language.postfixOps @@ -223,7 +223,7 @@ class
DAGScheduler( shuffleDep.rdd, shuffleDep.rdd.partitions.size, shuffleDep, jobId, shuffleDep.rdd.creationSite) shuffleToMapStage(shuffleDep.shuffleId) = stage - + stage } } @@ -1200,7 +1200,7 @@ class DAGScheduler( logError("No stages registered for job " + job.jobId) } stages.foreach { stageId => - val jobsForStage: Option[HashSet[Int]] = stageIdToStage.get(stageId).map(_.jobIds) + val jobsForStage: Option[mutable.HashSet[Int]] = stageIdToStage.get(stageId).map(_.jobIds) if (jobsForStage.isEmpty || !jobsForStage.get.contains(job.jobId)) { logError( "Job %d not registered for stage %d even though that stage was registered for the job" @@ -1245,7 +1245,7 @@ class DAGScheduler( // We are manually maintaining a stack here to prevent StackOverflowError // caused by recursively visiting val waitingForVisit = new mutable.Stack[RDD[_]] - def visit(rdd: RDD[_]) { + def visit(rdd: RDD[_]) { if (!visitedRdds(rdd)) { visitedRdds += rdd for (dep <- rdd.dependencies) { From 78f69b9526ea546251dac5eddf4da9c9eb6e20ad Mon Sep 17 00:00:00 2001 From: shijinkui Date: Wed, 8 Oct 2014 11:31:24 +0800 Subject: [PATCH 4/6] code format --- .../scala/org/apache/spark/SparkContext.scala | 78 ++++++++++++++++++- .../apache/spark/api/python/PythonRDD.scala | 2 +- .../deploy/master/ui/ApplicationPage.scala | 2 +- .../shuffle/hash/HashShuffleWriter.scala | 2 +- .../org/apache/spark/util/JsonProtocol.scala | 2 +- .../spark/graphx/lib/TriangleCount.scala | 2 +- .../receiver/ReceiverSupervisor.scala | 27 ++++--- 7 files changed, 93 insertions(+), 22 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index c12a22f4583f3..96d574fca1de7 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -22,8 +22,16 @@ import java.net.URI import java.util.UUID.randomUUID import java.util.concurrent.atomic.AtomicInteger import java.util.{Properties, UUID} +<<<<<<< HEAD import akka.actor.Props +======= +import java.util.UUID.randomUUID +import scala.collection.{mutable, Map, Set} +import scala.collection.JavaConversions._ +import scala.collection.generic.Growable +import scala.reflect.{ClassTag, classTag} +>>>>>>> code format import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} @@ -112,7 +120,12 @@ class SparkContext(config: SparkConf) extends Logging { sparkHome: String = null, jars: Seq[String] = Nil, environment: Map[String, String] = Map(), +<<<<<<< HEAD preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map()) = { +======= + preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map()) = + { +>>>>>>> code format this(SparkContext.updatedConf(new SparkConf(), master, appName, sparkHome, jars, environment)) this.preferredNodeLocationData = preferredNodeLocationData } @@ -587,9 +600,15 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. 
*/ def hadoopFile[K, V, F <: InputFormat[K, V]] +<<<<<<< HEAD (path: String, minPartitions: Int) (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { hadoopFile(path, +======= + (path: String, minPartitions: Int) + (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { + hadoopFile(path, +>>>>>>> code format fm.runtimeClass.asInstanceOf[Class[F]], km.runtimeClass.asInstanceOf[Class[K]], vm.runtimeClass.asInstanceOf[Class[V]], @@ -610,13 +629,22 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def hadoopFile[K, V, F <: InputFormat[K, V]](path: String) +<<<<<<< HEAD (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = +======= + (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = +>>>>>>> code format hadoopFile[K, V, F](path, defaultMinPartitions) /** Get an RDD for a Hadoop file with an arbitrary new API InputFormat. */ def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]] +<<<<<<< HEAD (path: String) (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { +======= + (path: String) + (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { +>>>>>>> code format newAPIHadoopFile( path, fm.runtimeClass.asInstanceOf[Class[F]], @@ -685,7 +713,12 @@ class SparkContext(config: SparkConf) extends Logging { * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. * */ +<<<<<<< HEAD def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V]): RDD[(K, V)] = +======= + def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V] + ): RDD[(K, V)] = +>>>>>>> code format sequenceFile(path, keyClass, valueClass, defaultMinPartitions) /** @@ -710,10 +743,17 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def sequenceFile[K, V] +<<<<<<< HEAD (path: String, minPartitions: Int = defaultMinPartitions) (implicit km: ClassTag[K], vm: ClassTag[V], kcf: () => WritableConverter[K], vcf: () => WritableConverter[V]) : RDD[(K, V)] = { +======= + (path: String, minPartitions: Int = defaultMinPartitions) + (implicit km: ClassTag[K], vm: ClassTag[V], + kcf: () => WritableConverter[K], vcf: () => WritableConverter[V]) + : RDD[(K, V)] = { +>>>>>>> code format val kc = kcf() val vc = vcf() val format = classOf[SequenceFileInputFormat[Writable, Writable]] @@ -740,8 +780,13 @@ class SparkContext(config: SparkConf) extends Logging { } protected[spark] def checkpointFile[T: ClassTag]( +<<<<<<< HEAD path: String ): RDD[T] = { +======= + path: String + ): RDD[T] = { +>>>>>>> code format new CheckpointRDD[T](this, path) } @@ -799,7 +844,11 @@ class SparkContext(config: SparkConf) extends Logging { * standard mutable collections. So you can use this with mutable Map, Set, etc. 
*/ def accumulableCollection[R <% Growable[T] with TraversableOnce[T] with Serializable: ClassTag, T] +<<<<<<< HEAD (initialValue: R): Accumulable[R, T] = { +======= + (initialValue: R): Accumulable[R, T] = { +>>>>>>> code format val param = new GrowableAccumulableParam[R,T] new Accumulable(initialValue, param) } @@ -1180,13 +1229,16 @@ class SparkContext(config: SparkConf) extends Logging { func: (TaskContext, Iterator[T]) => U, evaluator: ApproximateEvaluator[U, R], timeout: Long): PartialResult[R] = { +<<<<<<< HEAD val callSite = getCallSite +======= + val callSite = getCallSite() +>>>>>>> code format logInfo("Starting job: " + callSite.shortForm) val start = System.nanoTime val result = dagScheduler.runApproximateJob(rdd, func, evaluator, callSite, timeout, localProperties.get) - logInfo( - "Job finished: " + callSite.shortForm + ", took " + (System.nanoTime - start) / 1e9 + " s") + logInfo("Job finished: " + callSite.shortForm + ", took " + (System.nanoTime - start) / 1e9 + " s") result } @@ -1202,7 +1254,7 @@ class SparkContext(config: SparkConf) extends Logging { resultHandler: (Int, U) => Unit, resultFunc: => R): SimpleFutureAction[R] = { val cleanF = clean(processPartition) - val callSite = getCallSite + val callSite = getCallSite() val waiter = dagScheduler.submitJob( rdd, (context: TaskContext, iter: Iterator[T]) => cleanF(iter), @@ -1356,15 +1408,26 @@ object SparkContext extends Logging { // TODO: Add AccumulatorParams for other types, e.g. lists and strings +<<<<<<< HEAD implicit def rddToPairRDDFunctions[K, V](rdd: RDD[(K, V)]) (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K] = null) = { new PairRDDFunctions(rdd) +======= + implicit def rddToPairRDDFunctions[K, V] + (rdd: RDD[(K, V)]) + (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K] = null) + = {new PairRDDFunctions(rdd) +>>>>>>> code format } implicit def rddToAsyncRDDActions[T: ClassTag](rdd: RDD[T]) = new AsyncRDDActions(rdd) implicit def rddToSequenceFileRDDFunctions[K <% Writable: ClassTag, V <% Writable: ClassTag]( +<<<<<<< HEAD rdd: RDD[(K, V)]) = new SequenceFileRDDFunctions(rdd) +======= + rdd: RDD[(K, V)]) = new SequenceFileRDDFunctions(rdd) +>>>>>>> code format implicit def rddToOrderedRDDFunctions[K : Ordering : ClassTag, V: ClassTag]( rdd: RDD[(K, V)]) = new OrderedRDDFunctions[K, V, (K, V)](rdd) @@ -1464,12 +1527,21 @@ object SparkContext extends Logging { * like SparkConf would. 
*/ private[spark] def updatedConf( +<<<<<<< HEAD conf: SparkConf, master: String, appName: String, sparkHome: String = null, jars: Seq[String] = Nil, environment: Map[String, String] = Map()): SparkConf = +======= + conf: SparkConf, + master: String, + appName: String, + sparkHome: String = null, + jars: Seq[String] = Nil, + environment: Map[String, String] = Map()): SparkConf = +>>>>>>> code format { val res = conf.clone() res.setMaster(master) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index c74f86548ef85..400e3a1fc9ddf 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -135,7 +135,7 @@ private[spark] class PythonRDD( val obj = new Array[Byte](exLength) stream.readFully(obj) throw new PythonException(new String(obj, "utf-8"), - writerThread.exception.getOrElse(null)) + writerThread.exception.orNull) case SpecialLengths.END_OF_DATA_SECTION => // We've finished the data section of the output, but we can still // read some accumulator updates: diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala index 4588c130ef439..b1d37b9282029 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala @@ -42,7 +42,7 @@ private[spark] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app val stateFuture = (master ? RequestMasterState)(timeout).mapTo[MasterStateResponse] val state = Await.result(stateFuture, timeout) val app = state.activeApps.find(_.id == appId).getOrElse({ - state.completedApps.find(_.id == appId).getOrElse(null) + state.completedApps.find(_.id == appId).orNull }) JsonProtocol.writeApplicationInfo(app) } diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala index 746ed33b54c00..2f99b69f1beb1 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala @@ -44,7 +44,7 @@ private[spark] class HashShuffleWriter[K, V]( metrics.shuffleWriteMetrics = Some(writeMetrics) private val blockManager = SparkEnv.get.blockManager - private val ser = Serializer.getSerializer(dep.serializer.getOrElse(null)) + private val ser = Serializer.getSerializer(dep.serializer.orNull) private val shuffle = shuffleBlockManager.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser, writeMetrics) diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 5b2e7d3a7edb9..a88997b0742b8 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -710,7 +710,7 @@ private[spark] object JsonProtocol { val properties = new Properties mapFromJson(json).foreach { case (k, v) => properties.setProperty(k, v) } properties - }.getOrElse(null) + }.orNull } def UUIDFromJson(json: JValue): UUID = { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala index 7c396e6e66a28..289d415cd6267 100644 --- 
a/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala @@ -58,7 +58,7 @@ object TriangleCount { } // join the sets with the graph val setGraph: Graph[VertexSet, ED] = g.outerJoinVertices(nbrSets) { - (vid, _, optSet) => optSet.getOrElse(null) + (vid, _, optSet) => optSet.orNull } // Edge function computes intersection of smaller vertex with larger vertex def edgeFunc(et: EdgeTriplet[VertexSet, ED]): Iterator[(VertexId, Int)] = { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala index 1f0244c251eba..6d95d0fd6b9ba 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala @@ -18,14 +18,14 @@ package org.apache.spark.streaming.receiver import java.nio.ByteBuffer +import java.util.concurrent.CountDownLatch -import scala.collection.mutable.ArrayBuffer - -import org.apache.spark.{Logging, SparkConf} import org.apache.spark.storage.StreamBlockId -import java.util.concurrent.CountDownLatch +import org.apache.spark.{Logging, SparkConf} + +import scala.collection.mutable.ArrayBuffer +import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent._ -import ExecutionContext.Implicits.global /** * Abstract class that is responsible for supervising a Receiver in the worker. @@ -41,6 +41,7 @@ private[streaming] abstract class ReceiverSupervisor( type CheckpointState = Value val Initialized, Started, Stopped = Value } + import ReceiverState._ // Attach the executor to the receiver @@ -89,16 +90,16 @@ private[streaming] abstract class ReceiverSupervisor( def reportError(message: String, throwable: Throwable) /** Called when supervisor is started */ - protected def onStart() { } + protected def onStart() {} /** Called when supervisor is stopped */ - protected def onStop(message: String, error: Option[Throwable]) { } + protected def onStop(message: String, error: Option[Throwable]) {} /** Called when receiver is started */ - protected def onReceiverStart() { } + protected def onReceiverStart() {} /** Called when receiver is stopped */ - protected def onReceiverStop(message: String, error: Option[Throwable]) { } + protected def onReceiverStop(message: String, error: Option[Throwable]) {} /** Start the supervisor */ def start() { @@ -150,8 +151,7 @@ private[streaming] abstract class ReceiverSupervisor( /** Restart receiver with delay */ def restartReceiver(message: String, error: Option[Throwable], delay: Int) { Future { - logWarning("Restarting receiver with delay " + delay + " ms: " + message, - error.getOrElse(null)) + logWarning("Restarting receiver with delay " + delay + " ms: " + message, error.orNull) stopReceiver("Restarting receiver with delay " + delay + "ms: " + message, error) logDebug("Sleeping for " + delay) Thread.sleep(delay) @@ -162,18 +162,17 @@ private[streaming] abstract class ReceiverSupervisor( } /** Check if receiver has been marked for stopping */ - def isReceiverStarted() = { + def isReceiverStarted = { logDebug("state = " + receiverState) receiverState == Started } /** Check if receiver has been marked for stopping */ - def isReceiverStopped() = { + def isReceiverStopped = { logDebug("state = " + receiverState) receiverState == Stopped } - /** Wait the thread until the supervisor is stopped */ def 
awaitTermination() { stopLatch.await() From 725eec51fb7d29d0df99b92df3ef62fcec301d90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=84=E7=95=85?= Date: Wed, 8 Oct 2014 22:42:29 +0800 Subject: [PATCH 5/6] resolve test fail --- .../scala/org/apache/spark/SparkContext.scala | 81 ++----------------- .../spark/streaming/receiver/Receiver.scala | 4 +- 2 files changed, 7 insertions(+), 78 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 96d574fca1de7..eccb6f7aefaba 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -22,16 +22,13 @@ import java.net.URI import java.util.UUID.randomUUID import java.util.concurrent.atomic.AtomicInteger import java.util.{Properties, UUID} -<<<<<<< HEAD import akka.actor.Props -======= import java.util.UUID.randomUUID import scala.collection.{mutable, Map, Set} import scala.collection.JavaConversions._ import scala.collection.generic.Growable import scala.reflect.{ClassTag, classTag} ->>>>>>> code format import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} @@ -120,12 +117,7 @@ class SparkContext(config: SparkConf) extends Logging { sparkHome: String = null, jars: Seq[String] = Nil, environment: Map[String, String] = Map(), -<<<<<<< HEAD preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map()) = { -======= - preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map()) = - { ->>>>>>> code format this(SparkContext.updatedConf(new SparkConf(), master, appName, sparkHome, jars, environment)) this.preferredNodeLocationData = preferredNodeLocationData } @@ -600,15 +592,9 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def hadoopFile[K, V, F <: InputFormat[K, V]] -<<<<<<< HEAD (path: String, minPartitions: Int) (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { hadoopFile(path, -======= - (path: String, minPartitions: Int) - (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { - hadoopFile(path, ->>>>>>> code format fm.runtimeClass.asInstanceOf[Class[F]], km.runtimeClass.asInstanceOf[Class[K]], vm.runtimeClass.asInstanceOf[Class[V]], @@ -629,22 +615,13 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def hadoopFile[K, V, F <: InputFormat[K, V]](path: String) -<<<<<<< HEAD - (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = -======= - (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = ->>>>>>> code format + (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = hadoopFile[K, V, F](path, defaultMinPartitions) /** Get an RDD for a Hadoop file with an arbitrary new API InputFormat. */ def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]] -<<<<<<< HEAD (path: String) (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { -======= - (path: String) - (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = { ->>>>>>> code format newAPIHadoopFile( path, fm.runtimeClass.asInstanceOf[Class[F]], @@ -713,12 +690,7 @@ class SparkContext(config: SparkConf) extends Logging { * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. 
* */ -<<<<<<< HEAD def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V]): RDD[(K, V)] = -======= - def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V] - ): RDD[(K, V)] = ->>>>>>> code format sequenceFile(path, keyClass, valueClass, defaultMinPartitions) /** @@ -743,17 +715,10 @@ class SparkContext(config: SparkConf) extends Logging { * a `map` function. */ def sequenceFile[K, V] -<<<<<<< HEAD (path: String, minPartitions: Int = defaultMinPartitions) (implicit km: ClassTag[K], vm: ClassTag[V], kcf: () => WritableConverter[K], vcf: () => WritableConverter[V]) : RDD[(K, V)] = { -======= - (path: String, minPartitions: Int = defaultMinPartitions) - (implicit km: ClassTag[K], vm: ClassTag[V], - kcf: () => WritableConverter[K], vcf: () => WritableConverter[V]) - : RDD[(K, V)] = { ->>>>>>> code format val kc = kcf() val vc = vcf() val format = classOf[SequenceFileInputFormat[Writable, Writable]] @@ -779,14 +744,7 @@ class SparkContext(config: SparkConf) extends Logging { .flatMap(x => Utils.deserialize[Array[T]](x._2.getBytes, Utils.getContextOrSparkClassLoader)) } - protected[spark] def checkpointFile[T: ClassTag]( -<<<<<<< HEAD - path: String - ): RDD[T] = { -======= - path: String - ): RDD[T] = { ->>>>>>> code format + protected[spark] def checkpointFile[T: ClassTag](path: String): RDD[T] = { new CheckpointRDD[T](this, path) } @@ -844,11 +802,7 @@ class SparkContext(config: SparkConf) extends Logging { * standard mutable collections. So you can use this with mutable Map, Set, etc. */ def accumulableCollection[R <% Growable[T] with TraversableOnce[T] with Serializable: ClassTag, T] -<<<<<<< HEAD (initialValue: R): Accumulable[R, T] = { -======= - (initialValue: R): Accumulable[R, T] = { ->>>>>>> code format val param = new GrowableAccumulableParam[R,T] new Accumulable(initialValue, param) } @@ -1229,11 +1183,7 @@ class SparkContext(config: SparkConf) extends Logging { func: (TaskContext, Iterator[T]) => U, evaluator: ApproximateEvaluator[U, R], timeout: Long): PartialResult[R] = { -<<<<<<< HEAD val callSite = getCallSite -======= - val callSite = getCallSite() ->>>>>>> code format logInfo("Starting job: " + callSite.shortForm) val start = System.nanoTime val result = dagScheduler.runApproximateJob(rdd, func, evaluator, callSite, timeout, @@ -1254,7 +1204,7 @@ class SparkContext(config: SparkConf) extends Logging { resultHandler: (Int, U) => Unit, resultFunc: => R): SimpleFutureAction[R] = { val cleanF = clean(processPartition) - val callSite = getCallSite() + val callSite = getCallSite val waiter = dagScheduler.submitJob( rdd, (context: TaskContext, iter: Iterator[T]) => cleanF(iter), @@ -1407,27 +1357,15 @@ object SparkContext extends Logging { } // TODO: Add AccumulatorParams for other types, e.g. 
lists and strings - -<<<<<<< HEAD implicit def rddToPairRDDFunctions[K, V](rdd: RDD[(K, V)]) (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K] = null) = { new PairRDDFunctions(rdd) -======= - implicit def rddToPairRDDFunctions[K, V] - (rdd: RDD[(K, V)]) - (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K] = null) - = {new PairRDDFunctions(rdd) ->>>>>>> code format } implicit def rddToAsyncRDDActions[T: ClassTag](rdd: RDD[T]) = new AsyncRDDActions(rdd) implicit def rddToSequenceFileRDDFunctions[K <% Writable: ClassTag, V <% Writable: ClassTag]( -<<<<<<< HEAD - rdd: RDD[(K, V)]) = new SequenceFileRDDFunctions(rdd) -======= rdd: RDD[(K, V)]) = new SequenceFileRDDFunctions(rdd) ->>>>>>> code format implicit def rddToOrderedRDDFunctions[K : Ordering : ClassTag, V: ClassTag]( rdd: RDD[(K, V)]) = new OrderedRDDFunctions[K, V, (K, V)](rdd) @@ -1527,22 +1465,13 @@ object SparkContext extends Logging { * like SparkConf would. */ private[spark] def updatedConf( -<<<<<<< HEAD - conf: SparkConf, - master: String, - appName: String, - sparkHome: String = null, - jars: Seq[String] = Nil, - environment: Map[String, String] = Map()): SparkConf = -======= conf: SparkConf, master: String, appName: String, sparkHome: String = null, jars: Seq[String] = Nil, - environment: Map[String, String] = Map()): SparkConf = ->>>>>>> code format - { + environment: Map[String, String] = Map()) + : SparkConf = { val res = conf.clone() res.setMaster(master) res.setAppName(appName) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala index 5acf8a9a811ee..fe83f7313faba 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala @@ -230,7 +230,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable /** Check if the receiver has started or not. */ def isStarted(): Boolean = { - executor.isReceiverStarted() + executor.isReceiverStarted } /** @@ -238,7 +238,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * the receiving of data should be stopped. 
*/ def isStopped(): Boolean = { - executor.isReceiverStopped() + executor.isReceiverStopped } /** From e54344b33b4c8cb4c1ff0dfb18a08188de464cfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=84=E7=95=85?= Date: Thu, 9 Oct 2014 16:55:18 +0800 Subject: [PATCH 6/6] code format --- .../scala/org/apache/spark/SparkFiles.scala | 2 +- .../org/apache/spark/executor/Executor.scala | 24 ++-- .../apache/spark/scheduler/DAGScheduler.scala | 2 +- .../spark/scheduler/DAGSchedulerEvent.scala | 36 +++--- .../spark/scheduler/DAGSchedulerSource.scala | 6 +- .../scheduler/EventLoggingListener.scala | 106 +++++++++--------- .../spark/scheduler/ExecutorLossReason.scala | 2 +- .../spark/scheduler/InputFormatInfo.scala | 16 +-- .../apache/spark/scheduler/JobListener.scala | 1 + .../apache/spark/scheduler/JobLogger.scala | 36 +++--- .../org/apache/spark/scheduler/Pool.scala | 7 +- .../spark/scheduler/SchedulableBuilder.scala | 8 +- .../spark/scheduler/SparkListenerBus.scala | 6 +- .../apache/spark/scheduler/SplitInfo.scala | 10 +- .../apache/spark/scheduler/StageInfo.scala | 18 +-- .../org/apache/spark/scheduler/Task.scala | 32 +++--- .../spark/scheduler/TaskDescription.scala | 11 +- .../apache/spark/scheduler/TaskLocation.scala | 10 +- .../apache/spark/scheduler/TaskResult.scala | 11 +- .../spark/scheduler/TaskResultGetter.scala | 15 +-- .../spark/scheduler/TaskScheduler.scala | 4 +- .../spark/scheduler/TaskSchedulerImpl.scala | 19 ++-- .../org/apache/spark/scheduler/TaskSet.scala | 13 ++- .../spark/scheduler/TaskSetManager.scala | 104 +++++++++-------- .../apache/spark/scheduler/WorkerOffer.scala | 3 +- .../cluster/CoarseGrainedClusterMessage.scala | 4 +- .../CoarseGrainedSchedulerBackend.scala | 24 ++-- .../scheduler/cluster/ExecutorData.scala | 2 +- .../cluster/SimrSchedulerBackend.scala | 9 +- .../cluster/SparkDeploySchedulerBackend.scala | 12 +- .../mesos/CoarseMesosSchedulerBackend.scala | 11 +- .../scheduler/cluster/mesos/MemoryUtils.scala | 2 +- .../cluster/mesos/MesosSchedulerBackend.scala | 11 +- .../spark/scheduler/local/LocalBackend.scala | 3 +- .../spark/serializer/JavaSerializer.scala | 23 ++-- .../spark/serializer/KryoSerializer.scala | 27 ++--- .../apache/spark/serializer/Serializer.scala | 11 +- .../org/apache/spark/util/AkkaUtils.scala | 4 +- .../apache/spark/SparkContextInfoSuite.scala | 2 +- 39 files changed, 326 insertions(+), 321 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkFiles.scala b/core/src/main/scala/org/apache/spark/SparkFiles.scala index e85b89fd014ef..eb38404c9a47b 100644 --- a/core/src/main/scala/org/apache/spark/SparkFiles.scala +++ b/core/src/main/scala/org/apache/spark/SparkFiles.scala @@ -28,7 +28,7 @@ object SparkFiles { * Get the absolute path of a file added through `SparkContext.addFile()`. */ def get(filename: String): String = - new File(getRootDirectory(), filename).getAbsolutePath() + new File(getRootDirectory(), filename).getAbsolutePath /** * Get the root directory that contains files added through `SparkContext.addFile()`. 
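Many hunks in this series, including the SparkFiles change above, drop the empty parameter list from parameterless methods that merely read state. A minimal sketch of the underlying Scala convention (illustrative names only, not Spark code):

    class Counter {
      private var count = 0

      // Pure accessor: no parentheses at the declaration or the call site.
      def current: Int = count

      // Side-effecting method: keeps "()" so call sites signal the effect.
      def increment(): Unit = { count += 1 }
    }

    object ConventionDemo extends App {
      val c = new Counter
      c.increment()        // effectful call, written with parentheses
      println(c.current)   // prints 1, read without parentheses
    }

Keeping the empty parameter list only on side-effecting methods is why accessors such as getAbsolutePath lose their parentheses in these hunks while effectful calls keep them.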
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 616c7e6a46368..fdf089d8f7f93 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -23,6 +23,7 @@ import java.nio.ByteBuffer import java.util.concurrent._ import scala.collection.JavaConversions._ +import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.util.control.NonFatal @@ -41,12 +42,11 @@ private[spark] class Executor( slaveHostname: String, properties: Seq[(String, String)], isLocal: Boolean = false) - extends Logging -{ + extends Logging { // Application dependencies (added through SparkContext) that we've fetched so far on this node. // Each map holds the master's timestamp for the version of that file or JAR we got. - private val currentFiles: HashMap[String, Long] = new HashMap[String, Long]() - private val currentJars: HashMap[String, Long] = new HashMap[String, Long]() + private val currentFiles: mutable.HashMap[String, Long] = new mutable.HashMap[String, Long]() + private val currentJars: mutable.HashMap[String, Long] = new mutable.HashMap[String, Long]() private val EMPTY_BYTE_BUFFER = ByteBuffer.wrap(new Array[Byte](0)) @@ -278,7 +278,7 @@ private[spark] class Executor( val urls = currentJars.keySet.map { uri => new File(uri.split("/").last).toURI.toURL }.toArray - val userClassPathFirst = conf.getBoolean("spark.files.userClassPathFirst", false) + val userClassPathFirst = conf.getBoolean("spark.files.userClassPathFirst", defaultValue = false) userClassPathFirst match { case true => new ChildExecutorURLClassLoader(urls, currentLoader) case false => new ExecutorURLClassLoader(urls, currentLoader) @@ -294,7 +294,7 @@ private[spark] class Executor( if (classUri != null) { logInfo("Using REPL class URI: " + classUri) val userClassPathFirst: java.lang.Boolean = - conf.getBoolean("spark.files.userClassPathFirst", false) + conf.getBoolean("spark.files.userClassPathFirst", defaultValue = false) try { val klass = Class.forName("org.apache.spark.repl.ExecutorClassLoader") .asInstanceOf[Class[_ <: ClassLoader]] @@ -316,24 +316,24 @@ private[spark] class Executor( * Download any missing dependencies if we receive a new set of files and JARs from the * SparkContext. Also adds any new JARs we fetched to the class loader. 
*/ - private def updateDependencies(newFiles: HashMap[String, Long], newJars: HashMap[String, Long]) { + private def updateDependencies(newFiles: mutable.HashMap[String, Long], newJars: mutable.HashMap[String, Long]) { val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) synchronized { // Fetch missing dependencies for ((name, timestamp) <- newFiles if currentFiles.getOrElse(name, -1L) < timestamp) { logInfo("Fetching " + name + " with timestamp " + timestamp) - Utils.fetchFile(name, new File(SparkFiles.getRootDirectory), conf, env.securityManager, + Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf, env.securityManager, hadoopConf) currentFiles(name) = timestamp } for ((name, timestamp) <- newJars if currentJars.getOrElse(name, -1L) < timestamp) { logInfo("Fetching " + name + " with timestamp " + timestamp) - Utils.fetchFile(name, new File(SparkFiles.getRootDirectory), conf, env.securityManager, + Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf, env.securityManager, hadoopConf) currentJars(name) = timestamp // Add it to our class loader val localName = name.split("/").last - val url = new File(SparkFiles.getRootDirectory, localName).toURI.toURL + val url = new File(SparkFiles.getRootDirectory(), localName).toURI.toURL if (!urlClassLoader.getURLs.contains(url)) { logInfo("Adding " + url + " to class loader") urlClassLoader.addURL(url) @@ -357,9 +357,9 @@ private[spark] class Executor( while (!isStopped) { val tasksMetrics = new ArrayBuffer[(Long, TaskMetrics)]() for (taskRunner <- runningTasks.values()) { - if (!taskRunner.attemptedTask.isEmpty) { + if (taskRunner.attemptedTask.nonEmpty) { Option(taskRunner.task).flatMap(_.metrics).foreach { metrics => - metrics.updateShuffleReadMetrics + metrics.updateShuffleReadMetrics() if (isLocal) { // JobProgressListener will hold an reference of it during // onExecutorMetricsUpdate(), then JobProgressListener can not see diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index fa52a4211ea29..088a9d7b71386 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1176,7 +1176,7 @@ class DAGScheduler( activeJobs.filter(job => stageDependsOn(job.finalStage, failedStage)).toSeq failedStage.latestInfo.completionTime = Some(clock.getTime()) for (job <- dependentJobs) { - failJobAndIndependentStages(job, s"Job aborted due to stage failure: $reason") + failJobAndIndependentStages(job, "Job aborted due to stage failure: $reason") } if (dependentJobs.isEmpty) { logInfo("Ignoring failure of " + failedStage + " because all jobs depending on it are done") diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala index 2b6f7e4205c32..dc667388bb29b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala @@ -19,14 +19,14 @@ package org.apache.spark.scheduler import java.util.Properties -import scala.collection.mutable.Map -import scala.language.existentials - import org.apache.spark._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.rdd.RDD import org.apache.spark.util.CallSite +import scala.collection.mutable +import scala.language.existentials + /** * Types of events that can be handled by the DAGScheduler. 
The DAGScheduler uses an event queue * architecture where any thread can post an event (e.g. a task finishing or a new job being @@ -36,15 +36,14 @@ import org.apache.spark.util.CallSite private[scheduler] sealed trait DAGSchedulerEvent private[scheduler] case class JobSubmitted( - jobId: Int, - finalRDD: RDD[_], - func: (TaskContext, Iterator[_]) => _, - partitions: Array[Int], - allowLocal: Boolean, - callSite: CallSite, - listener: JobListener, - properties: Properties = null) - extends DAGSchedulerEvent + jobId: Int, + finalRDD: RDD[_], + func: (TaskContext, Iterator[_]) => _, + partitions: Array[Int], + allowLocal: Boolean, + callSite: CallSite, + listener: JobListener, + properties: Properties = null) extends DAGSchedulerEvent private[scheduler] case class StageCancelled(stageId: Int) extends DAGSchedulerEvent @@ -61,13 +60,12 @@ private[scheduler] case class GettingResultEvent(taskInfo: TaskInfo) extends DAGSchedulerEvent private[scheduler] case class CompletionEvent( - task: Task[_], - reason: TaskEndReason, - result: Any, - accumUpdates: Map[Long, Any], - taskInfo: TaskInfo, - taskMetrics: TaskMetrics) - extends DAGSchedulerEvent + task: Task[_], + reason: TaskEndReason, + result: Any, + accumUpdates: mutable.Map[Long, Any], + taskInfo: TaskInfo, + taskMetrics: TaskMetrics) extends DAGSchedulerEvent private[scheduler] case class ExecutorAdded(execId: String, host: String) extends DAGSchedulerEvent diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerSource.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerSource.scala index 12668b6c0988e..b47814f588298 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerSource.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerSource.scala @@ -17,13 +17,11 @@ package org.apache.spark.scheduler -import com.codahale.metrics.{Gauge,MetricRegistry} - -import org.apache.spark.SparkContext +import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.spark.metrics.source.Source private[spark] class DAGSchedulerSource(val dagScheduler: DAGScheduler) - extends Source { + extends Source { override val metricRegistry = new MetricRegistry() override val sourceName = "DAGScheduler" diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index 100c9ba9b7809..460aa922b316e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -17,46 +17,43 @@ package org.apache.spark.scheduler -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer - import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission -import org.json4s.JsonAST.JValue -import org.json4s.jackson.JsonMethods._ - -import org.apache.spark.{Logging, SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.io.CompressionCodec -import org.apache.spark.SPARK_VERSION import org.apache.spark.util.{FileLogger, JsonProtocol, Utils} +import org.apache.spark.{Logging, SPARK_VERSION, SparkConf} +import org.json4s.JsonAST.JValue +import org.json4s.jackson.JsonMethods._ + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer /** * A SparkListener that logs events to persistent storage. 
* * Event logging is specified by the following configurable parameters: - * spark.eventLog.enabled - Whether event logging is enabled. - * spark.eventLog.compress - Whether to compress logged events - * spark.eventLog.overwrite - Whether to overwrite any existing files. - * spark.eventLog.dir - Path to the directory in which events are logged. - * spark.eventLog.buffer.kb - Buffer size to use when writing to output streams + * spark.eventLog.enabled - Whether event logging is enabled. + * spark.eventLog.compress - Whether to compress logged events + * spark.eventLog.overwrite - Whether to overwrite any existing files. + * spark.eventLog.dir - Path to the directory in which events are logged. + * spark.eventLog.buffer.kb - Buffer size to use when writing to output streams */ private[spark] class EventLoggingListener( - appId: String, - logBaseDir: String, - sparkConf: SparkConf, - hadoopConf: Configuration) - extends SparkListener with Logging { + appId: String, + logBaseDir: String, + sparkConf: SparkConf, + hadoopConf: Configuration) extends SparkListener with Logging { - import EventLoggingListener._ + import org.apache.spark.scheduler.EventLoggingListener._ def this(appId: String, logBaseDir: String, sparkConf: SparkConf) = this(appId, logBaseDir, sparkConf, SparkHadoopUtil.get.newConfiguration(sparkConf)) - private val shouldCompress = sparkConf.getBoolean("spark.eventLog.compress", false) - private val shouldOverwrite = sparkConf.getBoolean("spark.eventLog.overwrite", false) - private val testing = sparkConf.getBoolean("spark.eventLog.testing", false) + private val shouldCompress = sparkConf.getBoolean("spark.eventLog.compress", defaultValue = false) + private val shouldOverwrite = sparkConf.getBoolean("spark.eventLog.overwrite", defaultValue = false) + private val testing = sparkConf.getBoolean("spark.eventLog.testing", defaultValue = false) private val outputBufferSize = sparkConf.getInt("spark.eventLog.buffer.kb", 100) * 1024 val logDir = EventLoggingListener.getLogDirPath(logBaseDir, appId) val logDirName: String = logDir.split("/").last @@ -95,36 +92,41 @@ private[spark] class EventLoggingListener( } // Events that do not trigger a flush - override def onStageSubmitted(event: SparkListenerStageSubmitted) = - logEvent(event) - override def onTaskStart(event: SparkListenerTaskStart) = - logEvent(event) + override def onStageSubmitted(event: SparkListenerStageSubmitted) = logEvent(event) + + override def onTaskStart(event: SparkListenerTaskStart) = logEvent(event) + override def onTaskGettingResult(event: SparkListenerTaskGettingResult) = logEvent(event) - override def onTaskEnd(event: SparkListenerTaskEnd) = - logEvent(event) - override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate) = - logEvent(event) + + override def onTaskEnd(event: SparkListenerTaskEnd) = logEvent(event) + + override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate) = logEvent(event) // Events that trigger a flush - override def onStageCompleted(event: SparkListenerStageCompleted) = - logEvent(event, flushLogger = true) - override def onJobStart(event: SparkListenerJobStart) = - logEvent(event, flushLogger = true) - override def onJobEnd(event: SparkListenerJobEnd) = - logEvent(event, flushLogger = true) + override def onStageCompleted(event: SparkListenerStageCompleted) = logEvent(event, flushLogger = true) + + override def onJobStart(event: SparkListenerJobStart) = logEvent(event, flushLogger = true) + + override def onJobEnd(event: SparkListenerJobEnd) = logEvent(event, 
flushLogger = true) + override def onBlockManagerAdded(event: SparkListenerBlockManagerAdded) = logEvent(event, flushLogger = true) + override def onBlockManagerRemoved(event: SparkListenerBlockManagerRemoved) = logEvent(event, flushLogger = true) + override def onUnpersistRDD(event: SparkListenerUnpersistRDD) = logEvent(event, flushLogger = true) + override def onApplicationStart(event: SparkListenerApplicationStart) = logEvent(event, flushLogger = true) + override def onApplicationEnd(event: SparkListenerApplicationEnd) = logEvent(event, flushLogger = true) + // No-op because logging every update would be overkill - override def onExecutorMetricsUpdate(event: SparkListenerExecutorMetricsUpdate) { } + override def onExecutorMetricsUpdate(event: SparkListenerExecutorMetricsUpdate) {} /** * Stop logging events. @@ -207,20 +209,20 @@ private[spark] object EventLoggingListener extends Logging { logWarning("No files found in logging directory %s".format(logDir)) } EventLoggingInfo( - logPaths = filePaths.filter { path => isEventLogFile(path.getName) }, + logPaths = filePaths.filter { path => isEventLogFile(path.getName)}, sparkVersion = filePaths - .find { path => isSparkVersionFile(path.getName) } - .map { path => parseSparkVersion(path.getName) } + .find { path => isSparkVersionFile(path.getName)} + .map { path => parseSparkVersion(path.getName)} .getOrElse(""), compressionCodec = filePaths - .find { path => isCompressionCodecFile(path.getName) } + .find { path => isCompressionCodecFile(path.getName)} .map { path => - val codec = EventLoggingListener.parseCompressionCodec(path.getName) - val conf = new SparkConf - conf.set("spark.io.compression.codec", codec) - codecMap.getOrElseUpdate(codec, CompressionCodec.createCodec(conf)) - }, - applicationComplete = filePaths.exists { path => isApplicationCompleteFile(path.getName) } + val codec = EventLoggingListener.parseCompressionCodec(path.getName) + val conf = new SparkConf + conf.set("spark.io.compression.codec", codec) + codecMap.getOrElseUpdate(codec, CompressionCodec.createCodec(conf)) + }, + applicationComplete = filePaths.exists { path => isApplicationCompleteFile(path.getName)} ) } catch { case e: Exception => @@ -242,10 +244,10 @@ private[spark] object EventLoggingListener extends Logging { * Information needed to process the event logs associated with an application. 
*/ private[spark] case class EventLoggingInfo( - logPaths: Seq[Path], - sparkVersion: String, - compressionCodec: Option[CompressionCodec], - applicationComplete: Boolean = false) + logPaths: Seq[Path], + sparkVersion: String, + compressionCodec: Option[CompressionCodec], + applicationComplete: Boolean = false) private[spark] object EventLoggingInfo { def empty = EventLoggingInfo(Seq[Path](), "", None, applicationComplete = false) diff --git a/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala b/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala index 2bc43a9186449..b0b72958f7a74 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala @@ -28,7 +28,7 @@ class ExecutorLossReason(val message: String) { } private[spark] -case class ExecutorExited(val exitCode: Int) +case class ExecutorExited(exitCode: Int) extends ExecutorLossReason(ExecutorExitCode.explainExitCode(exitCode)) { } diff --git a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala index bac37bfdaa23f..ed77158865fd6 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala @@ -17,19 +17,19 @@ package org.apache.spark.scheduler -import scala.collection.JavaConversions._ -import scala.collection.immutable.Set -import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.util.ReflectionUtils - import org.apache.spark.Logging import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.SparkHadoopUtil +import scala.collection.JavaConversions._ +import scala.collection.immutable.Set +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + /** * :: DeveloperApi :: * Parses and holds information about inputFormat (and files) specified as a parameter. 
@@ -125,7 +125,7 @@ class InputFormatInfo(val configuration: Configuration, val inputFormatClazz: Cl org.apache.hadoop.mapred.InputFormat[_, _]] val retval = new ArrayBuffer[SplitInfo]() - instance.getSplits(jobConf, jobConf.getNumMapTasks()).foreach( + instance.getSplits(jobConf, jobConf.getNumMapTasks).foreach( elem => retval ++= SplitInfo.toSplitInfo(inputFormatClazz, path, elem) ) @@ -169,13 +169,13 @@ object InputFormatInfo { */ def computePreferredLocations(formats: Seq[InputFormatInfo]): Map[String, Set[SplitInfo]] = { - val nodeToSplit = new HashMap[String, HashSet[SplitInfo]] + val nodeToSplit = new mutable.HashMap[String, mutable.HashSet[SplitInfo]] for (inputSplit <- formats) { val splits = inputSplit.findPreferredLocations() for (split <- splits){ val location = split.hostLocation - val set = nodeToSplit.getOrElseUpdate(location, new HashSet[SplitInfo]) + val set = nodeToSplit.getOrElseUpdate(location, new mutable.HashSet[SplitInfo]) set += split } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/JobListener.scala b/core/src/main/scala/org/apache/spark/scheduler/JobListener.scala index 50c2b9acd609c..b6e216cd17ac9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/JobListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/JobListener.scala @@ -24,5 +24,6 @@ package org.apache.spark.scheduler */ private[spark] trait JobListener { def taskSucceeded(index: Int, result: Any) + def jobFailed(exception: Exception) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala index 54904bffdf10b..2fbc5c0aeefb1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala @@ -21,12 +21,12 @@ import java.io.{File, FileNotFoundException, IOException, PrintWriter} import java.text.SimpleDateFormat import java.util.{Date, Properties} -import scala.collection.mutable.HashMap - import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics +import scala.collection.mutable + /** * :: DeveloperApi :: * A logger class to record runtime information for jobs in Spark. 
This class outputs one log file @@ -53,9 +53,9 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener "/tmp/spark-%s".format(user) } - private val jobIdToPrintWriter = new HashMap[Int, PrintWriter] - private val stageIdToJobId = new HashMap[Int, Int] - private val jobIdToStageIds = new HashMap[Int, Seq[Int]] + private val jobIdToPrintWriter = new mutable.HashMap[Int, PrintWriter] + private val stageIdToJobId = new mutable.HashMap[Int, Int] + private val jobIdToStageIds = new mutable.HashMap[Int, Seq[Int]] private val dateFormat = new ThreadLocal[SimpleDateFormat]() { override def initialValue() = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss") } @@ -110,7 +110,7 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener */ protected def buildJobStageDependencies(jobId: Int, stageIds: Seq[Int]) = { jobIdToStageIds(jobId) = stageIds - stageIds.foreach { stageId => stageIdToJobId(stageId) = jobId } + stageIds.foreach { stageId => stageIdToJobId(stageId) = jobId} } /** @@ -146,31 +146,31 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener * @param taskMetrics Task running metrics */ protected def recordTaskMetrics(stageId: Int, status: String, - taskInfo: TaskInfo, taskMetrics: TaskMetrics) { + taskInfo: TaskInfo, taskMetrics: TaskMetrics) { val info = " TID=" + taskInfo.taskId + " STAGE_ID=" + stageId + - " START_TIME=" + taskInfo.launchTime + " FINISH_TIME=" + taskInfo.finishTime + - " EXECUTOR_ID=" + taskInfo.executorId + " HOST=" + taskMetrics.hostname + " START_TIME=" + taskInfo.launchTime + " FINISH_TIME=" + taskInfo.finishTime + + " EXECUTOR_ID=" + taskInfo.executorId + " HOST=" + taskMetrics.hostname val executorRunTime = " EXECUTOR_RUN_TIME=" + taskMetrics.executorRunTime val gcTime = " GC_TIME=" + taskMetrics.jvmGCTime val inputMetrics = taskMetrics.inputMetrics match { case Some(metrics) => " READ_METHOD=" + metrics.readMethod.toString + - " INPUT_BYTES=" + metrics.bytesRead + " INPUT_BYTES=" + metrics.bytesRead case None => "" } val shuffleReadMetrics = taskMetrics.shuffleReadMetrics match { case Some(metrics) => " BLOCK_FETCHED_TOTAL=" + metrics.totalBlocksFetched + - " BLOCK_FETCHED_LOCAL=" + metrics.localBlocksFetched + - " BLOCK_FETCHED_REMOTE=" + metrics.remoteBlocksFetched + - " REMOTE_FETCH_WAIT_TIME=" + metrics.fetchWaitTime + - " REMOTE_BYTES_READ=" + metrics.remoteBytesRead + " BLOCK_FETCHED_LOCAL=" + metrics.localBlocksFetched + + " BLOCK_FETCHED_REMOTE=" + metrics.remoteBlocksFetched + + " REMOTE_FETCH_WAIT_TIME=" + metrics.fetchWaitTime + + " REMOTE_BYTES_READ=" + metrics.remoteBytesRead case None => "" } val writeMetrics = taskMetrics.shuffleWriteMetrics match { case Some(metrics) => " SHUFFLE_BYTES_WRITTEN=" + metrics.shuffleBytesWritten + - " SHUFFLE_WRITE_TIME=" + metrics.shuffleWriteTime + " SHUFFLE_WRITE_TIME=" + metrics.shuffleWriteTime case None => "" } stageLogInfo(stageId, status + info + executorRunTime + gcTime + inputMetrics + @@ -213,12 +213,12 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener recordTaskMetrics(taskEnd.stageId, taskStatus, taskInfo, taskMetrics) case Resubmitted => taskStatus += " STATUS=RESUBMITTED TID=" + taskInfo.taskId + - " STAGE_ID=" + taskEnd.stageId + " STAGE_ID=" + taskEnd.stageId stageLogInfo(taskEnd.stageId, taskStatus) case FetchFailed(bmAddress, shuffleId, mapId, reduceId) => taskStatus += " STATUS=FETCHFAILED TID=" + taskInfo.taskId + " STAGE_ID=" + - taskEnd.stageId + " SHUFFLE_ID=" + shuffleId + " MAP_ID=" + - 
mapId + " REDUCE_ID=" + reduceId + taskEnd.stageId + " SHUFFLE_ID=" + shuffleId + " MAP_ID=" + + mapId + " REDUCE_ID=" + reduceId stageLogInfo(taskEnd.stageId, taskStatus) case _ => } diff --git a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala index 174b73221afc0..16c8dce77f575 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala @@ -88,11 +88,12 @@ private[spark] class Pool( } override def checkSpeculatableTasks(): Boolean = { - var shouldRevive = false for (schedulable <- schedulableQueue) { - shouldRevive |= schedulable.checkSpeculatableTasks() + if (schedulable.checkSpeculatableTasks()) { + return true + } } - shouldRevive + false } override def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager] = { diff --git a/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala index 6c5827f75e636..3713134b97a07 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala @@ -20,10 +20,10 @@ package org.apache.spark.scheduler import java.io.{FileInputStream, InputStream} import java.util.{NoSuchElementException, Properties} -import scala.xml.XML - -import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.Utils +import org.apache.spark.{Logging, SparkConf} + +import scala.xml.XML /** * An interface to build Schedulable tree @@ -77,7 +77,7 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, conf: SparkConf) } } - is.foreach { i => buildFairSchedulerPool(i) } + is.foreach { i => buildFairSchedulerPool(i)} } finally { is.foreach(_.close()) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala index e79ffd7a3587d..28e09b7624fc6 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala @@ -17,12 +17,12 @@ package org.apache.spark.scheduler -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer - import org.apache.spark.Logging import org.apache.spark.util.Utils +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + /** * A SparkListenerEvent bus that relays events to its listeners */ diff --git a/core/src/main/scala/org/apache/spark/scheduler/SplitInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/SplitInfo.scala index 6a54252753a63..838f2ba81ea78 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SplitInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SplitInfo.scala @@ -25,11 +25,11 @@ import scala.collection.mutable.ArrayBuffer // So that we do not need to worry about the differences. @DeveloperApi class SplitInfo( - val inputFormatClazz: Class[_], - val hostLocation: String, - val path: String, - val length: Long, - val underlyingSplit: Any) { + val inputFormatClazz: Class[_], + val hostLocation: String, + val path: String, + val length: Long, + val underlyingSplit: Any) { override def toString: String = { "SplitInfo " + super.toString + " ..
inputFormatClazz " + inputFormatClazz + ", hostLocation : " + hostLocation + ", path : " + path + diff --git a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala index c6dc3369ba5cc..dee4af790de26 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala @@ -17,23 +17,23 @@ package org.apache.spark.scheduler -import scala.collection.mutable.HashMap - import org.apache.spark.annotation.DeveloperApi import org.apache.spark.storage.RDDInfo +import scala.collection.mutable + /** * :: DeveloperApi :: * Stores information about a stage to pass from the scheduler to SparkListeners. */ @DeveloperApi class StageInfo( - val stageId: Int, - val attemptId: Int, - val name: String, - val numTasks: Int, - val rddInfos: Seq[RDDInfo], - val details: String) { + val stageId: Int, + val attemptId: Int, + val name: String, + val numTasks: Int, + val rddInfos: Seq[RDDInfo], + val details: String) { /** When this stage was submitted from the DAGScheduler to a TaskScheduler. */ var submissionTime: Option[Long] = None /** Time when all tasks in the stage completed or when the stage was cancelled. */ @@ -41,7 +41,7 @@ class StageInfo( /** If the stage failed, the reason why. */ var failureReason: Option[String] = None /** Terminal values of accumulables updated during this stage. */ - val accumulables = HashMap[Long, AccumulableInfo]() + val accumulables = mutable.HashMap[Long, AccumulableInfo]() def stageFailed(reason: String) { failureReason = Some(reason) diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index c6e47c84a0cb2..81993f6543d39 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -20,13 +20,12 @@ package org.apache.spark.scheduler import java.io.{ByteArrayOutputStream, DataInputStream, DataOutputStream} import java.nio.ByteBuffer -import scala.collection.mutable.HashMap - import org.apache.spark.TaskContext import org.apache.spark.executor.TaskMetrics import org.apache.spark.serializer.SerializerInstance -import org.apache.spark.util.ByteBufferInputStream -import org.apache.spark.util.Utils +import org.apache.spark.util.{ByteBufferInputStream, Utils} + +import scala.collection.mutable /** @@ -73,11 +72,13 @@ private[spark] abstract class Task[T](val stageId: Int, var partitionId: Int) ex @transient protected var context: TaskContext = _ // The actual Thread on which the task is running, if any. Initialized in run(). - @volatile @transient private var taskThread: Thread = _ + @volatile + @transient private var taskThread: Thread = _ // A flag to indicate whether the task is killed. This is used in case context is not yet // initialized when kill() is invoked. - @volatile @transient private var _killed = false + @volatile + @transient private var _killed = false /** * Whether the task has been killed. 
@@ -98,7 +99,7 @@ private[spark] abstract class Task[T](val stageId: Int, var partitionId: Int) ex if (interruptThread && taskThread != null) { taskThread.interrupt() } - } + } } /** @@ -113,11 +114,10 @@ private[spark] object Task { * Serialize a task and the current app dependencies (files and JARs added to the SparkContext) */ def serializeWithDependencies( - task: Task[_], - currentFiles: HashMap[String, Long], - currentJars: HashMap[String, Long], - serializer: SerializerInstance) - : ByteBuffer = { + task: Task[_], + currentFiles: mutable.HashMap[String, Long], + currentJars: mutable.HashMap[String, Long], + serializer: SerializerInstance): ByteBuffer = { val out = new ByteArrayOutputStream(4096) val dataOut = new DataOutputStream(out) @@ -151,27 +151,27 @@ private[spark] object Task { * @return (taskFiles, taskJars, taskBytes) */ def deserializeWithDependencies(serializedTask: ByteBuffer) - : (HashMap[String, Long], HashMap[String, Long], ByteBuffer) = { + : (mutable.HashMap[String, Long], mutable.HashMap[String, Long], ByteBuffer) = { val in = new ByteBufferInputStream(serializedTask) val dataIn = new DataInputStream(in) // Read task's files - val taskFiles = new HashMap[String, Long]() + val taskFiles = new mutable.HashMap[String, Long]() val numFiles = dataIn.readInt() for (i <- 0 until numFiles) { taskFiles(dataIn.readUTF()) = dataIn.readLong() } // Read task's JARs - val taskJars = new HashMap[String, Long]() + val taskJars = new mutable.HashMap[String, Long]() val numJars = dataIn.readInt() for (i <- 0 until numJars) { taskJars(dataIn.readUTF()) = dataIn.readLong() } // Create a sub-buffer for the rest of the data, which is the serialized Task object - val subBuffer = serializedTask.slice() // ByteBufferInputStream will have read just up to task + val subBuffer = serializedTask.slice() // ByteBufferInputStream will have read just up to task (taskFiles, taskJars, subBuffer) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala index 4c96b9e5fef60..4e229f85a48aa 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala @@ -26,12 +26,11 @@ import org.apache.spark.util.SerializableBuffer * [[TaskSetManager.resourceOffer]]. */ private[spark] class TaskDescription( - val taskId: Long, - val executorId: String, - val name: String, - val index: Int, // Index within this task's TaskSet - _serializedTask: ByteBuffer) - extends Serializable { + val taskId: Long, + val executorId: String, + val name: String, + val index: Int, // Index within this task's TaskSet + _serializedTask: ByteBuffer) extends Serializable { // Because ByteBuffers are not serializable, wrap the task in a SerializableBuffer private val buffer = new SerializableBuffer(_serializedTask) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskLocation.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskLocation.scala index 10c685f29d3ac..1e3a2d9c3895d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskLocation.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskLocation.scala @@ -29,22 +29,22 @@ private[spark] sealed trait TaskLocation { /** * A location that includes both a host and an executor id on that host. 
*/ -private [spark] case class ExecutorCacheTaskLocation(override val host: String, - val executorId: String) extends TaskLocation { +private[spark] case class ExecutorCacheTaskLocation(override val host: String, executorId: String) extends TaskLocation { + } /** * A location on a host. */ -private [spark] case class HostTaskLocation(override val host: String) extends TaskLocation { +private[spark] case class HostTaskLocation(override val host: String) extends TaskLocation { override def toString = host } /** * A location on a host that is cached by HDFS. */ -private [spark] case class HDFSCacheTaskLocation(override val host: String) - extends TaskLocation { +private[spark] case class HDFSCacheTaskLocation(override val host: String) + extends TaskLocation { override def toString = TaskLocation.inMemoryLocationTag + host } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala index d49d8fb887007..a3d890755fc1e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala @@ -20,13 +20,14 @@ package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer -import scala.collection.mutable.Map - import org.apache.spark.SparkEnv import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockId import org.apache.spark.util.Utils +import scala.collection.mutable +import scala.collection.mutable.Map + // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] @@ -37,14 +38,14 @@ case class IndirectTaskResult[T](blockId: BlockId) extends TaskResult[T] with Se /** A TaskResult that contains the task's return value and accumulator updates. */ private[spark] class DirectTaskResult[T](var valueBytes: ByteBuffer, var accumUpdates: Map[Long, Any], - var metrics: TaskMetrics) + var metrics: TaskMetrics) extends TaskResult[T] with Externalizable { def this() = this(null.asInstanceOf[ByteBuffer], null, null) override def writeExternal(out: ObjectOutput) { - out.writeInt(valueBytes.remaining); + out.writeInt(valueBytes.remaining) Utils.writeByteBuffer(valueBytes, out) out.writeInt(accumUpdates.size) @@ -66,7 +67,7 @@ class DirectTaskResult[T](var valueBytes: ByteBuffer, var accumUpdates: Map[Long if (numUpdates == 0) { accumUpdates = null } else { - accumUpdates = Map() + accumUpdates = mutable.Map() for (i <- 0 until numUpdates) { accumUpdates(in.readLong()) = in.readObject() } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala index 3f345ceeaaf7a..dcd24eae52a89 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala @@ -19,13 +19,13 @@ package org.apache.spark.scheduler import java.nio.ByteBuffer -import scala.util.control.NonFatal - -import org.apache.spark._ import org.apache.spark.TaskState.TaskState +import org.apache.spark._ import org.apache.spark.serializer.SerializerInstance import org.apache.spark.util.Utils +import scala.util.control.NonFatal + /** * Runs a thread pool that deserializes and remotely fetches (if necessary) task results. 
*/ @@ -42,8 +42,7 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul } } - def enqueueSuccessfulTask( - taskSetManager: TaskSetManager, tid: Long, serializedData: ByteBuffer) { + def enqueueSuccessfulTask(taskSetManager: TaskSetManager, tid: Long, serializedData: ByteBuffer) { getTaskResultExecutor.execute(new Runnable { override def run(): Unit = Utils.logUncaughtExceptions { try { @@ -81,9 +80,11 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul }) } - def enqueueFailedTask(taskSetManager: TaskSetManager, tid: Long, taskState: TaskState, + def enqueueFailedTask( + taskSetManager: TaskSetManager, + tid: Long, taskState: TaskState, serializedData: ByteBuffer) { - var reason : TaskEndReason = UnknownReason + var reason: TaskEndReason = UnknownReason getTaskResultExecutor.execute(new Runnable { override def run(): Unit = Utils.logUncaughtExceptions { try { diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala index a129a434c9a1a..ce114039b639c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala @@ -17,8 +17,8 @@ package org.apache.spark.scheduler -import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import org.apache.spark.executor.TaskMetrics +import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import org.apache.spark.storage.BlockManagerId /** @@ -42,7 +42,7 @@ private[spark] trait TaskScheduler { // Invoked after system has successfully initialized (typically in spark context). // Yarn uses this to bootstrap allocation of resources based on preferred locations, // wait for slave registerations, etc. - def postStartHook() { } + def postStartHook() {} // Disconnect from the cluster. def stop(): Unit diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 60bd23bf15db0..9aaa1d407da7d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -24,8 +24,6 @@ import java.util.concurrent.atomic.AtomicLong import scala.collection.mutable import scala.concurrent.duration._ import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.HashMap -import scala.collection.mutable.HashSet import scala.language.postfixOps import scala.util.Random @@ -55,8 +53,7 @@ private[spark] class TaskSchedulerImpl( val sc: SparkContext, val maxTaskFailures: Int, isLocal: Boolean = false) - extends TaskScheduler with Logging -{ + extends TaskScheduler with Logging { def this(sc: SparkContext) = this(sc, sc.conf.getInt("spark.task.maxFailures", 4)) val conf = sc.conf @@ -72,10 +69,10 @@ private[spark] class TaskSchedulerImpl( // TaskSetManagers are not thread safe, so any access to one should be synchronized // on this class. 
- val activeTaskSets = new HashMap[String, TaskSetManager] + val activeTaskSets = new mutable.HashMap[String, TaskSetManager] - val taskIdToTaskSetId = new HashMap[Long, String] - val taskIdToExecutorId = new HashMap[Long, String] + val taskIdToTaskSetId = new mutable.HashMap[Long, String] + val taskIdToExecutorId = new mutable.HashMap[Long, String] @volatile private var hasReceivedTask = false @volatile private var hasLaunchedTask = false @@ -85,11 +82,11 @@ private[spark] class TaskSchedulerImpl( val nextTaskId = new AtomicLong(0) // Which executor IDs we have executors on - val activeExecutorIds = new HashSet[String] + val activeExecutorIds = new mutable.HashSet[String] // The set of executors we have on each host; this is used to compute hostsAlive, which // in turn is used to decide when we can attain data locality on a given host - protected val executorsByHost = new HashMap[String, HashSet[String]] + protected val executorsByHost = new mutable.HashMap[String, mutable.HashSet[String]] protected val hostsByRack = new mutable.HashMap[String, mutable.HashSet[String]] @@ -222,12 +219,12 @@ private[spark] class TaskSchedulerImpl( for (o <- offers) { executorIdToHost(o.executorId) = o.host if (!executorsByHost.contains(o.host)) { - executorsByHost(o.host) = new HashSet[String]() + executorsByHost(o.host) = new mutable.HashSet[String]() executorAdded(o.executorId, o.host) newExecAvail = true } for (rack <- getRackForHost(o.host)) { - hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host + hostsByRack.getOrElseUpdate(rack, new mutable.HashSet[String]()) += o.host } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala index c3ad325156f53..9df988a6d2f84 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala @@ -24,12 +24,13 @@ import java.util.Properties * missing partitions of a particular stage. */ private[spark] class TaskSet( - val tasks: Array[Task[_]], - val stageId: Int, - val attempt: Int, - val priority: Int, - val properties: Properties) { - val id: String = stageId + "." + attempt + val tasks: Array[Task[_]], + val stageId: Int, + val attempt: Int, + val priority: Int, + val properties: Properties) { + + val id: String = stageId + "." + attempt override def toString: String = "TaskSet " + id } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index a6c23fc85a1b0..5e94cd29cdca6 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -18,19 +18,18 @@ package org.apache.spark.scheduler import java.io.NotSerializableException +import java.util import java.util.Arrays -import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.HashMap -import scala.collection.mutable.HashSet -import scala.math.max -import scala.math.min - -import org.apache.spark._ import org.apache.spark.TaskState.TaskState +import org.apache.spark._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.util.{Clock, SystemClock} +import scala.collection.mutable +import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} +import scala.math.{max, min} + /** * Schedules the tasks within a single TaskSet in the TaskSchedulerImpl. 
This class keeps track of * each task, retries tasks if they fail (up to a limited number of times), and @@ -47,10 +46,10 @@ import org.apache.spark.util.{Clock, SystemClock} * task set will be aborted */ private[spark] class TaskSetManager( - sched: TaskSchedulerImpl, - val taskSet: TaskSet, - val maxTaskFailures: Int, - clock: Clock = SystemClock) + sched: TaskSchedulerImpl, + val taskSet: TaskSet, + val maxTaskFailures: Int, + clock: Clock = SystemClock) extends Schedulable with Logging { val conf = sched.sc.conf @@ -78,7 +77,7 @@ private[spark] class TaskSetManager( val successful = new Array[Boolean](numTasks) private val numFailures = new Array[Int](numTasks) // key is taskId, value is a Map of executor id to when it failed - private val failedExecutors = new HashMap[Int, HashMap[String, Long]]() + private val failedExecutors = new mutable.HashMap[Int, mutable.HashMap[String, Long]]() val taskAttempts = Array.fill[List[TaskInfo]](numTasks)(Nil) var tasksSuccessful = 0 @@ -90,7 +89,8 @@ private[spark] class TaskSetManager( var name = "TaskSet_" + taskSet.stageId.toString var parent: Pool = null - val runningTasksSet = new HashSet[Long] + val runningTasksSet = new mutable.HashSet[Long] + override def runningTasks = runningTasksSet.size // True once no more tasks should be launched for this task set manager. TaskSetManagers enter @@ -108,14 +108,14 @@ private[spark] class TaskSetManager( // back at the head of the stack. They are also only cleaned up lazily; // when a task is launched, it remains in all the pending lists except // the one that it was launched from, but gets removed from them later. - private val pendingTasksForExecutor = new HashMap[String, ArrayBuffer[Int]] + private val pendingTasksForExecutor = new mutable.HashMap[String, ArrayBuffer[Int]] // Set of pending tasks for each host. Similar to pendingTasksForExecutor, // but at host level. - private val pendingTasksForHost = new HashMap[String, ArrayBuffer[Int]] + private val pendingTasksForHost = new mutable.HashMap[String, ArrayBuffer[Int]] // Set of pending tasks for each rack -- similar to the above. - private val pendingTasksForRack = new HashMap[String, ArrayBuffer[Int]] + private val pendingTasksForRack = new mutable.HashMap[String, ArrayBuffer[Int]] // Set containing pending tasks with no locality preferences. var pendingTasksWithNoPrefs = new ArrayBuffer[Int] @@ -125,10 +125,10 @@ private[spark] class TaskSetManager( // Tasks that can be speculated. Since these will be a small fraction of total // tasks, we'll just hold them in a HashSet. - val speculatableTasks = new HashSet[Int] + val speculatableTasks = new mutable.HashSet[Int] // Task index, start and finish time for each task attempt (indexed by task ID) - val taskInfos = new HashMap[Long, TaskInfo] + val taskInfos = new mutable.HashMap[Long, TaskInfo] // How frequently to reprint duplicate exceptions in full, in milliseconds val EXCEPTION_PRINT_INTERVAL = @@ -137,7 +137,7 @@ private[spark] class TaskSetManager( // Map of recent exceptions (identified by string representation and top stack frame) to // duplicate count (how many times the same exception has appeared) and time the full exception // was printed. This should ideally be an LRU map that can drop old exceptions automatically. 
- val recentExceptions = HashMap[String, (Int, Long)]() + val recentExceptions = mutable.HashMap[String, (Int, Long)]() // Figure out the current map output tracker epoch and set it on all tasks val epoch = sched.mapOutputTracker.getEpoch @@ -159,8 +159,9 @@ private[spark] class TaskSetManager( // Delay scheduling variables: we keep track of our current locality level and the time we // last launched a task at that level, and move up a level when localityWaits[curLevel] expires. // We then move down if we manage to launch a "more local" task. - var currentLocalityIndex = 0 // Index of our current locality level in validLocalityLevels - var lastLaunchTime = clock.getTime() // Time we last launched a task at this level + var currentLocalityIndex = 0 + // Index of our current locality level in validLocalityLevels + var lastLaunchTime = clock.getTime() // Time we last launched a task at this level override def schedulableQueue = null @@ -195,7 +196,7 @@ private[spark] class TaskSetManager( ", where there are executors " + set.mkString(",")) } case None => logDebug(s"Pending task $index has a cached location at ${e.host} " + - ", but there are no executors alive there.") + ", but there are no executors alive there.") } } case _ => Unit @@ -211,7 +212,7 @@ private[spark] class TaskSetManager( } if (!readding) { - allPendingTasks += index // No point scanning this whole list to find the old task there + allPendingTasks += index // No point scanning this whole list to find the old task there } } @@ -287,8 +288,7 @@ private[spark] class TaskSetManager( * the given locality constraint. */ private def findSpeculativeTask(execId: String, host: String, locality: TaskLocality.Value) - : Option[(Int, TaskLocality.Value)] = - { + : Option[(Int, TaskLocality.Value)] = { speculatableTasks.retain(index => !successful(index)) // Remove finished tasks from set def canRunOnHost(index: Int): Boolean = @@ -302,7 +302,8 @@ private[spark] class TaskSetManager( val executors = prefs.flatMap(_ match { case e: ExecutorCacheTaskLocation => Some(e.executorId) case _ => None - }); + }) + if (executors.contains(execId)) { speculatableTasks -= index return Some((index, TaskLocality.PROCESS_LOCAL)) @@ -363,8 +364,7 @@ private[spark] class TaskSetManager( * @return An option containing (task index within the task set, locality, is speculative?) 
*/ private def findTask(execId: String, host: String, maxLocality: TaskLocality.Value) - : Option[(Int, TaskLocality.Value, Boolean)] = - { + : Option[(Int, TaskLocality.Value, Boolean)] = { for (index <- findTaskFromList(execId, getPendingTasksForExecutor(execId))) { return Some((index, TaskLocality.PROCESS_LOCAL, false)) } @@ -399,7 +399,8 @@ private[spark] class TaskSetManager( // find a speculative task if all others tasks have been scheduled findSpeculativeTask(execId, host, maxLocality).map { - case (taskIndex, allowedLocality) => (taskIndex, allowedLocality, true)} + case (taskIndex, allowedLocality) => (taskIndex, allowedLocality, true) + } } /** @@ -414,11 +415,9 @@ private[spark] class TaskSetManager( * @param maxLocality the maximum locality we want to schedule the tasks at */ def resourceOffer( - execId: String, - host: String, - maxLocality: TaskLocality.TaskLocality) - : Option[TaskDescription] = - { + execId: String, + host: String, + maxLocality: TaskLocality.TaskLocality): Option[TaskDescription] = { if (!isZombie) { val curTime = clock.getTime() @@ -457,7 +456,7 @@ private[spark] class TaskSetManager( val serializedTask = Task.serializeWithDependencies( task, sched.sc.addedFiles, sched.sc.addedJars, ser) if (serializedTask.limit > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 && - !emittedTaskSizeWarning) { + !emittedTaskSizeWarning) { emittedTaskSizeWarning = true logWarning(s"Stage ${task.stageId} contains a task of very large size " + s"(${serializedTask.limit / 1024} KB). The maximum recommended task size is " + @@ -470,7 +469,7 @@ private[spark] class TaskSetManager( // val timeTaken = clock.getTime() - startTime val taskName = s"task ${info.id} in stage ${taskSet.id}" logInfo("Starting %s (TID %d, %s, %s, %d bytes)".format( - taskName, taskId, host, taskLocality, serializedTask.limit)) + taskName, taskId, host, taskLocality, serializedTask.limit)) sched.dagScheduler.taskStarted(task, info) return Some(new TaskDescription(taskId, execId, taskName, index, serializedTask)) @@ -492,8 +491,7 @@ private[spark] class TaskSetManager( */ private def getAllowedLocalityLevel(curTime: Long): TaskLocality.TaskLocality = { while (curTime - lastLaunchTime >= localityWaits(currentLocalityIndex) && - currentLocalityIndex < myLocalityLevels.length - 1) - { + currentLocalityIndex < myLocalityLevels.length - 1) { // Jump to the next locality level, and remove our waiting time for the current one since // we don't want to count it again on the next one lastLaunchTime += localityWaits(currentLocalityIndex) @@ -561,7 +559,7 @@ private[spark] class TaskSetManager( info.markFailed() val index = info.index copiesRunning(index) -= 1 - var taskMetrics : TaskMetrics = null + var taskMetrics: TaskMetrics = null val failureReason = s"Lost task ${info.id} in stage ${taskSet.id} (TID $tid, ${info.host}): " + reason.asInstanceOf[TaskFailedReason].toErrorString @@ -607,22 +605,22 @@ private[spark] class TaskSetManager( } else { logInfo( s"Lost task ${info.id} in stage ${taskSet.id} (TID $tid) on executor ${info.host}: " + - s"${ef.className} (${ef.description}) [duplicate $dupCount]") + s"${ef.className} (${ef.description}) [duplicate $dupCount]") } - case e: TaskFailedReason => // TaskResultLost, TaskKilled, and others + case e: TaskFailedReason => // TaskResultLost, TaskKilled, and others logWarning(failureReason) case e: TaskEndReason => logError("Unknown TaskEndReason: " + e) } // always add to failed executors - failedExecutors.getOrElseUpdate(index, new HashMap[String, Long]()). 
+ failedExecutors.getOrElseUpdate(index, new mutable.HashMap[String, Long]()). put(info.executorId, clock.getTime()) sched.dagScheduler.taskEnded(tasks(index), reason, null, null, info, taskMetrics) addPendingTask(index) if (!isZombie && state != TaskState.KILLED) { - assert (null != failureReason) + assert(null != failureReason) numFailures(index) += 1 if (numFailures(index) >= maxTaskFailures) { logError("Task %d in stage %s failed %d times; aborting job".format( @@ -643,9 +641,9 @@ private[spark] class TaskSetManager( } /** If the given task ID is not in the set of running tasks, adds it. - * - * Used to keep track of the number of running tasks, for enforcing scheduling policies. - */ + * + * Used to keep track of the number of running tasks, for enforcing scheduling policies. + */ def addRunningTask(tid: Long) { if (runningTasksSet.add(tid) && parent != null) { parent.increaseRunningTasks(1) @@ -681,10 +679,10 @@ private[spark] class TaskSetManager( // that it's okay if we add a task to the same queue twice (if it had multiple preferred // locations), because findTaskFromList will skip already-running tasks. for (index <- getPendingTasksForExecutor(execId)) { - addPendingTask(index, readding=true) + addPendingTask(index, readding = true) } for (index <- getPendingTasksForHost(host)) { - addPendingTask(index, readding=true) + addPendingTask(index, readding = true) } // Re-enqueue any tasks that ran on the failed executor if this is a shuffle map stage. @@ -731,7 +729,7 @@ private[spark] class TaskSetManager( if (tasksSuccessful >= minFinishedForSpeculation && tasksSuccessful > 0) { val time = clock.getTime() val durations = taskInfos.values.filter(_.successful).map(_.duration).toArray - Arrays.sort(durations) + util.Arrays.sort(durations) val medianDuration = durations(min((0.5 * tasksSuccessful).round.toInt, durations.size - 1)) val threshold = max(SPECULATION_MULTIPLIER * medianDuration, 100) // TODO: Threshold should also look at standard deviation of task durations and have a lower @@ -771,21 +769,21 @@ private[spark] class TaskSetManager( * */ private def computeValidLocalityLevels(): Array[TaskLocality.TaskLocality] = { - import TaskLocality.{PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY} + import org.apache.spark.scheduler.TaskLocality.{ANY, NODE_LOCAL, NO_PREF, PROCESS_LOCAL, RACK_LOCAL} val levels = new ArrayBuffer[TaskLocality.TaskLocality] if (!pendingTasksForExecutor.isEmpty && getLocalityWait(PROCESS_LOCAL) != 0 && - pendingTasksForExecutor.keySet.exists(sched.isExecutorAlive(_))) { + pendingTasksForExecutor.keySet.exists(sched.isExecutorAlive(_))) { levels += PROCESS_LOCAL } if (!pendingTasksForHost.isEmpty && getLocalityWait(NODE_LOCAL) != 0 && - pendingTasksForHost.keySet.exists(sched.hasExecutorsAliveOnHost(_))) { + pendingTasksForHost.keySet.exists(sched.hasExecutorsAliveOnHost(_))) { levels += NODE_LOCAL } if (!pendingTasksWithNoPrefs.isEmpty) { levels += NO_PREF } if (!pendingTasksForRack.isEmpty && getLocalityWait(RACK_LOCAL) != 0 && - pendingTasksForRack.keySet.exists(sched.hasHostAliveOnRack(_))) { + pendingTasksForRack.keySet.exists(sched.hasHostAliveOnRack(_))) { levels += RACK_LOCAL } levels += ANY diff --git a/core/src/main/scala/org/apache/spark/scheduler/WorkerOffer.scala b/core/src/main/scala/org/apache/spark/scheduler/WorkerOffer.scala index 810b36cddf835..d1b406436b500 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/WorkerOffer.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/WorkerOffer.scala @@ -20,5 +20,4 @@ package 
org.apache.spark.scheduler /** * Represents free resources available on an executor. */ -private[spark] -case class WorkerOffer(executorId: String, host: String, cores: Int) +private[spark] case class WorkerOffer(executorId: String, host: String, cores: Int) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala index fb8160abc59db..b160798ac9c05 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala @@ -50,7 +50,7 @@ private[spark] object CoarseGrainedClusterMessages { object StatusUpdate { /** Alternate factory method that takes a ByteBuffer directly for the data field */ def apply(executorId: String, taskId: Long, state: TaskState, data: ByteBuffer) - : StatusUpdate = { + : StatusUpdate = { StatusUpdate(executorId, taskId, state, new SerializableBuffer(data)) } } @@ -66,7 +66,7 @@ private[spark] object CoarseGrainedClusterMessages { case class RemoveExecutor(executorId: String, reason: String) extends CoarseGrainedClusterMessage - case class AddWebUIFilter(filterName:String, filterParams: Map[String, String], proxyBase :String) + case class AddWebUIFilter(filterName: String, filterParams: Map[String, String], proxyBase: String) extends CoarseGrainedClusterMessage } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index e11d02315ce6f..9ba8280c3a803 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -19,20 +19,19 @@ package org.apache.spark.scheduler.cluster import java.util.concurrent.atomic.AtomicInteger -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer -import scala.concurrent.Await -import scala.concurrent.duration._ - import akka.actor._ import akka.pattern.ask import akka.remote.{DisassociatedEvent, RemotingLifecycleEvent} - -import org.apache.spark.{SparkEnv, Logging, SparkException, TaskState} -import org.apache.spark.scheduler.{SchedulerBackend, SlaveLost, TaskDescription, TaskSchedulerImpl, WorkerOffer} import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ -import org.apache.spark.util.{ActorLogReceive, SerializableBuffer, AkkaUtils, Utils} +import org.apache.spark.scheduler.{SchedulerBackend, SlaveLost, TaskDescription, TaskSchedulerImpl, WorkerOffer} import org.apache.spark.ui.JettyUtils +import org.apache.spark.util.{ActorLogReceive, AkkaUtils, SerializableBuffer, Utils} +import org.apache.spark.{Logging, SparkEnv, SparkException, TaskState} + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.concurrent.Await +import scala.concurrent.duration._ /** * A scheduler backend that waits for coarse grained executors to connect to it through Akka. 
@@ -44,8 +43,7 @@ import org.apache.spark.ui.JettyUtils */ private[spark] class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: ActorSystem) - extends SchedulerBackend with Logging -{ + extends SchedulerBackend with Logging { // Use an atomic variable to track total number of cores in the cluster for simplicity and speed var totalCoreCount = new AtomicInteger(0) var totalRegisteredExecutors = new AtomicInteger(0) @@ -284,8 +282,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: A if (hasFilter) { logInfo(s"Add WebUI Filter. $filterName, $filterParams, $proxyBase") conf.set("spark.ui.filters", filterName) - filterParams.foreach { case (k, v) => conf.set(s"spark.$filterName.param.$k", v) } - scheduler.sc.ui.foreach { ui => JettyUtils.addFilters(ui.getHandlers, conf) } + filterParams.foreach { case (k, v) => conf.set(s"spark.$filterName.param.$k", v)} + scheduler.sc.ui.foreach { ui => JettyUtils.addFilters(ui.getHandlers, conf)} } } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala index b71bd5783d6df..be466648f9cec 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala @@ -17,7 +17,7 @@ package org.apache.spark.scheduler.cluster -import akka.actor.{Address, ActorRef} +import akka.actor.{ActorRef, Address} /** * Grouping of data for an executor used by CoarseGrainedSchedulerBackend. diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala index ee10aa061f4e9..ada17523203d3 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SimrSchedulerBackend.scala @@ -17,11 +17,10 @@ package org.apache.spark.scheduler.cluster -import org.apache.hadoop.fs.{Path, FileSystem} - -import org.apache.spark.{Logging, SparkContext, SparkEnv} +import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl +import org.apache.spark.{Logging, SparkContext, SparkEnv} private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, @@ -48,8 +47,8 @@ private[spark] class SimrSchedulerBackend( val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") - logInfo("Writing to HDFS file: " + driverFilePath) - logInfo("Writing Akka address: " + driverUrl) + logInfo("Writing to HDFS file: " + driverFilePath) + logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala index ed209d195ec9d..5c932e3d24037 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala @@ -17,11 +17,11 @@ package org.apache.spark.scheduler.cluster -import org.apache.spark.{Logging, SparkConf, SparkContext, SparkEnv} -import org.apache.spark.deploy.{ApplicationDescription, Command} 
 import org.apache.spark.deploy.client.{AppClient, AppClientListener}
+import org.apache.spark.deploy.{ApplicationDescription, Command}
 import org.apache.spark.scheduler.{ExecutorExited, ExecutorLossReason, SlaveLost, TaskSchedulerImpl}
 import org.apache.spark.util.Utils
+import org.apache.spark.{Logging, SparkConf, SparkContext, SparkEnv}
 
 private[spark] class SparkDeploySchedulerBackend(
     scheduler: TaskSchedulerImpl,
@@ -33,7 +33,7 @@ private[spark] class SparkDeploySchedulerBackend(
 
   var client: AppClient = null
   var stopping = false
-  var shutdownCallback : (SparkDeploySchedulerBackend) => Unit = _
+  var shutdownCallback: (SparkDeploySchedulerBackend) => Unit = _
   @volatile var appId: String = _
 
   val registrationLock = new Object()
@@ -109,8 +109,8 @@ private[spark] class SparkDeploySchedulerBackend(
     }
   }
 
-  override def executorAdded(fullId: String, workerId: String, hostPort: String, cores: Int,
-    memory: Int) {
+  override def executorAdded(fullId: String, workerId: String, hostPort: String,
+    cores: Int, memory: Int) {
     logInfo("Granted executor ID %s on hostPort %s with %d cores, %s RAM".format(
       fullId, hostPort, cores, Utils.megabytesToString(memory)))
   }
@@ -131,7 +131,7 @@ private[spark] class SparkDeploySchedulerBackend(
 
   override def applicationId(): String = Option(appId).getOrElse {
     logWarning("Application ID is not initialized yet.")
-    super.applicationId
+    super.applicationId()
   }
 
   private def waitForRegistration() = {
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
index 36d36ce8955ce..9204ece8400e1 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
@@ -22,6 +22,7 @@ import java.util.{List => JList}
 import java.util.Collections
 
 import scala.collection.JavaConversions._
+import scala.collection.mutable
 import scala.collection.mutable.{HashMap, HashSet}
 
 import org.apache.mesos.{Scheduler => MScheduler}
@@ -63,13 +64,13 @@ private[spark] class CoarseMesosSchedulerBackend(
   val maxCores = conf.get("spark.cores.max", Int.MaxValue.toString).toInt
 
   // Cores we have acquired with each Mesos task ID
-  val coresByTaskId = new HashMap[Int, Int]
+  val coresByTaskId = new mutable.HashMap[Int, Int]
   var totalCoresAcquired = 0
 
-  val slaveIdsWithExecutors = new HashSet[String]
+  val slaveIdsWithExecutors = new mutable.HashSet[String]
 
-  val taskIdToSlaveId = new HashMap[Int, String]
-  val failuresBySlaveId = new HashMap[String, Int] // How many times tasks on each slave failed
+  val taskIdToSlaveId = new mutable.HashMap[Int, String]
+  val failuresBySlaveId = new mutable.HashMap[String, Int] // How many times tasks on each slave failed
 
   val extraCoresPerSlave = conf.getInt("spark.mesos.extra.cores", 0)
 
@@ -319,7 +320,7 @@ private[spark] class CoarseMesosSchedulerBackend(
 
   override def applicationId(): String = Option(appId).getOrElse {
     logWarning("Application ID is not initialized yet.")
-    super.applicationId
+    super.applicationId()
   }
 
 }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtils.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtils.scala
index 5101ec8352e79..510a41d33f4dc 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtils.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtils.scala
@@ -29,7 +29,7 @@ private[spark] object MemoryUtils {
       sc.conf.getOption("spark.mesos.executor.memoryOverhead")
         .getOrElse(OVERHEAD_MINIMUM.toString)
         .toInt + sc.executorMemory,
-      OVERHEAD_FRACTION * sc.executorMemory
+    OVERHEAD_FRACTION * sc.executorMemory
     )
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
index 44164c24185be..c075d1f5958e6 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
@@ -22,6 +22,7 @@ import java.util.{ArrayList => JArrayList, List => JList}
 import java.util.Collections
 
 import scala.collection.JavaConversions._
+import scala.collection.mutable
 import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
 
 import org.apache.mesos.protobuf.ByteString
@@ -54,8 +55,8 @@ private[spark] class MesosSchedulerBackend(
   var driver: SchedulerDriver = null
 
   // Which slave IDs we have executors on
-  val slaveIdsWithExecutors = new HashSet[String]
-  val taskIdToSlaveId = new HashMap[Long, String]
+  val slaveIdsWithExecutors = new mutable.HashSet[String]
+  val taskIdToSlaveId = new mutable.HashMap[Long, String]
 
   // An ExecutorInfo for our tasks
   var execArgs: Array[Byte] = null
@@ -154,7 +155,7 @@ private[spark] class MesosSchedulerBackend(
    */
   private def createExecArg(): Array[Byte] = {
     if (execArgs == null) {
-      val props = new HashMap[String, String]
+      val props = new mutable.HashMap[String, String]
       for ((key,value) <- sc.conf.getAll) {
         props(key) = value
       }
@@ -214,7 +215,7 @@ private[spark] class MesosSchedulerBackend(
     // Build a big list of the offerable workers, and remember their indices so that we can
     // figure out which Offer to reply to for each worker
     val offerableWorkers = new ArrayBuffer[WorkerOffer]
-    val offerableIndices = new HashMap[String, Int]
+    val offerableIndices = new mutable.HashMap[String, Int]
 
     def sufficientOffer(o: Offer) = {
       val mem = getResource(o.getResourcesList, "mem")
@@ -385,7 +386,7 @@ private[spark] class MesosSchedulerBackend(
 
   override def applicationId(): String = Option(appId).getOrElse {
     logWarning("Application ID is not initialized yet.")
-    super.applicationId
+    super.applicationId()
   }
 
 }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
index 58b78f041cd85..f5b39e18a2a15 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
@@ -20,12 +20,11 @@ package org.apache.spark.scheduler.local
 import java.nio.ByteBuffer
 
 import akka.actor.{Actor, ActorRef, Props}
-
-import org.apache.spark.{Logging, SparkEnv, TaskState}
 import org.apache.spark.TaskState.TaskState
 import org.apache.spark.executor.{Executor, ExecutorBackend}
 import org.apache.spark.scheduler.{SchedulerBackend, TaskSchedulerImpl, WorkerOffer}
 import org.apache.spark.util.ActorLogReceive
+import org.apache.spark.{Logging, SparkEnv, TaskState}
 
 private case class ReviveOffers()
diff --git a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala
index 554a33ce7f1a6..d8ea32f697a9d 100644
--- a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala
@@ -20,12 +20,11 @@ package org.apache.spark.serializer
 import java.io._
 import java.nio.ByteBuffer
 
-import scala.reflect.ClassTag
-
 import org.apache.spark.SparkConf
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.util.ByteBufferInputStream
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{ByteBufferInputStream, Utils}
+
+import scala.reflect.ClassTag
 
 private[spark] class JavaSerializationStream(out: OutputStream, counterReset: Int)
   extends SerializationStream {
@@ -48,19 +47,27 @@ private[spark] class JavaSerializationStream(out: OutputStream, counterReset: In
     this
   }
 
-  def flush() { objOut.flush() }
-  def close() { objOut.close() }
+  def flush() {
+    objOut.flush()
+  }
+
+  def close() {
+    objOut.close()
+  }
 }
 
 private[spark] class JavaDeserializationStream(in: InputStream, loader: ClassLoader)
-extends DeserializationStream {
+  extends DeserializationStream {
   private val objIn = new ObjectInputStream(in) {
     override def resolveClass(desc: ObjectStreamClass) =
      Class.forName(desc.getName, false, loader)
   }
 
   def readObject[T: ClassTag](): T = objIn.readObject().asInstanceOf[T]
-  def close() { objIn.close() }
+
+  def close() {
+    objIn.close()
+  }
 }
diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
index d6386f8c06fff..1d95d84e9c0b1 100644
--- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
@@ -20,14 +20,13 @@ package org.apache.spark.serializer
 import java.io.{EOFException, InputStream, OutputStream}
 import java.nio.ByteBuffer
 
-import com.esotericsoftware.kryo.{Kryo, KryoException}
 import com.esotericsoftware.kryo.io.{Input => KryoInput, Output => KryoOutput}
 import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer}
+import com.esotericsoftware.kryo.{Kryo, KryoException}
 import com.twitter.chill.{AllScalaRegistrar, EmptyScalaKryoInstantiator}
-
 import org.apache.spark._
 import org.apache.spark.broadcast.HttpBroadcast
-import org.apache.spark.network.nio.{PutBlock, GotBlock, GetBlock}
+import org.apache.spark.network.nio.{GetBlock, GotBlock, PutBlock}
 import org.apache.spark.scheduler.MapStatus
 import org.apache.spark.storage._
 import org.apache.spark.util.BoundedPriorityQueue
@@ -47,12 +46,10 @@ class KryoSerializer(conf: SparkConf)
   with Logging
   with Serializable {
 
-  private val bufferSize =
-    (conf.getDouble("spark.kryoserializer.buffer.mb", 0.064) * 1024 * 1024).toInt
-
+  private val bufferSize = (conf.getDouble("spark.kryoserializer.buffer.mb", 0.064) * 1024 * 1024).toInt
   private val maxBufferSize = conf.getInt("spark.kryoserializer.buffer.max.mb", 64) * 1024 * 1024
-  private val referenceTracking = conf.getBoolean("spark.kryo.referenceTracking", true)
-  private val registrationRequired = conf.getBoolean("spark.kryo.registrationRequired", false)
+  private val referenceTracking = conf.getBoolean("spark.kryo.referenceTracking", defaultValue = true)
+  private val registrationRequired = conf.getBoolean("spark.kryo.registrationRequired", defaultValue = false)
   private val registrator = conf.getOption("spark.kryo.registrator")
 
  def newKryoOutput() = new KryoOutput(bufferSize, math.max(bufferSize, maxBufferSize))
@@ -120,8 +117,13 @@ class KryoSerializationStream(kryo: Kryo, outStream: OutputStream) extends Seria
     this
   }
 
-  override def flush() { output.flush() }
-  override def close() { output.close() }
+  override def flush() {
+    output.flush()
+  }
+
+  override def close() {
+    output.close()
+  }
 }
 
 private[spark]
@@ -211,8 +213,7 @@ private[serializer] object KryoSerializer {
  * The underlying object is scala.collection.convert.Wrappers$IterableWrapper.
  * Kryo deserializes this into an AbstractCollection, which unfortunately doesn't work.
  */
-private class JavaIterableWrapperSerializer
-  extends com.esotericsoftware.kryo.Serializer[java.lang.Iterable[_]] {
+private class JavaIterableWrapperSerializer extends com.esotericsoftware.kryo.Serializer[java.lang.Iterable[_]] {
 
   import JavaIterableWrapperSerializer._
 
@@ -227,7 +228,7 @@ private class JavaIterableWrapperSerializer
   }
 
   override def read(kryo: Kryo, in: KryoInput, clz: Class[java.lang.Iterable[_]])
-    : java.lang.Iterable[_] = {
+      : java.lang.Iterable[_] = {
     kryo.readClassAndObject(in) match {
       case scalaIterable: Iterable[_] =>
         scala.collection.JavaConversions.asJavaIterable(scalaIterable)
diff --git a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala
index a9144cdd97b8c..e4c662c5b5e45 100644
--- a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala
@@ -17,14 +17,14 @@
 
 package org.apache.spark.serializer
 
-import java.io.{ByteArrayOutputStream, EOFException, InputStream, OutputStream}
+import java.io.{EOFException, InputStream, OutputStream}
 import java.nio.ByteBuffer
 
-import scala.reflect.ClassTag
-
 import org.apache.spark.SparkEnv
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.util.{ByteBufferInputStream, NextIterator}
+import org.apache.spark.util.NextIterator
+
+import scala.reflect.ClassTag
 
 /**
  * :: DeveloperApi ::
@@ -102,7 +102,9 @@ abstract class SerializerInstance {
 @DeveloperApi
 abstract class SerializationStream {
   def writeObject[T: ClassTag](t: T): SerializationStream
+
   def flush(): Unit
+
   def close(): Unit
 
   def writeAll[T: ClassTag](iter: Iterator[T]): SerializationStream = {
@@ -121,6 +123,7 @@ abstract class SerializationStream {
 @DeveloperApi
 abstract class DeserializationStream {
   def readObject[T: ClassTag](): T
+
   def close(): Unit
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
index e2d32c859bbda..6d32347583451 100644
--- a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
@@ -67,7 +67,7 @@ private[spark] object AkkaUtils extends Logging {
     val akkaBatchSize = conf.getInt("spark.akka.batchSize", 15)
     val akkaTimeout = conf.getInt("spark.akka.timeout", 100)
     val akkaFrameSize = maxFrameSizeBytes(conf)
-    val akkaLogLifecycleEvents = conf.getBoolean("spark.akka.logLifecycleEvents", false)
+    val akkaLogLifecycleEvents = conf.getBoolean("spark.akka.logLifecycleEvents", defaultValue = false)
     val lifecycleEvents = if (akkaLogLifecycleEvents) "on" else "off"
     if (!akkaLogLifecycleEvents) {
       // As a workaround for Akka issue #3787, we coerce the "EndpointWriter" log to be silent.
@@ -75,7 +75,7 @@ private[spark] object AkkaUtils extends Logging {
       Option(Logger.getLogger("akka.remote.EndpointWriter")).map(l => l.setLevel(Level.FATAL))
     }
 
-    val logAkkaConfig = if (conf.getBoolean("spark.akka.logAkkaConfig", false)) "on" else "off"
+    val logAkkaConfig = if (conf.getBoolean("spark.akka.logAkkaConfig", defaultValue = false)) "on" else "off"
 
     val akkaHeartBeatPauses = conf.getInt("spark.akka.heartbeat.pauses", 600)
     val akkaFailureDetector =
diff --git a/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala
index c92ebaa0d5aa6..385f0d6fdaf9b 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala
@@ -17,8 +17,8 @@
 
 package org.apache.spark
 
-import org.scalatest.{Assertions, FunSuite}
 import org.apache.spark.storage.StorageLevel
+import org.scalatest.{Assertions, FunSuite}
 
 class SparkContextInfoSuite extends FunSuite with LocalSparkContext {
   test("getPersistentRDDs only returns RDDs that are marked as cached") {