Merge remote-tracking branch 'upstream/master' into decisiontree-python-new
jkbradley committed Jul 31, 2014
2 parents e06e423 + 49b3612 commit 584449a
Showing 33 changed files with 827 additions and 114 deletions.
23 changes: 12 additions & 11 deletions core/src/main/scala/org/apache/spark/Logging.scala
@@ -45,10 +45,7 @@ trait Logging {
initializeIfNecessary()
var className = this.getClass.getName
// Ignore trailing $'s in the class names for Scala objects
if (className.endsWith("$")) {
className = className.substring(0, className.length - 1)
}
log_ = LoggerFactory.getLogger(className)
log_ = LoggerFactory.getLogger(className.stripSuffix("$"))
}
log_
}
@@ -110,23 +107,27 @@ trait Logging {
}

private def initializeLogging() {
// If Log4j is being used, but is not initialized, load a default properties file
val binder = StaticLoggerBinder.getSingleton
val usingLog4j = binder.getLoggerFactoryClassStr.endsWith("Log4jLoggerFactory")
val log4jInitialized = LogManager.getRootLogger.getAllAppenders.hasMoreElements
if (!log4jInitialized && usingLog4j) {
// Don't use a logger in here, as this is itself occurring during initialization of a logger
// If Log4j 1.2 is being used, but is not initialized, load a default properties file
val binderClass = StaticLoggerBinder.getSingleton.getLoggerFactoryClassStr
// This distinguishes the log4j 1.2 binding, currently
// org.slf4j.impl.Log4jLoggerFactory, from the log4j 2.0 binding, currently
// org.apache.logging.slf4j.Log4jLoggerFactory
val usingLog4j12 = "org.slf4j.impl.Log4jLoggerFactory".equals(binderClass)
val log4j12Initialized = LogManager.getRootLogger.getAllAppenders.hasMoreElements
if (!log4j12Initialized && usingLog4j12) {
val defaultLogProps = "org/apache/spark/log4j-defaults.properties"
Option(Utils.getSparkClassLoader.getResource(defaultLogProps)) match {
case Some(url) =>
PropertyConfigurator.configure(url)
log.info(s"Using Spark's default log4j profile: $defaultLogProps")
System.err.println(s"Using Spark's default log4j profile: $defaultLogProps")
case None =>
System.err.println(s"Spark was unable to load $defaultLogProps")
}
}
Logging.initialized = true

// Force a call into slf4j to initialize it. Avoids this happening from mutliple threads
// Force a call into slf4j to initialize it. Avoids this happening from multiple threads
// and triggering this: http://mailman.qos.ch/pipermail/slf4j-dev/2010-April/002956.html
log
}
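For reference, a minimal illustration (not part of the commit) of the stripSuffix simplification above: stripSuffix removes the trailing "$" that the Scala compiler appends to companion-object class names and is a no-op when the suffix is absent, which is why the old if/substring branch is no longer needed.

scala> "org.apache.spark.Logging$".stripSuffix("$")
res0: String = org.apache.spark.Logging

scala> "org.apache.spark.Logging".stripSuffix("$")
res1: String = org.apache.spark.Logging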
43 changes: 43 additions & 0 deletions core/src/main/scala/org/apache/spark/api/java/JavaHadoopRDD.scala
@@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
(implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
extends JavaPairRDD[K, V](rdd) {

/** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
@DeveloperApi
def mapPartitionsWithInputSplit[R](
f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
preservesPartitioning: Boolean = false): JavaRDD[R] = {
new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
preservesPartitioning)(fakeClassTag))(fakeClassTag)
}
}
core/src/main/scala/org/apache/spark/api/java/JavaNewHadoopRDD.scala
@@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
(implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
extends JavaPairRDD[K, V](rdd) {

/** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
@DeveloperApi
def mapPartitionsWithInputSplit[R](
f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
preservesPartitioning: Boolean = false): JavaRDD[R] = {
new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
preservesPartitioning)(fakeClassTag))(fakeClassTag)
}
}
core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala
@@ -34,7 +34,7 @@ import org.apache.spark._
import org.apache.spark.SparkContext.{DoubleAccumulatorParam, IntAccumulatorParam}
import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.{EmptyRDD, RDD}
import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, NewHadoopRDD, RDD}

/**
* A Java-friendly version of [[org.apache.spark.SparkContext]] that returns
@@ -294,7 +294,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(keyClass)
implicit val ctagV: ClassTag[V] = ClassTag(valueClass)
new JavaPairRDD(sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass, minPartitions))
val rdd = sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass, minPartitions)
new JavaHadoopRDD(rdd.asInstanceOf[HadoopRDD[K, V]])
}

/**
@@ -314,7 +315,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(keyClass)
implicit val ctagV: ClassTag[V] = ClassTag(valueClass)
new JavaPairRDD(sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass))
val rdd = sc.hadoopRDD(conf, inputFormatClass, keyClass, valueClass)
new JavaHadoopRDD(rdd.asInstanceOf[HadoopRDD[K, V]])
}

/** Get an RDD for a Hadoop file with an arbitrary InputFormat.
@@ -333,7 +335,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(keyClass)
implicit val ctagV: ClassTag[V] = ClassTag(valueClass)
new JavaPairRDD(sc.hadoopFile(path, inputFormatClass, keyClass, valueClass, minPartitions))
val rdd = sc.hadoopFile(path, inputFormatClass, keyClass, valueClass, minPartitions)
new JavaHadoopRDD(rdd.asInstanceOf[HadoopRDD[K, V]])
}

/** Get an RDD for a Hadoop file with an arbitrary InputFormat
@@ -351,8 +354,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(keyClass)
implicit val ctagV: ClassTag[V] = ClassTag(valueClass)
new JavaPairRDD(sc.hadoopFile(path,
inputFormatClass, keyClass, valueClass))
val rdd = sc.hadoopFile(path, inputFormatClass, keyClass, valueClass)
new JavaHadoopRDD(rdd.asInstanceOf[HadoopRDD[K, V]])
}

/**
@@ -372,7 +375,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
conf: Configuration): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(kClass)
implicit val ctagV: ClassTag[V] = ClassTag(vClass)
new JavaPairRDD(sc.newAPIHadoopFile(path, fClass, kClass, vClass, conf))
val rdd = sc.newAPIHadoopFile(path, fClass, kClass, vClass, conf)
new JavaNewHadoopRDD(rdd.asInstanceOf[NewHadoopRDD[K, V]])
}

/**
@@ -391,7 +395,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
vClass: Class[V]): JavaPairRDD[K, V] = {
implicit val ctagK: ClassTag[K] = ClassTag(kClass)
implicit val ctagV: ClassTag[V] = ClassTag(vClass)
new JavaPairRDD(sc.newAPIHadoopRDD(conf, fClass, kClass, vClass))
val rdd = sc.newAPIHadoopRDD(conf, fClass, kClass, vClass)
new JavaNewHadoopRDD(rdd.asInstanceOf[NewHadoopRDD[K, V]])
}

/** Build the union of two or more RDDs. */
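A hedged sketch, not from the commit, of how these new return types are meant to be used: the Java-facing methods still declare JavaPairRDD, but the instances they now return are JavaHadoopRDD / JavaNewHadoopRDD, so a downcast exposes mapPartitionsWithInputSplit to callers. The context, master URL, and input path below are assumptions for illustration.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.api.java.{JavaHadoopRDD, JavaSparkContext}

val jsc = new JavaSparkContext("local[2]", "hadoop-rdd-example")      // assumed context
val pairs = jsc.hadoopFile("hdfs:///data/part-*", classOf[TextInputFormat],
  classOf[LongWritable], classOf[Text])
// Static type is JavaPairRDD[LongWritable, Text]; the runtime class is now JavaHadoopRDD,
// so the split-aware API becomes reachable with a cast.
val hadoopPairs = pairs.asInstanceOf[JavaHadoopRDD[LongWritable, Text]]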
11 changes: 7 additions & 4 deletions core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -184,7 +184,7 @@ object SparkSubmit {
OptionAssigner(args.archives, YARN, CLIENT, sysProp = "spark.yarn.dist.archives"),

// Yarn cluster only
OptionAssigner(args.name, YARN, CLUSTER, clOption = "--name", sysProp = "spark.app.name"),
OptionAssigner(args.name, YARN, CLUSTER, clOption = "--name"),
OptionAssigner(args.driverMemory, YARN, CLUSTER, clOption = "--driver-memory"),
OptionAssigner(args.queue, YARN, CLUSTER, clOption = "--queue"),
OptionAssigner(args.numExecutors, YARN, CLUSTER, clOption = "--num-executors"),
@@ -268,14 +268,17 @@ object SparkSubmit {
}
}

// Properties given with --conf are superseded by other options, but take precedence over
// properties in the defaults file.
for ((k, v) <- args.sparkProperties) {
sysProps.getOrElseUpdate(k, v)
}

// Read from default spark properties, if any
for ((k, v) <- args.getDefaultSparkProperties) {
sysProps.getOrElseUpdate(k, v)
}

// Spark properties included on command line take precedence
sysProps ++= args.sparkProperties

(childArgs, childClasspath, sysProps, childMainClass)
}

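To make the resulting precedence concrete, here is a minimal sketch with illustrative values only: explicit command-line options land in sysProps first, --conf values then fill in missing keys, and the defaults file fills whatever is still unset, because getOrElseUpdate never overwrites an existing entry.

import scala.collection.mutable.HashMap

val sysProps  = HashMap("spark.app.name" -> "FromNameOption")                    // set earlier from --name
val confProps = Map("spark.app.name" -> "FromConf", "spark.ui.port" -> "4045")   // --conf key=value pairs
val defaults  = Map("spark.ui.port" -> "4040", "spark.master" -> "local[*]")     // spark-defaults.conf

for ((k, v) <- confProps) sysProps.getOrElseUpdate(k, v)   // --conf only fills gaps
for ((k, v) <- defaults)  sysProps.getOrElseUpdate(k, v)   // defaults fill what is still missing
// Result: spark.app.name -> FromNameOption, spark.ui.port -> 4045, spark.master -> local[*]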
core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -58,7 +58,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
val sparkProperties: HashMap[String, String] = new HashMap[String, String]()

parseOpts(args.toList)
loadDefaults()
mergeSparkProperties()
checkRequiredArguments()

/** Return default present in the currently defined defaults file. */
@@ -79,9 +79,11 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
defaultProperties
}

/** Fill in any undefined values based on the current properties file or built-in defaults. */
private def loadDefaults(): Unit = {

/**
* Fill in any undefined values based on the default properties file or options passed in through
* the '--conf' flag.
*/
private def mergeSparkProperties(): Unit = {
// Use common defaults file, if not specified by user
if (propertiesFile == null) {
sys.env.get("SPARK_HOME").foreach { sparkHome =>
@@ -94,18 +96,20 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
}
}

val defaultProperties = getDefaultSparkProperties
val properties = getDefaultSparkProperties
properties.putAll(sparkProperties)

// Use properties file as fallback for values which have a direct analog to
// arguments in this script.
master = Option(master).getOrElse(defaultProperties.get("spark.master").orNull)
master = Option(master).getOrElse(properties.get("spark.master").orNull)
executorMemory = Option(executorMemory)
.getOrElse(defaultProperties.get("spark.executor.memory").orNull)
.getOrElse(properties.get("spark.executor.memory").orNull)
executorCores = Option(executorCores)
.getOrElse(defaultProperties.get("spark.executor.cores").orNull)
.getOrElse(properties.get("spark.executor.cores").orNull)
totalExecutorCores = Option(totalExecutorCores)
.getOrElse(defaultProperties.get("spark.cores.max").orNull)
name = Option(name).getOrElse(defaultProperties.get("spark.app.name").orNull)
jars = Option(jars).getOrElse(defaultProperties.get("spark.jars").orNull)
.getOrElse(properties.get("spark.cores.max").orNull)
name = Option(name).getOrElse(properties.get("spark.app.name").orNull)
jars = Option(jars).getOrElse(properties.get("spark.jars").orNull)

// This supports env vars in older versions of Spark
master = Option(master).getOrElse(System.getenv("MASTER"))
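A small sketch, with assumed values, of the fallback pattern mergeSparkProperties uses above: a field already set from the command line is non-null and wins, otherwise the merged properties map (defaults overlaid with --conf values) supplies it, otherwise the field stays null.

import scala.collection.mutable.HashMap

val properties = HashMap("spark.master" -> "yarn-cluster")   // defaults merged with --conf values
var master: String = null                                    // not given on the command line
master = Option(master).getOrElse(properties.get("spark.master").orNull)
// master is now "yarn-cluster"; had it been "local[2]" already, it would remain "local[2]"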
32 changes: 32 additions & 0 deletions core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -20,7 +20,9 @@ package org.apache.spark.rdd
import java.text.SimpleDateFormat
import java.util.Date
import java.io.EOFException

import scala.collection.immutable.Map
import scala.reflect.ClassTag

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.mapred.FileSplit
@@ -39,6 +41,7 @@ import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.executor.{DataReadMethod, InputMetrics}
import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD
import org.apache.spark.util.NextIterator

/**
@@ -232,6 +235,14 @@ class HadoopRDD[K, V](
new InterruptibleIterator[(K, V)](context, iter)
}

/** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
@DeveloperApi
def mapPartitionsWithInputSplit[U: ClassTag](
f: (InputSplit, Iterator[(K, V)]) => Iterator[U],
preservesPartitioning: Boolean = false): RDD[U] = {
new HadoopMapPartitionsWithSplitRDD(this, f, preservesPartitioning)
}

override def getPreferredLocations(split: Partition): Seq[String] = {
// TODO: Filtering out "localhost" in case of file:// URLs
val hadoopSplit = split.asInstanceOf[HadoopPartition]
@@ -272,4 +283,25 @@ private[spark] object HadoopRDD {
conf.setInt("mapred.task.partition", splitId)
conf.set("mapred.job.id", jobID.toString)
}

/**
* Analogous to [[org.apache.spark.rdd.MapPartitionsRDD]], but passes in an InputSplit to
* the given function rather than the index of the partition.
*/
private[spark] class HadoopMapPartitionsWithSplitRDD[U: ClassTag, T: ClassTag](
prev: RDD[T],
f: (InputSplit, Iterator[T]) => Iterator[U],
preservesPartitioning: Boolean = false)
extends RDD[U](prev) {

override val partitioner = if (preservesPartitioning) firstParent[T].partitioner else None

override def getPartitions: Array[Partition] = firstParent[T].partitions

override def compute(split: Partition, context: TaskContext) = {
val partition = split.asInstanceOf[HadoopPartition]
val inputSplit = partition.inputSplit.value
f(inputSplit, firstParent[T].iterator(split, context))
}
}
}
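As a usage sketch, not part of the commit, the new mapPartitionsWithInputSplit lets a job tag each record with the file it came from; the cast to HadoopRDD mirrors the pattern used in JavaSparkContext above, and the SparkContext sc, the path, and the record types are assumptions for illustration.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileSplit, TextInputFormat}
import org.apache.spark.rdd.HadoopRDD

// sc is an existing SparkContext; sc.hadoopFile returns a HadoopRDD at runtime.
val hadoopRdd = sc.hadoopFile[LongWritable, Text, TextInputFormat]("hdfs:///data/*.txt")
  .asInstanceOf[HadoopRDD[LongWritable, Text]]

val linesWithFiles = hadoopRdd.mapPartitionsWithInputSplit { (split, iter) =>
  val file = split.asInstanceOf[FileSplit].getPath.toString   // old-API splits from text files are FileSplits
  iter.map { case (_, line) => (file, line.toString) }
}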