Skip to content

Commit

Permalink
Merge branch 'master' into fix_metric
Browse files Browse the repository at this point in the history
  • Loading branch information
davies committed Sep 12, 2014
2 parents 7c879e0 + ed1980f commit a5cdb63
Show file tree
Hide file tree
Showing 60 changed files with 772 additions and 248 deletions.
1 change: 0 additions & 1 deletion core/src/main/scala/org/apache/spark/SparkContext.scala
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, SparkD
import org.apache.spark.scheduler.cluster.mesos.{CoarseMesosSchedulerBackend, MesosSchedulerBackend}
import org.apache.spark.scheduler.local.LocalBackend
import org.apache.spark.storage._
import org.apache.spark.SPARK_VERSION
import org.apache.spark.ui.SparkUI
import org.apache.spark.util.{CallSite, ClosureCleaner, MetadataCleaner, MetadataCleanerType, TimeStampedWeakValueHashMap, Utils}

Expand Down
18 changes: 15 additions & 3 deletions core/src/main/scala/org/apache/spark/deploy/master/Master.scala
Original file line number Diff line number Diff line change
Expand Up @@ -487,13 +487,25 @@ private[spark] class Master(
if (state != RecoveryState.ALIVE) { return }

// First schedule drivers, they take strict precedence over applications
val shuffledWorkers = Random.shuffle(workers) // Randomization helps balance drivers
for (worker <- shuffledWorkers if worker.state == WorkerState.ALIVE) {
for (driver <- List(waitingDrivers: _*)) { // iterate over a copy of waitingDrivers
// Randomization helps balance drivers
val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
val aliveWorkerNum = shuffledAliveWorkers.size
var curPos = 0
for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers
// We assign workers to each waiting driver in a round-robin fashion. For each driver, we
// start from the last worker that was assigned a driver, and continue onwards until we have
// explored all alive workers.
curPos = (curPos + 1) % aliveWorkerNum
val startPos = curPos
var launched = false
while (curPos != startPos && !launched) {
val worker = shuffledAliveWorkers(curPos)
if (worker.memoryFree >= driver.desc.mem && worker.coresFree >= driver.desc.cores) {
launchDriver(worker, driver)
waitingDrivers -= driver
launched = true
}
curPos = (curPos + 1) % aliveWorkerNum
}
}

Expand Down
19 changes: 18 additions & 1 deletion core/src/main/scala/org/apache/spark/util/Utils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ package org.apache.spark.util
import java.io._
import java.net._
import java.nio.ByteBuffer
import java.util.{Locale, Random, UUID}
import java.util.{Properties, Locale, Random, UUID}
import java.util.concurrent.{ThreadFactory, ConcurrentHashMap, Executors, ThreadPoolExecutor}

import org.apache.log4j.PropertyConfigurator

import scala.collection.JavaConversions._
import scala.collection.Map
import scala.collection.mutable.ArrayBuffer
Expand Down Expand Up @@ -834,6 +836,7 @@ private[spark] object Utils extends Logging {
val exitCode = process.waitFor()
stdoutThread.join() // Wait for it to finish reading output
if (exitCode != 0) {
logError(s"Process $command exited with code $exitCode: ${output}")
throw new SparkException("Process " + command + " exited with code " + exitCode)
}
output.toString
Expand Down Expand Up @@ -1444,6 +1447,20 @@ private[spark] object Utils extends Logging {
}
}

/**
* config a log4j properties used for testsuite
*/
def configTestLog4j(level: String): Unit = {
val pro = new Properties()
pro.put("log4j.rootLogger", s"$level, console")
pro.put("log4j.appender.console", "org.apache.log4j.ConsoleAppender")
pro.put("log4j.appender.console.target", "System.err")
pro.put("log4j.appender.console.layout", "org.apache.log4j.PatternLayout")
pro.put("log4j.appender.console.layout.ConversionPattern",
"%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n")
PropertyConfigurator.configure(pro)
}

}

/**
Expand Down
5 changes: 1 addition & 4 deletions core/src/test/scala/org/apache/spark/DriverSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ package org.apache.spark

import java.io.File

import org.apache.log4j.Logger
import org.apache.log4j.Level

import org.scalatest.FunSuite
import org.scalatest.concurrent.Timeouts
import org.scalatest.prop.TableDrivenPropertyChecks._
Expand Down Expand Up @@ -54,7 +51,7 @@ class DriverSuite extends FunSuite with Timeouts {
*/
object DriverWithoutCleanup {
def main(args: Array[String]) {
Logger.getRootLogger().setLevel(Level.WARN)
Utils.configTestLog4j("INFO")
val sc = new SparkContext(args(0), "DriverWithoutCleanup")
sc.parallelize(1 to 100, 4).count()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {

object JarCreationTest {
def main(args: Array[String]) {
Utils.configTestLog4j("INFO")
val conf = new SparkConf()
val sc = new SparkContext(conf)
val result = sc.makeRDD(1 to 100, 10).mapPartitions { x =>
Expand All @@ -338,6 +339,7 @@ object JarCreationTest {

object SimpleApplicationTest {
def main(args: Array[String]) {
Utils.configTestLog4j("INFO")
val conf = new SparkConf()
val sc = new SparkContext(conf)
val configs = Seq("spark.master", "spark.app.name")
Expand Down
7 changes: 7 additions & 0 deletions docs/running-on-yarn.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,13 @@ Most of the configs are the same for Spark on YARN as for other deployment modes
the environment of the executor launcher.
</td>
</tr>
<tr>
<td><code>spark.yarn.containerLauncherMaxThreads</code></td>
<td>25</td>
<td>
The maximum number of threads to use in the application master for launching executor containers.
</td>
</tr>
</table>

# Launching Spark on YARN
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ package org.apache.spark.examples.graphx
import org.apache.spark.SparkContext._
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.examples.graphx.Analytics


/**
* Uses GraphX to run PageRank on a LiveJournal social network graph. Download the dataset from
Expand Down
72 changes: 53 additions & 19 deletions mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo

// Find best split for all nodes at a level.
timer.start("findBestSplits")
val splitsStatsForLevel: Array[(Split, InformationGainStats)] =
val splitsStatsForLevel: Array[(Split, InformationGainStats, Predict)] =
DecisionTree.findBestSplits(treeInput, parentImpurities,
metadata, level, nodes, splits, bins, maxLevelForSingleGroup, timer)
timer.stop("findBestSplits")
Expand All @@ -143,8 +143,9 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
timer.start("extractNodeInfo")
val split = nodeSplitStats._1
val stats = nodeSplitStats._2
val predict = nodeSplitStats._3.predict
val isLeaf = (stats.gain <= 0) || (level == strategy.maxDepth)
val node = new Node(nodeIndex, stats.predict, isLeaf, Some(split), None, None, Some(stats))
val node = new Node(nodeIndex, predict, isLeaf, Some(split), None, None, Some(stats))
logDebug("Node = " + node)
nodes(nodeIndex) = node
timer.stop("extractNodeInfo")
Expand Down Expand Up @@ -425,7 +426,7 @@ object DecisionTree extends Serializable with Logging {
splits: Array[Array[Split]],
bins: Array[Array[Bin]],
maxLevelForSingleGroup: Int,
timer: TimeTracker = new TimeTracker): Array[(Split, InformationGainStats)] = {
timer: TimeTracker = new TimeTracker): Array[(Split, InformationGainStats, Predict)] = {
// split into groups to avoid memory overflow during aggregation
if (level > maxLevelForSingleGroup) {
// When information for all nodes at a given level cannot be stored in memory,
Expand All @@ -434,7 +435,7 @@ object DecisionTree extends Serializable with Logging {
// numGroups is equal to 2 at level 11 and 4 at level 12, respectively.
val numGroups = 1 << level - maxLevelForSingleGroup
logDebug("numGroups = " + numGroups)
var bestSplits = new Array[(Split, InformationGainStats)](0)
var bestSplits = new Array[(Split, InformationGainStats, Predict)](0)
// Iterate over each group of nodes at a level.
var groupIndex = 0
while (groupIndex < numGroups) {
Expand Down Expand Up @@ -605,7 +606,7 @@ object DecisionTree extends Serializable with Logging {
bins: Array[Array[Bin]],
timer: TimeTracker,
numGroups: Int = 1,
groupIndex: Int = 0): Array[(Split, InformationGainStats)] = {
groupIndex: Int = 0): Array[(Split, InformationGainStats, Predict)] = {

/*
* The high-level descriptions of the best split optimizations are noted here.
Expand Down Expand Up @@ -705,7 +706,7 @@ object DecisionTree extends Serializable with Logging {

// Calculate best splits for all nodes at a given level
timer.start("chooseSplits")
val bestSplits = new Array[(Split, InformationGainStats)](numNodes)
val bestSplits = new Array[(Split, InformationGainStats, Predict)](numNodes)
// Iterating over all nodes at this level
var nodeIndex = 0
while (nodeIndex < numNodes) {
Expand Down Expand Up @@ -734,28 +735,27 @@ object DecisionTree extends Serializable with Logging {
topImpurity: Double,
level: Int,
metadata: DecisionTreeMetadata): InformationGainStats = {

val leftCount = leftImpurityCalculator.count
val rightCount = rightImpurityCalculator.count

val totalCount = leftCount + rightCount
if (totalCount == 0) {
// Return arbitrary prediction.
return new InformationGainStats(0, topImpurity, topImpurity, topImpurity, 0)
// If left child or right child doesn't satisfy minimum instances per node,
// then this split is invalid, return invalid information gain stats.
if ((leftCount < metadata.minInstancesPerNode) ||
(rightCount < metadata.minInstancesPerNode)) {
return InformationGainStats.invalidInformationGainStats
}

val parentNodeAgg = leftImpurityCalculator.copy
parentNodeAgg.add(rightImpurityCalculator)
val totalCount = leftCount + rightCount

// impurity of parent node
val impurity = if (level > 0) {
topImpurity
} else {
val parentNodeAgg = leftImpurityCalculator.copy
parentNodeAgg.add(rightImpurityCalculator)
parentNodeAgg.calculate()
}

val predict = parentNodeAgg.predict
val prob = parentNodeAgg.prob(predict)

val leftImpurity = leftImpurityCalculator.calculate() // Note: This equals 0 if count = 0
val rightImpurity = rightImpurityCalculator.calculate()

Expand All @@ -764,7 +764,31 @@ object DecisionTree extends Serializable with Logging {

val gain = impurity - leftWeight * leftImpurity - rightWeight * rightImpurity

new InformationGainStats(gain, impurity, leftImpurity, rightImpurity, predict, prob)
// if information gain doesn't satisfy minimum information gain,
// then this split is invalid, return invalid information gain stats.
if (gain < metadata.minInfoGain) {
return InformationGainStats.invalidInformationGainStats
}

new InformationGainStats(gain, impurity, leftImpurity, rightImpurity)
}

/**
* Calculate predict value for current node, given stats of any split.
* Note that this function is called only once for each node.
* @param leftImpurityCalculator left node aggregates for a split
* @param rightImpurityCalculator right node aggregates for a node
* @return predict value for current node
*/
private def calculatePredict(
leftImpurityCalculator: ImpurityCalculator,
rightImpurityCalculator: ImpurityCalculator): Predict = {
val parentNodeAgg = leftImpurityCalculator.copy
parentNodeAgg.add(rightImpurityCalculator)
val predict = parentNodeAgg.predict
val prob = parentNodeAgg.prob(predict)

new Predict(predict, prob)
}

/**
Expand All @@ -780,12 +804,15 @@ object DecisionTree extends Serializable with Logging {
nodeImpurity: Double,
level: Int,
metadata: DecisionTreeMetadata,
splits: Array[Array[Split]]): (Split, InformationGainStats) = {
splits: Array[Array[Split]]): (Split, InformationGainStats, Predict) = {

logDebug("node impurity = " + nodeImpurity)

// calculate predict only once
var predict: Option[Predict] = None

// For each (feature, split), calculate the gain, and select the best (feature, split).
Range(0, metadata.numFeatures).map { featureIndex =>
val (bestSplit, bestSplitStats) = Range(0, metadata.numFeatures).map { featureIndex =>
val numSplits = metadata.numSplits(featureIndex)
if (metadata.isContinuous(featureIndex)) {
// Cumulative sum (scanLeft) of bin statistics.
Expand All @@ -803,6 +830,7 @@ object DecisionTree extends Serializable with Logging {
val leftChildStats = binAggregates.getImpurityCalculator(nodeFeatureOffset, splitIdx)
val rightChildStats = binAggregates.getImpurityCalculator(nodeFeatureOffset, numSplits)
rightChildStats.subtract(leftChildStats)
predict = Some(predict.getOrElse(calculatePredict(leftChildStats, rightChildStats)))
val gainStats =
calculateGainForSplit(leftChildStats, rightChildStats, nodeImpurity, level, metadata)
(splitIdx, gainStats)
Expand All @@ -816,6 +844,7 @@ object DecisionTree extends Serializable with Logging {
Range(0, numSplits).map { splitIndex =>
val leftChildStats = binAggregates.getImpurityCalculator(leftChildOffset, splitIndex)
val rightChildStats = binAggregates.getImpurityCalculator(rightChildOffset, splitIndex)
predict = Some(predict.getOrElse(calculatePredict(leftChildStats, rightChildStats)))
val gainStats =
calculateGainForSplit(leftChildStats, rightChildStats, nodeImpurity, level, metadata)
(splitIndex, gainStats)
Expand Down Expand Up @@ -887,6 +916,7 @@ object DecisionTree extends Serializable with Logging {
val rightChildStats =
binAggregates.getImpurityCalculator(nodeFeatureOffset, lastCategory)
rightChildStats.subtract(leftChildStats)
predict = Some(predict.getOrElse(calculatePredict(leftChildStats, rightChildStats)))
val gainStats =
calculateGainForSplit(leftChildStats, rightChildStats, nodeImpurity, level, metadata)
(splitIndex, gainStats)
Expand All @@ -898,6 +928,10 @@ object DecisionTree extends Serializable with Logging {
(bestFeatureSplit, bestFeatureGainStats)
}
}.maxBy(_._2.gain)

require(predict.isDefined, "must calculate predict for each node")

(bestSplit, bestSplitStats, predict.get)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
* k) implies the feature n is categorical with k categories 0,
* 1, 2, ... , k-1. It's important to note that features are
* zero-indexed.
* @param minInstancesPerNode Minimum number of instances each child must have after split.
* Default value is 1. If a split cause left or right child
* to have less than minInstancesPerNode,
* this split will not be considered as a valid split.
* @param minInfoGain Minimum information gain a split must get. Default value is 0.0.
* If a split has less information gain than minInfoGain,
* this split will not be considered as a valid split.
* @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. Default value is
* 256 MB.
*/
Expand All @@ -61,6 +68,8 @@ class Strategy (
val maxBins: Int = 32,
val quantileCalculationStrategy: QuantileStrategy = Sort,
val categoricalFeaturesInfo: Map[Int, Int] = Map[Int, Int](),
val minInstancesPerNode: Int = 1,
val minInfoGain: Double = 0.0,
val maxMemoryInMB: Int = 256) extends Serializable {

if (algo == Classification) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ private[tree] class DecisionTreeMetadata(
val unorderedFeatures: Set[Int],
val numBins: Array[Int],
val impurity: Impurity,
val quantileStrategy: QuantileStrategy) extends Serializable {
val quantileStrategy: QuantileStrategy,
val minInstancesPerNode: Int,
val minInfoGain: Double) extends Serializable {

def isUnordered(featureIndex: Int): Boolean = unorderedFeatures.contains(featureIndex)

Expand Down Expand Up @@ -127,7 +129,8 @@ private[tree] object DecisionTreeMetadata {

new DecisionTreeMetadata(numFeatures, numExamples, numClasses, numBins.max,
strategy.categoricalFeaturesInfo, unorderedFeatures.toSet, numBins,
strategy.impurity, strategy.quantileCalculationStrategy)
strategy.impurity, strategy.quantileCalculationStrategy,
strategy.minInstancesPerNode, strategy.minInfoGain)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,26 @@ import org.apache.spark.annotation.DeveloperApi
* @param impurity current node impurity
* @param leftImpurity left node impurity
* @param rightImpurity right node impurity
* @param predict predicted value
* @param prob probability of the label (classification only)
*/
@DeveloperApi
class InformationGainStats(
val gain: Double,
val impurity: Double,
val leftImpurity: Double,
val rightImpurity: Double,
val predict: Double,
val prob: Double = 0.0) extends Serializable {
val rightImpurity: Double) extends Serializable {

override def toString = {
"gain = %f, impurity = %f, left impurity = %f, right impurity = %f, predict = %f, prob = %f"
.format(gain, impurity, leftImpurity, rightImpurity, predict, prob)
"gain = %f, impurity = %f, left impurity = %f, right impurity = %f"
.format(gain, impurity, leftImpurity, rightImpurity)
}
}


private[tree] object InformationGainStats {
/**
* An [[org.apache.spark.mllib.tree.model.InformationGainStats]] object to
* denote that current split doesn't satisfies minimum info gain or
* minimum number of instances per node.
*/
val invalidInformationGainStats = new InformationGainStats(Double.MinValue, -1.0, -1.0, -1.0)
}
Loading

0 comments on commit a5cdb63

Please sign in to comment.