Merge branch 'master' of https://github.com/apache/spark into dt-preprune

Conflicts:
	mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
qiping.lqp committed Sep 9, 2014
2 parents cadd569 + c419e4f commit bb465ca
Showing 17 changed files with 120 additions and 74 deletions.
@@ -96,11 +96,13 @@ private[spark] class ApplicationInfo(

def retryCount = _retryCount

def incrementRetryCount = {
def incrementRetryCount() = {
_retryCount += 1
_retryCount
}

def resetRetryCount() = _retryCount = 0

def markFinished(endState: ApplicationState.Value) {
state = endState
endTime = System.currentTimeMillis()
26 changes: 16 additions & 10 deletions core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -296,28 +296,34 @@ private[spark] class Master(
val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId))
execOption match {
case Some(exec) => {
val appInfo = idToApp(appId)
exec.state = state
if (state == ExecutorState.RUNNING) { appInfo.resetRetryCount() }
exec.application.driver ! ExecutorUpdated(execId, state, message, exitStatus)
if (ExecutorState.isFinished(state)) {
val appInfo = idToApp(appId)
// Remove this executor from the worker and app
logInfo("Removing executor " + exec.fullId + " because it is " + state)
logInfo(s"Removing executor ${exec.fullId} because it is $state")
appInfo.removeExecutor(exec)
exec.worker.removeExecutor(exec)

val normalExit = exitStatus.exists(_ == 0)
val normalExit = exitStatus == Some(0)
// Only retry certain number of times so we don't go into an infinite loop.
if (!normalExit && appInfo.incrementRetryCount < ApplicationState.MAX_NUM_RETRY) {
schedule()
} else if (!normalExit) {
logError("Application %s with ID %s failed %d times, removing it".format(
appInfo.desc.name, appInfo.id, appInfo.retryCount))
removeApplication(appInfo, ApplicationState.FAILED)
if (!normalExit) {
if (appInfo.incrementRetryCount() < ApplicationState.MAX_NUM_RETRY) {
schedule()
} else {
val execs = appInfo.executors.values
if (!execs.exists(_.state == ExecutorState.RUNNING)) {
logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
s"${appInfo.retryCount} times; removing it")
removeApplication(appInfo, ApplicationState.FAILED)
}
}
}
}
}
case None =>
logWarning("Got status update for unknown executor " + appId + "/" + execId)
logWarning(s"Got status update for unknown executor $appId/$execId")
}
}

@@ -159,6 +159,8 @@ private[spark] class ExecutorRunner(
Files.write(header, stderr, Charsets.UTF_8)
stderrAppender = FileAppender(process.getErrorStream, stderr, conf)

state = ExecutorState.RUNNING
worker ! ExecutorStateChanged(appId, execId, state, None, None)
// Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
// or with nonzero exit code
val exitCode = process.waitFor()
@@ -234,7 +234,7 @@ private[spark] class Worker(
try {
logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))
val manager = new ExecutorRunner(appId, execId, appDesc, cores_, memory_,
self, workerId, host, sparkHome, workDir, akkaUrl, conf, ExecutorState.RUNNING)
self, workerId, host, sparkHome, workDir, akkaUrl, conf, ExecutorState.LOADING)
executors(appId + "/" + execId) = manager
manager.start()
coresUsed += cores_
2 changes: 1 addition & 1 deletion dev/run-tests
@@ -93,7 +93,7 @@ echo "========================================================================="
# echo "q" is needed because sbt on encountering a build file with failure
# (either resolution or compilation) prompts the user for input either q, r,
# etc to quit or retry. This echo is there to make it not block.
BUILD_MVN_PROFILE_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver "
BUILD_MVN_PROFILE_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive "
echo -e "q\n" | sbt/sbt $BUILD_MVN_PROFILE_ARGS clean package assembly/assembly | \
grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"

16 changes: 8 additions & 8 deletions docs/mllib-decision-tree.md
@@ -80,7 +80,7 @@ The ordered splits create "bins" and the maximum number of such
bins can be specified using the `maxBins` parameter.

Note that the number of bins cannot be greater than the number of instances `$N$` (a rare scenario
since the default `maxBins` value is 100). The tree algorithm automatically reduces the number of
since the default `maxBins` value is 32). The tree algorithm automatically reduces the number of
bins if the condition is not satisfied.
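
For illustration, a minimal sketch of the bin-capping behaviour described above (not part of this commit); sc, the tiny data set, and the variable names are assumptions:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree

// Only 20 training instances but maxBins = 32: the algorithm caps the bin count at 20.
val tinyData = sc.parallelize((0 until 20).map(i => LabeledPoint((i % 2).toDouble, Vectors.dense(i.toDouble))))
val model = DecisionTree.trainClassifier(tinyData, 2, Map[Int, Int](), "gini", 5, 32)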

**Categorical features**
@@ -117,7 +117,7 @@ all nodes at each level of the tree. This could lead to high memory requirements
of the tree, potentially leading to memory overflow errors. To alleviate this problem, a `maxMemoryInMB`
training parameter specifies the maximum amount of memory at the workers (twice as much at the
master) to be allocated to the histogram computation. The default value is conservatively chosen to
be 128 MB to allow the decision algorithm to work in most scenarios. Once the memory requirements
be 256 MB to allow the decision algorithm to work in most scenarios. Once the memory requirements
for a level-wise computation cross the `maxMemoryInMB` threshold, the node training tasks at each
subsequent level are split into smaller tasks.
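
For illustration only (again not part of this commit), a hedged sketch of overriding maxMemoryInMB through the Strategy constructor that this commit changes; trainingData is an assumed RDD[LabeledPoint]:

import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo.Classification
import org.apache.spark.mllib.tree.configuration.Strategy
import org.apache.spark.mllib.tree.impurity.Gini

// Raise the histogram-aggregation budget above the 256 MB default for wide or deep trees.
val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 5,
  numClassesForClassification = 2, maxBins = 32, maxMemoryInMB = 512)
val model = DecisionTree.train(trainingData, strategy)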

@@ -167,7 +167,7 @@ val numClasses = 2
val categoricalFeaturesInfo = Map[Int, Int]()
val impurity = "gini"
val maxDepth = 5
val maxBins = 100
val maxBins = 32

val model = DecisionTree.trainClassifier(data, numClasses, categoricalFeaturesInfo, impurity,
maxDepth, maxBins)
@@ -213,7 +213,7 @@ Integer numClasses = 2;
HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
String impurity = "gini";
Integer maxDepth = 5;
Integer maxBins = 100;
Integer maxBins = 32;

// Train a DecisionTree model for classification.
final DecisionTreeModel model = DecisionTree.trainClassifier(data, numClasses,
@@ -250,7 +250,7 @@ data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt').cache()
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
impurity='gini', maxDepth=5, maxBins=100)
impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on training instances and compute training error
predictions = model.predict(data.map(lambda x: x.features))
@@ -293,7 +293,7 @@ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").cache
val categoricalFeaturesInfo = Map[Int, Int]()
val impurity = "variance"
val maxDepth = 5
val maxBins = 100
val maxBins = 32

val model = DecisionTree.trainRegressor(data, categoricalFeaturesInfo, impurity,
maxDepth, maxBins)
@@ -338,7 +338,7 @@ JavaSparkContext sc = new JavaSparkContext(sparkConf);
HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
String impurity = "variance";
Integer maxDepth = 5;
Integer maxBins = 100;
Integer maxBins = 32;

// Train a DecisionTree model.
final DecisionTreeModel model = DecisionTree.trainRegressor(data,
@@ -380,7 +380,7 @@ data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt').cache()
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(data, categoricalFeaturesInfo={},
impurity='variance', maxDepth=5, maxBins=100)
impurity='variance', maxDepth=5, maxBins=32)

# Evaluate model on training instances and compute training error
predictions = model.predict(data.map(lambda x: x.features))
@@ -63,7 +63,7 @@ public static void main(String[] args) {
HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
String impurity = "gini";
Integer maxDepth = 5;
Integer maxBins = 100;
Integer maxBins = 32;

// Train a DecisionTree model for classification.
final DecisionTreeModel model = DecisionTree.trainClassifier(data, numClasses,
@@ -52,9 +52,9 @@ object DecisionTreeRunner {
input: String = null,
dataFormat: String = "libsvm",
algo: Algo = Classification,
maxDepth: Int = 4,
maxDepth: Int = 5,
impurity: ImpurityType = Gini,
maxBins: Int = 100,
maxBins: Int = 32,
fracTest: Double = 0.2)

def main(args: Array[String]) {
@@ -331,9 +331,9 @@ object DecisionTree extends Serializable with Logging {
* Supported values: "gini" (recommended) or "entropy".
* @param maxDepth Maximum depth of the tree.
* E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
* (suggested value: 4)
* (suggested value: 5)
* @param maxBins maximum number of bins used for splitting features
* (suggested value: 100)
* (suggested value: 32)
* @return DecisionTreeModel that can be used for prediction
*/
def trainClassifier(
@@ -375,9 +375,9 @@ object DecisionTree extends Serializable with Logging {
* Supported values: "variance".
* @param maxDepth Maximum depth of the tree.
* E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
* (suggested value: 4)
* (suggested value: 5)
* @param maxBins maximum number of bins used for splitting features
* (suggested value: 100)
* (suggested value: 32)
* @return DecisionTreeModel that can be used for prediction
*/
def trainRegressor(
@@ -56,20 +56,20 @@ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
* If a split has less information gain than minInfoGain,
* this split will not be considered as a valid split.
* @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. Default value is
* 128 MB.
* 256 MB.
*/
@Experimental
class Strategy (
val algo: Algo,
val impurity: Impurity,
val maxDepth: Int,
val numClassesForClassification: Int = 2,
val maxBins: Int = 100,
val maxBins: Int = 32,
val quantileCalculationStrategy: QuantileStrategy = Sort,
val categoricalFeaturesInfo: Map[Int, Int] = Map[Int, Int](),
val minInstancesPerNode: Int = 0,
val minInfoGain: Double = 0.0,
val maxMemoryInMB: Int = 128) extends Serializable {
val maxMemoryInMB: Int = 256) extends Serializable {

if (algo == Classification) {
require(numClassesForClassification >= 2)
@@ -31,7 +31,6 @@ import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Variance}
import org.apache.spark.mllib.tree.model.{Split, DecisionTreeModel, Node}
import org.apache.spark.mllib.util.LocalSparkContext


class DecisionTreeSuite extends FunSuite with LocalSparkContext {

def validateClassifier(
@@ -355,8 +354,6 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
assert(splits(0).length === 99)
assert(bins.length === 2)
assert(bins(0).length === 100)
assert(splits(0).length === 99)
assert(bins(0).length === 100)

val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(8), metadata, 0,
@@ -383,8 +380,6 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
assert(splits(0).length === 99)
assert(bins.length === 2)
assert(bins(0).length === 100)
assert(splits(0).length === 99)
assert(bins(0).length === 100)

val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(2), metadata, 0,
@@ -412,8 +407,6 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
assert(splits(0).length === 99)
assert(bins.length === 2)
assert(bins(0).length === 100)
assert(splits(0).length === 99)
assert(bins(0).length === 100)

val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(2), metadata, 0,
@@ -441,8 +434,6 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
assert(splits(0).length === 99)
assert(bins.length === 2)
assert(bins(0).length === 100)
assert(splits(0).length === 99)
assert(bins(0).length === 100)

val treeInput = TreePoint.convertToTreeRDD(rdd, bins, metadata)
val bestSplits = DecisionTree.findBestSplits(treeInput, new Array(2), metadata, 0,
@@ -466,8 +457,6 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
assert(splits(0).length === 99)
assert(bins.length === 2)
assert(bins(0).length === 100)
assert(splits(0).length === 99)
assert(bins(0).length === 100)

// Train a 1-node model
val strategyOneNode = new Strategy(Classification, Entropy, 1, 2, 100)
@@ -602,7 +591,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
val arr = DecisionTreeSuite.generateContinuousDataPointsForMulticlass()
val rdd = sc.parallelize(arr)
val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 4,
numClassesForClassification = 3)
numClassesForClassification = 3, maxBins = 100)
assert(strategy.isMulticlassClassification)
val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy)

@@ -628,7 +617,7 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
val arr = DecisionTreeSuite.generateContinuousDataPointsForMulticlass()
val rdd = sc.parallelize(arr)
val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 4,
numClassesForClassification = 3, categoricalFeaturesInfo = Map(0 -> 3))
numClassesForClassification = 3, maxBins = 100, categoricalFeaturesInfo = Map(0 -> 3))
assert(strategy.isMulticlassClassification)
val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy)
assert(metadata.isUnordered(featureIndex = 0))
@@ -654,7 +643,8 @@ class DecisionTreeSuite extends FunSuite with LocalSparkContext {
val arr = DecisionTreeSuite.generateCategoricalDataPointsForMulticlassForOrderedFeatures()
val rdd = sc.parallelize(arr)
val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 4,
numClassesForClassification = 3, categoricalFeaturesInfo = Map(0 -> 10, 1 -> 10))
numClassesForClassification = 3, maxBins = 100,
categoricalFeaturesInfo = Map(0 -> 10, 1 -> 10))
assert(strategy.isMulticlassClassification)
val metadata = DecisionTreeMetadata.buildMetadata(rdd, strategy)
assert(!metadata.isUnordered(featureIndex = 0))
4 changes: 2 additions & 2 deletions python/pyspark/mllib/tree.py
@@ -138,7 +138,7 @@ class DecisionTree(object):

@staticmethod
def trainClassifier(data, numClasses, categoricalFeaturesInfo,
impurity="gini", maxDepth=4, maxBins=100):
impurity="gini", maxDepth=5, maxBins=32):
"""
Train a DecisionTreeModel for classification.
@@ -170,7 +170,7 @@ def trainClassifier(data, numClasses, categoricalFeaturesInfo,

@staticmethod
def trainRegressor(data, categoricalFeaturesInfo,
impurity="variance", maxDepth=4, maxBins=100):
impurity="variance", maxDepth=5, maxBins=32):
"""
Train a DecisionTreeModel for regression.
@@ -73,6 +73,7 @@ class SqlParser extends StandardTokenParsers with PackratParsers {
protected val ASC = Keyword("ASC")
protected val APPROXIMATE = Keyword("APPROXIMATE")
protected val AVG = Keyword("AVG")
protected val BETWEEN = Keyword("BETWEEN")
protected val BY = Keyword("BY")
protected val CACHE = Keyword("CACHE")
protected val CAST = Keyword("CAST")
@@ -272,6 +273,9 @@ class SqlParser extends StandardTokenParsers with PackratParsers {
termExpression ~ ">=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => GreaterThanOrEqual(e1, e2) } |
termExpression ~ "!=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(EqualTo(e1, e2)) } |
termExpression ~ "<>" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(EqualTo(e1, e2)) } |
termExpression ~ BETWEEN ~ termExpression ~ AND ~ termExpression ^^ {
case e ~ _ ~ el ~ _ ~ eu => And(GreaterThanOrEqual(e, el), LessThanOrEqual(e, eu))
} |
termExpression ~ RLIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => RLike(e1, e2) } |
termExpression ~ REGEXP ~ termExpression ^^ { case e1 ~ _ ~ e2 => RLike(e1, e2) } |
termExpression ~ LIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => Like(e1, e2) } |
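For reference, a hedged illustration (not part of this commit) of what the new BETWEEN rule buys: the predicate is rewritten into the existing comparison expressions before planning, so the two queries below are equivalent. Table and column names are taken from the tests further down:

// "key BETWEEN 5 AND 7" parses to And(GreaterThanOrEqual(key, 5), LessThanOrEqual(key, 7)).
val between  = sql("SELECT key, value FROM testData WHERE key BETWEEN 5 AND 7")
val expanded = sql("SELECT key, value FROM testData WHERE key >= 5 AND key <= 7")
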
4 changes: 2 additions & 2 deletions sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -246,7 +246,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
* @group userf
*/
def registerRDDAsTable(rdd: SchemaRDD, tableName: String): Unit = {
catalog.registerTable(None, tableName, rdd.logicalPlan)
catalog.registerTable(None, tableName, rdd.queryExecution.analyzed)
}

/**
@@ -411,7 +411,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
protected def stringOrError[A](f: => A): String =
try f.toString catch { case e: Throwable => e.toString }

def simpleString: String =
def simpleString: String =
s"""== Physical Plan ==
|${stringOrError(executedPlan)}
"""
18 changes: 18 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -597,4 +597,22 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
(3, null) ::
(4, 2147483644) :: Nil)
}

test("SPARK-3423 BETWEEN") {
checkAnswer(
sql("SELECT key, value FROM testData WHERE key BETWEEN 5 and 7"),
Seq((5, "5"), (6, "6"), (7, "7"))
)

checkAnswer(
sql("SELECT key, value FROM testData WHERE key BETWEEN 7 and 7"),
Seq((7, "7"))
)

checkAnswer(
sql("SELECT key, value FROM testData WHERE key BETWEEN 9 and 7"),
Seq()
)

}
}