From 114bad606e7a17f980ea6c99e31c8ab0179fec2e Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Wed, 29 Apr 2015 17:26:46 -0700 Subject: [PATCH 1/4] [SPARK-7176] [ML] Add validation functionality to Param Main change: Added isValid field to Param. Modified all usages to use isValid when relevant. Added helper methods in ParamValidate. Also overrode Params.validate() in: * CrossValidator + model * Pipeline + model I made a few updates for the elastic net patch: * I changed "tol" to "convergenceTol" * I added some documentation This PR is Scala + Java only. Python will be in a follow-up PR. CC: mengxr Author: Joseph K. Bradley Closes #5740 from jkbradley/enforce-validate and squashes the following commits: ad9c6c1 [Joseph K. Bradley] re-generated sharedParams after merging with current master 76415e8 [Joseph K. Bradley] reverted convergenceTol to tol af62f4b [Joseph K. Bradley] Removed changes to SparkBuild, python linalg. Fixed test failures. Renamed ParamValidate to ParamValidators. Removed explicit type from ParamValidators calls where possible. bb2665a [Joseph K. Bradley] merged with elastic net pr ecda302 [Joseph K. Bradley] fix rat tests, plus add a little doc 6895dfc [Joseph K. Bradley] small cleanups 069ac6d [Joseph K. Bradley] many cleanups 928fb84 [Joseph K. Bradley] Maybe done a910ac7 [Joseph K. Bradley] still workin 6d60e2e [Joseph K. Bradley] Still workin b987319 [Joseph K. Bradley] Partly done with adding checks, but blocking on adding checking functionality to Param dbc9fb2 [Joseph K. Bradley] merged with master. enforcing Params.validate --- .../examples/ml/JavaDeveloperApiExample.java | 14 +- .../scala/org/apache/spark/ml/Pipeline.scala | 19 +- .../ml/classification/GBTClassifier.scala | 13 +- .../apache/spark/ml/feature/HashingTF.scala | 12 +- .../apache/spark/ml/feature/Normalizer.scala | 11 +- .../ml/feature/PolynomialExpansion.scala | 9 +- .../spark/ml/feature/StandardScaler.scala | 10 +- .../apache/spark/ml/feature/Tokenizer.scala | 20 +- .../spark/ml/feature/VectorIndexer.scala | 18 +- .../spark/ml/impl/tree/treeParams.scala | 115 ++++------- .../org/apache/spark/ml/param/params.scala | 179 ++++++++++++++++-- .../ml/param/shared/SharedParamsCodeGen.scala | 35 ++-- .../spark/ml/param/shared/sharedParams.scala | 122 +++++------- .../apache/spark/ml/recommendation/ALS.scala | 35 ++-- .../spark/ml/regression/GBTRegressor.scala | 13 +- .../ml/regression/LinearRegression.scala | 16 +- .../spark/ml/tuning/CrossValidator.scala | 22 ++- .../mllib/tree/GradientBoostedTrees.scala | 4 +- .../spark/ml/param/JavaParamsSuite.java | 66 +++++++ .../apache/spark/ml/param/JavaTestParams.java | 63 ++++++ .../apache/spark/ml/param/ParamsSuite.scala | 69 ++++++- .../apache/spark/ml/param/TestParams.scala | 2 +- 22 files changed, 593 insertions(+), 274 deletions(-) create mode 100644 mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java create mode 100644 mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index eaf00d09f550d..46377a99c4857 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -28,7 +28,6 @@ import org.apache.spark.ml.classification.ClassificationModel; import org.apache.spark.ml.param.IntParam; import org.apache.spark.ml.param.ParamMap; 
-import org.apache.spark.ml.param.Params; import org.apache.spark.ml.param.Params$; import org.apache.spark.mllib.linalg.BLAS; import org.apache.spark.mllib.linalg.Vector; @@ -100,11 +99,12 @@ public static void main(String[] args) throws Exception { /** * Example of defining a type of {@link Classifier}. * - * NOTE: This is private since it is an example. In practice, you may not want it to be private. + * Note: Some IDEs (e.g., IntelliJ) will complain that this will not compile due to + * {@link org.apache.spark.ml.param.Params#set} using incompatible return types. + * However, this should still compile and run successfully. */ class MyJavaLogisticRegression - extends Classifier - implements Params { + extends Classifier { /** * Param for max number of iterations @@ -145,10 +145,12 @@ public MyJavaLogisticRegressionModel train(DataFrame dataset, ParamMap paramMap) /** * Example of defining a type of {@link ClassificationModel}. * - * NOTE: This is private since it is an example. In practice, you may not want it to be private. + * Note: Some IDEs (e.g., IntelliJ) will complain that this will not compile due to + * {@link org.apache.spark.ml.param.Params#set} using incompatible return types. + * However, this should still compile and run successfully. */ class MyJavaLogisticRegressionModel - extends ClassificationModel implements Params { + extends ClassificationModel { private MyJavaLogisticRegression parent_; public MyJavaLogisticRegression parent() { return parent_; } diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index 8eddf79cdfe28..6bfeecd764d75 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -21,7 +21,7 @@ import scala.collection.mutable.ListBuffer import org.apache.spark.Logging import org.apache.spark.annotation.{AlphaComponent, DeveloperApi} -import org.apache.spark.ml.param.{Param, ParamMap} +import org.apache.spark.ml.param.{Params, Param, ParamMap} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType @@ -86,6 +86,14 @@ class Pipeline extends Estimator[PipelineModel] { def setStages(value: Array[PipelineStage]): this.type = { set(stages, value); this } def getStages: Array[PipelineStage] = getOrDefault(stages) + override def validate(paramMap: ParamMap): Unit = { + val map = extractParamMap(paramMap) + getStages.foreach { + case pStage: Params => pStage.validate(map) + case _ => + } + } + /** * Fits the pipeline to the input dataset with additional parameters. If a stage is an * [[Estimator]], its [[Estimator#fit]] method will be called on the input dataset to fit a model. @@ -140,7 +148,7 @@ class Pipeline extends Estimator[PipelineModel] { override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { val map = extractParamMap(paramMap) val theStages = map(stages) - require(theStages.toSet.size == theStages.size, + require(theStages.toSet.size == theStages.length, "Cannot have duplicate components in a pipeline.") theStages.foldLeft(schema)((cur, stage) => stage.transformSchema(cur, paramMap)) } @@ -157,6 +165,11 @@ class PipelineModel private[ml] ( private[ml] val stages: Array[Transformer]) extends Model[PipelineModel] with Logging { + override def validate(paramMap: ParamMap): Unit = { + val map = fittingParamMap ++ extractParamMap(paramMap) + stages.foreach(_.validate(map)) + } + /** * Gets the model produced by the input estimator. 
Throws an NoSuchElementException is the input * estimator does not exist in the pipeline. @@ -168,7 +181,7 @@ class PipelineModel private[ml] ( } if (matched.isEmpty) { throw new NoSuchElementException(s"Cannot find stage $stage from the pipeline.") - } else if (matched.size > 1) { + } else if (matched.length > 1) { throw new IllegalStateException(s"Cannot have duplicate estimators in the sample pipeline.") } else { matched.head.asInstanceOf[M] diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index d2e052fbbbf22..3d849867d4c47 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -103,21 +103,16 @@ final class GBTClassifier */ val lossType: Param[String] = new Param[String](this, "lossType", "Loss function which GBT" + " tries to minimize (case-insensitive). Supported options:" + - s" ${GBTClassifier.supportedLossTypes.mkString(", ")}") + s" ${GBTClassifier.supportedLossTypes.mkString(", ")}", + (value: String) => GBTClassifier.supportedLossTypes.contains(value.toLowerCase)) setDefault(lossType -> "logistic") /** @group setParam */ - def setLossType(value: String): this.type = { - val lossStr = value.toLowerCase - require(GBTClassifier.supportedLossTypes.contains(lossStr), "GBTClassifier was given bad loss" + - s" type: $value. Supported options: ${GBTClassifier.supportedLossTypes.mkString(", ")}") - set(lossType, lossStr) - this - } + def setLossType(value: String): this.type = set(lossType, value) /** @group getParam */ - def getLossType: String = getOrDefault(lossType) + def getLossType: String = getOrDefault(lossType).toLowerCase /** (private[ml]) Convert new loss to old loss. */ override private[ml] def getOldLossType: OldLoss = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index b20f2fc49a8f6..0b3128f9ee8cd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer -import org.apache.spark.ml.param.{IntParam, ParamMap} +import org.apache.spark.ml.param.{ParamValidators, IntParam, ParamMap} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{VectorUDT, Vector} import org.apache.spark.sql.types.DataType @@ -32,10 +32,14 @@ import org.apache.spark.sql.types.DataType class HashingTF extends UnaryTransformer[Iterable[_], Vector, HashingTF] { /** - * number of features + * Number of features. Should be > 0. 
+ * (default = 2^18^) * @group param */ - val numFeatures = new IntParam(this, "numFeatures", "number of features") + val numFeatures = new IntParam(this, "numFeatures", "number of features (> 0)", + ParamValidators.gt(0)) + + setDefault(numFeatures -> (1 << 18)) /** @group getParam */ def getNumFeatures: Int = getOrDefault(numFeatures) @@ -43,8 +47,6 @@ class HashingTF extends UnaryTransformer[Iterable[_], Vector, HashingTF] { /** @group setParam */ def setNumFeatures(value: Int): this.type = set(numFeatures, value) - setDefault(numFeatures -> (1 << 18)) - override protected def createTransformFunc(paramMap: ParamMap): Iterable[_] => Vector = { val hashingTF = new feature.HashingTF(paramMap(numFeatures)) hashingTF.transform diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala index decaeb0da6246..bd2b5f6067e2d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer -import org.apache.spark.ml.param.{DoubleParam, ParamMap} +import org.apache.spark.ml.param.{ParamValidators, DoubleParam, ParamMap} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{VectorUDT, Vector} import org.apache.spark.sql.types.DataType @@ -32,10 +32,13 @@ import org.apache.spark.sql.types.DataType class Normalizer extends UnaryTransformer[Vector, Vector, Normalizer] { /** - * Normalization in L^p^ space, p = 2 by default. + * Normalization in L^p^ space. Must be >= 1. + * (default: p = 2) * @group param */ - val p = new DoubleParam(this, "p", "the p norm value") + val p = new DoubleParam(this, "p", "the p norm value", ParamValidators.gtEq(1)) + + setDefault(p -> 2.0) /** @group getParam */ def getP: Double = getOrDefault(p) @@ -43,8 +46,6 @@ class Normalizer extends UnaryTransformer[Vector, Vector, Normalizer] { /** @group setParam */ def setP(value: Double): this.type = set(p, value) - setDefault(p -> 2.0) - override protected def createTransformFunc(paramMap: ParamMap): Vector => Vector = { val normalizer = new feature.Normalizer(paramMap(p)) normalizer.transform diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala index d855f04799ae7..1b7c939c2dffe 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala @@ -21,7 +21,7 @@ import scala.collection.mutable import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer -import org.apache.spark.ml.param.{IntParam, ParamMap} +import org.apache.spark.ml.param.{ParamValidators, IntParam, ParamMap} import org.apache.spark.mllib.linalg._ import org.apache.spark.sql.types.DataType @@ -37,10 +37,13 @@ import org.apache.spark.sql.types.DataType class PolynomialExpansion extends UnaryTransformer[Vector, Vector, PolynomialExpansion] { /** - * The polynomial degree to expand, which should be larger than 1. + * The polynomial degree to expand, which should be >= 1. A value of 1 means no expansion. 
+ * Default: 2 * @group param */ - val degree = new IntParam(this, "degree", "the polynomial degree to expand") + val degree = new IntParam(this, "degree", "the polynomial degree to expand (>= 1)", + ParamValidators.gt(1)) + setDefault(degree -> 2) /** @group getParam */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 447851ec034d6..a0e9ed32e0e4c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -31,17 +31,19 @@ import org.apache.spark.sql.types.{StructField, StructType} * Params for [[StandardScaler]] and [[StandardScalerModel]]. */ private[feature] trait StandardScalerParams extends Params with HasInputCol with HasOutputCol { - + /** - * False by default. Centers the data with mean before scaling. + * Centers the data with mean before scaling. * It will build a dense output, so this does not work on sparse input * and will raise an exception. + * Default: false * @group param */ val withMean: BooleanParam = new BooleanParam(this, "withMean", "Center data with mean") /** - * True by default. Scales the data to unit standard deviation. + * Scales the data to unit standard deviation. + * Default: true * @group param */ val withStd: BooleanParam = new BooleanParam(this, "withStd", "Scale to unit standard deviation") @@ -56,7 +58,7 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerParams { setDefault(withMean -> false, withStd -> true) - + /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 376a004858b4c..01752ba482d0c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer -import org.apache.spark.ml.param.{ParamMap, IntParam, BooleanParam, Param} +import org.apache.spark.ml.param._ import org.apache.spark.sql.types.{DataType, StringType, ArrayType} /** @@ -43,20 +43,20 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { /** * :: AlphaComponent :: * A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default) - * or using it to split the text (set matching to false). Optional parameters also allow to fold - * the text to lowercase prior to it being tokenized and to filer tokens using a minimal length. + * or using it to split the text (set matching to false). Optional parameters also allow filtering + * tokens using a minimal length. * It returns an array of strings that can be empty. - * The default parameters are regex = "\\p{L}+|[^\\p{L}\\s]+", matching = true, - * lowercase = false, minTokenLength = 1 */ @AlphaComponent class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenizer] { /** - * param for minimum token length, default is one to avoid returning empty strings + * Minimum token length, >= 0. 
+ * Default: 1, to avoid returning empty strings * @group param */ - val minTokenLength: IntParam = new IntParam(this, "minLength", "minimum token length") + val minTokenLength: IntParam = new IntParam(this, "minLength", "minimum token length (>= 0)", + ParamValidators.gtEq(0)) /** @group setParam */ def setMinTokenLength(value: Int): this.type = set(minTokenLength, value) @@ -65,7 +65,8 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize def getMinTokenLength: Int = getOrDefault(minTokenLength) /** - * param sets regex as splitting on gaps (true) or matching tokens (false) + * Indicates whether regex splits on gaps (true) or matching tokens (false). + * Default: false * @group param */ val gaps: BooleanParam = new BooleanParam(this, "gaps", "Set regex to match gaps or tokens") @@ -77,7 +78,8 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize def getGaps: Boolean = getOrDefault(gaps) /** - * param sets regex pattern used by tokenizer + * Regex pattern used by tokenizer. + * Default: `"\\p{L}+|[^\\p{L}\\s]+"` * @group param */ val pattern: Param[String] = new Param(this, "pattern", "regex pattern used for tokenizing") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index 1e5ffd15afa51..ed833c63c7ef1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -22,7 +22,7 @@ import org.apache.spark.ml.util.SchemaUtils import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute.{BinaryAttribute, NumericAttribute, NominalAttribute, Attribute, AttributeGroup} -import org.apache.spark.ml.param.{IntParam, ParamMap, Params} +import org.apache.spark.ml.param.{ParamValidators, IntParam, ParamMap, Params} import org.apache.spark.ml.param.shared._ import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector, VectorUDT} import org.apache.spark.sql.{Row, DataFrame} @@ -37,17 +37,19 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu /** * Threshold for the number of values a categorical feature can take. * If a feature is found to have > maxCategories values, then it is declared continuous. + * Must be >= 2. * * (default = 20) */ val maxCategories = new IntParam(this, "maxCategories", - "Threshold for the number of values a categorical feature can take." + - " If a feature is found to have > maxCategories values, then it is declared continuous.") + "Threshold for the number of values a categorical feature can take (>= 2)." 
+ + " If a feature is found to have > maxCategories values, then it is declared continuous.", + ParamValidators.gtEq(2)) + + setDefault(maxCategories -> 20) /** @group getParam */ def getMaxCategories: Int = getOrDefault(maxCategories) - - setDefault(maxCategories -> 20) } /** @@ -90,11 +92,7 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu class VectorIndexer extends Estimator[VectorIndexerModel] with VectorIndexerParams { /** @group setParam */ - def setMaxCategories(value: Int): this.type = { - require(value > 1, - s"DatasetIndexer given maxCategories = value, but requires maxCategories > 1.") - set(maxCategories, value) - } + def setMaxCategories(value: Int): this.type = set(maxCategories, value) /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/tree/treeParams.scala index ab6281b9b2e34..fb770622e71f0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/impl/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/impl/tree/treeParams.scala @@ -38,14 +38,15 @@ import org.apache.spark.mllib.tree.loss.{Loss => OldLoss} private[ml] trait DecisionTreeParams extends PredictorParams { /** - * Maximum depth of the tree. + * Maximum depth of the tree (>= 0). * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. * (default = 5) * @group param */ final val maxDepth: IntParam = - new IntParam(this, "maxDepth", "Maximum depth of the tree." + - " E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.") + new IntParam(this, "maxDepth", "Maximum depth of the tree. (>= 0)" + + " E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.", + ParamValidators.gtEq(0)) /** * Maximum number of bins used for discretizing continuous features and for choosing how to split @@ -56,7 +57,7 @@ private[ml] trait DecisionTreeParams extends PredictorParams { */ final val maxBins: IntParam = new IntParam(this, "maxBins", "Max number of bins for" + " discretizing continuous features. Must be >=2 and >= number of categories for any" + - " categorical feature.") + " categorical feature.", ParamValidators.gtEq(2)) /** * Minimum number of instances each child must have after split. @@ -69,7 +70,7 @@ private[ml] trait DecisionTreeParams extends PredictorParams { final val minInstancesPerNode: IntParam = new IntParam(this, "minInstancesPerNode", "Minimum" + " number of instances each child must have after split. If a split causes the left or right" + " child to have fewer than minInstancesPerNode, the split will be discarded as invalid." + - " Should be >= 1.") + " Should be >= 1.", ParamValidators.gtEq(1)) /** * Minimum information gain for a split to be considered at a tree node. @@ -85,7 +86,8 @@ private[ml] trait DecisionTreeParams extends PredictorParams { * @group expertParam */ final val maxMemoryInMB: IntParam = new IntParam(this, "maxMemoryInMB", - "Maximum memory in MB allocated to histogram aggregation.") + "Maximum memory in MB allocated to histogram aggregation.", + ParamValidators.gtEq(0)) /** * If false, the algorithm will pass trees to executors to match instances with nodes. @@ -111,34 +113,26 @@ private[ml] trait DecisionTreeParams extends PredictorParams { final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "Specifies" + " how often to checkpoint the cached node IDs. E.g. 
10 means that the cache will get" + " checkpointed every 10 iterations. This is only used if cacheNodeIds is true and if the" + - " checkpoint directory is set in the SparkContext. Must be >= 1.") + " checkpoint directory is set in the SparkContext. Must be >= 1.", + ParamValidators.gtEq(1)) setDefault(maxDepth -> 5, maxBins -> 32, minInstancesPerNode -> 1, minInfoGain -> 0.0, maxMemoryInMB -> 256, cacheNodeIds -> false, checkpointInterval -> 10) /** @group setParam */ - def setMaxDepth(value: Int): this.type = { - require(value >= 0, s"maxDepth parameter must be >= 0. Given bad value: $value") - set(maxDepth, value) - } + def setMaxDepth(value: Int): this.type = set(maxDepth, value) /** @group getParam */ final def getMaxDepth: Int = getOrDefault(maxDepth) /** @group setParam */ - def setMaxBins(value: Int): this.type = { - require(value >= 2, s"maxBins parameter must be >= 2. Given bad value: $value") - set(maxBins, value) - } + def setMaxBins(value: Int): this.type = set(maxBins, value) /** @group getParam */ final def getMaxBins: Int = getOrDefault(maxBins) /** @group setParam */ - def setMinInstancesPerNode(value: Int): this.type = { - require(value >= 1, s"minInstancesPerNode parameter must be >= 1. Given bad value: $value") - set(minInstancesPerNode, value) - } + def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value) /** @group getParam */ final def getMinInstancesPerNode: Int = getOrDefault(minInstancesPerNode) @@ -150,10 +144,7 @@ private[ml] trait DecisionTreeParams extends PredictorParams { final def getMinInfoGain: Double = getOrDefault(minInfoGain) /** @group expertSetParam */ - def setMaxMemoryInMB(value: Int): this.type = { - require(value > 0, s"maxMemoryInMB parameter must be > 0. Given bad value: $value") - set(maxMemoryInMB, value) - } + def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value) /** @group expertGetParam */ final def getMaxMemoryInMB: Int = getOrDefault(maxMemoryInMB) @@ -165,10 +156,7 @@ private[ml] trait DecisionTreeParams extends PredictorParams { final def getCacheNodeIds: Boolean = getOrDefault(cacheNodeIds) /** @group expertSetParam */ - def setCheckpointInterval(value: Int): this.type = { - require(value >= 1, s"checkpointInterval parameter must be >= 1. Given bad value: $value") - set(checkpointInterval, value) - } + def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) /** @group expertGetParam */ final def getCheckpointInterval: Int = getOrDefault(checkpointInterval) @@ -209,21 +197,16 @@ private[ml] trait TreeClassifierParams extends Params { */ final val impurity: Param[String] = new Param[String](this, "impurity", "Criterion used for" + " information gain calculation (case-insensitive). Supported options:" + - s" ${TreeClassifierParams.supportedImpurities.mkString(", ")}") + s" ${TreeClassifierParams.supportedImpurities.mkString(", ")}", + (value: String) => TreeClassifierParams.supportedImpurities.contains(value.toLowerCase)) setDefault(impurity -> "gini") /** @group setParam */ - def setImpurity(value: String): this.type = { - val impurityStr = value.toLowerCase - require(TreeClassifierParams.supportedImpurities.contains(impurityStr), - s"Tree-based classifier was given unrecognized impurity: $value." 
+ - s" Supported options: ${TreeClassifierParams.supportedImpurities.mkString(", ")}") - set(impurity, impurityStr) - } + def setImpurity(value: String): this.type = set(impurity, value) /** @group getParam */ - final def getImpurity: String = getOrDefault(impurity) + final def getImpurity: String = getOrDefault(impurity).toLowerCase /** Convert new impurity to old impurity. */ private[ml] def getOldImpurity: OldImpurity = { @@ -256,21 +239,16 @@ private[ml] trait TreeRegressorParams extends Params { */ final val impurity: Param[String] = new Param[String](this, "impurity", "Criterion used for" + " information gain calculation (case-insensitive). Supported options:" + - s" ${TreeRegressorParams.supportedImpurities.mkString(", ")}") + s" ${TreeRegressorParams.supportedImpurities.mkString(", ")}", + (value: String) => TreeRegressorParams.supportedImpurities.contains(value.toLowerCase)) setDefault(impurity -> "variance") /** @group setParam */ - def setImpurity(value: String): this.type = { - val impurityStr = value.toLowerCase - require(TreeRegressorParams.supportedImpurities.contains(impurityStr), - s"Tree-based regressor was given unrecognized impurity: $value." + - s" Supported options: ${TreeRegressorParams.supportedImpurities.mkString(", ")}") - set(impurity, impurityStr) - } + def setImpurity(value: String): this.type = set(impurity, value) /** @group getParam */ - final def getImpurity: String = getOrDefault(impurity) + final def getImpurity: String = getOrDefault(impurity).toLowerCase /** Convert new impurity to old impurity. */ private[ml] def getOldImpurity: OldImpurity = { @@ -299,21 +277,18 @@ private[ml] object TreeRegressorParams { private[ml] trait TreeEnsembleParams extends DecisionTreeParams with HasSeed { /** - * Fraction of the training data used for learning each decision tree. + * Fraction of the training data used for learning each decision tree, in range (0, 1]. * (default = 1.0) * @group param */ final val subsamplingRate: DoubleParam = new DoubleParam(this, "subsamplingRate", - "Fraction of the training data used for learning each decision tree.") + "Fraction of the training data used for learning each decision tree, in range (0, 1].", + ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)) setDefault(subsamplingRate -> 1.0) /** @group setParam */ - def setSubsamplingRate(value: Double): this.type = { - require(value > 0.0 && value <= 1.0, - s"Subsampling rate must be in range (0,1]. Bad rate: $value") - set(subsamplingRate, value) - } + def setSubsamplingRate(value: Double): this.type = set(subsamplingRate, value) /** @group getParam */ final def getSubsamplingRate: Double = getOrDefault(subsamplingRate) @@ -350,7 +325,8 @@ private[ml] trait RandomForestParams extends TreeEnsembleParams { * (default = 20) * @group param */ - final val numTrees: IntParam = new IntParam(this, "numTrees", "Number of trees to train (>= 1)") + final val numTrees: IntParam = new IntParam(this, "numTrees", "Number of trees to train (>= 1)", + ParamValidators.gtEq(1)) /** * The number of features to consider for splits at each tree node. @@ -378,30 +354,23 @@ private[ml] trait RandomForestParams extends TreeEnsembleParams { */ final val featureSubsetStrategy: Param[String] = new Param[String](this, "featureSubsetStrategy", "The number of features to consider for splits at each tree node." 
+ - s" Supported options: ${RandomForestParams.supportedFeatureSubsetStrategies.mkString(", ")}") + s" Supported options: ${RandomForestParams.supportedFeatureSubsetStrategies.mkString(", ")}", + (value: String) => + RandomForestParams.supportedFeatureSubsetStrategies.contains(value.toLowerCase)) setDefault(numTrees -> 20, featureSubsetStrategy -> "auto") /** @group setParam */ - def setNumTrees(value: Int): this.type = { - require(value >= 1, s"Random Forest numTrees parameter cannot be $value; it must be >= 1.") - set(numTrees, value) - } + def setNumTrees(value: Int): this.type = set(numTrees, value) /** @group getParam */ final def getNumTrees: Int = getOrDefault(numTrees) /** @group setParam */ - def setFeatureSubsetStrategy(value: String): this.type = { - val strategyStr = value.toLowerCase - require(RandomForestParams.supportedFeatureSubsetStrategies.contains(strategyStr), - s"RandomForestParams was given unrecognized featureSubsetStrategy: $value. Supported" + - s" options: ${RandomForestParams.supportedFeatureSubsetStrategies.mkString(", ")}") - set(featureSubsetStrategy, strategyStr) - } + def setFeatureSubsetStrategy(value: String): this.type = set(featureSubsetStrategy, value) /** @group getParam */ - final def getFeatureSubsetStrategy: String = getOrDefault(featureSubsetStrategy) + final def getFeatureSubsetStrategy: String = getOrDefault(featureSubsetStrategy).toLowerCase } private[ml] object RandomForestParams { @@ -426,7 +395,8 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter { * @group param */ final val stepSize: DoubleParam = new DoubleParam(this, "stepSize", "Step size (a.k.a." + - " learning rate) in interval (0, 1] for shrinking the contribution of each estimator") + " learning rate) in interval (0, 1] for shrinking the contribution of each estimator", + ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)) /* TODO: Add this doc when we add this param. SPARK-7132 * Threshold for stopping early when runWithValidation is used. @@ -442,17 +412,10 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter { setDefault(maxIter -> 20, stepSize -> 0.1) /** @group setParam */ - def setMaxIter(value: Int): this.type = { - require(value >= 1, s"Gradient Boosting maxIter parameter cannot be $value; it must be >= 1.") - set(maxIter, value) - } + def setMaxIter(value: Int): this.type = set(maxIter, value) /** @group setParam */ - def setStepSize(value: Double): this.type = { - require(value > 0.0 && value <= 1.0, - s"GBT given invalid step size ($value). Value should be in (0,1].") - set(stepSize, value) - } + def setStepSize(value: Double): this.type = set(stepSize, value) /** @group getParam */ final def getStepSize: Double = getOrDefault(stepSize) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 014e124e440a3..df6360dce6013 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -34,10 +34,35 @@ import org.apache.spark.ml.util.Identifiable * @param parent parent object * @param name param name * @param doc documentation + * @param isValid optional validation method which indicates if a value is valid. + * See [[ParamValidators]] for factory methods for common validation functions. 
* @tparam T param value type */ @AlphaComponent -class Param[T] (val parent: Params, val name: String, val doc: String) extends Serializable { +class Param[T] (val parent: Params, val name: String, val doc: String, val isValid: T => Boolean) + extends Serializable { + + def this(parent: Params, name: String, doc: String) = + this(parent, name, doc, ParamValidators.alwaysTrue[T]) + + /** + * Assert that the given value is valid for this parameter. + * + * Note: Parameter checks involving interactions between multiple parameters should be + * implemented in [[Params.validate()]]. Checks for input/output columns should be + * implemented in [[org.apache.spark.ml.PipelineStage.transformSchema()]]. + * + * DEVELOPERS: This method is only called by [[ParamPair]], which means that all parameters + * should be specified via [[ParamPair]]. + * + * @throws IllegalArgumentException if the value is invalid + */ + private[param] def validate(value: T): Unit = { + if (!isValid(value)) { + throw new IllegalArgumentException(s"$parent parameter $name given invalid value $value." + + s" Parameter description: $toString") + } + } /** * Creates a param pair with the given value (for Java). @@ -65,38 +90,129 @@ class Param[T] (val parent: Params, val name: String, val doc: String) extends S } } +/** + * Factory methods for common validation functions for [[Param.isValid]]. + * The numerical methods only support Int, Long, Float, and Double. + */ +object ParamValidators { + + /** (private[param]) Default validation always return true */ + private[param] def alwaysTrue[T]: T => Boolean = (_: T) => true + + /** + * Private method for checking numerical types and converting to Double. + * This is mainly for the sake of compilation; type checks are really handled + * by [[Params]] setters and the [[ParamPair]] constructor. + */ + private def getDouble[T](value: T): Double = value match { + case x: Int => x.toDouble + case x: Long => x.toDouble + case x: Float => x.toDouble + case x: Double => x.toDouble + case _ => + // The type should be checked before this is ever called. + throw new IllegalArgumentException("Numerical Param validation failed because" + + s" of unexpected input type: ${value.getClass}") + } + + /** Check if value > lowerBound */ + def gt[T](lowerBound: Double): T => Boolean = { (value: T) => + getDouble(value) > lowerBound + } + + /** Check if value >= lowerBound */ + def gtEq[T](lowerBound: Double): T => Boolean = { (value: T) => + getDouble(value) >= lowerBound + } + + /** Check if value < upperBound */ + def lt[T](upperBound: Double): T => Boolean = { (value: T) => + getDouble(value) < upperBound + } + + /** Check if value <= upperBound */ + def ltEq[T](upperBound: Double): T => Boolean = { (value: T) => + getDouble(value) <= upperBound + } + + /** + * Check for value in range lowerBound to upperBound. + * @param lowerInclusive If true, check for value >= lowerBound. + * If false, check for value > lowerBound. + * @param upperInclusive If true, check for value <= upperBound. + * If false, check for value < upperBound. 
+ */ + def inRange[T]( + lowerBound: Double, + upperBound: Double, + lowerInclusive: Boolean, + upperInclusive: Boolean): T => Boolean = { (value: T) => + val x: Double = getDouble(value) + val lowerValid = if (lowerInclusive) x >= lowerBound else x > lowerBound + val upperValid = if (upperInclusive) x <= upperBound else x < upperBound + lowerValid && upperValid + } + + /** Version of [[inRange()]] which uses inclusive be default: [lowerBound, upperBound] */ + def inRange[T](lowerBound: Double, upperBound: Double): T => Boolean = { + inRange[T](lowerBound, upperBound, lowerInclusive = true, upperInclusive = true) + } + + /** Check for value in an allowed set of values. */ + def inArray[T](allowed: Array[T]): T => Boolean = { (value: T) => + allowed.contains(value) + } + + /** Check for value in an allowed set of values. */ + def inArray[T](allowed: java.util.List[T]): T => Boolean = { (value: T) => + allowed.contains(value) + } +} + // specialize primitive-typed params because Java doesn't recognize scala.Double, scala.Int, ... /** Specialized version of [[Param[Double]]] for Java. */ -class DoubleParam(parent: Params, name: String, doc: String) - extends Param[Double](parent, name, doc) { +class DoubleParam(parent: Params, name: String, doc: String, isValid: Double => Boolean) + extends Param[Double](parent, name, doc, isValid) { + + def this(parent: Params, name: String, doc: String) = + this(parent, name, doc, ParamValidators.alwaysTrue) override def w(value: Double): ParamPair[Double] = super.w(value) } /** Specialized version of [[Param[Int]]] for Java. */ -class IntParam(parent: Params, name: String, doc: String) - extends Param[Int](parent, name, doc) { +class IntParam(parent: Params, name: String, doc: String, isValid: Int => Boolean) + extends Param[Int](parent, name, doc, isValid) { + + def this(parent: Params, name: String, doc: String) = + this(parent, name, doc, ParamValidators.alwaysTrue) override def w(value: Int): ParamPair[Int] = super.w(value) } /** Specialized version of [[Param[Float]]] for Java. */ -class FloatParam(parent: Params, name: String, doc: String) - extends Param[Float](parent, name, doc) { +class FloatParam(parent: Params, name: String, doc: String, isValid: Float => Boolean) + extends Param[Float](parent, name, doc, isValid) { + + def this(parent: Params, name: String, doc: String) = + this(parent, name, doc, ParamValidators.alwaysTrue) override def w(value: Float): ParamPair[Float] = super.w(value) } /** Specialized version of [[Param[Long]]] for Java. */ -class LongParam(parent: Params, name: String, doc: String) - extends Param[Long](parent, name, doc) { +class LongParam(parent: Params, name: String, doc: String, isValid: Long => Boolean) + extends Param[Long](parent, name, doc, isValid) { + + def this(parent: Params, name: String, doc: String) = + this(parent, name, doc, ParamValidators.alwaysTrue) override def w(value: Long): ParamPair[Long] = super.w(value) } /** Specialized version of [[Param[Boolean]]] for Java. */ -class BooleanParam(parent: Params, name: String, doc: String) +class BooleanParam(parent: Params, name: String, doc: String) // No need for isValid extends Param[Boolean](parent, name, doc) { override def w(value: Boolean): ParamPair[Boolean] = super.w(value) @@ -105,7 +221,11 @@ class BooleanParam(parent: Params, name: String, doc: String) /** * A param amd its value. */ -case class ParamPair[T](param: Param[T], value: T) +case class ParamPair[T](param: Param[T], value: T) { + // This is *the* place Param.validate is called. 
Whenever a parameter is specified, we should + // always construct a ParamPair so that validate is called. + param.validate(value) +} /** * :: AlphaComponent :: @@ -132,12 +252,22 @@ trait Params extends Identifiable with Serializable { /** * Validates parameter values stored internally plus the input parameter map. * Raises an exception if any parameter is invalid. + * + * This only needs to check for interactions between parameters. + * Parameter value checks which do not depend on other parameters are handled by + * [[Param.validate()]]. This method does not handle input/output column parameters; + * those are checked during schema validation. */ - def validate(paramMap: ParamMap): Unit = {} + def validate(paramMap: ParamMap): Unit = { } /** * Validates parameter values stored internally. * Raise an exception if any parameter value is invalid. + * + * This only needs to check for interactions between parameters. + * Parameter value checks which do not depend on other parameters are handled by + * [[Param.validate()]]. This method does not handle input/output column parameters; + * those are checked during schema validation. */ def validate(): Unit = validate(ParamMap.empty) @@ -221,6 +351,10 @@ trait Params extends Identifiable with Serializable { /** * Sets default values for a list of params. + * + * Note: Java developers should use the single-parameter [[setDefault()]]. + * Annotating this with varargs causes compilation failures. + * * @param paramPairs a list of param pairs that specify params and their default values to set * respectively. Make sure that the params are initialized before this method * gets called. @@ -305,6 +439,14 @@ private[spark] object Params { } } +/** + * Java-friendly wrapper for [[Params]]. + * Java developers who need to extend [[Params]] should use this class instead. + * If you need to extend a abstract class which already extends [[Params]], then that abstract + * class should be Java-friendly as well. + */ +abstract class JavaParams extends Params + /** * :: AlphaComponent :: * A param to value map. @@ -313,6 +455,12 @@ private[spark] object Params { final class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any]) extends Serializable { + /* DEVELOPERS: About validating parameter values + * This and ParamPair are the only two collections of parameters. + * This class should always create ParamPairs when + * specifying new parameter values. ParamPair will then call Param.validate(). + */ + /** * Creates an empty param map. */ @@ -321,10 +469,7 @@ final class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any]) /** * Puts a (param, value) pair (overwrites if the input param exists). */ - def put[T](param: Param[T], value: T): this.type = { - map(param.asInstanceOf[Param[Any]]) = value - this - } + def put[T](param: Param[T], value: T): this.type = put(ParamPair(param, value)) /** * Puts a list of param pairs (overwrites if the input params exists). 
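To make the split between per-parameter and cross-parameter checks concrete, here is a rough Scala sketch that is not part of this patch. The class and parameter names (MyParams, lower, upper) are hypothetical; the pattern mirrors the TestParams/JavaTestParams helpers in the test suites and the validate() overrides added to Pipeline and CrossValidator:

  import org.apache.spark.ml.param._

  class MyParams extends Params {
    // Per-parameter checks: a ParamValidators factory rejects bad values as soon as
    // a ParamPair is constructed (e.g., via ParamMap.put or a setter).
    val lower: IntParam = new IntParam(this, "lower", "lower bound (>= 1)", ParamValidators.gtEq(1))
    val upper: IntParam = new IntParam(this, "upper", "upper bound (>= 1)", ParamValidators.gtEq(1))
    setDefault(lower, 1)
    setDefault(upper, 10)

    // Cross-parameter check: interactions between parameters belong in validate().
    override def validate(paramMap: ParamMap): Unit = {
      val map = extractParamMap(paramMap)
      require(map(lower) <= map(upper),
        s"lower (${map(lower)}) must not exceed upper (${map(upper)})")
    }
  }

  val p = new MyParams()
  new ParamMap().put(p.lower, 5)         // OK: 5 satisfies ParamValidators.gtEq(1)
  // new ParamMap().put(p.lower, 0)      // would throw IllegalArgumentException from Param.validate
  // p.validate(ParamMap(p.lower -> 20)) // would fail the cross-parameter require (20 > upper's default of 10)
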
@@ -332,7 +477,7 @@ final class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any]) @varargs def put(paramPairs: ParamPair[_]*): this.type = { paramPairs.foreach { p => - put(p.param.asInstanceOf[Param[Any]], p.value) + map(p.param.asInstanceOf[Param[Any]]) = p.value } this } diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index 654cd72d53074..7da4bb4b4bf25 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -21,6 +21,8 @@ import java.io.PrintWriter import scala.reflect.ClassTag +import org.apache.spark.ml.param.ParamValidators + /** * Code generator for shared params (sharedParams.scala). Run under the Spark folder with * {{{ @@ -31,8 +33,10 @@ private[shared] object SharedParamsCodeGen { def main(args: Array[String]): Unit = { val params = Seq( - ParamDesc[Double]("regParam", "regularization parameter"), - ParamDesc[Int]("maxIter", "max number of iterations"), + ParamDesc[Double]("regParam", "regularization parameter (>= 0)", + isValid = "ParamValidators.gtEq(0)"), + ParamDesc[Int]("maxIter", "max number of iterations (>= 0)", + isValid = "ParamValidators.gtEq(0)"), ParamDesc[String]("featuresCol", "features column name", Some("\"features\"")), ParamDesc[String]("labelCol", "label column name", Some("\"label\"")), ParamDesc[String]("predictionCol", "prediction column name", Some("\"prediction\"")), @@ -40,14 +44,19 @@ private[shared] object SharedParamsCodeGen { Some("\"rawPrediction\"")), ParamDesc[String]("probabilityCol", "column name for predicted class conditional probabilities", Some("\"probability\"")), - ParamDesc[Double]("threshold", "threshold in binary classification prediction"), + ParamDesc[Double]("threshold", + "threshold in binary classification prediction, in range [0, 1]", + isValid = "ParamValidators.inRange(0, 1)"), ParamDesc[String]("inputCol", "input column name"), ParamDesc[Array[String]]("inputCols", "input column names"), ParamDesc[String]("outputCol", "output column name"), - ParamDesc[Int]("checkpointInterval", "checkpoint interval"), + ParamDesc[Int]("checkpointInterval", "checkpoint interval (>= 1)", + isValid = "ParamValidators.gtEq(1)"), ParamDesc[Boolean]("fitIntercept", "whether to fit an intercept term", Some("true")), ParamDesc[Long]("seed", "random seed", Some("Utils.random.nextLong()")), - ParamDesc[Double]("elasticNetParam", "the ElasticNet mixing parameter"), + ParamDesc[Double]("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]." + + " For alpha = 0, the penalty is an L2 penalty. 
For alpha = 1, it is an L1 penalty.", + isValid = "ParamValidators.inRange(0, 1)"), ParamDesc[Double]("tol", "the convergence tolerance for iterative algorithms"), ParamDesc[Double]("stepSize", "Step size to be used for each iteration of optimization.")) @@ -62,7 +71,8 @@ private[shared] object SharedParamsCodeGen { private case class ParamDesc[T: ClassTag]( name: String, doc: String, - defaultValueStr: Option[String] = None) { + defaultValueStr: Option[String] = None, + isValid: String = "") { require(name.matches("[a-z][a-zA-Z0-9]*"), s"Param name $name is invalid.") require(doc.nonEmpty) // TODO: more rigorous on doc @@ -113,20 +123,23 @@ private[shared] object SharedParamsCodeGen { | setDefault($name, $v) |""".stripMargin }.getOrElse("") + val isValid = if (param.isValid != "") { + ", " + param.isValid + } else { + "" + } s""" |/** - | * :: DeveloperApi :: - | * Trait for shared param $name$defaultValueDoc. + | * (private[ml]) Trait for shared param $name$defaultValueDoc. | */ - |@DeveloperApi - |trait Has$Name extends Params { + |private[ml] trait Has$Name extends Params { | | /** | * Param for $doc. | * @group param | */ - | final val $name: $Param = new $Param(this, "$name", "$doc") + | final val $name: $Param = new $Param(this, "$name", "$doc"$isValid) |$setDefault | /** @group getParam */ | final def get$Name: $T = getOrDefault($name) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 96d11ed76fa8e..e1549f46a68d4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -26,45 +26,39 @@ import org.apache.spark.util.Utils // scalastyle:off /** - * :: DeveloperApi :: - * Trait for shared param regParam. + * (private[ml]) Trait for shared param regParam. */ -@DeveloperApi -trait HasRegParam extends Params { +private[ml] trait HasRegParam extends Params { /** - * Param for regularization parameter. + * Param for regularization parameter (>= 0). * @group param */ - final val regParam: DoubleParam = new DoubleParam(this, "regParam", "regularization parameter") + final val regParam: DoubleParam = new DoubleParam(this, "regParam", "regularization parameter (>= 0)", ParamValidators.gtEq(0)) /** @group getParam */ final def getRegParam: Double = getOrDefault(regParam) } /** - * :: DeveloperApi :: - * Trait for shared param maxIter. + * (private[ml]) Trait for shared param maxIter. */ -@DeveloperApi -trait HasMaxIter extends Params { +private[ml] trait HasMaxIter extends Params { /** - * Param for max number of iterations. + * Param for max number of iterations (>= 0). * @group param */ - final val maxIter: IntParam = new IntParam(this, "maxIter", "max number of iterations") + final val maxIter: IntParam = new IntParam(this, "maxIter", "max number of iterations (>= 0)", ParamValidators.gtEq(0)) /** @group getParam */ final def getMaxIter: Int = getOrDefault(maxIter) } /** - * :: DeveloperApi :: - * Trait for shared param featuresCol (default: "features"). + * (private[ml]) Trait for shared param featuresCol (default: "features"). */ -@DeveloperApi -trait HasFeaturesCol extends Params { +private[ml] trait HasFeaturesCol extends Params { /** * Param for features column name. @@ -79,11 +73,9 @@ trait HasFeaturesCol extends Params { } /** - * :: DeveloperApi :: - * Trait for shared param labelCol (default: "label"). 
+ * (private[ml]) Trait for shared param labelCol (default: "label"). */ -@DeveloperApi -trait HasLabelCol extends Params { +private[ml] trait HasLabelCol extends Params { /** * Param for label column name. @@ -98,11 +90,9 @@ trait HasLabelCol extends Params { } /** - * :: DeveloperApi :: - * Trait for shared param predictionCol (default: "prediction"). + * (private[ml]) Trait for shared param predictionCol (default: "prediction"). */ -@DeveloperApi -trait HasPredictionCol extends Params { +private[ml] trait HasPredictionCol extends Params { /** * Param for prediction column name. @@ -117,11 +107,9 @@ trait HasPredictionCol extends Params { } /** - * :: DeveloperApi :: - * Trait for shared param rawPredictionCol (default: "rawPrediction"). + * (private[ml]) Trait for shared param rawPredictionCol (default: "rawPrediction"). */ -@DeveloperApi -trait HasRawPredictionCol extends Params { +private[ml] trait HasRawPredictionCol extends Params { /** * Param for raw prediction (a.k.a. confidence) column name. @@ -136,11 +124,9 @@ trait HasRawPredictionCol extends Params { } /** - * :: DeveloperApi :: - * Trait for shared param probabilityCol (default: "probability"). + * (private[ml]) Trait for shared param probabilityCol (default: "probability"). */ -@DeveloperApi -trait HasProbabilityCol extends Params { +private[ml] trait HasProbabilityCol extends Params { /** * Param for column name for predicted class conditional probabilities. @@ -155,28 +141,24 @@ trait HasProbabilityCol extends Params { } /** - * :: DeveloperApi :: - * Trait for shared param threshold. + * (private[ml]) Trait for shared param threshold. */ -@DeveloperApi -trait HasThreshold extends Params { +private[ml] trait HasThreshold extends Params { /** - * Param for threshold in binary classification prediction. + * Param for threshold in binary classification prediction, in range [0, 1]. * @group param */ - final val threshold: DoubleParam = new DoubleParam(this, "threshold", "threshold in binary classification prediction") + final val threshold: DoubleParam = new DoubleParam(this, "threshold", "threshold in binary classification prediction, in range [0, 1]", ParamValidators.inRange(0, 1)) /** @group getParam */ final def getThreshold: Double = getOrDefault(threshold) } /** - * :: DeveloperApi :: - * Trait for shared param inputCol. + * (private[ml]) Trait for shared param inputCol. */ -@DeveloperApi -trait HasInputCol extends Params { +private[ml] trait HasInputCol extends Params { /** * Param for input column name. @@ -189,11 +171,9 @@ trait HasInputCol extends Params { } /** - * :: DeveloperApi :: - * Trait for shared param inputCols. + * (private[ml]) Trait for shared param inputCols. */ -@DeveloperApi -trait HasInputCols extends Params { +private[ml] trait HasInputCols extends Params { /** * Param for input column names. @@ -206,11 +186,9 @@ trait HasInputCols extends Params { } /** - * :: DeveloperApi :: - * Trait for shared param outputCol. + * (private[ml]) Trait for shared param outputCol. */ -@DeveloperApi -trait HasOutputCol extends Params { +private[ml] trait HasOutputCol extends Params { /** * Param for output column name. @@ -223,28 +201,24 @@ trait HasOutputCol extends Params { } /** - * :: DeveloperApi :: - * Trait for shared param checkpointInterval. + * (private[ml]) Trait for shared param checkpointInterval. */ -@DeveloperApi -trait HasCheckpointInterval extends Params { +private[ml] trait HasCheckpointInterval extends Params { /** - * Param for checkpoint interval. + * Param for checkpoint interval (>= 1). 
* @group param */ - final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "checkpoint interval") + final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "checkpoint interval (>= 1)", ParamValidators.gtEq(1)) /** @group getParam */ final def getCheckpointInterval: Int = getOrDefault(checkpointInterval) } /** - * :: DeveloperApi :: - * Trait for shared param fitIntercept (default: true). + * (private[ml]) Trait for shared param fitIntercept (default: true). */ -@DeveloperApi -trait HasFitIntercept extends Params { +private[ml] trait HasFitIntercept extends Params { /** * Param for whether to fit an intercept term. @@ -259,11 +233,9 @@ trait HasFitIntercept extends Params { } /** - * :: DeveloperApi :: - * Trait for shared param seed (default: Utils.random.nextLong()). + * (private[ml]) Trait for shared param seed (default: Utils.random.nextLong()). */ -@DeveloperApi -trait HasSeed extends Params { +private[ml] trait HasSeed extends Params { /** * Param for random seed. @@ -278,28 +250,24 @@ trait HasSeed extends Params { } /** - * :: DeveloperApi :: - * Trait for shared param elasticNetParam. + * (private[ml]) Trait for shared param elasticNetParam. */ -@DeveloperApi -trait HasElasticNetParam extends Params { +private[ml] trait HasElasticNetParam extends Params { /** - * Param for the ElasticNet mixing parameter. + * Param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.. * @group param */ - final val elasticNetParam: DoubleParam = new DoubleParam(this, "elasticNetParam", "the ElasticNet mixing parameter") + final val elasticNetParam: DoubleParam = new DoubleParam(this, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", ParamValidators.inRange(0, 1)) /** @group getParam */ final def getElasticNetParam: Double = getOrDefault(elasticNetParam) } /** - * :: DeveloperApi :: - * Trait for shared param tol. + * (private[ml]) Trait for shared param tol. */ -@DeveloperApi -trait HasTol extends Params { +private[ml] trait HasTol extends Params { /** * Param for the convergence tolerance for iterative algorithms. @@ -312,11 +280,9 @@ trait HasTol extends Params { } /** - * :: DeveloperApi :: - * Trait for shared param stepSize. + * (private[ml]) Trait for shared param stepSize. */ -@DeveloperApi -trait HasStepSize extends Params { +private[ml] trait HasStepSize extends Params { /** * Param for Step size to be used for each iteration of optimization.. diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index bd793beba35b6..f9f2b2764ddb1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -52,35 +52,40 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR with HasPredictionCol with HasCheckpointInterval { /** - * Param for rank of the matrix factorization. + * Param for rank of the matrix factorization (>= 1). + * Default: 10 * @group param */ - val rank = new IntParam(this, "rank", "rank of the factorization") + val rank = new IntParam(this, "rank", "rank of the factorization", ParamValidators.gtEq(1)) /** @group getParam */ def getRank: Int = getOrDefault(rank) /** - * Param for number of user blocks. 
+ * Param for number of user blocks (>= 1). + * Default: 10 * @group param */ - val numUserBlocks = new IntParam(this, "numUserBlocks", "number of user blocks") + val numUserBlocks = new IntParam(this, "numUserBlocks", "number of user blocks", + ParamValidators.gtEq(1)) /** @group getParam */ def getNumUserBlocks: Int = getOrDefault(numUserBlocks) /** - * Param for number of item blocks. + * Param for number of item blocks (>= 1). + * Default: 10 * @group param */ - val numItemBlocks = - new IntParam(this, "numItemBlocks", "number of item blocks") + val numItemBlocks = new IntParam(this, "numItemBlocks", "number of item blocks", + ParamValidators.gtEq(1)) /** @group getParam */ def getNumItemBlocks: Int = getOrDefault(numItemBlocks) /** * Param to decide whether to use implicit preference. + * Default: false * @group param */ val implicitPrefs = new BooleanParam(this, "implicitPrefs", "whether to use implicit preference") @@ -89,16 +94,19 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR def getImplicitPrefs: Boolean = getOrDefault(implicitPrefs) /** - * Param for the alpha parameter in the implicit preference formulation. + * Param for the alpha parameter in the implicit preference formulation (>= 0). + * Default: 1.0 * @group param */ - val alpha = new DoubleParam(this, "alpha", "alpha for implicit preference") + val alpha = new DoubleParam(this, "alpha", "alpha for implicit preference", + ParamValidators.gtEq(0)) /** @group getParam */ def getAlpha: Double = getOrDefault(alpha) /** * Param for the column name for user ids. + * Default: "user" * @group param */ val userCol = new Param[String](this, "userCol", "column name for user ids") @@ -108,6 +116,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR /** * Param for the column name for item ids. + * Default: "item" * @group param */ val itemCol = new Param[String](this, "itemCol", "column name for item ids") @@ -117,6 +126,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR /** * Param for the column name for ratings. + * Default: "rating" * @group param */ val ratingCol = new Param[String](this, "ratingCol", "column name for ratings") @@ -126,6 +136,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR /** * Param for whether to apply nonnegativity constraints. + * Default: false * @group param */ val nonnegative = new BooleanParam( @@ -136,7 +147,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR setDefault(rank -> 10, maxIter -> 10, regParam -> 0.1, numUserBlocks -> 10, numItemBlocks -> 10, implicitPrefs -> false, alpha -> 1.0, userCol -> "user", itemCol -> "item", - ratingCol -> "rating", nonnegative -> false) + ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10) /** * Validates and transforms the input schema. 
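A small illustration, not part of this patch, of what the validators attached to the ALS params above buy: because the ParamPair constructor now calls Param.validate, binding an out-of-range value fails immediately instead of surfacing later during fitting.

  import org.apache.spark.ml.param.ParamPair
  import org.apache.spark.ml.recommendation.ALS

  val als = new ALS()
  ParamPair(als.rank, 10)       // OK: rank is validated with ParamValidators.gtEq(1)
  // ParamPair(als.rank, 0)     // would throw IllegalArgumentException (rank must be >= 1)
  // ParamPair(als.alpha, -1.0) // would also fail: alpha is validated with ParamValidators.gtEq(0)
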
@@ -281,10 +292,6 @@ class ALS extends Estimator[ALSModel] with ALSParams { this } - setMaxIter(20) - setRegParam(1.0) - setCheckpointInterval(10) - override def fit(dataset: DataFrame, paramMap: ParamMap): ALSModel = { val map = extractParamMap(paramMap) val ratings = dataset diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index c784cf39ed31a..76c98376930c5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -102,21 +102,16 @@ final class GBTRegressor */ val lossType: Param[String] = new Param[String](this, "lossType", "Loss function which GBT" + " tries to minimize (case-insensitive). Supported options:" + - s" ${GBTRegressor.supportedLossTypes.mkString(", ")}") + s" ${GBTRegressor.supportedLossTypes.mkString(", ")}", + (value: String) => GBTRegressor.supportedLossTypes.contains(value.toLowerCase)) setDefault(lossType -> "squared") /** @group setParam */ - def setLossType(value: String): this.type = { - val lossStr = value.toLowerCase - require(GBTRegressor.supportedLossTypes.contains(lossStr), "GBTRegressor was given bad loss" + - s" type: $value. Supported options: ${GBTRegressor.supportedLossTypes.mkString(", ")}") - set(lossType, lossStr) - this - } + def setLossType(value: String): this.type = set(lossType, value) /** @group getParam */ - def getLossType: String = getOrDefault(lossType) + def getLossType: String = getOrDefault(lossType).toLowerCase /** (private[ml]) Convert new loss to old loss. */ override private[ml] def getOldLossType: OldLoss = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index cc9ad22cb860f..11c6cea0f8c8c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -25,7 +25,8 @@ import breeze.optimize.{CachedDiffFunction, DiffFunction} import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.param.{Params, ParamMap} -import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasMaxIter, HasRegParam, HasTol} +import org.apache.spark.ml.param.shared.{HasTol, HasElasticNetParam, HasMaxIter, + HasRegParam} import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS._ @@ -46,6 +47,16 @@ private[regression] trait LinearRegressionParams extends RegressorParams * :: AlphaComponent :: * * Linear regression. + * + * The learning objective is to minimize the squared error, with regularization. + * The specific squared error loss function used is: + * L = 1/2n ||A weights - y||^2^ + * + * This support multiple types of regularization: + * - none (a.k.a. 
ordinary least squares) + * - L2 (ridge regression) + * - L1 (Lasso) + * - L2 + L1 (elastic net) */ @AlphaComponent class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegressionModel] @@ -135,7 +146,8 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress val optimizer = if (paramMap(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { new BreezeLBFGS[BDV[Double]](paramMap(maxIter), 10, paramMap(tol)) } else { - new BreezeOWLQN[Int, BDV[Double]](paramMap(maxIter), 10, effectiveL1RegParam, paramMap(tol)) + new BreezeOWLQN[Int, BDV[Double]](paramMap(maxIter), 10, effectiveL1RegParam, + paramMap(tol)) } val initialWeights = Vectors.zeros(numFeatures) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 4bb4ed813c006..d1ad0893cd044 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -22,7 +22,7 @@ import com.github.fommil.netlib.F2jBLAS import org.apache.spark.Logging import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml._ -import org.apache.spark.ml.param.{IntParam, Param, ParamMap, Params} +import org.apache.spark.ml.param._ import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType @@ -61,10 +61,12 @@ private[ml] trait CrossValidatorParams extends Params { def getEvaluator: Evaluator = getOrDefault(evaluator) /** - * param for number of folds for cross validation + * Param for number of folds for cross validation. Must be >= 2. + * Default: 3 * @group param */ - val numFolds: IntParam = new IntParam(this, "numFolds", "number of folds for cross validation") + val numFolds: IntParam = new IntParam(this, "numFolds", + "number of folds for cross validation (>= 2)", ParamValidators.gtEq(2)) /** @group getParam */ def getNumFolds: Int = getOrDefault(numFolds) @@ -93,6 +95,12 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP /** @group setParam */ def setNumFolds(value: Int): this.type = set(numFolds, value) + override def validate(paramMap: ParamMap): Unit = { + getEstimatorParamMaps.foreach { eMap => + getEstimator.validate(eMap ++ paramMap) + } + } + override def fit(dataset: DataFrame, paramMap: ParamMap): CrossValidatorModel = { val map = extractParamMap(paramMap) val schema = dataset.schema @@ -101,8 +109,8 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP val est = map(estimator) val eval = map(evaluator) val epm = map(estimatorParamMaps) - val numModels = epm.size - val metrics = new Array[Double](epm.size) + val numModels = epm.length + val metrics = new Array[Double](epm.length) val splits = MLUtils.kFold(dataset.rdd, map(numFolds), 0) splits.zipWithIndex.foreach { case ((training, validation), splitIndex) => val trainingDataset = sqlCtx.createDataFrame(training, schema).cache() @@ -148,6 +156,10 @@ class CrossValidatorModel private[ml] ( val bestModel: Model[_]) extends Model[CrossValidatorModel] with CrossValidatorParams { + override def validate(paramMap: ParamMap): Unit = { + bestModel.validate(paramMap) + } + override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { bestModel.transform(dataset, paramMap) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index deac390130128..1f779584dcffd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -180,7 +180,9 @@ object GradientBoostedTrees extends Logging { val persistedInput = if (input.getStorageLevel == StorageLevel.NONE) { input.persist(StorageLevel.MEMORY_AND_DISK) true - } else false + } else { + false + } timer.stop("init") diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java new file mode 100644 index 0000000000000..e7df10dfa63ac --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaParamsSuite.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.param; + +import com.google.common.collect.Lists; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaSparkContext; + +/** + * Test Param and related classes in Java + */ +public class JavaParamsSuite { + + private transient JavaSparkContext jsc; + + @Before + public void setUp() { + jsc = new JavaSparkContext("local", "JavaParamsSuite"); + } + + @After + public void tearDown() { + jsc.stop(); + jsc = null; + } + + @Test + public void testParams() { + JavaTestParams testParams = new JavaTestParams(); + Assert.assertEquals(testParams.getMyIntParam(), 1); + testParams.setMyIntParam(2).setMyDoubleParam(0.4).setMyStringParam("a"); + Assert.assertEquals(testParams.getMyDoubleParam(), 0.4, 0.0); + Assert.assertEquals(testParams.getMyStringParam(), "a"); + } + + @Test + public void testParamValidate() { + ParamValidators.gt(1.0); + ParamValidators.gtEq(1.0); + ParamValidators.lt(1.0); + ParamValidators.ltEq(1.0); + ParamValidators.inRange(0, 1, true, false); + ParamValidators.inRange(0, 1); + ParamValidators.inArray(Lists.newArrayList(0, 1, 3)); + ParamValidators.inArray(Lists.newArrayList("a", "b")); + } +} diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java new file mode 100644 index 0000000000000..8abe575610d19 --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.param; + +import java.util.List; + +import com.google.common.collect.Lists; + +/** + * A subclass of Params for testing. + */ +public class JavaTestParams extends JavaParams { + + public IntParam myIntParam; + + public int getMyIntParam() { return (Integer)getOrDefault(myIntParam); } + + public JavaTestParams setMyIntParam(int value) { + set(myIntParam, value); return this; + } + + public DoubleParam myDoubleParam; + + public double getMyDoubleParam() { return (Double)getOrDefault(myDoubleParam); } + + public JavaTestParams setMyDoubleParam(double value) { + set(myDoubleParam, value); return this; + } + + public Param myStringParam; + + public String getMyStringParam() { return (String)getOrDefault(myStringParam); } + + public JavaTestParams setMyStringParam(String value) { + set(myStringParam, value); return this; + } + + public JavaTestParams() { + myIntParam = new IntParam(this, "myIntParam", "this is an int param", ParamValidators.gt(0)); + myDoubleParam = new DoubleParam(this, "myDoubleParam", "this is a double param", + ParamValidators.inRange(0.0, 1.0)); + List validStrings = Lists.newArrayList("a", "b"); + myStringParam = new Param(this, "myStringParam", "this is a string param", + ParamValidators.inArray(validStrings)); + setDefault(myIntParam, 1); + setDefault(myDoubleParam, 0.5); + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala index 88ea679eeaad5..f8852606abbf2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala @@ -26,14 +26,22 @@ class ParamsSuite extends FunSuite { import solver.{maxIter, inputCol} assert(maxIter.name === "maxIter") - assert(maxIter.doc === "max number of iterations") + assert(maxIter.doc === "max number of iterations (>= 0)") assert(maxIter.parent.eq(solver)) - assert(maxIter.toString === "maxIter: max number of iterations (default: 10)") + assert(maxIter.toString === "maxIter: max number of iterations (>= 0) (default: 10)") + assert(!maxIter.isValid(-1)) + assert(maxIter.isValid(0)) + assert(maxIter.isValid(1)) solver.setMaxIter(5) - assert(maxIter.toString === "maxIter: max number of iterations (default: 10, current: 5)") + assert(maxIter.toString === + "maxIter: max number of iterations (>= 0) (default: 10, current: 5)") assert(inputCol.toString === "inputCol: input column name (undefined)") + + intercept[IllegalArgumentException] { + solver.setMaxIter(-1) + } } test("param pair") { @@ -47,6 +55,9 @@ class ParamsSuite extends FunSuite { assert(pair.param.eq(maxIter)) assert(pair.value === 5) } + intercept[IllegalArgumentException] { + val pair = maxIter -> -1 + } } test("param map") { @@ -59,6 +70,9 @@ class ParamsSuite extends FunSuite { map0.put(maxIter, 10) assert(map0.contains(maxIter)) assert(map0(maxIter) === 10) + 
intercept[IllegalArgumentException] { + map0.put(maxIter, -1) + } assert(!map0.contains(inputCol)) intercept[NoSuchElementException] { @@ -122,14 +136,57 @@ class ParamsSuite extends FunSuite { assert(solver.getInputCol === "input") solver.validate() intercept[IllegalArgumentException] { - solver.validate(ParamMap(maxIter -> -10)) + ParamMap(maxIter -> -10) } - solver.setMaxIter(-10) intercept[IllegalArgumentException] { - solver.validate() + solver.setMaxIter(-10) } solver.clearMaxIter() assert(!solver.isSet(maxIter)) } + + test("ParamValidate") { + val alwaysTrue = ParamValidators.alwaysTrue[Int] + assert(alwaysTrue(1)) + + val gt1Int = ParamValidators.gt[Int](1) + assert(!gt1Int(1) && gt1Int(2)) + val gt1Double = ParamValidators.gt[Double](1) + assert(!gt1Double(1.0) && gt1Double(1.1)) + + val gtEq1Int = ParamValidators.gtEq[Int](1) + assert(!gtEq1Int(0) && gtEq1Int(1)) + val gtEq1Double = ParamValidators.gtEq[Double](1) + assert(!gtEq1Double(0.9) && gtEq1Double(1.0)) + + val lt1Int = ParamValidators.lt[Int](1) + assert(lt1Int(0) && !lt1Int(1)) + val lt1Double = ParamValidators.lt[Double](1) + assert(lt1Double(0.9) && !lt1Double(1.0)) + + val ltEq1Int = ParamValidators.ltEq[Int](1) + assert(ltEq1Int(1) && !ltEq1Int(2)) + val ltEq1Double = ParamValidators.ltEq[Double](1) + assert(ltEq1Double(1.0) && !ltEq1Double(1.1)) + + val inRange02IntInclusive = ParamValidators.inRange[Int](0, 2) + assert(inRange02IntInclusive(0) && inRange02IntInclusive(1) && inRange02IntInclusive(2) && + !inRange02IntInclusive(-1) && !inRange02IntInclusive(3)) + val inRange02IntExclusive = + ParamValidators.inRange[Int](0, 2, lowerInclusive = false, upperInclusive = false) + assert(!inRange02IntExclusive(0) && inRange02IntExclusive(1) && !inRange02IntExclusive(2)) + + val inRange02DoubleInclusive = ParamValidators.inRange[Double](0, 2) + assert(inRange02DoubleInclusive(0) && inRange02DoubleInclusive(1) && + inRange02DoubleInclusive(2) && + !inRange02DoubleInclusive(-0.1) && !inRange02DoubleInclusive(2.1)) + val inRange02DoubleExclusive = + ParamValidators.inRange[Double](0, 2, lowerInclusive = false, upperInclusive = false) + assert(!inRange02DoubleExclusive(0) && inRange02DoubleExclusive(1) && + !inRange02DoubleExclusive(2)) + + val inArray = ParamValidators.inArray[Int](Array(1, 2)) + assert(inArray(1) && inArray(2) && !inArray(0)) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala b/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala index 641b64b42a5e7..6f9c9cb5166ae 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala @@ -29,7 +29,7 @@ class TestParams extends Params with HasMaxIter with HasInputCol { override def validate(paramMap: ParamMap): Unit = { val m = extractParamMap(paramMap) - require(m(maxIter) >= 0) + // Note: maxIter is validated when it is set. require(m.contains(inputCol)) } From 1b7106b867bc0aa4d64b669d79b646f862acaf47 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Wed, 29 Apr 2015 18:22:14 -0700 Subject: [PATCH 2/4] [SPARK-6862] [STREAMING] [WEBUI] Add BatchPage to display details of a batch This is an initial commit for SPARK-6862. Once SPARK-6796 is merged, I will add the links to StreamingPage so that the user can jump to BatchPage. 
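Before the screenshots and the diff, a rough sketch of the correlation mechanism the patch relies on (simplified; the real keys and wiring are in the JobScheduler and StreamingJobProgressListener changes below): every Spark job launched for a batch is tagged with its batch time and output op id through SparkContext local properties, and the listener reads those properties back in onJobStart to map each batch to its Spark jobs.

    import org.apache.spark.{SparkConf, SparkContext}

    // Simplified illustration only; the property keys are copied from the JobScheduler
    // changes below, and in real use the streaming scheduler sets them, not user code.
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("batch-page-sketch"))
    sc.setLocalProperty("spark.streaming.internal.batchTime", "1000")
    sc.setLocalProperty("spark.streaming.internal.outputOpId", "0")
    // This job now carries both properties, which SparkListenerJobStart exposes
    // to StreamingJobProgressListener.onJobStart.
    sc.parallelize(1 to 4).count()
    sc.stop()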
Screenshots: ![success](https://cloud.githubusercontent.com/assets/1000778/7102439/bbe75406-e0b3-11e4-84fe-3e6de629a49a.png) ![failure](https://cloud.githubusercontent.com/assets/1000778/7102440/bc124454-e0b3-11e4-921a-c8b39d6b61bc.png) Author: zsxwing Closes #5473 from zsxwing/SPARK-6862 and squashes the following commits: 0727d35 [zsxwing] Change BatchUIData to a case class b380cfb [zsxwing] Add createJobStart to eliminate duplicate codes 9a3083d [zsxwing] Rename XxxDatas -> XxxData 087ba98 [zsxwing] Refactor BatchInfo to store only necessary fields cb62e4f [zsxwing] Use Seq[(OutputOpId, SparkJobId)] to store the id relations 72f8e7e [zsxwing] Add unit tests for BatchPage 1282b10 [zsxwing] Handle some corner cases and add tests for StreamingJobProgressListener 77a69ae [zsxwing] Refactor codes as per TD's comments 35ffd80 [zsxwing] Merge branch 'master' into SPARK-6862 15bdf9b [zsxwing] Add batch links and unit tests 4bf66b6 [zsxwing] Merge branch 'master' into SPARK-6862 7168807 [zsxwing] Limit the max width of the error message and fix nits in the UI 0b226f9 [zsxwing] Change 'Last Error' to 'Error' fc98a43 [zsxwing] Put clearing local properties to finally and remove redundant private[streaming] 0c7b2eb [zsxwing] Add BatchPage to display details of a batch --- .../org/apache/spark/ui/jobs/UIData.scala | 2 +- .../spark/streaming/dstream/DStream.scala | 2 +- .../spark/streaming/scheduler/Job.scala | 44 ++- .../streaming/scheduler/JobScheduler.scala | 28 +- .../spark/streaming/scheduler/JobSet.scala | 2 +- .../spark/streaming/ui/AllBatchesTable.scala | 26 +- .../apache/spark/streaming/ui/BatchPage.scala | 264 ++++++++++++++++++ .../spark/streaming/ui/BatchUIData.scala | 75 +++++ .../ui/StreamingJobProgressListener.scala | 161 ++++++++--- .../spark/streaming/ui/StreamingTab.scala | 4 +- .../spark/streaming/UISeleniumSuite.scala | 83 +++++- .../StreamingJobProgressListenerSuite.scala | 100 ++++++- 12 files changed, 710 insertions(+), 81 deletions(-) create mode 100644 streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala create mode 100644 streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala b/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala index 711a3697bda15..935c8a4f80e7b 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala @@ -24,7 +24,7 @@ import org.apache.spark.util.collection.OpenHashSet import scala.collection.mutable.HashMap -private[jobs] object UIData { +private[spark] object UIData { class ExecutorSummary { var taskTime : Long = 0 diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index 24f99a2b929f5..83d41f5762444 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -626,7 +626,7 @@ abstract class DStream[T: ClassTag] ( println("Time: " + time) println("-------------------------------------------") firstNum.take(num).foreach(println) - if (firstNum.size > num) println("...") + if (firstNum.length > num) println("...") println() } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala index 30cf87f5b7dd1..3c481bf3491f9 100644 --- 
a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala @@ -25,15 +25,49 @@ import scala.util.Try */ private[streaming] class Job(val time: Time, func: () => _) { - var id: String = _ - var result: Try[_] = null + private var _id: String = _ + private var _outputOpId: Int = _ + private var isSet = false + private var _result: Try[_] = null def run() { - result = Try(func()) + _result = Try(func()) } - def setId(number: Int) { - id = "streaming job " + time + "." + number + def result: Try[_] = { + if (_result == null) { + throw new IllegalStateException("Cannot access result before job finishes") + } + _result + } + + /** + * @return the global unique id of this Job. + */ + def id: String = { + if (!isSet) { + throw new IllegalStateException("Cannot access id before calling setId") + } + _id + } + + /** + * @return the output op id of this Job. Each Job has a unique output op id in the same JobSet. + */ + def outputOpId: Int = { + if (!isSet) { + throw new IllegalStateException("Cannot access number before calling setId") + } + _outputOpId + } + + def setOutputOpId(outputOpId: Int) { + if (isSet) { + throw new IllegalStateException("Cannot call setOutputOpId more than once") + } + isSet = true + _id = s"streaming job $time.$outputOpId" + _outputOpId = outputOpId } override def toString: String = id diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala index 508b89278dcba..c7a2c1141a128 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala @@ -172,16 +172,28 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { ssc.waiter.notifyError(e) } - private class JobHandler(job: Job) extends Runnable { + private class JobHandler(job: Job) extends Runnable with Logging { def run() { - eventLoop.post(JobStarted(job)) - // Disable checks for existing output directories in jobs launched by the streaming scheduler, - // since we may need to write output to an existing directory during checkpoint recovery; - // see SPARK-4835 for more details. - PairRDDFunctions.disableOutputSpecValidation.withValue(true) { - job.run() + ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString) + ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString) + try { + eventLoop.post(JobStarted(job)) + // Disable checks for existing output directories in jobs launched by the streaming + // scheduler, since we may need to write output to an existing directory during checkpoint + // recovery; see SPARK-4835 for more details. 
+ PairRDDFunctions.disableOutputSpecValidation.withValue(true) { + job.run() + } + eventLoop.post(JobCompleted(job)) + } finally { + ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, null) + ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, null) } - eventLoop.post(JobCompleted(job)) } } } + +private[streaming] object JobScheduler { + val BATCH_TIME_PROPERTY_KEY = "spark.streaming.internal.batchTime" + val OUTPUT_OP_ID_PROPERTY_KEY = "spark.streaming.internal.outputOpId" +} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala index 5b134877d0b2d..24b3794236ea5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala @@ -35,7 +35,7 @@ case class JobSet( private var processingStartTime = -1L // when the first job of this jobset started processing private var processingEndTime = -1L // when the last job of this jobset finished processing - jobs.zipWithIndex.foreach { case (job, i) => job.setId(i) } + jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala index df1c0a10704c3..e219e27785533 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala @@ -19,7 +19,6 @@ package org.apache.spark.streaming.ui import scala.xml.Node -import org.apache.spark.streaming.scheduler.BatchInfo import org.apache.spark.ui.UIUtils private[ui] abstract class BatchTableBase(tableId: String) { @@ -31,18 +30,20 @@ private[ui] abstract class BatchTableBase(tableId: String) { Processing Time } - protected def baseRow(batch: BatchInfo): Seq[Node] = { + protected def baseRow(batch: BatchUIData): Seq[Node] = { val batchTime = batch.batchTime.milliseconds val formattedBatchTime = UIUtils.formatDate(batch.batchTime.milliseconds) - val eventCount = batch.receivedBlockInfo.values.map { - receivers => receivers.map(_.numRecords).sum - }.sum + val eventCount = batch.numRecords val schedulingDelay = batch.schedulingDelay val formattedSchedulingDelay = schedulingDelay.map(UIUtils.formatDuration).getOrElse("-") val processingTime = batch.processingDelay val formattedProcessingTime = processingTime.map(UIUtils.formatDuration).getOrElse("-") - {formattedBatchTime} + + + {formattedBatchTime} + + {eventCount.toString} events {formattedSchedulingDelay} @@ -73,8 +74,9 @@ private[ui] abstract class BatchTableBase(tableId: String) { protected def renderRows: Seq[Node] } -private[ui] class ActiveBatchTable(runningBatches: Seq[BatchInfo], waitingBatches: Seq[BatchInfo]) - extends BatchTableBase("active-batches-table") { +private[ui] class ActiveBatchTable( + runningBatches: Seq[BatchUIData], + waitingBatches: Seq[BatchUIData]) extends BatchTableBase("active-batches-table") { override protected def columns: Seq[Node] = super.columns ++ Status @@ -85,16 +87,16 @@ private[ui] class ActiveBatchTable(runningBatches: Seq[BatchInfo], waitingBatche runningBatches.flatMap(batch => {runningBatchRow(batch)}) } - private def runningBatchRow(batch: BatchInfo): Seq[Node] = { + private def runningBatchRow(batch: BatchUIData): Seq[Node] = { baseRow(batch) ++ 
processing } - private def waitingBatchRow(batch: BatchInfo): Seq[Node] = { + private def waitingBatchRow(batch: BatchUIData): Seq[Node] = { baseRow(batch) ++ queued } } -private[ui] class CompletedBatchTable(batches: Seq[BatchInfo]) +private[ui] class CompletedBatchTable(batches: Seq[BatchUIData]) extends BatchTableBase("completed-batches-table") { override protected def columns: Seq[Node] = super.columns ++ Total Delay @@ -103,7 +105,7 @@ private[ui] class CompletedBatchTable(batches: Seq[BatchInfo]) batches.flatMap(batch => {completedBatchRow(batch)}) } - private def completedBatchRow(batch: BatchInfo): Seq[Node] = { + private def completedBatchRow(batch: BatchUIData): Seq[Node] = { val totalDelay = batch.totalDelay val formattedTotalDelay = totalDelay.map(UIUtils.formatDuration).getOrElse("-") baseRow(batch) ++ diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala new file mode 100644 index 0000000000000..2da9a29e2529e --- /dev/null +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala @@ -0,0 +1,264 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.ui + +import javax.servlet.http.HttpServletRequest + +import scala.xml.{NodeSeq, Node} + +import org.apache.commons.lang3.StringEscapeUtils + +import org.apache.spark.streaming.Time +import org.apache.spark.ui.{UIUtils, WebUIPage} +import org.apache.spark.streaming.ui.StreamingJobProgressListener.{SparkJobId, OutputOpId} +import org.apache.spark.ui.jobs.UIData.JobUIData + + +private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { + private val streamingListener = parent.listener + private val sparkListener = parent.ssc.sc.jobProgressListener + + private def columns: Seq[Node] = { + Output Op Id + Description + Duration + Job Id + Duration + Stages: Succeeded/Total + Tasks (for all stages): Succeeded/Total + Error + } + + /** + * Generate a row for a Spark Job. Because duplicated output op infos needs to be collapsed into + * one cell, we use "rowspan" for the first row of a output op. 
+ */ + def generateJobRow( + outputOpId: OutputOpId, + formattedOutputOpDuration: String, + numSparkJobRowsInOutputOp: Int, + isFirstRow: Boolean, + sparkJob: JobUIData): Seq[Node] = { + val lastStageInfo = Option(sparkJob.stageIds) + .filter(_.nonEmpty) + .flatMap { ids => sparkListener.stageIdToInfo.get(ids.max) } + val lastStageData = lastStageInfo.flatMap { s => + sparkListener.stageIdToData.get((s.stageId, s.attemptId)) + } + + val lastStageName = lastStageInfo.map(_.name).getOrElse("(Unknown Stage Name)") + val lastStageDescription = lastStageData.flatMap(_.description).getOrElse("") + val duration: Option[Long] = { + sparkJob.submissionTime.map { start => + val end = sparkJob.completionTime.getOrElse(System.currentTimeMillis()) + end - start + } + } + val lastFailureReason = + sparkJob.stageIds.sorted.reverse.flatMap(sparkListener.stageIdToInfo.get). + dropWhile(_.failureReason == None).take(1). // get the first info that contains failure + flatMap(info => info.failureReason).headOption.getOrElse("") + val formattedDuration = duration.map(d => UIUtils.formatDuration(d)).getOrElse("-") + val detailUrl = s"${UIUtils.prependBaseUri(parent.basePath)}/jobs/job?id=${sparkJob.jobId}" + + // In the first row, output op id and its information needs to be shown. In other rows, these + // cells will be taken up due to "rowspan". + // scalastyle:off + val prefixCells = + if (isFirstRow) { + {outputOpId.toString} + + + {lastStageDescription} + {lastStageName} + + {formattedOutputOpDuration} + } else { + Nil + } + // scalastyle:on + + + {prefixCells} + + + {sparkJob.jobId}{sparkJob.jobGroup.map(id => s"($id)").getOrElse("")} + + + + {formattedDuration} + + + {sparkJob.completedStageIndices.size}/{sparkJob.stageIds.size - sparkJob.numSkippedStages} + {if (sparkJob.numFailedStages > 0) s"(${sparkJob.numFailedStages} failed)"} + {if (sparkJob.numSkippedStages > 0) s"(${sparkJob.numSkippedStages} skipped)"} + + + { + UIUtils.makeProgressBar( + started = sparkJob.numActiveTasks, + completed = sparkJob.numCompletedTasks, + failed = sparkJob.numFailedTasks, + skipped = sparkJob.numSkippedTasks, + total = sparkJob.numTasks - sparkJob.numSkippedTasks) + } + + {failureReasonCell(lastFailureReason)} + + } + + private def generateOutputOpIdRow( + outputOpId: OutputOpId, sparkJobs: Seq[JobUIData]): Seq[Node] = { + val sparkjobDurations = sparkJobs.map(sparkJob => { + sparkJob.submissionTime.map { start => + val end = sparkJob.completionTime.getOrElse(System.currentTimeMillis()) + end - start + } + }) + val formattedOutputOpDuration = + if (sparkjobDurations.exists(_ == None)) { + // If any job does not finish, set "formattedOutputOpDuration" to "-" + "-" + } else { + UIUtils.formatDuration(sparkjobDurations.flatMap(x => x).sum) + } + generateJobRow(outputOpId, formattedOutputOpDuration, sparkJobs.size, true, sparkJobs.head) ++ + sparkJobs.tail.map { sparkJob => + generateJobRow(outputOpId, formattedOutputOpDuration, sparkJobs.size, false, sparkJob) + }.flatMap(x => x) + } + + private def failureReasonCell(failureReason: String): Seq[Node] = { + val isMultiline = failureReason.indexOf('\n') >= 0 + // Display the first line by default + val failureReasonSummary = StringEscapeUtils.escapeHtml4( + if (isMultiline) { + failureReason.substring(0, failureReason.indexOf('\n')) + } else { + failureReason + }) + val details = if (isMultiline) { + // scalastyle:off + + +details + ++ + + // scalastyle:on + } else { + "" + } + {failureReasonSummary}{details} + } + + private def getJobData(sparkJobId: SparkJobId): 
Option[JobUIData] = { + sparkListener.activeJobs.get(sparkJobId).orElse { + sparkListener.completedJobs.find(_.jobId == sparkJobId).orElse { + sparkListener.failedJobs.find(_.jobId == sparkJobId) + } + } + } + + /** + * Generate the job table for the batch. + */ + private def generateJobTable(batchUIData: BatchUIData): Seq[Node] = { + val outputOpIdToSparkJobIds = batchUIData.outputOpIdSparkJobIdPairs.groupBy(_.outputOpId).toSeq. + sortBy(_._1). // sorted by OutputOpId + map { case (outputOpId, outputOpIdAndSparkJobIds) => + // sort SparkJobIds for each OutputOpId + (outputOpId, outputOpIdAndSparkJobIds.map(_.sparkJobId).sorted) + } + sparkListener.synchronized { + val outputOpIdWithJobs: Seq[(OutputOpId, Seq[JobUIData])] = + outputOpIdToSparkJobIds.map { case (outputOpId, sparkJobIds) => + // Filter out spark Job ids that don't exist in sparkListener + (outputOpId, sparkJobIds.flatMap(getJobData)) + } + + + + {columns} + + + { + outputOpIdWithJobs.map { + case (outputOpId, jobs) => generateOutputOpIdRow(outputOpId, jobs) + } + } + +
+ } + } + + def render(request: HttpServletRequest): Seq[Node] = { + val batchTime = Option(request.getParameter("id")).map(id => Time(id.toLong)).getOrElse { + throw new IllegalArgumentException(s"Missing id parameter") + } + val formattedBatchTime = UIUtils.formatDate(batchTime.milliseconds) + + val batchUIData = streamingListener.getBatchUIData(batchTime).getOrElse { + throw new IllegalArgumentException(s"Batch $formattedBatchTime does not exist") + } + + val formattedSchedulingDelay = + batchUIData.schedulingDelay.map(UIUtils.formatDuration).getOrElse("-") + val formattedProcessingTime = + batchUIData.processingDelay.map(UIUtils.formatDuration).getOrElse("-") + val formattedTotalDelay = batchUIData.totalDelay.map(UIUtils.formatDuration).getOrElse("-") + + val summary: NodeSeq = +
<div>
+        <ul class="unstyled">
+          <li>
+            <strong>Batch Duration: </strong>
+            {UIUtils.formatDuration(streamingListener.batchDuration)}
+          </li>
+          <li>
+            <strong>Input data size: </strong>
+            {batchUIData.numRecords} records
+          </li>
+          <li>
+            <strong>Scheduling delay: </strong>
+            {formattedSchedulingDelay}
+          </li>
+          <li>
+            <strong>Processing time: </strong>
+            {formattedProcessingTime}
+          </li>
+          <li>
+            <strong>Total delay: </strong>
+            {formattedTotalDelay}
+          </li>
+        </ul>
+      </div>
+ + val jobTable = + if (batchUIData.outputOpIdSparkJobIdPairs.isEmpty) { +
<div>Cannot find any job for Batch {formattedBatchTime}.</div>
+ } else { + generateJobTable(batchUIData) + } + + val content = summary ++ jobTable + + UIUtils.headerSparkPage(s"Details of batch at $formattedBatchTime", content, parent) + } +} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala new file mode 100644 index 0000000000000..f45c291b7c0fe --- /dev/null +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.spark.streaming.ui + +import org.apache.spark.streaming.Time +import org.apache.spark.streaming.scheduler.BatchInfo +import org.apache.spark.streaming.ui.StreamingJobProgressListener._ + +private[ui] case class OutputOpIdAndSparkJobId(outputOpId: OutputOpId, sparkJobId: SparkJobId) + +private[ui] case class BatchUIData( + val batchTime: Time, + val receiverNumRecords: Map[Int, Long], + val submissionTime: Long, + val processingStartTime: Option[Long], + val processingEndTime: Option[Long], + var outputOpIdSparkJobIdPairs: Seq[OutputOpIdAndSparkJobId] = Seq.empty) { + + /** + * Time taken for the first job of this batch to start processing from the time this batch + * was submitted to the streaming scheduler. Essentially, it is + * `processingStartTime` - `submissionTime`. + */ + def schedulingDelay: Option[Long] = processingStartTime.map(_ - submissionTime) + + /** + * Time taken for the all jobs of this batch to finish processing from the time they started + * processing. Essentially, it is `processingEndTime` - `processingStartTime`. + */ + def processingDelay: Option[Long] = { + for (start <- processingStartTime; + end <- processingEndTime) + yield end - start + } + + /** + * Time taken for all the jobs of this batch to finish processing from the time they + * were submitted. Essentially, it is `processingDelay` + `schedulingDelay`. + */ + def totalDelay: Option[Long] = processingEndTime.map(_ - submissionTime) + + /** + * The number of recorders received by the receivers in this batch. 
+ */ + def numRecords: Long = receiverNumRecords.map(_._2).sum +} + +private[ui] object BatchUIData { + + def apply(batchInfo: BatchInfo): BatchUIData = { + new BatchUIData( + batchInfo.batchTime, + batchInfo.receivedBlockInfo.mapValues(_.map(_.numRecords).sum), + batchInfo.submissionTime, + batchInfo.processingStartTime, + batchInfo.processingEndTime + ) + } +} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala index be1e8686cf9fa..34b55717a1db2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala @@ -17,29 +17,58 @@ package org.apache.spark.streaming.ui -import scala.collection.mutable.{Queue, HashMap} +import java.util.LinkedHashMap +import java.util.{Map => JMap} +import java.util.Properties +import scala.collection.mutable.{ArrayBuffer, Queue, HashMap, SynchronizedBuffer} + +import org.apache.spark.scheduler._ import org.apache.spark.streaming.{Time, StreamingContext} import org.apache.spark.streaming.scheduler._ import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted import org.apache.spark.streaming.scheduler.StreamingListenerBatchStarted -import org.apache.spark.streaming.scheduler.BatchInfo import org.apache.spark.streaming.scheduler.StreamingListenerBatchSubmitted import org.apache.spark.util.Distribution private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) - extends StreamingListener { + extends StreamingListener with SparkListener { - private val waitingBatchInfos = new HashMap[Time, BatchInfo] - private val runningBatchInfos = new HashMap[Time, BatchInfo] - private val completedBatchInfos = new Queue[BatchInfo] - private val batchInfoLimit = ssc.conf.getInt("spark.streaming.ui.retainedBatches", 100) + private val waitingBatchUIData = new HashMap[Time, BatchUIData] + private val runningBatchUIData = new HashMap[Time, BatchUIData] + private val completedBatchUIData = new Queue[BatchUIData] + private val batchUIDataLimit = ssc.conf.getInt("spark.streaming.ui.retainedBatches", 100) private var totalCompletedBatches = 0L private var totalReceivedRecords = 0L private var totalProcessedRecords = 0L private val receiverInfos = new HashMap[Int, ReceiverInfo] + // Because onJobStart and onBatchXXX messages are processed in different threads, + // we may not be able to get the corresponding BatchUIData when receiving onJobStart. So here we + // cannot use a map of (Time, BatchUIData). + private[ui] val batchTimeToOutputOpIdSparkJobIdPair = + new LinkedHashMap[Time, SynchronizedBuffer[OutputOpIdAndSparkJobId]] { + override def removeEldestEntry( + p1: JMap.Entry[Time, SynchronizedBuffer[OutputOpIdAndSparkJobId]]): Boolean = { + // If a lot of "onBatchCompleted"s happen before "onJobStart" (image if + // SparkContext.listenerBus is very slow), "batchTimeToOutputOpIdToSparkJobIds" + // may add some information for a removed batch when processing "onJobStart". It will be a + // memory leak. + // + // To avoid the memory leak, we control the size of "batchTimeToOutputOpIdToSparkJobIds" and + // evict the eldest one. + // + // Note: if "onJobStart" happens before "onBatchSubmitted", the size of + // "batchTimeToOutputOpIdToSparkJobIds" may be greater than the number of the retained + // batches temporarily, so here we use "10" to handle such case. 
This is not a perfect + // solution, but at least it can handle most of cases. + size() > + waitingBatchUIData.size + runningBatchUIData.size + completedBatchUIData.size + 10 + } + } + + val batchDuration = ssc.graph.batchDuration.milliseconds override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) { @@ -62,37 +91,62 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) override def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted): Unit = { synchronized { - waitingBatchInfos(batchSubmitted.batchInfo.batchTime) = batchSubmitted.batchInfo + waitingBatchUIData(batchSubmitted.batchInfo.batchTime) = + BatchUIData(batchSubmitted.batchInfo) } } override def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = synchronized { - runningBatchInfos(batchStarted.batchInfo.batchTime) = batchStarted.batchInfo - waitingBatchInfos.remove(batchStarted.batchInfo.batchTime) + val batchUIData = BatchUIData(batchStarted.batchInfo) + runningBatchUIData(batchStarted.batchInfo.batchTime) = BatchUIData(batchStarted.batchInfo) + waitingBatchUIData.remove(batchStarted.batchInfo.batchTime) - batchStarted.batchInfo.receivedBlockInfo.foreach { case (_, infos) => - totalReceivedRecords += infos.map(_.numRecords).sum - } + totalReceivedRecords += batchUIData.numRecords } override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { synchronized { - waitingBatchInfos.remove(batchCompleted.batchInfo.batchTime) - runningBatchInfos.remove(batchCompleted.batchInfo.batchTime) - completedBatchInfos.enqueue(batchCompleted.batchInfo) - if (completedBatchInfos.size > batchInfoLimit) completedBatchInfos.dequeue() + waitingBatchUIData.remove(batchCompleted.batchInfo.batchTime) + runningBatchUIData.remove(batchCompleted.batchInfo.batchTime) + val batchUIData = BatchUIData(batchCompleted.batchInfo) + completedBatchUIData.enqueue(batchUIData) + if (completedBatchUIData.size > batchUIDataLimit) { + val removedBatch = completedBatchUIData.dequeue() + batchTimeToOutputOpIdSparkJobIdPair.remove(removedBatch.batchTime) + } totalCompletedBatches += 1L - batchCompleted.batchInfo.receivedBlockInfo.foreach { case (_, infos) => - totalProcessedRecords += infos.map(_.numRecords).sum + totalProcessedRecords += batchUIData.numRecords + } + } + + override def onJobStart(jobStart: SparkListenerJobStart): Unit = synchronized { + getBatchTimeAndOutputOpId(jobStart.properties).foreach { case (batchTime, outputOpId) => + var outputOpIdToSparkJobIds = batchTimeToOutputOpIdSparkJobIdPair.get(batchTime) + if (outputOpIdToSparkJobIds == null) { + outputOpIdToSparkJobIds = + new ArrayBuffer[OutputOpIdAndSparkJobId]() + with SynchronizedBuffer[OutputOpIdAndSparkJobId] + batchTimeToOutputOpIdSparkJobIdPair.put(batchTime, outputOpIdToSparkJobIds) } + outputOpIdToSparkJobIds += OutputOpIdAndSparkJobId(outputOpId, jobStart.jobId) } } - def numReceivers: Int = synchronized { - ssc.graph.getReceiverInputStreams().size + private def getBatchTimeAndOutputOpId(properties: Properties): Option[(Time, Int)] = { + val batchTime = properties.getProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY) + if (batchTime == null) { + // Not submitted from JobScheduler + None + } else { + val outputOpId = properties.getProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY) + assert(outputOpId != null) + Some(Time(batchTime.toLong) -> outputOpId.toInt) + } } + def numReceivers: Int = ssc.graph.getReceiverInputStreams().size + def numTotalCompletedBatches: Long = synchronized { 
totalCompletedBatches } @@ -106,19 +160,19 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) } def numUnprocessedBatches: Long = synchronized { - waitingBatchInfos.size + runningBatchInfos.size + waitingBatchUIData.size + runningBatchUIData.size } - def waitingBatches: Seq[BatchInfo] = synchronized { - waitingBatchInfos.values.toSeq + def waitingBatches: Seq[BatchUIData] = synchronized { + waitingBatchUIData.values.toSeq } - def runningBatches: Seq[BatchInfo] = synchronized { - runningBatchInfos.values.toSeq + def runningBatches: Seq[BatchUIData] = synchronized { + runningBatchUIData.values.toSeq } - def retainedCompletedBatches: Seq[BatchInfo] = synchronized { - completedBatchInfos.toSeq + def retainedCompletedBatches: Seq[BatchUIData] = synchronized { + completedBatchUIData.toSeq } def processingDelayDistribution: Option[Distribution] = synchronized { @@ -134,15 +188,11 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) } def receivedRecordsDistributions: Map[Int, Option[Distribution]] = synchronized { - val latestBatchInfos = retainedBatches.reverse.take(batchInfoLimit) - val latestBlockInfos = latestBatchInfos.map(_.receivedBlockInfo) + val latestBatches = retainedBatches.reverse.take(batchUIDataLimit) (0 until numReceivers).map { receiverId => - val blockInfoOfParticularReceiver = latestBlockInfos.map { batchInfo => - batchInfo.get(receiverId).getOrElse(Array.empty) - } - val recordsOfParticularReceiver = blockInfoOfParticularReceiver.map { blockInfo => - // calculate records per second for each batch - blockInfo.map(_.numRecords).sum.toDouble * 1000 / batchDuration + val recordsOfParticularReceiver = latestBatches.map { batch => + // calculate records per second for each batch + batch.receiverNumRecords.get(receiverId).sum.toDouble * 1000 / batchDuration } val distributionOption = Distribution(recordsOfParticularReceiver) (receiverId, distributionOption) @@ -150,10 +200,10 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) } def lastReceivedBatchRecords: Map[Int, Long] = synchronized { - val lastReceivedBlockInfoOption = lastReceivedBatch.map(_.receivedBlockInfo) + val lastReceivedBlockInfoOption = lastReceivedBatch.map(_.receiverNumRecords) lastReceivedBlockInfoOption.map { lastReceivedBlockInfo => (0 until numReceivers).map { receiverId => - (receiverId, lastReceivedBlockInfo(receiverId).map(_.numRecords).sum) + (receiverId, lastReceivedBlockInfo.getOrElse(receiverId, 0L)) }.toMap }.getOrElse { (0 until numReceivers).map(receiverId => (receiverId, 0L)).toMap @@ -164,20 +214,39 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) receiverInfos.get(receiverId) } - def lastCompletedBatch: Option[BatchInfo] = synchronized { - completedBatchInfos.sortBy(_.batchTime)(Time.ordering).lastOption + def lastCompletedBatch: Option[BatchUIData] = synchronized { + completedBatchUIData.sortBy(_.batchTime)(Time.ordering).lastOption } - def lastReceivedBatch: Option[BatchInfo] = synchronized { + def lastReceivedBatch: Option[BatchUIData] = synchronized { retainedBatches.lastOption } - private def retainedBatches: Seq[BatchInfo] = { - (waitingBatchInfos.values.toSeq ++ - runningBatchInfos.values.toSeq ++ completedBatchInfos).sortBy(_.batchTime)(Time.ordering) + private def retainedBatches: Seq[BatchUIData] = { + (waitingBatchUIData.values.toSeq ++ + runningBatchUIData.values.toSeq ++ completedBatchUIData).sortBy(_.batchTime)(Time.ordering) + } + + private def 
extractDistribution(getMetric: BatchUIData => Option[Long]): Option[Distribution] = { + Distribution(completedBatchUIData.flatMap(getMetric(_)).map(_.toDouble)) } - private def extractDistribution(getMetric: BatchInfo => Option[Long]): Option[Distribution] = { - Distribution(completedBatchInfos.flatMap(getMetric(_)).map(_.toDouble)) + def getBatchUIData(batchTime: Time): Option[BatchUIData] = synchronized { + val batchUIData = waitingBatchUIData.get(batchTime).orElse { + runningBatchUIData.get(batchTime).orElse { + completedBatchUIData.find(batch => batch.batchTime == batchTime) + } + } + batchUIData.foreach { _batchUIData => + val outputOpIdToSparkJobIds = + Option(batchTimeToOutputOpIdSparkJobIdPair.get(batchTime)).getOrElse(Seq.empty) + _batchUIData.outputOpIdSparkJobIdPairs = outputOpIdToSparkJobIds + } + batchUIData } } + +private[streaming] object StreamingJobProgressListener { + type SparkJobId = Int + type OutputOpId = Int +} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala index 9a860ea4a6c68..e4039639adbad 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala @@ -27,14 +27,16 @@ import StreamingTab._ * Spark Web UI tab that shows statistics of a streaming job. * This assumes the given SparkContext has enabled its SparkUI. */ -private[spark] class StreamingTab(ssc: StreamingContext) +private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(getSparkUI(ssc), "streaming") with Logging { val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) + ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) + attachPage(new BatchPage(this)) parent.attachTab(this) def detach() { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala index 205ddf6dbe9b0..8de43baabc21d 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.streaming +import scala.collection.mutable.Queue + import org.openqa.selenium.WebDriver import org.openqa.selenium.htmlunit.HtmlUnitDriver import org.scalatest._ @@ -60,8 +62,28 @@ class UISeleniumSuite ssc } + private def setupStreams(ssc: StreamingContext): Unit = { + val rdds = Queue(ssc.sc.parallelize(1 to 4, 4)) + val inputStream = ssc.queueStream(rdds) + inputStream.foreachRDD { rdd => + rdd.foreach(_ => {}) + rdd.foreach(_ => {}) + } + inputStream.foreachRDD { rdd => + rdd.foreach(_ => {}) + try { + rdd.foreach(_ => throw new RuntimeException("Oops")) + } catch { + case e: SparkException if e.getMessage.contains("Oops") => + } + } + } + test("attaching and detaching a Streaming tab") { withStreamingContext(newSparkStreamingContext()) { ssc => + setupStreams(ssc) + ssc.start() + val sparkUI = ssc.sparkContext.ui.get eventually(timeout(10 seconds), interval(50 milliseconds)) { @@ -77,8 +99,8 @@ class UISeleniumSuite statisticText should contain("Batch interval:") val h4Text = findAll(cssSelector("h4")).map(_.text).toSeq - h4Text should contain("Active Batches (0)") - h4Text should contain("Completed Batches (last 0 out of 0)") + h4Text.exists(_.matches("Active Batches \\(\\d+\\)")) should be (true) + 
h4Text.exists(_.matches("Completed Batches \\(last \\d+ out of \\d+\\)")) should be (true) findAll(cssSelector("""#active-batches-table th""")).map(_.text).toSeq should be { List("Batch Time", "Input Size", "Scheduling Delay", "Processing Time", "Status") @@ -86,6 +108,63 @@ class UISeleniumSuite findAll(cssSelector("""#completed-batches-table th""")).map(_.text).toSeq should be { List("Batch Time", "Input Size", "Scheduling Delay", "Processing Time", "Total Delay") } + + val batchLinks = + findAll(cssSelector("""#completed-batches-table a""")).flatMap(_.attribute("href")).toSeq + batchLinks.size should be >= 1 + + // Check a normal batch page + go to (batchLinks.last) // Last should be the first batch, so it will have some jobs + val summaryText = findAll(cssSelector("li strong")).map(_.text).toSeq + summaryText should contain ("Batch Duration:") + summaryText should contain ("Input data size:") + summaryText should contain ("Scheduling delay:") + summaryText should contain ("Processing time:") + summaryText should contain ("Total delay:") + + findAll(cssSelector("""#batch-job-table th""")).map(_.text).toSeq should be { + List("Output Op Id", "Description", "Duration", "Job Id", "Duration", + "Stages: Succeeded/Total", "Tasks (for all stages): Succeeded/Total", "Error") + } + + // Check we have 2 output op ids + val outputOpIds = findAll(cssSelector(".output-op-id-cell")).toSeq + outputOpIds.map(_.attribute("rowspan")) should be (List(Some("2"), Some("2"))) + outputOpIds.map(_.text) should be (List("0", "1")) + + // Check job ids + val jobIdCells = findAll(cssSelector( """#batch-job-table a""")).toSeq + jobIdCells.map(_.text) should be (List("0", "1", "2", "3")) + + val jobLinks = jobIdCells.flatMap(_.attribute("href")) + jobLinks.size should be (4) + + // Check stage progress + findAll(cssSelector(""".stage-progress-cell""")).map(_.text).toSeq should be + (List("1/1", "1/1", "1/1", "0/1 (1 failed)")) + + // Check job progress + findAll(cssSelector(""".progress-cell""")).map(_.text).toSeq should be + (List("1/1", "1/1", "1/1", "0/1 (1 failed)")) + + // Check stacktrace + val errorCells = findAll(cssSelector(""".stacktrace-details""")).map(_.text).toSeq + errorCells should have size 1 + errorCells(0) should include("java.lang.RuntimeException: Oops") + + // Check the job link in the batch page is right + go to (jobLinks(0)) + val jobDetails = findAll(cssSelector("li strong")).map(_.text).toSeq + jobDetails should contain("Status:") + jobDetails should contain("Completed Stages:") + + // Check a batch page without id + go to (sparkUI.appUIAddress.stripSuffix("/") + "/streaming/batch/") + webDriver.getPageSource should include ("Missing id parameter") + + // Check a non-exist batch + go to (sparkUI.appUIAddress.stripSuffix("/") + "/streaming/batch/?id=12345") + webDriver.getPageSource should include ("does not exist") } ssc.stop(false) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala index 94b1985116feb..fa89536de4054 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala @@ -17,8 +17,11 @@ package org.apache.spark.streaming.ui +import java.util.Properties + import org.scalatest.Matchers +import org.apache.spark.scheduler.SparkListenerJobStart import 
org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.scheduler._ import org.apache.spark.streaming.{Duration, Time, Milliseconds, TestSuiteBase} @@ -28,6 +31,17 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { val input = (1 to 4).map(Seq(_)).toSeq val operation = (d: DStream[Int]) => d.map(x => x) + private def createJobStart( + batchTime: Time, outputOpId: Int, jobId: Int): SparkListenerJobStart = { + val properties = new Properties() + properties.setProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, batchTime.milliseconds.toString) + properties.setProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, outputOpId.toString) + SparkListenerJobStart(jobId = jobId, + 0L, // unused + Nil, // unused + properties) + } + override def batchDuration: Duration = Milliseconds(100) test("onBatchSubmitted, onBatchStarted, onBatchCompleted, " + @@ -43,7 +57,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { // onBatchSubmitted val batchInfoSubmitted = BatchInfo(Time(1000), receivedBlockInfo, 1000, None, None) listener.onBatchSubmitted(StreamingListenerBatchSubmitted(batchInfoSubmitted)) - listener.waitingBatches should be (List(batchInfoSubmitted)) + listener.waitingBatches should be (List(BatchUIData(batchInfoSubmitted))) listener.runningBatches should be (Nil) listener.retainedCompletedBatches should be (Nil) listener.lastCompletedBatch should be (None) @@ -56,7 +70,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { val batchInfoStarted = BatchInfo(Time(1000), receivedBlockInfo, 1000, Some(2000), None) listener.onBatchStarted(StreamingListenerBatchStarted(batchInfoStarted)) listener.waitingBatches should be (Nil) - listener.runningBatches should be (List(batchInfoStarted)) + listener.runningBatches should be (List(BatchUIData(batchInfoStarted))) listener.retainedCompletedBatches should be (Nil) listener.lastCompletedBatch should be (None) listener.numUnprocessedBatches should be (1) @@ -64,13 +78,40 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { listener.numTotalProcessedRecords should be (0) listener.numTotalReceivedRecords should be (600) + // onJobStart + val jobStart1 = createJobStart(Time(1000), outputOpId = 0, jobId = 0) + listener.onJobStart(jobStart1) + + val jobStart2 = createJobStart(Time(1000), outputOpId = 0, jobId = 1) + listener.onJobStart(jobStart2) + + val jobStart3 = createJobStart(Time(1000), outputOpId = 1, jobId = 0) + listener.onJobStart(jobStart3) + + val jobStart4 = createJobStart(Time(1000), outputOpId = 1, jobId = 1) + listener.onJobStart(jobStart4) + + val batchUIData = listener.getBatchUIData(Time(1000)) + batchUIData should not be None + batchUIData.get.batchTime should be (batchInfoStarted.batchTime) + batchUIData.get.schedulingDelay should be (batchInfoStarted.schedulingDelay) + batchUIData.get.processingDelay should be (batchInfoStarted.processingDelay) + batchUIData.get.totalDelay should be (batchInfoStarted.totalDelay) + batchUIData.get.receiverNumRecords should be (Map(0 -> 300L, 1 -> 300L)) + batchUIData.get.numRecords should be(600) + batchUIData.get.outputOpIdSparkJobIdPairs should be + Seq(OutputOpIdAndSparkJobId(0, 0), + OutputOpIdAndSparkJobId(0, 1), + OutputOpIdAndSparkJobId(1, 0), + OutputOpIdAndSparkJobId(1, 1)) + // onBatchCompleted val batchInfoCompleted = BatchInfo(Time(1000), receivedBlockInfo, 1000, Some(2000), None) listener.onBatchCompleted(StreamingListenerBatchCompleted(batchInfoCompleted)) 
listener.waitingBatches should be (Nil) listener.runningBatches should be (Nil) - listener.retainedCompletedBatches should be (List(batchInfoCompleted)) - listener.lastCompletedBatch should be (Some(batchInfoCompleted)) + listener.retainedCompletedBatches should be (List(BatchUIData(batchInfoCompleted))) + listener.lastCompletedBatch should be (Some(BatchUIData(batchInfoCompleted))) listener.numUnprocessedBatches should be (0) listener.numTotalCompletedBatches should be (1) listener.numTotalProcessedRecords should be (600) @@ -116,4 +157,55 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { listener.retainedCompletedBatches.size should be (limit) listener.numTotalCompletedBatches should be(limit + 10) } + + test("out-of-order onJobStart and onBatchXXX") { + val ssc = setupStreams(input, operation) + val limit = ssc.conf.getInt("spark.streaming.ui.retainedBatches", 100) + val listener = new StreamingJobProgressListener(ssc) + + // fulfill completedBatchInfos + for(i <- 0 until limit) { + val batchInfoCompleted = + BatchInfo(Time(1000 + i * 100), Map.empty, 1000 + i * 100, Some(2000 + i * 100), None) + listener.onBatchCompleted(StreamingListenerBatchCompleted(batchInfoCompleted)) + val jobStart = createJobStart(Time(1000 + i * 100), outputOpId = 0, jobId = 1) + listener.onJobStart(jobStart) + } + + // onJobStart happens before onBatchSubmitted + val jobStart = createJobStart(Time(1000 + limit * 100), outputOpId = 0, jobId = 0) + listener.onJobStart(jobStart) + + val batchInfoSubmitted = + BatchInfo(Time(1000 + limit * 100), Map.empty, (1000 + limit * 100), None, None) + listener.onBatchSubmitted(StreamingListenerBatchSubmitted(batchInfoSubmitted)) + + // We still can see the info retrieved from onJobStart + val batchUIData = listener.getBatchUIData(Time(1000 + limit * 100)) + batchUIData should not be None + batchUIData.get.batchTime should be (batchInfoSubmitted.batchTime) + batchUIData.get.schedulingDelay should be (batchInfoSubmitted.schedulingDelay) + batchUIData.get.processingDelay should be (batchInfoSubmitted.processingDelay) + batchUIData.get.totalDelay should be (batchInfoSubmitted.totalDelay) + batchUIData.get.receiverNumRecords should be (Map.empty) + batchUIData.get.numRecords should be (0) + batchUIData.get.outputOpIdSparkJobIdPairs should be (Seq(OutputOpIdAndSparkJobId(0, 0))) + + // A lot of "onBatchCompleted"s happen before "onJobStart" + for(i <- limit + 1 to limit * 2) { + val batchInfoCompleted = + BatchInfo(Time(1000 + i * 100), Map.empty, 1000 + i * 100, Some(2000 + i * 100), None) + listener.onBatchCompleted(StreamingListenerBatchCompleted(batchInfoCompleted)) + } + + for(i <- limit + 1 to limit * 2) { + val jobStart = createJobStart(Time(1000 + i * 100), outputOpId = 0, jobId = 1) + listener.onJobStart(jobStart) + } + + // We should not leak memory + listener.batchTimeToOutputOpIdSparkJobIdPair.size() should be <= + (listener.waitingBatches.size + listener.runningBatches.size + + listener.retainedCompletedBatches.size + 10) + } } From 7143f6e9718bae9cffa0a73df03ba8c9860ee129 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=91=E5=B3=A4?= Date: Wed, 29 Apr 2015 18:23:42 -0700 Subject: [PATCH 3/4] [SPARK-7234][SQL] Fix DateType mismatch when codegen on. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: 云峤 Closes #5778 from kaka1992/fix_codegenon_datetype_mismatch and squashes the following commits: 1ad4cff [云峤] SPARK-7234 fix dateType mismatch --- .../spark/sql/catalyst/expressions/codegen/CodeGenerator.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index dbc92fb93e95e..d17af0e7ff87e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -672,6 +672,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin case DoubleType => ru.Literal(Constant(-1.toDouble)) case DecimalType() => q"org.apache.spark.sql.types.Decimal(-1)" case IntegerType => ru.Literal(Constant(-1)) + case DateType => ru.Literal(Constant(-1)) case _ => ru.Literal(Constant(null)) } From 5553198fe521fb38b600b7687f7780d89a6e1cb9 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Wed, 29 Apr 2015 19:13:47 -0700 Subject: [PATCH 4/4] [SPARK-7156][SQL] Addressed follow up comments for randomSplit small fixes regarding comments in PR #5761 cc rxin Author: Burak Yavuz Closes #5795 from brkyvz/split-followup and squashes the following commits: 369c522 [Burak Yavuz] changed wording a little 1ea456f [Burak Yavuz] Addressed follow up comments --- python/pyspark/sql/dataframe.py | 7 ++++++- .../src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 3074af3ed2e83..5908ebc990a56 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -437,6 +437,10 @@ def sample(self, withReplacement, fraction, seed=None): def randomSplit(self, weights, seed=None): """Randomly splits this :class:`DataFrame` with the provided weights. + :param weights: list of doubles as weights with which to split the DataFrame. Weights will + be normalized if they don't sum up to 1.0. + :param seed: The seed for sampling. + >>> splits = df4.randomSplit([1.0, 2.0], 24) >>> splits[0].count() 1 @@ -445,7 +449,8 @@ def randomSplit(self, weights, seed=None): 3 """ for w in weights: - assert w >= 0.0, "Negative weight value: %s" % w + if w < 0.0: + raise ValueError("Weights must be positive. Found weight value: %s" % w) seed = seed if seed is not None else random.randint(0, sys.maxsize) rdd_array = self._jdf.randomSplit(_to_seq(self.sql_ctx._sc, weights), long(seed)) return [DataFrame(rdd, self.sql_ctx) for rdd in rdd_array] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 0d02e14c21be0..2669300029545 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -752,7 +752,7 @@ class DataFrame private[sql]( * @param seed Seed for sampling. * @group dfops */ - def randomSplit(weights: List[Double], seed: Long): Array[DataFrame] = { + private[spark] def randomSplit(weights: List[Double], seed: Long): Array[DataFrame] = { randomSplit(weights.toArray, seed) }
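
For context on how the updated randomSplit API is meant to be used after this patch, the following is a minimal, self-contained Scala sketch against the 1.x SQLContext API. The object name, column names, split ratios, and seed below are arbitrary choices for illustration and are not part of the patch itself; the point is that weights which do not sum to 1.0 are normalized, as the new docstring states, and that a negative weight is rejected (the Python side now raises ValueError instead of using assert).

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object RandomSplitSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("RandomSplitSketch").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Build a small DataFrame from an RDD of tuples.
    val df = sc.parallelize(1 to 100).map(i => (i, i % 10)).toDF("id", "bucket")

    // Weights need not sum to 1.0; per the updated docstring they are
    // normalized, so Array(1.0, 2.0) behaves like Array(1.0 / 3, 2.0 / 3).
    val Array(smaller, larger) = df.randomSplit(Array(1.0, 2.0), seed = 24L)

    println(s"smaller split: ${smaller.count()} rows, larger split: ${larger.count()} rows")

    sc.stop()
  }
}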