Merge branch 'master' into SPARK-31337
gaborgsomogyi committed May 27, 2020
2 parents 16687d6 + 8f2b6f3 commit 587a502
Showing 76 changed files with 2,544 additions and 789 deletions.
Binary file added docs/img/webui-structured-streaming-detail.png
49 changes: 21 additions & 28 deletions docs/sql-ref-datetime-pattern.md
@@ -30,25 +30,25 @@ Spark uses pattern letters in the following table for date and timestamp parsing

|Symbol|Meaning|Presentation|Examples|
|------|-------|------------|--------|
- |**G**|era|text|AD; Anno Domini; A|
+ |**G**|era|text|AD; Anno Domini|
|**y**|year|year|2020; 20|
- |**D**|day-of-year|number|189|
- |**M/L**|month-of-year|number/text|7; 07; Jul; July; J|
- |**d**|day-of-month|number|28|
+ |**D**|day-of-year|number(3)|189|
+ |**M/L**|month-of-year|month|7; 07; Jul; July|
+ |**d**|day-of-month|number(3)|28|
|**Q/q**|quarter-of-year|number/text|3; 03; Q3; 3rd quarter|
|**Y**|week-based-year|year|1996; 96|
- |**w**|week-of-week-based-year|number|27|
- |**W**|week-of-month|number|4|
- |**E**|day-of-week|text|Tue; Tuesday; T|
- |**u**|localized day-of-week|number/text|2; 02; Tue; Tuesday; T|
- |**F**|week-of-month|number|3|
- |**a**|am-pm-of-day|text|PM|
- |**h**|clock-hour-of-am-pm (1-12)|number|12|
- |**K**|hour-of-am-pm (0-11)|number|0|
- |**k**|clock-hour-of-day (1-24)|number|0|
- |**H**|hour-of-day (0-23)|number|0|
- |**m**|minute-of-hour|number|30|
- |**s**|second-of-minute|number|55|
+ |**w**|week-of-week-based-year|number(2)|27|
+ |**W**|week-of-month|number(1)|4|
+ |**E**|day-of-week|text|Tue; Tuesday|
+ |**u**|localized day-of-week|number/text|2; 02; Tue; Tuesday|
+ |**F**|week-of-month|number(1)|3|
+ |**a**|am-pm-of-day|am-pm|PM|
+ |**h**|clock-hour-of-am-pm (1-12)|number(2)|12|
+ |**K**|hour-of-am-pm (0-11)|number(2)|0|
+ |**k**|clock-hour-of-day (1-24)|number(2)|0|
+ |**H**|hour-of-day (0-23)|number(2)|0|
+ |**m**|minute-of-hour|number(2)|30|
+ |**s**|second-of-minute|number(2)|55|
|**S**|fraction-of-second|fraction|978|
|**V**|time-zone ID|zone-id|America/Los_Angeles; Z; -08:30|
|**z**|time-zone name|zone-name|Pacific Standard Time; PST|
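
As a quick illustration of the pattern letters above, here is a minimal spark-shell sketch (it assumes an active `SparkSession` named `spark` and the Spark 3.0 datetime parser; the outputs in the comments are indicative):

```scala
// Format a fixed date/timestamp with a handful of the pattern letters from the table.
spark.sql("""SELECT date_format(date '1970-07-28', 'G yyyy D M d E')""").show(false)
// e.g. "AD 1970 209 7 28 Tue"

spark.sql("""SELECT date_format(timestamp '1970-07-28 13:05:55', 'h K k H m s a')""").show(false)
// e.g. "1 1 13 13 5 55 PM"

// A reduced two-digit year parses against the base year 2000 (see the Year rule below).
spark.sql("""SELECT to_date('99-12-31', 'yy-MM-dd')""").show()
// e.g. 2099-12-31
```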
@@ -63,9 +63,9 @@ Spark uses pattern letters in the following table for date and timestamp parsing

The count of pattern letters determines the format.

- - Text: The text style is determined based on the number of pattern letters used. Less than 4 pattern letters will use the short form. Exactly 4 pattern letters will use the full form. Exactly 5 pattern letters will use the narrow form. Six or more letters will fail.
+ - Text: The text style is determined based on the number of pattern letters used. Less than 4 pattern letters will use the short form. Exactly 4 pattern letters will use the full form. Exactly 5 pattern letters will use the narrow form. 5 or more letters will fail.

- - Number: If the count of letters is one, then the value is output using the minimum number of digits and without padding. Otherwise, the count of digits is used as the width of the output field, with the value zero-padded as necessary. The following pattern letters have constraints on the count of letters. Only one letter 'F' can be specified. Up to two letters of 'd', 'H', 'h', 'K', 'k', 'm', and 's' can be specified. Up to three letters of 'D' can be specified.
+ - Number(n): Here, n is the maximum count of letters that can be used for this type of datetime pattern. If the count of letters is one, then the value is output using the minimum number of digits and without padding. Otherwise, the count of digits is used as the width of the output field, with the value zero-padded as necessary.

- Number/Text: If the count of pattern letters is 3 or greater, use the Text rules above. Otherwise use the Number rules above.
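
The Number and Text rules above can be seen side by side in this spark-shell sketch (again assuming a `SparkSession` named `spark`; the outputs shown in comments are indicative):

```scala
// Number: a single letter prints the minimum number of digits; repeating the letter zero-pads.
spark.sql("""SELECT date_format(timestamp '1970-01-01 07:05:09', 'H:m:s')""").show()
// 7:5:9
spark.sql("""SELECT date_format(timestamp '1970-01-01 07:05:09', 'HH:mm:ss')""").show()
// 07:05:09

// Text: fewer than 4 letters use the short form, exactly 4 the full form.
spark.sql("""SELECT date_format(date '1970-01-01', 'E')""").show()     // Thu
spark.sql("""SELECT date_format(date '1970-01-01', 'EEEE')""").show()  // Thursday
```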

@@ -76,7 +76,7 @@ The count of pattern letters determines the format.

- Year: The count of letters determines the minimum field width below which padding is used. If the count of letters is two, then a reduced two digit form is used. For printing, this outputs the rightmost two digits. For parsing, this will parse using the base value of 2000, resulting in a year within the range 2000 to 2099 inclusive. If the count of letters is less than four (but not two), then the sign is only output for negative years. Otherwise, the sign is output if the pad width is exceeded when 'G' is not present.

- - Month: If the number of pattern letters is 3 or more, the month is interpreted as text; otherwise, it is interpreted as a number. The text form depends on the letter used - 'M' denotes the 'standard' form, and 'L' is for the 'stand-alone' form. The difference between the 'standard' and 'stand-alone' forms is trickier to describe as there is no difference in English. However, in other languages there is a difference in the word used when the text is used alone, as opposed to in a complete date. For example, the word used for a month when used alone in a date picker is different to the word used for month in association with a day and year in a date. In Russian, 'Июль' is the stand-alone form of July, and 'Июля' is the standard form. Here are examples for all supported pattern letters (more than 5 letters is invalid):
+ - Month: If the number of pattern letters is 3 or more, the month is interpreted as text; otherwise, it is interpreted as a number. The text form depends on the letter used - 'M' denotes the 'standard' form, and 'L' is for the 'stand-alone' form. The difference between the 'standard' and 'stand-alone' forms is trickier to describe as there is no difference in English. However, in other languages there is a difference in the word used when the text is used alone, as opposed to in a complete date. For example, the word used for a month when used alone in a date picker is different to the word used for month in association with a day and year in a date. In Russian, 'Июль' is the stand-alone form of July, and 'Июля' is the standard form. Here are examples for all supported pattern letters (more than 4 letters is invalid):
- `'M'` or `'L'`: Month number in a year starting from 1. There is no difference between 'M' and 'L'. Months from 1 to 9 are printed without padding.
```sql
spark-sql> select date_format(date '1970-01-01', "M");
@@ -119,13 +119,8 @@ The count of pattern letters determines the format.
spark-sql> select to_csv(named_struct('date', date '1970-01-01'), map('dateFormat', 'LLLL', 'locale', 'RU'));
январь
```
- - `'LLLLL'` or `'MMMMM'`: Narrow textual representation of standard or stand-alone forms. Typically it is a single letter.
- ```sql
- spark-sql> select date_format(date '1970-07-01', "LLLLL");
- J
- spark-sql> select date_format(date '1970-01-01', "MMMMM");
- J
- ```

- am-pm: This outputs the am-pm-of-day. Pattern letter count must be 1.

- Zone ID(V): This outputs the time-zone ID. Pattern letter count must be 2.
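
For example, a small spark-shell sketch of the am-pm and zone-ID letters (the zone that gets printed depends on `spark.sql.session.timeZone`, so the value in the comment is only illustrative):

```scala
// 'a' (one letter) prints the am-pm marker; 'VV' prints the session time-zone ID.
spark.sql("""SELECT date_format(timestamp '1970-01-01 13:00:00', 'h a')""").show()
// 1 PM
spark.sql("""SELECT date_format(timestamp '1970-01-01 13:00:00', 'yyyy-MM-dd HH:mm VV')""").show(false)
// e.g. 1970-01-01 13:00 America/Los_Angeles
```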

@@ -147,5 +142,3 @@ More details for the text style:
- Short Form: Short text, typically an abbreviation. For example, day-of-week Monday might output "Mon".

- Full Form: Full text, typically the full description. For example, day-of-week Monday might output "Monday".

- - Narrow Form: Narrow text, typically a single letter. For example, day-of-week Monday might output "M".
28 changes: 28 additions & 0 deletions docs/web-ui.md
@@ -407,6 +407,34 @@ Here is the list of SQL metrics:

</table>

## Structured Streaming Tab
When running Structured Streaming jobs in micro-batch mode, a Structured Streaming tab will be
available on the Web UI. The overview page displays some brief statistics for running and completed
queries. Also, you can check the latest exception of a failed query. For detailed statistics, please
click a "run id" in the tables.

<p style="text-align: center;">
<img src="img/webui-structured-streaming-detail.png" title="Structured Streaming Query Statistics" alt="Structured Streaming Query Statistics">
</p>

The statistics page displays some useful metrics for insight into the status of your streaming
queries. Currently, it contains the following metrics.

* **Input Rate.** The aggregate (across all sources) rate of data arriving.
* **Process Rate.** The aggregate (across all sources) rate at which Spark is processing data.
* **Input Rows.** The aggregate (across all sources) number of records processed in a trigger.
* **Batch Duration.** The processing duration of each batch.
* **Operation Duration.** The amount of time taken to perform various operations in milliseconds.
The tracked operations are listed as follows.
* addBatch: Adds result data of the current batch to the sink.
* getBatch: Gets a new batch of data to process.
* latestOffset: Gets the latest offsets for sources.
* queryPlanning: Generates the execution plan.
* walCommit: Writes the offsets to the metadata log.

As an early-release version, the statistics page is still under development and will be improved in
future releases.
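
For reference, a minimal sketch of a query that would populate this tab (a spark-shell session with a `SparkSession` named `spark` is assumed; the `rate` source, console sink, and trigger interval are chosen purely for illustration):

```scala
import org.apache.spark.sql.streaming.Trigger

// Start a trivial micro-batch query; while it runs it appears on the Structured
// Streaming tab, and each trigger feeds the statistics described above.
val query = spark.readStream
  .format("rate")                  // generates (timestamp, value) rows
  .option("rowsPerSecond", "10")
  .load()
  .writeStream
  .format("console")
  .trigger(Trigger.ProcessingTime("5 seconds"))
  .start()

// The same operation durations (addBatch, getBatch, queryPlanning, walCommit, ...)
// are also exposed programmatically; lastProgress is null until the first batch finishes.
Thread.sleep(10000)
Option(query.lastProgress).foreach(p => println(p.durationMs))

query.stop()
```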

## Streaming Tab
The web UI includes a Streaming tab if the application uses Spark streaming. This tab displays
scheduling delay and processing time for each micro-batch in the data stream, which can be useful
@@ -19,10 +19,11 @@ package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Since
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
- import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol}
+ import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol, HasWeightCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql.Dataset
- import org.apache.spark.sql.functions.col
+ import org.apache.spark.sql.functions._
+ import org.apache.spark.sql.types.DoubleType

/**
* Evaluator for clustering results.
@@ -34,7 +35,8 @@ import org.apache.spark.sql.functions.col
*/
@Since("2.3.0")
class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: String)
- extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable {
+ extends Evaluator with HasPredictionCol with HasFeaturesCol with HasWeightCol
+   with DefaultParamsWritable {

@Since("2.3.0")
def this() = this(Identifiable.randomUID("cluEval"))
@@ -53,6 +55,10 @@ class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: Str
@Since("2.3.0")
def setFeaturesCol(value: String): this.type = set(featuresCol, value)

+ /** @group setParam */
+ @Since("3.1.0")
+ def setWeightCol(value: String): this.type = set(weightCol, value)

/**
* param for metric name in evaluation
* (supports `"silhouette"` (default))
@@ -116,12 +122,26 @@ class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: Str
*/
@Since("3.1.0")
def getMetrics(dataset: Dataset[_]): ClusteringMetrics = {
- SchemaUtils.validateVectorCompatibleColumn(dataset.schema, $(featuresCol))
- SchemaUtils.checkNumericType(dataset.schema, $(predictionCol))
+ val schema = dataset.schema
+ SchemaUtils.validateVectorCompatibleColumn(schema, $(featuresCol))
+ SchemaUtils.checkNumericType(schema, $(predictionCol))
+ if (isDefined(weightCol)) {
+   SchemaUtils.checkNumericType(schema, $(weightCol))
+ }

+ val weightColName = if (!isDefined(weightCol)) "weightCol" else $(weightCol)

val vectorCol = DatasetUtils.columnToVector(dataset, $(featuresCol))
- val df = dataset.select(col($(predictionCol)),
-   vectorCol.as($(featuresCol), dataset.schema($(featuresCol)).metadata))
+ val df = if (!isDefined(weightCol) || $(weightCol).isEmpty) {
+   dataset.select(col($(predictionCol)),
+     vectorCol.as($(featuresCol), dataset.schema($(featuresCol)).metadata),
+     lit(1.0).as(weightColName))
+ } else {
+   dataset.select(col($(predictionCol)),
+     vectorCol.as($(featuresCol), dataset.schema($(featuresCol)).metadata),
+     col(weightColName).cast(DoubleType))
+ }

val metrics = new ClusteringMetrics(df)
metrics.setDistanceMeasure($(distanceMeasure))
metrics
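
For context, a usage sketch of the weighted evaluation enabled by this change (the data, column names, and weights are made up for illustration; when `weightCol` is not set, every row is weighted 1.0, which is what the `lit(1.0)` branch above provides):

```scala
import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.ml.linalg.Vectors

// Toy clustering output: (features, predicted cluster, per-row weight).
val predictions = spark.createDataFrame(Seq(
  (Vectors.dense(0.0, 0.0), 0, 1.0),
  (Vectors.dense(0.1, 0.1), 0, 2.0),
  (Vectors.dense(9.0, 9.0), 1, 1.0),
  (Vectors.dense(9.1, 9.1), 1, 0.5)
)).toDF("features", "prediction", "weight")

val evaluator = new ClusteringEvaluator()
  .setFeaturesCol("features")
  .setPredictionCol("prediction")
  .setWeightCol("weight")   // new setter added in this change; omit it to weight all rows equally

println(s"weighted silhouette = ${evaluator.evaluate(predictions)}")
```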
