From a13eec35231e63740b0c8c6dbf9712a236364e09 Mon Sep 17 00:00:00 2001
From: WeichenXu <weichen.xu@databricks.com>
Date: Tue, 30 Jan 2018 17:41:48 -0800
Subject: [PATCH 1/7] init pr

---
 docs/ml-statistics.md                         | 22 ++++++
 .../examples/ml/JavaSummarizerExample.java    | 71 +++++++++++++++++++
 .../spark/examples/ml/SummarizerExample.scala | 60 ++++++++++++++++
 3 files changed, 153 insertions(+)
 create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java
 create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala
diff --git a/docs/ml-statistics.md b/docs/ml-statistics.md
index abfb3cab1e566..09bdc1fc19861 100644
--- a/docs/ml-statistics.md
+++ b/docs/ml-statistics.md
@@ -89,4 +89,26 @@ Refer to the [`ChiSquareTest` Python docs](api/python/index.html#pyspark.ml.stat
 {% include_example python/ml/chi_square_test_example.py %}
 </div>
 
+</div>
+
+## Summarizer
+
+We provide vector column summary statistics for `Dataframe` through `Summarizer`.
+Available metrics contain the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+[`Summarizer`](api/scala/index.html#org.apache.spark.ml.stat.Summarizer$)
+compute and print the mean and variace for the input dataframe. Including the two cases of with and without weight.
+
+{% include_example scala/org/apache/spark/examples/ml/SummarizerExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+[`Summarizer`](api/java/org/apache/spark/ml/stat/Summarizer.html)
+compute and print the mean and variace for the input dataframe. Including the two cases of with and without weight.
+
+{% include_example java/org/apache/spark/examples/ml/JavaSummarizerExample.java %}
+</div>
+
 </div>
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java
new file mode 100644
index 0000000000000..67392e9eb006b
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.sql.*;
+
+// $example on$
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.stat.Summarizer;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaSummarizerExample {
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession
+      .builder()
+      .appName("JavaSummarizerExample")
+      .getOrCreate();
+
+    // $example on$
+    List<Row> data = Arrays.asList(
+      RowFactory.create(Vectors.dense(2.0, 3.0, 5.0), 1.0),
+      RowFactory.create(Vectors.dense(4.0, 6.0, 7.0), 2.0)
+    );
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField("features", new VectorUDT(), false, Metadata.empty()),
+      new StructField("weight", DataTypes.DoubleType, false, Metadata.empty())
+    });
+
+    Dataset<Row> df = spark.createDataFrame(data, schema);
+
+    Row result1 = df.select(Summarizer.metrics("mean", "variance")
+        .summary(new Column("features"), new Column("weight")))
+        .first().getStruct(0);
+    System.out.println("with weight: mean = " + result1.<Vector>getAs(0).toString() +
+      ", variance = " + result1.<Vector>getAs(1).toString());
+
+    Row result2 = df.select(
+      Summarizer.mean(new Column("features")),
+      Summarizer.variance(new Column("features"))
+    ).first();
+    System.out.println("without weight: mean = " + result2.<Vector>getAs(0).toString() +
+      ", variance = " + result2.<Vector>getAs(1).toString());
+    // $example off$
+    spark.stop();
+  }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala
new file mode 100644
index 0000000000000..e5e9fb0594379
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.linalg.{Vector, Vectors}
+import org.apache.spark.ml.stat.Summarizer
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+object SummarizerExample {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder
+      .appName("SummarizerExample")
+      .getOrCreate()
+
+    import spark.implicits._
+    import Summarizer._
+
+    // $example on$
+    val data = Seq(
+      (Vectors.dense(2.0, 3.0, 5.0), 1.0),
+      (Vectors.dense(4.0, 6.0, 7.0), 2.0)
+    )
+
+    val df = data.toDF("features", "weight")
+
+    val Tuple1((meanVal, varianceVal)) = df.select(metrics("mean", "variance")
+      .summary($"features", $"weight"))
+      .as[Tuple1[(Vector, Vector)]].first()
+
+    println(s"with weight: mean = ${meanVal}, variance = ${varianceVal}")
+
+    val (meanVal2, varianceVal2) = df.select(mean($"features"), variance($"features"))
+      .as[(Vector, Vector)].first()
+
+    println(s"without weight: mean = ${meanVal2}, variance = ${varianceVal2}")
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println

From 0935fd1a4a8bf96f0f9290a8d5ff99121e4f958c Mon Sep 17 00:00:00 2001
From: WeichenXu <weichen.xu@databricks.com>
Date: Thu, 1 Feb 2018 22:17:40 -0800
Subject: [PATCH 2/7] address nick's comments

---
 docs/ml-statistics.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/ml-statistics.md b/docs/ml-statistics.md
index 09bdc1fc19861..2c707a83ee858 100644
--- a/docs/ml-statistics.md
+++ b/docs/ml-statistics.md
@@ -94,19 +94,21 @@ Refer to the [`ChiSquareTest` Python docs](api/python/index.html#pyspark.ml.stat
 ## Summarizer
 
 We provide vector column summary statistics for `Dataframe` through `Summarizer`.
-Available metrics contain the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count.
+Available metrics are the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count.
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
 [`Summarizer`](api/scala/index.html#org.apache.spark.ml.stat.Summarizer$)
-compute and print the mean and variace for the input dataframe. Including the two cases of with and without weight.
+The following example demonstrates using `Summarizer` to compute the mean and variance for the input dataframe, with
+and without a weight column.
 
 {% include_example scala/org/apache/spark/examples/ml/SummarizerExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
 [`Summarizer`](api/java/org/apache/spark/ml/stat/Summarizer.html)
-compute and print the mean and variace for the input dataframe. Including the two cases of with and without weight.
+The following example demonstrates using `Summarizer` to compute the mean and variance for the input dataframe, with
+and without a weight column.
 
 {% include_example java/org/apache/spark/examples/ml/JavaSummarizerExample.java %}
 </div>

From a152f7bec28759244b835e2c3bca2d874d9d134b Mon Sep 17 00:00:00 2001
From: WeichenXu <weichen.xu@databricks.com>
Date: Thu, 1 Feb 2018 22:22:04 -0800
Subject: [PATCH 3/7] update doc format

---
 docs/ml-statistics.md | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/docs/ml-statistics.md b/docs/ml-statistics.md
index 2c707a83ee858..c10ef6618c6d0 100644
--- a/docs/ml-statistics.md
+++ b/docs/ml-statistics.md
@@ -98,17 +98,15 @@ Available metrics are the column-wise max, min, mean, variance, and number of no
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
-[`Summarizer`](api/scala/index.html#org.apache.spark.ml.stat.Summarizer$)
-The following example demonstrates using `Summarizer` to compute the mean and variance for the input dataframe, with
-and without a weight column.
+The following example demonstrates using [`Summarizer`](api/scala/index.html#org.apache.spark.ml.stat.Summarizer$)
+to compute the mean and variance for the input dataframe, with and without a weight column.
 
 {% include_example scala/org/apache/spark/examples/ml/SummarizerExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
-[`Summarizer`](api/java/org/apache/spark/ml/stat/Summarizer.html)
-The following example demonstrates using `Summarizer` to compute the mean and variance for the input dataframe, with
-and without a weight column.
+The following example demonstrates using [`Summarizer`](api/java/org/apache/spark/ml/stat/Summarizer.html)
+to compute the mean and variance for the input dataframe, with and without a weight column.
 
 {% include_example java/org/apache/spark/examples/ml/JavaSummarizerExample.java %}
 </div>

From 4cbad959405647fbfd6b6fb42335f91f8ed1ccd7 Mon Sep 17 00:00:00 2001
From: WeichenXu <weichen.xu@databricks.com>
Date: Thu, 1 Feb 2018 23:31:34 -0800
Subject: [PATCH 4/7] update doc

---
 docs/ml-statistics.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/ml-statistics.md b/docs/ml-statistics.md
index c10ef6618c6d0..1bb6db2f13163 100644
--- a/docs/ml-statistics.md
+++ b/docs/ml-statistics.md
@@ -99,14 +99,14 @@ Available metrics are the column-wise max, min, mean, variance, and number of no
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
 The following example demonstrates using [`Summarizer`](api/scala/index.html#org.apache.spark.ml.stat.Summarizer$)
-to compute the mean and variance for the input dataframe, with and without a weight column.
+to compute the mean and variance for a vector column of the input dataframe, with and without a weight column.
 
 {% include_example scala/org/apache/spark/examples/ml/SummarizerExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
 The following example demonstrates using [`Summarizer`](api/java/org/apache/spark/ml/stat/Summarizer.html)
-to compute the mean and variance for the input dataframe, with and without a weight column.
+to compute the mean and variance for a vector column of the input dataframe, with and without a weight column.
 
 {% include_example java/org/apache/spark/examples/ml/JavaSummarizerExample.java %}
 </div>

From 60286a84a46127207c91e5600b89386cdf5d1204 Mon Sep 17 00:00:00 2001
From: WeichenXu <weichen.xu@databricks.com>
Date: Thu, 1 Feb 2018 23:58:32 -0800
Subject: [PATCH 5/7] extract struct type column

---
 .../apache/spark/examples/ml/JavaSummarizerExample.java    | 4 ++--
 .../org/apache/spark/examples/ml/SummarizerExample.scala   | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java
index 67392e9eb006b..e9b84365d86ed 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java
@@ -54,8 +54,8 @@ public static void main(String[] args) {
     Dataset<Row> df = spark.createDataFrame(data, schema);
 
     Row result1 = df.select(Summarizer.metrics("mean", "variance")
-        .summary(new Column("features"), new Column("weight")))
-        .first().getStruct(0);
+      .summary(new Column("features"), new Column("weight")).as("summary"))
+      .select("summary.mean", "summary.variance").first();
     System.out.println("with weight: mean = " + result1.<Vector>getAs(0).toString() +
       ", variance = " + result1.<Vector>getAs(1).toString());
 
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala
index e5e9fb0594379..2f54d1d81bc48 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala
@@ -42,9 +42,10 @@ object SummarizerExample {
 
     val df = data.toDF("features", "weight")
 
-    val Tuple1((meanVal, varianceVal)) = df.select(metrics("mean", "variance")
-      .summary($"features", $"weight"))
-      .as[Tuple1[(Vector, Vector)]].first()
+    val (meanVal, varianceVal) = df.select(metrics("mean", "variance")
+      .summary($"features", $"weight").as("summary"))
+      .select("summary.mean", "summary.variance")
+      .as[(Vector, Vector)].first()
 
     println(s"with weight: mean = ${meanVal}, variance = ${varianceVal}")
 

From f9eb02a1a82d411cdc5ddba562ab982db4b583df Mon Sep 17 00:00:00 2001
From: WeichenXu <weichen.xu@databricks.com>
Date: Thu, 19 Apr 2018 17:34:45 +0800
Subject: [PATCH 6/7] add python example and guide entry

---
 docs/ml-statistics.md                         |  6 ++
 .../src/main/python/ml/summarizer_example.py  | 58 +++++++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 examples/src/main/python/ml/summarizer_example.py

diff --git a/docs/ml-statistics.md b/docs/ml-statistics.md
index 1bb6db2f13163..6c82b3bb94b24 100644
--- a/docs/ml-statistics.md
+++ b/docs/ml-statistics.md
@@ -111,4 +111,10 @@ to compute the mean and variance for a vector column of the input dataframe, wit
 {% include_example java/org/apache/spark/examples/ml/JavaSummarizerExample.java %}
 </div>
 
+<div data-lang="python" markdown="1">
+Refer to the [`Summarizer` Python docs](api/python/index.html#pyspark.ml.stat.Summarizer) for details on the API.
+
+{% include_example python/ml/summarizer_example.py %}
+</div>
+
 </div>
\ No newline at end of file
diff --git a/examples/src/main/python/ml/summarizer_example.py b/examples/src/main/python/ml/summarizer_example.py
new file mode 100644
index 0000000000000..ece5301535c82
--- /dev/null
+++ b/examples/src/main/python/ml/summarizer_example.py
@@ -0,0 +1,58 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+An example for summarizer.
+Run with:
+  bin/spark-submit examples/src/main/python/ml/summarizer_example.py
+"""
+from __future__ import print_function
+
+from pyspark.sql import SparkSession
+# $example on$
+from pyspark.ml.stat import Summarizer
+from pyspark.sql import Row
+from pyspark.ml.linalg import Vectors
+# $example off$
+
+if __name__ == "__main__":
+    spark = SparkSession \
+        .builder \
+        .appName("SummarizerExample") \
+        .getOrCreate()
+
+    # $example on$
+    df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
+                         Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF()
+
+    # create summarizer for multiple metrics "mean" and "count"
+    summarizer = Summarizer.metrics("mean", "count")
+
+    # compute statistics for multiple metrics with weight
+    df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)
+
+    # compute statistics for multiple metrics without weight
+    df.select(summarizer.summary(df.features)).show(truncate=False)
+
+    # compute statistics for single metric "mean" with weight
+    df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)
+
+    # compute statistics for single metric "mean" without weight
+    df.select(Summarizer.mean(df.features)).show(truncate=False)
+    # $example off$
+
+    spark.stop()

From ee9d3686f3e48650668bf26a7003b0bde912b6a0 Mon Sep 17 00:00:00 2001
From: WeichenXu <weichen.xu@databricks.com>
Date: Thu, 19 Apr 2018 18:09:30 +0800
Subject: [PATCH 7/7] fix py

---
 examples/src/main/python/ml/summarizer_example.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/src/main/python/ml/summarizer_example.py b/examples/src/main/python/ml/summarizer_example.py
index ece5301535c82..8835f189a1ad4 100644
--- a/examples/src/main/python/ml/summarizer_example.py
+++ b/examples/src/main/python/ml/summarizer_example.py
@@ -34,6 +34,7 @@
         .builder \
         .appName("SummarizerExample") \
         .getOrCreate()
+    sc = spark.sparkContext
 
     # $example on$
     df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),