From a13eec35231e63740b0c8c6dbf9712a236364e09 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Tue, 30 Jan 2018 17:41:48 -0800 Subject: [PATCH 1/7] init pr --- docs/ml-statistics.md | 22 ++++++ .../examples/ml/JavaSummarizerExample.java | 71 +++++++++++++++++++ .../spark/examples/ml/SummarizerExample.scala | 60 ++++++++++++++++ 3 files changed, 153 insertions(+) create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala diff --git a/docs/ml-statistics.md b/docs/ml-statistics.md index abfb3cab1e566..09bdc1fc19861 100644 --- a/docs/ml-statistics.md +++ b/docs/ml-statistics.md @@ -89,4 +89,26 @@ Refer to the [`ChiSquareTest` Python docs](api/python/index.html#pyspark.ml.stat {% include_example python/ml/chi_square_test_example.py %} + + +## Summarizer + +We provide vector column summary statistics for `Dataframe` through `Summarizer`. +Available metrics contain the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count. + +
+
+[`Summarizer`](api/scala/index.html#org.apache.spark.ml.stat.Summarizer$) +compute and print the mean and variace for the input dataframe. Including the two cases of with and without weight. + +{% include_example scala/org/apache/spark/examples/ml/SummarizerExample.scala %} +
+ +
+[`Summarizer`](api/java/org/apache/spark/ml/stat/Summarizer.html) +compute and print the mean and variace for the input dataframe. Including the two cases of with and without weight. + +{% include_example java/org/apache/spark/examples/ml/JavaSummarizerExample.java %} +
+
\ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java new file mode 100644 index 0000000000000..67392e9eb006b --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.ml; + +import org.apache.spark.sql.*; + +// $example on$ +import java.util.Arrays; +import java.util.List; + +import org.apache.spark.ml.linalg.Vector; +import org.apache.spark.ml.linalg.Vectors; +import org.apache.spark.ml.linalg.VectorUDT; +import org.apache.spark.ml.stat.Summarizer; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +// $example off$ + +public class JavaSummarizerExample { + public static void main(String[] args) { + SparkSession spark = SparkSession + .builder() + .appName("JavaSummarizerExample") + .getOrCreate(); + + // $example on$ + List data = Arrays.asList( // each row: a dense feature vector and its weight + RowFactory.create(Vectors.dense(2.0, 3.0, 5.0), 1.0), + RowFactory.create(Vectors.dense(4.0, 6.0, 7.0), 2.0) + ); + + StructType schema = new StructType(new StructField[]{ + new StructField("features", new VectorUDT(), false, Metadata.empty()), + new StructField("weight", DataTypes.DoubleType, false, Metadata.empty()) + }); + + Dataset df = spark.createDataFrame(data, schema); + + Row result1 = df.select(Summarizer.metrics("mean", "variance") + .summary(new Column("features"), new Column("weight"))) + .first().getStruct(0); + System.out.println("with weight: mean = " + result1.getAs(0).toString() + + ", variance = " + result1.getAs(1).toString()); + + Row result2 = df.select( // no weight column: one result column per metric + Summarizer.mean(new Column("features")), + Summarizer.variance(new Column("features")) + ).first(); + System.out.println("without weight: mean = " + result2.getAs(0).toString() + + ", variance = " + result2.getAs(1).toString()); + // $example off$ + spark.stop(); + } +} diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala new file mode 100644 index 0000000000000..e5e9fb0594379 --- /dev/null +++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.stat.Summarizer +// $example off$ +import org.apache.spark.sql.SparkSession + +object SummarizerExample { + def main(args: Array[String]): Unit = { + val spark = SparkSession + .builder + .appName("SummarizerExample") + .getOrCreate() + + import spark.implicits._ + import Summarizer._ // provides metrics, mean and variance used below + + // $example on$ + val data = Seq( // (features, weight) pairs + (Vectors.dense(2.0, 3.0, 5.0), 1.0), + (Vectors.dense(4.0, 6.0, 7.0), 2.0) + ) + + val df = data.toDF("features", "weight") + + val Tuple1((meanVal, varianceVal)) = df.select(metrics("mean", "variance") + .summary($"features", $"weight")) + .as[Tuple1[(Vector, Vector)]].first() + + println(s"with weight: mean = ${meanVal}, variance = ${varianceVal}") + + val (meanVal2, varianceVal2) = df.select(mean($"features"), variance($"features")) + .as[(Vector, Vector)].first() + + println(s"without weight: mean = ${meanVal2}, variance = ${varianceVal2}") + // $example off$ + + spark.stop() + } +} +// 
scalastyle:on println From 0935fd1a4a8bf96f0f9290a8d5ff99121e4f958c Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Thu, 1 Feb 2018 22:17:40 -0800 Subject: [PATCH 2/7] address nick's comments --- docs/ml-statistics.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/ml-statistics.md b/docs/ml-statistics.md index 09bdc1fc19861..2c707a83ee858 100644 --- a/docs/ml-statistics.md +++ b/docs/ml-statistics.md @@ -94,19 +94,21 @@ Refer to the [`ChiSquareTest` Python docs](api/python/index.html#pyspark.ml.stat ## Summarizer We provide vector column summary statistics for `Dataframe` through `Summarizer`. -Available metrics contain the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count. +Available metrics are the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count.
[`Summarizer`](api/scala/index.html#org.apache.spark.ml.stat.Summarizer$) -compute and print the mean and variace for the input dataframe. Including the two cases of with and without weight. +The following example demonstrates using `Summarizer` to compute the mean and variance for the input dataframe, with +and without a weight column. {% include_example scala/org/apache/spark/examples/ml/SummarizerExample.scala %}
[`Summarizer`](api/java/org/apache/spark/ml/stat/Summarizer.html) -compute and print the mean and variace for the input dataframe. Including the two cases of with and without weight. +The following example demonstrates using `Summarizer` to compute the mean and variance for the input dataframe, with +and without a weight column. {% include_example java/org/apache/spark/examples/ml/JavaSummarizerExample.java %}
From a152f7bec28759244b835e2c3bca2d874d9d134b Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Thu, 1 Feb 2018 22:22:04 -0800 Subject: [PATCH 3/7] update doc format --- docs/ml-statistics.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/ml-statistics.md b/docs/ml-statistics.md index 2c707a83ee858..c10ef6618c6d0 100644 --- a/docs/ml-statistics.md +++ b/docs/ml-statistics.md @@ -98,17 +98,15 @@ Available metrics are the column-wise max, min, mean, variance, and number of no
-[`Summarizer`](api/scala/index.html#org.apache.spark.ml.stat.Summarizer$) -The following example demonstrates using `Summarizer` to compute the mean and variance for the input dataframe, with -and without a weight column. +The following example demonstrates using [`Summarizer`](api/scala/index.html#org.apache.spark.ml.stat.Summarizer$) +to compute the mean and variance for the input dataframe, with and without a weight column. {% include_example scala/org/apache/spark/examples/ml/SummarizerExample.scala %}
-[`Summarizer`](api/java/org/apache/spark/ml/stat/Summarizer.html) -The following example demonstrates using `Summarizer` to compute the mean and variance for the input dataframe, with -and without a weight column. +The following example demonstrates using [`Summarizer`](api/java/org/apache/spark/ml/stat/Summarizer.html) +to compute the mean and variance for the input dataframe, with and without a weight column. {% include_example java/org/apache/spark/examples/ml/JavaSummarizerExample.java %}
From 4cbad959405647fbfd6b6fb42335f91f8ed1ccd7 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Thu, 1 Feb 2018 23:31:34 -0800 Subject: [PATCH 4/7] update doc --- docs/ml-statistics.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ml-statistics.md b/docs/ml-statistics.md index c10ef6618c6d0..1bb6db2f13163 100644 --- a/docs/ml-statistics.md +++ b/docs/ml-statistics.md @@ -99,14 +99,14 @@ Available metrics are the column-wise max, min, mean, variance, and number of no
The following example demonstrates using [`Summarizer`](api/scala/index.html#org.apache.spark.ml.stat.Summarizer$) -to compute the mean and variance for the input dataframe, with and without a weight column. +to compute the mean and variance for a vector column of the input dataframe, with and without a weight column. {% include_example scala/org/apache/spark/examples/ml/SummarizerExample.scala %}
The following example demonstrates using [`Summarizer`](api/java/org/apache/spark/ml/stat/Summarizer.html) -to compute the mean and variance for the input dataframe, with and without a weight column. +to compute the mean and variance for a vector column of the input dataframe, with and without a weight column. {% include_example java/org/apache/spark/examples/ml/JavaSummarizerExample.java %}
From 60286a84a46127207c91e5600b89386cdf5d1204 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Thu, 1 Feb 2018 23:58:32 -0800 Subject: [PATCH 5/7] extract struct type column --- .../apache/spark/examples/ml/JavaSummarizerExample.java | 4 ++-- .../org/apache/spark/examples/ml/SummarizerExample.scala | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java index 67392e9eb006b..e9b84365d86ed 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java @@ -54,8 +54,8 @@ public static void main(String[] args) { Dataset df = spark.createDataFrame(data, schema); Row result1 = df.select(Summarizer.metrics("mean", "variance") - .summary(new Column("features"), new Column("weight"))) - .first().getStruct(0); + .summary(new Column("features"), new Column("weight")).as("summary")) + .select("summary.mean", "summary.variance").first(); System.out.println("with weight: mean = " + result1.getAs(0).toString() + ", variance = " + result1.getAs(1).toString()); diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala index e5e9fb0594379..2f54d1d81bc48 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SummarizerExample.scala @@ -42,9 +42,10 @@ object SummarizerExample { val df = data.toDF("features", "weight") - val Tuple1((meanVal, varianceVal)) = df.select(metrics("mean", "variance") - .summary($"features", $"weight")) - .as[Tuple1[(Vector, Vector)]].first() + val (meanVal, varianceVal) = df.select(metrics("mean", "variance") + .summary($"features", $"weight").as("summary")) + 
.select("summary.mean", "summary.variance") + .as[(Vector, Vector)].first() println(s"with weight: mean = ${meanVal}, variance = ${varianceVal}") From f9eb02a1a82d411cdc5ddba562ab982db4b583df Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Thu, 19 Apr 2018 17:34:45 +0800 Subject: [PATCH 6/7] add python example and guide entry --- docs/ml-statistics.md | 6 ++ .../src/main/python/ml/summarizer_example.py | 58 +++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 examples/src/main/python/ml/summarizer_example.py diff --git a/docs/ml-statistics.md b/docs/ml-statistics.md index 1bb6db2f13163..6c82b3bb94b24 100644 --- a/docs/ml-statistics.md +++ b/docs/ml-statistics.md @@ -111,4 +111,10 @@ to compute the mean and variance for a vector column of the input dataframe, wit {% include_example java/org/apache/spark/examples/ml/JavaSummarizerExample.java %}
+
Refer to the [`Summarizer` Python docs](api/python/index.html#pyspark.ml.stat.Summarizer) for details on the API. + +{% include_example python/ml/summarizer_example.py %} +
+
\ No newline at end of file diff --git a/examples/src/main/python/ml/summarizer_example.py b/examples/src/main/python/ml/summarizer_example.py new file mode 100644 index 0000000000000..ece5301535c82 --- /dev/null +++ b/examples/src/main/python/ml/summarizer_example.py @@ -0,0 +1,58 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +An example for summarizer. 
+Run with: + bin/spark-submit examples/src/main/python/ml/summarizer_example.py +""" +from __future__ import print_function + +from pyspark.sql import SparkSession +# $example on$ +from pyspark.ml.stat import Summarizer +from pyspark.sql import Row +from pyspark.ml.linalg import Vectors +# $example off$ + +if __name__ == "__main__": + spark = SparkSession \ + .builder \ + .appName("SummarizerExample") \ + .getOrCreate() + + # $example on$ + df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)), + Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() + + # create a summarizer that computes two metrics together: "mean" and "count" + summarizer = Summarizer.metrics("mean", "count") + + # compute both metrics over the features column, weighted by the weight column + df.select(summarizer.summary(df.features, df.weight)).show(truncate=False) + + # compute both metrics over the features column, without weights + df.select(summarizer.summary(df.features)).show(truncate=False) + + # compute only the "mean" metric, weighted by the weight column + df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False) + + # compute only the "mean" metric, without weights + df.select(Summarizer.mean(df.features)).show(truncate=False) + # $example off$ + + spark.stop() From ee9d3686f3e48650668bf26a7003b0bde912b6a0 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Thu, 19 Apr 2018 18:09:30 +0800 Subject: [PATCH 7/7] fix py --- examples/src/main/python/ml/summarizer_example.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/src/main/python/ml/summarizer_example.py b/examples/src/main/python/ml/summarizer_example.py index ece5301535c82..8835f189a1ad4 100644 --- a/examples/src/main/python/ml/summarizer_example.py +++ b/examples/src/main/python/ml/summarizer_example.py @@ -34,6 +34,7 @@ .builder \ .appName("SummarizerExample") \ .getOrCreate() + sc = spark.sparkContext # $example on$ df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),