diff --git a/docs/ml-features.md b/docs/ml-features.md index 71aec5ec39f53..1d1d28c265560 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1285,7 +1285,7 @@ for more details on the API. ## VectorSizeHint -It can sometimes be useful to explicitly specify the size of the vectors a column of +It can sometimes be useful to explicitly specify the size of the vectors for a column of `VectorType`. For example, `VectorAssembler` uses size information from its input columns to produce size information and metadata for its output column. While in some cases this information can be obtained by inspecting the contents of the column, in a streaming dataframe the contents are diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java index ed30b4e399a62..289dad2155a71 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java @@ -17,6 +17,11 @@ package org.apache.spark.examples.ml; +import org.apache.spark.sql.SparkSession; + +// $example on$ +import java.util.Arrays; + import org.apache.spark.ml.feature.VectorAssembler; import org.apache.spark.ml.feature.VectorSizeHint; import org.apache.spark.ml.linalg.VectorUDT; @@ -24,15 +29,9 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; - -import java.util.Arrays; - import static org.apache.spark.sql.types.DataTypes.*; - -// $example on$ // $example off$ public class JavaVectorSizeHintExample { @@ -66,7 +65,7 @@ public static void main(String[] args) { .setInputCols(new String[]{"hour", "mobile", "userFeatures"}) .setOutputCol("features"); - // This dataframe can be used by used by 
downstream transformers as before + // This dataframe can be used by downstream transformers as before Dataset<Row> output = assembler.transform(datasetWithSize); System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " + "'features'"); diff --git a/examples/src/main/python/ml/vector_size_hint_example.py b/examples/src/main/python/ml/vector_size_hint_example.py index 56cede3fb41da..fb77dacec629d 100644 --- a/examples/src/main/python/ml/vector_size_hint_example.py +++ b/examples/src/main/python/ml/vector_size_hint_example.py @@ -48,7 +48,7 @@ inputCols=["hour", "mobile", "userFeatures"], outputCol="features") - # This dataframe can be used by used by downstream transformers as before + # This dataframe can be used by downstream transformers as before output = assembler.transform(datasetWithSize) print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") output.select("features", "clicked").show(truncate=False) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala index 81b3f5be220cf..688731a791f35 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala @@ -51,7 +51,7 @@ object VectorSizeHintExample { .setInputCols(Array("hour", "mobile", "userFeatures")) .setOutputCol("features") - // This dataframe can be used by used by downstream transformers as before + // This dataframe can be used by downstream transformers as before val output = assembler.transform(datasetWithSize) println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") output.select("features", "clicked").show(false)