From 549fa7a51b30b162dcd6fc70b42cf1779de1900b Mon Sep 17 00:00:00 2001
From: Sivabalan Narayanan
Date: Mon, 31 Jan 2022 09:57:02 -0500
Subject: [PATCH] [MINOR] Fixing 0.10.1 docs (#4725)

---
 website/docs/quick-start-guide.md       | 41 +++++++++++++------
 .../version-0.10.1/quick-start-guide.md | 41 +++++++++++++------
 2 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/website/docs/quick-start-guide.md b/website/docs/quick-start-guide.md
index 685492e4fb60..b85ee1004577 100644
--- a/website/docs/quick-start-guide.md
+++ b/website/docs/quick-start-guide.md
@@ -38,19 +38,24 @@ values={[
 From the extracted directory run spark-shell with Hudi as:
 
 ```scala
-// spark-shell for spark 3
+// spark-shell for spark 3.1
 spark-shell \
-  --packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:3.1.2 \
+  --packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.1.2 \
+  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
+
+// spark-shell for spark 3.0
+spark-shell \
+  --packages org.apache.hudi:hudi-spark3.0.3-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.0.3 \
   --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 
 // spark-shell for spark 2 with scala 2.12
 spark-shell \
-  --packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:2.4.4 \
+  --packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:2.4.4 \
   --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 
 // spark-shell for spark 2 with scala 2.11
 spark-shell \
-  --packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.0,org.apache.spark:spark-avro_2.11:2.4.4 \
+  --packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.1,org.apache.spark:spark-avro_2.11:2.4.4 \
   --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 ```
 
@@ -61,19 +66,24 @@ Hudi support using Spark SQL to write and read data with the **HoodieSparkSessio
 From the extracted directory run Spark SQL with Hudi as:
 
 ```shell
-# Spark SQL for spark 3
-spark-sql --packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:3.1.2 \
+# Spark SQL for spark 3.1
+spark-sql --packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.1.2 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
+
+# Spark SQL for spark 3.0
+spark-sql --packages org.apache.hudi:hudi-spark3.0.3-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.0.3 \
 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
 --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
 
 # Spark SQL for spark 2 with scala 2.11
-spark-sql --packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.0,org.apache.spark:spark-avro_2.11:2.4.4 \
+spark-sql --packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.1,org.apache.spark:spark-avro_2.11:2.4.4 \
 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
 --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
 
 # Spark SQL for spark 2 with scala 2.12
 spark-sql \
-  --packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:2.4.4 \
+  --packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:2.4.4 \
   --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
   --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
 ```
 
@@ -87,19 +97,24 @@ From the extracted directory run pyspark with Hudi as:
 # pyspark
 export PYSPARK_PYTHON=$(which python3)
 
-# for spark3
+# for spark3.1
+pyspark
+--packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.1.2
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
+
+# for spark3.0
 pyspark
---packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:3.1.2
+--packages org.apache.hudi:hudi-spark3.0.3-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.0.3
 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 
 # for spark2 with scala 2.12
 pyspark
---packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:2.4.4
+--packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:2.4.4
 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 
 # for spark2 with scala 2.11
 pyspark
---packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.0,org.apache.spark:spark-avro_2.11:2.4.4
+--packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.1,org.apache.spark:spark-avro_2.11:2.4.4
 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 ```
 
@@ -1085,7 +1100,7 @@ Currently, the result of `show partitions` is based on the filesystem table pat
 
 You can also do the quickstart by [building hudi yourself](https://github.com/apache/hudi#building-apache-hudi-from-source),
 and using `--jars /packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.1?-*.*.*-SNAPSHOT.jar` in the spark-shell command above
-instead of `--packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0`. Hudi also supports scala 2.12. Refer [build with scala 2.12](https://github.com/apache/hudi#build-with-scala-212)
+instead of `--packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1`. Hudi also supports scala 2.12. Refer [build with scala 2.12](https://github.com/apache/hudi#build-with-scala-212)
 for more info. Also, we used Spark here to show case the capabilities of Hudi.
 However, Hudi can support multiple table types/query types and
diff --git a/website/versioned_docs/version-0.10.1/quick-start-guide.md b/website/versioned_docs/version-0.10.1/quick-start-guide.md
index 685492e4fb60..2d661943f746 100644
--- a/website/versioned_docs/version-0.10.1/quick-start-guide.md
+++ b/website/versioned_docs/version-0.10.1/quick-start-guide.md
@@ -38,19 +38,24 @@ values={[
 From the extracted directory run spark-shell with Hudi as:
 
 ```scala
-// spark-shell for spark 3
+// spark-shell for spark 3.1
 spark-shell \
-  --packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:3.1.2 \
+  --packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.1.2 \
+  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
+
+// spark-shell for spark 3.0
+spark-shell \
+  --packages org.apache.hudi:hudi-spark3.0.3-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.0.3 \
   --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 
 // spark-shell for spark 2 with scala 2.12
 spark-shell \
-  --packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:2.4.4 \
+  --packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:2.4.4 \
   --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 
 // spark-shell for spark 2 with scala 2.11
 spark-shell \
-  --packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.0,org.apache.spark:spark-avro_2.11:2.4.4 \
+  --packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.1,org.apache.spark:spark-avro_2.11:2.4.4 \
   --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 ```
 
@@ -61,19 +66,24 @@ Hudi support using Spark SQL to write and read data with the **HoodieSparkSessio
 From the extracted directory run Spark SQL with Hudi as:
 
 ```shell
-# Spark SQL for spark 3
-spark-sql --packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:3.1.2 \
+# Spark SQL for spark 3.1
+spark-sql --packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.1.2 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
+
+# Spark SQL for spark 3.0
+spark-sql --packages org.apache.hudi:hudi-spark3.0.3-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.0.3 \
 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
 --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
 
 # Spark SQL for spark 2 with scala 2.11
-spark-sql --packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.0,org.apache.spark:spark-avro_2.11:2.4.4 \
+spark-sql --packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.1,org.apache.spark:spark-avro_2.11:2.4.4 \
 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
 --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
 
 # Spark SQL for spark 2 with scala 2.12
 spark-sql \
-  --packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:2.4.4 \
+  --packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:2.4.4 \
   --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
   --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
 ```
 
@@ -87,19 +97,24 @@ From the extracted directory run pyspark with Hudi as:
 # pyspark
 export PYSPARK_PYTHON=$(which python3)
 
-# for spark3
+# for spark3.1
+pyspark
+--packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.1.2
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
+
+# for spark3.0
 pyspark
---packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:3.1.2
+--packages org.apache.hudi:hudi-spark3.0.3-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:3.0.3
 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 
 # for spark2 with scala 2.12
 pyspark
---packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.0,org.apache.spark:spark-avro_2.12:2.4.4
+--packages org.apache.hudi:hudi-spark-bundle_2.12:0.10.1,org.apache.spark:spark-avro_2.12:2.4.4
 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 
 # for spark2 with scala 2.11
 pyspark
---packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.0,org.apache.spark:spark-avro_2.11:2.4.4
+--packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.1,org.apache.spark:spark-avro_2.11:2.4.4
 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
 ```
 
@@ -1085,7 +1100,7 @@ Currently, the result of `show partitions` is based on the filesystem table pat
 
 You can also do the quickstart by [building hudi yourself](https://github.com/apache/hudi#building-apache-hudi-from-source),
 and using `--jars /packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.1?-*.*.*-SNAPSHOT.jar` in the spark-shell command above
-instead of `--packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0`. Hudi also supports scala 2.12. Refer [build with scala 2.12](https://github.com/apache/hudi#build-with-scala-212)
+instead of `--packages org.apache.hudi:hudi-spark3.1.2-bundle_2.12:0.10.1`. Hudi also supports scala 2.12. Refer [build with scala 2.12](https://github.com/apache/hudi#build-with-scala-212)
 for more info. Also, we used Spark here to show case the capabilities of Hudi.
 However, Hudi can support multiple table types/query types and
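Every command this patch touches hinges on pairing the right Hudi bundle with the local Spark and Scala versions. A quick sanity check after launching one of the shells above is to print both versions and compare them against the bundle coordinates; a minimal sketch, not part of the patch:

```scala
// Run inside a spark-shell started with one of the commands above.
// The bundle passed to --packages must match both versions, e.g.
// hudi-spark3.1.2-bundle_2.12:0.10.1 expects Spark 3.1.x on Scala 2.12.
println(s"Spark version: ${spark.version}")
println(s"Scala version: ${scala.util.Properties.versionNumberString}")
```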