From 028eb25bea490c53aa8736da0dce2f7e8785ca97 Mon Sep 17 00:00:00 2001 From: xubo245 Date: Mon, 7 Jan 2019 20:27:37 +0800 Subject: [PATCH] [CARBONDATA-3232] Add example and doc for alluxio integration Optimize carbonData usage with alluxio: 1.Add doc 2.optimize the example This closes #3054 --- README.md | 1 + docs/alluxio-guide.md | 136 ++++++++++++++++++ docs/documentation.md | 6 +- docs/introduction.md | 4 +- docs/quick-start-guide.md | 17 ++- examples/spark2/pom.xml | 10 ++ .../carbondata/examples/AlluxioExample.scala | 115 +++++++++++---- .../examples/util/ExampleUtils.scala | 13 +- 8 files changed, 264 insertions(+), 38 deletions(-) create mode 100644 docs/alluxio-guide.md diff --git a/README.md b/README.md index a788ceae5a3..bed906f8466 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ CarbonData is built using Apache Maven, to [build CarbonData](https://github.com ## Integration * [Hive](https://github.com/apache/carbondata/blob/master/docs/hive-guide.md) * [Presto](https://github.com/apache/carbondata/blob/master/docs/presto-guide.md) +* [Alluxio](https://github.com/apache/carbondata/blob/master/docs/alluxio-guide.md) ## Other Technical Material * [Apache CarbonData meetup material](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=66850609) diff --git a/docs/alluxio-guide.md b/docs/alluxio-guide.md new file mode 100644 index 00000000000..b1bfeeb3dfd --- /dev/null +++ b/docs/alluxio-guide.md @@ -0,0 +1,136 @@ + + + +# Alluxio guide +This tutorial provides a brief introduction to using Alluxio. + - How to use Alluxio in CarbonData? 
 - [Running alluxio example in CarbonData project by IDEA](#running-alluxio-example-in-carbondata-project-by-idea) + - [CarbonData supports alluxio by spark-shell](#carbondata-supports-alluxio-by-spark-shell) + - [CarbonData supports alluxio by spark-submit](#carbondata-supports-alluxio-by-spark-submit) + +## Running alluxio example in CarbonData project by IDEA + +### [Building CarbonData](https://github.com/apache/carbondata/tree/master/build) + - Please refer to [Building CarbonData](https://github.com/apache/carbondata/tree/master/build). + - Users need to install IDEA and scala plugin, and import CarbonData project. + +### Installing and starting Alluxio + - Please refer to [https://www.alluxio.org/docs/1.8/en/Getting-Started.html#starting-alluxio](https://www.alluxio.org/docs/1.8/en/Getting-Started.html#starting-alluxio) + - Access the Alluxio web: [http://localhost:19999/home](http://localhost:19999/home) + +### Running Example + - Please refer to [AlluxioExample](https://github.com/apache/carbondata/blob/master/examples/spark2/src/main/scala/org/apache/carbondata/examples/AlluxioExample.scala) + +## CarbonData supports alluxio by spark-shell + +### [Building CarbonData](https://github.com/apache/carbondata/tree/master/build) + - Please refer to [Building CarbonData](https://github.com/apache/carbondata/tree/master/build). 
+ +### Preparing Spark + - Please refer to [http://spark.apache.org/docs/latest/](http://spark.apache.org/docs/latest/) + +### Downloading alluxio and uncompressing it + - Please refer to [https://www.alluxio.org/download](https://www.alluxio.org/download) + +### Running spark-shell + - Running the command in spark path + ```$command +./bin/spark-shell --jars ${CARBONDATA_PATH}/assembly/target/scala-2.11/apache-carbondata-1.6.0-SNAPSHOT-bin-spark2.2.1-hadoop2.7.2.jar,${ALLUXIO_PATH}/client/alluxio-1.8.1-client.jar +``` + - Testing use alluxio by CarbonSession + ```$scala +import org.apache.spark.sql.CarbonSession._ +import org.apache.spark.sql.SparkSession + +val carbon = SparkSession.builder().master("local").appName("test").getOrCreateCarbonSession("alluxio://localhost:19998/carbondata"); +carbon.sql("CREATE TABLE carbon_alluxio(id String,name String, city String,age Int) STORED as carbondata"); +carbon.sql(s"LOAD DATA LOCAL INPATH '${CARBONDATA_PATH}/integration/spark-common-test/src/test/resources/sample.csv' into table carbon_alluxio"); +carbon.sql("select * from carbon_alluxio").show +``` + - Result + ```$scala + scala> carbon.sql("select * from carbon_alluxio").show + +---+------+---------+---+ + | id| name| city|age| + +---+------+---------+---+ + | 1| david| shenzhen| 31| + | 2| eason| shenzhen| 27| + | 3| jarry| wuhan| 35| + | 3| jarry|Bangalore| 35| + | 4| kunal| Delhi| 26| + | 4|vishal|Bangalore| 29| + +---+------+---------+---+ + ``` +## CarbonData supports alluxio by spark-submit + +### [Building CarbonData](https://github.com/apache/carbondata/tree/master/build) + - Please refer to [Building CarbonData](https://github.com/apache/carbondata/tree/master/build). 
+ +### Preparing Spark + - Please refer to [http://spark.apache.org/docs/latest/](http://spark.apache.org/docs/latest/) + +### Downloading alluxio and uncompressing it + - Please refer to [https://www.alluxio.org/download](https://www.alluxio.org/download) + +### Running spark-submit +#### Upload data to alluxio +```$command +./bin/alluxio fs copyFromLocal ${CARBONDATA_PATH}/hadoop/src/test/resources/data.csv / +``` +#### Command +```$command +./bin/spark-submit \ +--master local \ +--jars ${ALLUXIO_PATH}/client/alluxio-1.8.1-client.jar,${CARBONDATA_PATH}/examples/spark2/target/carbondata-examples-1.6.0-SNAPSHOT.jar \ +--class org.apache.carbondata.examples.AlluxioExample \ +${CARBONDATA_PATH}/assembly/target/scala-2.11/apache-carbondata-1.6.0-SNAPSHOT-bin-spark2.2.1-hadoop2.7.2.jar \ +false +``` +**NOTE**: Please set runShell as false, which can avoid dependency on alluxio shell module. + +#### Result +```$command ++-----------------+-------+--------------------+--------------------+---------+-----------+---------+----------+ +|SegmentSequenceId| Status| Load Start Time| Load End Time|Merged To|File Format|Data Size|Index Size| ++-----------------+-------+--------------------+--------------------+---------+-----------+---------+----------+ +| 1|Success|2019-01-09 15:10:...|2019-01-09 15:10:...| NA|COLUMNAR_V3| 23.92KB| 1.07KB| +| 0|Success|2019-01-09 15:10:...|2019-01-09 15:10:...| NA|COLUMNAR_V3| 23.92KB| 1.07KB| ++-----------------+-------+--------------------+--------------------+---------+-----------+---------+----------+ + ++-------+------+ +|country|amount| ++-------+------+ +| france| 202| +| china| 1698| ++-------+------+ + ++-----------------+---------+--------------------+--------------------+---------+-----------+---------+----------+ +|SegmentSequenceId| Status| Load Start Time| Load End Time|Merged To|File Format|Data Size|Index Size| ++-----------------+---------+--------------------+--------------------+---------+-----------+---------+----------+ +| 
3|Compacted|2019-01-09 15:10:...|2019-01-09 15:10:...| 0.1|COLUMNAR_V3| 23.92KB| 1.03KB| +| 2|Compacted|2019-01-09 15:10:...|2019-01-09 15:10:...| 0.1|COLUMNAR_V3| 23.92KB| 1.07KB| +| 1|Compacted|2019-01-09 15:10:...|2019-01-09 15:10:...| 0.1|COLUMNAR_V3| 23.92KB| 1.07KB| +| 0.1| Success|2019-01-09 15:10:...|2019-01-09 15:10:...| NA|COLUMNAR_V3| 37.65KB| 1.08KB| +| 0|Compacted|2019-01-09 15:10:...|2019-01-09 15:10:...| 0.1|COLUMNAR_V3| 23.92KB| 1.07KB| ++-----------------+---------+--------------------+--------------------+---------+-----------+---------+----------+ + +``` + +## Reference +[1] https://www.alluxio.org/docs/1.8/en/Getting-Started.html +[2] https://www.alluxio.org/docs/1.8/en/compute/Spark.html \ No newline at end of file diff --git a/docs/documentation.md b/docs/documentation.md index d1261a115d5..a40eed1250f 100644 --- a/docs/documentation.md +++ b/docs/documentation.md @@ -29,7 +29,7 @@ Apache CarbonData is a new big data file format for faster interactive query usi **Quick Start:** [Run an example program](./quick-start-guide.md#installing-and-configuring-carbondata-to-run-locally-with-spark-shell) on your local machine or [study some examples](https://github.com/apache/carbondata/tree/master/examples/spark2/src/main/scala/org/apache/carbondata/examples). -**CarbonData SQL Language Reference:** CarbonData extends the Spark SQL language and adds several [DDL](./ddl-of-carbondata.md) and [DML](./dml-of-carbondata.md) statements to support operations on it.Refer to the [Reference Manual](./language-manual.md) to understand the supported features and functions. +**CarbonData SQL Language Reference:** CarbonData extends the Spark SQL language and adds several [DDL](./ddl-of-carbondata.md) and [DML](./dml-of-carbondata.md) statements to support operations on it. Refer to the [Reference Manual](./language-manual.md) to understand the supported features and functions. 
**Programming Guides:** You can read our guides about [Java APIs supported](./sdk-guide.md) or [C++ APIs supported](./csdk-guide.md) to learn how to integrate CarbonData with your applications. @@ -37,7 +37,9 @@ Apache CarbonData is a new big data file format for faster interactive query usi ## Integration -CarbonData can be integrated with popular Execution engines like [Spark](./quick-start-guide.md#spark) , [Presto](./quick-start-guide.md#presto) and [Hive](./quick-start-guide.md#hive).Refer to the [Installation and Configuration](./quick-start-guide.md#integration) section to understand all modes of Integrating CarbonData. + - CarbonData can be integrated with popular execution engines like [Spark](./quick-start-guide.md#spark) , [Presto](./quick-start-guide.md#presto) and [Hive](./quick-start-guide.md#hive). + - CarbonData can be integrated with popular storage engines like HDFS, Huawei Cloud(OBS) and [Alluxio](./quick-start-guide.md#alluxio). + Refer to the [Installation and Configuration](./quick-start-guide.md#integration) section to understand all modes of Integrating CarbonData. diff --git a/docs/introduction.md b/docs/introduction.md index 2ab6dd4026a..037a29657c6 100644 --- a/docs/introduction.md +++ b/docs/introduction.md @@ -115,8 +115,10 @@ CarbonData has rich set of features to support various use cases in Big Data ana - ##### HDFS - CarbonData uses HDFS api to write and read data from HDFS.CarbonData can take advantage of the locality information to efficiently suggest spark to run tasks near to the data. + CarbonData uses HDFS api to write and read data from HDFS. CarbonData can take advantage of the locality information to efficiently suggest spark to run tasks near to the data. +- ##### Alluxio + CarbonData also supports read and write with [Alluxio](./quick-start-guide.md#alluxio). 
## Integration with Big Data ecosystem diff --git a/docs/quick-start-guide.md b/docs/quick-start-guide.md index b7b20a819e5..244a9aee5b7 100644 --- a/docs/quick-start-guide.md +++ b/docs/quick-start-guide.md @@ -35,9 +35,10 @@ This tutorial provides a quick introduction to using CarbonData. To follow along ## Integration -CarbonData can be integrated with Spark,Presto and Hive Execution Engines. The below documentation guides on Installing and Configuring with these execution engines. +### Integration with Execution Engines +CarbonData can be integrated with Spark,Presto and Hive execution engines. The below documentation guides on Installing and Configuring with these execution engines. -### Spark +#### Spark [Installing and Configuring CarbonData to run locally with Spark Shell](#installing-and-configuring-carbondata-to-run-locally-with-spark-shell) @@ -48,13 +49,21 @@ CarbonData can be integrated with Spark,Presto and Hive Execution Engines. The b [Installing and Configuring CarbonData Thrift Server for Query Execution](#query-execution-using-carbondata-thrift-server) -### Presto +#### Presto [Installing and Configuring CarbonData on Presto](#installing-and-configuring-carbondata-on-presto) -### Hive +#### Hive [Installing and Configuring CarbonData on Hive](https://github.com/apache/carbondata/blob/master/docs/hive-guide.md) +### Integration with Storage Engines +#### HDFS +[CarbonData supports read and write with HDFS](https://github.com/apache/carbondata/blob/master/docs/quick-start-guide.md#installing-and-configuring-carbondata-on-standalone-spark-cluster) +#### S3 +[CarbonData supports read and write with S3](https://github.com/apache/carbondata/blob/master/docs/s3-guide.md) + +#### Alluxio +[CarbonData supports read and write with Alluxio](https://github.com/apache/carbondata/blob/master/docs/alluxio-guide.md) ## Installing and Configuring CarbonData to run locally with Spark Shell diff --git a/examples/spark2/pom.xml b/examples/spark2/pom.xml index 
24ca4b13e20..4836a1ef84a 100644 --- a/examples/spark2/pom.xml +++ b/examples/spark2/pom.xml @@ -105,6 +105,16 @@ carbondata-core ${project.version} + + org.alluxio + alluxio-core-client-hdfs + 1.8.1 + + + org.alluxio + alluxio-shell + 1.8.1 + diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/AlluxioExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/AlluxioExample.scala index 31110ce93aa..f757bee8609 100644 --- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/AlluxioExample.scala +++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/AlluxioExample.scala @@ -17,6 +17,11 @@ package org.apache.carbondata.examples +import java.io.File +import java.text.SimpleDateFormat +import java.util.Date + +import alluxio.cli.fs.FileSystemShell import org.apache.spark.sql.SparkSession import org.apache.carbondata.core.constants.CarbonCommonConstants @@ -24,50 +29,104 @@ import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.util.CarbonProperties import org.apache.carbondata.examples.util.ExampleUtils - /** * configure alluxio: * 1.start alluxio - * 2.upload the jar :"/alluxio_path/core/client/target/ - * alluxio-core-client-YOUR-VERSION-jar-with-dependencies.jar" - * 3.Get more detail at:http://www.alluxio.org/docs/master/en/Running-Spark-on-Alluxio.html + * 2. 
Please upload data to alluxio if you set runShell as false + * ./bin/alluxio fs copyFromLocal /carbondata_path/hadoop/src/test/resources/data.csv / + * 3.Get more details at: https://www.alluxio.org/docs/1.8/en/compute/Spark.html */ - object AlluxioExample { - def main(args: Array[String]) { - val spark = ExampleUtils.createCarbonSession("AlluxioExample") - exampleBody(spark) - spark.close() + def main (args: Array[String]) { + val carbon = ExampleUtils.createCarbonSession("AlluxioExample", + storePath = "alluxio://localhost:19998/carbondata") + val runShell: Boolean = if (null != args && args.length > 0) { + args(0).toBoolean + } else { + true + } + exampleBody(carbon, runShell) + carbon.close() } - def exampleBody(spark : SparkSession): Unit = { + def exampleBody (spark: SparkSession, runShell: Boolean = true): Unit = { + val rootPath = new File(this.getClass.getResource("/").getPath + + "../../../..").getCanonicalPath spark.sparkContext.hadoopConfiguration.set("fs.alluxio.impl", "alluxio.hadoop.FileSystem") FileFactory.getConfiguration.set("fs.alluxio.impl", "alluxio.hadoop.FileSystem") // Specify date format based on raw data CarbonProperties.getInstance() - .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy/MM/dd") + .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy/MM/dd") + val time = new SimpleDateFormat("yyyyMMddHHmmssSSS").format(new Date()) + val alluxioPath = "alluxio://localhost:19998" + var alluxioFile = alluxioPath + "/data.csv" + + val remoteFile = "/carbon_alluxio" + time + ".csv" + + var mFsShell: FileSystemShell = null + + // avoid dependency alluxio shell when running it with spark-submit + if (runShell) { + mFsShell = new FileSystemShell() + alluxioFile = alluxioPath + remoteFile + val localFile = rootPath + "/hadoop/src/test/resources/data.csv" + mFsShell.run("copyFromLocal", localFile, remoteFile) + } + + import spark._ + + sql("DROP TABLE IF EXISTS alluxio_table") - spark.sql("DROP TABLE IF EXISTS alluxio_table") + sql( 
+ s""" + | CREATE TABLE IF NOT EXISTS alluxio_table( + | ID Int, + | date Date, + | country String, + | name String, + | phonetype String, + | serialname String, + | salary Int) + | STORED BY 'carbondata' + | TBLPROPERTIES( + | 'SORT_COLUMNS' = 'phonetype,name', + | 'DICTIONARY_INCLUDE'='phonetype', + | 'TABLE_BLOCKSIZE'='32', + | 'AUTO_LOAD_MERGE'='true') + """.stripMargin) - spark.sql(""" - CREATE TABLE IF NOT EXISTS alluxio_table - (ID Int, date Date, country String, - name String, phonetype String, serialname String, salary Int) - STORED BY 'carbondata' - """) + for (i <- 0 until 2) { + sql( + s""" + | LOAD DATA LOCAL INPATH '$alluxioFile' + | into table alluxio_table + """.stripMargin) + } - spark.sql(s""" - LOAD DATA LOCAL INPATH 'alluxio://localhost:19998/data.csv' into table alluxio_table - """) + sql("SELECT * FROM alluxio_table").show() - spark.sql(""" - SELECT country, count(salary) AS amount - FROM alluxio_table - WHERE country IN ('china','france') - GROUP BY country - """).show() + sql("SHOW SEGMENTS FOR TABLE alluxio_table").show() + sql( + """ + | SELECT country, count(salary) AS amount + | FROM alluxio_table + | WHERE country IN ('china','france') + | GROUP BY country + """.stripMargin).show() - spark.sql("DROP TABLE IF EXISTS alluxio_table") + for (i <- 0 until 2) { + sql( + s""" + | LOAD DATA LOCAL INPATH '$alluxioFile' + | into table alluxio_table + """.stripMargin) + } + sql("SHOW SEGMENTS FOR TABLE alluxio_table").show() + if (runShell) { + mFsShell.run("rm", remoteFile) + mFsShell.close() + } + sql("DROP TABLE IF EXISTS alluxio_table") } } diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/util/ExampleUtils.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/util/ExampleUtils.scala index bb9f4d0d984..b6e3f4b9266 100644 --- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/util/ExampleUtils.scala +++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/util/ExampleUtils.scala 
@@ -30,13 +30,20 @@ object ExampleUtils { .getCanonicalPath val storeLocation: String = currentPath + "/target/store" - def createCarbonSession(appName: String, workThreadNum: Int = 1): SparkSession = { + def createCarbonSession (appName: String, workThreadNum: Int = 1, + storePath: String = null): SparkSession = { val rootPath = new File(this.getClass.getResource("/").getPath - + "../../../..").getCanonicalPath - val storeLocation = s"$rootPath/examples/spark2/target/store" + + "../../../..").getCanonicalPath + val warehouse = s"$rootPath/examples/spark2/target/warehouse" val metaStoreDB = s"$rootPath/examples/spark2/target" + val storeLocation = if (null != storePath) { + storePath + } else { + s"$rootPath/examples/spark2/target/store" + } + CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, "yyyy/MM/dd HH:mm:ss") .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy/MM/dd")