From f778c024802559dd06ea233a5244ca01ed1c5530 Mon Sep 17 00:00:00 2001 From: chenliang613 Date: Thu, 18 Jan 2018 15:50:23 +0800 Subject: [PATCH 1/3] [CARBONDATA-2050] Add example of query data with specified segments --- .../examples/QuerySegmentExample.scala | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 examples/spark2/src/main/scala/org/apache/carbondata/examples/QuerySegmentExample.scala diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/QuerySegmentExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/QuerySegmentExample.scala new file mode 100644 index 00000000000..03312a0eb15 --- /dev/null +++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/QuerySegmentExample.scala @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.examples + +import java.io.File + +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.util.CarbonProperties + +/** + * This example introduces how to query data with specified segments + */ + +object QuerySegmentExample { + + def main(args: Array[String]) { + val spark = ExampleUtils.createCarbonSession("QuerySegmentExample") + spark.sparkContext.setLogLevel("ERROR") + + spark.sql("DROP TABLE IF EXISTS carbon_table") + + // Create table + spark.sql( + s""" + | CREATE TABLE carbon_table( + | shortField SHORT, + | intField INT, + | bigintField LONG, + | doubleField DOUBLE, + | stringField STRING, + | timestampField TIMESTAMP, + | decimalField DECIMAL(18,2), + | dateField DATE, + | charField CHAR(5), + | floatField FLOAT + | ) + | STORED BY 'carbondata' + """.stripMargin) + + val rootPath = new File(this.getClass.getResource("/").getPath + + "../../../..").getCanonicalPath + val path = s"$rootPath/examples/spark2/src/main/resources/data.csv" + + // load 4 segments, each load has 10 rows data + // scalastyle:off + (1 to 4).foreach(_ => spark.sql( + s""" + | LOAD DATA LOCAL INPATH '$path' + | INTO TABLE carbon_table + | OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#') + """.stripMargin)) + // scalastyle:on + + // 1.Query data with specified segments without compaction + + spark.sql("SHOW SEGMENTS FOR TABLE carbon_table").show() + // 40 rows + spark.sql( + s""" + | SELECT count(*) + | FROM carbon_table + """.stripMargin).show() + + // specify segments to query + spark.sql("SET carbon.input.segments.default.carbon_table = 1,3") + // 20 rows from segment1 and segment3 + spark.sql( + s""" + | SELECT count(*) + | FROM carbon_table + """.stripMargin).show() + + // 2.Query data with specified segments after compaction + + CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.COMPACTION_SEGMENT_LEVEL_THRESHOLD, "3,2") + + spark.sql("ALTER TABLE carbon_table COMPACT 
'MINOR'") + spark.sql("SHOW SEGMENTS FOR TABLE carbon_table").show() + + // Reset to query all segments data + spark.sql("SET carbon.input.segments.default.carbon_table = *") + // 40 rows from all segments + spark.sql( + s""" + | SELECT count(*) + | FROM carbon_table + """.stripMargin).show() + // After MINOR compaction, 0.1 has 30 rows data(compact 3 segments) + spark.sql("SET carbon.input.segments.default.carbon_table = 0.1") + spark.sql( + s""" + | SELECT count(*) + | FROM carbon_table + """.stripMargin).show() + + spark.sql("ALTER TABLE carbon_table COMPACT 'MAJOR'") + spark.sql("CLEAN FILES FOR TABLE carbon_table") + spark.sql("SHOW SEGMENTS FOR TABLE carbon_table").show() + + // Load 2 new segments + (1 to 2).foreach(_ => spark.sql( + s""" + | LOAD DATA LOCAL INPATH '$path' + | INTO TABLE carbon_table + | OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#') + """.stripMargin)) + + spark.sql("SHOW SEGMENTS FOR TABLE carbon_table").show() + // 50 rows: segment0.2 has 40 rows after major compaction, plus segment5 with 10 rows + spark.sql("SET carbon.input.segments.default.carbon_table = 0.2,5") + spark.sql( + s""" + | SELECT count(*) + | FROM carbon_table + """.stripMargin).show() + + // Drop table + spark.sql("DROP TABLE IF EXISTS carbon_table") + spark.stop() + } + +} \ No newline at end of file From 646cafafe9a6144064209078e70bdd288689f0c5 Mon Sep 17 00:00:00 2001 From: chenliang613 Date: Thu, 18 Jan 2018 16:06:38 +0800 Subject: [PATCH 2/3] [CARBONDATA-2050] Add example of query data with specified segments --- docs/data-management-on-carbondata.md | 34 ++++++--------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md index 859a060c018..70eccdc7d30 100644 --- a/docs/data-management-on-carbondata.md +++ b/docs/data-management-on-carbondata.md @@ -781,59 +781,39 @@ This tutorial is going to introduce all commands and data operations on CarbonDa ``` DELETE 
FROM TABLE CarbonDatabase.CarbonTable WHERE SEGMENT.STARTTIME BEFORE '2017-06-01 12:05:06' ``` -### SEGMENT READING + +### QUERY DATA WITH SPECIFIED SEGMENTS This command is used to read data from specified segments during CarbonScan. - Get the Segment ID: - ``` SHOW SEGMENTS FOR TABLE [db_name.]table_name LIMIT number_of_segments ``` - Set the segment IDs - + Set the segment IDs for table ``` - SET cabon.input.segments.<database_name>.<table_name> = <list of segment IDs>; + SET cabon.input.segments.<database_name>.<table_name> = <list of segment IDs> ``` - **Property:** - - cabon.input.segments: Specifies the segment IDs to be queried. This property allows you to query specified segments of the specified table. The CarbonScan will read data from specified segments only. - - ``` - SET cabon.input.segments.<database_name>.<table_name> = <list of segment IDs>; - ``` + NOTE: + cabon.input.segments: Specifies the segment IDs to be queried. This property allows you to query specified segments of the specified table. The CarbonScan will read data from specified segments only. If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query. - ``` CarbonSession.threadSet ("cabon.input.segments.<database_name>.<table_name>","<list of segment IDs>"); ``` - Reset the segment IDs: - + Reset the segment IDs ``` SET cabon.input.segments.<database_name>.<table_name> = *; ``` If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query. - ``` CarbonSession.threadSet ("cabon.input.segments.<database_name>.<table_name>","*"); ``` - Reset - - It will reset all the properties set for carbondata. It is not recommended if you do not want to reset all the properties except cabon.input.segments. - - ``` - RESET - ``` - - **NOTE**: It is not recommended to set this property in carbon.properties file, because all the sessions will take this segments list unless it is overwritten at session or thread level. - **Examples:** * Example to show the list of segment IDs,segment status, and other required details and then specify the list of segments to be read.
From 2c52f7a784316bf61cdf64e95c27066af0d6ff59 Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Thu, 18 Jan 2018 17:40:29 +0800 Subject: [PATCH 3/3] fix commit --- docs/data-management-on-carbondata.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md index 70eccdc7d30..3a2c1d398a1 100644 --- a/docs/data-management-on-carbondata.md +++ b/docs/data-management-on-carbondata.md @@ -793,25 +793,25 @@ This tutorial is going to introduce all commands and data operations on CarbonDa Set the segment IDs for table ``` - SET cabon.input.segments.<database_name>.<table_name> = <list of segment IDs> + SET carbon.input.segments.<database_name>.<table_name> = <list of segment IDs> ``` NOTE: - cabon.input.segments: Specifies the segment IDs to be queried. This property allows you to query specified segments of the specified table. The CarbonScan will read data from specified segments only. + carbon.input.segments: Specifies the segment IDs to be queried. This property allows you to query specified segments of the specified table. The CarbonScan will read data from specified segments only. If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query. ``` - CarbonSession.threadSet ("cabon.input.segments.<database_name>.<table_name>","<list of segment IDs>"); + CarbonSession.threadSet ("carbon.input.segments.<database_name>.<table_name>","<list of segment IDs>"); ``` Reset the segment IDs ``` - SET cabon.input.segments.<database_name>.<table_name> = *; + SET carbon.input.segments.<database_name>.<table_name> = *; ``` If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query.
``` - CarbonSession.threadSet ("cabon.input.segments.<database_name>.<table_name>","*"); + CarbonSession.threadSet ("carbon.input.segments.<database_name>.<table_name>","*"); ``` **Examples:** @@ -821,13 +821,13 @@ This tutorial is going to introduce all commands and data operations on CarbonDa ``` SHOW SEGMENTS FOR carbontable1; - SET cabon.input.segments.db.carbontable1 = 1,3,9; + SET carbon.input.segments.db.carbontable1 = 1,3,9; ``` * Example to query with segments reading in multi threading mode: ``` - CarbonSession.threadSet ("cabon.input.segments.db.carbontable_Multi_Thread","1,3"); + CarbonSession.threadSet ("carbon.input.segments.db.carbontable_Multi_Thread","1,3"); ``` * Example for threadset in multithread environment (following shows how it is used in Scala code): @@ -835,8 +835,8 @@ This tutorial is going to introduce all commands and data operations on CarbonDa ``` def main(args: Array[String]) { Future { - CarbonSession.threadSet ("cabon.input.segments.db.carbontable_Multi_Thread","1") - spark.sql("select count(empno) from cabon.input.segments.db.carbontable_Multi_Thread").show(); + CarbonSession.threadSet ("carbon.input.segments.db.carbontable_Multi_Thread","1") + spark.sql("select count(empno) from db.carbontable_Multi_Thread").show(); } } ```