From 8b81262b1609eea1d7201872a3eb8d50a0261648 Mon Sep 17 00:00:00 2001 From: xubo245 <601450868@qq.com> Date: Fri, 2 Mar 2018 17:42:40 +0800 Subject: [PATCH 1/3] [CARBONDATA-2144] Optimize preaggregate table documentation, include timeseries --- docs/data-management-on-carbondata.md | 271 ++++++++++++-------------- 1 file changed, 128 insertions(+), 143 deletions(-) diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md index 9678a32d3c3..f4cdae3be48 100644 --- a/docs/data-management-on-carbondata.md +++ b/docs/data-management-on-carbondata.md @@ -127,14 +127,14 @@ This tutorial is going to introduce all commands and data operations on CarbonDa ``` CREATE TABLE IF NOT EXISTS productSchema.productSalesTable ( - productNumber Int, - productName String, - storeCity String, - storeProvince String, - productCategory String, - productBatch String, - saleQuantity Int, - revenue Int) + productNumber INT, + productName STRING, + storeCity STRING, + storeProvince STRING, + productCategory STRING, + productBatch STRING, + saleQuantity INT, + revenue INT) STORED BY 'carbondata' TBLPROPERTIES ('SORT_COLUMNS'='productName,storeCity', 'SORT_SCOPE'='NO_SORT') @@ -647,13 +647,13 @@ This tutorial is going to introduce all commands and data operations on CarbonDa Example: ``` CREATE TABLE IF NOT EXISTS productSchema.productSalesTable ( - productNumber Int, - productName String, - storeCity String, - storeProvince String, - saleQuantity Int, - revenue Int) - PARTITIONED BY (productCategory String, productBatch String) + productNumber INT, + productName STRING, + storeCity STRING, + storeProvince STRING, + saleQuantity INT, + revenue INT) + PARTITIONED BY (productCategory STRING, productBatch STRING) STORED BY 'carbondata' ``` @@ -745,12 +745,12 @@ This tutorial is going to introduce all commands and data operations on CarbonDa Example: ``` CREATE TABLE IF NOT EXISTS hash_partition_table( - col_A String, - col_B Int, - col_C Long, - col_D Decimal(10,2), - col_F Timestamp - ) PARTITIONED BY (col_E Long) + col_A STRING, + col_B INT, + col_C LONG, + col_D DECIMAL(10,2), + col_F TIMESTAMP + ) PARTITIONED BY (col_E LONG) STORED BY 'carbondata' TBLPROPERTIES('PARTITION_TYPE'='HASH','NUM_PARTITIONS'='9') ``` @@ -773,11 +773,11 @@ This tutorial is going to introduce all commands and data operations on CarbonDa Example: ``` CREATE TABLE IF NOT EXISTS range_partition_table( - col_A String, - col_B Int, - col_C Long, - col_D Decimal(10,2), - col_E Long + col_A STRING, + col_B INT, + col_C LONG, + col_D DECIMAL(10,2), + col_E LONG ) partitioned by (col_F Timestamp) PARTITIONED BY 'carbondata' TBLPROPERTIES('PARTITION_TYPE'='RANGE', @@ -800,12 +800,12 @@ This tutorial is going to introduce all commands and data operations on CarbonDa Example: ``` CREATE TABLE IF NOT EXISTS list_partition_table( - col_B Int, - col_C Long, - col_D Decimal(10,2), - col_E Long, - col_F Timestamp - ) PARTITIONED BY (col_A String) + col_B INT, + col_C LONG, + col_D DECIMAL(10,2), + col_E LONG, + col_F TIMESTAMP + ) PARTITIONED BY (col_A STRING) STORED BY 'carbondata' TBLPROPERTIES('PARTITION_TYPE'='LIST', 'LIST_INFO'='aaaa, bbbb, (cccc, dddd), eeee') @@ -861,22 +861,22 @@ This tutorial is going to introduce all commands and data operations on CarbonDa ## PRE-AGGREGATE TABLES - Carbondata supports pre aggregating of data so that OLAP kind of queries can fetch data - much faster.Aggregate tables are created as datamaps so that the handling is as efficient as - other indexing support.Users can create as many aggregate tables they require as datamaps to - improve their query performance,provided the storage requirements and loading speeds are + CarbonData supports pre aggregating of data so that OLAP kind of queries can fetch data + much faster. Aggregate tables are created as datamaps so that the handling is as efficient as + other indexing support. Users can create as many aggregate tables they require as datamaps to + improve their query performance, provided the storage requirements and loading speeds are acceptable. For main table called **sales** which is defined as ``` CREATE TABLE sales ( - order_time timestamp, - user_id string, - sex string, - country string, - quantity int, - price bigint) + order_time timestamp, + user_id STRING, + sex STRING, + country STRING, + quantity INT, + price BIGINT) STORED BY 'carbondata' ``` @@ -944,7 +944,7 @@ pre-aggregate table to fetch the data. ##### Compacting pre-aggregate tables Compaction command (ALTER TABLE COMPACT) need to be run separately on each pre-aggregate table. Running Compaction command on main table will **not automatically** compact the pre-aggregate -tables.Compaction is an optional operation for pre-aggregate table. If compaction is performed on +tables. Compaction is an optional operation for pre-aggregate table. If compaction is performed on main table but not performed on pre-aggregate table, all queries still can benefit from pre-aggregate tables. To further improve performance on pre-aggregate tables, compaction can be triggered on pre-aggregate tables directly, it will merge the segments inside pre-aggregate table. @@ -963,7 +963,7 @@ This functionality is not supported. NOTE (RESTRICTION): Delete Segment operations are not supported on main table which has pre-aggregate tables - created on it. All the pre-aggregate tables will have to be dropped before update/delete + created on it. All the pre-aggregate tables will have to be dropped before delete segment operations can be performed on the main table. Pre-aggregate tables can be rebuilt manually after delete segment operations are completed @@ -974,111 +974,96 @@ This functionality is not supported. Adding new column in new table does not have any affect on pre-aggregate tables. However if dropping or renaming a column has impact in pre-aggregate table, such operations will be rejected and error will be thrown. All the pre-aggregate tables will have to be dropped - before Alter Operations can be performed on the main table. Pre-aggregate tables can be rebuilt - manually after Alter Table operations are completed + before alter operations can be performed on the main table. Pre-aggregate tables can be rebuilt + manually after alter table operations are completed ### Supporting timeseries data (Alpha feature in 1.3.0) -Carbondata has built-in understanding of time hierarchy and levels: year, month, day, hour, minute. -Multiple pre-aggregate tables can be created for the hierarchy and Carbondata can do automatic +CarbonData has built-in understanding of time hierarchy and levels: year, month, day, hour, minute, second. +Timeseries pre-aggregate tables can be created with different granularity and CarbonData can do automatic roll-up for the queries on these hierarchies. ``` - CREATE DATAMAP agg_year - ON TABLE sales - USING "timeseries" - DMPROPERTIES ( - 'event_time'='order_time', - 'year_granualrity'='1', - ) AS - SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), - avg(price) FROM sales GROUP BY order_time, country, sex - - CREATE DATAMAP agg_month - ON TABLE sales - USING "timeseries" - DMPROPERTIES ( - 'event_time'='order_time', - 'month_granualrity'='1', - ) AS - SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), - avg(price) FROM sales GROUP BY order_time, country, sex - - CREATE DATAMAP agg_day - ON TABLE sales - USING "timeseries" - DMPROPERTIES ( - 'event_time'='order_time', - 'day_granualrity'='1', - ) AS - SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), - avg(price) FROM sales GROUP BY order_time, country, sex - - CREATE DATAMAP agg_sales_hour - ON TABLE sales - USING "timeseries" - DMPROPERTIES ( - 'event_time'='order_time', - 'hour_granualrity'='1', - ) AS - SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), - avg(price) FROM sales GROUP BY order_time, country, sex - - CREATE DATAMAP agg_minute + CREATE DATAMAP agg_hour ON TABLE sales USING "timeseries" DMPROPERTIES ( - 'event_time'='order_time', - 'minute_granualrity'='1', - ) AS - SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), - avg(price) FROM sales GROUP BY order_time, country, sex - - CREATE DATAMAP agg_minute - ON TABLE sales - USING "timeseries" - DMPROPERTIES ( - 'event_time'='order_time', - 'minute_granualrity'='1', - ) AS - SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), - avg(price) FROM sales GROUP BY order_time, country, sex - ``` - - For Querying data and automatically roll-up to the desired aggregation level,Carbondata supports + 'event_time'='order_time', + 'hour_granularity'='1', + ) + AS + SELECT + order_time, + country, + sex, + sum(quantity), + max(quantity), + count(user_id), + sum(price), + avg(price) + FROM sales + GROUP BY order_time, country, sex + ``` + User also can use other granularity: second_granularity, minute_granularity, + day_granularity, month_granularity, year_granularity + For Querying data and automatically roll-up to the desired aggregation level, CarbonData supports UDF as ``` timeseries(timeseries column name, 'aggregation level') ``` + Examples ``` - Select timeseries(order_time, 'hour'), sum(quantity) from sales group by timeseries(order_time, - 'hour') + SELECT + timeseries(order_time, 'hour'), + sum(quantity) + FROM sales + GROUP BY timeseries(order_time, 'hour') ``` It is **not necessary** to create pre-aggregate tables for each granularity unless required for - query. Carbondata can roll-up the data and fetch it. + query. CarbonData can roll-up the data and fetch it. - For Example: For main table **sales** , If pre-aggregate tables were created as + For Example: For main table **sales**, If timeseries pre-aggregate tables were created as ``` CREATE DATAMAP agg_day - ON TABLE sales - USING "timeseries" - DMPROPERTIES ( - 'event_time'='order_time', - 'day_granualrity'='1', - ) AS - SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), - avg(price) FROM sales GROUP BY order_time, country, sex - - CREATE DATAMAP agg_sales_hour - ON TABLE sales - USING "timeseries" - DMPROPERTIES ( - 'event_time'='order_time', - 'hour_granualrity'='1', - ) AS - SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), - avg(price) FROM sales GROUP BY order_time, country, sex + ON TABLE sales + USING "timeseries" + DMPROPERTIES ( + 'event_time'='order_time', + 'day_granularity'='1', + ) + AS + SELECT + order_time, + country, + sex, + sum(quantity), + max(quantity), + count(user_id), + sum(price), + avg(price) + FROM sales + GROUP BY order_time, country, sex + + CREATE DATAMAP agg_sales_hour + ON TABLE sales + USING "timeseries" + DMPROPERTIES ( + 'event_time'='order_time', + 'hour_granularity'='1', + ) + AS + SELECT + order_time, + country, + sex, + sum(quantity), + max(quantity), + count(user_id), + sum(price), + avg(price) + FROM sales + GROUP BY order_time, country, sex ``` Queries like below will be rolled-up and fetched from pre-aggregate tables @@ -1091,10 +1076,10 @@ roll-up for the queries on these hierarchies. ``` NOTE (RESTRICTION): - * Only value of 1 is supported for hierarchy levels. Other hierarchy levels are not supported. - Other hierarchy levels are not supported - * pre-aggregate tables for the desired levels needs to be created one after the other - * pre-aggregate tables created for each level needs to be dropped separately + * Only 1 is supported for granularity value of timeseries pre-aggregate table. Other granularity value are not supported. + * Only one granularity can be defined on creating one timeseries pre-aggregate table. Other granularity are created separately. + * Pre-aggregate tables for the desired levels needs to be created one after the other + * Pre-aggregate tables are created for each level needs to be dropped separately ## BUCKETING @@ -1119,14 +1104,14 @@ roll-up for the queries on these hierarchies. Example: ``` CREATE TABLE IF NOT EXISTS productSchema.productSalesTable ( - productNumber Int, - saleQuantity Int, - productName String, - storeCity String, - storeProvince String, - productCategory String, - productBatch String, - revenue Int) + productNumber INT, + saleQuantity INT, + productName STRING, + storeCity STRING, + storeProvince STRING, + productCategory STRING, + productBatch STRING, + revenue INT) STORED BY 'carbondata' TBLPROPERTIES ('BUCKETNUMBER'='4', 'BUCKETCOLUMNS'='productName') ``` @@ -1201,7 +1186,7 @@ roll-up for the queries on these hierarchies. NOTE: carbon.input.segments: Specifies the segment IDs to be queried. This property allows you to query specified segments of the specified table. The CarbonScan will read data from specified segments only. - If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query. + If user wants to query with segments reading in multi threading mode, then CarbonSession. threadSet can be used instead of SET query. ``` CarbonSession.threadSet ("carbon.input.segments..",""); ``` @@ -1211,7 +1196,7 @@ roll-up for the queries on these hierarchies. SET carbon.input.segments.. = *; ``` - If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query. + If user wants to query with segments reading in multi threading mode, then CarbonSession. threadSet can be used instead of SET query. ``` CarbonSession.threadSet ("carbon.input.segments..","*"); ``` From 488ad0137249ba7643f6a99bb33a430a9519595d Mon Sep 17 00:00:00 2001 From: xubo245 <601450868@qq.com> Date: Fri, 2 Mar 2018 17:43:23 +0800 Subject: [PATCH 2/3] add --- docs/data-management-on-carbondata.md | 132 +++++++++++++++----------- 1 file changed, 75 insertions(+), 57 deletions(-) diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md index f4cdae3be48..c085de3986b 100644 --- a/docs/data-management-on-carbondata.md +++ b/docs/data-management-on-carbondata.md @@ -983,28 +983,66 @@ Timeseries pre-aggregate tables can be created with different granularity and Ca roll-up for the queries on these hierarchies. ``` - CREATE DATAMAP agg_hour + CREATE DATAMAP agg_year ON TABLE sales USING "timeseries" DMPROPERTIES ( - 'event_time'='order_time', - 'hour_granularity'='1', - ) - AS - SELECT - order_time, - country, - sex, - sum(quantity), - max(quantity), - count(user_id), - sum(price), - avg(price) - FROM sales - GROUP BY order_time, country, sex - ``` - User also can use other granularity: second_granularity, minute_granularity, - day_granularity, month_granularity, year_granularity + 'event_time'='order_time', + 'year_granualrity'='1', + ) AS + SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), + avg(price) FROM sales GROUP BY order_time, country, sex + + CREATE DATAMAP agg_month + ON TABLE sales + USING "timeseries" + DMPROPERTIES ( + 'event_time'='order_time', + 'month_granualrity'='1', + ) AS + SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), + avg(price) FROM sales GROUP BY order_time, country, sex + + CREATE DATAMAP agg_day + ON TABLE sales + USING "timeseries" + DMPROPERTIES ( + 'event_time'='order_time', + 'day_granualrity'='1', + ) AS + SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), + avg(price) FROM sales GROUP BY order_time, country, sex + + CREATE DATAMAP agg_sales_hour + ON TABLE sales + USING "timeseries" + DMPROPERTIES ( + 'event_time'='order_time', + 'hour_granualrity'='1', + ) AS + SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), + avg(price) FROM sales GROUP BY order_time, country, sex + + CREATE DATAMAP agg_minute + ON TABLE sales + USING "timeseries" + DMPROPERTIES ( + 'event_time'='order_time', + 'minute_granualrity'='1', + ) AS + SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), + avg(price) FROM sales GROUP BY order_time, country, sex + + CREATE DATAMAP agg_minute + ON TABLE sales + USING "timeseries" + DMPROPERTIES ( + 'event_time'='order_time', + 'minute_granualrity'='1', + ) AS + SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), + avg(price) FROM sales GROUP BY order_time, country, sex + ``` For Querying data and automatically roll-up to the desired aggregation level, CarbonData supports UDF as ``` @@ -1026,44 +1064,24 @@ roll-up for the queries on these hierarchies. ``` CREATE DATAMAP agg_day - ON TABLE sales - USING "timeseries" - DMPROPERTIES ( - 'event_time'='order_time', - 'day_granularity'='1', - ) - AS - SELECT - order_time, - country, - sex, - sum(quantity), - max(quantity), - count(user_id), - sum(price), - avg(price) - FROM sales - GROUP BY order_time, country, sex - - CREATE DATAMAP agg_sales_hour - ON TABLE sales - USING "timeseries" - DMPROPERTIES ( - 'event_time'='order_time', - 'hour_granularity'='1', - ) - AS - SELECT - order_time, - country, - sex, - sum(quantity), - max(quantity), - count(user_id), - sum(price), - avg(price) - FROM sales - GROUP BY order_time, country, sex + ON TABLE sales + USING "timeseries" + DMPROPERTIES ( + 'event_time'='order_time', + 'day_granualrity'='1', + ) AS + SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), + avg(price) FROM sales GROUP BY order_time, country, sex + + CREATE DATAMAP agg_sales_hour + ON TABLE sales + USING "timeseries" + DMPROPERTIES ( + 'event_time'='order_time', + 'hour_granualrity'='1', + ) AS + SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price), + avg(price) FROM sales GROUP BY order_time, country, sex ``` Queries like below will be rolled-up and fetched from pre-aggregate tables From 5ccee90352cb234412cc140f8addeeb029538778 Mon Sep 17 00:00:00 2001 From: xubo245 <601450868@qq.com> Date: Fri, 2 Mar 2018 17:46:01 +0800 Subject: [PATCH 3/3] add --- docs/data-management-on-carbondata.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md index c085de3986b..ea80d41bfc9 100644 --- a/docs/data-management-on-carbondata.md +++ b/docs/data-management-on-carbondata.md @@ -979,7 +979,7 @@ This functionality is not supported. ### Supporting timeseries data (Alpha feature in 1.3.0) CarbonData has built-in understanding of time hierarchy and levels: year, month, day, hour, minute, second. -Timeseries pre-aggregate tables can be created with different granularity and CarbonData can do automatic +Timeseries pre-aggregate tables can be created for the hierarchy and CarbonData can do automatic roll-up for the queries on these hierarchies. ```