From 8b81262b1609eea1d7201872a3eb8d50a0261648 Mon Sep 17 00:00:00 2001
From: xubo245 <601450868@qq.com>
Date: Fri, 2 Mar 2018 17:42:40 +0800
Subject: [PATCH 1/3] [CARBONDATA-2144] Optimize preaggregate table
documentation, include timeseries
---
docs/data-management-on-carbondata.md | 271 ++++++++++++--------------
1 file changed, 128 insertions(+), 143 deletions(-)
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index 9678a32d3c3..f4cdae3be48 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -127,14 +127,14 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
```
CREATE TABLE IF NOT EXISTS productSchema.productSalesTable (
- productNumber Int,
- productName String,
- storeCity String,
- storeProvince String,
- productCategory String,
- productBatch String,
- saleQuantity Int,
- revenue Int)
+ productNumber INT,
+ productName STRING,
+ storeCity STRING,
+ storeProvince STRING,
+ productCategory STRING,
+ productBatch STRING,
+ saleQuantity INT,
+ revenue INT)
STORED BY 'carbondata'
TBLPROPERTIES ('SORT_COLUMNS'='productName,storeCity',
'SORT_SCOPE'='NO_SORT')
@@ -647,13 +647,13 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
Example:
```
CREATE TABLE IF NOT EXISTS productSchema.productSalesTable (
- productNumber Int,
- productName String,
- storeCity String,
- storeProvince String,
- saleQuantity Int,
- revenue Int)
- PARTITIONED BY (productCategory String, productBatch String)
+ productNumber INT,
+ productName STRING,
+ storeCity STRING,
+ storeProvince STRING,
+ saleQuantity INT,
+ revenue INT)
+ PARTITIONED BY (productCategory STRING, productBatch STRING)
STORED BY 'carbondata'
```
@@ -745,12 +745,12 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
Example:
```
CREATE TABLE IF NOT EXISTS hash_partition_table(
- col_A String,
- col_B Int,
- col_C Long,
- col_D Decimal(10,2),
- col_F Timestamp
- ) PARTITIONED BY (col_E Long)
+ col_A STRING,
+ col_B INT,
+ col_C LONG,
+ col_D DECIMAL(10,2),
+ col_F TIMESTAMP
+ ) PARTITIONED BY (col_E LONG)
STORED BY 'carbondata' TBLPROPERTIES('PARTITION_TYPE'='HASH','NUM_PARTITIONS'='9')
```
@@ -773,11 +773,11 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
Example:
```
CREATE TABLE IF NOT EXISTS range_partition_table(
- col_A String,
- col_B Int,
- col_C Long,
- col_D Decimal(10,2),
- col_E Long
+ col_A STRING,
+ col_B INT,
+ col_C LONG,
+ col_D DECIMAL(10,2),
+ col_E LONG
) partitioned by (col_F Timestamp)
PARTITIONED BY 'carbondata'
TBLPROPERTIES('PARTITION_TYPE'='RANGE',
@@ -800,12 +800,12 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
Example:
```
CREATE TABLE IF NOT EXISTS list_partition_table(
- col_B Int,
- col_C Long,
- col_D Decimal(10,2),
- col_E Long,
- col_F Timestamp
- ) PARTITIONED BY (col_A String)
+ col_B INT,
+ col_C LONG,
+ col_D DECIMAL(10,2),
+ col_E LONG,
+ col_F TIMESTAMP
+ ) PARTITIONED BY (col_A STRING)
STORED BY 'carbondata'
TBLPROPERTIES('PARTITION_TYPE'='LIST',
'LIST_INFO'='aaaa, bbbb, (cccc, dddd), eeee')
@@ -861,22 +861,22 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
## PRE-AGGREGATE TABLES
- Carbondata supports pre aggregating of data so that OLAP kind of queries can fetch data
- much faster.Aggregate tables are created as datamaps so that the handling is as efficient as
- other indexing support.Users can create as many aggregate tables they require as datamaps to
- improve their query performance,provided the storage requirements and loading speeds are
+ CarbonData supports pre aggregating of data so that OLAP kind of queries can fetch data
+ much faster. Aggregate tables are created as datamaps so that the handling is as efficient as
+ other indexing support. Users can create as many aggregate tables as they require as datamaps to
+ improve their query performance, provided the storage requirements and loading speeds are
acceptable.
For main table called **sales** which is defined as
```
CREATE TABLE sales (
- order_time timestamp,
- user_id string,
- sex string,
- country string,
- quantity int,
- price bigint)
+ order_time TIMESTAMP,
+ user_id STRING,
+ sex STRING,
+ country STRING,
+ quantity INT,
+ price BIGINT)
STORED BY 'carbondata'
```
@@ -944,7 +944,7 @@ pre-aggregate table to fetch the data.
##### Compacting pre-aggregate tables
Compaction command (ALTER TABLE COMPACT) need to be run separately on each pre-aggregate table.
Running Compaction command on main table will **not automatically** compact the pre-aggregate
-tables.Compaction is an optional operation for pre-aggregate table. If compaction is performed on
+tables. Compaction is an optional operation for pre-aggregate table. If compaction is performed on
main table but not performed on pre-aggregate table, all queries still can benefit from
pre-aggregate tables. To further improve performance on pre-aggregate tables, compaction can be
triggered on pre-aggregate tables directly, it will merge the segments inside pre-aggregate table.
@@ -963,7 +963,7 @@ This functionality is not supported.
NOTE (RESTRICTION):
Delete Segment operations are not supported on main table which has pre-aggregate tables
- created on it. All the pre-aggregate tables will have to be dropped before update/delete
+ created on it. All the pre-aggregate tables will have to be dropped before delete segment
operations can be performed on the main table. Pre-aggregate tables can be rebuilt manually
after delete segment operations are completed
@@ -974,111 +974,96 @@ This functionality is not supported.
Adding new column in new table does not have any affect on pre-aggregate tables. However if
dropping or renaming a column has impact in pre-aggregate table, such operations will be
rejected and error will be thrown. All the pre-aggregate tables will have to be dropped
- before Alter Operations can be performed on the main table. Pre-aggregate tables can be rebuilt
- manually after Alter Table operations are completed
+ before alter operations can be performed on the main table. Pre-aggregate tables can be rebuilt
+ manually after alter table operations are completed
### Supporting timeseries data (Alpha feature in 1.3.0)
-Carbondata has built-in understanding of time hierarchy and levels: year, month, day, hour, minute.
-Multiple pre-aggregate tables can be created for the hierarchy and Carbondata can do automatic
+CarbonData has built-in understanding of time hierarchy and levels: year, month, day, hour, minute, second.
+Timeseries pre-aggregate tables can be created with different granularity and CarbonData can do automatic
roll-up for the queries on these hierarchies.
```
- CREATE DATAMAP agg_year
- ON TABLE sales
- USING "timeseries"
- DMPROPERTIES (
- 'event_time'='order_time',
- 'year_granualrity'='1',
- ) AS
- SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
- avg(price) FROM sales GROUP BY order_time, country, sex
-
- CREATE DATAMAP agg_month
- ON TABLE sales
- USING "timeseries"
- DMPROPERTIES (
- 'event_time'='order_time',
- 'month_granualrity'='1',
- ) AS
- SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
- avg(price) FROM sales GROUP BY order_time, country, sex
-
- CREATE DATAMAP agg_day
- ON TABLE sales
- USING "timeseries"
- DMPROPERTIES (
- 'event_time'='order_time',
- 'day_granualrity'='1',
- ) AS
- SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
- avg(price) FROM sales GROUP BY order_time, country, sex
-
- CREATE DATAMAP agg_sales_hour
- ON TABLE sales
- USING "timeseries"
- DMPROPERTIES (
- 'event_time'='order_time',
- 'hour_granualrity'='1',
- ) AS
- SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
- avg(price) FROM sales GROUP BY order_time, country, sex
-
- CREATE DATAMAP agg_minute
+ CREATE DATAMAP agg_hour
ON TABLE sales
USING "timeseries"
DMPROPERTIES (
- 'event_time'='order_time',
- 'minute_granualrity'='1',
- ) AS
- SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
- avg(price) FROM sales GROUP BY order_time, country, sex
-
- CREATE DATAMAP agg_minute
- ON TABLE sales
- USING "timeseries"
- DMPROPERTIES (
- 'event_time'='order_time',
- 'minute_granualrity'='1',
- ) AS
- SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
- avg(price) FROM sales GROUP BY order_time, country, sex
- ```
-
- For Querying data and automatically roll-up to the desired aggregation level,Carbondata supports
+ 'event_time'='order_time',
+ 'hour_granularity'='1',
+ )
+ AS
+ SELECT
+ order_time,
+ country,
+ sex,
+ sum(quantity),
+ max(quantity),
+ count(user_id),
+ sum(price),
+ avg(price)
+ FROM sales
+ GROUP BY order_time, country, sex
+ ```
+ User also can use other granularity: second_granularity, minute_granularity,
+ day_granularity, month_granularity, year_granularity
+ For Querying data and automatically roll-up to the desired aggregation level, CarbonData supports
UDF as
```
timeseries(timeseries column name, 'aggregation level')
```
+ Examples
```
- Select timeseries(order_time, 'hour'), sum(quantity) from sales group by timeseries(order_time,
- 'hour')
+ SELECT
+ timeseries(order_time, 'hour'),
+ sum(quantity)
+ FROM sales
+ GROUP BY timeseries(order_time, 'hour')
```
It is **not necessary** to create pre-aggregate tables for each granularity unless required for
- query. Carbondata can roll-up the data and fetch it.
+ query. CarbonData can roll-up the data and fetch it.
- For Example: For main table **sales** , If pre-aggregate tables were created as
+ For Example: For main table **sales**, If timeseries pre-aggregate tables were created as
```
CREATE DATAMAP agg_day
- ON TABLE sales
- USING "timeseries"
- DMPROPERTIES (
- 'event_time'='order_time',
- 'day_granualrity'='1',
- ) AS
- SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
- avg(price) FROM sales GROUP BY order_time, country, sex
-
- CREATE DATAMAP agg_sales_hour
- ON TABLE sales
- USING "timeseries"
- DMPROPERTIES (
- 'event_time'='order_time',
- 'hour_granualrity'='1',
- ) AS
- SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
- avg(price) FROM sales GROUP BY order_time, country, sex
+ ON TABLE sales
+ USING "timeseries"
+ DMPROPERTIES (
+ 'event_time'='order_time',
+ 'day_granularity'='1',
+ )
+ AS
+ SELECT
+ order_time,
+ country,
+ sex,
+ sum(quantity),
+ max(quantity),
+ count(user_id),
+ sum(price),
+ avg(price)
+ FROM sales
+ GROUP BY order_time, country, sex
+
+ CREATE DATAMAP agg_sales_hour
+ ON TABLE sales
+ USING "timeseries"
+ DMPROPERTIES (
+ 'event_time'='order_time',
+ 'hour_granularity'='1',
+ )
+ AS
+ SELECT
+ order_time,
+ country,
+ sex,
+ sum(quantity),
+ max(quantity),
+ count(user_id),
+ sum(price),
+ avg(price)
+ FROM sales
+ GROUP BY order_time, country, sex
```
Queries like below will be rolled-up and fetched from pre-aggregate tables
@@ -1091,10 +1076,10 @@ roll-up for the queries on these hierarchies.
```
NOTE (RESTRICTION):
- * Only value of 1 is supported for hierarchy levels. Other hierarchy levels are not supported.
- Other hierarchy levels are not supported
- * pre-aggregate tables for the desired levels needs to be created one after the other
- * pre-aggregate tables created for each level needs to be dropped separately
+ * Only 1 is supported as the granularity value of a timeseries pre-aggregate table. Other granularity values are not supported.
+ * Only one granularity can be defined when creating a timeseries pre-aggregate table. Other granularities must be created separately.
+ * Pre-aggregate tables for the desired levels need to be created one after the other
+ * Pre-aggregate tables created for each level need to be dropped separately
## BUCKETING
@@ -1119,14 +1104,14 @@ roll-up for the queries on these hierarchies.
Example:
```
CREATE TABLE IF NOT EXISTS productSchema.productSalesTable (
- productNumber Int,
- saleQuantity Int,
- productName String,
- storeCity String,
- storeProvince String,
- productCategory String,
- productBatch String,
- revenue Int)
+ productNumber INT,
+ saleQuantity INT,
+ productName STRING,
+ storeCity STRING,
+ storeProvince STRING,
+ productCategory STRING,
+ productBatch STRING,
+ revenue INT)
STORED BY 'carbondata'
TBLPROPERTIES ('BUCKETNUMBER'='4', 'BUCKETCOLUMNS'='productName')
```
@@ -1201,7 +1186,7 @@ roll-up for the queries on these hierarchies.
NOTE:
carbon.input.segments: Specifies the segment IDs to be queried. This property allows you to query specified segments of the specified table. The CarbonScan will read data from specified segments only.
- If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query.
+ If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query.
```
CarbonSession.threadSet ("carbon.input.segments..","");
```
@@ -1211,7 +1196,7 @@ roll-up for the queries on these hierarchies.
SET carbon.input.segments.. = *;
```
- If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query.
+ If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query.
```
CarbonSession.threadSet ("carbon.input.segments..","*");
```
From 488ad0137249ba7643f6a99bb33a430a9519595d Mon Sep 17 00:00:00 2001
From: xubo245 <601450868@qq.com>
Date: Fri, 2 Mar 2018 17:43:23 +0800
Subject: [PATCH 2/3] [CARBONDATA-2144] Restore per-granularity timeseries datamap examples
---
docs/data-management-on-carbondata.md | 132 +++++++++++++++-----------
1 file changed, 75 insertions(+), 57 deletions(-)
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index f4cdae3be48..c085de3986b 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -983,28 +983,66 @@ Timeseries pre-aggregate tables can be created with different granularity and Ca
roll-up for the queries on these hierarchies.
```
- CREATE DATAMAP agg_hour
+ CREATE DATAMAP agg_year
ON TABLE sales
USING "timeseries"
DMPROPERTIES (
- 'event_time'='order_time',
- 'hour_granularity'='1',
- )
- AS
- SELECT
- order_time,
- country,
- sex,
- sum(quantity),
- max(quantity),
- count(user_id),
- sum(price),
- avg(price)
- FROM sales
- GROUP BY order_time, country, sex
- ```
- User also can use other granularity: second_granularity, minute_granularity,
- day_granularity, month_granularity, year_granularity
+ 'event_time'='order_time',
+ 'year_granularity'='1'
+ ) AS
+ SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+
+ CREATE DATAMAP agg_month
+ ON TABLE sales
+ USING "timeseries"
+ DMPROPERTIES (
+ 'event_time'='order_time',
+ 'month_granularity'='1'
+ ) AS
+ SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+
+ CREATE DATAMAP agg_day
+ ON TABLE sales
+ USING "timeseries"
+ DMPROPERTIES (
+ 'event_time'='order_time',
+ 'day_granularity'='1'
+ ) AS
+ SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+
+ CREATE DATAMAP agg_sales_hour
+ ON TABLE sales
+ USING "timeseries"
+ DMPROPERTIES (
+ 'event_time'='order_time',
+ 'hour_granularity'='1'
+ ) AS
+ SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+
+ CREATE DATAMAP agg_minute
+ ON TABLE sales
+ USING "timeseries"
+ DMPROPERTIES (
+ 'event_time'='order_time',
+ 'minute_granularity'='1'
+ ) AS
+ SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+
+ CREATE DATAMAP agg_second
+ ON TABLE sales
+ USING "timeseries"
+ DMPROPERTIES (
+ 'event_time'='order_time',
+ 'second_granularity'='1'
+ ) AS
+ SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+ ```
For Querying data and automatically roll-up to the desired aggregation level, CarbonData supports
UDF as
```
@@ -1026,44 +1064,24 @@ roll-up for the queries on these hierarchies.
```
CREATE DATAMAP agg_day
- ON TABLE sales
- USING "timeseries"
- DMPROPERTIES (
- 'event_time'='order_time',
- 'day_granularity'='1',
- )
- AS
- SELECT
- order_time,
- country,
- sex,
- sum(quantity),
- max(quantity),
- count(user_id),
- sum(price),
- avg(price)
- FROM sales
- GROUP BY order_time, country, sex
-
- CREATE DATAMAP agg_sales_hour
- ON TABLE sales
- USING "timeseries"
- DMPROPERTIES (
- 'event_time'='order_time',
- 'hour_granularity'='1',
- )
- AS
- SELECT
- order_time,
- country,
- sex,
- sum(quantity),
- max(quantity),
- count(user_id),
- sum(price),
- avg(price)
- FROM sales
- GROUP BY order_time, country, sex
+ ON TABLE sales
+ USING "timeseries"
+ DMPROPERTIES (
+ 'event_time'='order_time',
+ 'day_granularity'='1'
+ ) AS
+ SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+
+ CREATE DATAMAP agg_sales_hour
+ ON TABLE sales
+ USING "timeseries"
+ DMPROPERTIES (
+ 'event_time'='order_time',
+ 'hour_granularity'='1'
+ ) AS
+ SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
```
Queries like below will be rolled-up and fetched from pre-aggregate tables
From 5ccee90352cb234412cc140f8addeeb029538778 Mon Sep 17 00:00:00 2001
From: xubo245 <601450868@qq.com>
Date: Fri, 2 Mar 2018 17:46:01 +0800
Subject: [PATCH 3/3] [CARBONDATA-2144] Reword timeseries pre-aggregate table introduction
---
docs/data-management-on-carbondata.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index c085de3986b..ea80d41bfc9 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -979,7 +979,7 @@ This functionality is not supported.
### Supporting timeseries data (Alpha feature in 1.3.0)
CarbonData has built-in understanding of time hierarchy and levels: year, month, day, hour, minute, second.
-Timeseries pre-aggregate tables can be created with different granularity and CarbonData can do automatic
+Timeseries pre-aggregate tables can be created for the hierarchy and CarbonData can do automatic
roll-up for the queries on these hierarchies.
```