Skip to content
Permalink
Browse files
HIVE-25194: Add support for STORED AS ORC/PARQUET/AVRO for Iceberg (#2348) (Laszlo Pinter, reviewed by Marton Bod and Peter Vary)
  • Loading branch information
lcspinter committed Jun 8, 2021
1 parent 799b2c2 commit 7a0cb27054d9bbde978ccc6ba3470487102d1a00
Showing 13 changed files with 427 additions and 15 deletions.
@@ -0,0 +1 @@
-- Negative test: combines STORED BY with the HBase storage handler and a
-- STORED AS ORC clause in the same CREATE TABLE statement.
-- The statement is expected to fail with a SemanticException reporting that
-- STORED AS is not supported for this storage handler.
create external table test_table ( a int ) stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' stored as orc;
@@ -0,0 +1 @@
FAILED: SemanticException STORED AS is not supported for storage handler org.apache.hadoop.hive.hbase.HBaseStorageHandler
@@ -60,6 +60,7 @@
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
@@ -259,6 +260,11 @@ public boolean supportsPartitionTransform() {
return true;
}

/**
 * Returns the property key that holds the file format for tables of this storage handler.
 * For Iceberg this is {@link TableProperties#DEFAULT_FILE_FORMAT}.
 * NOTE(review): presumably this key is what a STORED AS clause gets mapped onto — confirm
 * against the caller in the semantic analyzer.
 *
 * @return the Iceberg default-file-format table property key
 */
@Override
public String getFileFormatPropertyKey() {
return TableProperties.DEFAULT_FILE_FORMAT;
}

public boolean addDynamicSplitPruningEdge(org.apache.hadoop.hive.ql.metadata.Table table,
ExprNodeDesc syntheticFilterPredicate) {
try {
@@ -0,0 +1,3 @@
-- Negative test: the file format is given twice for the same Iceberg table,
-- once via WITH SERDEPROPERTIES('write.format.default'=...) and once via STORED AS.
-- The CREATE statement is expected to be rejected with a SemanticException
-- asking for only one of the two to be provided.

-- Vectorized execution is switched off for this script.
set hive.vectorized.execution.enabled=false;
-- Start from a clean state in case an earlier run left the table behind.
DROP TABLE IF EXISTS ice_orc;
-- Conflicting format specification: serde property plus STORED AS.
CREATE EXTERNAL TABLE ice_orc (i int, s string, ts timestamp, d date) STORED BY ICEBERG WITH SERDEPROPERTIES('write.format.default'='orc') STORED AS ORC;
@@ -0,0 +1,3 @@
-- Negative test: the file format is given twice for the same Iceberg table,
-- once via STORED AS and once via TBLPROPERTIES('write.format.default'=...).
-- The CREATE statement is expected to be rejected with a SemanticException
-- asking for only one of the alternatives to be provided.

-- Vectorized execution is switched off for this script.
set hive.vectorized.execution.enabled=false;
-- Start from a clean state in case an earlier run left the table behind.
DROP TABLE IF EXISTS ice_orc;
-- Conflicting format specification: STORED AS plus table property.
CREATE EXTERNAL TABLE ice_orc (i int, s string, ts timestamp, d date) STORED BY ICEBERG STORED AS ORC TBLPROPERTIES('write.format.default'='orc');
@@ -0,0 +1,24 @@
-- Positive tests for STORED AS <format> on Iceberg tables.
-- Each case creates a table, prints it with DESCRIBE FORMATTED so the
-- resulting 'write.format.default' table parameter can be checked against the
-- expected output, and then drops the table again.

-- Vectorized execution is switched off for this script.
set hive.vectorized.execution.enabled=false;

-- Case 1: STORED BY ICEBERG with STORED AS ORC.
DROP TABLE IF EXISTS ice_orc;
CREATE EXTERNAL TABLE ice_orc (i int, s string, ts timestamp, d date) STORED BY ICEBERG STORED AS ORC;
DESCRIBE FORMATTED ice_orc;
DROP TABLE ice_orc;

-- Case 2: STORED BY ICEBERG with STORED AS PARQUET.
DROP TABLE IF EXISTS ice_parquet;
CREATE EXTERNAL TABLE ice_parquet (i int, s string, ts timestamp, d date) STORED BY ICEBERG STORED AS PARQUET;
DESCRIBE FORMATTED ice_parquet;
DROP TABLE ice_parquet;

-- Case 3: STORED BY ICEBERG with STORED AS AVRO.
DROP TABLE IF EXISTS ice_avro;
CREATE EXTERNAL TABLE ice_avro (i int, s string, ts timestamp, d date) STORED BY ICEBERG STORED AS AVRO;
DESCRIBE FORMATTED ice_avro;
DROP TABLE ice_avro;

-- Case 4: the storage handler is named by its fully qualified class instead of
-- the ICEBERG keyword.
DROP TABLE IF EXISTS ice_t;
CREATE EXTERNAL TABLE ice_t (i int, s string, ts timestamp, d date) STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' STORED AS AVRO;
DESCRIBE FORMATTED ice_t;
DROP TABLE ice_t;

-- Case 5: STORED AS combined with a serde property that does not set the file
-- format ('dummy'); this combination is expected to be accepted.
-- ice_t was dropped just above, so it is re-created without a preceding
-- DROP TABLE IF EXISTS.
CREATE EXTERNAL TABLE ice_t (i int, s string, ts timestamp, d date) STORED BY ICEBERG WITH SERDEPROPERTIES('dummy'='dummy_value') STORED AS ORC;
DESCRIBE FORMATTED ice_t;
DROP TABLE ice_t;
@@ -0,0 +1,5 @@
PREHOOK: query: DROP TABLE IF EXISTS ice_orc
PREHOOK: type: DROPTABLE
POSTHOOK: query: DROP TABLE IF EXISTS ice_orc
POSTHOOK: type: DROPTABLE
FAILED: SemanticException Provide only one of the following: STORED BY ORC or WITH SERDEPROPERTIES('write.format.default'='ORC')
@@ -0,0 +1,5 @@
PREHOOK: query: DROP TABLE IF EXISTS ice_orc
PREHOOK: type: DROPTABLE
POSTHOOK: query: DROP TABLE IF EXISTS ice_orc
POSTHOOK: type: DROPTABLE
FAILED: SemanticException Provide only one of the following: STORED BY orc or WITH SERDEPROPERTIES('write.format.default'='orc') or TBLPROPERTIES('write.format.default'='orc')
@@ -0,0 +1,312 @@
PREHOOK: query: DROP TABLE IF EXISTS ice_orc
PREHOOK: type: DROPTABLE
POSTHOOK: query: DROP TABLE IF EXISTS ice_orc
POSTHOOK: type: DROPTABLE
PREHOOK: query: CREATE EXTERNAL TABLE ice_orc (i int, s string, ts timestamp, d date) STORED BY ICEBERG STORED AS ORC
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@ice_orc
POSTHOOK: query: CREATE EXTERNAL TABLE ice_orc (i int, s string, ts timestamp, d date) STORED BY ICEBERG STORED AS ORC
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice_orc
PREHOOK: query: DESCRIBE FORMATTED ice_orc
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@ice_orc
POSTHOOK: query: DESCRIBE FORMATTED ice_orc
POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@ice_orc
# col_name data_type comment
i int from deserializer
s string from deserializer
ts timestamp from deserializer
d date from deserializer

# Detailed Table Information
Database: default
#### A masked pattern was here ####
Retention: 0
#### A masked pattern was here ####
Table Type: EXTERNAL_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"d\":\"true\",\"i\":\"true\",\"s\":\"true\",\"ts\":\"true\"}}
EXTERNAL TRUE
bucketing_version 2
engine.hive.enabled true
external.table.purge TRUE
metadata_location hdfs://### HDFS PATH ###
numFiles 0
numRows 0
rawDataSize 0
serialization.format 1
storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
table_type ICEBERG
totalSize 0
#### A masked pattern was here ####
write.format.default ORC

# Storage Information
SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
Compressed: No
Num Buckets: 0
Bucket Columns: []
Sort Columns: []
PREHOOK: query: DROP TABLE ice_orc
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@ice_orc
PREHOOK: Output: default@ice_orc
POSTHOOK: query: DROP TABLE ice_orc
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@ice_orc
POSTHOOK: Output: default@ice_orc
PREHOOK: query: DROP TABLE IF EXISTS ice_parquet
PREHOOK: type: DROPTABLE
POSTHOOK: query: DROP TABLE IF EXISTS ice_parquet
POSTHOOK: type: DROPTABLE
PREHOOK: query: CREATE EXTERNAL TABLE ice_parquet (i int, s string, ts timestamp, d date) STORED BY ICEBERG STORED AS PARQUET
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@ice_parquet
POSTHOOK: query: CREATE EXTERNAL TABLE ice_parquet (i int, s string, ts timestamp, d date) STORED BY ICEBERG STORED AS PARQUET
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice_parquet
PREHOOK: query: DESCRIBE FORMATTED ice_parquet
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@ice_parquet
POSTHOOK: query: DESCRIBE FORMATTED ice_parquet
POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@ice_parquet
# col_name data_type comment
i int from deserializer
s string from deserializer
ts timestamp from deserializer
d date from deserializer

# Detailed Table Information
Database: default
#### A masked pattern was here ####
Retention: 0
#### A masked pattern was here ####
Table Type: EXTERNAL_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"d\":\"true\",\"i\":\"true\",\"s\":\"true\",\"ts\":\"true\"}}
EXTERNAL TRUE
bucketing_version 2
engine.hive.enabled true
external.table.purge TRUE
metadata_location hdfs://### HDFS PATH ###
numFiles 0
numRows 0
rawDataSize 0
serialization.format 1
storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
table_type ICEBERG
totalSize 0
#### A masked pattern was here ####
write.format.default PARQUET

# Storage Information
SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
Compressed: No
Num Buckets: 0
Bucket Columns: []
Sort Columns: []
PREHOOK: query: DROP TABLE ice_parquet
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@ice_parquet
PREHOOK: Output: default@ice_parquet
POSTHOOK: query: DROP TABLE ice_parquet
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@ice_parquet
POSTHOOK: Output: default@ice_parquet
PREHOOK: query: DROP TABLE IF EXISTS ice_avro
PREHOOK: type: DROPTABLE
POSTHOOK: query: DROP TABLE IF EXISTS ice_avro
POSTHOOK: type: DROPTABLE
PREHOOK: query: CREATE EXTERNAL TABLE ice_avro (i int, s string, ts timestamp, d date) STORED BY ICEBERG STORED AS AVRO
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@ice_avro
POSTHOOK: query: CREATE EXTERNAL TABLE ice_avro (i int, s string, ts timestamp, d date) STORED BY ICEBERG STORED AS AVRO
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice_avro
PREHOOK: query: DESCRIBE FORMATTED ice_avro
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@ice_avro
POSTHOOK: query: DESCRIBE FORMATTED ice_avro
POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@ice_avro
# col_name data_type comment
i int from deserializer
s string from deserializer
ts timestamp from deserializer
d date from deserializer

# Detailed Table Information
Database: default
#### A masked pattern was here ####
Retention: 0
#### A masked pattern was here ####
Table Type: EXTERNAL_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"d\":\"true\",\"i\":\"true\",\"s\":\"true\",\"ts\":\"true\"}}
EXTERNAL TRUE
bucketing_version 2
engine.hive.enabled true
external.table.purge TRUE
metadata_location hdfs://### HDFS PATH ###
numFiles 0
numRows 0
rawDataSize 0
serialization.format 1
storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
table_type ICEBERG
totalSize 0
#### A masked pattern was here ####
write.format.default AVRO

# Storage Information
SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
Compressed: No
Num Buckets: 0
Bucket Columns: []
Sort Columns: []
PREHOOK: query: DROP TABLE ice_avro
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@ice_avro
PREHOOK: Output: default@ice_avro
POSTHOOK: query: DROP TABLE ice_avro
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@ice_avro
POSTHOOK: Output: default@ice_avro
PREHOOK: query: DROP TABLE IF EXISTS ice_t
PREHOOK: type: DROPTABLE
POSTHOOK: query: DROP TABLE IF EXISTS ice_t
POSTHOOK: type: DROPTABLE
PREHOOK: query: CREATE EXTERNAL TABLE ice_t (i int, s string, ts timestamp, d date) STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' STORED AS AVRO
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@ice_t
POSTHOOK: query: CREATE EXTERNAL TABLE ice_t (i int, s string, ts timestamp, d date) STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' STORED AS AVRO
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice_t
PREHOOK: query: DESCRIBE FORMATTED ice_t
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@ice_t
POSTHOOK: query: DESCRIBE FORMATTED ice_t
POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@ice_t
# col_name data_type comment
i int from deserializer
s string from deserializer
ts timestamp from deserializer
d date from deserializer

# Detailed Table Information
Database: default
#### A masked pattern was here ####
Retention: 0
#### A masked pattern was here ####
Table Type: EXTERNAL_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"d\":\"true\",\"i\":\"true\",\"s\":\"true\",\"ts\":\"true\"}}
EXTERNAL TRUE
bucketing_version 2
engine.hive.enabled true
external.table.purge TRUE
metadata_location hdfs://### HDFS PATH ###
numFiles 0
numRows 0
rawDataSize 0
serialization.format 1
storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
table_type ICEBERG
totalSize 0
#### A masked pattern was here ####
write.format.default AVRO

# Storage Information
SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
Compressed: No
Num Buckets: 0
Bucket Columns: []
Sort Columns: []
PREHOOK: query: DROP TABLE ice_t
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@ice_t
PREHOOK: Output: default@ice_t
POSTHOOK: query: DROP TABLE ice_t
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@ice_t
POSTHOOK: Output: default@ice_t
PREHOOK: query: CREATE EXTERNAL TABLE ice_t (i int, s string, ts timestamp, d date) STORED BY ICEBERG WITH SERDEPROPERTIES('dummy'='dummy_value') STORED AS ORC
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@ice_t
POSTHOOK: query: CREATE EXTERNAL TABLE ice_t (i int, s string, ts timestamp, d date) STORED BY ICEBERG WITH SERDEPROPERTIES('dummy'='dummy_value') STORED AS ORC
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@ice_t
PREHOOK: query: DESCRIBE FORMATTED ice_t
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@ice_t
POSTHOOK: query: DESCRIBE FORMATTED ice_t
POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@ice_t
# col_name data_type comment
i int from deserializer
s string from deserializer
ts timestamp from deserializer
d date from deserializer

# Detailed Table Information
Database: default
#### A masked pattern was here ####
Retention: 0
#### A masked pattern was here ####
Table Type: EXTERNAL_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"d\":\"true\",\"i\":\"true\",\"s\":\"true\",\"ts\":\"true\"}}
EXTERNAL TRUE
bucketing_version 2
dummy dummy_value
engine.hive.enabled true
external.table.purge TRUE
metadata_location hdfs://### HDFS PATH ###
numFiles 0
numRows 0
rawDataSize 0
serialization.format 1
storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
table_type ICEBERG
totalSize 0
#### A masked pattern was here ####
write.format.default ORC

# Storage Information
SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
Compressed: No
Num Buckets: 0
Bucket Columns: []
Sort Columns: []
PREHOOK: query: DROP TABLE ice_t
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@ice_t
PREHOOK: Output: default@ice_t
POSTHOOK: query: DROP TABLE ice_t
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@ice_t
POSTHOOK: Output: default@ice_t

0 comments on commit 7a0cb27

Please sign in to comment.