HIVE-26551: Support CREATE TABLE LIKE FILE for ORC #3611
@@ -20,33 +20,46 @@
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SchemaInference;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.Writable;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A serde class for ORC. It transparently passes the object to/from the ORC
 * file reader/writer. This SerDe does not support statistics, since serialized
 * size doesn't make sense in the context of ORC files.
 */
@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, OrcSerde.COMPRESSION})
-public class OrcSerde extends AbstractSerDe {
+public class OrcSerde extends AbstractSerDe implements SchemaInference {
  private static final Logger LOG = LoggerFactory.getLogger(OrcSerde.class);

  private final OrcSerdeRow row = new OrcSerdeRow();
  private ObjectInspector inspector = null;

  static final String COMPRESSION = "orc.compress";
  static final Pattern UNQUOTED_NAMES = Pattern.compile("^[a-zA-Z0-9_]+$");

  final class OrcSerdeRow implements Writable {
    Object realRow;
@@ -117,4 +130,85 @@ public ObjectInspector getObjectInspector() throws SerDeException {
    return inspector;
  }

  @Override
  public List<FieldSchema> readSchema(Configuration conf, String file) throws SerDeException {
    List<String> fieldNames;
    List<TypeDescription> fieldTypes;
    try (Reader reader = OrcFile.createReader(new Path(file), OrcFile.readerOptions(conf))) {
      fieldNames = reader.getSchema().getFieldNames();
      fieldTypes = reader.getSchema().getChildren();
    } catch (Exception e) {
      throw new SerDeException(ErrorMsg.ORC_FOOTER_ERROR.getErrorCodedMsg(), e);
    }

    List<FieldSchema> schema = new ArrayList<>();
    for (int i = 0; i < fieldNames.size(); i++) {
      FieldSchema fieldSchema = convertOrcTypeToFieldSchema(fieldNames.get(i), fieldTypes.get(i));
      schema.add(fieldSchema);
      LOG.debug("Inferred field schema {}", fieldSchema);
    }
    return schema;
  }

  private FieldSchema convertOrcTypeToFieldSchema(String fieldName, TypeDescription fieldType) {
    String typeName = convertOrcTypeToFieldType(fieldType);
    return new FieldSchema(fieldName, typeName, "Inferred from Orc file.");
  }

  private String convertOrcTypeToFieldType(TypeDescription fieldType) {
    if (fieldType.getCategory().isPrimitive()) {
      return convertPrimitiveType(fieldType);
    }
    return convertComplexType(fieldType);
  }

  private String convertPrimitiveType(TypeDescription fieldType) {
    if (fieldType.getCategory().getName().equals("timestamp with local time zone")) {
      throw new IllegalArgumentException("Unhandled ORC type " + fieldType.getCategory().getName());
    }
    return fieldType.toString();
  }

  private String convertComplexType(TypeDescription fieldType) {
    StringBuilder buffer = new StringBuilder();
    buffer.append(fieldType.getCategory().getName());
    switch (fieldType.getCategory()) {
      case LIST:
      case MAP:
      case UNION:
        buffer.append('<');
        for (int i = 0; i < fieldType.getChildren().size(); i++) {
          if (i != 0) {
            buffer.append(',');
          }
          buffer.append(convertOrcTypeToFieldType(fieldType.getChildren().get(i)));
        }
        buffer.append('>');
        break;
      case STRUCT:
        buffer.append('<');
        for (int i = 0; i < fieldType.getChildren().size(); ++i) {
          if (i != 0) {
            buffer.append(',');
          }
          getStructFieldName(buffer, fieldType.getFieldNames().get(i));
          buffer.append(':');
          buffer.append(convertOrcTypeToFieldType(fieldType.getChildren().get(i)));
        }
        buffer.append('>');
        break;
      default:
        throw new IllegalArgumentException("ORC doesn't handle " +
            fieldType.getCategory());
    }
    return buffer.toString();
  }
  static void getStructFieldName(StringBuilder buffer, String name) {
    if (UNQUOTED_NAMES.matcher(name).matches()) {
      buffer.append(name);
    } else {
      buffer.append('`').append(name.replace("`", "``")).append('`');
    }
  }
}

Review comment: Is the intent of this just to ensure there aren't any quoted [...] Which ends up being cheaper.

Reply: Actually, this code snippet was borrowed from the ORC repo; the related ORC jira is https://issues.apache.org/jira/browse/ORC-104. I didn't do much research here, but I think it is better to stay in sync with the ORC code. wdyt?

Review comment: Should be fine, was just exploring a possibility. :-)
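For context on the quoting discussion above, here is a minimal, standalone sketch (not part of the patch) of the rule getStructFieldName applies: plain alphanumeric/underscore names pass through, anything else is wrapped in backticks with embedded backticks doubled. The class name, the quote helper, and the sample field names below are made up for illustration.

import java.util.regex.Pattern;

public class StructFieldNameQuotingSketch {
  // Same pattern as OrcSerde.UNQUOTED_NAMES in the diff above.
  static final Pattern UNQUOTED_NAMES = Pattern.compile("^[a-zA-Z0-9_]+$");

  // Mirrors OrcSerde.getStructFieldName: quote only when the name needs it.
  static String quote(String name) {
    if (UNQUOTED_NAMES.matcher(name).matches()) {
      return name;
    }
    return '`' + name.replace("`", "``") + '`';
  }

  public static void main(String[] args) {
    System.out.println(quote("col_1"));       // col_1          (no quoting needed)
    System.out.println(quote("weird name"));  // `weird name`   (contains a space)
    System.out.println(quote("tick`mark"));   // `tick``mark`   (embedded backtick doubled)
  }
}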
@@ -0,0 +1,88 @@
-- all primitive types
-- timestamp_w_tz TIMESTAMP WITH LOCAL TIME ZONE is not supported by Hive's ORC implementation
CREATE EXTERNAL TABLE test_all_orc_types(tinyint_type TINYINT, smallint_type SMALLINT, bigint_type BIGINT, int_type INT, float_type FLOAT, double_type double, decimal_type DECIMAL(4,2), timestamp_type TIMESTAMP, date_type DATE, string_type STRING, varchar_type VARCHAR(100), char_type CHAR(34), boolean_type BOOLEAN, binary_type BINARY) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_all_orc_types';
-- insert two rows (the other tables only have 1 row)
INSERT INTO test_all_orc_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
(1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe');
SELECT * FROM test_all_orc_types;
DESCRIBE test_all_orc_types;
-- create a LIKE FILE table
CREATE TABLE like_test_all_orc_types LIKE FILE ORC '${system:test.tmp.dir}/test_all_orc_types/000000_0';
INSERT INTO like_test_all_orc_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
(1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe');
SELECT * FROM like_test_all_orc_types;
DESCRIBE like_test_all_orc_types;
DROP TABLE test_all_orc_types;
DROP TABLE like_test_all_orc_types;

-- complex types (array, map, union, struct)
-- array
CREATE EXTERNAL TABLE test_orc_array(str_array array<string>) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_orc_array';
DESCRIBE test_orc_array;
INSERT INTO test_orc_array SELECT array("bob", "sue");
SELECT * FROM test_orc_array;
CREATE TABLE like_test_orc_array LIKE FILE ORC '${system:test.tmp.dir}/test_orc_array/000000_0';
DESCRIBE like_test_orc_array;
INSERT INTO like_test_orc_array SELECT array("bob", "sue");
SELECT * FROM like_test_orc_array;
DROP TABLE like_test_orc_array;

-- map
CREATE EXTERNAL TABLE test_orc_map(simple_map map<int, string>, map_to_struct map<string, struct<i : int>>, map_to_map map<date,map<int, string>>, map_to_array map<binary, array<array<int>>>) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_orc_map';
DESCRIBE test_orc_map;
INSERT INTO test_orc_map SELECT map(10, "foo"), map("bar", named_struct("i", 99)), map(cast('1984-01-01' as date), map(10, "goodbye")), map(cast("binary" as binary), array(array(1,2,3)));
SELECT * FROM test_orc_map;
CREATE TABLE like_test_orc_map LIKE FILE ORC '${system:test.tmp.dir}/test_orc_map/000000_0';
DESCRIBE like_test_orc_map;
INSERT INTO like_test_orc_map SELECT map(10, "foo"), map("bar", named_struct("i", 99)), map(cast('1984-01-01' as date), map(10, "goodbye")), map(cast("binary" as binary), array(array(1,2,3)));
SELECT * FROM like_test_orc_map;
DROP TABLE like_test_orc_map;

-- union
CREATE TABLE src_tbl (key STRING, value STRING) STORED AS TEXTFILE;
INSERT INTO src_tbl VALUES ('hello', 'world');
CREATE TABLE test_orc_union (foo UNIONTYPE<string>) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_orc_union';
INSERT INTO test_orc_union SELECT create_union(0, key) FROM src_tbl LIMIT 2;
CREATE TABLE like_test_orc_union LIKE FILE ORC '${system:test.tmp.dir}/test_orc_union/000000_0';
DESCRIBE test_orc_union;
INSERT INTO like_test_orc_union SELECT create_union(0, key) FROM src_tbl LIMIT 2;
SELECT * FROM like_test_orc_union;
DROP TABLE like_test_orc_union;

-- struct
CREATE EXTERNAL TABLE test_complex_orc_struct(struct_type struct<tinyint_type : tinyint, smallint_type : smallint, bigint_type : bigint, int_type : int, float_type : float, double_type : double, decimal_type : DECIMAL(4,2), timestamp_type : TIMESTAMP, date_type : DATE, string_type : STRING, varchar_type : VARCHAR(100), char_type : CHAR(34), boolean_type : boolean, binary_type : binary>) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_complex_orc_struct';
DESCRIBE test_complex_orc_struct;
-- disable CBO: type conversion causes a CBO failure that makes the test fail;
-- the non-CBO path works (HIVE-26398)
SET hive.cbo.enable=false;
INSERT INTO test_complex_orc_struct SELECT named_struct("tinyint_type", cast(1 as tinyint), "smallint_type", cast(2 as smallint), "bigint_type", cast(3 as bigint), "int_type", 4, "float_type", cast(2.2 as float), "double_type", cast(2.2 as double), "decimal_type", cast(20.22 as decimal(4,2)), "timestamp_type", cast('2022-06-30 10:20:30' as timestamp), "date_type", cast('2020-04-23' as date), "string_type", 'str1', "varchar_type", cast('varchar1' as varchar(100)), "char_type", cast('char' as char(34)), "boolean_type", true, "binary_type", cast('binary_maybe' as binary));
SET hive.cbo.enable=true;
SELECT * FROM test_complex_orc_struct;
CREATE TABLE like_test_complex_orc_struct LIKE FILE ORC '${system:test.tmp.dir}/test_complex_orc_struct/000000_0';
DESCRIBE like_test_complex_orc_struct;
-- disable CBO: type conversion causes a CBO failure that makes the test fail;
-- the non-CBO path works (HIVE-26398)
SET hive.cbo.enable=false;
INSERT INTO like_test_complex_orc_struct SELECT named_struct("tinyint_type", cast(1 as tinyint), "smallint_type", cast(2 as smallint), "bigint_type", cast(3 as bigint), "int_type", 4, "float_type", cast(2.2 as float), "double_type", cast(2.2 as double), "decimal_type", cast(20.22 as decimal(4,2)), "timestamp_type", cast('2022-06-30 10:20:30' as timestamp), "date_type", cast('2020-04-23' as date), "string_type", 'str1', "varchar_type", cast('varchar1' as varchar(100)), "char_type", cast('char' as char(34)), "boolean_type", true, "binary_type", cast('binary_maybe' as binary));
SET hive.cbo.enable=true;
SELECT * FROM like_test_complex_orc_struct;
DROP TABLE like_test_complex_orc_struct;

-- test complex types that contain other complex types
CREATE EXTERNAL TABLE test_orc_complex_complex(struct_type struct<i : int, s : string, m : map<string, array<int>>, struct_i : struct<str : string>>) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_orc_complex_complex';
DESCRIBE test_orc_complex_complex;
INSERT INTO test_orc_complex_complex SELECT named_struct("i", 10, "s", "hello, world", "m", map("arr", array(1,2,3,4)), "struct_i", named_struct("str", "test_str"));
SELECT * FROM test_orc_complex_complex;
CREATE TABLE like_test_orc_complex_complex LIKE FILE ORC '${system:test.tmp.dir}/test_orc_complex_complex/000000_0';
DESCRIBE like_test_orc_complex_complex;
INSERT INTO like_test_orc_complex_complex SELECT named_struct("i", 10, "s", "hello, world", "m", map("arr", array(1,2,3,4)), "struct_i", named_struct("str", "test_str"));
SELECT * FROM like_test_orc_complex_complex;
DROP TABLE like_test_orc_complex_complex;

-- test adding partitioning to the destination table
CREATE TABLE like_test_orc_partitioning LIKE FILE ORC '${system:test.tmp.dir}/test_all_orc_types/000000_0' PARTITIONED BY (year STRING, month STRING);
DESCRIBE like_test_orc_partitioning;
INSERT INTO like_test_orc_partitioning PARTITION (year='1984', month='1') VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
(1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe');
SELECT * FROM like_test_orc_partitioning;
DROP TABLE like_test_orc_partitioning;
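The q-file above exercises CREATE TABLE LIKE FILE ORC end to end. As a rough, standalone illustration of the walk that readSchema() performs over the ORC footer schema (field names from getFieldNames(), child types from getChildren()), here is a small sketch; it builds the schema from a string instead of opening a real file with OrcFile.createReader, and the class name and type string are made up for illustration.

import java.util.List;
import org.apache.orc.TypeDescription;

public class OrcSchemaWalkSketch {
  public static void main(String[] args) {
    // Stand-in for reader.getSchema(): the real code reads this from the ORC file footer.
    TypeDescription schema = TypeDescription.fromString(
        "struct<tinyint_type:tinyint,str_array:array<string>,simple_map:map<int,string>>");
    List<String> names = schema.getFieldNames();
    List<TypeDescription> types = schema.getChildren();
    for (int i = 0; i < names.size(); i++) {
      // readSchema() turns each (name, type) pair into a Hive FieldSchema; here we just print it.
      System.out.println(names.get(i) + " : " + types.get(i));
    }
  }
}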
Review comment: Well, the colon is there in PARQUET_FOOTER_ERROR, but I am not sure why. When this exception is thrown, what follows the colon is the trace; it should have been a period in my opinion, but let it be, to stay in sync with PARQUET_FOOTER_ERROR. Just out of curiosity, if you know the reason behind it, do let me know.

Reply: I just followed PARQUET_FOOTER_ERROR, and we can let @jfsii give an explanation. :)

Reply: The source for the change was a comment on a review I had on the patch: "nit: add a ':' such that the exception is printed after the delimiter". I could have pushed back on the comment.
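To make the delimiter point above concrete, here is a tiny, hypothetical illustration (not Hive code, and not the actual ORC_FOOTER_ERROR text): when the wrapper's message ends with a colon and the cause's message is rendered after it, the two read as one line.

public class DelimiterSketch {
  public static void main(String[] args) {
    Exception cause = new java.io.IOException("Malformed ORC file footer");
    // Hypothetical message text; only the trailing-colon formatting is the point here.
    Exception wrapped = new RuntimeException("Failed to read the footer of the ORC file:", cause);
    // Prints: Failed to read the footer of the ORC file: Malformed ORC file footer
    System.out.println(wrapped.getMessage() + " " + wrapped.getCause().getMessage());
  }
}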