Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,7 @@ public enum ErrorMsg {
CTLF_MISSING_STORAGE_FORMAT_DESCRIPTOR(20021, "Failed to find StorageFormatDescriptor for file format ''{0}''", true),
PARQUET_FOOTER_ERROR(20022, "Failed to read parquet footer:"),
PARQUET_UNHANDLED_TYPE(20023, "Unhandled type {0}", true),
ORC_FOOTER_ERROR(20024, "Failed to read orc footer:"),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the colon is also there in PARQUET_FOOTER_ERROR, but I am not sure why: when this exception is thrown, what follows the colon is the stack trace, so a period would have made more sense to me. Still, let it stay as-is to be in sync with PARQUET_FOOTER_ERROR. Out of curiosity, though — if you know the reason behind it, do let me know.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just followed PARQUET_FOOTER_ERROR; we can let @jfsii give an explanation. :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The source for the change was a comment on a review I had on the patch:
"nit: add a ":" such that the exception is printed after the delimiter"

I could have pushed back on the comment.


// An exception from runtime that will show the full stack to client
UNRESOLVED_RT_EXCEPTION(29999, "Runtime Error: {0}", "58004", true),
Expand Down
96 changes: 95 additions & 1 deletion ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSerde.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,33 +20,46 @@
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SchemaInference;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.Writable;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* A serde class for ORC. It transparently passes the object to/from the ORC
* file reader/writer. This SerDe does not support statistics, since serialized
* size doesn't make sense in the context of ORC files.
*/
@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, OrcSerde.COMPRESSION})
public class OrcSerde extends AbstractSerDe {
public class OrcSerde extends AbstractSerDe implements SchemaInference {
private static final Logger LOG = LoggerFactory.getLogger(OrcSerde.class);

private final OrcSerdeRow row = new OrcSerdeRow();
private ObjectInspector inspector = null;

static final String COMPRESSION = "orc.compress";
static final Pattern UNQUOTED_NAMES = Pattern.compile("^[a-zA-Z0-9_]+$");

final class OrcSerdeRow implements Writable {
Object realRow;
Expand Down Expand Up @@ -117,4 +130,85 @@ public ObjectInspector getObjectInspector() throws SerDeException {
return inspector;
}

@Override
public List<FieldSchema> readSchema(Configuration conf, String file) throws SerDeException {
  // Infer a Hive schema from the footer of a single ORC file.
  // The reader is closed before the schema is converted; the TypeDescription
  // it returns remains valid after close.
  final TypeDescription orcSchema;
  try (Reader reader = OrcFile.createReader(new Path(file), OrcFile.readerOptions(conf))) {
    orcSchema = reader.getSchema();
  } catch (Exception e) {
    // Surface footer-read failures with the dedicated ORC error code.
    throw new SerDeException(ErrorMsg.ORC_FOOTER_ERROR.getErrorCodedMsg(), e);
  }

  List<String> names = orcSchema.getFieldNames();
  List<TypeDescription> types = orcSchema.getChildren();
  List<FieldSchema> schema = new ArrayList<>(names.size());
  for (int i = 0; i < names.size(); i++) {
    FieldSchema fieldSchema = convertOrcTypeToFieldSchema(names.get(i), types.get(i));
    schema.add(fieldSchema);
    LOG.debug("Inferred field schema {}", fieldSchema);
  }
  return schema;
}

/**
 * Builds a metastore FieldSchema for one top-level ORC field, recording
 * that the column definition was inferred from a file rather than DDL.
 */
private FieldSchema convertOrcTypeToFieldSchema(String fieldName, TypeDescription fieldType) {
  return new FieldSchema(fieldName, convertOrcTypeToFieldType(fieldType), "Inferred from Orc file.");
}

/**
 * Maps an ORC TypeDescription to a Hive type-name string, dispatching on
 * whether the category is primitive or complex.
 */
private String convertOrcTypeToFieldType(TypeDescription fieldType) {
  return fieldType.getCategory().isPrimitive()
      ? convertPrimitiveType(fieldType)
      : convertComplexType(fieldType);
}

/**
 * Returns the Hive type name for a primitive ORC type.
 * TIMESTAMP WITH LOCAL TIME ZONE is rejected because Hive's ORC
 * integration does not support it.
 */
private String convertPrimitiveType(TypeDescription fieldType) {
  String categoryName = fieldType.getCategory().getName();
  if ("timestamp with local time zone".equals(categoryName)) {
    throw new IllegalArgumentException("Unhandled ORC type " + categoryName);
  }
  return fieldType.toString();
}

/**
 * Renders a complex ORC type (list/map/union/struct) as a Hive type string,
 * recursing into child types. Struct field names are emitted via
 * {@link #getStructFieldName} so non-identifier names get quoted.
 */
private String convertComplexType(TypeDescription fieldType) {
  List<TypeDescription> children = fieldType.getChildren();
  StringBuilder sb = new StringBuilder(fieldType.getCategory().getName());
  switch (fieldType.getCategory()) {
    case LIST:
    case MAP:
    case UNION:
      sb.append('<');
      for (int i = 0; i < children.size(); i++) {
        if (i > 0) {
          sb.append(',');
        }
        sb.append(convertOrcTypeToFieldType(children.get(i)));
      }
      sb.append('>');
      break;
    case STRUCT:
      sb.append('<');
      for (int i = 0; i < children.size(); i++) {
        if (i > 0) {
          sb.append(',');
        }
        // struct children are name:type pairs
        getStructFieldName(sb, fieldType.getFieldNames().get(i));
        sb.append(':').append(convertOrcTypeToFieldType(children.get(i)));
      }
      sb.append('>');
      break;
    default:
      throw new IllegalArgumentException("ORC doesn't handle " +
          fieldType.getCategory());
  }
  return sb.toString();
}

static void getStructFieldName(StringBuilder buffer, String name) {
if (UNQUOTED_NAMES.matcher(name).matches()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the intent of this just to ensure there aren't any quoted name, I am not sure, but do explore

!name.startsWith("'") && !name.endsWith("'")

Which lands up being cheaper.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, this code snippet was borrowed from ORC repo.
https://github.com/apache/orc/blob/6a74eef74b8101c7396f196f1a27f5e124a005f6/java/core/src/java/org/apache/orc/TypeDescription.java#L667-L673

And related ORC jira is https://issues.apache.org/jira/browse/ORC-104. I didn't do much research here, but i think it is better to be sync with ORC code. wdyt?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be fine, was just exploring a possibility. :-)

buffer.append(name);
} else {
buffer.append('`').append(name.replace("`", "``")).append('`');
}
}
}
88 changes: 88 additions & 0 deletions ql/src/test/queries/clientpositive/create_table_like_file_orc.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
-- Exercises CREATE TABLE ... LIKE FILE ORC: each section writes an ORC file via an
-- EXTERNAL table with an explicit LOCATION (so the file path is known), then creates
-- a LIKE FILE table from that file and verifies the inferred schema round-trips.
-- all primitive types
-- timestamp_w_tz TIMESTAMP WITH LOCAL TIME ZONE is not supported by hive's orc implementation
CREATE EXTERNAL TABLE test_all_orc_types(tinyint_type TINYINT, smallint_type SMALLINT, bigint_type BIGINT, int_type INT, float_type FLOAT, double_type double, decimal_type DECIMAL(4,2), timestamp_type TIMESTAMP, date_type DATE, string_type STRING, varchar_type VARCHAR(100), char_type CHAR(34), boolean_type BOOLEAN, binary_type BINARY) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_all_orc_types';
-- insert two rows (the other tables only have 1 row)
INSERT INTO test_all_orc_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
(1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe');
SELECT * FROM test_all_orc_types;
DESCRIBE test_all_orc_types;
-- CREATE A LIKE table
CREATE TABLE like_test_all_orc_types LIKE FILE ORC '${system:test.tmp.dir}/test_all_orc_types/000000_0';
INSERT INTO like_test_all_orc_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
(1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe');
SELECT * FROM like_test_all_orc_types;
DESCRIBE like_test_all_orc_types;
DROP TABLE test_all_orc_types;
DROP TABLE like_test_all_orc_types;

-- complex types (array, map, union, struct)
-- array
CREATE EXTERNAL TABLE test_orc_array(str_array array<string>) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_orc_array';
DESCRIBE test_orc_array;
INSERT INTO test_orc_array SELECT array("bob", "sue");
SELECT * FROM test_orc_array;
CREATE TABLE like_test_orc_array LIKE FILE ORC '${system:test.tmp.dir}/test_orc_array/000000_0';
DESCRIBE like_test_orc_array;
INSERT INTO like_test_orc_array SELECT array("bob", "sue");
SELECT * FROM like_test_orc_array;
DROP TABLE like_test_orc_array;

-- map (includes maps whose values are structs, maps, and arrays)
CREATE EXTERNAL TABLE test_orc_map(simple_map map<int, string>, map_to_struct map<string, struct<i : int>>, map_to_map map<date,map<int, string>>, map_to_array map<binary, array<array<int>>>) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_orc_map';
DESCRIBE test_orc_map;
INSERT INTO test_orc_map SELECT map(10, "foo"), map("bar", named_struct("i", 99)), map(cast('1984-01-01' as date), map(10, "goodbye")), map(cast("binary" as binary), array(array(1,2,3)));
SELECT * FROM test_orc_map;
CREATE TABLE like_test_orc_map LIKE FILE ORC '${system:test.tmp.dir}/test_orc_map/000000_0';
DESCRIBE like_test_orc_map;
INSERT INTO like_test_orc_map SELECT map(10, "foo"), map("bar", named_struct("i", 99)), map(cast('1984-01-01' as date), map(10, "goodbye")), map(cast("binary" as binary), array(array(1,2,3)));
SELECT * FROM like_test_orc_map;
DROP TABLE like_test_orc_map;

-- union (UNIONTYPE values are produced with create_union from a helper text table)
CREATE TABLE src_tbl (key STRING, value STRING) STORED AS TEXTFILE;
INSERT INTO src_tbl VALUES ('hello', 'world');
CREATE TABLE test_orc_union (foo UNIONTYPE<string>) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_orc_union';
INSERT INTO test_orc_union SELECT create_union(0, key) FROM src_tbl LIMIT 2;
CREATE TABLE like_test_orc_union LIKE FILE ORC '${system:test.tmp.dir}/test_orc_union/000000_0';
DESCRIBE test_orc_union;
INSERT INTO like_test_orc_union SELECT create_union(0, key) FROM src_tbl LIMIT 2;
SELECT * FROM like_test_orc_union;
DROP TABLE like_test_orc_union;

-- struct (one field of every supported primitive type)
CREATE EXTERNAL TABLE test_complex_orc_struct(struct_type struct<tinyint_type : tinyint, smallint_type : smallint, bigint_type : bigint, int_type : int, float_type : float, double_type : double, decimal_type : DECIMAL(4,2), timestamp_type : TIMESTAMP, date_type : DATE, string_type : STRING, varchar_type : VARCHAR(100), char_type : CHAR(34), boolean_type : boolean, binary_type : binary>) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_complex_orc_struct';
DESCRIBE test_complex_orc_struct;
-- disable CBO due to the fact that type conversion causes CBO failure which causes the test to fail
-- non-CBO path works (HIVE-26398)
SET hive.cbo.enable=false;
INSERT INTO test_complex_orc_struct SELECT named_struct("tinyint_type", cast(1 as tinyint), "smallint_type", cast(2 as smallint), "bigint_type", cast(3 as bigint), "int_type", 4, "float_type", cast(2.2 as float), "double_type", cast(2.2 as double), "decimal_type", cast(20.22 as decimal(4,2)), "timestamp_type", cast('2022-06-30 10:20:30' as timestamp), "date_type", cast('2020-04-23' as date), "string_type", 'str1', "varchar_type", cast('varchar1' as varchar(100)), "char_type", cast('char' as char(34)), "boolean_type", true, "binary_type", cast('binary_maybe' as binary));
SET hive.cbo.enable=true;
SELECT * FROM test_complex_orc_struct;
CREATE TABLE like_test_complex_orc_struct LIKE FILE ORC '${system:test.tmp.dir}/test_complex_orc_struct/000000_0';
DESCRIBE like_test_complex_orc_struct;
-- disable CBO due to the fact that type conversion causes CBO failure which causes the test to fail
-- non-CBO path works (HIVE-26398)
SET hive.cbo.enable=false;
INSERT INTO like_test_complex_orc_struct SELECT named_struct("tinyint_type", cast(1 as tinyint), "smallint_type", cast(2 as smallint), "bigint_type", cast(3 as bigint), "int_type", 4, "float_type", cast(2.2 as float), "double_type", cast(2.2 as double), "decimal_type", cast(20.22 as decimal(4,2)), "timestamp_type", cast('2022-06-30 10:20:30' as timestamp), "date_type", cast('2020-04-23' as date), "string_type", 'str1', "varchar_type", cast('varchar1' as varchar(100)), "char_type", cast('char' as char(34)), "boolean_type", true, "binary_type", cast('binary_maybe' as binary));
SET hive.cbo.enable=true;
SELECT * FROM like_test_complex_orc_struct;
DROP TABLE like_test_complex_orc_struct;

-- test complex types that contain other complex types
CREATE EXTERNAL TABLE test_orc_complex_complex(struct_type struct<i : int, s : string, m : map<string, array<int>>, struct_i : struct<str : string>>) STORED AS ORC LOCATION '${system:test.tmp.dir}/test_orc_complex_complex';
DESCRIBE test_orc_complex_complex;
INSERT INTO test_orc_complex_complex SELECT named_struct("i", 10, "s", "hello, world", "m", map("arr", array(1,2,3,4)), "struct_i", named_struct("str", "test_str"));
SELECT * FROM test_orc_complex_complex;
CREATE TABLE like_test_orc_complex_complex LIKE FILE ORC '${system:test.tmp.dir}/test_orc_complex_complex/000000_0';
DESCRIBE like_test_orc_complex_complex;
INSERT INTO like_test_orc_complex_complex SELECT named_struct("i", 10, "s", "hello, world", "m", map("arr", array(1,2,3,4)), "struct_i", named_struct("str", "test_str"));
SELECT * FROM like_test_orc_complex_complex;
DROP TABLE like_test_orc_complex_complex;

-- test adding partitioning to the destination table
CREATE TABLE like_test_orc_partitioning LIKE FILE ORC '${system:test.tmp.dir}/test_all_orc_types/000000_0' PARTITIONED BY (year STRING, month STRING);
DESCRIBE like_test_orc_partitioning;
INSERT INTO like_test_orc_partitioning PARTITION (year='1984', month='1') VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
(1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe');
SELECT * FROM like_test_orc_partitioning;
DROP TABLE like_test_orc_partitioning;
Loading