-
Notifications
You must be signed in to change notification settings - Fork 3.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
'core' ORC extension #7138
'core' ORC extension #7138
Changes from 2 commits
b899190
04dbefe
58a63cd
c674c4d
90ff75d
e032ed3
920827b
c552eb5
6c665ec
64ce9e3
732346c
bd8d3af
b53fb36
fa83988
f6ea39b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,8 @@ | |
|
||
package org.apache.druid.data.input.orc; | ||
|
||
import it.unimi.dsi.fastutil.objects.Object2IntMap; | ||
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; | ||
import org.apache.druid.java.util.common.DateTimes; | ||
import org.apache.druid.java.util.common.StringUtils; | ||
import org.apache.hadoop.hive.serde2.io.DateWritable; | ||
|
@@ -49,6 +51,19 @@ | |
|
||
public class OrcStructConverter | ||
{ | ||
@Nonnull | ||
private static List<Object> convertList(TypeDescription fieldDescription, OrcList orcList, boolean binaryAsString) | ||
{ | ||
// if primitive list, convert primitives | ||
TypeDescription listType = fieldDescription.getChildren().get(0); | ||
if (listType.getCategory().isPrimitive()) { | ||
return (List<Object>) orcList.stream() | ||
.map(li -> convertPrimitive(listType, (WritableComparable) li, binaryAsString)) | ||
.collect(Collectors.toList()); | ||
} | ||
return new ArrayList<Object>(orcList); | ||
} | ||
|
||
private static Map<Object, Object> convertMap( | ||
TypeDescription fieldDescription, | ||
OrcMap<? extends WritableComparable, ? extends WritableComparable> map, | ||
|
@@ -126,6 +141,7 @@ private static Object convertPrimitive(TypeDescription fieldDescription, Writabl | |
} | ||
|
||
private boolean binaryAsString; | ||
private Object2IntMap<String> fieldIndexCache; | ||
|
||
OrcStructConverter(boolean binaryAsString) | ||
{ | ||
|
@@ -139,18 +155,46 @@ private static Object convertPrimitive(TypeDescription fieldDescription, Writabl | |
* primitive types will be extracted into an ingestion friendly state (e.g. 'int' and 'long'). Finally, | ||
* if a field is not present, this method will return null. | ||
* | ||
* Note: "Union" types are not currently supported and will be returned as null | ||
* Note: "Union" types are not currently supported and will be returned as null. Additionally, this method | ||
* has a cache of field names to field index that is ONLY valid for the root level {@link OrcStruct}, and should | ||
* not be used for nested {@link OrcStruct} fields of the root. | ||
*/ | ||
@Nullable | ||
Object convertField(OrcStruct struct, String fieldName) | ||
{ | ||
// this cache is only valid for the root level, to skip the indexOf on fieldNames to get the fieldIndex. | ||
TypeDescription schema = struct.getSchema(); | ||
int fieldIndex = schema.getFieldNames().indexOf(fieldName); | ||
final List<String> fields = schema.getFieldNames(); | ||
if (fieldIndexCache == null) { | ||
fieldIndexCache = new Object2IntOpenHashMap<>(fields.size()); | ||
for (int i = 0; i < fields.size(); i++) { | ||
fieldIndexCache.put(fields.get(i), i); | ||
} | ||
} | ||
WritableComparable wc = struct.getFieldValue(fieldName); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Variable is not used. I assume that calling `struct.getFieldValue(fieldName)` [has no side effects]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Deleted here: #7738 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ouch, yeah, this is not supposed to be there; I missed it, and worse, it sort of defeats the purpose of the field index cache, since it's still causing [the `indexOf` lookup on the field names]. Thanks for fixing this; I'm going to open a PR to the 0.15 branch to effectively backport this part of #7738, because this is a performance issue for data with lots of fields. |
||
|
||
int fieldIndex = fieldIndexCache.getOrDefault(fieldName, -1); | ||
|
||
return convertField(struct, fieldIndex); | ||
} | ||
|
||
/** | ||
* Convert a orc struct field as though it were a map, by fieldIndex. Complex types will be transformed | ||
* into java lists and maps when possible ({@link OrcStructConverter#convertList} and | ||
* {@link OrcStructConverter#convertMap}), and | ||
* primitive types will be extracted into an ingestion friendly state (e.g. 'int' and 'long'). Finally, | ||
* if a field is not present, this method will return null. | ||
* | ||
* Note: "Union" types are not currently supported and will be returned as null | ||
*/ | ||
@Nullable | ||
Object convertField(OrcStruct struct, int fieldIndex) | ||
{ | ||
if (fieldIndex < 0) { | ||
return null; | ||
} | ||
|
||
TypeDescription schema = struct.getSchema(); | ||
TypeDescription fieldDescription = schema.getChildren().get(fieldIndex); | ||
WritableComparable fieldValue = struct.getFieldValue(fieldIndex); | ||
|
||
|
@@ -172,13 +216,13 @@ Object convertField(OrcStruct struct, String fieldName) | |
switch (fieldDescription.getCategory()) { | ||
case LIST: | ||
OrcList orcList = (OrcList) fieldValue; | ||
return convertList(fieldDescription, orcList); | ||
return convertList(fieldDescription, orcList, binaryAsString); | ||
case MAP: | ||
OrcMap map = (OrcMap) fieldValue; | ||
return convertMap(fieldDescription, map, binaryAsString); | ||
case STRUCT: | ||
OrcStruct structMap = (OrcStruct) fieldValue; | ||
return convertMap(structMap); | ||
return convertStructToMap(structMap); | ||
case UNION: | ||
// sorry union types :( | ||
default: | ||
|
@@ -187,25 +231,13 @@ Object convertField(OrcStruct struct, String fieldName) | |
} | ||
} | ||
|
||
@Nonnull | ||
private List<Object> convertList(TypeDescription fieldDescription, OrcList orcList) | ||
{ | ||
// if primitive list, convert primitives | ||
TypeDescription listType = fieldDescription.getChildren().get(0); | ||
if (listType.getCategory().isPrimitive()) { | ||
return (List<Object>) orcList.stream() | ||
.map(li -> convertPrimitive(listType, (WritableComparable) li, binaryAsString)) | ||
.collect(Collectors.toList()); | ||
} | ||
return new ArrayList<Object>(orcList); | ||
} | ||
|
||
|
||
private Map<String, Object> convertMap(OrcStruct map) | ||
private Map<String, Object> convertStructToMap(OrcStruct map) | ||
{ | ||
Map<String, Object> converted = new HashMap<>(); | ||
for (String key : map.getSchema().getFieldNames()) { | ||
converted.put(key, convertField(map, key)); | ||
List<String> fieldNames = map.getSchema().getFieldNames(); | ||
|
||
for (int i = 0; i < fieldNames.size(); i++) { | ||
converted.put(fieldNames.get(i), convertField(map, i)); | ||
} | ||
return converted; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This design is not foolproof enough: it's risky/error-prone, and a bit obtuse to read, because two methods called `convertField` with slightly different arguments have very different semantics. This one should be renamed to `convertRootField`, and the javadoc should call out the restriction prominently, rather than in a side note.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree, will fix 👍